38 commits
358087d
log_interval 5
jaimepedretp Feb 5, 2021
3635b91
Merge pull request #2 from xeviknal/with-baseline
ziritrion Feb 5, 2021
2ddf6bd
Merge branch 'main' of https://github.com/xeviknal/aidl-2021-wo-rl in…
jaimepedretp Feb 8, 2021
a7ac921
RL-with baseline. JuanJo's proposal added
jaimepedretp Feb 8, 2021
67bea13
Porting differences to RL-with-baseline (#9)
xeviknal Feb 19, 2021
472e1f7
Fixed trainer.py for GPU execution and modified main.py to run in Ubuntu
ziritrion Feb 20, 2021
caf0744
Add visualization skills: evaluation mode (#12)
xeviknal Feb 25, 2021
4d7130e
Finished 30k runs; reward was not significantly improved (#13)
xeviknal Mar 11, 2021
2106b4b
Add metrics and logsoftmax
xeviknal Mar 2, 2021
0f2f195
Updating the model
xeviknal Mar 11, 2021
39b3ad3
Add new model to baseline
xeviknal Mar 11, 2021
907af7a
The line that fixes all
xeviknal Mar 14, 2021
8e4ee6c
Add mean entropy - to reduce tensorboard runs
xeviknal Mar 14, 2021
3970702
Add action prob mean: mean of prob of actions taken in the episode
xeviknal Mar 14, 2021
957a3b4
Added simple directory check to params folder
ziritrion Mar 14, 2021
0105564
Added additional param save conditions (end of log_interval, last epi…
ziritrion Mar 14, 2021
b5a5184
Merge branch 'RL-baseline-new-model' of github.com:xeviknal/aidl-2021…
ziritrion Mar 14, 2021
c6954ec
Removing old runs; they don't apply to this branch
ziritrion Mar 14, 2021
a7c907c
RL-baseline-NM-save-optim
xeviknal Mar 14, 2021
d5b676c
Load optimizer params
xeviknal Mar 14, 2021
bd7f6c0
8k runs
ziritrion Mar 15, 2021
5f246c0
Fresh start with latest checkpoint load-save changes. Also, small git…
ziritrion Mar 15, 2021
e8aa5e4
bugfix
ziritrion Mar 15, 2021
c52a4f2
10k runs
ziritrion Mar 15, 2021
192bece
Almost 20k runs. Reward is starting to improve little by little
ziritrion Mar 16, 2021
11b46c4
Fixed runner.py for generating videos
ziritrion Mar 16, 2021
befd201
25k runs. Slight improvement but far from desirable
ziritrion Mar 16, 2021
970f84a
10k episodes. Learning rate 1e-3. Original actions. RR of almost 700 …
ziritrion Mar 19, 2021
1038b29
Added new action set and removed previous runs for fresh start
ziritrion Mar 19, 2021
1204087
5k runs, reward around 450
ziritrion Mar 19, 2021
b44f017
20k episodes. Running reward 513
ziritrion Mar 23, 2021
d3dda5a
Run up to 33.7k episodes using actions set 3 - max/final av. reward 8…
jaimepedretp Apr 6, 2021
447dd0c
added video eval mode - using model max/final av. reward 881/825
jaimepedretp Apr 6, 2021
9d0e23f
Run from scratch on RL-baseline-new-model-exp1-act3 - 8.1k episodes m…
jaimepedretp Apr 7, 2021
db82d3e
Added video eval mode - 8.1k episodes max/final av.reward 868/815
jaimepedretp Apr 7, 2021
b1e6fe5
Final training - rl-baseline-exp2 _ 6.4k episodes - best/final reward…
jaimepedretp Apr 12, 2021
74b7903
Runs up to 20.6k
jaimepedretp Apr 14, 2021
5990457
Added video eval model 926reward - rl-baseline-exp2
jaimepedretp Apr 16, 2021
6 changes: 6 additions & 0 deletions .gitignore
@@ -134,3 +134,9 @@ dmypy.json

# IntelliJ - Pycharm
.idea/

# MacOS bullshit
.DS_Store

# nohup log files
nohup.out
59 changes: 40 additions & 19 deletions actions.py
@@ -1,23 +1,44 @@
class Actions:

available_actions = [
[0.0, 0.7, 0.0], # throttle
[0.0, 0.5, 0.0], # throttle
[0.0, 0.2, 0.0], # throttle
[0.0, 0.0, 0.7], # break
action_sets = [
[
[0.0, 0.3, 0.0], # throttle
[0.0, 0.1, 0.0], # throttle
[0.0, 0.0, 0.5], # break
[0.0, 0.0, 0.2], # break
[-0.8, 0.1, 0.0], # left
[-0.5, 0.1, 0.0], # left
[-0.2, 0.1, 0.0], # left
[0.8, 0.1, 0.0], # right
[0.5, 0.1, 0.0], # right
[0.2, 0.1, 0.0], # right
[-1.0, 0.0, 0.05], # left
[-1.0, 0.0, 0.05], # left
[-1.0, 0.0, 0.05], # left
[1.0, 0.0, 0.05], # right
[1.0, 0.0, 0.05], # right
[1.0, 0.0, 0.05], # right
],
[
[0.0, 0.8, 0.0], # throttle
[0.0, 0.0, 0.6], # break
[-0.8, 0.0, 0.0], # left
[-0.5, 0.0, 0.0], # left
[-0.2, 0.0, 0.0], # left
[0.8, 0.0, 0.0], # right
[0.5, 0.0, 0.0], # right
[0.2, 0.0, 0.0], # right
],
[
[0.0, 0.8, 0.0], # throttle
[0.0, 0.0, 0.6], # break
[-0.8, 0.0, 0.0], # left
[0.8, 0.0, 0.0], # right
],
[
[0.0, 0.0, 0.0], # no action
[0.0, 0.8, 0.0], # throttle
[0.0, 0.0, 0.6], # break
[-0.8, 0.0, 0.0], # left
[0.8, 0.0, 0.0], # right
]
]


def __class_getitem__(cls, item):
if item > len(cls.available_actions) - 1:
print('Nobody is driving! Action not found: {0}'.format(item))
return cls.available_actions[0]
else:
return cls.available_actions[item]
def get_action(set_num):
if set_num >= len(action_sets):
assert "Wrong available set num. It should go from 0 to {}".format(len(action_sets) - 1)
return None
return action_sets[set_num]
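A note on the new get_action: asserting a string literal is a no-op, because a non-empty string is always truthy, so an out-of-range set_num silently falls through and returns None. Below is a minimal sketch of a stricter variant, not the PR's code; get_action_strict is a hypothetical name and the stub action_sets stands in for the full tables above.

# Sketch only: reject out-of-range set numbers explicitly instead of
# relying on `assert "..."`, which never fails.
action_sets = [
    [  # stub standing in for the action tables defined above
        [0.0, 0.8, 0.0],   # throttle
        [0.0, 0.0, 0.6],   # brake
        [-0.8, 0.0, 0.0],  # left
        [0.8, 0.0, 0.0],   # right
    ],
]

def get_action_strict(set_num):
    if not 0 <= set_num < len(action_sets):
        raise ValueError("Wrong action set number {}; it should go from 0 to {}".format(
            set_num, len(action_sets) - 1))
    return action_sets[set_num]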
15 changes: 12 additions & 3 deletions environment.py
@@ -1,23 +1,29 @@
import gym
from wrappers.frame_skipper import FrameSkipper
from gym.wrappers import FrameStack, GrayScaleObservation
from gym.wrappers import FrameStack, GrayScaleObservation, Monitor


class CarRacingEnv:

def __init__(self, device, stack_frames=4):
def __init__(self, device, stack_frames=4, train=False):
super().__init__()
self.total_rew = 0
self.state = None
self.done = False
self.device = device
self.train = train

self.env = gym.make("CarRacing-v0")
if not train:
self.env = Monitor(self.env, './video', force=True)
self.env = GrayScaleObservation(self.env)
self.env = FrameStack(self.env, stack_frames)
self.env = FrameSkipper(self.env, 4)
print(self.env.observation_space)

def max_episode_steps(self):
return self.spec().max_episode_steps

def step(self, action):
return self.env.step(action)

@@ -31,6 +37,9 @@ def spec(self):
return self.env.spec

def close(self):
self.close()
self.env.close()

def render(self):
self.env.render()
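The new train flag decides whether the gym Monitor wrapper records video into ./video; the wrapping order is Monitor (eval only), then GrayScaleObservation, FrameStack and FrameSkipper. A minimal usage sketch, assuming reset() is defined in the collapsed part of the file (runner.py calls it):

# Sketch: build the evaluation environment and take one hard-coded action.
import torch
from environment import CarRacingEnv

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
env = CarRacingEnv(device, stack_frames=4, train=False)  # train=False adds the Monitor wrapper
state = env.reset()
state, reward, done, info = env.step([0.0, 0.8, 0.0])    # a throttle action, see actions.py
env.close()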


10 changes: 9 additions & 1 deletion helpers.py
@@ -3,6 +3,7 @@
import glob
import io
import base64
import os
from IPython.display import HTML
from IPython import display as ipythondisplay

Expand All @@ -27,4 +28,11 @@ def display_start():


def save_model(model, path):
torch.save(model.state_dict(), path)
torch.save(model.state_dict(), path)

def create_directory(path):
try:
os.mkdir(path)
print(f'Directory {path} has been created.')
except FileExistsError:
print(f'Directory {path} already exists.')
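create_directory wraps os.mkdir in a try/except to tolerate an existing folder. A terser equivalent (a sketch, not what the PR uses) is os.makedirs with exist_ok=True, which also creates missing parent directories:

import os

def create_directory(path):
    # Same effect without the try/except; exist_ok suppresses FileExistsError.
    os.makedirs(path, exist_ok=True)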
1 change: 1 addition & 0 deletions install/install.sh
@@ -3,6 +3,7 @@
## Install for the GCE, not for the local machine

sudo apt-get install wget
sudo apt-get install build-essential swig gcc libjpeg-dev zlib1g-dev
sudo apt-get install -y xvfb python-opengl ffmpeg
sudo apt-get install xserver-xorg-core xorg-x11-server-Xvfb
sudo apt-get install tigervnc-standalone-server tigervnc-xorg-extension tigervnc-viewer
37 changes: 27 additions & 10 deletions main.py
@@ -1,26 +1,43 @@
import torch

import numpy as np
import helpers
from environment import CarRacingEnv
from trainer import Trainer
from runner import Runner

from pyvirtualdisplay import Display
display = Display(visible=0, size=(1400, 900))
display.start()

# if gpu is to be used
device = torch.device("cuda") if False else torch.device("cpu")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if __name__ == "__main__":
hyperparams = {
'num_episodes': 20000, # Number of training episodes
'lr': 1e-2, # Learning rate
'num_episodes': 200000, # Number of training episodes
'lr': 1e-3, # Learning rate
'gamma': 0.99, # Discount rate
'log_interval': 10, # controls how often we log progress
'stack_frames': 4,
'params_path': './params/policy-params.dl'
'device': device,
'params_path': './params/policy-params.dl',
'action_set_num': 0,
'train': True
}

env = CarRacingEnv(device, hyperparams['stack_frames'])
trainer = Trainer(env, hyperparams)
trainer.train()
# Reproducibility: manual seeding
seed = 1000
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
np.random.seed(seed)

#make sure that params folder exists
helpers.create_directory('params')

env = CarRacingEnv(device, hyperparams['stack_frames'], hyperparams['train'])
helpers.display_start()
if hyperparams['train']:
trainer = Trainer(env, hyperparams)
trainer.train()
else:
runner = Runner(env, hyperparams)
runner.run()
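main.py now dispatches on hyperparams['train']: Trainer for training, Runner for recorded evaluation. A short sketch of flipping to evaluation mode (it assumes a checkpoint already exists at params_path):

# Sketch: evaluation run; Runner loads ./params/policy-params.dl and, because
# train=False, the environment's Monitor wrapper records a video to ./video.
hyperparams['train'] = False
env = CarRacingEnv(device, hyperparams['stack_frames'], hyperparams['train'])
runner = Runner(env, hyperparams)
runner.run()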

1 change: 0 additions & 1 deletion params/.gitignore

This file was deleted.

Binary file modified params/policy-params.dl
53 changes: 42 additions & 11 deletions policy.py
@@ -1,51 +1,82 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
from os import path


class Policy(nn.Module):

def __init__(self, inputs=4, outputs=8):
def __init__(self, actor_output, critic_output, inputs=4):
super(Policy, self).__init__()
self.pipeline = nn.Sequential(
nn.Conv2d(inputs, 32, 3), # [32, 94, 94]
nn.Conv2d(inputs, 12, kernel_size=3, stride=2, padding=1), # [12, 48, 48]
nn.ReLU(),
nn.MaxPool2d(2), # [32, 47, 47]
nn.Conv2d(32, 64, 4), # [64, 44, 44]
nn.MaxPool2d(2), # [12, 24, 24]
nn.Conv2d(12, 24, kernel_size=3), # [24, 22, 22]
nn.ReLU(),
nn.MaxPool2d(2), # [64, 22, 22]
nn.MaxPool2d(2), # [24, 11, 11]
nn.Conv2d(24, 32, 4), # [32, 8, 8]
nn.ReLU(),
nn.MaxPool2d(2), # [32, 4, 4]
nn.Flatten(),
nn.Linear(64 * 22 * 22, 512),
nn.Linear(32 * 4 * 4, 256), # [ 512, 256 ]
nn.ReLU(),
nn.Linear(256, 128),
nn.ReLU(),
nn.Linear(512, outputs),
nn.LogSoftmax(dim=-1)
)

# actor's layer
self.actor_head = nn.Linear(128, actor_output)

# critic's layer
self.critic_head = nn.Linear(128, critic_output)

self.saved_log_probs = []
self.rewards = []
self.entropies = []

def forward(self, x):
return self.pipeline(x)

x= self.pipeline(x)
# actor: choses action to take from state s_t
# by returning probability of each action
action_prob = F.softmax(self.actor_head(x), dim=-1)

# critic: evaluates being in the state s_t
state_values = self.critic_head(x)

# return values for both actor and critic as a tuple of 2 values:
# 1. a list with the probability of each action over the action space
# 2. the value from state s_t
return action_prob, state_values

def load_checkpoint(self, params_path):
epoch = 0
running_reward = 10
optim_params = None
if path.exists(params_path):
params_descriptor = torch.load(params_path)
epoch = 0
running_reward = 0
if 'params' in params_descriptor:
self.load_state_dict(params_descriptor['params'])
optim_params = params_descriptor['optimizer_params']
epoch = params_descriptor['epoch']
running_reward = params_descriptor['running_reward']
else:
self.load_state_dict(params_descriptor)

print("Model params are loaded now")
else:
print("Params not found: training from scratch")

return epoch
return epoch, optim_params, running_reward

def save_checkpoint(self, params_path, epoch):
def save_checkpoint(self, params_path, epoch, running_reward, optimizer):
torch.save({
'epoch': epoch,
'params': self.state_dict(),
'running_reward': running_reward,
'optimizer_params': optimizer.state_dict(),
}, params_path)
print("Relax, params are saved now")
43 changes: 43 additions & 0 deletions runner.py
@@ -0,0 +1,43 @@
import torch
import numpy as np

from policy import Policy
from actions import get_action

class Runner:
def __init__(self, env, config):
super().__init__()
self.env = env
self.config = config
self.input_channels = config['stack_frames']
self.device = config['device']
self.action_set = get_action(config['action_set_num'])
self.policy = Policy(len(self.action_set), 1, self.input_channels).to(self.device)
self.last_epoch, optim_params, self.running_reward = self.policy.load_checkpoint(config['params_path'])
self.optimizer = torch.optim.Adam(self.policy.parameters(), lr=config['lr'])
if optim_params is not None:
self.optimizer.load_state_dict(optim_params)

def select_action(self, state):
if state is None: # First state is always None
# Adding the starting signal as a 0's tensor
state = np.zeros((self.input_channels, 96, 96))
else:
state = np.asarray(state)
state = torch.from_numpy(state).float().unsqueeze(0).view(1, self.input_channels, 96, 96).to(self.device)
probs, state_value = self.policy(state)
# We pick the action from a sample of the probabilities
# It prevents the model from picking always the same action
m = torch.distributions.Categorical(probs)
action = m.sample()
return self.action_set[action.item()]

def run(self):
state, done, total_rew = self.env.reset(), False, 0
while not done:
self.env.render()
action = self.select_action(state)
state, rew, done, info = self.env.step(action)
total_rew += rew
print('Cumulative reward:', total_rew)
self.env.close()
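Runner.select_action samples from a Categorical over the actor probabilities even at evaluation time, so recorded runs remain stochastic. A deterministic alternative (a sketch, not what the PR does) picks the argmax instead; select_action_greedy is a hypothetical helper:

import torch

def select_action_greedy(policy, action_set, state_tensor):
    # Sketch: greedy evaluation, as an alternative to Categorical sampling.
    with torch.no_grad():
        probs, _ = policy(state_tensor)  # actor probabilities, shape (1, num_actions)
    return action_set[int(torch.argmax(probs, dim=-1).item())]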
7 binary files not shown.
15 changes: 0 additions & 15 deletions spec/actions_test.py

This file was deleted.
