38 commits
358087d
log_interval 5
jaimepedretp Feb 5, 2021
3635b91
Merge pull request #2 from xeviknal/with-baseline
ziritrion Feb 5, 2021
2ddf6bd
Merge branch 'main' of https://github.com/xeviknal/aidl-2021-wo-rl in…
jaimepedretp Feb 8, 2021
a7ac921
RL-with baseline. JuanJo's proposal added
jaimepedretp Feb 8, 2021
67bea13
Porting differences to RL-with-baseline (#9)
xeviknal Feb 19, 2021
472e1f7
Fixed trainer.py for GPU execution and modified main.py to run in Ubuntu
ziritrion Feb 20, 2021
caf0744
Add visualization skills: evaluation mode (#12)
xeviknal Feb 25, 2021
4d7130e
Finished 30k runs; reward was not significantly improved (#13)
xeviknal Mar 11, 2021
2106b4b
Add metrics and logsoftmax
xeviknal Mar 2, 2021
0f2f195
Updating the model
xeviknal Mar 11, 2021
39b3ad3
Add new model to baseline
xeviknal Mar 11, 2021
907af7a
The line that fixes all
xeviknal Mar 14, 2021
8e4ee6c
Add mean entropy - to reduce tensorboard runs
xeviknal Mar 14, 2021
3970702
Add action prob mean: mean of prob of actions taken in the episode
xeviknal Mar 14, 2021
957a3b4
Added simple directory check to params folder
ziritrion Mar 14, 2021
0105564
Added additional param save conditions (end of log_interval, last epi…
ziritrion Mar 14, 2021
b5a5184
Merge branch 'RL-baseline-new-model' of github.com:xeviknal/aidl-2021…
ziritrion Mar 14, 2021
c6954ec
Removing old runs; they don't apply to this branch
ziritrion Mar 14, 2021
a7c907c
RL-baseline-NM-save-optim
xeviknal Mar 14, 2021
d5b676c
Load optimizer params
xeviknal Mar 14, 2021
bd7f6c0
8k runs
ziritrion Mar 15, 2021
5f246c0
Fresh start with latest checkpoint load-save changes. Also, small git…
ziritrion Mar 15, 2021
e8aa5e4
bugfix
ziritrion Mar 15, 2021
c52a4f2
10k runs
ziritrion Mar 15, 2021
192bece
Almost 20k runs. Reward is starting to improve little by little
ziritrion Mar 16, 2021
11b46c4
Fixed runner.py for generating videos
ziritrion Mar 16, 2021
befd201
25k runs. Slight improvement but far from desirable
ziritrion Mar 16, 2021
970f84a
10k episodes. Learning rate 1e-3. Original actions. RR of almost 700 …
ziritrion Mar 19, 2021
1038b29
Added new action set and removed previous runs for fresh start
ziritrion Mar 19, 2021
1204087
5k runs, reward around 450
ziritrion Mar 19, 2021
b44f017
20k episodes. Running reward 513
ziritrion Mar 23, 2021
d3dda5a
Run up to 33.7k episodes using actions set 3 - max/final av. reward 8…
jaimepedretp Apr 6, 2021
447dd0c
added video eval mode - using model max/final av. reward 881/825
jaimepedretp Apr 6, 2021
9d0e23f
Run from scratch on RL-baseline-new-model-exp1-act3 - 8.1k episodes m…
jaimepedretp Apr 7, 2021
db82d3e
Added video eval mode - 8.1k episodes max/final av.reward 868/815
jaimepedretp Apr 7, 2021
b1e6fe5
Final training - rl-baseline-exp2 _ 6.4k episodes - best/final reward…
jaimepedretp Apr 12, 2021
74b7903
Runs up to 20.6k
jaimepedretp Apr 14, 2021
5990457
Added video eval model 926reward - rl-baseline-exp2
jaimepedretp Apr 16, 2021
6 changes: 6 additions & 0 deletions .gitignore
@@ -134,3 +134,9 @@ dmypy.json

# IntelliJ - Pycharm
.idea/

# MacOS bullshit
.DS_Store

# nohup log files
nohup.out
59 changes: 40 additions & 19 deletions actions.py
@@ -1,23 +1,44 @@
class Actions:

available_actions = [
[0.0, 0.7, 0.0], # throttle
[0.0, 0.5, 0.0], # throttle
[0.0, 0.2, 0.0], # throttle
[0.0, 0.0, 0.7], # break
action_sets = [
[
[0.0, 0.3, 0.0], # throttle
[0.0, 0.1, 0.0], # throttle
[0.0, 0.0, 0.5], # break
[0.0, 0.0, 0.2], # break
[-0.8, 0.1, 0.0], # left
[-0.5, 0.1, 0.0], # left
[-0.2, 0.1, 0.0], # left
[0.8, 0.1, 0.0], # right
[0.5, 0.1, 0.0], # right
[0.2, 0.1, 0.0], # right
[-1.0, 0.0, 0.05], # left
[-1.0, 0.0, 0.05], # left
[-1.0, 0.0, 0.05], # left
[1.0, 0.0, 0.05], # right
[1.0, 0.0, 0.05], # right
[1.0, 0.0, 0.05], # right
],
[
[0.0, 0.8, 0.0], # throttle
[0.0, 0.0, 0.6], # break
[-0.8, 0.0, 0.0], # left
[-0.5, 0.0, 0.0], # left
[-0.2, 0.0, 0.0], # left
[0.8, 0.0, 0.0], # right
[0.5, 0.0, 0.0], # right
[0.2, 0.0, 0.0], # right
],
[
[0.0, 0.8, 0.0], # throttle
[0.0, 0.0, 0.6], # break
[-0.8, 0.0, 0.0], # left
[0.8, 0.0, 0.0], # right
],
[
[0.0, 0.0, 0.0], # no action
[0.0, 0.8, 0.0], # throttle
[0.0, 0.0, 0.6], # break
[-0.8, 0.0, 0.0], # left
[0.8, 0.0, 0.0], # right
]
]


def __class_getitem__(cls, item):
if item > len(cls.available_actions) - 1:
print('Nobody is driving! Action not found: {0}'.format(item))
return cls.available_actions[0]
else:
return cls.available_actions[item]
def get_action(set_num):
if set_num >= len(action_sets):
assert "Wrong available set num. It should go from 0 to {}".format(len(action_sets) - 1)
return None
return action_sets[set_num]
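A note on the new get_action: asserting a string literal is a no-op, because a non-empty string is always truthy, so an out-of-range set_num silently falls through and returns None. Below is a minimal sketch of a stricter variant, not the PR's code; get_action_strict is a hypothetical name and the stub action_sets stands in for the full tables above.

# Sketch only: reject out-of-range set numbers explicitly instead of
# relying on `assert "..."`, which never fails.
action_sets = [
    [  # stub standing in for the action tables defined above
        [0.0, 0.8, 0.0],   # throttle
        [0.0, 0.0, 0.6],   # brake
        [-0.8, 0.0, 0.0],  # left
        [0.8, 0.0, 0.0],   # right
    ],
]

def get_action_strict(set_num):
    if not 0 <= set_num < len(action_sets):
        raise ValueError("Wrong action set number {}; it should go from 0 to {}".format(
            set_num, len(action_sets) - 1))
    return action_sets[set_num]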
15 changes: 12 additions & 3 deletions environment.py
@@ -1,23 +1,29 @@
import gym
from wrappers.frame_skipper import FrameSkipper
from gym.wrappers import FrameStack, GrayScaleObservation
from gym.wrappers import FrameStack, GrayScaleObservation, Monitor


class CarRacingEnv:

def __init__(self, device, stack_frames=4):
def __init__(self, device, stack_frames=4, train=False):
super().__init__()
self.total_rew = 0
self.state = None
self.done = False
self.device = device
self.train = train

self.env = gym.make("CarRacing-v0")
if not train:
self.env = Monitor(self.env, './video', force=True)
self.env = GrayScaleObservation(self.env)
self.env = FrameStack(self.env, stack_frames)
self.env = FrameSkipper(self.env, 4)
print(self.env.observation_space)

def max_episode_steps(self):
return self.spec().max_episode_steps

def step(self, action):
return self.env.step(action)

@@ -31,6 +37,9 @@ def spec(self):
return self.env.spec

def close(self):
self.close()
self.env.close()

def render(self):
self.env.render()
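The new train flag decides whether the gym Monitor wrapper records video into ./video; the wrapping order is Monitor (eval only), then GrayScaleObservation, FrameStack and FrameSkipper. A minimal usage sketch, assuming reset() is defined in the collapsed part of the file (runner.py calls it):

# Sketch: build the evaluation environment and take one hard-coded action.
import torch
from environment import CarRacingEnv

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
env = CarRacingEnv(device, stack_frames=4, train=False)  # train=False adds the Monitor wrapper
state = env.reset()
state, reward, done, info = env.step([0.0, 0.8, 0.0])    # a throttle action, see actions.py
env.close()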


10 changes: 9 additions & 1 deletion helpers.py
@@ -3,6 +3,7 @@
import glob
import io
import base64
import os
from IPython.display import HTML
from IPython import display as ipythondisplay

Expand All @@ -27,4 +28,11 @@ def display_start():


def save_model(model, path):
torch.save(model.state_dict(), path)
torch.save(model.state_dict(), path)

def create_directory(path):
try:
os.mkdir(path)
print(f'Directory {path} has been created.')
except FileExistsError:
print(f'Directory {path} already exists.')
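create_directory wraps os.mkdir in a try/except to tolerate an existing folder. A terser equivalent (a sketch, not what the PR uses) is os.makedirs with exist_ok=True, which also creates missing parent directories:

import os

def create_directory(path):
    # Same effect without the try/except; exist_ok suppresses FileExistsError.
    os.makedirs(path, exist_ok=True)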
1 change: 1 addition & 0 deletions install/install.sh
@@ -3,6 +3,7 @@
## Install for the GCE, not for the local machine

sudo apt-get install wget
sudo apt-get install build-essential swig gcc libjpeg-dev zlib1g-dev
sudo apt-get install -y xvfb python-opengl ffmpeg
sudo apt-get install xserver-xorg-core xorg-x11-server-Xvfb
sudo apt-get install tigervnc-standalone-server tigervnc-xorg-extension tigervnc-viewer
37 changes: 27 additions & 10 deletions main.py
@@ -1,26 +1,43 @@
import torch

import numpy as np
import helpers
from environment import CarRacingEnv
from trainer import Trainer
from runner import Runner

from pyvirtualdisplay import Display
display = Display(visible=0, size=(1400, 900))
display.start()

# if gpu is to be used
device = torch.device("cuda") if False else torch.device("cpu")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if __name__ == "__main__":
hyperparams = {
'num_episodes': 20000, # Number of training episodes
'lr': 1e-2, # Learning rate
'num_episodes': 200000, # Number of training episodes
'lr': 1e-3, # Learning rate
'gamma': 0.99, # Discount rate
'log_interval': 10, # controls how often we log progress
'stack_frames': 4,
'params_path': './params/policy-params.dl'
'device': device,
'params_path': './params/policy-params.dl',
'action_set_num': 0,
'train': True
}

env = CarRacingEnv(device, hyperparams['stack_frames'])
trainer = Trainer(env, hyperparams)
trainer.train()
# Reproducibility: manual seeding
seed = 1000
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
np.random.seed(seed)

#make sure that params folder exists
helpers.create_directory('params')

env = CarRacingEnv(device, hyperparams['stack_frames'], hyperparams['train'])
helpers.display_start()
if hyperparams['train']:
trainer = Trainer(env, hyperparams)
trainer.train()
else:
runner = Runner(env, hyperparams)
runner.run()
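main.py now dispatches on hyperparams['train']: Trainer for training, Runner for recorded evaluation. A short sketch of flipping to evaluation mode (it assumes a checkpoint already exists at params_path):

# Sketch: evaluation run; Runner loads ./params/policy-params.dl and, because
# train=False, the environment's Monitor wrapper records a video to ./video.
hyperparams['train'] = False
env = CarRacingEnv(device, hyperparams['stack_frames'], hyperparams['train'])
runner = Runner(env, hyperparams)
runner.run()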

1 change: 0 additions & 1 deletion params/.gitignore

This file was deleted.

Binary file modified params/policy-params.dl
53 changes: 42 additions & 11 deletions policy.py
@@ -1,51 +1,82 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
from os import path


class Policy(nn.Module):

def __init__(self, inputs=4, outputs=8):
def __init__(self, actor_output, critic_output, inputs=4):
super(Policy, self).__init__()
self.pipeline = nn.Sequential(
nn.Conv2d(inputs, 32, 3), # [32, 94, 94]
nn.Conv2d(inputs, 12, kernel_size=3, stride=2, padding=1), # [12, 48, 48]
nn.ReLU(),
nn.MaxPool2d(2), # [32, 47, 47]
nn.Conv2d(32, 64, 4), # [64, 44, 44]
nn.MaxPool2d(2), # [12, 24, 24]
nn.Conv2d(12, 24, kernel_size=3), # [24, 22, 22]
nn.ReLU(),
nn.MaxPool2d(2), # [64, 22, 22]
nn.MaxPool2d(2), # [24, 11, 11]
nn.Conv2d(24, 32, 4), # [32, 8, 8]
nn.ReLU(),
nn.MaxPool2d(2), # [32, 4, 4]
nn.Flatten(),
nn.Linear(64 * 22 * 22, 512),
nn.Linear(32 * 4 * 4, 256), # [ 512, 256 ]
nn.ReLU(),
nn.Linear(256, 128),
nn.ReLU(),
nn.Linear(512, outputs),
nn.LogSoftmax(dim=-1)
)

# actor's layer
self.actor_head = nn.Linear(128, actor_output)

# critic's layer
self.critic_head = nn.Linear(128, critic_output)

self.saved_log_probs = []
self.rewards = []
self.entropies = []

def forward(self, x):
return self.pipeline(x)

x= self.pipeline(x)
# actor: choses action to take from state s_t
# by returning probability of each action
action_prob = F.softmax(self.actor_head(x), dim=-1)

# critic: evaluates being in the state s_t
state_values = self.critic_head(x)

# return values for both actor and critic as a tuple of 2 values:
# 1. a list with the probability of each action over the action space
# 2. the value from state s_t
return action_prob, state_values

def load_checkpoint(self, params_path):
epoch = 0
running_reward = 10
optim_params = None
if path.exists(params_path):
params_descriptor = torch.load(params_path)
epoch = 0
running_reward = 0
if 'params' in params_descriptor:
self.load_state_dict(params_descriptor['params'])
optim_params = params_descriptor['optimizer_params']
epoch = params_descriptor['epoch']
running_reward = params_descriptor['running_reward']
else:
self.load_state_dict(params_descriptor)

print("Model params are loaded now")
else:
print("Params not found: training from scratch")

return epoch
return epoch, optim_params, running_reward

def save_checkpoint(self, params_path, epoch):
def save_checkpoint(self, params_path, epoch, running_reward, optimizer):
torch.save({
'epoch': epoch,
'params': self.state_dict(),
'running_reward': running_reward,
'optimizer_params': optimizer.state_dict(),
}, params_path)
print("Relax, params are saved now")
43 changes: 43 additions & 0 deletions runner.py
@@ -0,0 +1,43 @@
import torch
import numpy as np

from policy import Policy
from actions import get_action

class Runner:
def __init__(self, env, config):
super().__init__()
self.env = env
self.config = config
self.input_channels = config['stack_frames']
self.device = config['device']
self.action_set = get_action(config['action_set_num'])
self.policy = Policy(len(self.action_set), 1, self.input_channels).to(self.device)
self.last_epoch, optim_params, self.running_reward = self.policy.load_checkpoint(config['params_path'])
self.optimizer = torch.optim.Adam(self.policy.parameters(), lr=config['lr'])
if optim_params is not None:
self.optimizer.load_state_dict(optim_params)

def select_action(self, state):
if state is None: # First state is always None
# Adding the starting signal as a 0's tensor
state = np.zeros((self.input_channels, 96, 96))
else:
state = np.asarray(state)
state = torch.from_numpy(state).float().unsqueeze(0).view(1, self.input_channels, 96, 96).to(self.device)
probs, state_value = self.policy(state)
# We pick the action from a sample of the probabilities
# It prevents the model from picking always the same action
m = torch.distributions.Categorical(probs)
action = m.sample()
return self.action_set[action.item()]

def run(self):
state, done, total_rew = self.env.reset(), False, 0
while not done:
self.env.render()
action = self.select_action(state)
state, rew, done, info = self.env.step(action)
total_rew += rew
print('Cumulative reward:', total_rew)
self.env.close()
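Runner.select_action samples from a Categorical over the actor probabilities even at evaluation time, so recorded runs remain stochastic. A deterministic alternative (a sketch, not what the PR does) picks the argmax instead; select_action_greedy is a hypothetical helper:

import torch

def select_action_greedy(policy, action_set, state_tensor):
    # Sketch: greedy evaluation, as an alternative to Categorical sampling.
    with torch.no_grad():
        probs, _ = policy(state_tensor)  # actor probabilities, shape (1, num_actions)
    return action_set[int(torch.argmax(probs, dim=-1).item())]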
7 binary files not shown.
15 changes: 0 additions & 15 deletions spec/actions_test.py

This file was deleted.
