Add visualization skills: evaluation mode (#12)

xeviknal · ziritrion · web-flow · commit caf074408465 · 2021-02-25T19:58:16.000+01:00
Co-authored-by: ziritrion <ziritrion@gmail.com>
diff --git a/environment.py b/environment.py
@@ -1,18 +1,21 @@
 import gym
 from wrappers.frame_skipper import FrameSkipper
-from gym.wrappers import FrameStack, GrayScaleObservation
+from gym.wrappers import FrameStack, GrayScaleObservation, Monitor
 
 
 class CarRacingEnv:
 
-    def __init__(self, device, stack_frames=4):
+    def __init__(self, device, stack_frames=4, train=False):
         super().__init__()
         self.total_rew = 0
         self.state = None
         self.done = False
         self.device = device
+        self.train = train
 
         self.env = gym.make("CarRacing-v0")
+        if not train:
+            self.env = Monitor(self.env, './video', force=True)
         self.env = GrayScaleObservation(self.env)
         self.env = FrameStack(self.env, stack_frames)
         self.env = FrameSkipper(self.env, 4)
@@ -31,6 +34,9 @@ def spec(self):
         return self.env.spec
 
     def close(self):
-        self.close()
+        self.env.close()
+
+    def render(self):
+        self.env.render()
 
 
diff --git a/main.py b/main.py
@@ -1,27 +1,33 @@
 import torch
 
+import helpers
 from environment import CarRacingEnv
 from trainer import Trainer
+from runner import Runner
 
 from pyvirtualdisplay import Display
-display = Display(visible=0, size=(1400, 900))
-display.start()
 
 # if gpu is to be used
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
 if __name__ == "__main__":
     hyperparams = {
-        'num_episodes': 20000,  # Number of training episodes
+        'num_episodes': 40000,  # Number of training episodes
         'lr': 1e-2,  # Learning rate
         'gamma': 0.99,  # Discount rate
         'log_interval': 5,  # controls how often we log progress
         'stack_frames': 4,
         'device': device,
-        'params_path': './params/policy-params.dl'
+        'params_path': './params/policy-params.dl',
+        'train': True
     }
 
-    env = CarRacingEnv(device, hyperparams['stack_frames'])
-    trainer = Trainer(env, hyperparams)
-    trainer.train()
+    env = CarRacingEnv(device, hyperparams['stack_frames'], hyperparams['train'])
+    helpers.display_start()
+    if(hyperparams['train']):
+        trainer = Trainer(env, hyperparams)
+        trainer.train()
+    else:
+        runner = Runner(env, hyperparams)
+        runner.run()
 
diff --git a/runner.py b/runner.py
@@ -0,0 +1,39 @@
+import torch
+import numpy as np
+
+from policy import Policy
+from actions import available_actions
+
+class Runner:
+    def __init__(self, env, config):
+        super().__init__()
+        self.env = env
+        self.config = config
+        self.input_channels = config['stack_frames']
+        #self.device = config['device']
+        self.policy = Policy(self.input_channels, len(available_actions))
+        self.policy.load_checkpoint(config['params_path'])
+
+    def select_action(self, state):
+        if state is None:  # First state is always None
+            # Adding the starting signal as a 0's tensor
+            state = np.zeros((self.input_channels, 96, 96))
+        else:
+            state = np.asarray(state)
+        state = torch.from_numpy(state).float().unsqueeze(0)
+        probs = self.policy(state)
+        # We pick the action from a sample of the probabilities
+        # It prevents the model from picking always the same action
+        m = torch.distributions.Categorical(probs)
+        action = m.sample()
+        return available_actions[action.item()]
+
+    def run(self):
+        state, done, total_rew = self.env.reset(), False, 0
+        while not done:
+            self.env.render()
+            action = self.select_action(state)
+            state, rew, done, info = self.env.step(action)
+            total_rew += rew
+        print('Cumulative reward:', total_rew)
+        self.env.close()