2 changes: 1 addition & 1 deletion cleanrl/ppg_procgen.py
@@ -249,7 +249,7 @@ def get_pi(self, x):
envs = gym.wrappers.RecordEpisodeStatistics(envs)
if args.capture_video:
envs = gym.wrappers.RecordVideo(envs, f"videos/{run_name}")
- envs = gym.wrappers.NormalizeReward(envs)
+ envs = gym.wrappers.NormalizeReward(envs, gamma=args.gamma)
envs = gym.wrappers.TransformReward(envs, lambda reward: np.clip(reward, -10, 10))
assert isinstance(envs.single_action_space, gym.spaces.Discrete), "only discrete action space is supported"

6 changes: 3 additions & 3 deletions cleanrl/ppo_continuous_action.py
@@ -79,7 +79,7 @@ def parse_args():
return args


- def make_env(env_id, seed, idx, capture_video, run_name):
+ def make_env(env_id, seed, idx, capture_video, run_name, gamma):
def thunk():
env = gym.make(env_id)
env = gym.wrappers.RecordEpisodeStatistics(env)
@@ -89,7 +89,7 @@ def thunk():
env = gym.wrappers.ClipAction(env)
env = gym.wrappers.NormalizeObservation(env)
env = gym.wrappers.TransformObservation(env, lambda obs: np.clip(obs, -10, 10))
- env = gym.wrappers.NormalizeReward(env)
+ env = gym.wrappers.NormalizeReward(env, gamma=gamma)
env = gym.wrappers.TransformReward(env, lambda reward: np.clip(reward, -10, 10))
env.seed(seed)
env.action_space.seed(seed)
@@ -168,7 +168,7 @@ def get_action_and_value(self, x, action=None):

# env setup
envs = gym.vector.SyncVectorEnv(
- [make_env(args.env_id, args.seed + i, i, args.capture_video, run_name) for i in range(args.num_envs)]
+ [make_env(args.env_id, args.seed + i, i, args.capture_video, run_name, args.gamma) for i in range(args.num_envs)]
)
assert isinstance(envs.single_action_space, gym.spaces.Box), "only continuous action space is supported"

2 changes: 1 addition & 1 deletion cleanrl/ppo_procgen.py
@@ -193,7 +193,7 @@ def get_action_and_value(self, x, action=None):
envs = gym.wrappers.RecordEpisodeStatistics(envs)
if args.capture_video:
envs = gym.wrappers.RecordVideo(envs, f"videos/{run_name}")
- envs = gym.wrappers.NormalizeReward(envs)
+ envs = gym.wrappers.NormalizeReward(envs, gamma=args.gamma)
envs = gym.wrappers.TransformReward(envs, lambda reward: np.clip(reward, -10, 10))
assert isinstance(envs.single_action_space, gym.spaces.Discrete), "only discrete action space is supported"

7 changes: 4 additions & 3 deletions docs/rl-algorithms/ppg.md
@@ -77,6 +77,7 @@ PPG specific:
* Fully connected layer after last conv layer - 1.4
* Convolutional layers - Approximately 0.638
1. The Adam Optimizer's Epsilon Parameter (:material-github: [phasic_policy_gradient/ppg.py#L239](https://github.com/openai/phasic-policy-gradient/blob/c789b00be58aa704f7223b6fc8cd28a5aaa2e101/phasic_policy_gradient/ppg.py#L239)) - Set to the torch default of 1e-8 instead of the 1e-5 used in PPO.
+ 1. Use the same `gamma` parameter in the `NormalizeReward` wrapper as the agent's discount factor. Note that the original implementation from [openai/train-procgen](https://github.com/openai/train-procgen) uses the default `gamma=0.99` in [the `VecNormalize` wrapper](https://github.com/openai/train-procgen/blob/1a2ae2194a61f76a733a39339530401c024c3ad8/train_procgen/train.py#L43) but `gamma=0.999` as PPO's discount factor. The mismatch between the two `gamma`s is technically incorrect. See [#209](https://github.com/vwxyzjn/cleanrl/pull/209)
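
To make the note above concrete, here is a rough sketch of how `gym`'s `NormalizeReward` uses `gamma` (a simplified re-implementation for illustration, not the library's actual code): the wrapper keeps a per-environment running *discounted* return and divides each reward by the running standard deviation of that return. If the wrapper's `gamma` differs from the agent's discount factor, rewards are scaled against statistics of a return the agent never optimizes.

```python
import numpy as np


class RewardNormalizerSketch:
    """Simplified sketch of reward normalization in the spirit of gym.wrappers.NormalizeReward."""

    def __init__(self, num_envs, gamma=0.99, epsilon=1e-8):
        self.gamma = gamma                    # should match the agent's discount factor
        self.epsilon = epsilon
        self.returns = np.zeros(num_envs)     # running discounted return per env
        self.mean, self.var, self.count = 0.0, 1.0, 1e-4  # running stats of the returns

    def __call__(self, rewards, dones):
        # reset the running return where an episode ended, then accumulate with gamma
        self.returns = self.returns * self.gamma * (1.0 - dones) + rewards
        self._update_stats(self.returns)
        # scale rewards by the std of the discounted return (the mean is not subtracted)
        return rewards / np.sqrt(self.var + self.epsilon)

    def _update_stats(self, batch):
        # parallel running mean/variance update over the discounted returns
        batch_mean, batch_var, batch_count = batch.mean(), batch.var(), batch.size
        delta, total = batch_mean - self.mean, self.count + batch_count
        m2 = (self.var * self.count + batch_var * batch_count
              + delta ** 2 * self.count * batch_count / total)
        self.mean, self.var, self.count = self.mean + delta * batch_count / total, m2 / total, total
```

With `gamma=0.999` in the agent but the wrapper left at its default `gamma=0.99`, the reward scale is driven by a return the agent does not actually optimize, which is exactly the inconsistency the change above removes.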

Here are some additional notes:

@@ -105,9 +106,9 @@ Below are the average episodic returns for `ppg_procgen.py`, and comparison with

| Environment | `ppg_procgen.py` | `ppo_procgen.py` | `openai/phasic-policy-gradient` (easy) |
|------------------|------------------|------------------|----------------------------------------|
- | Starpilot (easy) | 35.19 ± 13.07 | 33.15 ± 11.99 | 42.01 ± 9.59 |
- | Bossfight (easy) | 10.34 ± 2.27 | 9.48 ± 2.42 | 10.71 ± 2.05 |
- | Bigfish (easy) | 27.25 ± 7.55 | 22.21 ± 7.42 | 15.94 ± 10.80 |
+ | Starpilot (easy) | 34.82 ± 13.77 | 32.47 ± 11.21 | 42.01 ± 9.59 |
+ | Bossfight (easy) | 10.78 ± 1.90 | 9.63 ± 2.35 | 10.71 ± 2.05 |
+ | Bigfish (easy) | 24.23 ± 10.73 | 16.80 ± 9.49 | 15.94 ± 10.80 |


???+ warning
Binary file modified docs/rl-algorithms/ppg/BigFish.png
Binary file modified docs/rl-algorithms/ppg/BossFight.png
Binary file modified docs/rl-algorithms/ppg/StarPilot.png
Binary file modified docs/rl-algorithms/ppg/comparison/BigFish.png
Binary file modified docs/rl-algorithms/ppg/comparison/BossFight.png
Binary file modified docs/rl-algorithms/ppg/comparison/StarPilot.png
8 changes: 4 additions & 4 deletions docs/rl-algorithms/ppo.md
@@ -444,7 +444,7 @@ See [related docs](/rl-algorithms/ppo/#explanation-of-the-logged-metrics) for `p
[ppo_procgen.py](https://github.com/vwxyzjn/cleanrl/blob/master/cleanrl/ppo_procgen.py) is based on the details in "Appendix" in [The 37 Implementation Details of Proximal Policy Optimization](https://iclr-blog-track.github.io/2022/03/25/ppo-implementation-details/), which are as follows:

1. IMPALA-style Neural Network (:material-github: [common/models.py#L28](https://github.com/openai/baselines/blob/ea25b9e8b234e6ee1bca43083f8f3cf974143998/baselines/common/models.py#L28))

+ 1. Use the same `gamma` parameter in the `NormalizeReward` wrapper as PPO's discount factor. Note that the original implementation from [openai/train-procgen](https://github.com/openai/train-procgen) uses the default `gamma=0.99` in [the `VecNormalize` wrapper](https://github.com/openai/train-procgen/blob/1a2ae2194a61f76a733a39339530401c024c3ad8/train_procgen/train.py#L43) but `gamma=0.999` as PPO's discount factor. The mismatch between the two `gamma`s is technically incorrect. See [#209](https://github.com/vwxyzjn/cleanrl/pull/209)
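
As a usage-level sketch of the same point (a hypothetical snippet: `CartPole-v1` is only a stand-in environment and `GAMMA` an assumed constant, whereas the scripts above read the value from `args.gamma`), the fix amounts to threading one discount factor through both the reward-normalization wrapper and the return computation:

```python
import gym
import numpy as np

GAMMA = 0.999  # single source of truth for the discount factor

env = gym.make("CartPole-v1")  # stand-in env; the actual scripts wrap Procgen envs
env = gym.wrappers.RecordEpisodeStatistics(env)
env = gym.wrappers.NormalizeReward(env, gamma=GAMMA)  # same gamma as the agent...
env = gym.wrappers.TransformReward(env, lambda reward: np.clip(reward, -10, 10))


def discounted_returns(rewards, gamma=GAMMA):
    # ...and the same gamma when computing returns/advantages (plain Monte Carlo
    # returns shown here instead of GAE, just to illustrate the shared constant)
    out, running = [], 0.0
    for r in reversed(rewards):
        running = r + gamma * running
        out.append(running)
    return out[::-1]
```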

### Experiment results

@@ -465,9 +465,9 @@ Below are the average episodic returns for `ppo_procgen.py`. To ensure the quali

| Environment | `ppo_procgen.py` | `openai/baselines`' PPO (Huang et al., 2022)[^1] |
| ----------- | ----------- | ----------- |
- | StarPilot (easy) | 31.40 ± 11.73 | 33.97 ± 7.86 |
- | BossFight (easy) | 9.09 ± 2.35 | 9.35 ± 2.04 |
- | BigFish (easy) | 21.44 ± 6.73 | 20.06 ± 5.34 |
+ | StarPilot (easy) | 32.47 ± 11.21 | 33.97 ± 7.86 |
+ | BossFight (easy) | 9.63 ± 2.35 | 9.35 ± 2.04 |
+ | BigFish (easy) | 16.80 ± 9.49 | 20.06 ± 5.34 |


???+ info
Binary file modified docs/rl-algorithms/ppo/BigFish.png
Binary file modified docs/rl-algorithms/ppo/BossFight.png
Binary file modified docs/rl-algorithms/ppo/StarPilot.png