Commit 8ab0868

[RLlib] MetricsLogger: Fix get/set_state to handle tensors in self.values. (#53514)
1 parent eed7e02 commit 8ab0868
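
Per the commit title, the fix makes MetricsLogger.get_state() and set_state() handle framework tensors stored in self.values. The MetricsLogger diff itself is not among the excerpts shown below; what follows is a minimal sketch of the underlying idea only, using a hypothetical TensorSafeLogger stand-in rather than RLlib's actual implementation, and assuming the intent is to detach tensors to numpy so the extracted state stays picklable:

import torch


class TensorSafeLogger:
    """Hypothetical stand-in; not RLlib's actual MetricsLogger."""

    def __init__(self):
        # Maps metric keys to scalars, numpy arrays, or torch tensors.
        self.values = {}

    def get_state(self):
        # Detach any torch tensors and convert them to numpy so the
        # returned state is picklable and framework-independent.
        return {
            key: (
                value.detach().cpu().numpy()
                if isinstance(value, torch.Tensor)
                else value
            )
            for key, value in self.values.items()
        }

    def set_state(self, state):
        # Restored values are plain numpy/scalars; downstream code can
        # re-wrap them into tensors where needed.
        self.values = dict(state)

With this shape, a state dict that originally held a GPU tensor survives a pickle round-trip on a machine without that device.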

File tree

5 files changed: +92 -73 lines changed


rllib/BUILD

Lines changed: 1 addition & 1 deletion

@@ -4209,7 +4209,7 @@ py_test(
         "--num-agents=2",
         "--as-test",
         "--evaluation-parallel-to-training",
-        "--stop-reward=900.0",
+        "--stop-reward=800.0",
         "--num-cpus=6",
         "--evaluation-duration=auto",
         "--evaluation-duration-unit=episodes",

rllib/algorithms/algorithm_config.py

Lines changed: 6 additions & 6 deletions

@@ -4349,10 +4349,10 @@ def get_default_learner_class(self) -> Union[Type["Learner"], str]:
     def get_rl_module_spec(
         self,
         env: Optional[EnvType] = None,
-        spaces: Optional[Dict[str, gym.Space]] = None,
+        spaces: Optional[Dict[str, Tuple[gym.Space, gym.Space]]] = None,
         inference_only: Optional[bool] = None,
     ) -> RLModuleSpec:
-        """Returns the RLModuleSpec based on the given env/spaces.
+        """Returns the RLModuleSpec based on the given env/spaces and this config.

         Args:
             env: An optional environment instance, from which to infer the observation-
@@ -4363,10 +4363,10 @@ def get_rl_module_spec(
             spaces: Optional dict mapping ModuleIDs to 2-tuples of observation- and
                 action space that should be used for the respective RLModule.
                 These spaces are usually provided by an already instantiated remote
-                EnvRunner (call `EnvRunner.get_spaces()`). If not provided, tries
-                to infer from `env`, otherwise from `self.observation_space` and
-                `self.action_space`. Raises an error, if no information on spaces can be
-                inferred.
+                EnvRunner (call `EnvRunner.get_spaces()` to receive this dict). If not
+                provided, RLlib tries to infer this from `env`, if provided, otherwise
+                from `self.observation_space` and `self.action_space`. Raises an error,
+                if no information on spaces can be inferred.
             inference_only: If `True`, the returned module spec is used in an
                 inference-only setting (sampling) and the RLModule can thus be built in
                 its light version (if available). For example, the `inference_only`
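
A hedged usage sketch for the updated signature, in which `spaces` maps ModuleIDs to (observation space, action space) 2-tuples. The environment, module ID, and spaces below are illustrative only, not taken from this commit:

import gymnasium as gym
from ray.rllib.algorithms.ppo import PPOConfig

config = PPOConfig().environment("CartPole-v1")
# One 2-tuple of (observation_space, action_space) per ModuleID.
spaces = {
    "default_policy": (
        gym.spaces.Box(-1.0, 1.0, (4,)),
        gym.spaces.Discrete(2),
    ),
}
spec = config.get_rl_module_spec(spaces=spaces, inference_only=True)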

rllib/algorithms/dqn/torch/dqn_torch_learner.py

Lines changed: 2 additions & 2 deletions

@@ -247,13 +247,13 @@ def possibly_masked_max(data_):
             key=module_id,
             window=1,  # <- single items (should not be mean/ema-reduced over time).
         )
-        # If we learn a Q-value distribution store the support and average
+        # If we learn a Q-value distribution log the support and average
         # probabilities.
         if config.num_atoms > 1:
             # Log important loss stats.
             self.metrics.log_dict(
                 {
-                    ATOMS: z,
+                    ATOMS: torch.mean(z),
                     # The absolute difference in expectation between the actions
                     # should (at least mildly) rise.
                     "expectations_abs_diff": torch.mean(

rllib/examples/evaluation/evaluation_parallel_to_training.py

Lines changed: 4 additions & 1 deletion

@@ -88,7 +88,10 @@
 from ray.rllib.utils.typing import ResultDict
 from ray.tune.registry import get_trainable_cls, register_env

-parser = add_rllib_example_script_args(default_reward=500.0)
+parser = add_rllib_example_script_args(
+    default_timesteps=200000,
+    default_reward=500.0,
+)
 parser.set_defaults(
     evaluation_num_env_runners=2,
     evaluation_interval=1,
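
For context, `add_rllib_example_script_args` wires these defaults into the example script's stop criteria. A hedged sketch of how the added `default_timesteps` surfaces; the attribute names are an assumption inferred from the `--stop-reward` flag seen in rllib/BUILD above, not confirmed by this commit:

from ray.rllib.utils.test_utils import add_rllib_example_script_args

parser = add_rllib_example_script_args(
    default_timesteps=200000,
    default_reward=500.0,
)
args = parser.parse_args([])
# Assumed: these defaults back the --stop-timesteps / --stop-reward flags.
print(args.stop_timesteps, args.stop_reward)  # -> 200000 500.0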
