Commit 1bffb1f

[RLlib] Enhance ConnectorV2 ObservationPreprocessor APIs (add multi-agent support; add episode arg). (#54209)
1 parent c07376b commit 1bffb1f

File tree: 9 files changed (+467 -305 lines)

doc/source/rllib/rllib-examples.rst (13 additions, 3 deletions)

@@ -134,13 +134,21 @@ Connectors
   This type of filtering can improve learning stability in environments with highly variable state magnitudes
   by scaling observations to a normalized range.
 
-- `Multi-agent connector mapping global observations to different per-agent/policy observations <https://github.com/ray-project/ray/blob/master/rllib/examples/connectors/multi_agent_with_different_observation_spaces.py>`__:
-  A connector example showing how to map from a global, multi-agent observation space to n individual, per-agent, per-module observation spaces.
+- `Multi-agent observation preprocessor enhancing non-Markovian observations to Markovian ones <https://github.com/ray-project/ray/blob/master/rllib/examples/connectors/multi_agent_observation_preprocessor.py>`__:
+  A multi-agent preprocessor enhances the per-agent observations of a multi-agent env, which by themselves are non-Markovian,
+  partial observations, and converts them into Markovian observations by adding information from
+  the respective other agent. A policy can only be trained optimally with this additional information.
 
 - `Prev-actions, prev-rewards connector <https://github.com/ray-project/ray/blob/master/rllib/examples/connectors/prev_actions_prev_rewards.py>`__:
   Augments observations with previous actions and rewards, giving the agent a short-term memory of past events, which can improve
   decision-making in partially observable or sequentially dependent tasks.
 
+- `Single-agent observation preprocessor <https://github.com/ray-project/ray/blob/master/rllib/examples/connectors/single_agent_observation_preprocessor.py>`__:
+  A connector alters the CartPole-v1 environment observations from the Markovian 4-tuple (x-pos,
+  angular-pos, x-velocity, angular-velocity) to a simpler, non-Markovian 2-tuple (only
+  x-pos and angular-pos). The resulting problem can only be solved through a
+  memory/stateful model, for example an LSTM.
+
 
 Curiosity
 +++++++++
@@ -308,8 +316,10 @@ Multi-agent RL
   a hand-coded random policy while another agent trains with PPO. This example highlights integrating static and dynamic policies,
   suitable for environments with a mix of fixed-strategy and adaptive agents.
 
-- `Different spaces for agents <https://github.com/ray-project/ray/blob/master/rllib/examples/multi_agent/different_spaces_for_agents.py>`__:
+- `Different observation- and action spaces for different agents <https://github.com/ray-project/ray/blob/master/rllib/examples/multi_agent/different_spaces_for_agents.py>`__:
   Configures agents with differing observation and action spaces within the same environment, showcasing RLlib's support for heterogeneous agents with varying space requirements in a single multi-agent environment.
+  Another example that also uses connectors and covers the same topic (agents with different spaces) can be found
+  `here <https://github.com/ray-project/ray/blob/master/rllib/examples/connectors/multi_agent_observation_preprocessor.py>`__.
 
 - `Grouped agents, two-step game <https://github.com/ray-project/ray/blob/master/rllib/examples/multi_agent/two_step_game_with_grouped_agents.py>`__:
   Implements a multi-agent, grouped setup within a two-step game environment from the `QMIX paper <https://arxiv.org/pdf/1803.11485.pdf>`__.
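For illustration, a minimal sketch of such a single-agent observation preprocessor, written against the `SingleAgentObservationPreprocessor` API introduced in this commit; the class name `DropVelocityComponents` is hypothetical and the code is not necessarily identical to the linked example script:

import gymnasium as gym
import numpy as np

from ray.rllib.connectors.env_to_module.observation_preprocessor import (
    SingleAgentObservationPreprocessor,
)
from ray.rllib.utils.annotations import override


class DropVelocityComponents(SingleAgentObservationPreprocessor):
    """Keeps only x-pos and angular-pos of CartPole-v1's 4D observation (sketch)."""

    @override(SingleAgentObservationPreprocessor)
    def recompute_output_observation_space(
        self, input_observation_space, input_action_space
    ):
        # Keep indices 0 (x-pos) and 2 (angular-pos) of the original Box(4,).
        return gym.spaces.Box(
            low=input_observation_space.low[[0, 2]],
            high=input_observation_space.high[[0, 2]],
            dtype=np.float32,
        )

    @override(SingleAgentObservationPreprocessor)
    def preprocess(self, observation, episode):
        # `episode` is the SingleAgentEpisode the observation was taken from
        # (unused here, but available for context-dependent preprocessing).
        return observation[[0, 2]].astype(np.float32)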

rllib/BUILD (20 additions, 3 deletions)

@@ -3857,14 +3857,31 @@ py_test(
 )
 
 py_test(
-    name = "examples/connectors/multi_agent_with_different_observation_spaces",
+    name = "examples/connectors/multi_agent_observation_preprocessor",
     size = "medium",
-    srcs = ["examples/connectors/multi_agent_with_different_observation_spaces.py"],
+    srcs = ["examples/connectors/multi_agent_observation_preprocessor.py"],
     args = [
         "--enable-new-api-stack",
         "--num-agents=2",
+        "--algo=PPO",
+    ],
+    main = "examples/connectors/multi_agent_observation_preprocessor.py",
+    tags = [
+        "examples",
+        "exclusive",
+        "team:rllib",
+    ],
+)
+
+py_test(
+    name = "examples/connectors/single_agent_observation_preprocessor",
+    size = "medium",
+    srcs = ["examples/connectors/single_agent_observation_preprocessor.py"],
+    args = [
+        "--enable-new-api-stack",
+        "--algo=PPO",
     ],
-    main = "examples/connectors/multi_agent_with_different_observation_spaces.py",
+    main = "examples/connectors/single_agent_observation_preprocessor.py",
     tags = [
         "examples",
         "exclusive",
rllib/connectors/env_to_module/observation_preprocessor.py (113 additions, 12 deletions)

@@ -5,18 +5,24 @@
 
 from ray.rllib.connectors.connector_v2 import ConnectorV2
 from ray.rllib.core.rl_module.rl_module import RLModule
+from ray.rllib.env.multi_agent_episode import MultiAgentEpisode
+from ray.rllib.env.single_agent_episode import SingleAgentEpisode
 from ray.rllib.utils.annotations import override
 from ray.rllib.utils.typing import EpisodeType
 from ray.util.annotations import PublicAPI
 
 
 @PublicAPI(stability="alpha")
-class ObservationPreprocessor(ConnectorV2, abc.ABC):
-    """Env-to-module connector performing one preprocessor step on the last observation.
+class SingleAgentObservationPreprocessor(ConnectorV2, abc.ABC):
+    """Env-to-module connector preprocessing the most recent single-agent observation.
 
     This is a convenience class that simplifies the writing of few-step preprocessor
     connectors.
 
+    Note that this class also works in a multi-agent setup, in which case RLlib
+    separately calls this connector piece with each agent's observation and
+    `SingleAgentEpisode` object.
+
     Users must implement the `preprocess()` method, which simplifies the usual procedure
     of extracting some data from a list of episodes and adding it to the batch to a mere
     "old-observation --transform--> return new-observation" step.
@@ -28,23 +34,27 @@ def recompute_output_observation_space(
         input_observation_space: gym.Space,
         input_action_space: gym.Space,
     ) -> gym.Space:
-        # Users should override this method only in case the `ObservationPreprocessor`
-        # changes the observation space of the pipeline. In this case, return the new
-        # observation space based on the incoming one (`input_observation_space`).
+        # Users should override this method only in case the
+        # `SingleAgentObservationPreprocessor` changes the observation space of the
+        # pipeline. In this case, return the new observation space based on the
+        # incoming one (`input_observation_space`).
         return super().recompute_output_observation_space(
             input_observation_space, input_action_space
         )
 
     @abc.abstractmethod
-    def preprocess(self, observation):
+    def preprocess(self, observation, episode: SingleAgentEpisode):
         """Override to implement the preprocessing logic.
 
         Args:
             observation: A single (non-batched) observation item for a single agent to
-                be processed by this connector.
+                be preprocessed by this connector.
+            episode: The `SingleAgentEpisode` instance, from which `observation` was
+                taken. You can extract information on the particular AgentID and the
+                ModuleID through `episode.agent_id` and `episode.module_id`.
 
         Returns:
-            The new observation after `observation` has been preprocessed.
+            The new observation for the agent after `observation` has been preprocessed.
         """
 
     @override(ConnectorV2)
@@ -67,14 +77,105 @@ def __call__(
 
             # Process the observation and write the new observation back into the
            # episode.
-            new_observation = self.preprocess(observation=observation)
+            new_observation = self.preprocess(
+                observation=observation,
+                episode=sa_episode,
+            )
             sa_episode.set_observations(at_indices=-1, new_data=new_observation)
             # We set the Episode's observation space to ours so that we can safely
             # set the last obs to the new value (without causing a space mismatch
             # error).
             sa_episode.observation_space = self.observation_space
 
-        # Leave `batch` as is. RLlib's default connector will automatically
-        # populate the OBS column therein from the episodes' now transformed
-        # observations.
+        # Leave `batch` as is. RLlib's default connector automatically populates
+        # the OBS column therein from the episodes' now transformed observations.
         return batch
+
+
+@PublicAPI(stability="alpha")
+class MultiAgentObservationPreprocessor(ConnectorV2, abc.ABC):
+    """Env-to-module connector preprocessing the most recent multi-agent observation.
+
+    The observation is always a dict of individual agents' observations.
+
+    This is a convenience class that simplifies the writing of few-step preprocessor
+    connectors.
+
+    Users must implement the `preprocess()` method, which simplifies the usual procedure
+    of extracting some data from a list of episodes and adding it to the batch to a mere
+    "old-observation --transform--> return new-observation" step.
+    """
+
+    @override(ConnectorV2)
+    def recompute_output_observation_space(
+        self,
+        input_observation_space: gym.Space,
+        input_action_space: gym.Space,
+    ) -> gym.Space:
+        # Users should override this method only in case the
+        # `MultiAgentObservationPreprocessor` changes the observation space of the
+        # pipeline. In this case, return the new observation space based on the
+        # incoming one (`input_observation_space`).
+        return super().recompute_output_observation_space(
+            input_observation_space, input_action_space
+        )
+
+    @abc.abstractmethod
+    def preprocess(self, observations, episode: MultiAgentEpisode):
+        """Override to implement the preprocessing logic.
+
+        Args:
+            observations: An observation dict containing each stepping agent's
+                (non-batched) observation to be preprocessed by this connector.
+            episode: The MultiAgentEpisode instance, from which the `observations`
+                dict originated.
+
+        Returns:
+            The new multi-agent observation dict after `observations` has been
+            preprocessed.
+        """
+
+    @override(ConnectorV2)
+    def __call__(
+        self,
+        *,
+        rl_module: RLModule,
+        batch: Dict[str, Any],
+        episodes: List[EpisodeType],
+        explore: Optional[bool] = None,
+        persistent_data: Optional[dict] = None,
+        **kwargs,
+    ) -> Any:
+        # We process and then replace observations inside the episodes directly.
+        # Thus, all following connectors will only see and operate on the already
+        # processed observation (w/o having access anymore to the original
+        # observations).
+        for ma_episode in episodes:
+            observations = ma_episode.get_observations(-1)
+
+            # Process the observation and write the new observation back into the
+            # episode.
+            new_observation = self.preprocess(
+                observations=observations,
+                episode=ma_episode,
+            )
+            # TODO (sven): Implement set_observations API for multi-agent episodes.
+            #  For now, we'll hack it through the single agent APIs.
+            # ma_episode.set_observations(at_indices=-1, new_data=new_observation)
+            for agent_id, obs in new_observation.items():
+                ma_episode.agent_episodes[agent_id].set_observations(
+                    at_indices=-1,
+                    new_data=obs,
+                )
+            # We set the Episode's observation space to ours so that we can safely
+            # set the last obs to the new value (without causing a space mismatch
+            # error).
+            ma_episode.observation_space = self.observation_space
+
+        # Leave `batch` as is. RLlib's default connector automatically populates
+        # the OBS column therein from the episodes' now transformed observations.
+        return batch
+
+
+# Backward compatibility
+ObservationPreprocessor = SingleAgentObservationPreprocessor
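A minimal sketch of a custom `MultiAgentObservationPreprocessor` subclass, illustrating the dict-in/dict-out contract of `preprocess()`; the class name `AppendGlobalMeanObs` and the assumption that all agents share identical 1D Box observation spaces are hypothetical:

import gymnasium as gym
import numpy as np

from ray.rllib.connectors.env_to_module.observation_preprocessor import (
    MultiAgentObservationPreprocessor,
)
from ray.rllib.utils.annotations import override


class AppendGlobalMeanObs(MultiAgentObservationPreprocessor):
    """Appends the mean over all stepping agents' obs to each agent's own obs (sketch)."""

    @override(MultiAgentObservationPreprocessor)
    def recompute_output_observation_space(
        self, input_observation_space, input_action_space
    ):
        # Each agent's Box doubles in size (own obs + global mean obs).
        new_spaces = {}
        for agent_id, space in input_observation_space.spaces.items():
            new_spaces[agent_id] = gym.spaces.Box(
                low=np.concatenate([space.low, space.low]),
                high=np.concatenate([space.high, space.high]),
                dtype=np.float32,
            )
        return gym.spaces.Dict(new_spaces)

    @override(MultiAgentObservationPreprocessor)
    def preprocess(self, observations, episode):
        # `observations` maps each currently stepping AgentID to its (non-batched) obs;
        # the mean is taken over the agents present in this step only.
        global_mean = np.mean(list(observations.values()), axis=0)
        return {
            agent_id: np.concatenate([obs, global_mean]).astype(np.float32)
            for agent_id, obs in observations.items()
        }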

rllib/env/single_agent_episode.py (0 additions, 6 deletions)

@@ -373,12 +373,6 @@ def add_env_reset(
 
         infos = infos or {}
 
-        if self.observation_space is not None:
-            assert self.observation_space.contains(observation), (
-                f"`observation` {observation} does NOT fit SingleAgentEpisode's "
-                f"observation_space: {self.observation_space}!"
-            )
-
         self.observations.append(observation)
         self.infos.append(infos)
 
New file (114 additions, 0 deletions)

@@ -0,0 +1,114 @@
+from typing import Any
+
+import gymnasium as gym
+import numpy as np
+
+from ray.rllib.connectors.env_to_module.observation_preprocessor import (
+    MultiAgentObservationPreprocessor,
+)
+from ray.rllib.utils.annotations import override
+
+
+class AddOtherAgentsRowIndexToXYPos(MultiAgentObservationPreprocessor):
+    """Adds the other agent's row index to an agent's x/y-observation.
+
+    Run this connector with this env:
+    :py:class:`~ray.rllib.examples.env.classes.multi_agent.double_row_corridor_env.DoubleRowCorridorEnv`  # noqa
+
+    In this env, 2 agents walk around in a grid-world and must, each separately, reach
+    their individual goal position to receive a final reward. However, if they collide
+    while searching for these goal positions, another, larger reward is given to both
+    agents. Thus, optimal policies aim at seeking the other agent first, and only then
+    proceeding to their own goal position.
+
+    Each agent's observation space is a 2-tuple encoding the x/y position
+    (x=row, y=column).
+    This connector converts these observations to:
+    A dict for `agent_0` of structure:
+    {
+        "agent": Discrete index encoding the position of the agent,
+        "other_agent_row": Discrete(2), indicating whether the other agent is in row 0
+        or row 1,
+    }
+    And a 3-tuple for `agent_1`, encoding the x/y position of `agent_1` plus the row
+    index (0 or 1) of `agent_0`.
+
+    Note that the row information for the respective other agent, which this connector
+    provides, is needed for learning an optimal policy for either agent, because
+    the env rewards the first collision between the two agents. Hence, an agent needs
+    to have information on which row the respective other agent is currently in, so it
+    can change to this row and try to collide with this other agent.
+    """
+
+    @override(MultiAgentObservationPreprocessor)
+    def recompute_output_observation_space(
+        self,
+        input_observation_space,
+        input_action_space,
+    ) -> gym.Space:
+        """Maps the original (input) observation space to the new one.
+
+        Original observation space is `Dict({agent_n: Box(2,), ...})`.
+        Converts the space for each agent into information specific to that agent,
+        plus the current row of the respective other agent.
+        Output observation space is then
+        `Dict({agent_0: Dict(Discrete, Discrete), agent_1: Box(3,)})`, where the 1st
+        Discrete is the position index of the agent and the 2nd Discrete encodes the
+        current row of the other agent (0 or 1). If the other agent is already done
+        with the episode (has reached its goal state), a special value of 2 is used.
+        """
+        agent_0_space = input_observation_space.spaces["agent_0"]
+        self._env_corridor_len = agent_0_space.high[1] + 1  # Box.high is inclusive.
+        # Env always has 2 rows (and `self._env_corridor_len` columns).
+        num_discrete = int(2 * self._env_corridor_len)
+        spaces = {
+            "agent_0": gym.spaces.Dict(
+                {
+                    # Exact position of this agent (as an int index).
+                    "agent": gym.spaces.Discrete(num_discrete),
+                    # Row (0 or 1) of other agent. Or 2, if other agent is already done.
+                    "other_agent_row": gym.spaces.Discrete(3),
+                }
+            ),
+            "agent_1": gym.spaces.Box(
+                0,
+                agent_0_space.high[1],  # 1=column
+                shape=(3,),
+                dtype=np.float32,
+            ),
+        }
+        return gym.spaces.Dict(spaces)
+
+    @override(MultiAgentObservationPreprocessor)
+    def preprocess(self, observations, episode) -> Any:
+        # Observations: dict of keys "agent_0" and "agent_1", mapping to the respective
+        # x/y positions of these agents (x=row, y=col).
+        # For example: [1.0, 4.0] means the agent is in row 1 and column 4.
+
+        new_obs = {}
+        # 2=agent is already done.
+        row_agent_0 = observations.get("agent_0", [2])[0]
+        row_agent_1 = observations.get("agent_1", [2])[0]
+
+        if "agent_0" in observations:
+            # Compute `agent_0`'s enhanced observation.
+            index_obs_agent_0 = (
+                observations["agent_0"][0] * self._env_corridor_len
+                + observations["agent_0"][1]
+            )
+            new_obs["agent_0"] = {
+                "agent": index_obs_agent_0,
+                "other_agent_row": row_agent_1,
+            }
+
+        if "agent_1" in observations:
+            new_obs["agent_1"] = np.array(
+                [
+                    observations["agent_1"][0],
+                    observations["agent_1"][1],
+                    row_agent_0,
+                ],
+                dtype=np.float32,
+            )
+
+        return new_obs
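A rough sketch of how this connector could be wired into an algorithm config. The `DoubleRowCorridorEnv` import path is taken from the docstring above and may differ between Ray versions, and the exact signature of the `env_to_module_connector` callable also varies across RLlib versions (newer ones may pass additional arguments such as `spaces` and `device`):

from ray.rllib.algorithms.ppo import PPOConfig
# Import path per the class docstring above; adjust to your installed Ray version.
from ray.rllib.examples.env.classes.multi_agent.double_row_corridor_env import (
    DoubleRowCorridorEnv,
)

config = (
    PPOConfig()
    .environment(DoubleRowCorridorEnv)
    .env_runners(
        # Build the preprocessor as the first piece of the env-to-module pipeline
        # on each EnvRunner.
        env_to_module_connector=lambda env: AddOtherAgentsRowIndexToXYPos(),
    )
    .multi_agent(
        policies={"agent_0", "agent_1"},
        policy_mapping_fn=lambda agent_id, episode, **kwargs: agent_id,
    )
)
algo = config.build()
algo.train()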
