
Commit dcbf7b0

Improve documentation/comments on the random walk example (#208)
* Improve documentation for the Random walk example
* Add additional notes on PPO random walks
* Add image for documentation
* Fix requested changes
* Fix remaining issues
1 parent 4a62f04 commit dcbf7b0

File tree

5 files changed: +225 -72 lines

examples/randomwalks/README.md

Lines changed: 30 additions & 14 deletions
@@ -1,14 +1,30 @@
-Toy problem similar to the one described in [Decision Transformer (Lili Chen et al. 2021)](https://arxiv.org/abs/2106.01345) [1]:
-finding graph's shortest paths by learning from a dataset of sampled random
-walks.
-
-In this implementation there are not environment dynamics – impossible and
-incorrect paths are penalized the same way by a single reward which is given at
-the end of the trajectory, measuring how optimal the path is compared to the
-shortest possible (bounded in [0, 1]). Paths are represented as strings of
-letters, with each letter corresponding to a node in a graph. PPO example uses a
-pretrained model for starting transition probabilities, ILQL learns them from
-the samples directly.
-
-[1] code for which is not present in the official repo, see issue
-https://github.com/kzl/decision-transformer/issues/48
+# Random Walks: Decision Tree Example
+
+This example uses the toy problem described in [Decision Transformer (Lili Chen
+et al. 2021)](https://arxiv.org/abs/2106.01345).
+
+## Game Description
+
+The task is to find the shortest path on a directed graph. The reward is based
+on how optimal the path is compared to the shortest possible (bounded in [0,
+1]).
+
+Note this is different from the paper, which gave rewards of -1 for every
+turn not at the goal state, and 0 at the goal state. Here the model instead
+receives its reward at the end of the full trajectory, based on how optimal it
+is compared to the minimum number of steps to reach the goal state (bounded in
+[0, 1]).
+
+Paths are represented as strings of letters, with each letter corresponding to a
+node in the graph.
+
+## Training
+
+![Graph Example](graph-example.png)
+Source: Decision Transformer (Lili Chen et al. 2021)
+
+For PPO, a language model was fine-tuned to predict the next token in a sequence
+of returns-to-go (sum of future rewards), states and actions. It was trained
+only on random walk data.
+
+ILQL by contrast learns from the samples directly.
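
To make the reward concrete, here is a minimal sketch of the optimality score described in the new README (the function name and the worked numbers are illustrative, not part of the example's code):

```python
def optimality(path_length: int, shortest_length: int, max_length: int = 10) -> float:
    """Score a walk in [0, 1] against the shortest possible path.

    Assumes shortest_length < max_length; invalid or unfinished walks are
    treated as taking max_length steps.
    """
    bounded = min(path_length, max_length)
    return (max_length - bounded) / (max_length - shortest_length)


# A 4-step walk whose start node is 2 steps from the goal:
print(optimality(path_length=4, shortest_length=2))  # (10 - 4) / (10 - 2) = 0.75
```
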
examples/randomwalks/graph-example.png

70.2 KB (new binary file: the graph figure referenced in the README above)

examples/randomwalks/ppo_randomwalks.py

Lines changed: 6 additions & 2 deletions
@@ -18,10 +18,14 @@ def main(hparams={}):
 
     trlx.train(
         "CarperAI/randomwalks",
-        reward_fn=lambda samples, **kwargs: metric_fn(samples)["optimality"],
+        # An "optimality" reward function is used, with scores in [0, 1]
+        # depending on how close the path is to the shortest possible path.
+        reward_fn=lambda samples, prompts, outputs: metric_fn(samples)["optimality"],
+        # The prompts are simply the first nodes (represented as letters) to
+        # start from.
         prompts=prompts,
         eval_prompts=prompts,
-        metric_fn=lambda samples, **kwargs: metric_fn(samples),
+        metric_fn=lambda samples, prompts, outputs: metric_fn(samples),
         config=config,
     )
 
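The lambdas above can equivalently be written as named functions, which makes the call signature explicit; a minimal sketch, assuming `metric_fn` is the function returned by `generate_random_walks` as elsewhere in this example:

```python
def reward_fn(samples, prompts, outputs):
    # Only the full sample strings are needed here; prompts and outputs are
    # accepted to match the signature trlx now uses when calling reward_fn.
    return metric_fn(samples)["optimality"]
```
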
examples/randomwalks/randomwalks.py

Lines changed: 184 additions & 55 deletions
@@ -1,106 +1,235 @@
+from typing import Callable, Dict, List, Optional, Tuple
+
 import networkx as nx
 import numpy as np
 import torch
 
 
-def randexclude(rng: np.random.RandomState, n: int, exclude: int) -> int:
+def generate_rand_int_excluding(
+    rng: np.random.RandomState, max: int, exclude: int
+) -> int:
+    """Random integer generator, excluding a specific number
+
+    Args:
+        rng: Numpy random number generator
+        max: Max number (exclusive upper bound)
+        exclude: Number to exclude
+
+    Returns:
+        Random integer in [0, max), excluding the `exclude` integer.
+    """
     while True:
-        x = rng.randint(n)
+        # Create the random integer
+        x = rng.randint(max)
+
+        # Return the random integer if it isn't the exclude value, otherwise
+        # try again
         if x != exclude:
             return x
 
 
 def generate_random_walks(  # noqa: max-complexity
-    n_nodes=21, max_length=10, n_walks=1000, p_edge=0.1, seed=1002, gpt2_tokenizer=False
-):
+    n_nodes: int = 21,
+    max_length: int = 10,
+    n_walks: int = 1000,
+    p_edge: float = 0.1,
+    seed: int = 1002,
+    gpt2_tokenizer: bool = False,
+) -> Tuple[
+    Callable[[List[str]], Dict[str, List[float]]],
+    List[str],
+    List[str],
+    torch.Tensor,
+]:
+    """Generate random walks
+
+    Args:
+        n_nodes: Number of nodes. This should not be more than 26, as we use
+            single letters to represent each node.
+        max_length: Maximum number of steps in each random walk
+        n_walks: Number of random walks (samples) to create
+        p_edge: Probability that any source node connects to any other
+            destination node
+        seed: Random seed
+        gpt2_tokenizer: True if GPT2's tokenizer is being used
+
+    Returns:
+        Tuple of the metric function, eval prompts, sample walks and logit
+        mask.
+    """
+    # Initialise a random state with the seed
     rng = np.random.RandomState(seed)
 
+    # Create the adjacency matrix
+    # https://en.wikipedia.org/wiki/Adjacency_matrix
+    # This is a 2d matrix, where the rows represent the source nodes and the
+    # columns represent the destination nodes. If a cell (i,j) is True, then
+    # there is a directional edge from the source node (i) to the destination
+    # node (j). If it is False there is no connection.
     while True:
-        adj = rng.rand(n_nodes, n_nodes) > (1 - p_edge)
-        np.fill_diagonal(adj, 0)
-        if np.all(adj.sum(1)):
+        # Create the adjacency matrix, where each node is connected to each
+        # other node with probability p_edge
+        adjacency_matrix: np.ndarray = rng.rand(n_nodes, n_nodes) > (1 - p_edge)
+
+        # Nodes can't be connected to themselves, so the diagonal values must
+        # all be False
+        np.fill_diagonal(adjacency_matrix, 0)
+
+        # Each source node (row) must have at least one outgoing edge, so that
+        # every walk always has a move available. This checks there is a True
+        # value in every row; if not, we generate a new adjacency matrix from
+        # scratch (in the while loop).
+        if np.all(adjacency_matrix.sum(1)):
             break
 
-    # terminal state
-    adj[0, :] = 0
-    adj[0, 0] = 1
+    # Set the goal node as 0
+    goal: int = 0
 
-    char_to_node = {chr(ix + ord("a")): ix for ix in range(n_nodes)}
-    node_to_char = {ix: chr(ix + ord("a")) for ix in range(n_nodes)}
+    # The goal node is the terminal state, so we make sure that it doesn't
+    # have a directional edge going to any other nodes (i.e. it can only be
+    # connected to from previous nodes). We also set the connection to itself
+    # as True.
+    adjacency_matrix[goal, :] = 0
+    adjacency_matrix[goal, goal] = 1
 
-    goal = 0
-    sample_walks = []
-    delimiter = "|" if gpt2_tokenizer else ""
+    # Create dicts for converting nodes into characters and vice versa.
+    # Nodes are converted into characters as these (when split by the
+    # delimiter) are guaranteed to be tokenized as individual tokens.
+    char_to_node: Dict[str, int] = {chr(ix + ord("a")): ix for ix in range(n_nodes)}
+    node_to_char: Dict[int, str] = {ix: chr(ix + ord("a")) for ix in range(n_nodes)}
 
+    # Initialise a list of sample walks
+    sample_walks: List[str] = []
+
+    # String delimiter (to force the tokenizer to keep all nodes as separate
+    # tokens)
+    delimiter: str = "|" if gpt2_tokenizer else ""
+
+    # Create n_walks samples
     for _ in range(n_walks):
-        node = randexclude(rng, n_nodes, goal)
-        walk = [node]
 
-        for istep in range(max_length - 1):
-            node = rng.choice(np.nonzero(adj[node])[0])
-            walk.append(node)
+        # Create a random starting node (that isn't already at the goal state)
+        node: int = generate_rand_int_excluding(rng, n_nodes, goal)
+
+        # Initialise the list of nodes that we visit
+        walk_nodes: List[int] = [node]
+
+        # Do a series of steps, until we hit the maximum number of steps or
+        # the goal state (whichever comes first)
+        for _step in range(max_length - 1):
+
+            # From the current node, get all the nodes we can move to. Pick
+            # one of these at random, and add it to the list of visited nodes
+            node = rng.choice(np.nonzero(adjacency_matrix[node])[0])
+            walk_nodes.append(node)
+
+            # If we're at the goal state, stop
             if node == goal:
                 break
 
-        # code each node by a letter
-        # for bpe tokenizer join them over | for a guaranteed split
-        walk = [node_to_char[ix] for ix in walk]
+        # Convert the nodes visited to letters (not integers)
+        walk: List[str] = [node_to_char[ix] for ix in walk_nodes]
 
+        # Concatenate into a journey, with each node letter separated by the
+        # delimiter.
         sample_walks.append(delimiter.join(walk))
 
-    # calculate the shortest paths for comparison
-    shortest_lengths = []
-    g = nx.from_numpy_array(adj, create_using=nx.DiGraph)
+    # Initialise list of shortest lengths for each node (to the goal node)
+    shortest_lengths: List[int] = []
+
+    # Create a directional graph from the adjacency matrix
+    directional_graph = nx.from_numpy_array(adjacency_matrix, create_using=nx.DiGraph)
+
+    # For each node (except for the goal node), find the shortest path
     for start in set(range(n_nodes)) - {goal}:
         try:
-            shortest_path = nx.shortest_path(g, start, goal)[:max_length]
+            # Find the shortest path (up to the max_length)
+            shortest_path = nx.shortest_path(directional_graph, start, goal)[
+                :max_length
+            ]
             shortest_lengths.append(len(shortest_path))
         except Exception:
+            # If there is no path, use the maximum length instead
            shortest_lengths.append(max_length)
 
-    shortest_lengths = torch.tensor(shortest_lengths)
+    def metric_fn(
+        samples: List[str],
+    ) -> Dict[str, List[float]]:
+        """Metric Function
 
-    def metric_fn(samples):
-        # a measure for an invalid or a not found path
-        infty = 100
-        lengths = []
-        ref_lengths = []
+        Args:
+            samples: Batch of samples
 
-        for s in samples:
+        Returns:
+            Dict of metrics, each with a key of the metric name and value as
+            a list of metric values for each batch item.
+        """
+        # Length to set if the path is invalid
+        invalid_path_length: int = 100
+
+        # Initialise batch lengths & reference lengths (the optimal length
+        # starting from each batch item's specific start node)
+        lengths: List[float] = []
+        sample_optimal_lengths: List[int] = []
+
+        for sample_str in samples:
+            # Remove the GPT2-specific tokenizer delimiter
             if gpt2_tokenizer:
-                s = s.replace("|", "")
-
-            s = [char_to_node.get(c, 1000) for c in s]
-            length = None
-            for ix in range(len(s)):
-                # a nonexisting path is taken
-                if s[ix] >= n_nodes or ix > 0 and not adj[s[ix - 1], s[ix]]:
-                    length = infty
+                sample_str = sample_str.replace("|", "")
+
+            # Convert the sample into a list of nodes (default to an unused
+            # integer if the node is not found)
+            sample: List[int] = [char_to_node.get(c, 1000) for c in sample_str]
+
+            # Initialise the specific sample length
+            length: Optional[float] = None
+
+            for node in range(len(sample)):
+                # If an invalid path is taken, set the length to the invalid
+                # path score
+                if (
+                    sample[node] >= n_nodes
+                    or node > 0
+                    and not adjacency_matrix[sample[node - 1], sample[node]]
+                ):
+                    length = invalid_path_length
                     break
-                elif s[ix] == 0:
-                    length = ix + 1
+
+                # If we've reached the goal node, the length is the number of
+                # nodes visited so far (including the start node)
+                elif sample[node] == 0:
+                    length = node + 1
                     break
 
+            # Catch the case where the goal was never reached
             if length is None:
-                length = infty
+                length = invalid_path_length
+
+            # Store the batch item length & optimal length starting from the
+            # start node
+            lengths.append(float(length))
+            sample_optimal_lengths.append(shortest_lengths[sample[0] - 1])
 
-            lengths.append(length)
-            # allows for inorder checking of % optimality
-            ref_lengths.append(shortest_lengths[s[0] - 1])
+        # Bound the path lengths, treating invalid paths as taking max_length
+        # steps
+        lengths_tensor = torch.tensor(lengths, dtype=torch.float)
+        bound_lengths: torch.Tensor = torch.where(
+            lengths_tensor.eq(invalid_path_length), max_length, lengths_tensor
+        ).abs()
+        optimal_lengths = torch.as_tensor(sample_optimal_lengths)
 
-        lengths = torch.tensor(lengths, dtype=torch.float)
-        bound_lengths = torch.where(lengths.eq(infty), max_length, lengths).abs()
-        ref_lengths = torch.as_tensor(ref_lengths)
+        # Optimality scores, in [0, 1], as compared to the shortest path
+        optimality = (max_length - bound_lengths) / (max_length - optimal_lengths)
 
         return {
             "lengths": lengths,
-            # percentage-optimal \in (0, 1) when compared to the shortest path
-            "optimality": (max_length - bound_lengths) / (max_length - ref_lengths),
+            "optimality": optimality.tolist(),
         }
 
-    logit_mask = torch.tensor(adj)
+    logit_mask = torch.tensor(adjacency_matrix)
 
+    # Set the evaluation prompts as a list of unique random walk samples,
+    # using just the start point (first character) from each sample.
     eval_prompts = list(sorted(set(w[0] for w in sample_walks)))
     eval_prompts = [prompt + delimiter for prompt in eval_prompts]
 
-    return metric_fn, eval_prompts, sample_walks, logit_mask
+    return (metric_fn, eval_prompts, sample_walks, logit_mask)
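
As a usage sketch for the module above (printed values are illustrative, not from a real run): with `max_length=10`, a walk that takes 4 steps from a start node whose shortest path is 2 steps scores `(10 - 4) / (10 - 2) = 0.75`.

```python
# Hypothetical usage of generate_random_walks; the outputs shown in the
# comments are illustrative.
metric_fn, eval_prompts, sample_walks, logit_mask = generate_random_walks(
    n_nodes=21, max_length=10, n_walks=1000, seed=1002
)

print(sample_walks[0])        # e.g. "fca" - a walk ending at the goal node "a"
print(eval_prompts[:3])       # e.g. ["b", "c", "d"] - one start node each

metrics = metric_fn(sample_walks[:2])
print(metrics["lengths"])     # nodes visited per walk (100.0 if invalid)
print(metrics["optimality"])  # scores in [0, 1]; 1.0 means a shortest path
```
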

trlx/trainer/accelerate_base_trainer.py

Lines changed: 5 additions & 1 deletion
@@ -383,7 +383,11 @@ def evaluate(self):  # noqa: C901
         # additionally log any other metrics
         if self.metric_fn:
             metric_time = time()
-            metrics = self.metric_fn(str_samples)
+            metrics = self.metric_fn(
+                samples=str_samples,
+                prompts=str_prompts,
+                outputs=str_outputs,
+            )
             stats["time/metric"] = time() - metric_time
 
             mean_metrics = {
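
Any user-supplied `metric_fn` must therefore accept these keyword arguments. A minimal compatible sketch (the metric itself is a placeholder, not taken from the repo):

```python
def metric_fn(samples, prompts, outputs):
    # Placeholder metric: length of each generated continuation
    return {"output_length": [len(output) for output in outputs]}
```
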

0 commit comments