[feat] Add benchmark tools (#357)

maxreciprocate · web-flow · commit 114998b8171e · 2023-03-28T15:13:45.000+03:00
* feat(configs): add `tags` config option

* feat(scripts): add benchmark tools

* refactor(reference): clean up debug prints

* style(reference): satisfy isort

* style(reference): satisfy CI's isort

* feat(scripts/benchmark): add `ppo_sentiments_t5`

* fix(benchmark): `ddp` -> `zero2-bf16` even with 1 process

* fix(benchmark): rename `wandb` project name

* feat(reference): separate metrics per experiment

* chore(benchmark): add `ppo_hh` to runs, but keep it under 2 hours

* feat(reference): add git hashes to descriptions

* fix(ppo_sentiments_t5): use `hparams` from sys.argv

* chore(benchmark): limit `ppo_hh`'s `total_steps` across branches

* fix(reference): set `max_runs_to_show` to 2

* feat(benchmark): add hh 6b to set of runs

* style: satisfy black

* feat(reference): add a few simple prints

* feat(benchmark): pin dependencies

* fix(benchmark): ignore git apply patch failed error

* chore(README): add a link to reference runs

* refactor(reference): move script under `trlx` (same as sweeps)

* chore(benchmark): remove patch for other branches

* revert(ppo_hh): restore default `total_steps`

* style(reference): satisfy black

* style(reference): satisfy isort

* feat(README): add benchmarking instruction
diff --git a/README.md b/README.md
@@ -35,6 +35,8 @@ For more usage see [examples](./examples). You can also try the colab notebooks
 | Simulacra (GPT2, ILQL) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/CarperAI/trlx/blob/main/examples/notebooks/trlx_simulacra.ipynb)|
 | Sentiment (GPT2, ILQL) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/CarperAI/trlx/blob/main/examples/notebooks/trlx_sentiments.ipynb)|
 
+Latest runs of the examples are on our [Weights & Biases](https://wandb.ai/sorry/trlx-references/reportlist)
+
 ## How to Train
 
 You can train a model using a reward function or a reward-labeled dataset.
@@ -99,6 +101,11 @@ For more usage see the [NeMo README](./trlx/models)
 python -m trlx.sweep --config configs/sweeps/ppo_sweep.yml examples/ppo_sentiments.py
 ```
 
+#### Benchmark your trlX fork against trlX's `main` branch
+```bash
+python -m trlx.reference octocat/trlx-fork:fix-branch
+```
+
 ## Logging
 
 trlX uses the standard Python `logging` library to log training information to the console. The default logger is set to the `INFO` level, which means that `INFO`, `WARNING`, `ERROR`, and `CRITICAL` level messages will be printed to standard output.
diff --git a/examples/hh/ppo_hh.py b/examples/hh/ppo_hh.py
@@ -85,6 +85,7 @@
     default_config.method.chunk_size = 16
 elif config_name == "6B":
     default_config.train.batch_size = 4
+    default_config.train.seq_length = 512
     default_config.train.total_steps = 6000
     default_config.train.checkpoint_dir = "checkpoints/ppo_hh_6B"
     default_config.model.model_path = "Dahoas/pythia-6B-static-sft"
diff --git a/examples/ppo_sentiments_t5.py b/examples/ppo_sentiments_t5.py
@@ -1,4 +1,6 @@
+import json
 import os
+import sys
 from typing import Dict, List
 
 import numpy as np
@@ -166,4 +168,5 @@ def tokenize(sample):
 
 
 if __name__ == "__main__":
-    main()
+    hparams = {} if len(sys.argv) == 1 else json.loads(sys.argv[1])
+    main(hparams)
diff --git a/scripts/benchmark.sh b/scripts/benchmark.sh
@@ -0,0 +1,66 @@
+#!/bin/bash
+# Clones `origin`/`branch` into a temporary directory and runs the examples there, tagging each w&b
+# run with a hash of the checked out files. With --only_hash just prints that hash and the git one.
+set -e
+
+origin=CarperAI/trlx
+branch=main
+entity=null
+only_hash=false
+only_tiny=false
+
+while [[ "$#" -gt 0 ]]; do
+    case $1 in
+        --origin) origin="$2"; shift ;;
+        --branch) branch="$2"; shift ;;
+        --public) entity='"CarperAI"' ;;
+        --only_hash) only_hash=true ;;
+        --only_tiny) only_tiny=true ;;
+        *) echo "Unknown parameter passed: $1"; exit 1 ;;
+    esac
+    shift
+done
+
+# absolute path, so that the cleanup trap does not depend on the current directory at exit
+dir=$(mktemp -d -p "$PWD")
+[ -d "$dir" ] || { echo "Couldn't create a temporary directory, aborting"; exit 1; }
+
+cd "$dir"
+trap 'rm -rf "$dir"' EXIT
+
+git clone --depth 1 --single-branch -b "$branch" "https://github.com/$origin" .
+
+# hash of every file except .git and *.md: the tag which identifies this code among w&b runs
+hash=$(find . -not \( -path ./.git -prune \) -not -name "*.md" -type f -print0 | sort -z | xargs -0 sha1sum | sha1sum | cut -f1 -d" ")
+git_hash=$(git log --format=%h/%s/%as -n1)
+
+if [ "$only_hash" = true ]; then
+    echo "$hash"
+    echo "$git_hash"
+    exit 0
+fi
+
+python -m venv venv
+. venv/bin/activate
+python -m pip install pip --upgrade
+pip install -r requirements.txt
+pip install -e .
+
+args='{"train": {"project_name": "trlx-references", "entity_name": '$entity', "tags": ["'$hash'"]}}'
+python examples/randomwalks/ilql_randomwalks.py "$args"
+python examples/randomwalks/ppo_randomwalks.py "$args"
+
+if [ "$only_tiny" = true ]; then exit 0; fi
+
+rm -rf ../benchmark_logs && mkdir ../benchmark_logs
+
+CUDA_VISIBLE_DEVICES=0 accelerate launch --num_processes 1 --config_file configs/accelerate/zero2-bf16.yaml --main_process_port 8880 examples/ppo_sentiments.py "$args" > ../benchmark_logs/ppo_sentiments.log 2>&1 &
+CUDA_VISIBLE_DEVICES=1 accelerate launch --num_processes 1 --config_file configs/accelerate/zero2-bf16.yaml --main_process_port 8881 examples/sft_sentiments.py "$args" > ../benchmark_logs/sft_sentiments.log 2>&1 &
+CUDA_VISIBLE_DEVICES=2 accelerate launch --num_processes 1 --config_file configs/accelerate/zero2-bf16.yaml --main_process_port 8882 examples/ilql_sentiments.py "$args" > ../benchmark_logs/ilql_sentiments.log 2>&1 &
+CUDA_VISIBLE_DEVICES=3 accelerate launch --num_processes 1 --config_file configs/accelerate/zero2-bf16.yaml --main_process_port 8883 examples/ppo_sentiments_t5.py "$args" > ../benchmark_logs/ppo_sentiments_t5.log 2>&1 &
+
+# NOTE: a bare `wait` returns 0 even when a background run failed, check ../benchmark_logs
+wait
+
+args='{"train": {"total_steps": 1500, "seq_length": 512, "project_name": "trlx-references", "entity_name": '$entity', "tags": ["'$hash'"]}}'
+CONFIG_NAME=6B accelerate launch --num_processes 7 --config_file configs/accelerate/zero2-bf16.yaml examples/hh/ppo_hh.py "$args"
diff --git a/trlx/data/configs.py b/trlx/data/configs.py
@@ -1,6 +1,6 @@
 from copy import deepcopy
 from dataclasses import dataclass, field
-from typing import Any, Dict, Optional, Set
+from typing import Any, Dict, List, Optional, Set
 
 import yaml
 
@@ -220,6 +220,7 @@ class TrainConfig:
 
     tracker: Optional[str] = "wandb"
     logging_dir: Optional[str] = None
+    tags: Optional[List[str]] = field(default_factory=list)
 
     seed: int = 1000
 
diff --git a/trlx/reference.py b/trlx/reference.py
@@ -0,0 +1,103 @@
+# python -m trlx.reference CarperAI/trlx:add-benchmark-tools --against CarperAI/trlx:main
+# Benchmarks a branch against a reference branch with `scripts/benchmark.sh`, skipping a side whose
+# content hash already tags runs in w&b, and saves a w&b report comparing both sets of runs.
+
+import argparse
+import subprocess
+
+import wandb
+import wandb.apis.reports as wb
+
+BENCHMARK = "./scripts/benchmark.sh"
+
+
+def split_ref(ref, origin="CarperAI/trlx"):
+    """Split `origin:branch` into `(origin, branch)`; a bare branch name keeps the default origin."""
+    if ":" in ref:
+        origin, ref = ref.rsplit(":", 1)
+    return origin, ref
+
+
+def get_hashes(origin, branch):
+    """Return `(content hash, git description)`, the two lines printed by `benchmark.sh --only_hash`."""
+    # An argument list (no shell) passes branch names verbatim, and `check=True` surfaces a failed
+    # clone as an error here instead of as a confusing unpacking failure.
+    cmd = [BENCHMARK, "--origin", origin, "--branch", branch, "--only_hash"]
+    out = subprocess.run(cmd, check=True, stdout=subprocess.PIPE, text=True).stdout
+    content_hash, git_hash = out.splitlines()[-2:]
+    return content_hash, git_hash
+
+
+def ensure_runs(api, project, origin, branch, content_hash, git_hash, public=False):
+    """Run the benchmark for `branch` unless `project` already has runs tagged with `content_hash`."""
+    runs = api.runs(project, filters={"tags": {"$in": [content_hash]}})
+    if runs:
+        print(f"On {branch} @{git_hash} these runs were already made: \n{chr(10).join(run.name for run in runs)}")
+        return
+    print(f"Making runs on {branch} @{git_hash}")
+    cmd = [BENCHMARK, "--origin", origin, "--branch", branch] + (["--public"] if public else [])
+    if subprocess.run(cmd).returncode != 0:
+        # keep going: the runs which did finish are still worth comparing
+        print(f"Benchmark on {branch} exited with an error, its runs may be missing from the report")
+
+
+def line_plots(metrics):
+    """Build one line plot per metric, each limited to showing two runs."""
+    return [
+        wb.LinePlot(
+            title=f"{metric}",
+            x="Step",
+            y=[metric],
+            title_x="Step",
+            smoothing_show_original=True,
+            max_runs_to_show=2,
+            plot_type="line",
+            font_size="auto",
+            legend_position="north",
+        )
+        for metric in metrics
+    ]
+
+
+parser = argparse.ArgumentParser()
+parser.add_argument("branch", type=str, help="Git branch in the format `origin:branch`")
+parser.add_argument("--against", type=str, default="CarperAI/trlx:main", help="Reference git branch")
+parser.add_argument("--public", action="store_true", help="Use CarperAI entity to store/pull from w&b runs")
+args = parser.parse_args()
+
+pr_origin, pr_branch = split_ref(args.branch)
+ref_origin, ref_branch = split_ref(args.against)
+pr_hash, pr_git_hash = get_hashes(pr_origin, pr_branch)
+ref_hash, ref_git_hash = get_hashes(ref_origin, ref_branch)
+
+print(f"{pr_origin}:{pr_branch} {pr_hash=} {pr_git_hash=}")
+print(f"{ref_origin}:{ref_branch} {ref_hash=} {ref_git_hash=}")
+
+api = wandb.Api()
+project_name = "CarperAI/trlx-references" if args.public else "trlx-references"
+ensure_runs(api, project_name, ref_origin, ref_branch, ref_hash, ref_git_hash, args.public)
+ensure_runs(api, project_name, pr_origin, pr_branch, pr_hash, pr_git_hash, args.public)
+
+report = wb.Report(
+    project=project_name.split("/")[1] if args.public else project_name,
+    title=f"{pr_branch} v. {ref_branch}",
+    description=f"{pr_branch}\n@{pr_git_hash}\n\n{ref_branch}\n@{ref_git_hash}",
+)
+blocks = []
+
+# sorted: set order changes between interpreter runs, which would shuffle the report's sections
+for name in sorted({run.name.split(":")[0] for run in api.runs(project_name)}):
+    filters = {"$and": [{"display_name": {"$regex": f"^{name}"}}, {"tags": {"$in": [pr_hash, ref_hash]}}]}
+    runs = api.runs(project_name, filters=filters)
+    metrics = sorted({m for run in runs for m in run.history().columns if not m.startswith("_")})
+    # the most important metrics get a grid of their own so that they are shown first
+    major = [m for m in metrics if m.startswith(("reward", "metric"))]
+    minor = [m for m in metrics if m not in major]
+    blocks.append(wb.H1(text=name))
+    for group in (major, minor):
+        runsets = [wb.Runset(project=project_name, filters=filters)]
+        blocks.append(wb.PanelGrid(panels=line_plots(group), runsets=runsets))
+
+report.blocks = blocks
+report.save()
+print(report.url)
diff --git a/trlx/trainer/accelerate_base_trainer.py b/trlx/trainer/accelerate_base_trainer.py
@@ -92,7 +92,7 @@ def __init__(self, config, **kwargs):  # noqa: C901
                     "name": run_name,
                     "entity": self.config.train.entity_name,
                     "group": self.config.train.group_name,
-                    "tags": ["/".join(get_git_tag())],
+                    "tags": self.config.train.tags + ["/".join(get_git_tag())],
                     "mode": "disabled" if os.environ.get("debug", False) else "online",
                 }
 

Original file line number	Diff line number	Diff line change
`@@ -92,7 +92,7 @@ def __init__(self, config, **kwargs): # noqa: C901`
`92`	`92`	`"name": run_name,`
`93`	`93`	`"entity": self.config.train.entity_name,`
`94`	`94`	`"group": self.config.train.group_name,`
`95`		`- "tags": ["/".join(get_git_tag())],`
	`95`	`+ "tags": self.config.train.tags + ["/".join(get_git_tag())],`
`96`	`96`	`"mode": "disabled" if os.environ.get("debug", False) else "online",`
`97`	`97`	`}`
`98`	`98`