CarperAI · LouisCastricato · Jan 9, 2023 · Dec 20, 2022 · Dec 20, 2022 · Dec 20, 2022
diff --git a/examples/configs/ppo_config_cnn_daily.yml b/examples/configs/ppo_config_cnn_daily.yml
@@ -0,0 +1,58 @@
+train:
+  seq_length: 612
+  epochs: 100
+  total_steps: 100000
+  batch_size: 12
+
+  checkpoint_interval: 10000
+  eval_interval: 500
+  save_best: False
+
+  pipeline: "PromptPipeline"
+  orchestrator: "PPOOrchestrator"
+  trainer: "AcceleratePPOTrainer"
+
+model:
+  model_path:  "google/flan-t5-large"
+  model_arch_type: "seq2seq"
+  tokenizer_path:  "google/flan-t5-large"
+  num_layers_unfrozen: 2
+
+optimizer:
+  name: "adamw"
+  kwargs:
+    lr:  1.0e-5
+    betas: [0.9, 0.999]
+    eps: 1.0e-8
+    weight_decay: 1.0e-6
+
+scheduler:
+  name: "cosine_annealing"
+  kwargs:
+    T_max: 10000
+    eta_min: 1.0e-6
+
+method:
+  name: "ppoconfig"
+  num_rollouts: 512
+  chunk_size: 12
+  ppo_epochs: 4
+  init_kl_coef: 0.05
+  target:  6
+  horizon: 10000
+  gamma: 0.99
+  lam: 0.95
+  cliprange: 0.2
+  cliprange_value: 0.2
+  vf_coef: 1.0
+  scale_reward: False
+  ref_mean: null
+  ref_std: null
+  cliprange_reward: 10
+  gen_kwargs:
+    max_new_tokens: 100
+    # top_k: 50
+    # top_p: 0.95
+    # do_sample: True
+  gen_inference_kwargs:
+    max_new_tokens: 100
diff --git a/examples/ppo_sentiments.py b/examples/ppo_sentiments.py
diff --git a/examples/trlx_t5_summ_daily_cnn.py b/examples/trlx_t5_summ_daily_cnn.py
@@ -0,0 +1,72 @@
+from typing import List
+
+import evaluate
+from datasets import load_dataset
+from tqdm import tqdm
+from transformers import AutoTokenizer
+
+import trlx
+from trlx.data.configs import TRLConfig
+
+meteor = evaluate.load("meteor")  # use meteor as the reward function
+
+if __name__ == "__main__":
+
+    def reward_fn(samples: List[str]):
+        """Return one METEOR score per sample, each sample being article + sep token + summary."""
+        separator = tokenizer.sep_token
+        pieces = [text.split(separator) for text in samples]
+        sources = [parts[0].strip() for parts in pieces]
+        generated = [parts[1].strip() for parts in pieces]
+        references = [prompt_label[source] for source in sources]
+        return [
+            meteor.compute(predictions=[hyp], references=[ref])["meteor"]
+            for hyp, ref in zip(generated, references)
+        ]
+
+    config = TRLConfig.load_yaml("configs/ppo_config_cnn_daily.yml")
+
+    # use the first 20000 articles of the training set as prompts for training
+    dataset = load_dataset("cnn_dailymail", "3.0.0", split="train", cache_dir="data")
+    prompts = dataset["article"][0:20000]
+    summaries = dataset["highlights"][0:20000]
+    prompts = ["Summarize: " + prompt for prompt in prompts]
+
+    # use the first 1000 articles of the validation set as prompts for evaluation
+    val_dataset = load_dataset(
+        "cnn_dailymail", "3.0.0", split="validation", cache_dir="data"
+    )
+    val_prompts = ["Summarize: " + prompt for prompt in val_dataset["article"][0:1000]]
+    val_summaries = val_dataset["highlights"][0:1000]
+
+    # build a dictionary mapping each truncated prompt to its reference summary, for use by the reward function
+    tokenizer = AutoTokenizer.from_pretrained(config.model.model_path)
+    tokenizer.padding_side = "left"
+    tokenizer.truncation_side = "right"
+    tokenizer.sep_token = "<sep>"
+    prompt_label = {}
+    max_length = config.train.seq_length - config.method.gen_kwargs["max_new_tokens"]
+
+    for i in tqdm(range(len(prompts))):
+        key = tokenizer.decode(
+            tokenizer(prompts[i], truncation=True, max_length=max_length)["input_ids"],
+            skip_special_tokens=True,
+        )  # truncate and decode so the key looks like the prompt text trlx uses
+        prompt_label[key.strip()] = summaries[i]
+
+    for i in tqdm(range(len(val_prompts))):
+        key = tokenizer.decode(
+            tokenizer(val_prompts[i], truncation=True, max_length=max_length)[
+                "input_ids"
+            ],
+            skip_special_tokens=True,
+        )  # truncate and decode so the key looks like the prompt text trlx uses
+        prompt_label[key.strip()] = val_summaries[i]
+
+    model = trlx.train(
+        config.model.model_path,
+        reward_fn=reward_fn,
+        prompts=prompts,
+        eval_prompts=val_prompts,
+        config=config,
+    )
diff --git a/trlx/data/configs.py b/trlx/data/configs.py
@@ -29,13 +29,17 @@ class ModelConfig:
     :param tokenizer_path: Path or name of the tokenizer (local or on huggingface hub)
     :type tokenizer_path: str
 
+    :param model_arch_type: Type of model architecture. Either "causal" or "seq2seq"
+    :type model_arch_type: str
+
     :param num_layers_unfrozen: Number of layers to unfreeze for fine-tuning.
         -1 means all layers are unfrozen.
     :type num_layers_unfrozen: int
     """
 
     model_path: str
     tokenizer_path: str
+    model_arch_type: str = "causal"
     num_layers_unfrozen: int = -1
 
     @classmethod
@@ -151,6 +155,7 @@ class TrainConfig:
 
     checkpoint_dir: str = "ckpts"
     rollout_logging_dir: Optional[str] = None
+    save_best: bool = True
 
     trackers: Tuple[str] = ("wandb",)
     seed: int = 1000