
Commit 3900999

rerun #89 (#92)
This commit refactors out the ILQL loss function and model additions so they can be reused with other accelerator libraries. It also makes the loss slightly clearer and fixes some type errors. First part of #75. W&B run: https://wandb.ai/carperai/trlx/runs/3tam2www
1 parent ea38a94 commit 3900999

File tree

10 files changed: +241, −239 lines


configs/ilql_config.yml

Lines changed: 4 additions & 4 deletions
@@ -1,7 +1,7 @@
 model:
-  model_path : "gpt2"
+  model_path: "gpt2"
   tokenizer_path: "gpt2"
-  model_type : "ILQLModel"
+  model_type: "ILQLModel"
   num_layers_unfrozen: -1

 train:
@@ -19,8 +19,8 @@ train:
   checkpoint_interval: 1000
   eval_interval: 128

-  pipeline : "OfflinePipeline"
-  orchestrator : "OfflineOrchestrator"
+  pipeline: "OfflinePipeline"
+  orchestrator: "OfflineOrchestrator"
   seed: 1000

 method:
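For context, this YAML feeds trlx's typed config loader, which resolves `model_type` and the method section through the registries touched below. A minimal sketch of reading it, assuming `TRLConfig.load_yaml` is the loader entry point in `trlx.data.configs`:

```python
from trlx.data.configs import TRLConfig

# A minimal sketch, assuming TRLConfig.load_yaml reads the YAML above
# into a typed config object.
config = TRLConfig.load_yaml("configs/ilql_config.yml")
assert config.model.model_path == "gpt2"
assert config.model.model_type == "ILQLModel"
```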

examples/ilql_sentiments.py

Lines changed: 0 additions & 2 deletions
@@ -1,5 +1,3 @@
-# Generates positive movie reviews by learning from sentiment-labeled IMDB dataset
-
 from datasets import load_dataset
 from transformers import pipeline

trlx/data/ilql_types.py

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 from dataclasses import dataclass

-from torchtyping import TensorType
+from torchtyping import TensorType # type: ignore


 @dataclass
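For reference, the dataclass behind this import looks roughly like the sketch below. The field names follow the attribute accesses in `AccelerateILQLModel.loss` later in this commit; the annotated shapes are illustrative assumptions, not taken from this diff:

```python
from dataclasses import dataclass

from torchtyping import TensorType  # type: ignore


@dataclass
class ILQLBatch:
    # Shapes are illustrative assumptions based on how the loss indexes the batch.
    input_ids: TensorType["batch", "tokens"]
    attention_mask: TensorType["batch", "tokens"]
    rewards: TensorType["batch", "actions"]
    states_ixs: TensorType["batch", "states"]
    actions_ixs: TensorType["batch", "actions"]
    dones: TensorType["batch", "states"]
```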

trlx/data/method_configs.py

Lines changed: 8 additions & 43 deletions
@@ -3,7 +3,7 @@
 from typing import Any, Callable, Dict, List

 # specifies a dictionary of method configs
-_METHODS: Dict[str, any] = {} # registry
+_METHODS: Dict[str, Any] = {} # registry


 def register_method(name):
@@ -28,17 +28,6 @@ def register_class(cls, name):
     return cls


-def get_method(name: str) -> Callable:
-    """
-    Return constructor for specified method config
-    """
-    name = name.lower()
-    if name in _METHODS:
-        return _METHODS[name]
-    else:
-        raise Exception("Error: Trying to access a method that has not been registered")
-
-
 @dataclass
 @register_method
 class MethodConfig:
@@ -56,36 +45,12 @@ def from_dict(cls, config: Dict[str, Any]):
         return cls(**config)


-@dataclass
-@register_method
-class ILQLConfig(MethodConfig):
+def get_method(name: str) -> MethodConfig:
     """
-    Config for ILQL method
-
-    :param tau: Control tradeoff in value loss between punishing value network for underestimating the target Q (i.e. Q value corresponding to the action taken) (high tau) and overestimating the target Q (low tau)
-    :type tau: float
-
-    :param gamma: Discount factor for future rewards
-    :type gamma: float
-
-    :param cql_scale: Weight for CQL loss term
-    :type cql_scale: float
-
-    :param awac_scale: Weight for AWAC loss term
-    :type awac_scale: float
-
-    :param steps_for_target_q_sync: Number of steps to wait before syncing target Q network with Q network
-    :type steps_for_target_q_sync: int
-
-    :param two_qs: Use minimum of two Q-value estimates
-    :type two_qs: bool
+    Return constructor for specified method config
     """
-
-    tau: float
-    gamma: float
-    cql_scale: float
-    awac_scale: float
-    alpha: float
-    steps_for_target_q_sync: int
-    betas: List[float]
-    two_qs: bool
+    name = name.lower()
+    if name in _METHODS:
+        return _METHODS[name]
+    else:
+        raise Exception("Error: Trying to access a method that has not been registered")
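The relocated `get_method` pairs with the `register_method` decorator to resolve a name from YAML into a config class. A simplified, self-contained sketch of the pattern (the demo class is hypothetical; in trlx the registrants are `MethodConfig` subclasses, and the real decorator also accepts an explicit name string):

```python
from dataclasses import dataclass
from typing import Any, Dict

_METHODS: Dict[str, Any] = {}  # name -> config class, as in method_configs.py


def register_method(cls):
    # Simplified: register under the lowercased class name.
    _METHODS[cls.__name__.lower()] = cls
    return cls


@dataclass
@register_method
class DemoConfig:  # hypothetical stand-in for a MethodConfig subclass
    lr: float = 1e-4


def get_method(name: str):
    name = name.lower()
    if name in _METHODS:
        return _METHODS[name]
    raise Exception("Error: Trying to access a method that has not been registered")


print(get_method("DemoConfig")(lr=3e-4))  # DemoConfig(lr=0.0003)
```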

trlx/model/__init__.py

Lines changed: 8 additions & 4 deletions
@@ -1,7 +1,7 @@
 import os
 import sys
 from abc import abstractmethod
-from typing import Callable, Dict, Iterable
+from typing import Any, Callable, Dict, Iterable

 import torch

@@ -11,11 +11,11 @@
 from trlx.utils import safe_mkdir

 # specifies a dictionary of architectures
-_MODELS: Dict[str, any] = {} # registry
+_MODELS: Dict[str, Any] = {} # registry


 def register_model(name):
-    """Decorator used register a CARP architecture
+    """Decorator used to register an architecture
     Args:
         name: Name of the architecture
     """
@@ -46,6 +46,10 @@ def __init__(self, config: TRLConfig, train_mode=False):
     def push_to_store(self, data):
         self.store.push(data)

+    def add_eval_pipeline(self, eval_pipeline):
+        """Adds a pipeline with validation prompts"""
+        self.eval_pipeline = eval_pipeline
+
     @abstractmethod
     def act(self, data: RLElement) -> RLElement:
         """
@@ -92,7 +96,7 @@ def learn(
         pass

     @abstractmethod
-    def get_components(self) -> Dict[str, any]:
+    def get_components(self) -> Dict[str, Any]:
         """
         Get pytorch components (mainly for saving/loading)
         """

trlx/model/accelerate_base_model.py

Lines changed: 7 additions & 4 deletions
@@ -2,11 +2,11 @@
 import os
 from abc import abstractmethod
 from time import time
-from typing import Dict, Iterable, Tuple
+from typing import Any, Dict, Iterable, Sequence, Tuple, Union

 import torch
 import torch.nn.functional as F
-from accelerate import Accelerator
+from accelerate import Accelerator # type: ignore
 from transformers import AutoTokenizer

 import wandb
@@ -92,10 +92,13 @@ def __init__(self, config, train_mode=True):
             eta_min=self.config.train.lr_target,
         )

-    def tokenize(self, text: Iterable[str]):
+    def tokenize(self, text: Union[Sequence[str], Sequence[torch.LongTensor]]):
         """
         Tokenize a batch of text after adding bos token to each of the samples
         """
+        if isinstance(text[0], torch.LongTensor):
+            return text
+
         text = [self.tokenizer.bos_token + txt for txt in text]
         return self.tokenizer(
             text,
@@ -117,7 +120,7 @@ def generate(self, input_ids, attention_mask=None, **kwargs):
             input_ids=input_ids, attention_mask=attention_mask, **kwargs
         )

-    def get_components(self) -> Dict[str, any]:
+    def get_components(self) -> Dict[str, Any]:
         components = (
             {"model": self.model, "opt": self.opt, "scheduler": self.scheduler}
             if self.train_mode
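The `tokenize` change makes the method a no-op for inputs that are already token tensors, so callers can pass raw strings or pre-tokenized prompts interchangeably. A minimal standalone sketch of the behavior (the pad-token setup and the tokenizer kwargs are assumptions, since the diff truncates the tokenizer call):

```python
import torch
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # gpt2 ships without a pad token


def tokenize(text):
    # Already-tokenized samples pass through unchanged, as in this commit
    if isinstance(text[0], torch.LongTensor):
        return text
    text = [tokenizer.bos_token + txt for txt in text]
    return tokenizer(text, truncation=True, padding=True, return_tensors="pt")


print(tokenize(["a short movie review"]).input_ids)  # batch of token ids
print(tokenize([torch.LongTensor([50256, 64])]))     # returned as-is
```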

trlx/model/accelerate_ilql_model.py

Lines changed: 21 additions & 109 deletions
@@ -1,10 +1,14 @@
-from typing import Iterable, Union
+from typing import Iterable, Sequence, Union, cast

 import torch
 import torch.nn.functional as F

+
 from trlx.model import register_model
-from trlx.model.nn.ilql_models import CausalLMWithValueHeads
+from trlx.model.nn.ilql_models import ILQLConfig, CausalLMWithValueHeads
+from trlx.data.ilql_types import ILQLBatch
+from trlx.data.configs import TRLConfig
+from trlx.utils import to_device

 from .accelerate_base_model import AccelerateRLModel

@@ -13,7 +17,7 @@
 class AccelerateILQLModel(AccelerateRLModel):
     def __init__(
         self,
-        config,
+        config: TRLConfig,
         logit_mask=None,
         metric_fn=None,
         train_mode=True,
@@ -22,16 +26,20 @@ def __init__(
         self.logit_mask = logit_mask
         self.metric_fn = metric_fn
         self.reward_fn = None
-        self.params = config.method
+
+        if not isinstance(config.method, ILQLConfig):
+            raise ValueError("config.method must be ILQLConfig")
+
+        self.ilql: ILQLConfig = cast(ILQLConfig, config.method)

     def get_arch(self, config):
         return CausalLMWithValueHeads(
             config.model.model_path,
-            params=config.method,
+            ilql_config=config.method,
             num_layers_unfrozen=config.model.num_layers_unfrozen,
         )

-    def tokenize(self, texts: Union[Iterable[str], Iterable[torch.LongTensor]]):
+    def tokenize(self, texts: Union[Sequence[str], Sequence[torch.LongTensor]]):
         if isinstance(texts[0], torch.LongTensor):
             return texts

@@ -47,113 +55,17 @@ def post_backward_callback(self):
         if self.iter_count % self.config.method.steps_for_target_q_sync == 0:
             self.accelerator.unwrap_model(self.model).sync_target_q_heads()

-    def loss(self, batch):
-        input_ids = batch.input_ids.to(self.accelerator.device)
-        attn = batch.attention_mask.to(self.accelerator.device)
-        rewards = batch.rewards.to(self.accelerator.device)
-        states_ixs = batch.states_ixs.to(self.accelerator.device)
-        actions_ixs = batch.actions_ixs.to(self.accelerator.device)
-        dones = batch.dones.to(self.accelerator.device)
+    def loss(self, batch: ILQLBatch):
+        batch = to_device(batch, self.accelerator.device)

         logits, qs, target_qs, vs, _ = self.model(
-            input_ids=input_ids,
-            attention_mask=attn,
-            actions_ixs=actions_ixs,
-            states_ixs=states_ixs,
+            input_ids=batch.input_ids,
+            attention_mask=batch.attention_mask,
+            actions_ixs=batch.actions_ixs,
+            states_ixs=batch.states_ixs,
         )

-        actions = input_ids[:, 1:].gather(dim=1, index=actions_ixs).unsqueeze(-1)
-        bsize, ntokens, dsize = logits.shape
-
-        # compute two separate q-value estimates, to then select minimum values from both
-        if self.params.two_qs:
-            Q1 = qs[0].gather(-1, actions).squeeze(-1)
-            Q2 = qs[1].gather(-1, actions).squeeze(-1)
-
-            targetQ1 = target_qs[0].gather(-1, actions).squeeze(-1).detach()
-            targetQ2 = target_qs[1].gather(-1, actions).squeeze(-1).detach()
-            targetQ = torch.minimum(targetQ1, targetQ2)
-        else:
-            Q = qs.gather(-1, actions).squeeze(-1)
-            targetQ = target_qs.gather(-1, actions).squeeze(-1).detach()
-
-        terminal_mask = dones[:, :-1]
-        n_nonterminal = max(1, terminal_mask.sum())
-
-        # values of current states
-        V = vs[:, :-1].squeeze()
-        # values of next states
-        Vnext = vs[:, 1:].squeeze() * dones[:, 1:]
-        # target to fit Q
-        Q_ = rewards + self.params.gamma * Vnext.detach()
-
-        if self.params.two_qs:
-            loss_q1 = ((Q1 - Q_) * terminal_mask).pow(2).sum() / n_nonterminal
-            loss_q2 = ((Q2 - Q_) * terminal_mask).pow(2).sum() / n_nonterminal
-            loss_q = loss_q1 + loss_q2
-        else:
-            loss_q = ((Q - Q_) * terminal_mask).pow(2).sum() / n_nonterminal
-
-        targetQ = targetQ.detach()
-
-        loss_v = (
-            (
-                (targetQ >= V).int() * self.params.tau * (targetQ - V).pow(2)
-                + (targetQ < V).int() * (1 - self.params.tau) * (targetQ - V).pow(2)
-            )
-            * terminal_mask
-        ).sum() / n_nonterminal
-
-        if self.params.two_qs:
-            nactions = qs[0].shape[1]
-            loss_cql_q1 = (
-                F.cross_entropy(
-                    qs[0].reshape(-1, dsize),
-                    actions.reshape(-1),
-                    reduction="none",
-                ).reshape(bsize, nactions)
-                * terminal_mask
-            ).sum() / n_nonterminal
-            loss_cql_q2 = (
-                F.cross_entropy(
-                    qs[1].reshape(-1, dsize),
-                    actions.reshape(-1),
-                    reduction="none",
-                ).reshape(bsize, nactions)
-                * terminal_mask
-            ).sum() / n_nonterminal
-            loss_cql = loss_cql_q1 + loss_cql_q2
-        else:
-            nactions = qs.shape[1]
-            loss_cql = (
-                F.cross_entropy(
-                    qs.reshape(-1, dsize), actions.reshape(-1), reduction="none"
-                ).reshape(bsize, nactions)
-                * terminal_mask
-            ).sum() / n_nonterminal
-
-        loss_awac = (
-            F.cross_entropy(
-                logits[:, :-1, :].reshape(-1, dsize),
-                input_ids[:, 1:].reshape(-1),
-                reduction="none",
-            ).reshape(bsize, ntokens - 1)
-            * attn[:, 1:]
-        ).sum() / attn[:, 1:].sum()
-
-        loss = (
-            loss_q
-            + loss_v
-            + self.params.cql_scale * loss_cql
-            + self.params.awac_scale * loss_awac
-        )
-        stats = {
-            f"losses/{k}": v
-            for k, v in locals().items()
-            if k in ["loss", "loss_v", "loss_q", "loss_cql", "loss_awac"]
-        }
-
-        return loss, stats
+        return self.ilql.loss((logits, (qs, target_qs, vs)), batch)

     def prepare_learning(self):
         train_dataloader = self.store.create_loader(self.config.train.batch_size)
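Two things carry the weight of this refactor. First, the new `to_device` utility replaces six per-field `.to(device)` calls. A hedged sketch of such a helper for dataclass batches (the actual implementation in `trlx.utils` may differ):

```python
from dataclasses import fields, is_dataclass, replace

import torch


def to_device(batch, device):
    """Move every tensor field of a dataclass batch to `device`.

    Sketch of the helper imported from trlx.utils; the real one may differ.
    """
    assert is_dataclass(batch)
    moved = {
        f.name: getattr(batch, f.name).to(device)
        for f in fields(batch)
        if torch.is_tensor(getattr(batch, f.name))
    }
    return replace(batch, **moved)
```

Second, the deleted body above is exactly what the new `ILQLConfig.loss` in `trlx/model/nn/ilql_models.py` now encapsulates: a Q loss fitting Q(s, a) to r + γ·V(s′), an expectile value loss weighted by τ, a CQL cross-entropy regularizer scaled by `cql_scale`, and an AWAC behavior-cloning term scaled by `awac_scale`, summed into the total loss.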
