- from typing import Iterable, Sequence, Union, cast
+ from typing import Iterable, Union

import torch
import torch.nn.functional as F

-
from trlx.model import register_model
- from trlx.model.nn.ilql_models import ILQLConfig, CausalLMWithValueHeads
- from trlx.data.ilql_types import ILQLBatch
- from trlx.data.configs import TRLConfig
- from trlx.utils import to_device
+ from trlx.model.nn.ilql_models import CausalLMWithValueHeads

from .accelerate_base_model import AccelerateRLModel

@@ -17,7 +13,7 @@
class AccelerateILQLModel(AccelerateRLModel):
    def __init__(
        self,
-         config: TRLConfig,
+         config,
        logit_mask=None,
        metric_fn=None,
        train_mode=True,
@@ -26,20 +22,16 @@ def __init__(
        self.logit_mask = logit_mask
        self.metric_fn = metric_fn
        self.reward_fn = None
-
-         if not isinstance(config.method, ILQLConfig):
-             raise ValueError("config.method must be ILQLConfig")
-
-         self.ilql: ILQLConfig = cast(ILQLConfig, config.method)
+         self.params = config.method

    def get_arch(self, config):
        return CausalLMWithValueHeads(
            config.model.model_path,
-             ilql_config=config.method,
+             params=config.method,
            num_layers_unfrozen=config.model.num_layers_unfrozen,
        )

-     def tokenize(self, texts: Union[Sequence[str], Sequence[torch.LongTensor]]):
+     def tokenize(self, texts: Union[Iterable[str], Iterable[torch.LongTensor]]):
        if isinstance(texts[0], torch.LongTensor):
            return texts

@@ -55,17 +47,113 @@ def post_backward_callback(self):
        if self.iter_count % self.config.method.steps_for_target_q_sync == 0:
            self.accelerator.unwrap_model(self.model).sync_target_q_heads()

-     def loss(self, batch: ILQLBatch):
-         batch = to_device(batch, self.accelerator.device)
+     def loss(self, batch):
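+         # move every tensor in the ILQL batch onto the accelerator device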
+         input_ids = batch.input_ids.to(self.accelerator.device)
+         attn = batch.attention_mask.to(self.accelerator.device)
+         rewards = batch.rewards.to(self.accelerator.device)
+         states_ixs = batch.states_ixs.to(self.accelerator.device)
+         actions_ixs = batch.actions_ixs.to(self.accelerator.device)
+         dones = batch.dones.to(self.accelerator.device)

        logits, qs, target_qs, vs, _ = self.model(
-             input_ids=batch.input_ids,
-             attention_mask=batch.attention_mask,
-             actions_ixs=batch.actions_ixs,
-             states_ixs=batch.states_ixs,
+             input_ids=input_ids,
+             attention_mask=attn,
+             actions_ixs=actions_ixs,
+             states_ixs=states_ixs,
        )

-         return self.ilql.loss((logits, (qs, target_qs, vs)), batch)
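+         # ids of the tokens actually taken as actions (the next token at each action index)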
+         actions = input_ids[:, 1:].gather(dim=1, index=actions_ixs).unsqueeze(-1)
+         bsize, ntokens, dsize = logits.shape
+
+         # compute two separate q-value estimates, to then select minimum values from both
+         if self.params.two_qs:
+             Q1 = qs[0].gather(-1, actions).squeeze(-1)
+             Q2 = qs[1].gather(-1, actions).squeeze(-1)
+
+             targetQ1 = target_qs[0].gather(-1, actions).squeeze(-1).detach()
+             targetQ2 = target_qs[1].gather(-1, actions).squeeze(-1).detach()
+             targetQ = torch.minimum(targetQ1, targetQ2)
+         else:
+             Q = qs.gather(-1, actions).squeeze(-1)
+             targetQ = target_qs.gather(-1, actions).squeeze(-1).detach()
+
+         terminal_mask = dones[:, :-1]
+         n_nonterminal = max(1, terminal_mask.sum())
+
+         # values of current states
+         V = vs[:, :-1].squeeze()
+         # values of next states
+         Vnext = vs[:, 1:].squeeze() * dones[:, 1:]
+         # target to fit Q
+         Q_ = rewards + self.params.gamma * Vnext.detach()
+
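+         # regress Q of the taken actions (both heads when two_qs) onto the one-step
+         # target Q_ above, averaged over non-terminal positions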
+         if self.params.two_qs:
+             loss_q1 = ((Q1 - Q_) * terminal_mask).pow(2).sum() / n_nonterminal
+             loss_q2 = ((Q2 - Q_) * terminal_mask).pow(2).sum() / n_nonterminal
+             loss_q = loss_q1 + loss_q2
+         else:
+             loss_q = ((Q - Q_) * terminal_mask).pow(2).sum() / n_nonterminal
+
+         targetQ = targetQ.detach()
+
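+         # expectile regression of V toward targetQ: the (targetQ >= V) side is weighted
+         # by tau, the (targetQ < V) side by (1 - tau)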
+         loss_v = (
+             (
+                 (targetQ >= V).int() * self.params.tau * (targetQ - V).pow(2)
+                 + (targetQ < V).int() * (1 - self.params.tau) * (targetQ - V).pow(2)
+             )
+             * terminal_mask
+         ).sum() / n_nonterminal
+
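+         # CQL regularizer: treat the q-values over the vocabulary as logits and push up
+         # the Q of the taken action relative to all other tokens via cross-entropy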
+         if self.params.two_qs:
+             nactions = qs[0].shape[1]
+             loss_cql_q1 = (
+                 F.cross_entropy(
+                     qs[0].reshape(-1, dsize),
+                     actions.reshape(-1),
+                     reduction="none",
+                 ).reshape(bsize, nactions)
+                 * terminal_mask
+             ).sum() / n_nonterminal
+             loss_cql_q2 = (
+                 F.cross_entropy(
+                     qs[1].reshape(-1, dsize),
+                     actions.reshape(-1),
+                     reduction="none",
+                 ).reshape(bsize, nactions)
+                 * terminal_mask
+             ).sum() / n_nonterminal
+             loss_cql = loss_cql_q1 + loss_cql_q2
+         else:
+             nactions = qs.shape[1]
+             loss_cql = (
+                 F.cross_entropy(
+                     qs.reshape(-1, dsize), actions.reshape(-1), reduction="none"
+                 ).reshape(bsize, nactions)
+                 * terminal_mask
+             ).sum() / n_nonterminal
+
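+         # AWAC term: standard next-token cross-entropy on the LM logits, masked and
+         # normalized by the attention mask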
+         loss_awac = (
+             F.cross_entropy(
+                 logits[:, :-1, :].reshape(-1, dsize),
+                 input_ids[:, 1:].reshape(-1),
+                 reduction="none",
+             ).reshape(bsize, ntokens - 1)
+             * attn[:, 1:]
+         ).sum() / attn[:, 1:].sum()
+
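+         # total objective: Q regression + expectile V loss + scaled CQL and AWAC terms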
+         loss = (
+             loss_q
+             + loss_v
+             + self.params.cql_scale * loss_cql
+             + self.params.awac_scale * loss_awac
+         )
+         stats = {
+             f"losses/{k}": v
+             for k, v in locals().items()
+             if k in ["loss", "loss_v", "loss_q", "loss_cql", "loss_awac"]
+         }
+
+         return loss, stats

    def prepare_learning(self):
        train_dataloader = self.store.create_loader(self.config.train.batch_size)