Commit 7331d63

fix(ppo_gpt): prevent position_ids being None (#451)
* fix(ppo_gpt): prevent position_ids being None
* fix(ppo_modeling): pop `position_ids` argument if not required
* fix(ppo_modeling): add `device` argument for `OPTModelBranch`
* fix(modeling_ppo): de-complement if-condition
* fix(ppo_modeling): condition passing `device` in `OPTModelBranch`

Co-authored-by: reciprocated <[email protected]>
1 parent fa3e13e commit 7331d63

2 files changed: 38 additions & 23 deletions

examples/hh/README.md

Lines changed: 1 addition & 1 deletion
@@ -6,7 +6,7 @@ Launch training of [GPT-J](https://huggingface.co/EleutherAI/gpt-j-6B) on 7 GPUs
 ```sh
 accelerate launch --num_processes 7 --config_file ../../configs/accelerate/zero2-bf16.yaml ppo_hh.py
 ```
-Or if you want to train a smaller model or start from a supervised checkpoint, you can use one of the [configs](./configs)
+Or if you want to train a smaller model or start from a supervised checkpoint, you can use one of the [configs](../../configs)
 ```sh
 CONFIG_NAME=125M accelerate launch --num_processes 7 --config_file ../../configs/accelerate/zero2-bf16.yaml ppo_hh.py
 ```

trlx/models/modeling_ppo.py

Lines changed: 37 additions & 22 deletions
@@ -106,7 +106,7 @@ class PPOConfig(MethodConfig):
     :param vf_coef: Value loss scale w.r.t policy loss
     :type vf_coef: float
 
-    :param gen_kwargs: Additioanl kwargs for the generation
+    :param gen_kwargs: Additional kwargs for the generation
     :type gen_kwargs: Dict[str, Any]
 
     :param gen_experience_kwargs: if this is not None, then the experience is generated using this
@@ -445,7 +445,7 @@ def forward( # noqa: max-complexity
         """Reference:
         https://github.com/huggingface/transformers/blob/2411f0e465e761790879e605a4256f3d4afb7f82/src/transformers/models/gpt2/modeling_gpt2.py#L743 # noqa: E501
         """
-        batch_size = hidden_states.size()[0]
+        batch_size, seq_length = hidden_states.shape[:2]
 
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
@@ -457,7 +457,16 @@ def forward( # noqa: max-complexity
         device = hidden_states.device
 
         if past_key_values is None:
+            past_length = 0
             past_key_values = tuple([None] * len(self.decoder_blocks))
+        else:
+            past_length = past_key_values[0][0].size(-2)
+
+        if position_ids is None:
+            position_ids = torch.arange(past_length, seq_length + past_length, dtype=torch.long, device=device)
+            position_ids = position_ids.unsqueeze(0).view(-1, seq_length)
+        else:
+            position_ids = position_ids.view(-1, seq_length)
 
         if attention_mask is not None:
             if batch_size <= 0:
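The new block reproduces what the upstream GPT-2 model does internally when no `position_ids` are supplied: positions are counted from the length of the KV cache, so generation with `past_key_values` keeps advancing the position index instead of passing `None` through. A minimal standalone sketch of that derivation is below; the helper name `default_position_ids` and the toy cache shapes are illustrative, not part of the diff.

```python
import torch

def default_position_ids(past_key_values, seq_length, device):
    # Positions already covered by the KV cache (0 when there is no cache);
    # each cached key tensor has shape (batch, heads, past_length, head_dim).
    past_length = 0 if past_key_values is None else past_key_values[0][0].size(-2)
    position_ids = torch.arange(past_length, seq_length + past_length, dtype=torch.long, device=device)
    return position_ids.unsqueeze(0).view(-1, seq_length)

# No cache: positions start at 0 for a fresh 4-token input.
print(default_position_ids(None, seq_length=4, device="cpu"))   # tensor([[0, 1, 2, 3]])

# Toy cache of length 6: the single new token gets position 6.
cache = ((torch.zeros(1, 2, 6, 8), torch.zeros(1, 2, 6, 8)),)
print(default_position_ids(cache, seq_length=1, device="cpu"))  # tensor([[6]])
```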
@@ -498,28 +507,27 @@ def forward( # noqa: max-complexity
             if output_hidden_states:
                 all_hidden_states = all_hidden_states + (hidden_states,)
 
+            kwargs = dict(
+                layer_past=layer_past,
+                attention_mask=attention_mask,
+                position_ids=position_ids,
+                head_mask=head_mask[i],
+                encoder_hidden_states=encoder_hidden_states,
+                encoder_attention_mask=encoder_attention_mask,
+                use_cache=use_cache,
+                output_attentions=output_attentions,
+            )
+
             # Assumes we are never training the branch
             block_params = inspect.getfullargspec(block.forward).args
-            if "encoder_hidden_states" in block_params:
-                outputs = block(
-                    hidden_states,
-                    layer_past=layer_past,
-                    attention_mask=attention_mask,
-                    head_mask=head_mask[i],
-                    encoder_hidden_states=encoder_hidden_states,
-                    encoder_attention_mask=encoder_attention_mask,
-                    use_cache=use_cache,
-                    output_attentions=output_attentions,
-                )
-            else:
-                outputs = block(
-                    hidden_states,
-                    layer_past=layer_past,
-                    attention_mask=attention_mask,
-                    head_mask=head_mask[i],
-                    use_cache=use_cache,
-                    output_attentions=output_attentions,
-                )
+            if "encoder_hidden_states" not in block_params:
+                kwargs.pop("encoder_hidden_states")
+                kwargs.pop("encoder_attention_mask")
+            # Remove position_ids for GPT2Block
+            if "position_ids" not in block_params:
+                kwargs.pop("position_ids")
+
+            outputs = block(hidden_states, **kwargs)
 
             hidden_states = outputs[0]
             if use_cache is True:
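This hunk collapses the two near-identical `block(...)` calls into a single kwargs dict that is then pruned to whatever the block's `forward` signature actually declares, which is how `position_ids` gets dropped for `GPT2Block` but still reaches blocks that require it. A small standalone sketch of the same pattern follows; the two stand-in block functions are hypothetical, and the sketch filters by an allow-list rather than popping specific keys as the diff does.

```python
import inspect

# Stand-ins for two block signatures: GPT-2-style (no position_ids) and GPT-J/NeoX-style.
def gpt2_style_block(hidden_states, attention_mask=None, use_cache=False):
    return "gpt2-style", hidden_states

def gptj_style_block(hidden_states, position_ids=None, attention_mask=None, use_cache=False):
    return "gptj-style", hidden_states, position_ids

def call_block(block, hidden_states, **kwargs):
    # Keep only the keyword arguments the callee's signature declares.
    accepted = inspect.getfullargspec(block).args
    kwargs = {name: value for name, value in kwargs.items() if name in accepted}
    return block(hidden_states, **kwargs)

# One call site drives both blocks; position_ids is silently dropped for the GPT-2-style one.
print(call_block(gpt2_style_block, "h", position_ids=[0, 1], attention_mask=None))
print(call_block(gptj_style_block, "h", position_ids=[0, 1], attention_mask=None))
```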
@@ -594,10 +602,17 @@ def forward( # noqa: max-complexity
         input_shape = hidden_states.size()[:-1]
         combined_attention_mask = None
         if input_shape[-1] > 1:
+            # `modeling_opt._make_causal_mask` @ transformers==4.27.1 doesn't have the `device` argument
+            if "device" in inspect.getfullargspec(modeling_opt._make_causal_mask).args:
+                kwargs = dict(device=hidden_states.device)
+            else:
+                kwargs = {}
+
             combined_attention_mask = modeling_opt._make_causal_mask(
                 input_shape,
                 hidden_states.dtype,
                 past_key_values_length=past_key_values_length,
+                **kwargs,
             ).to(hidden_states.device)
 
         if attention_mask is not None:
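The OPT branch has the inverse compatibility problem: `_make_causal_mask` only gained a `device` parameter in newer `transformers` releases, so the diff probes the signature before deciding whether to pass it. A generic sketch of that version-tolerant call pattern is below; the two `make_causal_mask_*` functions are stand-ins for the old and new helper signatures, not the real `transformers` code.

```python
import inspect
import torch

# Stand-ins for the helper before and after it gained a `device` parameter.
def make_causal_mask_old(input_shape, dtype, past_key_values_length=0):
    return torch.zeros(*input_shape, dtype=dtype)

def make_causal_mask_new(input_shape, dtype, device=None, past_key_values_length=0):
    return torch.zeros(*input_shape, dtype=dtype, device=device)

def build_mask(make_causal_mask, input_shape, dtype, device):
    # Pass `device` only if the callee's signature declares it.
    kwargs = {"device": device} if "device" in inspect.getfullargspec(make_causal_mask).args else {}
    return make_causal_mask(input_shape, dtype, past_key_values_length=0, **kwargs).to(device)

print(build_mask(make_causal_mask_old, (2, 3), torch.float32, "cpu").shape)  # torch.Size([2, 3])
print(build_mask(make_causal_mask_new, (2, 3), torch.float32, "cpu").shape)  # torch.Size([2, 3])
```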
