 from ...modeling_flash_attention_utils import _flash_attention_forward
 from ...modeling_outputs import (
     BaseModelOutputWithPast,
-    CausalLMOutputWithPast,
+    MoeCausalLMOutputWithPast,
+    MoeModelOutputWithPast,
 )
 from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS
 from ...modeling_utils import PreTrainedModel
 _CONFIG_FOR_DOC = "GraniteMoeConfig"


+# Copied from transformers.models.granite.modeling_granite._prepare_4d_causal_attention_mask_with_cache_position with Granite->GraniteMoe
+def _prepare_4d_causal_attention_mask_with_cache_position(
+    attention_mask: torch.Tensor,
+    sequence_length: int,
+    target_length: int,
+    dtype: torch.dtype,
+    device: torch.device,
+    min_dtype: float,
+    cache_position: torch.Tensor,
+    batch_size: int,
+):
+    """
+    Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
+    `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.
+
+    Args:
+        attention_mask (`torch.Tensor`):
+            A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape `(batch_size, 1, query_length, key_value_length)`.
+        sequence_length (`int`):
+            The sequence length being processed.
+        target_length (`int`):
+            The target length: when generating with static cache, the mask should be as long as the static cache, to account for the 0 padding, the part of the cache that is not filled yet.
+        dtype (`torch.dtype`):
+            The dtype to use for the 4D attention mask.
+        device (`torch.device`):
+            The device to place the 4D attention mask on.
+        min_dtype (`float`):
+            The minimum value representable with the dtype `dtype`.
+        cache_position (`torch.Tensor`):
+            Indices depicting the position of the input sequence tokens in the sequence.
+        batch_size (`int`):
+            Batch size.
+    """
+    if attention_mask is not None and attention_mask.dim() == 4:
+        # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
+        causal_mask = attention_mask
+    else:
+        causal_mask = torch.full((sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device)
+        if sequence_length != 1:
+            causal_mask = torch.triu(causal_mask, diagonal=1)
+        causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
+        causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
+        if attention_mask is not None:
+            causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
+            mask_length = attention_mask.shape[-1]
+            padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :]
+            padding_mask = padding_mask == 0
+            causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
+                padding_mask, min_dtype
+            )
+
+    return causal_mask
+
+
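For context, a minimal sketch of how the new helper behaves. The shapes, padding pattern, and device below are made-up illustrations, not values from this PR or its tests:

# Illustrative only: toy shapes; the second sequence is pretended to have two padding slots.
import torch

batch_size, sequence_length, target_length = 2, 4, 8
dtype = torch.float32
attention_mask = torch.ones(batch_size, target_length, dtype=torch.long)
attention_mask[1, 6:] = 0  # padding at the tail of the second sequence
cache_position = torch.arange(4, 8)  # the 4 query tokens occupy cache slots 4..7

causal_mask = _prepare_4d_causal_attention_mask_with_cache_position(
    attention_mask=attention_mask,
    sequence_length=sequence_length,
    target_length=target_length,
    dtype=dtype,
    device=torch.device("cpu"),
    min_dtype=torch.finfo(dtype).min,
    cache_position=cache_position,
    batch_size=batch_size,
)
print(causal_mask.shape)  # torch.Size([2, 1, 4, 8]); 0.0 where attending is allowed, dtype-min elsewhere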
 # Copied from transformers.models.jetmoe.modeling_jetmoe.load_balancing_loss_func
 def load_balancing_loss_func(
     gate_logits: torch.Tensor, num_experts: torch.Tensor = None, top_k=2, attention_mask: Optional[torch.Tensor] = None
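The body of this helper is not shown in the hunk; going by the JetMoe implementation it is copied from, it accepts a tuple of per-layer router logits and returns a scalar auxiliary loss. A rough usage sketch, where all shapes and expert counts are assumptions:

# Illustrative only: random logits stand in for the router outputs gathered from each MoE layer.
import torch

num_layers, num_tokens, num_experts = 2, 8, 4
gate_logits = tuple(torch.randn(num_tokens, num_experts) for _ in range(num_layers))
aux_loss = load_balancing_loss_func(gate_logits, num_experts=num_experts, top_k=2)
print(aux_loss)  # scalar tensor used as the router auxiliary loss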
@@ -708,7 +763,6 @@ def forward(
 }


-# Copied from transformers.models.granite.modeling_granite.GraniteDecoderLayer with GRANITE->GRANITEMOE,Granite->GraniteMoe
 class GraniteMoeDecoderLayer(nn.Module):
     def __init__(self, config: GraniteMoeConfig, layer_idx: int):
         super().__init__()
@@ -821,7 +875,6 @@ def forward(
     "The bare GraniteMoe Model outputting raw hidden-states without any specific head on top.",
     GRANITEMOE_START_DOCSTRING,
 )
-# Copied from transformers.models.granite.modeling_granite.GranitePreTrainedModel with Granite->GraniteMoe
 class GraniteMoePreTrainedModel(PreTrainedModel):
     config_class = GraniteMoeConfig
     base_model_prefix = "model"
@@ -929,7 +982,6 @@ def _init_weights(self, module):
     "The bare GraniteMoe Model outputting raw hidden-states without any specific head on top.",
     GRANITEMOE_START_DOCSTRING,
 )
-# Copied from transformers.models.granite.modeling_granite.GraniteModel with GRANITE->GRANITEMOE,Granite->GraniteMoe
 class GraniteMoeModel(GraniteMoePreTrainedModel):
     """
     Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`GraniteMoeDecoderLayer`]
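For orientation, a tiny instantiation sketch of this class. The MoE-specific config field names (`num_local_experts`, `num_experts_per_tok`) and all sizes are assumptions chosen to keep the snippet fast, not values taken from this PR:

# Illustrative only: made-up, deliberately small hyperparameters; the MoE config
# field names are assumptions and may differ from the final GraniteMoeConfig.
import torch
from transformers import GraniteMoeConfig, GraniteMoeModel  # available once this PR is merged

config = GraniteMoeConfig(
    vocab_size=128,
    hidden_size=64,
    intermediate_size=128,
    num_hidden_layers=2,
    num_attention_heads=4,
    num_key_value_heads=4,
    num_local_experts=4,     # assumed name
    num_experts_per_tok=2,   # assumed name
)
model = GraniteMoeModel(config)
hidden_states = model(torch.randint(0, config.vocab_size, (1, 8))).last_hidden_state
print(hidden_states.shape)  # (1, 8, 64)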
@@ -1180,11 +1232,9 @@ def _update_causal_mask(
         return causal_mask


-# Copied from transformers.models.granite.modeling_granite.GraniteForCausalLM with GRANITE->GRANITEMOE,Granite->GraniteMoe,granite->granitemoe
 class GraniteMoeForCausalLM(GraniteMoePreTrainedModel):
     _tied_weights_keys = ["lm_head.weight"]

-    # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.__init__ with Llama->GraniteMoe
     def __init__(self, config: GraniteMoeConfig):
         super().__init__(config)
         self.model = GraniteMoeModel(config)
@@ -1217,7 +1267,7 @@ def get_decoder(self):
         return self.model

     @add_start_docstrings_to_model_forward(GRANITEMOE_INPUTS_DOCSTRING)
-    @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
+    @replace_return_docstrings(output_type=MoeCausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
     def forward(
         self,
         input_ids: torch.LongTensor = None,
@@ -1232,7 +1282,7 @@ def forward(
         output_router_logits: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.LongTensor] = None,
-    ) -> Union[Tuple, CausalLMOutputWithPast]:
+    ) -> Union[Tuple, MoeCausalLMOutputWithPast]:
         r"""
         Args:
             labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
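Switching the return type from `CausalLMOutputWithPast` to `MoeCausalLMOutputWithPast` exposes the router outputs alongside the usual language-modeling fields. A minimal sketch of what a caller would see, assuming a constructed `model` and `input_ids` (both placeholders here) and the standard field names from `transformers.modeling_outputs`:

# Illustrative only: `model` is assumed to be a GraniteMoeForCausalLM and `input_ids`
# a (batch, seq_len) LongTensor; field names follow MoeCausalLMOutputWithPast.
outputs = model(input_ids, output_router_logits=True, return_dict=True)
outputs.logits         # (batch, seq_len, vocab_size) next-token logits
outputs.router_logits  # tuple of per-layer router logits, consumable by load_balancing_loss_func
outputs.aux_loss       # load-balancing auxiliary loss when output_router_logits=True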