
Commit 0d872f1

add flashmask in modeling_pp
1 parent 90f1c66 commit 0d872f1


paddlenlp/transformers/qwen2/modeling_pp.py

Lines changed: 71 additions & 16 deletions
@@ -12,6 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+
+from typing import OrderedDict
+
 import paddle
 import paddle.distributed.fleet as fleet
 import paddle.nn as nn
@@ -41,32 +44,37 @@
 
 def parse_args(args):
     if isinstance(args, tuple):
-        if len(args) == 3:
-            hidden_states, attention_mask, position_ids = args
+        if len(args) == 4:
+            hidden_states, attention_mask, attn_mask_startend_row_indices, position_ids = args
+        elif len(args) == 3:
+            hidden_states, attention_mask, attn_mask_startend_row_indices = args
+            position_ids = None
         elif len(args) == 2:
             hidden_states, attention_mask = args
-            position_ids = None
-        elif len(args) == 1:
-            hidden_states = args
-            attention_mask, position_ids = None, None
+            attn_mask_startend_row_indices, position_ids = None, None
     else:
         hidden_states = args
-        attention_mask, position_ids = None, None
+        attention_mask, attn_mask_startend_row_indices, position_ids = None, None, None
 
     if position_ids is not None:
         position_ids.stop_gradient = True
 
     if attention_mask is not None:
         attention_mask.stop_gradient = True
 
-    return hidden_states, attention_mask, position_ids
+    if attn_mask_startend_row_indices is not None:
+        attn_mask_startend_row_indices.stop_gradient = True
+
+    return hidden_states, attention_mask, attn_mask_startend_row_indices, position_ids
 
 
-def return_args(hidden_states, attention_mask=None, position_ids=None):
+def return_args(hidden_states, attention_mask=None, attn_mask_startend_row_indices=None, position_ids=None):
     ret = (hidden_states,)
 
     if attention_mask is not None:
         ret += (attention_mask.clone(),)
+    if attn_mask_startend_row_indices is not None:
+        ret += (attn_mask_startend_row_indices.clone(),)
     if position_ids is not None:
         ret += (position_ids.clone(),)
     if len(ret) == 1:
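
Note on the convention above: pipeline stages exchange a plain tuple, return_args appends only the entries that are not None, and parse_args restores them to named variables and marks them with stop_gradient. Below is a minimal sketch of that round trip, not part of this commit; the tensor shapes are hypothetical, and only the int32 dtype of the row-index tensor is taken from the dtype checks added later in this diff.

# Sketch (not part of this commit): round-tripping the new 4-slot convention.
from paddlenlp.transformers.qwen2.modeling_pp import parse_args, return_args
import paddle

hidden_states = paddle.randn([2, 16, 64])
attention_mask = paddle.ones([2, 1, 16, 16], dtype="bool")
attn_mask_startend_row_indices = paddle.full([2, 1, 16], 16, dtype="int32")  # assumed shape
position_ids = paddle.arange(16).unsqueeze(0).tile([2, 1])

args = return_args(hidden_states, attention_mask, attn_mask_startend_row_indices, position_ids)
h, mask, row_idx, pos = parse_args(args)  # all four come back; stop_gradient is set on mask/row_idx/pos

# When some entries are None they are simply dropped from the tuple, so a downstream
# stage cannot rely on position alone; see the dtype checks in Qwen2DecoderLayerPipe below.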
@@ -112,7 +120,7 @@ def forward(self, args):
         Returns:
             _type_: _description_
         """
-        input_ids, attention_mask, position_ids = parse_args(args)
+        input_ids, attention_mask, attn_mask_startend_row_indices, position_ids = parse_args(args)
         input_embeds = self.embed_tokens(input_ids)
         if self.config.sequence_parallel:
             from paddlenlp.transformers import ScatterOp
@@ -126,6 +134,10 @@ def forward(self, args):
         batch_size, seq_length = input_ids.shape
 
         if attention_mask is not None:
+            assert (
+                attn_mask_startend_row_indices is None
+            ), "attention_mask and attn_mask_startend_row_indices can not be set at same time"
+
             attention_mask = Qwen2Model._prepare_decoder_attention_mask(
                 attention_mask, (batch_size, seq_length), 0, input_embeds.dtype
             )
@@ -136,22 +148,34 @@ def forward(self, args):
             attention_mask = paddle.tril(paddle.ones((seq_length, seq_length), dtype="bool"))
             attention_mask.stop_gradient = True
 
-        return return_args(input_embeds, attention_mask, position_ids)
+        return return_args(input_embeds, attention_mask, attn_mask_startend_row_indices, position_ids)
 
 
 class Qwen2DecoderLayerPipe(Qwen2DecoderLayer):
     def forward(self, args):
-        hidden_states, attention_mask, position_ids = parse_args(args)
+        hidden_states, attention_mask, attn_mask_startend_row_indices, position_ids = parse_args(args)
 
         has_gradient = not hidden_states.stop_gradient
 
+        if attention_mask is not None and attention_mask.dtype == paddle.int32:
+            attention_mask, attn_mask_startend_row_indices, position_ids = (
+                None,
+                attention_mask,
+                attn_mask_startend_row_indices,
+            )
+        elif attention_mask is not None and attention_mask.dtype == paddle.int64:
+            attention_mask, attn_mask_startend_row_indices, position_ids = None, None, attention_mask
+        elif attn_mask_startend_row_indices is not None and attn_mask_startend_row_indices.dtype == paddle.int64:
+            attn_mask_startend_row_indices, position_ids = None, attn_mask_startend_row_indices
+
         if self.enable_recompute and self.config.recompute_granularity == "full" and has_gradient:
-            if attention_mask is not None:
+            if attention_mask is not None or attn_mask_startend_row_indices is not None:
                 hidden_states = recompute(
                     super().forward,
                     hidden_states,
                     position_ids=position_ids,
                     attention_mask=attention_mask,
+                    attn_mask_startend_row_indices=attn_mask_startend_row_indices,
                     use_reentrant=False,
                 )
             else:
@@ -160,12 +184,18 @@ def forward(self, args):
                     super().forward,
                     hidden_states,
                     position_ids=position_ids,
+                    attn_mask_startend_row_indices=attn_mask_startend_row_indices,
                     use_reentrant=self.config.recompute_use_reentrant,
                 )
         else:
-            hidden_states = super().forward(hidden_states, position_ids=position_ids, attention_mask=attention_mask)
+            hidden_states = super().forward(
+                hidden_states,
+                position_ids=position_ids,
+                attention_mask=attention_mask,
+                attn_mask_startend_row_indices=attn_mask_startend_row_indices,
+            )
 
-        return return_args(hidden_states, attention_mask, position_ids)
+        return return_args(hidden_states, attention_mask, attn_mask_startend_row_indices, position_ids)
 
 
 class Qwen2RMSNormPipe(nn.Layer):
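
The int32/int64 checks at the top of Qwen2DecoderLayerPipe.forward exist because return_args drops None entries, so a later stage can receive the remaining tensors shifted into earlier slots; the layer then re-sorts them by dtype (int32 in the mask slot is treated as the FlashMask row indices, int64 as position_ids, anything else stays as a dense attention mask). The sketch below reproduces that routing in isolation; the helper name route_pipe_args is hypothetical and not part of the commit.

# Sketch (hypothetical helper, mirrors the dtype routing added above).
import paddle

def route_pipe_args(attention_mask, attn_mask_startend_row_indices, position_ids):
    # An int32 tensor that arrived in the attention_mask slot is really the
    # FlashMask start/end row indices; an int64 tensor is really position_ids.
    if attention_mask is not None and attention_mask.dtype == paddle.int32:
        return None, attention_mask, attn_mask_startend_row_indices
    if attention_mask is not None and attention_mask.dtype == paddle.int64:
        return None, None, attention_mask
    if attn_mask_startend_row_indices is not None and attn_mask_startend_row_indices.dtype == paddle.int64:
        return attention_mask, None, attn_mask_startend_row_indices
    return attention_mask, attn_mask_startend_row_indices, position_ids

row_idx = paddle.full([2, 1, 16], 16, dtype="int32")  # assumed FlashMask index shape
mask, row, pos = route_pipe_args(row_idx, None, None)
assert mask is None and row is row_idx and pos is None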
@@ -174,7 +204,7 @@ def __init__(self, config):
         self.norm = Qwen2RMSNorm(config)
 
     def forward(self, args):
-        hidden_states, attention_mask, position_ids = parse_args(args)
+        hidden_states, attention_mask, attn_mask_startend_row_indices, position_ids = parse_args(args)
         return self.norm(hidden_states)
 
 
@@ -202,6 +232,31 @@ class Qwen2ForCausalLMPipe(PipelinePretrainedModel, PipelineLayer):
 
     # DONOT Add base_model_prefix !!!!
 
+    @classmethod
+    def _prepare_pipeline_inputs_func(cls, inputs):
+
+        first_stage_keys = ["input_ids", "attention_mask", "attn_mask_startend_row_indices", "position_ids"]
+        last_stage_keys = ["labels"]
+
+        def get_expected_keys(inputs, keys):
+            ret = tuple([inputs.pop(k) if k in inputs else None for k in keys])
+            if len(ret) == 1:
+                ret = ret[0]
+            return ret
+
+        if type(inputs) is dict or type(inputs) is OrderedDict:
+            return [
+                get_expected_keys(inputs, first_stage_keys),
+                get_expected_keys(inputs, last_stage_keys),
+            ]
+
+        keys = list(inputs[0].keys())
+        inputs_batch = {key: [data.pop(key) for data in inputs] for key in keys}
+        return [
+            get_expected_keys(inputs_batch, first_stage_keys),
+            get_expected_keys(inputs_batch, last_stage_keys),
+        ]
+
     def __init__(self, config: Qwen2Config):
         self.config = config
 
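
For reference, the new _prepare_pipeline_inputs_func regroups either a single feature dict or a list of per-micro-batch dicts into the tuple consumed by the first pipeline stage (which now includes attn_mask_startend_row_indices) and the labels consumed by the last stage. A usage sketch with made-up tensors follows; it assumes the classmethod is called directly, without constructing the model.

# Sketch (hypothetical data): splitting micro-batch dicts into pipeline stage inputs.
from paddlenlp.transformers.qwen2.modeling_pp import Qwen2ForCausalLMPipe
import paddle

micro_batches = [
    {
        "input_ids": paddle.to_tensor([[1, 2, 3, 4]]),
        "attn_mask_startend_row_indices": paddle.to_tensor([[[4, 4, 4, 4]]], dtype="int32"),
        "labels": paddle.to_tensor([[2, 3, 4, 5]]),
    },
]

first_stage, last_stage = Qwen2ForCausalLMPipe._prepare_pipeline_inputs_func(micro_batches)
# first_stage == ([input_ids], None, [row_indices], None)  # absent keys become None
# last_stage  == [labels]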