
Commit 60c0705

Merge branch 'develop' of https://github.com/PaddlePaddle/PaddleNLP into oldexe
2 parents: 30df727 + 98fbee3

File tree: 19 files changed, +462 -82 lines


llm/llama/README.md

Lines changed: 4 additions & 0 deletions
@@ -25,6 +25,10 @@
  | baichuan-inc/Baichuan-7B |
  | baichuan-inc/Baichuan-13B-Base |
  | baichuan-inc/Baichuan-13B-Chat |
+ | baichuan-inc/Baichuan2-7B-Base |
+ | baichuan-inc/Baichuan2-7B-Chat |
+ | baichuan-inc/Baichuan2-13B-Base |
+ | baichuan-inc/Baichuan2-13B-Chat |
  | FlagAlpha/Llama2-Chinese-7b-Chat |
  | FlagAlpha/Llama2-Chinese-13b-Chat |
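
For context, a minimal sketch of loading one of the newly listed Baichuan2 checkpoints with PaddleNLP's Auto classes (assumed usage; the exact dtype and device settings depend on your environment):

    from paddlenlp.transformers import AutoModelForCausalLM, AutoTokenizer

    name = "baichuan-inc/Baichuan2-7B-Base"
    tokenizer = AutoTokenizer.from_pretrained(name)
    model = AutoModelForCausalLM.from_pretrained(name, dtype="float16")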

llm/llama/run_pretrain.py

Lines changed: 4 additions & 9 deletions
@@ -178,14 +178,9 @@ class ModelArguments:
          default=False,
          metadata={"help": "whether to use fuse sequence parallel allreduce"},
      )
-     rope_fusion_level: Optional[str] = field(
-         default=None,
-         metadata={
-             "help": "The level of fusion of rope embedding. Can be chosen from:\n"
-             "(1) 'full': fuse sin cos compute and rope embedding\n"
-             "(2) 'core': only fuse rope embedding, will compute the sin and cos\n"
-             "(3) None: don't fuse any part of the rope embedding"
-         },
+     use_fused_rope: Optional[bool] = field(
+         default=False,
+         metadata={"help": "Enable rope fusion or not."},
      )
      no_recompute_layers: Optional[List[int]] = field(
          default=None,
@@ -443,7 +438,7 @@ def main():
      config.virtual_pp_degree = model_args.virtual_pp_degree
      config.sequence_parallel = model_args.sequence_parallel
      config.fuse_sequence_parallel_allreduce = model_args.fuse_sequence_parallel_allreduce
-     config.rope_fusion_level = model_args.rope_fusion_level
+     config.use_fused_rope = model_args.use_fused_rope
      config.no_recompute_layers = model_args.no_recompute_layers
      config.pp_recompute_interval = model_args.pp_recompute_interval
      config.recompute_use_reentrant = model_args.recompute_use_reentrant
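
A minimal sketch of the new argument, assuming the surrounding ModelArguments dataclass from run_pretrain.py; the field and metadata are taken from the diff, the rest is illustrative:

    from dataclasses import dataclass, field
    from typing import Optional

    @dataclass
    class ModelArguments:
        use_fused_rope: Optional[bool] = field(
            default=False,
            metadata={"help": "Enable rope fusion or not."},
        )

    model_args = ModelArguments(use_fused_rope=True)
    # run_pretrain.py then copies the flag onto the model config:
    # config.use_fused_rope = model_args.use_fused_rope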

paddlenlp/experimental/transformers/chatglm/modeling.py

Lines changed: 1 addition & 1 deletion
@@ -332,7 +332,7 @@ def set_state_dict(self, state_dict, use_structured_name=True):
              continue
          elif k.startswith("lm_head.weight"):
              continue
-         elif k.endswith("rotary_emb.inv_freq"):
+         elif k.endswith("rotary_embeddings.inv_freq") or k.endswith("rotary_emb.inv_freq"):
              continue
          idx = int(k.split(".")[2])
          if k.endswith("input_layernorm.weight"):

paddlenlp/experimental/transformers/generation_utils.py

Lines changed: 11 additions & 18 deletions
@@ -370,24 +370,17 @@ def _post_process_(outputs, top_p, temperature, step_idx_ori, model_kwargs):

              return next_tokens, model_kwargs

-         if paddle.max(model_kwargs["seq_len_encoder"]) > 0:
-             # encoder
-             outputs = _forward_(**model_kwargs)
-             # first decoder
-             next_tokens, model_kwargs = _post_process_(
-                 outputs,
-                 top_p,
-                 temperature,
-                 step_idx_ori,
-                 model_kwargs,
-             )
-             step_idx_ori += 1
-         else:
-             outputs = None
-             # first decoder
-             next_tokens = None
-             model_kwargs["next_tokens"] = next_tokens
-             step_idx_ori += 0
+         # encoder
+         outputs = _forward_(**model_kwargs)
+         # first decoder
+         next_tokens, model_kwargs = _post_process_(
+             outputs,
+             top_p,
+             temperature,
+             step_idx_ori,
+             model_kwargs,
+         )
+         step_idx_ori += 1

          # gives it a value, means we will entered into decoder phase.
          model_kwargs["cache"] = 0

paddlenlp/generation/utils.py

Lines changed: 6 additions & 9 deletions
@@ -24,15 +24,9 @@
  from paddle.common_ops_import import convert_dtype
  from paddle.utils import map_structure

- try:
-     from paddle import top_p_sampling
-
-     is_top_p_sampling_avaliable = True
- except:
-     is_top_p_sampling_avaliable = False
-
  from paddlenlp.transformers.model_outputs import ModelOutput
  from paddlenlp.transformers.utils import get_scale_by_dtype
+ from paddlenlp.utils.import_utils import is_paddlenlp_ops_available
  from paddlenlp.utils.log import logger

  from .configuration_utils import DEFAULT_MAX_NEW_TOKENS, GenerationConfig
@@ -55,6 +49,9 @@
  )
  from .streamers import BaseStreamer

+ if is_paddlenlp_ops_available():
+     import paddlenlp_ops
+
  __all__ = [
      "GenerationMixin",
      "BeamSearchScorer",
@@ -1333,9 +1330,9 @@ def _post_process_(outputs, input_ids, cur_len, origin_len, scores, unfinished_f
          # compute next_tokens
          if use_top_p:
              logits = logits / temperature
-             if is_top_p_sampling_avaliable:
+             if is_paddlenlp_ops_available():
                  top_ps_tensor = paddle.full(shape=[paddle.shape(probs)[0], 1], fill_value=top_p, dtype=probs.dtype)
-                 _, next_tokens = top_p_sampling(probs, top_ps_tensor)
+                 _, next_tokens = paddlenlp_ops.top_p_sampling(probs, top_ps_tensor, -1)
              else:
                  probs = TopPProcess(probs, top_p, min_tokens_to_keep)
                  next_tokens = paddle.multinomial(probs)
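
A sketch of the new dispatch, assuming probs is a [batch_size, vocab_size] tensor of normalized probabilities; only the guarded paddlenlp_ops call is taken from the diff, the wrapper function is illustrative:

    import paddle
    from paddlenlp.utils.import_utils import is_paddlenlp_ops_available

    if is_paddlenlp_ops_available():
        import paddlenlp_ops

    def sample_top_p(probs, top_p):
        if is_paddlenlp_ops_available():
            # fast path: custom kernel shipped with paddlenlp_ops
            top_ps_tensor = paddle.full(
                shape=[paddle.shape(probs)[0], 1], fill_value=top_p, dtype=probs.dtype
            )
            _, next_tokens = paddlenlp_ops.top_p_sampling(probs, top_ps_tensor, -1)
        else:
            # fallback: the real code first truncates probs with TopPProcess,
            # then samples; shown here without the truncation step
            next_tokens = paddle.multinomial(probs)
        return next_tokens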

paddlenlp/transformers/bloom/modeling.py

Lines changed: 5 additions & 6 deletions
@@ -795,7 +795,7 @@ def get_input_embeddings(self):
          return self.word_embeddings

      def _prepare_attn_mask(
-         self, attention_mask: Tensor, input_shape: Tuple[int, int], past_key_values_length: int, num_heads: int, dtype
+         self, attention_mask: Tensor, input_shape: Tuple[int, int], past_key_values_length: int, num_heads: int
      ) -> Tensor:
          # create causal mask
          # [batch_size, seq_length] -> [batch_size, 1, tgt_length, src_length]
@@ -819,8 +819,9 @@ def _prepare_attn_mask(

          mask_shape = expanded_attn_mask.shape
          expanded_attn_mask = expanded_attn_mask.expand([mask_shape[0], num_heads, mask_shape[2], mask_shape[3]])
-         zero = paddle.zeros(expanded_attn_mask.shape, dtype=dtype)
-         neg_inf = paddle.full(expanded_attn_mask.shape, paddle.finfo(dtype).min, dtype=dtype)
+         # Attention score will be cast to float32 in the following calculation, therefore we set attention_mask dtype as float32
+         zero = paddle.zeros(expanded_attn_mask.shape, dtype=paddle.float32)
+         neg_inf = paddle.full(expanded_attn_mask.shape, paddle.finfo(paddle.float32).min, dtype=paddle.float32)
          expanded_attn_mask = paddle.where(expanded_attn_mask, zero, neg_inf)
          batch_size, num_heads, sq_len, kv_len = expanded_attn_mask.shape
          return expanded_attn_mask.reshape([batch_size * num_heads, sq_len, kv_len])
@@ -929,7 +930,6 @@ def forward(
              input_shape=(batch_size, seq_length),
              past_key_values_length=past_key_values_length,
              num_heads=block_size,
-             dtype=hidden_states.dtype,
          )
      else:
          alibi = alibi.reshape([batch_size * self.config.n_head, 1, seq_length_with_past])
@@ -938,7 +938,6 @@ def forward(
              input_shape=(batch_size, seq_length),
              past_key_values_length=past_key_values_length,
              num_heads=self.config.n_head,
-             dtype=hidden_states.dtype,
          )

      for i, (block, layer_past) in enumerate(zip(self.h, past_key_values)):
@@ -1088,7 +1087,7 @@ def __init__(self, config):
          self.lm_head = BloomLMHead(config, self.bloom.word_embeddings.weight)
          self.criterion = BloomPretrainingCriterion(
              tensor_parallel_degree=config.tensor_parallel_degree,
-             tensor_parallel_output=True,
+             tensor_parallel_output=config.tensor_parallel_output,
          )

      def get_output_embeddings(self):
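
The float32 mask change above can be illustrated in isolation; a minimal sketch (not the model's _prepare_attn_mask) of turning a boolean attention mask into an additive float32 mask:

    import paddle

    mask = paddle.to_tensor([[True, False], [True, True]])  # True = position may be attended
    zero = paddle.zeros(mask.shape, dtype=paddle.float32)
    neg_inf = paddle.full(mask.shape, paddle.finfo(paddle.float32).min, dtype=paddle.float32)
    additive_mask = paddle.where(mask, zero, neg_inf)  # 0.0 where allowed, large negative where masked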

paddlenlp/transformers/llama/configuration.py

Lines changed: 4 additions & 7 deletions
@@ -101,11 +101,8 @@ class LlamaConfig(PretrainedConfig):
              relevant if `config.is_decoder=True`.
          tie_word_embeddings(`bool`, *optional*, defaults to `False`):
              Whether to tie weight embeddings
-         rope_fusion_level(`str`, *optional*, defaults to ``):
-             The level of fusion of rope embedding. Can be chosen from:
-             (1) 'full': fuse sin cos compute and rope embedding
-             (2) 'core': only fuse rope embedding, will compute the sin and cos
-             (3) None: don't fuse any part of the rope embedding
+         use_fused_rope(`bool`, *optional*, defaults to False):
+             Enable rope fusion or not.
          num_key_value_heads (`int`, *optional*):
              This is the number of key_value heads that should be used to implement Grouped Query Attention. If
              `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
@@ -167,7 +164,7 @@ def __init__(
          eos_token_id=2,
          tie_word_embeddings=False,
          alibi=False,
-         rope_fusion_level=None,
+         use_fused_rope=False,
          rope_scaling_factor=1.0,
          rope_scaling_type=None,
          **kwargs,
@@ -205,7 +202,7 @@ def __init__(
          self.eos_token_id = eos_token_id
          self.alibi = alibi

-         self.rope_fusion_level = rope_fusion_level
+         self.use_fused_rope = use_fused_rope
          self.rope_scaling_factor = rope_scaling_factor
          self.rope_scaling_type = rope_scaling_type
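
A minimal usage sketch, assuming LlamaConfig is importable from paddlenlp.transformers; the boolean replaces the old string-valued rope_fusion_level option:

    from paddlenlp.transformers import LlamaConfig

    config = LlamaConfig(use_fused_rope=True)  # default is False
    assert config.use_fused_rope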

paddlenlp/transformers/llama/modeling.py

Lines changed: 12 additions & 10 deletions
@@ -570,14 +570,14 @@ def __init__(self, config: LlamaConfig, layerwise_recompute: bool = False):
              ]
          )

-         self.rope_fusion_level = config.rope_fusion_level
-         if self.rope_fusion_level is not None:
+         self.use_fused_rope = config.use_fused_rope
+         if self.use_fused_rope:
              if "gpu" not in paddle.device.get_device() or fused_rotary_position_embedding is None:
                  warnings.warn(
                      "Enable fuse rope in the config, but fuse rope is not available. "
                      "Will disable fuse rope. Try using latest gpu version of Paddle."
                  )
-                 self.rope_fusion_level = None
+                 self.use_fused_rope = False

          if config.sequence_parallel:
              ColumnParallelLinear = ColumnSequenceParallelLinear
@@ -664,7 +664,7 @@ def __init__(self, config: LlamaConfig, layerwise_recompute: bool = False):
              bias_attr=False,
          )

-         if config.rope and self.rope_fusion_level != "full":
+         if config.rope:
              self._init_rope()

          self.config = config
@@ -736,15 +736,17 @@ def forward(
              kv_seq_len += past_key_value[0].shape[-3]

          if self.config.rope:
-             if self.rope_fusion_level is not None:
+             if self.use_fused_rope:
                  assert past_key_value is None, "fuse rotary not support cache kv for now"
-
-                 if self.rope_fusion_level == "full":
-                     query_states, key_states, _ = fused_rotary_position_embedding(query_states, key_states, v=None)
-                 elif self.rope_fusion_level == "core":
                  cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
                  query_states, key_states, _ = fused_rotary_position_embedding(
-                     query_states, key_states, v=None, sin=sin, cos=cos
+                     query_states,
+                     key_states,
+                     v=None,
+                     sin=sin,
+                     cos=cos,
+                     position_ids=position_ids,
+                     use_neox_rotary_style=False,
                  )
              else:
                  cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
paddlenlp/transformers/tokenizer_utils.py

Lines changed: 22 additions & 0 deletions
@@ -1472,6 +1472,28 @@ def _decode(
          else:
              return text

+     def decode_token(
+         self,
+         all_input_ids: List[int],
+         prefix_offset: int = 0,
+         read_offset: int = 0,
+     ) -> Tuple[str, int, int]:
+         """tokenizer decoding for the streaming generation use case. This method can be overrided for tokenizer that doesn't follow this API"""
+         # The prefix text is necessary only to defeat cleanup algorithms in the decode
+         # which decide to add a space or not depending on the surrounding ids.
+         prefix_text = self.decode(all_input_ids[prefix_offset:read_offset], skip_special_tokens=False)
+         new_text = self.decode(all_input_ids[prefix_offset:], skip_special_tokens=False)
+
+         if len(new_text) > len(prefix_text) and not new_text.endswith("�"):
+             # utf-8 char at the end means it's a potential unfinished byte sequence
+             # from byte fallback tokenization.
+             # If it's in the middle, it's probably a real invalid id generated
+             # by the model
+             new_text = new_text[len(prefix_text) :]
+             return new_text, read_offset, len(all_input_ids)
+         else:
+             return "", prefix_offset, read_offset
+

  class BPETokenizer(PretrainedTokenizer):
      """
