
Commit 60c0705

Merge branch 'develop' of https://github.com/PaddlePaddle/PaddleNLP into oldexe
2 parents: 30df727 + 98fbee3

File tree: 19 files changed, +462 -82 lines


llm/llama/README.md

Lines changed: 4 additions & 0 deletions
@@ -25,6 +25,10 @@
  | baichuan-inc/Baichuan-7B |
  | baichuan-inc/Baichuan-13B-Base |
  | baichuan-inc/Baichuan-13B-Chat |
+ | baichuan-inc/Baichuan2-7B-Base |
+ | baichuan-inc/Baichuan2-7B-Chat |
+ | baichuan-inc/Baichuan2-13B-Base |
+ | baichuan-inc/Baichuan2-13B-Chat |
  | FlagAlpha/Llama2-Chinese-7b-Chat |
  | FlagAlpha/Llama2-Chinese-13b-Chat |
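
For context, a minimal sketch of loading one of the newly listed Baichuan2 checkpoints with PaddleNLP's Auto classes (assumed usage; the exact dtype and device settings depend on your environment):

    from paddlenlp.transformers import AutoModelForCausalLM, AutoTokenizer

    name = "baichuan-inc/Baichuan2-7B-Base"
    tokenizer = AutoTokenizer.from_pretrained(name)
    model = AutoModelForCausalLM.from_pretrained(name, dtype="float16")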

llm/llama/run_pretrain.py

Lines changed: 4 additions & 9 deletions
@@ -178,14 +178,9 @@ class ModelArguments:
          default=False,
          metadata={"help": "whether to use fuse sequence parallel allreduce"},
      )
-     rope_fusion_level: Optional[str] = field(
-         default=None,
-         metadata={
-             "help": "The level of fusion of rope embedding. Can be chosen from:\n"
-             "(1) 'full': fuse sin cos compute and rope embedding\n"
-             "(2) 'core': only fuse rope embedding, will compute the sin and cos\n"
-             "(3) None: don't fuse any part of the rope embedding"
-         },
+     use_fused_rope: Optional[bool] = field(
+         default=False,
+         metadata={"help": "Enable rope fusion or not."},
      )
      no_recompute_layers: Optional[List[int]] = field(
          default=None,
@@ -443,7 +438,7 @@ def main():
      config.virtual_pp_degree = model_args.virtual_pp_degree
      config.sequence_parallel = model_args.sequence_parallel
      config.fuse_sequence_parallel_allreduce = model_args.fuse_sequence_parallel_allreduce
-     config.rope_fusion_level = model_args.rope_fusion_level
+     config.use_fused_rope = model_args.use_fused_rope
      config.no_recompute_layers = model_args.no_recompute_layers
      config.pp_recompute_interval = model_args.pp_recompute_interval
      config.recompute_use_reentrant = model_args.recompute_use_reentrant
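
A minimal sketch of the new argument, assuming the surrounding ModelArguments dataclass from run_pretrain.py; the field and metadata are taken from the diff, the rest is illustrative:

    from dataclasses import dataclass, field
    from typing import Optional

    @dataclass
    class ModelArguments:
        use_fused_rope: Optional[bool] = field(
            default=False,
            metadata={"help": "Enable rope fusion or not."},
        )

    model_args = ModelArguments(use_fused_rope=True)
    # run_pretrain.py then copies the flag onto the model config:
    # config.use_fused_rope = model_args.use_fused_rope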

paddlenlp/experimental/transformers/chatglm/modeling.py

Lines changed: 1 addition & 1 deletion
@@ -332,7 +332,7 @@ def set_state_dict(self, state_dict, use_structured_name=True):
              continue
          elif k.startswith("lm_head.weight"):
              continue
-         elif k.endswith("rotary_emb.inv_freq"):
+         elif k.endswith("rotary_embeddings.inv_freq") or k.endswith("rotary_emb.inv_freq"):
              continue
          idx = int(k.split(".")[2])
          if k.endswith("input_layernorm.weight"):

paddlenlp/experimental/transformers/generation_utils.py

Lines changed: 11 additions & 18 deletions
@@ -370,24 +370,17 @@ def _post_process_(outputs, top_p, temperature, step_idx_ori, model_kwargs):

              return next_tokens, model_kwargs

-         if paddle.max(model_kwargs["seq_len_encoder"]) > 0:
-             # encoder
-             outputs = _forward_(**model_kwargs)
-             # first decoder
-             next_tokens, model_kwargs = _post_process_(
-                 outputs,
-                 top_p,
-                 temperature,
-                 step_idx_ori,
-                 model_kwargs,
-             )
-             step_idx_ori += 1
-         else:
-             outputs = None
-             # first decoder
-             next_tokens = None
-             model_kwargs["next_tokens"] = next_tokens
-             step_idx_ori += 0
+         # encoder
+         outputs = _forward_(**model_kwargs)
+         # first decoder
+         next_tokens, model_kwargs = _post_process_(
+             outputs,
+             top_p,
+             temperature,
+             step_idx_ori,
+             model_kwargs,
+         )
+         step_idx_ori += 1

          # gives it a value, means we will entered into decoder phase.
          model_kwargs["cache"] = 0

paddlenlp/generation/utils.py

Lines changed: 6 additions & 9 deletions
@@ -24,15 +24,9 @@
  from paddle.common_ops_import import convert_dtype
  from paddle.utils import map_structure

- try:
-     from paddle import top_p_sampling
-
-     is_top_p_sampling_avaliable = True
- except:
-     is_top_p_sampling_avaliable = False
-
  from paddlenlp.transformers.model_outputs import ModelOutput
  from paddlenlp.transformers.utils import get_scale_by_dtype
+ from paddlenlp.utils.import_utils import is_paddlenlp_ops_available
  from paddlenlp.utils.log import logger

  from .configuration_utils import DEFAULT_MAX_NEW_TOKENS, GenerationConfig
@@ -55,6 +49,9 @@
  )
  from .streamers import BaseStreamer

+ if is_paddlenlp_ops_available():
+     import paddlenlp_ops
+
  __all__ = [
      "GenerationMixin",
      "BeamSearchScorer",
@@ -1333,9 +1330,9 @@ def _post_process_(outputs, input_ids, cur_len, origin_len, scores, unfinished_f
          # compute next_tokens
          if use_top_p:
              logits = logits / temperature
-             if is_top_p_sampling_avaliable:
+             if is_paddlenlp_ops_available():
                  top_ps_tensor = paddle.full(shape=[paddle.shape(probs)[0], 1], fill_value=top_p, dtype=probs.dtype)
-                 _, next_tokens = top_p_sampling(probs, top_ps_tensor)
+                 _, next_tokens = paddlenlp_ops.top_p_sampling(probs, top_ps_tensor, -1)
              else:
                  probs = TopPProcess(probs, top_p, min_tokens_to_keep)
                  next_tokens = paddle.multinomial(probs)
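
A sketch of the new dispatch, assuming probs is a [batch_size, vocab_size] tensor of normalized probabilities; only the guarded paddlenlp_ops call is taken from the diff, the wrapper function is illustrative:

    import paddle
    from paddlenlp.utils.import_utils import is_paddlenlp_ops_available

    if is_paddlenlp_ops_available():
        import paddlenlp_ops

    def sample_top_p(probs, top_p):
        if is_paddlenlp_ops_available():
            # fast path: custom kernel shipped with paddlenlp_ops
            top_ps_tensor = paddle.full(
                shape=[paddle.shape(probs)[0], 1], fill_value=top_p, dtype=probs.dtype
            )
            _, next_tokens = paddlenlp_ops.top_p_sampling(probs, top_ps_tensor, -1)
        else:
            # fallback: the real code first truncates probs with TopPProcess,
            # then samples; shown here without the truncation step
            next_tokens = paddle.multinomial(probs)
        return next_tokens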

paddlenlp/transformers/bloom/modeling.py

Lines changed: 5 additions & 6 deletions
@@ -795,7 +795,7 @@ def get_input_embeddings(self):
          return self.word_embeddings

      def _prepare_attn_mask(
-         self, attention_mask: Tensor, input_shape: Tuple[int, int], past_key_values_length: int, num_heads: int, dtype
+         self, attention_mask: Tensor, input_shape: Tuple[int, int], past_key_values_length: int, num_heads: int
      ) -> Tensor:
          # create causal mask
          # [batch_size, seq_length] -> [batch_size, 1, tgt_length, src_length]
@@ -819,8 +819,9 @@ def _prepare_attn_mask(

          mask_shape = expanded_attn_mask.shape
          expanded_attn_mask = expanded_attn_mask.expand([mask_shape[0], num_heads, mask_shape[2], mask_shape[3]])
-         zero = paddle.zeros(expanded_attn_mask.shape, dtype=dtype)
-         neg_inf = paddle.full(expanded_attn_mask.shape, paddle.finfo(dtype).min, dtype=dtype)
+         # Attention score will be cast to float32 in the following calculation, therefore we set attention_mask dtype as float32
+         zero = paddle.zeros(expanded_attn_mask.shape, dtype=paddle.float32)
+         neg_inf = paddle.full(expanded_attn_mask.shape, paddle.finfo(paddle.float32).min, dtype=paddle.float32)
          expanded_attn_mask = paddle.where(expanded_attn_mask, zero, neg_inf)
          batch_size, num_heads, sq_len, kv_len = expanded_attn_mask.shape
          return expanded_attn_mask.reshape([batch_size * num_heads, sq_len, kv_len])
@@ -929,7 +930,6 @@ def forward(
              input_shape=(batch_size, seq_length),
              past_key_values_length=past_key_values_length,
              num_heads=block_size,
-             dtype=hidden_states.dtype,
          )
      else:
          alibi = alibi.reshape([batch_size * self.config.n_head, 1, seq_length_with_past])
@@ -938,7 +938,6 @@ def forward(
              input_shape=(batch_size, seq_length),
              past_key_values_length=past_key_values_length,
              num_heads=self.config.n_head,
-             dtype=hidden_states.dtype,
          )

      for i, (block, layer_past) in enumerate(zip(self.h, past_key_values)):
@@ -1088,7 +1087,7 @@ def __init__(self, config):
          self.lm_head = BloomLMHead(config, self.bloom.word_embeddings.weight)
          self.criterion = BloomPretrainingCriterion(
              tensor_parallel_degree=config.tensor_parallel_degree,
-             tensor_parallel_output=True,
+             tensor_parallel_output=config.tensor_parallel_output,
          )

      def get_output_embeddings(self):
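
The float32 mask change above can be illustrated in isolation; a minimal sketch (not the model's _prepare_attn_mask) of turning a boolean attention mask into an additive float32 mask:

    import paddle

    mask = paddle.to_tensor([[True, False], [True, True]])  # True = position may be attended
    zero = paddle.zeros(mask.shape, dtype=paddle.float32)
    neg_inf = paddle.full(mask.shape, paddle.finfo(paddle.float32).min, dtype=paddle.float32)
    additive_mask = paddle.where(mask, zero, neg_inf)  # 0.0 where allowed, large negative where masked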

paddlenlp/transformers/llama/configuration.py

Lines changed: 4 additions & 7 deletions
@@ -101,11 +101,8 @@ class LlamaConfig(PretrainedConfig):
              relevant if `config.is_decoder=True`.
          tie_word_embeddings(`bool`, *optional*, defaults to `False`):
              Whether to tie weight embeddings
-         rope_fusion_level(`str`, *optional*, defaults to ``):
-             The level of fusion of rope embedding. Can be chosen from:
-             (1) 'full': fuse sin cos compute and rope embedding
-             (2) 'core': only fuse rope embedding, will compute the sin and cos
-             (3) None: don't fuse any part of the rope embedding
+         use_fused_rope(`bool`, *optional*, defaults to False):
+             Enable rope fusion or not.
          num_key_value_heads (`int`, *optional*):
              This is the number of key_value heads that should be used to implement Grouped Query Attention. If
              `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
@@ -167,7 +164,7 @@ def __init__(
          eos_token_id=2,
          tie_word_embeddings=False,
          alibi=False,
-         rope_fusion_level=None,
+         use_fused_rope=False,
          rope_scaling_factor=1.0,
          rope_scaling_type=None,
          **kwargs,
@@ -205,7 +202,7 @@ def __init__(
          self.eos_token_id = eos_token_id
          self.alibi = alibi

-         self.rope_fusion_level = rope_fusion_level
+         self.use_fused_rope = use_fused_rope
          self.rope_scaling_factor = rope_scaling_factor
          self.rope_scaling_type = rope_scaling_type
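
A minimal usage sketch, assuming LlamaConfig is importable from paddlenlp.transformers; the boolean replaces the old string-valued rope_fusion_level option:

    from paddlenlp.transformers import LlamaConfig

    config = LlamaConfig(use_fused_rope=True)  # default is False
    assert config.use_fused_rope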

paddlenlp/transformers/llama/modeling.py

Lines changed: 12 additions & 10 deletions
@@ -570,14 +570,14 @@ def __init__(self, config: LlamaConfig, layerwise_recompute: bool = False):
              ]
          )

-         self.rope_fusion_level = config.rope_fusion_level
-         if self.rope_fusion_level is not None:
+         self.use_fused_rope = config.use_fused_rope
+         if self.use_fused_rope:
              if "gpu" not in paddle.device.get_device() or fused_rotary_position_embedding is None:
                  warnings.warn(
                      "Enable fuse rope in the config, but fuse rope is not available. "
                      "Will disable fuse rope. Try using latest gpu version of Paddle."
                  )
-                 self.rope_fusion_level = None
+                 self.use_fused_rope = False

          if config.sequence_parallel:
              ColumnParallelLinear = ColumnSequenceParallelLinear
@@ -664,7 +664,7 @@ def __init__(self, config: LlamaConfig, layerwise_recompute: bool = False):
              bias_attr=False,
          )

-         if config.rope and self.rope_fusion_level != "full":
+         if config.rope:
              self._init_rope()

          self.config = config
@@ -736,15 +736,17 @@ def forward(
              kv_seq_len += past_key_value[0].shape[-3]

          if self.config.rope:
-             if self.rope_fusion_level is not None:
+             if self.use_fused_rope:
                  assert past_key_value is None, "fuse rotary not support cache kv for now"
-
-                 if self.rope_fusion_level == "full":
-                     query_states, key_states, _ = fused_rotary_position_embedding(query_states, key_states, v=None)
-                 elif self.rope_fusion_level == "core":
                  cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
                  query_states, key_states, _ = fused_rotary_position_embedding(
-                     query_states, key_states, v=None, sin=sin, cos=cos
+                     query_states,
+                     key_states,
+                     v=None,
+                     sin=sin,
+                     cos=cos,
+                     position_ids=position_ids,
+                     use_neox_rotary_style=False,
                  )
              else:
                  cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
paddlenlp/transformers/tokenizer_utils.py

Lines changed: 22 additions & 0 deletions
@@ -1472,6 +1472,28 @@ def _decode(
          else:
              return text

+     def decode_token(
+         self,
+         all_input_ids: List[int],
+         prefix_offset: int = 0,
+         read_offset: int = 0,
+     ) -> Tuple[str, int, int]:
+         """tokenizer decoding for the streaming generation use case. This method can be overrided for tokenizer that doesn't follow this API"""
+         # The prefix text is necessary only to defeat cleanup algorithms in the decode
+         # which decide to add a space or not depending on the surrounding ids.
+         prefix_text = self.decode(all_input_ids[prefix_offset:read_offset], skip_special_tokens=False)
+         new_text = self.decode(all_input_ids[prefix_offset:], skip_special_tokens=False)
+
+         if len(new_text) > len(prefix_text) and not new_text.endswith("�"):
+             # utf-8 char at the end means it's a potential unfinished byte sequence
+             # from byte fallback tokenization.
+             # If it's in the middle, it's probably a real invalid id generated
+             # by the model
+             new_text = new_text[len(prefix_text) :]
+             return new_text, read_offset, len(all_input_ids)
+         else:
+             return "", prefix_offset, read_offset
+

  class BPETokenizer(PretrainedTokenizer):
      """
