
Commit 7903bcc

Support chatglm fine grained dybatch v1. (#6798)
* support_chatglm_fine
* unify_fused_layer
* fix_utils
* delete_model_type
* delete_set_state_dict_external
* fix_useless_log
* add_update_kwags_self

Co-authored-by: zhengzekang <>
1 parent 71354f8 commit 7903bcc

File tree

8 files changed: +906 -104 lines changed


llm/README.md

Lines changed: 1 addition & 1 deletion
@@ -244,7 +244,7 @@ python predictor.py \
 python export_model.py \
     --model_name_or_path meta-llama/Llama-2-7b-chat \
     --output_path ./inference \
-    --dtype float16 \
+    --dtype float16


 # Static graph model inference
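The only change here is dropping the trailing backslash after `--dtype float16`, which is now the final argument of the `export_model.py` command; leaving a line continuation on the last line invites the following blank line or comment to be parsed as part of the command when the snippet is copied.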

llm/predictor.py

Lines changed: 120 additions & 44 deletions
@@ -271,20 +271,32 @@ def __init__(
         else:
             raise ValueError("Please specific the model dtype.")

+        self.model_config = AutoConfig.from_pretrained(config.model_name_or_path)
         self.dtype = dtype
-
+        self.architectures = self.model_config.architectures[0].lower()
         self.cache_kvs = [paddle.zeros(shape, dtype=dtype) for shape in cache_kv_shapes]
         self.pre_ids = paddle.full([config.batch_size, config.max_length + 1], -1, dtype="int64")
-        self.attention_mask = paddle.zeros(
-            shape=(config.batch_size, 1, config.max_length, config.max_length),
-            dtype=dtype,
-        )
+
+        if "chatglm" in self.architectures:
+            self.attention_mask = paddle.ones(
+                shape=(config.batch_size, 1, config.max_length, config.max_length),
+                dtype=dtype,
+            )
+            self.tgt_pos = paddle.ones(
+                shape=[config.batch_size, 2, 1],
+                dtype="int64",
+            )
+        else:
+            self.attention_mask = paddle.zeros(
+                shape=(config.batch_size, 1, config.max_length, config.max_length),
+                dtype=dtype,
+            )
+
         self.tgt_generation_mask = paddle.zeros(
             shape=[config.batch_size, 1, 1, config.max_length + 1],
             dtype=dtype,
         )
         self.predictor = self._create_predictor(config)
-        self.model_config = AutoConfig.from_pretrained(config.model_name_or_path)

     def _create_predictor(self, predictor_args: PredictorArgument):
         if not is_paddlenlp_ops_available():
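The constructor now prepares two different mask layouts: for ChatGLM the buffer starts from `paddle.ones` and an extra `tgt_pos` tensor is allocated, while other models keep the zero-initialised causal buffer. Below is a minimal, self-contained sketch of the per-sequence fill that `_preprocess` later applies to these buffers. The sizes are illustrative, `float32` is used only so the toy runs on CPU, and the 0/1 convention is whatever the fused inference kernels expect, so the comments simply restate the assignments from the diff:

```python
import paddle

max_length, length = 6, 4  # illustrative sizes only

# ChatGLM-style buffer: created with paddle.ones, then the prompt window is rewritten.
chatglm_mask = paddle.ones([1, 1, max_length, max_length], dtype="float32")
chatglm_mask[0, 0, :length, :length] = 0          # prompt block zeroed
chatglm_mask[0, 0, : length - 1, length - 1] = 1  # last prompt column set back to 1 for earlier rows

tgt_pos = paddle.ones([1, 2, 1], dtype="int64")
tgt_pos[0, 0, 0] = length                         # position index fed to the first decoded token

# Default (causal) buffer: created with paddle.zeros, then a lower-triangular prompt block is filled.
causal_mask = paddle.zeros([1, 1, max_length, max_length], dtype="float32")
causal_mask[0, 0, :length, :length] = paddle.tril(paddle.ones([length, length], dtype="float32"))

print(chatglm_mask[0, 0].numpy())
print(causal_mask[0, 0].numpy())
```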
@@ -327,16 +339,30 @@ def _create_predictor(self, predictor_args: PredictorArgument):
         return predictor

     def _preprocess(self, source):
-        inputs = dybatch_preprocess(self.tokenizer, source, self.config.max_length)
-        for i in range(inputs["input_ids"].shape[0]):
-            length = inputs["seq_len_encoder"][i][0]
-            self.attention_mask[i, 0, :length, :length] = paddle.tril(
-                paddle.ones(shape=(length, length), dtype="float16")
-            )
-            self.tgt_generation_mask[i, 0, 0, :length] = paddle.ones(shape=[1, length], dtype="float16")
+        if "chatglm" in self.architectures:
+            inputs = dybatch_preprocess(self.tokenizer, source, self.config.max_length, self.architectures)
+
+            for i in range(inputs["input_ids"].shape[0]):
+                length = inputs["seq_len_encoder"][i][0]
+                self.attention_mask[i, 0, :length, :length] = 0
+                self.attention_mask[i, 0, : length - 1, length - 1] = 1
+                self.tgt_generation_mask[i, 0, 0, :length] = paddle.ones(shape=[1, length], dtype="float16")
+                self.tgt_pos[i, 0, 0] = paddle.to_tensor([length], dtype="int64")
+
+            inputs["attention_mask"] = self.attention_mask
+            inputs["tgt_generation_mask"] = self.tgt_generation_mask
+            inputs["tgt_pos"] = self.tgt_pos.numpy()
+        else:
+            inputs = dybatch_preprocess(self.tokenizer, source, self.config.max_length, self.architectures)
+            for i in range(inputs["input_ids"].shape[0]):
+                length = inputs["seq_len_encoder"][i][0]
+                self.attention_mask[i, 0, :length, :length] = paddle.tril(
+                    paddle.ones(shape=(length, length), dtype="float16")
+                )
+                self.tgt_generation_mask[i, 0, 0, :length] = paddle.ones(shape=[1, length], dtype="float16")

-        inputs["attention_mask"] = self.attention_mask
-        inputs["tgt_generation_mask"] = self.tgt_generation_mask
+            inputs["attention_mask"] = self.attention_mask
+            inputs["tgt_generation_mask"] = self.tgt_generation_mask
         return inputs

     @paddle.no_grad()
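Both branches of `_preprocess` now forward `self.architectures` to `dybatch_preprocess`, so tokenisation and position ids are built per model family; the ChatGLM branch additionally exports `tgt_pos`, converted with `.numpy()`, which fits how this predictor hands plain arrays to the compiled inference program.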
@@ -387,33 +413,61 @@ def __init__(
             raise ValueError("Please specific the model dtype.")

         self.dtype = dtype
+        self.architectures = self.model.config.architectures[0].lower()

         self.cache_kvs = [
             paddle.zeros(shape, dtype=dtype)
             for shape in self.model.get_cache_kvs_shape(self.model.config, config.max_batch_size)
         ]
         self.pre_ids = paddle.full([config.max_batch_size, config.max_length], -1, dtype="int64")
-        self.attention_mask = paddle.zeros(
-            shape=(config.max_batch_size, 1, config.max_length, config.max_length),
-            dtype=dtype,
-        )
+        if "chatglm" in self.architectures:
+            self.attention_mask = paddle.ones(
+                shape=(config.batch_size, 1, config.max_length, config.max_length),
+                dtype=dtype,
+            )
+            self.tgt_pos = paddle.ones(
+                shape=[config.batch_size, 2, 1],
+                dtype="int64",
+            )
+        else:
+            self.attention_mask = paddle.zeros(
+                shape=(config.batch_size, 1, config.max_length, config.max_length),
+                dtype=dtype,
+            )
+
         self.tgt_generation_mask = paddle.zeros(
             shape=[config.max_batch_size, 1, 1, config.max_length],
             dtype=dtype,
         )

     def _preprocess(self, source):
-        inputs = dybatch_preprocess(self.tokenizer, source, self.config.max_length)
-        for i in range(inputs["input_ids"].shape[0]):
-            length = inputs["seq_len_encoder"][i][0]
-            self.attention_mask[i, 0, :length, :length] = paddle.tril(
-                paddle.ones(shape=(length, length), dtype="float16")
-            )
+        if "chatglm" in self.architectures:
+            inputs = dybatch_preprocess(self.tokenizer, source, self.config.max_length, self.architectures)
+
+            for i in range(inputs["input_ids"].shape[0]):
+                length = inputs["seq_len_encoder"][i][0]
+                self.attention_mask[i, 0, :length, :length] = 0
+                self.attention_mask[i, 0, : length - 1, length - 1] = 1
+                self.tgt_generation_mask[i, 0, 0, :length] = paddle.ones(shape=[1, length], dtype="float16")
+                self.tgt_pos[i, 0, 0] = paddle.to_tensor([length], dtype="int64")

             inputs["attention_mask"] = self.attention_mask
-            self.tgt_generation_mask[i, 0, 0, :length] = paddle.ones(shape=[1, length], dtype="float16")
             inputs["tgt_generation_mask"] = self.tgt_generation_mask
-        inputs["cache_kvs"] = self.cache_kvs
-        inputs["pre_ids"] = self.pre_ids
+            inputs["cache_kvs"] = self.cache_kvs
+            inputs["pre_ids"] = self.pre_ids
+            inputs["tgt_pos"] = self.tgt_pos
+        else:
+            inputs = dybatch_preprocess(self.tokenizer, source, self.config.max_length, self.architectures)
+            for i in range(inputs["input_ids"].shape[0]):
+                length = inputs["seq_len_encoder"][i][0]
+                self.attention_mask[i, 0, :length, :length] = paddle.tril(
+                    paddle.ones(shape=(length, length), dtype="float16")
+                )
+                inputs["attention_mask"] = self.attention_mask
+                self.tgt_generation_mask[i, 0, 0, :length] = paddle.ones(shape=[1, length], dtype="float16")
+                inputs["tgt_generation_mask"] = self.tgt_generation_mask
+            inputs["cache_kvs"] = self.cache_kvs
+            inputs["pre_ids"] = self.pre_ids

         inputs_tensor = {}
         for key, value in inputs.items():
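The dygraph predictor applies the same split in its `_preprocess`, but keeps `cache_kvs`, `pre_ids` and `tgt_pos` as live Paddle tensors in the inputs dict instead of converting them, since here the model is invoked eagerly rather than through a compiled program.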
@@ -497,29 +551,51 @@ def create_predictor(
     else:
         if predictor_args.mode == "dynamic":
             # TODO(wj-Mcat): complete AutoInferenceModel & AutoPredictor
-            assert (
-                "llama" in predictor_args.model_name_or_path
-            ), "only support llama inference model in dygraph-inference predictor"
-            from paddlenlp.experimental.transformers import (
-                LlamaForCausalLMInferenceModel,
-            )
-
             config = AutoConfig.from_pretrained(predictor_args.model_name_or_path)
+            if "llama" in config.architectures[0].lower():
+                from paddlenlp.experimental.transformers import (
+                    LlamaForCausalLMInferenceModel,
+                )
+
+                config.tensor_parallel_degree = tensor_parallel_degree
+                config.tensor_parallel_rank = tensor_parallel_rank
+                model = LlamaForCausalLMInferenceModel.from_pretrained(
+                    predictor_args.model_name_or_path, config=config, dtype=predictor_args.dtype
+                )
+                model.eval()
+            elif "chatglm" in config.architectures[0].lower():
+                from paddlenlp.experimental.transformers import (
+                    ChatGLMForCausalLMInferenceModel,
+                )
+
+                config.tensor_parallel_degree = tensor_parallel_degree
+                config.tensor_parallel_rank = tensor_parallel_rank

-            config.tensor_parallel_degree = tensor_parallel_degree
-            config.tensor_parallel_rank = tensor_parallel_rank
-            model = LlamaForCausalLMInferenceModel.from_pretrained(predictor_args.model_name_or_path, config=config)
+                model = ChatGLMForCausalLMInferenceModel.from_pretrained(
+                    predictor_args.model_name_or_path,
+                    config=config,
+                    dtype=predictor_args.dtype,
+                )
+                model.eval()
             predictor = DygraphInferencePredictor(predictor_args, model=model, tokenizer=tokenizer)
         elif predictor_args.mode == "static":
             config = AutoConfig.from_pretrained(predictor_args.model_name_or_path)
+            if "llama" in config.architectures[0].lower():
+                from paddlenlp.experimental.transformers import (
+                    LlamaForCausalLMInferenceModel,
+                )

-            # only support llama inference model currently
-            from paddlenlp.experimental.transformers import (
-                LlamaForCausalLMInferenceModel,
-            )
+                cache_kvs_shape = LlamaForCausalLMInferenceModel.get_cache_kvs_shape(config, predictor_args.batch_size)
+                predictor = StaticInferencePredictor(predictor_args, cache_kvs_shape, tokenizer=tokenizer)
+            elif "chatglm" in config.architectures[0].lower():
+                from paddlenlp.experimental.transformers import (
+                    ChatGLMForCausalLMInferenceModel,
+                )

-            cache_kvs_shape = LlamaForCausalLMInferenceModel.get_cache_kvs_shape(config, predictor_args.batch_size)
-            predictor = StaticInferencePredictor(predictor_args, cache_kvs_shape, tokenizer=tokenizer)
+                cache_kvs_shape = ChatGLMForCausalLMInferenceModel.get_cache_kvs_shape(
+                    config, predictor_args.batch_size
+                )
+                predictor = StaticInferencePredictor(predictor_args, cache_kvs_shape, tokenizer=tokenizer)
         else:
             raise ValueError("the `mode` should be one of [dynamic, static]")
     return predictor
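Both modes now pick the inference-model class from `config.architectures[0]` instead of asserting that "llama" appears in the model path. A compact, table-driven sketch of the same dispatch follows; the registry dict and helper function are illustrative additions, not part of the patch:

```python
from paddlenlp.experimental.transformers import (
    ChatGLMForCausalLMInferenceModel,
    LlamaForCausalLMInferenceModel,
)
from paddlenlp.transformers import AutoConfig

# Hypothetical registry keyed by a substring of the lower-cased architecture name.
_INFERENCE_MODEL_CLASSES = {
    "llama": LlamaForCausalLMInferenceModel,
    "chatglm": ChatGLMForCausalLMInferenceModel,
}


def resolve_inference_model_class(model_name_or_path: str):
    """Return (config, inference-model class) for a checkpoint based on its architectures field."""
    config = AutoConfig.from_pretrained(model_name_or_path)
    architecture = config.architectures[0].lower()
    for keyword, model_cls in _INFERENCE_MODEL_CLASSES.items():
        if keyword in architecture:
            return config, model_cls
    raise ValueError(f"No inference model registered for architecture: {architecture}")
```

With a registry like this, each branch of `create_predictor` would reduce to `model_cls.from_pretrained(..., config=config, dtype=predictor_args.dtype)` in dynamic mode or `model_cls.get_cache_kvs_shape(config, predictor_args.batch_size)` in static mode.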

llm/utils.py

Lines changed: 47 additions & 25 deletions
@@ -337,32 +337,54 @@ def pad_batch_data(insts, pad_id=0, return_seq_len=False, pad_style="right"):
     return inst_data.astype("int64").reshape([-1, max_len])


-def dybatch_preprocess(tokenizer, texts: list[str], max_length: int):
+def dybatch_preprocess(tokenizer, texts: list[str], max_length: int, architectures: str):
     """Pre-process generation inputs."""
-    input_ids = []
-    if isinstance(texts, str):
-        texts = [texts]
-
-    for text in texts:
-        tokens = tokenizer(
-            text,
-            return_tensors="np",
-            padding=False,
-            return_attention_mask=False,
-            return_token_type_ids=False,
-        )
-        input_ids.append(tokens["input_ids"][0])
-
-    inputs = {}
-    pad_token_id = tokenizer([tokenizer.pad_token], return_tensors="np")["input_ids"][0][-1]
-    inputs["input_ids"], seq_len = pad_batch_data(input_ids, pad_id=pad_token_id, return_seq_len=True)
-    bs = inputs["input_ids"].shape[0]
-    max_len = max(map(len, input_ids))
-
-    position_ids = paddle.zeros(shape=[bs, max_len], dtype="int64")
-    for i in range(bs):
-        position_ids[i, : seq_len[i]] = paddle.arange(seq_len[i])
-    inputs["position_ids"] = position_ids
+    if "chatglm" in architectures:
+        input_ids = []
+        position_ids = []
+
+        for text in texts:
+            tokens = tokenizer(text, return_tensors="np", padding=True)
+            input_ids.append(tokens["input_ids"][0])
+            position_ids.append(tokens["position_ids"][0])
+
+        inputs = {}
+        pad_token_id = tokenizer([tokenizer.pad_token], return_tensors="np")["input_ids"][0][0]
+
+        inputs["input_ids"], seq_len = pad_batch_data(input_ids, pad_id=pad_token_id, return_seq_len=True)
+        bs = inputs["input_ids"].shape[0]
+        max_len = max(map(len, input_ids))
+
+        inst_data_pos = []
+        for i in range(len(position_ids)):
+            inst_data_pos.append(np.array([list(inst) + [0] * (max_len - len(inst)) for inst in position_ids[i]]))
+        inputs["position_ids"] = paddle.to_tensor(np.array(inst_data_pos))
+    else:
+        input_ids = []
+        position_ids = []
+        if isinstance(texts, str):
+            texts = [texts]
+
+        for text in texts:
+            tokens = tokenizer(
+                text,
+                return_tensors="np",
+                padding=False,
+                return_attention_mask=False,
+                return_token_type_ids=False,
+            )
+            input_ids.append(tokens["input_ids"][0])
+
+        inputs = {}
+        pad_token_id = tokenizer([tokenizer.pad_token], return_tensors="np")["input_ids"][0][-1]
+        inputs["input_ids"], seq_len = pad_batch_data(input_ids, pad_id=pad_token_id, return_seq_len=True)
+        bs = inputs["input_ids"].shape[0]
+        max_len = max(map(len, input_ids))
+
+        position_ids = paddle.zeros(shape=[bs, max_len], dtype="int64")
+        for i in range(bs):
+            position_ids[i, : seq_len[i]] = paddle.arange(seq_len[i])
+        inputs["position_ids"] = position_ids

     tgt_ids = [input[-1:] for input in input_ids]
     tgt_pos = []
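For ChatGLM, the tokenizer returns a 2-D `position_ids` array per sequence (a position row and a block-position row), so `dybatch_preprocess` pads each row to the batch maximum with zeros rather than rebuilding a 1-D `arange`. A small self-contained sketch of just that padding step, with hand-written toy position ids standing in for tokenizer output:

```python
import numpy as np

# Toy 2-D position ids for two sequences of lengths 3 and 5;
# row 0 ~ token positions, row 1 ~ block positions (ChatGLM-style layout).
position_ids = [
    np.array([[0, 1, 2], [0, 0, 1]]),
    np.array([[0, 1, 2, 3, 4], [0, 0, 0, 0, 1]]),
]
max_len = max(p.shape[1] for p in position_ids)

inst_data_pos = []
for pos in position_ids:
    # Right-pad every row with zeros up to the batch-wide maximum length.
    inst_data_pos.append(np.array([list(row) + [0] * (max_len - len(row)) for row in pos]))

batched = np.array(inst_data_pos)
print(batched.shape)  # (2, 2, 5): [batch_size, 2, max_len]
```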

paddlenlp/experimental/transformers/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -12,5 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+from .chatglm import *
 from .fused_transformer_layers import *
 from .llama import *
paddlenlp/experimental/transformers/chatglm/__init__.py (new file; the path is not shown in this view and is inferred from the `from .chatglm import *` re-export above)

Lines changed: 15 additions & 0 deletions

@@ -0,0 +1,15 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .modeling import *
