
Commit 0a5de12

[LLM Inference] Support qwen2 (#8893)
* stage 1
* update
* update
* support qwen2 bf16/wint8
* add qwen2 ptq map
* update
* fix tune_cublaslt_gemm.cu
1 parent 6f3e736 commit 0a5de12

File tree

8 files changed: +1008 −3 lines changed

csrc/generation/tune_cublaslt_gemm.cu

Lines changed: 3 additions & 3 deletions
@@ -327,7 +327,7 @@ void FindAlgo(const cublasLtHandle_t& ltHandle,
                                                        sizeof(customOption)));
         CUDA_CHECK(cublasLtMatmulAlgoConfigSetAttribute(
             &algo, CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING, &k, sizeof(k)));
-        int splitK_val = 0;
+        int splitK_val = 1;
         uint32_t redScheme = CUBLASLT_REDUCTION_SCHEME_NONE;
         CUDA_CHECK(cublasLtMatmulAlgoConfigSetAttribute(
             &algo,
@@ -346,10 +346,10 @@ void FindAlgo(const cublasLtHandle_t& ltHandle,
                 CUBLASLT_ALGO_CONFIG_SPLITK_NUM,
                 &splitKSequenceA[l - 1],
                 sizeof(splitKSequenceA[l - 1])));
-            for (redScheme = 0;
+            for (redScheme = 1;
                  redScheme < (int)CUBLASLT_REDUCTION_SCHEME_MASK &&
                      (AlgoCount < AlgoCombinations);
-                 redScheme++) {
+                 redScheme <<= 1) {
               CUDA_CHECK(cublasLtMatmulAlgoConfigSetAttribute(
                   &algo,
                   CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME,
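
A note on what this fix does: the cublasLtReductionScheme_t options are power-of-two bit flags, so stepping with redScheme <<= 1 from 1 enumerates each individual scheme exactly once, while the old redScheme++ from 0 also produced composite values (3, 5, 6) that name no single scheme. The splitK_val default likewise moves from 0 to 1, presumably because 1 is cuBLASLt's "no split-K" baseline. A minimal Python sketch of the enumeration difference; the flag values are assumed from cuBLASLt's documented bit-flag layout:

# Assumed values mirroring cublasLtReductionScheme_t (bit flags).
INPLACE = 0x1       # CUBLASLT_REDUCTION_SCHEME_INPLACE
COMPUTE_TYPE = 0x2  # CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE
OUTPUT_TYPE = 0x4   # CUBLASLT_REDUCTION_SCHEME_OUTPUT_TYPE
MASK = 0x7          # CUBLASLT_REDUCTION_SCHEME_MASK

# Old loop: increments through every integer below the mask,
# including composites like 3 and 5 that are not single schemes.
old_schemes = list(range(MASK))  # [0, 1, 2, 3, 4, 5, 6]

# New loop: left-shifting visits each flag exactly once.
new_schemes = []
scheme = 1
while scheme < MASK:
    new_schemes.append(scheme)
    scheme <<= 1
print(new_schemes)  # [1, 2, 4]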

llm/predict/predictor.py

Lines changed: 51 additions & 0 deletions
@@ -1395,6 +1395,32 @@ def create_predictor(
                 dtype=predictor_args.dtype,
             )
             model.eval()
+        elif "qwen2" in config.architectures[0].lower():
+            if predictor_args.block_attn:
+                config.max_seq_len = predictor_args.total_max_length
+                config.block_size = predictor_args.block_size
+                from paddlenlp.experimental.transformers import (
+                    Qwen2ForCausalLMBlockInferenceModel as Qwen2InferenceModel,
+                )
+
+                model = Qwen2InferenceModel.from_pretrained(
+                    predictor_args.model_name_or_path,
+                    config=config,
+                    dtype=predictor_args.dtype,
+                    tensor_parallel_degree=tensor_parallel_degree,
+                    tensor_parallel_rank=tensor_parallel_rank,
+                )
+            else:
+                from paddlenlp.experimental.transformers import (
+                    Qwen2ForCausalLMInferenceModel as Qwen2InferenceModel,
+                )
+
+                model = Qwen2InferenceModel.from_pretrained(
+                    predictor_args.model_name_or_path,
+                    config=config,
+                    dtype=predictor_args.dtype,
+                )
+            model.eval()
         elif "qwen" in config.architectures[0].lower():
             if model_args.model_type == "qwen-img2txt":
                 # we use qwen for img2txt.
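
One ordering subtlety that the placement of this hunk handles: the architecture string for Qwen2 checkpoints is "Qwen2ForCausalLM", which also contains the substring "qwen", so the new "qwen2" branch must be tested before the pre-existing generic "qwen" branch or it would never be reached. A two-line illustration:

arch = "Qwen2ForCausalLM".lower()
assert "qwen2" in arch  # matched by the new, more specific branch
assert "qwen" in arch   # would also match the generic branch if tested first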
@@ -1420,6 +1446,16 @@ def create_predictor(
 
     elif predictor_args.mode == "static":
         config = AutoConfig.from_pretrained(predictor_args.model_name_or_path)
+        config.quant_type = predictor_args.quant_type
+        config.cachekv_int8_type = predictor_args.cachekv_int8_type
+
+        if config.quantization_config.quant_type is not None:
+            predictor_args.quant_type = config.quantization_config.quant_type
+            config.quant_type = config.quantization_config.quant_type
+            if "c8" in config.quant_type:
+                predictor_args.cachekv_int8_type = "static"
+                config.cachekv_int8_type = "static"
+
         if "llama" in config.architectures[0].lower():
             if predictor_args.block_attn:
                 config.block_size = predictor_args.block_size
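
The block above gives the checkpoint's quantization_config precedence over the command-line quant_type, and treats a quant type containing "c8" (cache-KV INT8) as also implying static cache-KV quantization. A standalone sketch of that precedence rule; the helper name and the "a8w8c8" example value are illustrative, not part of the commit:

def resolve_quant_settings(cli_quant_type, ckpt_quant_type, cli_cachekv_int8_type=None):
    # The checkpoint's quantization_config wins over the CLI flag.
    quant_type = ckpt_quant_type if ckpt_quant_type is not None else cli_quant_type
    cachekv_int8_type = cli_cachekv_int8_type
    # A "c8" quant type implies statically calibrated INT8 cache-KV.
    if quant_type is not None and "c8" in quant_type:
        cachekv_int8_type = "static"
    return quant_type, cachekv_int8_type

print(resolve_quant_settings("weight_only_int8", None))      # ('weight_only_int8', None)
print(resolve_quant_settings("weight_only_int8", "a8w8c8"))  # ('a8w8c8', 'static')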
@@ -1486,6 +1522,21 @@ def create_predictor(
             cache_kvs_shape = GPTForCausalLMInferenceModel.get_cache_kvs_shape(
                 config, predictor_args.batch_size, predictor_args.total_max_length
             )
+        elif "qwen2" in config.architectures[0].lower():
+            if predictor_args.block_attn:
+                config.block_size = predictor_args.block_size
+                config.max_seq_len = predictor_args.total_max_length
+                from paddlenlp.experimental.transformers import (
+                    Qwen2ForCausalLMBlockInferenceModel as Qwen2InferenceModel,
+                )
+            else:
+                from paddlenlp.experimental.transformers import (
+                    Qwen2ForCausalLMInferenceModel as Qwen2InferenceModel,
+                )
+            cache_kvs_shape = Qwen2InferenceModel.get_cache_kvs_shape(
+                config, predictor_args.batch_size, predictor_args.total_max_length
+            )
+
         elif "qwen" in config.architectures[0].lower():
             from paddlenlp.experimental.transformers import (
                 QWenForCausalLMInferenceModel,
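
In static mode the model class is consulted only for its cache-KV layout; the weights themselves come from the exported program. A minimal usage sketch, assuming a locally available Qwen2 checkpoint (the model id and the block_size/max_seq_len values are illustrative):

from paddlenlp.transformers import AutoConfig
from paddlenlp.experimental.transformers import (
    Qwen2ForCausalLMBlockInferenceModel,
)

config = AutoConfig.from_pretrained("Qwen/Qwen2-7B-Instruct")
config.block_size = 64     # block-attention page size
config.max_seq_len = 8192  # the predictor's total_max_length
# Per-layer cache-KV buffer shapes for batch size 1, mirroring the call in
# the diff: get_cache_kvs_shape(config, batch_size, total_max_length).
cache_kvs_shape = Qwen2ForCausalLMBlockInferenceModel.get_cache_kvs_shape(config, 1, 8192)
print(cache_kvs_shape)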

llm/utils/utils.py

Lines changed: 1 addition & 0 deletions
@@ -34,6 +34,7 @@
     AutoTokenizer,
     ChatGLMv2Tokenizer,
     LlamaForCausalLMPipe,
+    PretrainedConfig,
     Qwen2ForCausalLMPipe,
 )
 from paddlenlp.transformers.tokenizer_utils import PretrainedTokenizer

paddlenlp/experimental/transformers/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -20,3 +20,4 @@
 from .llama import *
 from .opt import *
 from .qwen import *
+from .qwen2 import *
paddlenlp/experimental/transformers/qwen2/__init__.py

Lines changed: 15 additions & 0 deletions

@@ -0,0 +1,15 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .modeling import *
