Skip to content

Commit 2bd5a46

Browse files
authored
Merge branch 'PaddlePaddle:develop' into jaywan/fix_sync_memcpy
2 parents 5dddf0c + 4a7665d commit 2bd5a46

File tree

27 files changed

+959
-194
lines changed

27 files changed

+959
-194
lines changed
Lines changed: 136 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,136 @@
1+
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
#include "helper.h"
16+
17+
// Writes the ALiBi bias value tgt_pos * slope[head] into each (batch, head)
// row of the attention-mask tensor at the position given by seq_lens, and
// zeroes sequence_lengths for batches whose stop flag is set.
// Launch shape: one block per batch, one thread per attention head.
template <typename T>
__global__ void set_value_by_id(const int *seq_lens,
                                const bool *stop_flags,
                                const float *alibi_slopes,
                                const int64_t *tgt_pos,
                                T *output_data,
                                int *sequence_lengths,
                                int bs,
                                int length,
                                int num_head) {
  const int batch_idx = blockIdx.x;
  const int head_idx = threadIdx.x;
  if (batch_idx >= bs) {
    return;  // guard: grid may be launched with more blocks than batches
  }
  // Row for this (batch, head) pair in the [bs, num_head, length] layout.
  T *row = output_data + (batch_idx * num_head + head_idx) * length;
  const float pos = static_cast<float>(tgt_pos[batch_idx]);
  row[seq_lens[batch_idx]] = static_cast<T>(pos * alibi_slopes[head_idx]);
  if (stop_flags[batch_idx]) {
    sequence_lengths[batch_idx] = 0;
  }
}
38+
39+
template <paddle::DataType D>
40+
std::vector<paddle::Tensor> set_mask_value(const paddle::Tensor& input_data,
41+
const paddle::Tensor& stop_flags,
42+
const paddle::Tensor& seq_lens,
43+
const paddle::Tensor& alibi_slopes,
44+
const paddle::Tensor& tgt_pos
45+
) {
46+
typedef PDTraits<D> traits_;
47+
typedef typename traits_::DataType DataType_;
48+
typedef typename traits_::data_t data_t;
49+
50+
PD_CHECK(seq_lens.dtype() == paddle::DataType::INT32);
51+
PD_CHECK(stop_flags.dtype() == paddle::DataType::BOOL);
52+
auto cu_stream = input_data.stream();
53+
std::vector<int64_t> input_data_shape = input_data.shape();
54+
std::vector<int64_t> seq_lens_shape = seq_lens.shape();
55+
auto sequence_lengths = seq_lens.copy_to(seq_lens.place(), false);
56+
57+
int input_bs = input_data_shape[0];
58+
int length = input_data_shape[3];
59+
int seq_bs = seq_lens_shape[0];
60+
int num_head = alibi_slopes.shape()[0];
61+
62+
int grid_size = input_bs;
63+
int block_size = num_head;
64+
set_value_by_id<<<grid_size, block_size, 0, cu_stream>>>(seq_lens.data<int>(),
65+
stop_flags.data<bool>(),
66+
alibi_slopes.data<float>(),
67+
tgt_pos.data<int64_t>(),
68+
reinterpret_cast<DataType_*>(const_cast<data_t*>(input_data.data<data_t>())),
69+
sequence_lengths.data<int>(), seq_bs, length, num_head);
70+
return {sequence_lengths};
71+
}
72+
73+
// Entry point registered with PD_BUILD_OP: dispatches on the dtype of
// input_data to the typed implementation. Supports float32, float16 and
// bfloat16; any other dtype raises via PD_THROW.
std::vector<paddle::Tensor> SetMaskValue(const paddle::Tensor& input_data,
                                         const paddle::Tensor& stop_flags,
                                         const paddle::Tensor& seq_lens,
                                         const paddle::Tensor& alibi_slopes,
                                         const paddle::Tensor& tgt_pos) {
    const auto dtype = input_data.type();
    if (dtype == paddle::DataType::BFLOAT16) {
        return set_mask_value<paddle::DataType::BFLOAT16>(
            input_data, stop_flags, seq_lens, alibi_slopes, tgt_pos);
    }
    if (dtype == paddle::DataType::FLOAT16) {
        return set_mask_value<paddle::DataType::FLOAT16>(
            input_data, stop_flags, seq_lens, alibi_slopes, tgt_pos);
    }
    if (dtype == paddle::DataType::FLOAT32) {
        return set_mask_value<paddle::DataType::FLOAT32>(
            input_data, stop_flags, seq_lens, alibi_slopes, tgt_pos);
    }
    PD_THROW(
        "NOT supported data type. "
        "Only float16, bfloat16 and float32 are supported. ");
}
114+
115+
// Infer-shape function for set_alibi_mask_value: the single output
// (sequence_lengths) is a copy of seq_lens, so it has seq_lens' shape.
// Fix: the last parameter was named `tgt_pos` although it receives a shape;
// renamed to `tgt_pos_shape` for consistency with the other parameters
// (parameter names are not part of the ABI, so callers are unaffected).
std::vector<std::vector<int64_t>> SetMaskValueInferShape(const std::vector<int64_t>& input_data_shape,
                                                         const std::vector<int64_t>& stop_flags_shape,
                                                         const std::vector<int64_t>& seq_lens_shape,
                                                         const std::vector<int64_t>& alibi_slopes_shape,
                                                         const std::vector<int64_t>& tgt_pos_shape) {
    return {seq_lens_shape};
}
122+
123+
// Infer-dtype function for set_alibi_mask_value: the single output
// (sequence_lengths) is a copy of seq_lens, so it carries seq_lens' dtype
// (expected to be int32, enforced at kernel-launch time by PD_CHECK).
std::vector<paddle::DataType> SetMaskValueInferDtype(const paddle::DataType& input_data_dtype,
                                                     const paddle::DataType& stop_flags_dtype,
                                                     const paddle::DataType& seq_lens_dtype,
                                                     const paddle::DataType& alibi_slopes_dtype,
                                                     const paddle::DataType& tgt_pos_dtype) {
    return {seq_lens_dtype};
}
130+
131+
// Register the custom op with Paddle.
// Inputs mirror SetMaskValue's parameters; the single output
// "sequence_lengths" is the (possibly zeroed) copy of seq_lens.
// Note: input_data is modified in place by the kernel even though it is
// declared only as an input here.
PD_BUILD_OP(set_alibi_mask_value)
    .Inputs({"input_data", "stop_flags", "seq_lens", "alibi_slopes", "tgt_pos"})
    .Outputs({"sequence_lengths"})
    .SetKernelFn(PD_KERNEL(SetMaskValue))
    .SetInferShapeFn(PD_INFER_SHAPE(SetMaskValueInferShape))
    .SetInferDtypeFn(PD_INFER_DTYPE(SetMaskValueInferDtype));

csrc/setup_cuda.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
"./generation/write_cache_kv.cu",
3232
"./generation/encode_rotary_qk.cu",
3333
"./generation/top_p_sampling.cu",
34+
"./generation/set_alibi_mask_value.cu",
3435
]
3536
),
3637
)

llm/benchmark.sh

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,17 @@
1+
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
115
export PYTHONPATH=$(dirname $(pwd)):$PYTHONPATH
216

317
export FLAGS_control_flow_use_new_executor=1
@@ -6,10 +20,10 @@ export FLAGS_allocator_strategy=naive_best_fit
620
export FLAGS_fraction_of_gpu_memory_to_use=0.92
721

822
python predictor.py \
9-
--model_name_or_path ./llama-13b-inference_model_fp16 \
23+
--model_name_or_path ./llama7b-inference_model_fp16 \
1024
--dtype float16 \
1125
--src_length 300 \
12-
--max_length 400 \
26+
--max_length 100 \
1327
--output_file "infer.json" \
1428
--mode "static" \
1529
--batch_size 1 \

llm/llama/README.md

Lines changed: 24 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -4,27 +4,30 @@
44

55
**支持模型权重:**
66

7-
| Model |
8-
| ------------------------------ |
9-
| facebook/llama-7b |
10-
| facebook/llama-13b |
11-
| facebook/llama-30b |
12-
| facebook/llama-65b |
13-
| meta-llama/Llama-2-7b |
14-
| meta-llama/Llama-2-7b-chat |
15-
| meta-llama/Llama-2-13b |
16-
| meta-llama/Llama-2-13b-chat |
17-
| meta-llama/Llama-2-70b |
18-
| meta-llama/Llama-2-70b-chat |
19-
| ziqingyang/chinese-llama-7b |
20-
| ziqingyang/chinese-llama-13b |
21-
| ziqingyang/chinese-alpaca-7b |
22-
| ziqingyang/chinese-alpaca-13b |
23-
| idea-ccnl/ziya-llama-13b-v1 |
24-
| linly-ai/chinese-llama-2-7b |
25-
| baichuan-inc/Baichuan-7B |
26-
| baichuan-inc/Baichuan-13B-Base |
27-
| baichuan-inc/Baichuan-13B-Chat |
7+
| Model |
8+
| ---------------------------------|
9+
| facebook/llama-7b |
10+
| facebook/llama-13b |
11+
| facebook/llama-30b |
12+
| facebook/llama-65b |
13+
| meta-llama/Llama-2-7b |
14+
| meta-llama/Llama-2-7b-chat |
15+
| meta-llama/Llama-2-13b |
16+
| meta-llama/Llama-2-13b-chat |
17+
| meta-llama/Llama-2-70b |
18+
| meta-llama/Llama-2-70b-chat |
19+
| ziqingyang/chinese-llama-7b |
20+
| ziqingyang/chinese-llama-13b |
21+
| ziqingyang/chinese-alpaca-7b |
22+
| ziqingyang/chinese-alpaca-13b |
23+
| idea-ccnl/ziya-llama-13b-v1 |
24+
| linly-ai/chinese-llama-2-7b |
25+
| baichuan-inc/Baichuan-7B |
26+
| baichuan-inc/Baichuan-13B-Base |
27+
| baichuan-inc/Baichuan-13B-Chat |
28+
| FlagAlpha/Llama2-Chinese-7b-Chat |
29+
| FlagAlpha/Llama2-Chinese-13b-Chat |
30+
2831

2932

3033
使用方法:

0 commit comments

Comments
 (0)