 import numpy as np
 import paddle
 import paddle.incubate.multiprocessing as mp
+from env import MAX_BSZ, MAX_DRAFT_TOKENS, SPECULATE_MAX_BSZ
 from paddle.base.framework import in_cinn_mode, in_pir_executor_mode, use_pir_api
 from paddle.distributed import fleet
+from proposers import InferenceWithReferenceProposer
 
-from llm.speculate_decoding.proposers import InferenceWithReferenceProposer
 from paddlenlp.generation import GenerationConfig, TextIteratorStreamer
 from paddlenlp.peft import LoRAConfig, LoRAModel, PrefixConfig, PrefixModelForCausalLM
 from paddlenlp.taskflow.utils import static_mode_guard
@@ -47,12 +48,6 @@
 from paddlenlp.utils.import_utils import is_paddlenlp_ops_available
 from paddlenlp.utils.log import logger
 
-# Note(@RochardWooSJTU): MAX_BSZ must be the same as definition in get_output / save_output
-MAX_BSZ = 512
-# Note(@Wanglongzhi2001): SPECULATE_MAX_BSZ must be the same as definition in speculate_get_output / speculate_save_output
-SPECULATE_MAX_BSZ = 256
-MAX_DRAFT_TOKENS = 6
-
 
 @dataclass
 class PredictorArgument:
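The constants deleted above now come from a new top-level env module. A minimal sketch of what that module is assumed to provide, reusing the removed values as defaults (the real module may define them differently, e.g. as plain constants rather than environment overrides):

# env.py -- hypothetical sketch, not the actual module from this change.
import os

# Note: MAX_BSZ must stay in sync with get_output / save_output.
MAX_BSZ = int(os.getenv("MAX_BSZ", 512))
# Note: SPECULATE_MAX_BSZ must stay in sync with speculate_get_output / speculate_save_output.
SPECULATE_MAX_BSZ = int(os.getenv("SPECULATE_MAX_BSZ", 256))
MAX_DRAFT_TOKENS = int(os.getenv("MAX_DRAFT_TOKENS", 6))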
@@ -108,7 +103,7 @@ class PredictorArgument:
         default="fp16",
         metadata={"help": "avx cachekv type. Supported values: fp16,int8"},
     )
-    batch_size: int = field(default=10, metadata={"help": "The batch size of data."})
+    batch_size: int = field(default=1, metadata={"help": "The batch size of data."})
     benchmark: bool = field(
         default=False,
         metadata={
@@ -1242,15 +1237,11 @@ def predict(self, input_texts: list[str], return_tokens=False):
         else:
             return outputs
 
-    def _preprocess(self, input_text: list[str]):
-        super()._preprocess(input_text)
-
+    def init_proposer_args(self):
         for bid in range(self.config.batch_size):
             self.model_inputs["pre_ids"][bid, 0] = self.model_inputs["input_ids"][bid][
                 self.model_inputs["seq_lens_this_time"][bid] - 1
             ]  # get the last token before padding of this batch
-
-    def init_proposer_args(self):
         self.model_inputs["accept_tokens"] = paddle.full(
             shape=[self.config.batch_size, self.config.speculate_max_draft_token_num + 1], fill_value=0, dtype="int64"
         )
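For reference, a tiny self-contained illustration (with made-up token values) of the pre_ids seeding kept above: slot 0 of each row receives the last real prompt token, i.e. the token just before the padding.

import numpy as np

input_ids = np.array([[11, 22, 33, 0, 0],   # row 0: 3 real tokens, then padding
                      [44, 55,  0, 0, 0]])  # row 1: 2 real tokens, then padding
seq_lens_this_time = np.array([3, 2])
pre_ids = np.zeros((2, 5), dtype=np.int64)

for bid in range(2):
    # same indexing as in init_proposer_args: last token before the padding
    pre_ids[bid, 0] = input_ids[bid][seq_lens_this_time[bid] - 1]

print(pre_ids[:, 0])  # [33 55]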
@@ -1286,6 +1277,11 @@ def __init__(
         self.proposer = None
 
     def init_proposer_args(self):
+        for bid in range(self.config.batch_size):
+            self.model_inputs["pre_ids"][bid, 0] = self.model_inputs["input_ids"][bid][
+                self.model_inputs["seq_lens_this_time"][bid] - 1
+            ]  # get the last token before padding of this batch
+
         self.model_inputs["accept_tokens"] = paddle.full(
             shape=[self.config.batch_size, self.config.speculate_max_draft_token_num + 1], fill_value=0, dtype="int64"
         )
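The accept_tokens buffer above is sized batch_size x (speculate_max_draft_token_num + 1); the extra slot presumably holds the token the target model appends after verifying the drafts. A standalone sketch with illustrative sizes:

import paddle

batch_size = 2
speculate_max_draft_token_num = 4  # illustrative value, not a default from this change

accept_tokens = paddle.full(
    shape=[batch_size, speculate_max_draft_token_num + 1], fill_value=0, dtype="int64"
)
print(accept_tokens.shape)  # [2, 5]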
@@ -1299,14 +1295,6 @@ def init_proposer_args(self):
         if self.config.speculate_method == "inference_with_reference":
             self.proposer.input_ids_cpu = self.model_inputs["input_ids"].cpu()
 
-    def _preprocess(self, input_text: list[str]):
-        super()._preprocess(input_text)
-
-        for bid in range(self.config.batch_size):
-            self.model_inputs["pre_ids"][bid, 0] = self.model_inputs["input_ids"][bid][
-                self.model_inputs["seq_lens_this_time"][bid] - 1
-            ]  # get the last token before padding of this batch
-
     def predict(self, input_texts: list[str], return_tokens=False):
         s_time = time.time()
         self._preprocess(input_texts)