 # limitations under the License.
 from __future__ import annotations

+import argparse
 import json
 import math
 import re

 import numpy as np
 import paddle
-from args import get_parser
+from paddle.distributed import fleet
 from paddle.io import DataLoader
-from utils import load_model

 from paddlenlp.data import Stack, Tuple
 from paddlenlp.transformers import AutoTokenizer, BloomForPretraining
 from paddlenlp.utils.log import logger


+def get_parser():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model_type", default=None, type=str, required=True, help="Model type selected in the list")
+    parser.add_argument(
+        "--model_name_or_path",
+        default=None,
+        type=str,
+        required=True,
+        help="Path to pre-trained model or shortcut name selected in the list.",
+    )
+
+    # only support tensor_parallel_degree
+    parser.add_argument(
+        "--tensor_parallel_degree",
+        type=int,
+        default=1,
+        help="Model parallelism degree. Splitting the linear layers across many cards.",
+    )
+
+    # Other config
+    parser.add_argument("--seed", type=int, default=1234, help="Random seed for initialization")
+    parser.add_argument(
+        "--device", type=str, default="gpu", choices=["cpu", "gpu", "xpu", "npu"], help="Select the cpu, gpu, xpu or npu device."
+    )
+    parser.add_argument(
+        "--dtype",
+        type=str,
+        default="float16",
+        choices=["bfloat16", "float16", "float32"],
+        help="Set the dtype of the model.",
+    )
+
+    # load auto-dist weight files by rank name, e.g. for bloom-176b
+    parser.add_argument("--load_autodist", action="store_true", help="Whether to load auto-dist weight files.")
+    return parser
+
+
 def get_eval_parser():
     parser = get_parser()
     parser.add_argument(
@@ -45,9 +82,9 @@ def get_eval_parser():
     parser.add_argument("--overlapping_eval", type=int, default=32, help="Sliding window for overlapping eval.")
     parser.add_argument("--batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.")
     parser.add_argument(
-        "--seq_length", type=int, default=1024, help="Maximum sequence length to process for evaluation."
+        "--seq_length", type=int, default=512, help="Maximum sequence length to process for evaluation."
     )
-    parser.add_argument("--logging_steps", type=int, default=100, help="Log every X updates steps.")
+    parser.add_argument("--logging_steps", type=int, default=10, help="Log every X steps during evaluation.")
     return parser


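A minimal usage sketch, not part of the diff, exercising the new argparse-based get_parser() added above; the checkpoint name and flag values are illustrative assumptions only:

# Sketch: parse a hypothetical set of flags with the parser introduced in this change.
# "bigscience/bloom-560m" is an assumed example checkpoint, not taken from the diff.
args = get_parser().parse_args(
    [
        "--model_type", "bloom",
        "--model_name_or_path", "bigscience/bloom-560m",
        "--tensor_parallel_degree", "1",
        "--dtype", "float16",
    ]
)
print(args.seed)  # 1234 by default
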
@@ -244,14 +281,38 @@ def create_eval_dataset(args):
     return val_dataloader


-@paddle.no_grad()
 def do_generation():
     parser = get_eval_parser()
     args = parser.parse_args()
+    paddle.set_default_dtype(args.dtype)
+
+    if args.tensor_parallel_degree > 1:
+        strategy = fleet.DistributedStrategy()
+        strategy.hybrid_configs = {
+            "mp_degree": args.tensor_parallel_degree,
+        }
+        # Set control in tensor parallel
+        strategy.tensor_parallel_configs = {"tensor_init_seed": args.seed}
+        fleet.init(is_collective=True, strategy=strategy)
+
+    # compatibility code for the bloom-176b weight files
+    if args.load_autodist:
+        BloomForPretraining.resource_files_names[
+            "model_state"
+        ] = f"auto_dist{paddle.distributed.get_rank()}.pdparams"

     eval_data_loader = create_eval_dataset(args)
     tic_eval = time.time()
-    model = load_model(args, model_class=BloomForPretraining)
+
+    model = BloomForPretraining.from_pretrained(
+        args.model_name_or_path,
+        load_state_as_np=True,
+        low_cpu_mem_usage=True,  # todo: enable low_cpu_mem_usage=True
+        dtype=args.dtype,  # todo: set dtype to avoid additional memory usage
+        tensor_parallel_degree=args.tensor_parallel_degree,
+        tensor_parallel_rank=paddle.distributed.get_rank(),
+    )
+
     model.eval()
     total_score = 0
     score_name = "loss" if not args.cloze_eval else "number correct"
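A side note on the tensor_parallel_rank argument above: because this change only configures mp_degree, the global rank returned by paddle.distributed.get_rank() coincides with the model-parallel rank. A hedged sketch of the more general lookup, assuming fleet.init() has run as in the block above:

# Sketch: fetch the model-parallel rank from the hybrid communicate group
# instead of relying on the global rank; equivalent here because tensor
# parallelism is the only parallel dimension this script sets up.
hcg = fleet.get_hybrid_communicate_group()
mp_rank = hcg.get_model_parallel_rank()
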
@@ -261,14 +322,14 @@ def do_generation():

         tokens, loss_mask = batch[:2]
         labels = batch[-1]
+        with paddle.amp.auto_cast(args.use_pure_fp16):
+            preds = model(tokens).detach()

-        with paddle.amp.auto_cast(args.use_pure_fp16, level="O2", dtype=model.config.dtype):
-            preds = paddle.cast(model(tokens).detach(), dtype=paddle.float32)
+        # cast preds to float32 to keep high precision
+        preds = preds.astype(paddle.float32)

         if not args.cloze_eval:
             masked_lm_loss = paddle.nn.functional.cross_entropy(preds, labels, reduction="none")
-            masked_lm_loss = paddle.cast(masked_lm_loss, "float32")
-
             loss = paddle.sum(masked_lm_loss * loss_mask)
             total_score += loss.numpy() / (args.num_tokenized_tokens - 1)
         else: