@@ -4615,6 +4615,9 @@ def quantize(self, tune_cfg, model, dataloader, calib_func=None):
46154615 q_model ._model = self .awq_quantize (q_model ._model , tune_cfg , dataloader , calib_func )
46164616 if "RTN" in all_algo :
46174617 q_model ._model = self .rtn_quantize (q_model ._model , tune_cfg )
4618+ if "AUTOROUND" in all_algo :
4619+ q_model ._model , autoround_config = self .autoround_quantize (q_model ._model , tune_cfg , dataloader )
4620+ q_model .autoround_config = autoround_config
46184621
46194622 q_model .q_config = copy .deepcopy (self .tune_cfg )
46204623 q_model .is_quantized = True
@@ -4911,6 +4914,93 @@ def awq_quantize(self, model, tune_cfg, dataloader, calib_func):
49114914 )
49124915 return model
49134916
def autoround_quantize(self, model, tune_cfg, dataloader):
    """Quantize the weights of ``model`` with the AutoRound algorithm.

    A per-op weight configuration is derived from ``tune_cfg`` and the
    AutoRound options are read from ``self.recipes["autoround_args"]``
    (falling back to the defaults listed below for anything missing).
    The actual work is delegated to
    ``torch_utils.weight_only.autoround_quantize``.

    Args:
        model: model object forwarded to the AutoRound implementation.
        tune_cfg (dict): tuning configuration. ``tune_cfg["op"]`` maps
            ``(op_name, op_type)`` keys to dicts whose ``"weight"`` entry
            provides ``dtype``, ``bits``, ``group_size`` and ``scheme``.
        dataloader: calibration dataloader, forwarded unchanged.

    Returns:
        tuple: ``(model, autoround_config)`` exactly as returned by the
        AutoRound implementation.
    """
    logger.info("quantizing with the AutoRound algorithm")
    from .torch_utils.weight_only import autoround_quantize

    # Build weight_config, one entry per op that is actually quantized:
    #
    #   weight_config = {
    #       'layer1': {              # layer name
    #           'data_type': 'int',
    #           'bits': 4,
    #           'group_size': 32,
    #           'scheme': "asym",    # or "sym"
    #       },
    #       ...
    #   }
    weight_config = {}
    for (op_name, _op_type), op_cfg in tune_cfg["op"].items():
        weight_cfg = op_cfg["weight"]
        if weight_cfg["dtype"] == "fp32":
            # fp32 ops are left unquantized, so they get no entry.
            continue
        weight_config[op_name] = {
            "data_type": weight_cfg["dtype"],
            "bits": weight_cfg["bits"],
            "group_size": weight_cfg["group_size"],
            "scheme": weight_cfg["scheme"],
        }

    # AutoRound recipe options and their defaults. Looking the recipe up
    # once (and tolerating its absence) avoids a KeyError when the user
    # did not supply any "autoround_args" at all.
    recipe_defaults = {
        "enable_full_range": False,
        "bs": 8,
        "amp": True,
        "device": "cpu",
        "lr_scheduler": None,
        "dataset_name": "NeelNanda/pile-10k",
        "dataset_split": "train",
        "use_quant_input": True,
        "enable_minmax_tuning": True,
        "lr": None,
        "minmax_lr": None,
        "low_gpu_mem_usage": True,
        "iters": 200,
        "seqlen": 2048,
        "n_samples": 512,
        "sampler": "rand",
        "seed": 42,
        "n_blocks": 1,
        "gradient_accumulate_steps": 1,
        "not_use_best_mse": False,
        "dynamic_max_gap": -1,
        # NOTE(review): the original comment read "only support data_type";
        # presumably only "int" is supported here -- confirm.
        "data_type": "int",
        "scale_dtype": "fp16",
    }
    user_args = self.recipes.get("autoround_args") or {}
    autoround_kwargs = {
        option: user_args.get(option, default)
        for option, default in recipe_defaults.items()
    }

    # bits / group_size / scheme are passed as fixed values here; the
    # per-op settings travel in weight_config.
    model, autoround_config = autoround_quantize(
        model=model,
        tokenizer=None,
        bits=4,
        group_size=128,
        scheme="asym",
        weight_config=weight_config,
        dataloader=dataloader,
        **autoround_kwargs,
    )
    return model, autoround_config
5003+
49145004 def _dump_model_op_stats (self , model , tune_cfg ):
49155005 """This is a function to dump quantizable ops of model to user.
49165006
0 commit comments