 
 import math
 from copy import deepcopy
-from typing import OrderedDict
+from typing import Optional, OrderedDict, Union
 
 from ...utils import logger
 from ...utils.utility import LazyImport
@@ -679,7 +679,7 @@ def quant_weight_w_scale(weight, scale, zp, group_size=-1):
 
 def autoround_quantize(
     model,
-    tokenizer,
+    tokenizer=None,
     bits: int = 4,
     group_size: int = 128,
     sym: bool = False,
@@ -689,10 +689,8 @@ def autoround_quantize(
     amp: bool = True,
     device=None,
     lr_scheduler=None,
-    dataloader=None,  ## to support later
-    dataset_name: str = "NeelNanda/pile-10k",
-    dataset_split: str = "train",
-    use_quant_input: bool = True,
+    dataset: Union[str, list, tuple, torch.utils.data.DataLoader] = "NeelNanda/pile-10k",
+    enable_quanted_input: bool = True,
     enable_minmax_tuning: bool = True,
     lr: float = None,
     minmax_lr: float = None,
@@ -706,52 +704,52 @@ def autoround_quantize(
     gradient_accumulate_steps: int = 1,
     not_use_best_mse: bool = False,
     dynamic_max_gap: int = -1,
-    data_type: str = "int",  ##only support data_type
-    scale_dtype="fp16",
+    data_type: str = "int",  ##only support int for now
+    scale_dtype: str = "fp16",
     **kwargs,
 ):
     """Run autoround weight-only quantization.
     Args:
-        model: The PyTorch model to be quantized.
-        tokenizer: Tokenizer for processing input data. Temporarily set as a mandatory parameter.
-        bits (int): Number of bits for quantization (default is 4).
-        group_size (int): Size of the quantization group (default is 128).
-        sym (bool): Whether the symmetric quantization is to be used.
-        weight_config (dict): Configuration for weight quantization (default is an empty dictionary).
-                        weight_config={
-                            'layer1':##layer_name
-                            {
-                                'data_type': 'int',
-                                'bits': 4,
-                                'group_size': 32,
-                                'scheme': "asym", ## or sym
-                            }
-                            ...
-                        }
-        enable_full_range (bool): Whether to enable full range quantization (default is False).
-        bs (int): Batch size for training (default is 8).
-        amp (bool): Whether to use automatic mixed precision (default is True). Automatically detect and set.
-        device: The device to be used for tuning (default is None). Automatically detect and set.
-        lr_scheduler: The learning rate scheduler to be used.
-        dataloader: The dataloader for input data (to be supported in future).
-        dataset_name (str): The default dataset name (default is "NeelNanda/pile-10k").
-        dataset_split (str): The split of the dataset to be used (default is "train").
-        use_quant_input (bool): Whether to use quantized input data (default is True).
-        enable_minmax_tuning (bool): Whether to enable min-max tuning (default is True).
-        lr (float): The learning rate (default is 0.005).
-        minmax_lr (float): The learning rate for min-max tuning (default is None).
-        low_gpu_mem_usage (bool): Whether to use low GPU memory (default is True).
-        iters (int): Number of iterations (default is 200).
-        seqlen (int): Length of the sequence.
-        n_samples (int): Number of samples (default is 512).
-        sampler (str): The sampling method (default is "rand").
-        seed (int): The random seed (default is 42).
-        n_blocks (int): Number of blocks (default is 1).
-        gradient_accumulate_steps (int): Number of gradient accumulation steps (default is 1).
-        not_use_best_mse (bool): Whether to use mean squared error (default is False).
-        dynamic_max_gap (int): The dynamic maximum gap (default is -1).
-        data_type (str): The data type to be used (default is "int").
-        **kwargs: Additional keyword arguments.
+        model: The PyTorch model to be quantized.
+        tokenizer: An optional tokenizer for processing input data. If none is provided, a dataloader must be supplied.
+        bits (int): Number of bits for quantization (default is 4).
+        group_size (int): Size of the quantization group (default is 128).
+        sym (bool): Whether symmetric quantization is to be used (default is False).
+        weight_config (dict): Configuration for weight quantization (default is an empty dictionary).
+                        weight_config={
+                            'layer1': ##layer_name
+                            {
+                                'data_type': 'int',
+                                'bits': 4,
+                                'group_size': 32,
+                                'sym': False
+                            }
+                            ...
+                        }
+        enable_full_range (bool): Whether to enable full range quantization (default is False).
+        batch_size (int): Batch size for training (default is 8).
+        amp (bool): Whether to use automatic mixed precision (default is True).
+        device: The device to be used for tuning (default is "auto").
+        lr_scheduler: The learning rate scheduler to be used.
+        dataset (str or DataLoader): The dataset name, a list/tuple of samples, or a dataloader to be used
+                                     for tuning (default is "NeelNanda/pile-10k").
+        enable_quanted_input (bool): Whether to use the output of the previous quantized block as
+                                     the input for the current block (default is True).
+        enable_minmax_tuning (bool): Whether to enable weight min-max tuning (default is True).
+        lr (float): The learning rate (default is None, will be set to 1.0/iters).
+        minmax_lr (float): The learning rate for min-max tuning (default is None, it will be set to lr automatically).
+        low_gpu_mem_usage (bool): Whether to use low GPU memory (default is True).
+        iters (int): Number of iterations (default is 200).
+        seqlen (int): Data length of the sequence for tuning (default is 2048).
+        n_samples (int): Number of samples (default is 512).
+        sampler (str): The sampling method (default is "rand").
+        seed (int): The random seed (default is 42).
+        n_blocks (int): Number of blocks (default is 1).
+        gradient_accumulate_steps (int): Number of gradient accumulation steps (default is 1).
+        not_use_best_mse (bool): Whether to use mean squared error (default is False).
+        dynamic_max_gap (int): The dynamic maximum gap (default is -1).
+        data_type (str): The data type to be used (default is "int").
+        scale_dtype (str): The data type of quantization scale to be used (default is "fp16"); different kernels
+                           have different choices.
 
     Returns:
         The quantized model.
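For reference, a minimal usage sketch of the updated signature (not part of this commit): the import path, the model name, and the layer name in `weight_config` are assumptions for illustration only.

```python
# Illustrative sketch only: import path, model name, and layer name are assumptions.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

from neural_compressor.adaptor.torch_utils.weight_only import autoround_quantize  # assumed module path

model_name = "facebook/opt-125m"  # hypothetical small model for demonstration
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float32)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Per-layer override following the weight_config structure documented in the docstring above.
weight_config = {
    "model.decoder.layers.0.self_attn.q_proj": {  # hypothetical layer name
        "data_type": "int",
        "bits": 4,
        "group_size": 32,
        "sym": False,
    }
}

q_model = autoround_quantize(
    model,
    tokenizer=tokenizer,
    bits=4,
    group_size=128,
    weight_config=weight_config,
    dataset="NeelNanda/pile-10k",   # replaces dataset_name / dataset_split / dataloader
    enable_quanted_input=True,      # renamed from use_quant_input
)
```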
@@ -770,10 +768,8 @@ def autoround_quantize(
         amp=amp,
         device=device,
         lr_scheduler=lr_scheduler,
-        dataloader=dataloader,  ## to support later
-        dataset_name=dataset_name,
-        dataset_split=dataset_split,
-        use_quant_input=use_quant_input,
+        dataset=dataset,
+        enable_quanted_input=enable_quanted_input,
         enable_minmax_tuning=enable_minmax_tuning,
         lr=lr,
         minmax_lr=minmax_lr,
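Because `tokenizer` now defaults to None and the `dataset` argument is typed to also accept a `torch.utils.data.DataLoader`, a pre-tokenized calibration loader can be passed directly. A hedged sketch under those assumptions; the random calibration data, loader construction, and import path are illustrative only.

```python
# Illustrative sketch: random calibration data, showing only the plumbing for passing
# a pre-built DataLoader now that `tokenizer` defaults to None.
import torch
from torch.utils.data import DataLoader, TensorDataset

from neural_compressor.adaptor.torch_utils.weight_only import autoround_quantize  # assumed module path

seqlen = 2048
calib_ids = torch.randint(0, 32000, (64, seqlen))                  # hypothetical pre-tokenized input ids
calib_loader = DataLoader(TensorDataset(calib_ids), batch_size=8)  # shape/format depends on the model

q_model = autoround_quantize(
    model,                    # a model prepared as in the previous sketch; tokenizer is omitted
    bits=4,
    group_size=128,
    dataset=calib_loader,     # a DataLoader is accepted via the Union-typed `dataset` argument
    seqlen=seqlen,
)
```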