Commit 0e724a4

Add save/load for pt2e example (#1927)
Signed-off-by: Kaihui-intel <[email protected]>
1 parent 50eb6fb commit 0e724a4

File tree: 12 files changed, +502 additions, −307 deletions

examples/3.x_api/pytorch/cv/static_quant/main.py

Lines changed: 236 additions & 270 deletions
Large diffs are not rendered by default.

examples/3.x_api/pytorch/cv/static_quant/run_benchmark.sh (new file; path inferred from the script contents and the run_quant.sh alongside it)

Lines changed: 103 additions & 0 deletions
@@ -0,0 +1,103 @@
+#!/bin/bash
+set -x
+
+function main {
+
+  init_params "$@"
+  run_benchmark
+
+}
+
+# init params
+function init_params {
+  iters=100
+  batch_size=16
+  tuned_checkpoint=saved_results
+  echo ${max_eval_samples}
+  for var in "$@"
+  do
+    case $var in
+      --topology=*)
+          topology=$(echo $var |cut -f2 -d=)
+      ;;
+      --dataset_location=*)
+          dataset_location=$(echo $var |cut -f2 -d=)
+      ;;
+      --input_model=*)
+          input_model=$(echo $var |cut -f2 -d=)
+      ;;
+      --mode=*)
+          mode=$(echo $var |cut -f2 -d=)
+      ;;
+      --batch_size=*)
+          batch_size=$(echo $var |cut -f2 -d=)
+      ;;
+      --iters=*)
+          iters=$(echo ${var} |cut -f2 -d=)
+      ;;
+      --int8=*)
+          int8=$(echo ${var} |cut -f2 -d=)
+      ;;
+      --config=*)
+          tuned_checkpoint=$(echo $var |cut -f2 -d=)
+      ;;
+      *)
+          echo "Error: No such parameter: ${var}"
+          exit 1
+      ;;
+    esac
+  done
+
+}
+
+
+# run_benchmark
+function run_benchmark {
+    extra_cmd=''
+
+    if [[ ${mode} == "accuracy" ]]; then
+        mode_cmd=" --accuracy "
+    elif [[ ${mode} == "performance" ]]; then
+        mode_cmd=" --performance --iters "${iters}
+    else
+        echo "Error: No such mode: ${mode}"
+        exit 1
+    fi
+    if [[ ${int8} == "true" ]]; then
+        extra_cmd=$extra_cmd" --int8"
+    fi
+    echo $extra_cmd
+
+
+    echo $extra_cmd
+
+    if [ "${topology}" = "resnet18_pt2e_static" ]; then
+        model_name_or_path="resnet18"
+    fi
+
+    if [[ ${mode} == "accuracy" ]]; then
+        python main.py \
+            --pretrained \
+            -a resnet18 \
+            -b 30 \
+            --tuned_checkpoint ${tuned_checkpoint} \
+            ${dataset_location} \
+            ${extra_cmd} \
+            ${mode_cmd}
+    elif [[ ${mode} == "performance" ]]; then
+        incbench --num_cores_per_instance 4 \
+            main.py \
+            --pretrained \
+            -a resnet18 \
+            -b 30 \
+            --tuned_checkpoint ${tuned_checkpoint} \
+            ${dataset_location} \
+            ${extra_cmd} \
+            ${mode_cmd}
+    else
+        echo "Error: No such mode: ${mode}"
+        exit 1
+    fi
+}
+
+main "$@"

examples/3.x_api/pytorch/cv/static_quant/run_quant.sh

Lines changed: 8 additions & 1 deletion
@@ -10,6 +10,7 @@ function main {
 
 # init params
 function init_params {
+  tuned_checkpoint="saved_results"
   for var in "$@"
   do
     case $var in
@@ -39,7 +40,13 @@ function run_tuning {
     if [ "${topology}" = "resnet18_pt2e_static" ]; then
         model_name_or_path="resnet18"
     fi
-    python main.py -a ${model_name_or_path} ${dataset_location} -q -e
+    python main.py \
+        --pretrained \
+        -t \
+        -a resnet18 \
+        -b 30 \
+        --tuned_checkpoint ${tuned_checkpoint} \
+        ${dataset_location}
 }
 
 main "$@"

examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/run_benchmark.sh (new file; path inferred from the run_quant.sh alongside it)

Lines changed: 99 additions & 0 deletions
@@ -0,0 +1,99 @@
+#!/bin/bash
+set -x
+
+function main {
+
+  init_params "$@"
+  run_benchmark
+
+}
+
+# init params
+function init_params {
+  iters=100
+  batch_size=16
+  tuned_checkpoint=saved_results
+  task=lambada_openai
+  echo ${max_eval_samples}
+  for var in "$@"
+  do
+    case $var in
+      --topology=*)
+          topology=$(echo $var |cut -f2 -d=)
+      ;;
+      --dataset_location=*)
+          dataset_location=$(echo $var |cut -f2 -d=)
+      ;;
+      --input_model=*)
+          input_model=$(echo $var |cut -f2 -d=)
+      ;;
+      --mode=*)
+          mode=$(echo $var |cut -f2 -d=)
+      ;;
+      --batch_size=*)
+          batch_size=$(echo $var |cut -f2 -d=)
+      ;;
+      --iters=*)
+          iters=$(echo ${var} |cut -f2 -d=)
+      ;;
+      --int8=*)
+          int8=$(echo ${var} |cut -f2 -d=)
+      ;;
+      --config=*)
+          tuned_checkpoint=$(echo $var |cut -f2 -d=)
+      ;;
+      *)
+          echo "Error: No such parameter: ${var}"
+          exit 1
+      ;;
+    esac
+  done
+
+}
+
+
+# run_benchmark
+function run_benchmark {
+    extra_cmd=''
+
+    if [[ ${mode} == "accuracy" ]]; then
+        mode_cmd=" --accuracy "
+        extra_cmd=$extra_cmd
+    elif [[ ${mode} == "performance" ]]; then
+        mode_cmd=" --performance --iters "${iters}
+        extra_cmd=$extra_cmd
+    else
+        echo "Error: No such mode: ${mode}"
+        exit 1
+    fi
+
+    if [[ ${int8} == "true" ]]; then
+        extra_cmd=$extra_cmd" --int8"
+    fi
+    echo $extra_cmd
+
+    echo $extra_cmd
+
+    if [ "${topology}" = "opt_125m_pt2e_static" ]; then
+        model_name_or_path="facebook/opt-125m"
+    fi
+    if [[ ${mode} == "accuracy" ]]; then
+        python -u run_clm_no_trainer.py \
+            --model ${model_name_or_path} \
+            --output_dir ${tuned_checkpoint} \
+            --task ${task} \
+            --batch_size ${batch_size} \
+            ${extra_cmd} ${mode_cmd}
+    elif [[ ${mode} == "performance" ]]; then
+        incbench --num_cores_per_instance 4 run_clm_no_trainer.py \
+            --model ${model_name_or_path} \
+            --batch_size ${batch_size} \
+            --output_dir ${tuned_checkpoint} \
+            ${extra_cmd} ${mode_cmd}
+    else
+        echo "Error: No such mode: ${mode}"
+        exit 1
+    fi
+}
+
+main "$@"

examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/run_clm_no_trainer.py

Lines changed: 39 additions & 32 deletions
@@ -14,7 +14,7 @@
     "--revision", default=None,
     help="Transformers parameter: set the model hub commit number")
 parser.add_argument("--dataset", nargs="?", default="NeelNanda/pile-10k", const="NeelNanda/pile-10k")
-parser.add_argument("--output_dir", nargs="?", default="./saved_results")
+parser.add_argument("--output_dir", nargs="?", default="")
 parser.add_argument("--quantize", action="store_true")
 parser.add_argument("--approach", type=str, default='static',
     help="Select from ['dynamic', 'static', 'weight-only']")
@@ -80,7 +80,7 @@ def get_example_inputs(tokenizer):
     dynamic_shapes = {"input_ids": (batch, seq_len)}
     example_inputs = get_example_inputs(tokenizer)
     exported_model = export(user_model, example_inputs=example_inputs, dynamic_shapes=dynamic_shapes)
-
+
     quant_config = get_default_static_config()
     # prepare
     prepare_model = prepare(exported_model, quant_config)
@@ -90,17 +90,32 @@ def get_example_inputs(tokenizer):
     prepare_model(*example_inputs)
     # convert
     converted_model = convert(prepare_model)
-    # inference
-    from torch._inductor import config
+
+    # save
+    if args.output_dir:
+        converted_model.save(example_inputs=example_inputs, output_dir = args.output_dir)
+
+
+
+if args.int8:
+    if args.output_dir:
+        print("Load int8 model.")
+        from neural_compressor.torch.quantization import load
+        model = load(args.output_dir)
 
-    config.freezing = True
-    opt_model = torch.compile(converted_model)
+        model.config = user_model.config # for lm eval
+
+    # Compile the quantized model and replace the Q/DQ pattern with Q-operator
+    from torch._inductor import config
 
-    opt_model.config = user_model.config # for lm eval
-    user_model = opt_model
+    config.freezing = True
+    opt_model = torch.compile(model)
 
+    opt_model.config = user_model.config # for lm eval
+    user_model = opt_model
 
 if args.accuracy:
+
     from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import evaluate, LMEvalParser
     eval_args = LMEvalParser(
         model="hf",
@@ -120,29 +135,21 @@ def get_example_inputs(tokenizer):
     print('Batch size = %d' % args.batch_size)
 
 if args.performance:
-    # user_model.eval()
-    from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import evaluate, LMEvalParser
+    batch_size, input_leng = args.batch_size, 512
+    example_inputs = torch.ones((batch_size, input_leng), dtype=torch.long)
+    print("Batch size = {:d}".format(batch_size))
+    print("The length of input tokens = {:d}".format(input_leng))
     import time
 
-    samples = args.iters * args.batch_size
-    eval_args = LMEvalParser(
-        model="hf",
-        user_model=user_model,
-        tokenizer=tokenizer,
-        batch_size=args.batch_size,
-        tasks=args.tasks,
-        limit=samples,
-        device="cpu",
-    )
-    start = time.time()
-    results = evaluate(eval_args)
-    end = time.time()
-    for task_name in args.tasks.split(","):
-        if task_name == "wikitext":
-            acc = results["results"][task_name]["word_perplexity,none"]
-        else:
-            acc = results["results"][task_name]["acc,none"]
-    print("Accuracy: %.5f" % acc)
-    print('Throughput: %.3f samples/sec' % (samples / (end - start)))
-    print('Latency: %.3f ms' % ((end - start) * 1000 / samples))
-    print('Batch size = %d' % args.batch_size)
+    total_iters = args.iters
+    warmup_iters = 5
+    with torch.no_grad():
+        for i in range(total_iters):
+            if i == warmup_iters:
+                start = time.time()
+            user_model(example_inputs)
+    end = time.time()
+    latency = (end - start) / ((total_iters - warmup_iters) * args.batch_size)
+    throughput = ((total_iters - warmup_iters) * args.batch_size) / (end - start)
+    print("Latency: {:.3f} ms".format(latency * 10**3))
+    print("Throughput: {:.3f} samples/sec".format(throughput))

examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/run_quant.sh

Lines changed: 2 additions & 1 deletion
@@ -39,8 +39,9 @@ function run_tuning {
 
     if [ "${topology}" = "opt_125m_pt2e_static" ]; then
        model_name_or_path="facebook/opt-125m"
+        output_dir="saved_results"
     fi
-    python run_clm_no_trainer.py --model ${model_name_or_path} --quantize --accuracy --tasks "lambada_openai"
+    python run_clm_no_trainer.py --model ${model_name_or_path} --quantize --output_dir ${output_dir} --tasks "lambada_openai"
 }
 
 main "$@"

neural_compressor/common/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -15,6 +15,7 @@
 
 from neural_compressor.common.utils import (
     level,
+    level_name,
     logger,
     Logger,
     TuningLogger,
@@ -31,6 +32,7 @@
 __all__ = [
     "options",
     "level",
+    "level_name",
     "logger",
     "Logger",
     "TuningLogger",

neural_compressor/common/utils/logger.py

Lines changed: 2 additions & 0 deletions
@@ -24,6 +24,7 @@
 
 __all__ = [
     "level",
+    "level_name",
     "Logger",  # TODO: not expose it
     "logger",
     "TuningLogger",
@@ -138,6 +139,7 @@ def warning(msg, *args, **kwargs):
 
 
 level = Logger().get_logger().level
+level_name = logging.getLevelName(level)
 
 logger = Logger
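
The new level_name simply exposes the configured logging level by name, so callers can gate expensive, verbose output on a readable string instead of comparing numeric levels; the half_precision_rewriter change below is the first such caller. A minimal sketch of the intended use:

    import logging
    from neural_compressor.common import level, level_name

    # level_name is logging.getLevelName(level), re-exported for convenience
    assert level_name == logging.getLevelName(level)

    if level_name == "DEBUG":
        # only produce costly diagnostics (e.g. printing a whole graph) in debug runs
        print("debug-only diagnostics go here")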

neural_compressor/torch/algorithms/pt2e_quant/half_precision_rewriter.py

Lines changed: 2 additions & 1 deletion
@@ -185,7 +185,8 @@ def transformation(gm: torch.fx.GraphModule, node_candidate_list: List[str], tar
     for pattern_pair in HALF_PRECISION_PATTERN_REGISTRY[target_dtype].values():
         apply_single_pattern_pair(gm, pattern_pair, node_candidate_list)
     utils.logger.info("Half precision conversion is done:")
-    gm.print_readable(True)
+    if utils.level_name == "DEBUG":  # pragma: no cover
+        gm.print_readable(True)
 
 
 # =============================================================================

neural_compressor/torch/algorithms/pt2e_quant/save_load.py

Lines changed: 2 additions & 1 deletion
@@ -34,7 +34,8 @@ def save(model, example_inputs, output_dir="./saved_results"):
     os.makedirs(output_dir, exist_ok=True)
     qmodel_file_path = os.path.join(os.path.abspath(os.path.expanduser(output_dir)), WEIGHT_NAME)
     qconfig_file_path = os.path.join(os.path.abspath(os.path.expanduser(output_dir)), QCONFIG_NAME)
-    quantized_ep = torch.export.export(model, example_inputs)
+    dynamic_shapes = model.dynamic_shapes
+    quantized_ep = torch.export.export(model, example_inputs, dynamic_shapes=dynamic_shapes)
     torch.export.save(quantized_ep, qmodel_file_path)
     for key, op_config in model.qconfig.items():
         model.qconfig[key] = op_config.to_dict()
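
Re-exporting with the dynamic shapes recorded on the quantized model keeps symbolic dimensions (e.g. a variable sequence length) intact through serialization; a plain torch.export.export(model, example_inputs) would specialize the program to the concrete sizes of example_inputs. A minimal sketch of the round trip using public torch.export APIs (the file name is illustrative, standing in for the module's WEIGHT_NAME constant):

    import torch

    def save_exported(model, example_inputs, path="saved_results/quantized_model.pt2"):
        # Mirror the patched save(): re-export with the model's recorded
        # dynamic shapes, then serialize the ExportedProgram.
        ep = torch.export.export(model, example_inputs, dynamic_shapes=model.dynamic_shapes)
        torch.export.save(ep, path)

    def load_exported(path="saved_results/quantized_model.pt2"):
        # torch.export.load restores the ExportedProgram; .module() returns a runnable nn.Module.
        return torch.export.load(path).module()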
