
Commit b6dc7d4

Fix examples style, Fix noqa comment (#123)

1 parent: 8e43aaa

4 files changed (+21, -11 lines)


examples/quantization_kv_cache/llama3_fp8_kv_example.py (9 additions, 1 deletion)

@@ -25,9 +25,17 @@
 ds = load_dataset(DATASET_ID, split=DATASET_SPLIT)
 ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES))
 
+
 def process_and_tokenize(example):
     text = tokenizer.apply_chat_template(example["messages"], tokenize=False)
-    return tokenizer(text, padding=False, max_length=MAX_SEQUENCE_LENGTH, truncation=True, add_special_tokens=False)
+    return tokenizer(
+        text,
+        padding=False,
+        max_length=MAX_SEQUENCE_LENGTH,
+        truncation=True,
+        add_special_tokens=False,
+    )
+
 
 ds = ds.map(process_and_tokenize, remove_columns=ds.column_names)
 
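The reformatted call above is the shape black produces once a call exceeds the line-length limit: one argument per line with a trailing comma. For context, a minimal, self-contained sketch of the calibration preprocessing this example performs; the dataset, tokenizer checkpoint, and constant values here are placeholder assumptions, not taken from this commit:

from datasets import load_dataset
from transformers import AutoTokenizer

# Placeholder values; the real example defines its own constants.
DATASET_ID = "HuggingFaceH4/ultrachat_200k"
DATASET_SPLIT = "train_sft"
NUM_CALIBRATION_SAMPLES = 512
MAX_SEQUENCE_LENGTH = 2048

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")

ds = load_dataset(DATASET_ID, split=DATASET_SPLIT)
ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES))


def process_and_tokenize(example):
    # Render the chat messages to one string, then tokenize with truncation.
    text = tokenizer.apply_chat_template(example["messages"], tokenize=False)
    return tokenizer(
        text,
        padding=False,
        max_length=MAX_SEQUENCE_LENGTH,
        truncation=True,
        add_special_tokens=False,
    )


ds = ds.map(process_and_tokenize, remove_columns=ds.column_names)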

examples/quantization_w8a8_fp8/gemma2_example.py (5 additions, 6 deletions)

@@ -7,22 +7,21 @@
 
 # 1) Load model.
 model = SparseAutoModelForCausalLM.from_pretrained(
-    MODEL_ID, device_map="auto", torch_dtype="auto")
+    MODEL_ID, device_map="auto", torch_dtype="auto"
+)
 tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
 
 # 2) Configure the quantization algorithm and scheme.
 # In this case, we:
 # * quantize the weights to fp8 with per channel via ptq
 # * quantize the activations to fp8 with dynamic per token
 recipe = QuantizationModifier(
-    targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"])
+    targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"]
+)
 
 # 3) Apply quantization and save in compressed-tensors format.
 OUTPUT_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic"
-oneshot(model=model,
-        recipe=recipe,
-        output_dir=OUTPUT_DIR,
-        tokenizer=tokenizer)
+oneshot(model=model, recipe=recipe, output_dir=OUTPUT_DIR, tokenizer=tokenizer)
 
 # Confirm generations of the quantized model look sane.
 print("========== SAMPLE GENERATION ==============")
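The file's final step (the "SAMPLE GENERATION" print) is a quick smoke test that the FP8-quantized model still produces coherent text. A minimal sketch of what such a check typically looks like; the prompt and generation settings are assumptions, since the actual check is not shown in this diff:

# Run a short greedy generation and print it for manual inspection.
input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to(model.device)
output = model.generate(input_ids, max_new_tokens=20)
print(tokenizer.decode(output[0]))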

examples/quantization_w8a8_int8/gemma2_example.py (5 additions, 2 deletions)

@@ -7,7 +7,10 @@
 # 1) Select model and load it.
 MODEL_ID = "google/gemma-2-2b-it"
 model = SparseAutoModelForCausalLM.from_pretrained(
-    MODEL_ID, device_map="auto", torch_dtype="auto",)
+    MODEL_ID,
+    device_map="auto",
+    torch_dtype="auto",
+)
 tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
 
 # 2) Prepare calibration dataset.

@@ -62,7 +65,7 @@ def tokenize(sample):
     recipe=recipe,
     max_seq_length=MAX_SEQUENCE_LENGTH,
     num_calibration_samples=NUM_CALIBRATION_SAMPLES,
-    output_dir=MODEL_ID.split("/")[1] + "-INT8"
+    output_dir=MODEL_ID.split("/")[1] + "-INT8",
 )
 
 # Confirm generations of the quantized model look sane.
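The trailing comma added in the second hunk follows black's "magic trailing comma" convention: once a call is formatted one argument per line, every argument, including the last, ends with a comma, so appending a new argument later touches a single line in the diff. For illustration, the full call shape after this change would look like the sketch below; the opening lines are an assumption, since the diff hunk starts at recipe=recipe:

oneshot(
    model=model,
    recipe=recipe,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
    output_dir=MODEL_ID.split("/")[1] + "-INT8",
)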

tests/e2e/vLLM/test_vllm.py (2 additions, 2 deletions)

@@ -41,11 +41,11 @@ class TestvLLM(unittest.TestCase):
     run on a cadence defined by the `cadence` field. Each config defines the model
     to quantize. Optionally, a dataset id and split can be provided for calibration.
     Finally, all config files must list a scheme. The scheme can be a preset scheme
-    from https://github.com/neuralmagic/compressed-tensors/blob/main/src/compressed_tensors/quantization/quant_scheme.py # noqa: E501
+    from https://github.com/neuralmagic/compressed-tensors/blob/main/src/compressed_tensors/quantization/quant_scheme.py
     or another identifier which can be used for the particular test case. If a recipe
     is not provided, it is assumed that the scheme provided is a preset scheme and will
     be used for quantization. Otherwise, the recipe will always be used if given.
-    """
+    """  # noqa: E501
 
     model = None
     scheme = None
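Moving the # noqa: E501 marker after the closing """ keeps it out of the docstring text itself, so it no longer appears in help() output or generated docs. flake8 still honors it: as of flake8 3.x, a noqa comment found on any physical line of a multiline string, including the line with the closing quotes, is applied to every line of that string, so the single marker covers the long URL line too.

For illustration, a test config of the shape this docstring describes might look like the following sketch; the field names beyond cadence, model, scheme, dataset id/split, and recipe are assumptions, not taken from the repository:

# Hypothetical test config mirroring the fields the docstring names.
config = {
    "cadence": "nightly",                          # how often the test runs
    "model": "google/gemma-2-2b-it",               # model to quantize
    "scheme": "FP8_DYNAMIC",                       # preset scheme or custom identifier
    "dataset_id": "HuggingFaceH4/ultrachat_200k",  # optional, for calibration
    "dataset_split": "train_sft",                  # optional
    # "recipe": "recipe.yaml",                     # if given, always used over the preset
}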
