
Commit 59f8422

brian-dellabetta and shanjiaz authored and committed
fix lm eval test reproducibility issues (#1260)
SUMMARY: The lm-eval multimodal tests were failing to reproduce across different versions of compressed-tensors. After upgrading the models from 2B to 7B, the tests reproduce across compressed-tensors 0.9.1, 0.9.2, and nightly. I ran the FP8 config extensively across different versions of CT, and it always returned the same result. I also removed the random seed from the configs; after several runs of each of the three configs, I did not see any change in results. This may cause errors during CI/CD testing, but I'd like to see if it does; I feel that is a better e2e test anyway. Tests take roughly 1h30m to 1h45m to run.

TEST PLAN: No new src code, just fixing tests.

---------

Signed-off-by: Brian Dellabetta <[email protected]>
Signed-off-by: shanjiaz <[email protected]>
1 parent ffa570c commit 59f8422

File tree

5 files changed: +54 −35 lines

Lines changed: 17 additions & 0 deletions

@@ -0,0 +1,17 @@
+cadence: weekly
+model: Qwen/Qwen2.5-VL-7B-Instruct
+model_class: TraceableQwen2_5_VLForConditionalGeneration
+scheme: FP8_DYNAMIC
+lmeval:
+  model: "hf-multimodal"
+  model_args:
+    dtype: bfloat16
+    add_bos_token: True
+    convert_img_format: True
+  task: mmmu_val_literature
+  num_fewshot: 0
+  batch_size: 8
+  # dense model achieves accuracy of 0.9 +/ 0.0557
+  metrics:
+    acc,none: 0.8667
+    acc_stderr,none: 0.0557
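As an aside on these config keys: `acc_stderr,none` pairs with `acc,none` by lm-eval's metric-naming convention, so the test in tests/lmeval/test_lmeval.py derives the stderr key from its metric key with a simple string substitution, mirrored here:

```python
# Derive the stderr key from a metric key, as the updated test does.
metric_key = "acc,none"
stderr_key = metric_key.replace(",", "_stderr,")
print(stderr_key)  # acc_stderr,none
```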
@@ -1,19 +1,20 @@
 cadence: "weekly"
-model: llava-hf/llava-1.5-7b-hf
-model_class: TraceableLlavaForConditionalGeneration
+model: Qwen/Qwen2.5-VL-7B-Instruct
+model_class: TraceableQwen2_5_VLForConditionalGeneration
 scheme: INT8_dyn_per_token
 recipe: tests/e2e/vLLM/recipes/INT8/recipe_int8_channel_weight_dynamic_per_token.yaml
 dataset_id: lmms-lab/flickr30k
 dataset_split: "test[:512]"
-seed: 42 #compressed model is sensitive to random seed
 lmeval:
   model: "hf-multimodal"
   model_args:
     dtype: bfloat16
     add_bos_token: True
     convert_img_format: True
-  task: mmmu_val_economics
+  task: mmmu_val_literature
   num_fewshot: 0
+  batch_size: 8
+  # dense model achieves accuracy of 0.9 +/ 0.0557
   metrics:
-    acc,none: 0.233
-    batch_size: 8
+    acc,none: 0.833
+    acc_stderr,none: 0.0557
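These configs now carry an explicit stderr so the test can use it as an absolute tolerance rather than the previous 5% relative tolerance. A minimal sketch of the difference, using the values from the config above (the `actual` score is hypothetical):

```python
import numpy

expected, stderr = 0.833, 0.0557  # values from the config above
actual = 0.88                     # hypothetical measured accuracy

# numpy.isclose checks |a - b| <= atol + rtol * |b|, scaling rtol by the
# second argument, so the relative band here is 0.05 * 0.88 = 0.044.
print(numpy.isclose(expected, actual, rtol=0.05))    # |0.833 - 0.88| = 0.047 > 0.044 -> False
print(numpy.isclose(expected, actual, atol=stderr))  # 0.047 <= 0.0557 -> True
```

With these numbers, the stderr-based band is wider than the 5% relative band, so a score within one stderr of the expected value passes even when the old relative check would not.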
@@ -1,19 +1,20 @@
 cadence: "weekly"
-model: Qwen/Qwen2-VL-2B-Instruct
-model_class: TraceableQwen2VLForConditionalGeneration
+model: Qwen/Qwen2.5-VL-7B-Instruct
+model_class: TraceableQwen2_5_VLForConditionalGeneration
+scheme: W4A16_actorder_weight
 recipe: tests/e2e/vLLM/recipes/actorder/recipe_w4a16_actorder_weight.yaml
 dataset_id: lmms-lab/flickr30k
 dataset_split: "test[:512]"
-scheme: W4A16_actorder_group
-seed: 42 #compressed model is sensitive to random seed
 lmeval:
   model: "hf-multimodal"
   model_args:
     dtype: bfloat16
     add_bos_token: True
     convert_img_format: True
-  task: mmmu_val_economics
+  task: mmmu_val_literature
   num_fewshot: 0
+  batch_size: 8
+  # dense model achieves accuracy of 0.9 +/ 0.0557
   metrics:
-    acc,none: 0.366
-    batch_size: 4
+    acc,none: 0.8333
+    acc_stderr,none: 0.0557

tests/lmeval/skipped_configs/vl_fp8_dynamic_per_token.yaml

Lines changed: 0 additions & 16 deletions
This file was deleted.

tests/lmeval/test_lmeval.py

Lines changed: 22 additions & 6 deletions
@@ -155,12 +155,28 @@ def _run_lm_eval(self):
         )

         metrics = results["results"][self.lmeval.task]
-        for metric, expected_val in self.lmeval.metrics.items():
-            actual_val = metrics.get(metric)
-            logger.info(
-                f"Comparing {metric}: Expected {expected_val}, Got {actual_val}"
-            )
-            assert numpy.isclose(expected_val, actual_val, rtol=0.05)
+        for metric_key, expected_val in self.lmeval.metrics.items():
+            # stderr metrics are only used as absolute tolerance
+            # checks for actual values
+            if "stderr" in metric_key:
+                continue
+            actual_val = metrics.get(metric_key)
+            # If stderr is provided, use it as absolute tolerance
+            # Otherwise, default to a 5% relative tolerance
+            stderr_key = metric_key.replace(",", "_stderr,")
+            std_err = self.lmeval.metrics.get(stderr_key)
+            if std_err is None:
+                logger.info(
+                    f"Comparing {metric_key}: Expected {expected_val} "
+                    f"±5%, Got {actual_val}"
+                )
+                assert numpy.isclose(expected_val, actual_val, rtol=0.05)
+            else:
+                logger.info(
+                    f"Comparing {metric_key}: Expected {expected_val} "
+                    f"±{std_err*100}%, Got {actual_val}"
+                )
+                assert numpy.isclose(expected_val, actual_val, atol=std_err)

     def tear_down(self):
         timer = get_singleton_manager()
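The tolerance logic in the diff above can be exercised on its own; here is a minimal standalone sketch (the helper name `check_metric` is mine, not part of the test suite):

```python
import numpy

def check_metric(metrics_cfg, actual_metrics, metric_key):
    """Compare one expected metric against the measured value.

    If the config supplies a matching ``*_stderr`` entry, use it as an
    absolute tolerance; otherwise fall back to a 5% relative tolerance.
    """
    expected_val = metrics_cfg[metric_key]
    actual_val = actual_metrics[metric_key]
    # "acc,none" -> "acc_stderr,none", matching lm-eval's key naming
    stderr_key = metric_key.replace(",", "_stderr,")
    std_err = metrics_cfg.get(stderr_key)
    if std_err is None:
        return bool(numpy.isclose(expected_val, actual_val, rtol=0.05))
    return bool(numpy.isclose(expected_val, actual_val, atol=std_err))

# Within one stderr of the expected value passes; outside it fails.
cfg = {"acc,none": 0.8667, "acc_stderr,none": 0.0557}
print(check_metric(cfg, {"acc,none": 0.83}, "acc,none"))  # True
print(check_metric(cfg, {"acc,none": 0.75}, "acc,none"))  # False
```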
