[Tracing][Testing] Add tracing tests (#1335)

kylesayrs · web-flow · commit ffa570c6a12d · 2025-05-05T20:15:44.000Z
## Purpose ## * Add regression testing to model tracing beyond example tests * These tests complete in ~1 min and can be run at a quicker cadence than example tests * These can also be used to test tracing capabilities beyond those in the examples, for example tracing into linear layers ## Prerequisites ## * #1334 * #1402 ## Changes ## * Fix function signature of peoples speech dataset * Add `trust_remote_code` argument to debugger * Add `tests/llmcompressor/transformers/tracing/test_models.py` * I did not include phi3 because it's a very difficult model to work with programmatically. I will revisit once the major tracing improvements have landed --------- Signed-off-by: Kyle Sayers <kylesayrs@gmail.com>
diff --git a/.github/workflows/test-check-transformers.yaml b/.github/workflows/test-check-transformers.yaml
@@ -8,6 +8,7 @@ on:
 
 env:
   CADENCE: "commit"
+  HF_TOKEN: ${{ secrets.HF_TOKEN_READ }}
 
 jobs:
   detect-changes:
@@ -95,6 +96,10 @@ jobs:
         if: (success() || failure()) && steps.install.outcome == 'success'
         run: |
           pytest -v tests/llmcompressor/transformers/obcq
+      - name: Running Tracing Tests
+        if: (success() || failure()) && steps.install.outcome == 'success'
+        run: |
+          pytest -v tests/llmcompressor/transformers/tracing
       - name: Running KV Cache Tests
         if: (success() || failure()) && steps.install.outcome == 'success'
         run: |
diff --git a/src/llmcompressor/transformers/finetune/data/peoples_speech.py b/src/llmcompressor/transformers/finetune/data/peoples_speech.py
@@ -26,20 +26,20 @@ class PeoplesSpeech(TextGenerationDataset):
     :param processor: processor or tokenizer to use on dataset
     """
 
-    def __init__(self, data_args: "DataArgs", split: str, processor: Processor):
-        data_args = deepcopy(data_args)
-        data_args.dataset = "MLCommons/peoples_speech"
-        data_args.dataset_config_name = "test"
-        if not data_args.overwrite_cache:
+    def __init__(self, dataset_args: "DataArgs", split: str, processor: Processor):
+        dataset_args = deepcopy(dataset_args)  # don't mutate the caller's args
+        dataset_args.dataset = "MLCommons/peoples_speech"
+        dataset_args.dataset_config_name = "test"
+        if not dataset_args.overwrite_cache:
             logger.warning(
                 "Because audio processors are more complex, dataset mapping functions "
                 "vary with model architecture and their results cannot be cached. "
                 "Setting overwrite_cache=True"
             )
-            data_args.overwrite_cache = True
+            dataset_args.overwrite_cache = True
         self.processor_type = processor.__class__.__name__
 
-        super().__init__(data_args=data_args, split=split, processor=processor)
+        super().__init__(dataset_args=dataset_args, split=split, processor=processor)
 
     def dataset_template(self, example):
         audio = example["audio"]["array"]
diff --git a/src/llmcompressor/transformers/tracing/debug.py b/src/llmcompressor/transformers/tracing/debug.py
@@ -12,6 +12,8 @@
 from llmcompressor.transformers import TextGenerationDataset
 from llmcompressor.args import DatasetArguments
 
+from llmcompressor.utils.dev import skip_weights_download
+
 __all__ = [
     "get_model_class"
 ]
@@ -24,6 +26,7 @@ def parse_args():
     parser.add_argument("--sequential_targets", type=str, nargs="*", default=None, metavar="TARGET", help="List of targets for sequential tracing")  # noqa: E501
     parser.add_argument("--ignore", type=str, nargs="*", default=[], metavar="PATTERN", help="List of patterns to ignore during tracing")  # noqa: E501
     parser.add_argument("--modality", type=str, default="text", help="Modality of calibration dataset, defaults to text")  # noqa: E501
+    parser.add_argument("--trust_remote_code", type=lambda v: v.strip().lower() in ("1", "true", "yes"), nargs="?", const=True, default=False, help="Whether to trust model remote code")  # noqa: E501
     return parser.parse_args()
 
 
@@ -33,6 +36,7 @@ def trace(
     sequential_targets: Optional[Union[List[str], str]] = None,
     ignore: Union[List[str], str] = [],
     modality: str = "text",
+    trust_remote_code: bool = True
 ):
     """
     Debug traceability by tracing a pre-trained model into subgraphs
@@ -44,6 +48,7 @@ def trace(
         inference
     :param ignore: patterns to ignore during tracing
     :param modality: data modality for dummy tracing data, defaults to 'text'
+    :param trust_remote_code: trust remote model code
 
     Example usage from CLI
     llmcompressor.trace \
@@ -54,12 +59,16 @@ def trace(
         --modality text
     """
     # Load model
-    model = model_class.from_pretrained(
-        model_id,
-        device_map="auto",
-        torch_dtype="auto",
+    with skip_weights_download(model_class):
+        model = model_class.from_pretrained(
+            model_id,
+            device_map="cpu",
+            torch_dtype="auto",
+            trust_remote_code=trust_remote_code,
+        )
+    processor = AutoProcessor.from_pretrained(
+        model_id, trust_remote_code=trust_remote_code
     )
-    processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
     print("Loaded model")
 
     # Prepare sample data
@@ -138,6 +147,7 @@ def main():
         sequential_targets=args.sequential_targets,
         ignore=args.ignore,
         modality=args.modality,
+        trust_remote_code=args.trust_remote_code
     )
 
 
diff --git a/src/llmcompressor/transformers/tracing/idefics3.py b/src/llmcompressor/transformers/tracing/idefics3.py
@@ -285,7 +285,7 @@ def __init__(self, config: Idefics3Config):
 
     def forward(
         self,
-        input_ids: torch.LongTensor = None,
+        input_ids: Optional[torch.LongTensor] = None,
         attention_mask: Optional[torch.Tensor] = None,
         position_ids: Optional[torch.LongTensor] = None,
         past_key_values: Optional[List[torch.FloatTensor]] = None,
@@ -296,6 +296,7 @@ def forward(
         use_cache: Optional[bool] = None,
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
         return_dict: Optional[bool] = None,
     ) -> Union[Tuple, Idefics3BaseModelOutputWithPast]:
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
@@ -394,6 +395,7 @@ def forward(
             use_cache=use_cache,
             output_attentions=output_attentions,
             output_hidden_states=output_hidden_states,
+            cache_position=cache_position,
             return_dict=return_dict,
         )
 
diff --git a/tests/llmcompressor/transformers/tracing/test_models.py b/tests/llmcompressor/transformers/tracing/test_models.py
@@ -0,0 +1,106 @@
+import pytest
+from transformers import AutoModelForCausalLM, WhisperForConditionalGeneration
+
+from llmcompressor.transformers.tracing import (
+    TraceableIdefics3ForConditionalGeneration,
+    TraceableLlavaForConditionalGeneration,
+    TraceableMllamaForConditionalGeneration,
+    TraceableQwen2_5_VLForConditionalGeneration,
+    TraceableQwen2VLForConditionalGeneration,
+)
+from llmcompressor.transformers.tracing.debug import trace
+
+
+@pytest.mark.parametrize(
+    "model_id,model_class,targets",
+    [
+        ("meta-llama/Meta-Llama-3-8B-Instruct", AutoModelForCausalLM, None),
+    ],
+)
+def test_text_trace(model_id, model_class, targets):
+    trace(  # smoke test: no assertions, passes if trace() does not raise
+        model_id,
+        model_class,
+        targets,
+        ignore=[],
+        modality="text",
+        trust_remote_code=True,
+    )
+
+
+@pytest.mark.parametrize(
+    "model_id,model_class,targets,ignore",
+    [
+        (
+            "HuggingFaceM4/Idefics3-8B-Llama3",
+            TraceableIdefics3ForConditionalGeneration,
+            ["LlamaDecoderLayer"],
+            [],
+        ),
+        (
+            "llava-hf/llava-1.5-7b-hf",
+            TraceableLlavaForConditionalGeneration,
+            ["LlamaDecoderLayer"],
+            [],
+        ),
+        (
+            "meta-llama/Llama-3.2-11B-Vision-Instruct",
+            TraceableMllamaForConditionalGeneration,
+            ["MllamaSelfAttentionDecoderLayer"],
+            [],
+        ),
+        # phi3_v is skipped: its processor requires model-specific handling code
+        (
+            "mgoin/pixtral-12b",
+            TraceableLlavaForConditionalGeneration,
+            ["MistralDecoderLayer"],
+            [],
+        ),
+        (
+            "Qwen/Qwen2.5-VL-7B-Instruct",
+            TraceableQwen2_5_VLForConditionalGeneration,
+            ["Qwen2_5_VLDecoderLayer"],
+            [],
+        ),
+        (
+            "Qwen/Qwen2-VL-2B-Instruct",
+            TraceableQwen2VLForConditionalGeneration,
+            ["Qwen2VLDecoderLayer"],
+            [],
+        ),
+    ],
+)
+def test_vision_trace(model_id, model_class, targets, ignore):
+    trace(  # smoke test: no assertions, passes if trace() does not raise
+        model_id,
+        model_class,
+        targets,
+        ignore=ignore,
+        modality="vision",
+        trust_remote_code=True,
+    )
+
+
+@pytest.mark.parametrize(
+    "model_id,model_class,targets,ignore",
+    [
+        (
+            "openai/whisper-large-v3",
+            WhisperForConditionalGeneration,
+            None,
+            [],
+        ),
+    ],
+)
+def test_audio_trace(model_id, model_class, targets, ignore):
+    pytest.importorskip("librosa")  # skip (rather than fail) if not installed
+    pytest.importorskip("soundfile")  # skip (rather than fail) if not installed
+
+    trace(  # smoke test: no assertions, passes if trace() does not raise
+        model_id,
+        model_class,
+        targets,
+        ignore=ignore,
+        modality="audio",
+        trust_remote_code=True,
+    )