Merge branch 'hz-fix-example-quantization-2of4' of https://github.com/vllm-project/llm-compressor into hz-fix-example-quantization-2of4

shanjiaz · shanjiaz · commit dfaaceaabe4d · 2025-06-18T15:03:19.000-04:00
diff --git a/src/llmcompressor/transformers/finetune/session_mixin.py b/src/llmcompressor/transformers/finetune/session_mixin.py
@@ -270,7 +270,7 @@ def compute_loss(
         model: Module,
         inputs: Dict[str, Any],
         return_outputs: bool = False,
-        num_items_in_batch: Optional[int] = None,
+        num_items_in_batch: Optional[torch.Tensor] = None,
     ) -> Union[torch.Tensor, Tuple[torch.Tensor, Any]]:
         """
         Override for the compute_loss to factor trigger callbacks and filter columns
@@ -279,6 +279,7 @@ def compute_loss(
         :param inputs: the inputs to pass through the model for calculating the loss
         :param return_outputs: True to return the outputs with the loss,
             False otherwise
+        :param num_items_in_batch: the number of items which contribute to loss
         :return: the resulting loss if not return_outputs, otherwise a tuple
             containing the loss and the model's outputs
         """
diff --git a/src/llmcompressor/utils/dev.py b/src/llmcompressor/utils/dev.py
@@ -126,11 +126,17 @@ def dispatch_for_generation(model: PreTrainedModel) -> PreTrainedModel:
     """
     remove_dispatch(model)
 
+    no_split_module_classes = model._get_no_split_modules("auto")
     max_memory = get_balanced_memory(
         model,
         dtype=model.dtype,
-        no_split_module_classes=model._get_no_split_modules("auto"),
+        no_split_module_classes=no_split_module_classes,
+    )
+    device_map = infer_auto_device_map(
+        model,
+        dtype=model.dtype,
+        max_memory=max_memory,
+        no_split_module_classes=no_split_module_classes,
     )
-    device_map = infer_auto_device_map(model, dtype=model.dtype, max_memory=max_memory)
 
     return dispatch_model(model, device_map=device_map)
diff --git a/tests/llmcompressor/transformers/finetune/test_oneshot_and_finetune_with_tokenizer.py b/tests/llmcompressor/transformers/finetune/test_oneshot_and_finetune_with_tokenizer.py
@@ -36,7 +36,7 @@ def test_oneshot_and_finetune_with_tokenizer(self):
             self.model,
         )
         model_loaded = AutoModelForCausalLM.from_pretrained(
-            self.model, device_map="cuda:0", torch_dtype="auto"
+            self.model, torch_dtype="auto"
         )
 
         dataset_loaded = load_dataset(

Original file line number	Diff line number	Diff line change
`@@ -36,7 +36,7 @@ def test_oneshot_and_finetune_with_tokenizer(self):`
`36`	`36`	`self.model,`
`37`	`37`	`)`
`38`	`38`	`model_loaded = AutoModelForCausalLM.from_pretrained(`
`39`		`- self.model, device_map="cuda:0", torch_dtype="auto"`
	`39`	`+ self.model, torch_dtype="auto"`
`40`	`40`	`)`
`41`	`41`
`42`	`42`	`dataset_loaded = load_dataset(`