NVIDIA-NeMo · chtruong814 · Oct 17, 2025 · Oct 17, 2025
@@ -78,6 +78,8 @@ def __init__(
         tokenizer_truncation=True,
         tokenizer_padding_side="left",
         task: Optional[str] = "text-generation",
+        torch_dtype: Optional[torch.dtype] = "auto",
+        device_map: Optional[str] = "auto",
         **hf_kwargs,
     ):
         if not HAVE_TRITON:
@@ -109,7 +111,9 @@ def __init__(
         if model is None:
             self._load(**hf_kwargs)
 
-    def _load(self, **hf_kwargs) -> None:
+    def _load(
+        self, torch_dtype: Optional[torch.dtype] = "auto", device_map: Optional[str] = "auto", **hf_kwargs
+    ) -> None:
         """Load the HuggingFace pipeline with the specified model and task.
 
         This method initializes the HuggingFace AutoModel classes using the provided model
@@ -122,7 +126,9 @@ def _load(self, **hf_kwargs) -> None:
         assert self.task is not None, "A task has to be given for the generation task."
 
         if self.task == "text-generation":
-            self.model = AutoModelForCausalLM.from_pretrained(self.hf_model_id_path, **hf_kwargs)
+            self.model = AutoModelForCausalLM.from_pretrained(
+                self.hf_model_id_path, torch_dtype=torch_dtype, device_map=device_map, **hf_kwargs
+            )
 
             if self.hf_peft_model_id_path is not None:
                 self.model = PeftModel.from_pretrained(self.model, self.hf_peft_model_id_path)
@@ -131,7 +137,7 @@ def _load(self, **hf_kwargs) -> None:
         num_gpus = torch.cuda.device_count()
         # If no device_map is given and at least one GPU is available, move the model to GPU (unless it is already
         # there). If you are using device_map as "auto" or "balanced", the model will be moved to GPU automatically.
-        if num_gpus == 1:
+        if device_map == None and num_gpus >= 1 and self.model.device.type != "cuda":
             self.model.cuda()
         self.tokenizer = AutoTokenizer.from_pretrained(
             self.tokenizer_id_path,

@@ -62,9 +62,11 @@ def __init__(
         task: str = "text-generation",
         trust_remote_code: bool = True,
         model_id: str = "nemo-model",
-        device_map: Optional[str] = None,
+        device_map: Optional[str] = "auto",
+        torch_dtype: Optional[torch.dtype] = "auto",
         max_memory: Optional[str] = None,
         use_vllm_backend: bool = False,
+        **kwargs,
     ):
         """Initialize the HuggingFace model deployment.
 
@@ -73,11 +75,12 @@ def __init__(
             task (str): HuggingFace task type. Defaults to "text-generation".
             trust_remote_code (bool): Whether to trust remote code. Defaults to True.
             device_map (str): Device mapping strategy. Defaults to "auto".
+            torch_dtype (torch.dtype | str): Torch dtype for the model, or the string "auto". Defaults to "auto".
             model_id (str): Model identifier. Defaults to "nemo-model".
             max_memory (str): Maximum memory allocation when using balanced device map.
             use_vllm_backend (bool, optional): Whether to use vLLM backend for deployment. If True, exports the HF ckpt
             to vLLM format and uses vLLM backend for inference. Defaults to False.
-
+            **kwargs: Additional keyword arguments to pass to HuggingFace model loading or vLLM exporter.
         Raises:
             ImportError: If Ray is not installed.
             Exception: If model initialization fails.
@@ -96,15 +99,17 @@ def __init__(
                 from nemo_export.vllm_exporter import vLLMExporter
 
                 vllm_exporter = vLLMExporter()
-                vllm_exporter.export(model_path_id=hf_model_id_path)
+                vllm_exporter.export(model_path_id=hf_model_id_path, **kwargs)
                 self.model = vllm_exporter
             else:
                 self.model = HuggingFaceLLMDeploy(
                     hf_model_id_path=hf_model_id_path,
                     task=task,
                     trust_remote_code=trust_remote_code,
                     device_map=device_map,
+                    torch_dtype=torch_dtype,
                     max_memory=max_memory_dict,
+                    **kwargs,
                 )
             self.model_id = model_id
 

@@ -80,10 +80,19 @@ def get_args(argv):
         "--device_map",
         nargs="?",
         choices=["auto", "balanced", "balanced_low_0", "sequential"],
-        default=None,
+        default="auto",
         type=str,
         help="Device mapping strategy for model placement (e.g. 'auto', 'sequential', etc)",
     )
+    parser.add_argument(
+        "-td",
+        "--torch_dtype",
+        nargs="?",
+        choices=["auto", "bfloat16", "float16", "float32"],
+        default="auto",
+        type=str,
+        help="Torch dtype for the model",
+    )
     parser.add_argument(
         "-tpp",
         "--tp_plan",
@@ -196,6 +205,7 @@ def hf_deploy(argv):
         task=args.task,
         trust_remote_code=args.trust_remote_code,
         device_map=args.device_map,
+        torch_dtype=args.torch_dtype,
         tp_plan=args.tp_plan,
     )
 

@@ -43,9 +43,15 @@ def parse_args():
     parser.add_argument(
         "--device_map",
         type=str,
-        default=None,
+        default="auto",
         help="Device mapping strategy for model placement",
     )
+    parser.add_argument(
+        "--torch_dtype",
+        type=str,
+        default="auto",
+        help="Torch dtype for the model",
+    )
     parser.add_argument(
         "--max_memory",
         type=str,