Skip to content

Commit 00bfb89

Browse files
committed
use envs
Signed-off-by: Dipika Sikka <[email protected]>
1 parent 095b4c8 commit 00bfb89

File tree

3 files changed

+12
-7
lines changed

3 files changed

+12
-7
lines changed

vllm/envs.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -133,6 +133,7 @@
133133
VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS: int = 300
134134
VLLM_KV_CACHE_LAYOUT: Optional[str] = None
135135
VLLM_COMPUTE_NANS_IN_LOGITS: bool = False
136+
VLLM_USE_NVFP4_CT_EMULATIONS: bool = False
136137

137138

138139
def get_default_cache_root():
@@ -918,6 +919,12 @@ def get_vllm_port() -> Optional[int]:
918919
# or bad hardware but it may add compute overhead.
919920
"VLLM_COMPUTE_NANS_IN_LOGITS":
920921
lambda: bool(int(os.getenv("VLLM_COMPUTE_NANS_IN_LOGITS", "0"))),
922+
923+
# Controls whether or not emulations are used for NVFP4
# generations on machines with compute capability < 100
# (i.e. pre-Blackwell) for compressed-tensors models
926+
"VLLM_USE_NVFP4_CT_EMULATIONS":
927+
lambda: bool(int(os.getenv("VLLM_USE_NVFP4_CT_EMULATIONS", "0")))
921928
}
922929

923930
# --8<-- [end:env-vars-definition]

vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
# SPDX-License-Identifier: Apache-2.0
22
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
33

4-
import os
54
from contextlib import suppress
65
from typing import Any, Literal, Optional, cast
76

@@ -14,6 +13,7 @@
1413
QuantizationType)
1514
from pydantic import BaseModel
1615

16+
import vllm.envs as envs
1717
from vllm.logger import init_logger
1818
from vllm.model_executor.layers.fused_moe import FusedMoE
1919
from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase,
@@ -376,7 +376,7 @@ def _get_scheme_from_parts(
376376
if is_activation_quantization_format(self.quant_format):
377377
if self._is_fp4a4_nvfp4(weight_quant, input_quant):
378378
if CompressedTensorsW4A4Fp4.cutlass_fp4_supported(
379-
) or os.environ.get("USE_NVFP4_CT_EMULATIONS", "0") == "1":
379+
) or envs.VLLM_USE_NVFP4_CT_EMULATIONS:
380380
return CompressedTensorsW4A4Fp4()
381381
else:
382382
logger.warning_once(

vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
11
# SPDX-License-Identifier: Apache-2.0
2-
import os
32
from typing import Callable, Optional
43

54
import torch
65
from torch.nn.parameter import Parameter
76

7+
import vllm.envs as envs
88
from vllm._custom_ops import (cutlass_scaled_fp4_mm,
99
cutlass_scaled_mm_supports_fp4, scaled_fp4_quant)
1010
from vllm.logger import init_logger
@@ -21,8 +21,6 @@
2121

2222
__all__ = ["CompressedTensorsW4A4Fp4"]
2323

24-
USE_NVFP4_CT_EMULATIONS = os.environ.get("USE_NVFP4_CT_EMULATIONS", '0')
25-
2624

2725
class CompressedTensorsW4A4Fp4(CompressedTensorsScheme):
2826

@@ -31,7 +29,7 @@ def __init__(self):
3129

3230
@classmethod
3331
def get_min_capability(cls) -> int:
34-
if USE_NVFP4_CT_EMULATIONS == "1":
32+
if envs.VLLM_USE_NVFP4_CT_EMULATIONS:
3533
return 80
3634
return 100
3735

@@ -136,7 +134,7 @@ def apply_weights(self,
136134
x: torch.Tensor,
137135
bias: Optional[torch.Tensor] = None) -> torch.Tensor:
138136

139-
if USE_NVFP4_CT_EMULATIONS == "1":
137+
if envs.VLLM_USE_NVFP4_CT_EMULATIONS:
140138
out = run_nvfp4_emulations(
141139
x=x,
142140
input_global_scale=layer.input_global_scale,

0 commit comments

Comments (0)