Skip to content

Commit f1b2aec

Browse files
btbtyler09 authored and claude committed
Fix GPTQ ROCm type conversion bug causing gibberish output
- Fix double type conversion bug in q_gemm.cu affecting all GPTQ models with tensor parallelism on ROCm
- Move half2 res2 declaration inside loop with proper zero initialization
- Remove problematic __half_as_ushort/__ushort_as_half conversions
- Fix false Triton flash attention warning for models with sliding window when VLLM_USE_TRITON_FLASH_ATTN=0
- Changes match upstream PR vllm-project#17583

This fixes silent data corruption that was causing GPTQ models to produce gibberish output on ROCm with tensor parallelism.

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <[email protected]>
1 parent fa6497e commit f1b2aec

File tree

2 files changed

+12
-24
lines changed

2 files changed

+12
-24
lines changed

csrc/quantization/gptq/q_gemm.cu

Lines changed: 2 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -1223,7 +1223,6 @@ __global__ void gemm_half_q_half_alt_4bit_kernel(
12231223
int k = 0;
12241224
int z_w = w / 8;
12251225
int z_mod = (w % 8) * 4;
1226-
half2 res2;
12271226
half res[BLOCK_M_SIZE_MAX] = {};
12281227

12291228
unsigned int tmp;
@@ -1248,12 +1247,7 @@ __global__ void gemm_half_q_half_alt_4bit_kernel(
12481247
zeros_tmp[tmp_k] = zero;
12491248
}
12501249
for (int m = 0; m < b_end; m++) {
1251-
#ifndef USE_ROCM
1252-
res2 = {};
1253-
#else
1254-
res2.x = __half_as_ushort(__float2half(0));
1255-
res2.y = __half_as_ushort(__float2half(0));
1256-
#endif
1250+
half2 res2{};
12571251
res2 = __hfma2(
12581252
__hfma2(deq2[(tmp >> 0) & 0xff][off], scales_tmp[0], zeros_tmp[0]),
12591253
blockvec[m][k + 0], res2);
@@ -1266,12 +1260,7 @@ __global__ void gemm_half_q_half_alt_4bit_kernel(
12661260
res2 = __hfma2(
12671261
__hfma2(deq2[(tmp >> 24) & 0xff][off], scales_tmp[3], zeros_tmp[3]),
12681262
blockvec[m][k + 3], res2);
1269-
#ifndef USE_ROCM
12701263
res[m] = __hadd(res[m], __hadd(res2.x, res2.y));
1271-
#else
1272-
res[m] = __hadd(
1273-
res[m], __hadd(__ushort_as_half(res2.x), __ushort_as_half(res2.y)));
1274-
#endif
12751264
}
12761265
i += width;
12771266
k += 4;
@@ -1314,7 +1303,6 @@ __global__ void gemm_half_q_half_alt_8bit_kernel(
13141303
int k = 0;
13151304
int z_w = w / 4;
13161305
int z_mod = (w % 4) * 8;
1317-
half2 res2;
13181306
half res[BLOCK_M_SIZE_MAX] = {};
13191307

13201308
unsigned int tmp;
@@ -1339,12 +1327,7 @@ __global__ void gemm_half_q_half_alt_8bit_kernel(
13391327
zeros_tmp[tmp_k] = zero;
13401328
}
13411329
for (int m = 0; m < b_end; m++) {
1342-
#ifndef USE_ROCM
1343-
res2 = {};
1344-
#else
1345-
res2.x = __half_as_ushort(__float2half(0));
1346-
res2.y = __half_as_ushort(__float2half(0));
1347-
#endif
1330+
half2 res2{};
13481331
half2 v12 = __halves2half2(__int2half_rn(tmp & 0xFF),
13491332
__int2half_rn((tmp >> 8) & 0xFF));
13501333
res2 = __hfma2(__hfma2(v12, scales_tmp[0], zeros_tmp[0]),
@@ -1353,12 +1336,7 @@ __global__ void gemm_half_q_half_alt_8bit_kernel(
13531336
__int2half_rn((tmp >> 24) & 0xFF));
13541337
res2 = __hfma2(__hfma2(v34, scales_tmp[1], zeros_tmp[1]),
13551338
blockvec[m][k + 1], res2);
1356-
#ifndef USE_ROCM
13571339
res[m] = __hadd(res[m], __hadd(res2.x, res2.y));
1358-
#else
1359-
res[m] = __hadd(
1360-
res[m], __hadd(__ushort_as_half(res2.x), __ushort_as_half(res2.y)));
1361-
#endif
13621340
}
13631341
i += width;
13641342
k += 2;

vllm/platforms/rocm.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,12 @@ def on_mi3xx() -> bool:
111111
return any(arch in GPU_ARCH for arch in ["gfx942", "gfx950"])
112112

113113

114+
@cache
115+
def on_mi100() -> bool:
116+
GPU_ARCH = torch.cuda.get_device_properties("cuda").gcnArchName
117+
return any(arch in GPU_ARCH for arch in ["gfx900", "gfx902", "gfx906", "gfx908"])
118+
119+
114120
@cache
115121
def on_gfx9() -> bool:
116122
GPU_ARCH = torch.cuda.get_device_properties("cuda").gcnArchName
@@ -328,6 +334,10 @@ def verify_model_arch(cls, model_arch: str) -> None:
328334

329335
if model_arch in _ROCM_PARTIALLY_SUPPORTED_MODELS:
330336
msg = _ROCM_PARTIALLY_SUPPORTED_MODELS[model_arch]
337+
# Only show Triton-related warnings if Triton is actually being used
338+
if "Triton flash attention" in msg and not envs.VLLM_USE_TRITON_FLASH_ATTN:
339+
# Skip warning since Triton is not being used
340+
return
331341
logger.warning(
332342
"Model architecture '%s' is partially "
333343
"supported by ROCm: %s", model_arch, msg)

0 commit comments

Comments
 (0)