
Commit 475a2ef

Merge pull request #1 from ROCm/greg/fp8_tests
Greg/fp8 tests
2 parents 644b165 + 926e2b8 commit 475a2ef

11 files changed: +117 -35 lines changed

.gitignore

Lines changed: 1 addition & 0 deletions

@@ -181,6 +181,7 @@ _build/
 # hip files generated by PyTorch
 *.hip
 *_hip*
+hip_compat.h
 
 # Benchmark dataset
 *.json

csrc/attention/attention_dtypes.h

Lines changed: 1 addition & 1 deletion

@@ -4,4 +4,4 @@
 #include "dtype_float16.cuh"
 #include "dtype_float32.cuh"
 #include "dtype_bfloat16.cuh"
-#include "dtype_fp8_e5m2.cuh"
+#include "dtype_fp8.cuh"

csrc/attention/dtype_fp8_e5m2.cuh renamed to csrc/attention/dtype_fp8.cuh

Lines changed: 1 addition & 1 deletion

@@ -8,7 +8,7 @@
 #endif
 
 namespace vllm {
-#ifdef ENABLE_FP8_E5M2
+#if defined(ENABLE_FP8_E5M2) || defined(ENABLE_FP8_E4M3)
 // fp8 vector types for quantization of kv cache
 
 template<>

csrc/cache.h

Lines changed: 1 addition & 1 deletion

@@ -31,6 +31,6 @@ void gather_cached_kv(
   torch::Tensor& slot_mapping);
 
 // Just for unittest
-void convert_fp8_e5m2(
+void convert_fp8(
   torch::Tensor& src_cache,
   torch::Tensor& dst_cache);

csrc/cache_kernels.cu

Lines changed: 35 additions & 15 deletions

@@ -4,8 +4,10 @@
 
 #include "cuda_compat.h"
 #include "dispatch_utils.h"
-#ifdef ENABLE_FP8_E5M2
+#if defined(ENABLE_FP8_E5M2)
 #include "quantization/fp8_e5m2_kvcache/quant_utils.cuh"
+#elif defined(ENABLE_FP8_E4M3)
+#include "quantization/fp8/amd_detail/quant_utils.cuh"
 #endif
 
 #include <algorithm>
@@ -196,9 +198,12 @@ __global__ void reshape_and_cache_kernel(
     scalar_t tgt_key = key[src_key_idx];
     scalar_t tgt_value = value[src_value_idx];
     if constexpr (is_fp8_e5m2_kv_cache) {
-#ifdef ENABLE_FP8_E5M2
+#if defined(ENABLE_FP8_E5M2)
       key_cache[tgt_key_idx] = fp8_e5m2_unscaled::vec_conversion<uint8_t, scalar_t>(tgt_key);
       value_cache[tgt_value_idx] = fp8_e5m2_unscaled::vec_conversion<uint8_t, scalar_t>(tgt_value);
+#elif defined(ENABLE_FP8_E4M3)
+      key_cache[tgt_key_idx] = fp8_e4m3::vec_conversion<uint8_t, scalar_t>(tgt_key);
+      value_cache[tgt_value_idx] = fp8_e4m3::vec_conversion<uint8_t, scalar_t>(tgt_value);
 #else
       assert(false);
 #endif
@@ -431,15 +436,17 @@ void gather_cached_kv(
 namespace vllm {
 
 template<typename Tout, typename Tin>
-__global__ void convert_fp8_e5m2_kernel(
+__global__ void convert_fp8_kernel(
   const Tin* __restrict__ src_cache,
   Tout* __restrict__ dst_cache,
   const int64_t block_stride) {
   const int64_t block_idx = blockIdx.x;
   for (int i = threadIdx.x; i < block_stride; i += blockDim.x) {
     int64_t idx = block_idx * block_stride + i;
-#ifdef ENABLE_FP8_E5M2
+#if defined(ENABLE_FP8_E5M2)
     dst_cache[idx] = fp8_e5m2_unscaled::vec_conversion<Tout, Tin>(src_cache[idx]);
+#elif defined(ENABLE_FP8_E4M3)
+    dst_cache[idx] = fp8_e4m3::vec_conversion<Tout, Tin>(src_cache[idx]);
 #else
     assert(false);
 #endif
@@ -448,16 +455,29 @@ __global__ void convert_fp8_e5m2_kernel(
 
 } // namespace vllm
 
-#define CALL_CONVERT_FP8_E5M2(Tout, Tin) \
-  vllm::convert_fp8_e5m2_kernel<Tout, Tin><<<grid, block, 0, stream>>>( \
-    reinterpret_cast<Tin*>(src_cache.data_ptr()), \
-    reinterpret_cast<Tout*>(dst_cache.data_ptr()), \
+#define CALL_CONVERT_FP8(Tout, Tin) \
+  vllm::convert_fp8_kernel<Tout, Tin><<<grid, block, 0, stream>>>( \
+    reinterpret_cast<Tin*>(src_cache.data_ptr()), \
+    reinterpret_cast<Tout*>(dst_cache.data_ptr()), \
     block_stride);
 
-void convert_fp8_e5m2(
+void convert_fp8(
   torch::Tensor& src_cache,
   torch::Tensor& dst_cache)
 {
+  torch::Device src_device = src_cache.device();
+  torch::Device dst_device = dst_cache.device();
+  if (src_device.is_cuda() && dst_device.is_cuda()) {
+    TORCH_CHECK(
+      src_device.index() == dst_device.index(),
+      "src and dst must be on the same GPU");
+  }
+  at::cuda::OptionalCUDAGuard device_guard;
+  if (src_device.is_cuda()) {
+    device_guard.set_device(src_device);
+  } else if (dst_device.is_cuda()) {
+    device_guard.set_device(dst_device);
+  }
   int64_t num_blocks = src_cache.size(0);
   int64_t block_stride = src_cache.stride(0);
 
@@ -466,16 +486,16 @@ void convert_fp8_e5m2(
   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
 
   if (src_cache.dtype() == at::ScalarType::Float) {
-    CALL_CONVERT_FP8_E5M2(uint8_t, float);
+    CALL_CONVERT_FP8(uint8_t, float);
   } else if (src_cache.dtype() == at::ScalarType::Half) {
-    CALL_CONVERT_FP8_E5M2(uint8_t, uint16_t);
+    CALL_CONVERT_FP8(uint8_t, uint16_t);
   } else if (src_cache.dtype() == at::ScalarType::BFloat16) {
-    CALL_CONVERT_FP8_E5M2(uint8_t, __nv_bfloat16);
+    CALL_CONVERT_FP8(uint8_t, __nv_bfloat16);
   } else if (dst_cache.dtype() == at::ScalarType::Float) {
-    CALL_CONVERT_FP8_E5M2(float, uint8_t);
+    CALL_CONVERT_FP8(float, uint8_t);
   } else if (dst_cache.dtype() == at::ScalarType::Half) {
-    CALL_CONVERT_FP8_E5M2(uint16_t, uint8_t);
+    CALL_CONVERT_FP8(uint16_t, uint8_t);
   } else if (dst_cache.dtype() == at::ScalarType::BFloat16) {
-    CALL_CONVERT_FP8_E5M2(__nv_bfloat16, uint8_t);
+    CALL_CONVERT_FP8(__nv_bfloat16, uint8_t);
   }
 }

csrc/pybind.cpp

Lines changed: 2 additions & 2 deletions

@@ -80,8 +80,8 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
     &gather_cached_kv,
     "Gather key and value from the cache into contiguous QKV tensors");
   cache_ops.def(
-    "convert_fp8_e5m2",
-    &convert_fp8_e5m2,
+    "convert_fp8",
+    &convert_fp8,
     "Convert the key and value cache to fp8_e5m2 data type");
 
   // Cuda utils
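
Not part of the diff, but as a usage sketch of the renamed binding: the Python-visible name changes from cache_ops.convert_fp8_e5m2 to cache_ops.convert_fp8, and the direction of the conversion is selected by the dtypes of the two tensors (see the CALL_CONVERT_FP8 dispatch above). The import path below is assumed from how the kernel tests call it and may differ between builds.

import torch
from vllm._C import cache_ops  # assumed import path for the compiled extension

x = torch.randn(16, 8, 64, 16, dtype=torch.float16, device="cuda")

# Quantize: a uint8 destination selects the scalar -> fp8 path.
x_fp8 = torch.empty_like(x, dtype=torch.uint8)
cache_ops.convert_fp8(x, x_fp8)

# Dequantize: a uint8 source selects the fp8 -> scalar path.
x_roundtrip = torch.empty_like(x)
cache_ops.convert_fp8(x_fp8, x_roundtrip)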

csrc/quantization/fp8/amd_detail/quant_utils.cuh

Lines changed: 2 additions & 1 deletion

@@ -10,7 +10,7 @@
 
 namespace vllm
 {
-
+namespace fp8_e4m3 {
 template <typename Tout, typename Tin>
 __inline__ __device__ Tout vec_conversion(const Tin& x)
 {
@@ -290,4 +290,5 @@ __inline__ __device__ bf16_8_t vec_conversion<bf16_8_t, Float8_>(const Float8_&
   b.w = __float22bfloat162_rn(a.w);
   return b;
 }
+}
 } // namespace vllm

setup.py

Lines changed: 2 additions & 1 deletion

@@ -19,7 +19,7 @@
 
 # Supported NVIDIA GPU architectures.
 NVIDIA_SUPPORTED_ARCHS = {"7.0", "7.5", "8.0", "8.6", "8.9", "9.0"}
-ROCM_SUPPORTED_ARCHS = {"gfx90a", "gfx908", "gfx906", "gfx1030", "gfx1100"}
+ROCM_SUPPORTED_ARCHS = {"gfx90a", "gfx908", "gfx906", "gfx942", "gfx1030", "gfx1100"}
 # SUPPORTED_ARCHS = NVIDIA_SUPPORTED_ARCHS.union(ROCM_SUPPORTED_ARCHS)
 
 
@@ -296,6 +296,7 @@ def get_torch_arch_list() -> Set[str]:
                 f"Only the following arch is supported: {ROCM_SUPPORTED_ARCHS}"
                 f"amdgpu_arch_found: {arch}")
         NVCC_FLAGS += [f"--offload-arch={arch}"]
+        NVCC_FLAGS += ["-DENABLE_FP8_E4M3"]
 
 elif _is_neuron():
     neuronxcc_version = get_neuronxcc_version()
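
For context (not part of the diff), the two setup.py changes work together: gfx942 (MI300-series) is added to the accepted ROCm architectures, and ROCm builds now define ENABLE_FP8_E4M3, which routes the kv-cache code through csrc/quantization/fp8/amd_detail/quant_utils.cuh. A minimal sketch of that flag handling, with the surrounding structure assumed and only the names visible in the diff taken as given:

ROCM_SUPPORTED_ARCHS = {"gfx90a", "gfx908", "gfx906", "gfx942",
                        "gfx1030", "gfx1100"}
NVCC_FLAGS = []

def add_rocm_arch_flags(arch: str) -> None:
    # Hypothetical helper mirroring the hunk above; not a function in setup.py.
    if arch not in ROCM_SUPPORTED_ARCHS:
        raise RuntimeError(
            f"Only the following arch is supported: {ROCM_SUPPORTED_ARCHS} "
            f"amdgpu_arch_found: {arch}")
    NVCC_FLAGS.append(f"--offload-arch={arch}")
    # New in this commit: enable the E4M3 code paths for ROCm builds.
    NVCC_FLAGS.append("-DENABLE_FP8_E4M3")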

tests/kernels/test_attention.py

Lines changed: 2 additions & 2 deletions

@@ -230,14 +230,14 @@ def test_paged_attention(
         dequantized_key_cache = torch.empty(size=key_cache_shape,
                                             dtype=dtype,
                                             device=gpu_id)
-        cache_ops.convert_fp8_e5m2(key_cache, dequantized_key_cache)
+        cache_ops.convert_fp8(key_cache, dequantized_key_cache)
         key_cache = dequantized_key_cache
 
         value_cache_shape = value_cache.shape
         dequantized_value_cache = torch.empty(size=value_cache_shape,
                                               dtype=dtype,
                                               device=gpu_id)
-        cache_ops.convert_fp8_e5m2(value_cache, dequantized_value_cache)
+        cache_ops.convert_fp8(value_cache, dequantized_value_cache)
         value_cache = dequantized_value_cache
 
     ref_output = torch.empty_like(query)

tests/kernels/test_cache.py

Lines changed: 69 additions & 10 deletions

@@ -99,6 +99,7 @@ def test_copy_blocks(
 @pytest.mark.parametrize("dtype", DTYPES)
 @pytest.mark.parametrize("seed", SEEDS)
 @pytest.mark.parametrize("device", DEVICES)
+@pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE)
 @torch.inference_mode()
 def test_reshape_and_cache(
     kv_cache_factory,
@@ -110,6 +111,7 @@ def test_reshape_and_cache(
     dtype: torch.dtype,
     seed: int,
     device: int,
+    kv_cache_dtype: str,
 ) -> None:
     random.seed(seed)
     torch.random.manual_seed(seed)
@@ -130,17 +132,29 @@ def test_reshape_and_cache(
 
     # Create the KV caches.
     key_caches, value_caches = kv_cache_factory(num_blocks, block_size, 1,
-                                                num_heads, head_size, dtype,
-                                                None, seed, gpu_id)
+                                                num_heads, head_size, kv_cache_dtype,
+                                                dtype, seed, gpu_id)
     key_cache, value_cache = key_caches[0], value_caches[0]
 
     # Clone the KV caches.
-    cloned_key_cache = key_cache.clone()
-    cloned_value_cache = value_cache.clone()
+    if kv_cache_dtype == "fp8_e5m2":
+        cloned_key_cache = torch.empty_like(key_cache, dtype=torch.float16)
+        cache_ops.convert_fp8(key_cache, cloned_key_cache)
+        cloned_value_cache = torch.empty_like(value_cache, dtype=torch.float16)
+        cache_ops.convert_fp8(value_cache, cloned_value_cache)
+    else:
+        cloned_key_cache = key_cache.clone()
+        cloned_value_cache = value_cache.clone()
 
     # Call the reshape_and_cache kernel.
     cache_ops.reshape_and_cache(key, value, key_cache, value_cache,
-                                slot_mapping, "auto")
+                                slot_mapping, kv_cache_dtype)
+
+    if kv_cache_dtype == "fp8_e5m2":
+        result_key_cache = torch.empty_like(key_cache, dtype=torch.float16)
+        cache_ops.convert_fp8(key_cache, result_key_cache)
+        result_value_cache = torch.empty_like(value_cache, dtype=torch.float16)
+        cache_ops.convert_fp8(value_cache, result_value_cache)
 
     # Run the reference implementation.
     reshaped_key = key.reshape(num_tokens, *key_cache[0, :, :, 0, :].shape)
@@ -153,9 +167,13 @@ def test_reshape_and_cache(
         block_offset = block_offsets[i]
         cloned_key_cache[block_idx, :, :, block_offset, :] = reshaped_key[i]
         cloned_value_cache[block_idx, :, :, block_offset] = value[i]
-
-    assert torch.allclose(key_cache, cloned_key_cache)
-    assert torch.allclose(value_cache, cloned_value_cache)
+
+    if kv_cache_dtype == "fp8_e5m2":
+        assert torch.allclose(result_key_cache, cloned_key_cache, atol=0.001, rtol=0.1)
+        assert torch.allclose(result_value_cache, cloned_value_cache, atol=0.001, rtol=0.1)
+    else:
+        assert torch.allclose(key_cache, cloned_key_cache)
+        assert torch.allclose(value_cache, cloned_value_cache)
 
 
 @pytest.mark.parametrize("direction", COPYING_DIRECTION)
@@ -167,6 +185,7 @@ def test_reshape_and_cache(
 @pytest.mark.parametrize("dtype", DTYPES)
 @pytest.mark.parametrize("seed", SEEDS)
 @pytest.mark.parametrize("device", DEVICES)
+@pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE)
 @torch.inference_mode()
 def test_swap_blocks(
     kv_cache_factory,
@@ -179,7 +198,10 @@ def test_swap_blocks(
     dtype: torch.dtype,
     seed: int,
     device: int,
+    kv_cache_dtype: str,
 ) -> None:
+    if kv_cache_dtype == "fp8_e5m2" and "cpu" in direction:
+        return
     random.seed(seed)
     torch.random.manual_seed(seed)
     torch.cuda.manual_seed(seed)
@@ -200,12 +222,12 @@ def test_swap_blocks(
 
     # Create the KV caches on the first device.
     src_key_caches, src_value_caches = kv_cache_factory(
-        num_blocks, block_size, 1, num_heads, head_size, dtype, None, seed,
+        num_blocks, block_size, 1, num_heads, head_size, kv_cache_dtype, dtype, seed,
         src_device)
 
     # Create the KV caches on the second device.
     dist_key_caches, dist_value_caches = kv_cache_factory(
-        num_blocks, block_size, 1, num_heads, head_size, dtype, None, seed,
+        num_blocks, block_size, 1, num_heads, head_size, kv_cache_dtype, dtype, seed,
         dst_device)
 
     src_key_caches_clone = src_key_caches[0].clone()
@@ -221,3 +243,40 @@ def test_swap_blocks(
                           dist_key_caches[0][dst].cpu())
     assert torch.allclose(src_value_caches_clone[src].cpu(),
                           dist_value_caches[0][dst].cpu())
+
+
+@pytest.mark.parametrize("num_heads", NUM_HEADS)
+@pytest.mark.parametrize("head_size", HEAD_SIZES)
+@pytest.mark.parametrize("block_size", BLOCK_SIZES)
+@pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
+@pytest.mark.parametrize("dtype", DTYPES)
+@pytest.mark.parametrize("seed", SEEDS)
+@pytest.mark.parametrize("device", DEVICES)
+@torch.inference_mode()
+def test_fp8_conversion(
+    num_heads: int,
+    head_size: int,
+    block_size: int,
+    num_blocks: int,
+    dtype: torch.dtype,
+    seed: int,
+    device: int,
+) -> None:
+    random.seed(seed)
+    torch.random.manual_seed(seed)
+    torch.cuda.manual_seed(seed)
+    gpu_id = f"cuda:{device}"
+
+    low = -240.0
+    high = 240.0
+    shape = (num_blocks, num_heads, head_size, block_size)
+    cache = torch.empty(shape, dtype=dtype, device=gpu_id)
+    cache.uniform_(low, high)
+
+    cache_fp8 = torch.empty_like(cache, dtype=torch.uint8)
+    cache_ops.convert_fp8(cache, cache_fp8)
+
+    converted_cache = torch.empty_like(cache)
+    cache_ops.convert_fp8(cache_fp8, converted_cache)
+
+    assert torch.allclose(cache, converted_cache, atol=0.001, rtol=0.1)
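
Not part of the diff: a small helper distilling the comparison pattern the fp8_e5m2 branch of test_reshape_and_cache uses. A quantized (uint8) cache cannot be compared directly against a float reference, so it is first dequantized with convert_fp8 and then compared with the loose tolerances from the test. The import path is assumed, as above.

import torch
from vllm._C import cache_ops  # assumed import path for the compiled extension

def assert_quantized_matches_reference(quantized: torch.Tensor,
                                        reference_f16: torch.Tensor) -> None:
    # Dequantize the fp8 cache to float16, then compare with the tolerances
    # used in test_reshape_and_cache (atol=0.001, rtol=0.1).
    dequantized = torch.empty_like(quantized, dtype=torch.float16)
    cache_ops.convert_fp8(quantized, dequantized)
    assert torch.allclose(dequantized, reference_f16, atol=0.001, rtol=0.1)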
