Skip to content

Commit 78ab9e8

Browse files
author
yuwei
committed
feat: Add TORCH_CHECK for unsupported deterministic mode and related test script
1 parent 164b798 commit 78ab9e8

File tree

3 files changed

+376
-0
lines changed

3 files changed

+376
-0
lines changed

hopper/flash_api.cpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1258,6 +1258,8 @@ std::tuple<at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tenso
12581258
}
12591259
// This is what we will template on
12601260
bool const is_varlen = is_varlen_q || is_varlen_k || seqused_q_.has_value() || seqused_k_.has_value();
1261+
TORCH_CHECK(!(seqused_q_.has_value() && deterministic), "FlashAttention backward does not support 'seqused_q' parameter when deterministic is true.");
1262+
12611263
#ifdef FLASHATTENTION_DISABLE_VARLEN
12621264
TORCH_CHECK(!is_varlen, "This flash attention build does not support varlen.");
12631265
#endif
@@ -1274,6 +1276,7 @@ std::tuple<at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tenso
12741276
int const num_heads_k = k.size(-2);
12751277
TORCH_CHECK(head_size % 8 == 0, "head_size should be a multiple of 8");
12761278
TORCH_CHECK(head_size_v % 8 == 0, "head_size_v should be a multiple of 8");
1279+
TORCH_CHECK((head_size_v < 256 && head_size < 256) || !deterministic, "FlashAttention backward only supports deterministic when head dimension less than 256");
12771280
int const max_headdim = get_max_headdim();
12781281
TORCH_CHECK(std::max(head_size, head_size_v) <= max_headdim, "FlashAttention forward only supports head dimension at most " + std::to_string(max_headdim));
12791282
TORCH_CHECK(num_heads % num_heads_k == 0, "Number of heads in key/value must divide number of heads in query");
@@ -1291,6 +1294,10 @@ std::tuple<at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tenso
12911294
int const head_size_v_rounded = head_size_rounded;
12921295
// Very important that these match the kernel configs
12931296
bool const is_local = (window_size_left >= 0 || window_size_right >= 0) && !is_causal;
1297+
TORCH_CHECK(
1298+
!deterministic || !is_local || !cu_seqlens_q_.has_value() || torch::equal(cu_seqlens_q_.value(), cu_seqlens_k_.value()),
1299+
"FlashAttention backward only supports deterministic when local is false"
1300+
);
12941301
int const kBlockM_sm90 = head_size_rounded <= 64 ? (is_causal && softcap > 0.0 ? 96 : 128)
12951302
: (head_size_rounded <= 96 ? 64
12961303
: (head_size_rounded <= 128 ? (is_causal || is_local || softcap > 0.0 ? 64 : 80)

hopper/test_flash_attn.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -203,6 +203,7 @@ def test_flash_attn_output(
203203
attention_chunk=attention_chunk,
204204
softcap=softcap,
205205
pack_gqa=pack_gqa,
206+
deterministic=deterministic,
206207
num_splits=num_splits
207208
)
208209
print(f"Output max diff: {(out - out_ref).abs().max().item()}")
@@ -222,6 +223,10 @@ def test_flash_attn_output(
222223
and not has_qv
223224
and not dv > 256
224225
and not attention_chunk != 0
226+
and not (
227+
deterministic == True
228+
and (dv >= 256 or dv == 64)
229+
)
225230
):
226231
g = torch.randn_like(out)
227232
do_o = ((g.float() * out.float()).sum(-1)).transpose(1, 2)
@@ -475,6 +480,7 @@ def _gen_unused_masks(padding_mask, add_unused, max_seq_len, bs, device):
475480
k_descale=k_descale, v_descale=v_descale,
476481
window_size=window_size,
477482
attention_chunk=attention_chunk,
483+
deterministic=deterministic,
478484
softcap=softcap,
479485
)
480486
out = output_pad_fn(out_unpad)
@@ -497,6 +503,10 @@ def _gen_unused_masks(padding_mask, add_unused, max_seq_len, bs, device):
497503
and not has_qv
498504
and not dv > 256
499505
and not attention_chunk != 0
506+
and not (
507+
deterministic == True
508+
and (dv >= 256 or local == True or seqused_k is None)
509+
)
500510
):
501511
g_unpad = torch.randn_like(out_unpad)
502512
do_o = ((g_unpad.float() * out_unpad.float()).sum(-1)).transpose(-1, -2)

0 commit comments

Comments (0)