Commit e295638

Move Hq / Hk into function constants.
1 parent 93d0f48 commit e295638

5 files changed: +18, -28 lines changed

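Before this commit, the query head count Hq and the key/value head count Hk were fields of AttentionKernelDescriptor, and the generated Metal source hard-coded Hq via a #define. After this commit they travel as Metal function constants, so the generated kernel source no longer varies with head counts; only the pipeline specialization does. A minimal sketch of the new constant layout, reconstructed from the hunks below (the rest of the generated preamble is elided):

```cpp
#include <string>

// Function-constant layout after this commit (reconstructed from the diff):
//   0: R (row dimension)          1: C (column dimension)
//   2: Hq (query head count)      3: H_Hk_ratio (Hq / Hk)
//   4 + operand.bufferIndex():    per-operand batch stride
// The generated MSL preamble therefore begins like this:
const std::string preamble = R"(
constant uint R [[function_constant(0)]];
constant uint C [[function_constant(1)]];

constant uint Hq [[function_constant(2)]];
constant uint H_Hk_ratio [[function_constant(3)]];
)";
```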

lib/nnc/mfa/v2/AttentionDescriptor.cpp

Lines changed: 7 additions & 3 deletions
@@ -121,9 +121,9 @@ AttentionKernelDescriptor AttentionDescriptor::kernelDescriptor(MTL::Device *con
   };
 
   if (device->supportsFamily(MTL::GPUFamily(1009))) {
-    return AttentionKernelDescriptor(createBlockDimensions(), createCacheState(), createHeadDimension(), Hq, Hk, createMemoryPrecisions(), true, false, createRegisterPrecisions(device), createTransposeState(), createLeadingDimensions(), type, scale);
+    return AttentionKernelDescriptor(createBlockDimensions(), createCacheState(), createHeadDimension(), createMemoryPrecisions(), true, false, createRegisterPrecisions(device), createTransposeState(), createLeadingDimensions(), type, scale);
   } else {
-    return AttentionKernelDescriptor(createBlockDimensions(), createCacheState(), createHeadDimension(), Hq, Hk, createMemoryPrecisions(), false, true, createRegisterPrecisions(device), createTransposeState(), createLeadingDimensions(), type, scale);
+    return AttentionKernelDescriptor(createBlockDimensions(), createCacheState(), createHeadDimension(), createMemoryPrecisions(), false, true, createRegisterPrecisions(device), createTransposeState(), createLeadingDimensions(), type, scale);
   }
 }

@@ -137,6 +137,10 @@ std::pair<AttentionKernelDescriptor, PipelineValue<AttentionKernel> *> Attention
   uint32_t columnDimension = matrixDimensions[1];
   constants->setConstantValue(&rowDimension, MTL::DataTypeUInt, NS::Integer(0));
   constants->setConstantValue(&columnDimension, MTL::DataTypeUInt, 1);
+  uint32_t Hq = this->Hq;
+  constants->setConstantValue(&Hq, MTL::DataTypeUInt, 2);
+  uint32_t HHkRatio = this->Hq / this->Hk;
+  constants->setConstantValue(&HHkRatio, MTL::DataTypeUInt, 3);
   std::vector<AttentionOperand> operands;
   switch (type.value) {
   case AttentionKernelType::forward:

@@ -151,7 +155,7 @@ std::pair<AttentionKernelDescriptor, PipelineValue<AttentionKernel> *> Attention
   }
   for (const auto& operand : operands) {
     uint32_t batchStride = batchStrides[operand].value_or(0);
-    constants->setConstantValue(&batchStride, MTL::DataTypeUInt, 2 + operand.bufferIndex());
+    constants->setConstantValue(&batchStride, MTL::DataTypeUInt, 4 + operand.bufferIndex());
   }
 
   NS::String* swiftName = NS::String::string("attention", NS::UTF8StringEncoding);
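For context, here is a minimal sketch of how a pipeline could be specialized against the new constant layout with metal-cpp. This is not this repo's pipeline code: the library and device handles, the example sizes, and the assumption that Q's bufferIndex() is 0 are placeholders; only the constant indices and the "attention" function name come from the hunks above.

```cpp
#include <Metal/Metal.hpp>
#include <cstdint>

// Hedged sketch: specialize the "attention" kernel against the new layout
// (0: R, 1: C, 2: Hq, 3: Hq/Hk, 4+: per-operand batch strides).
MTL::ComputePipelineState *makeAttentionPipeline(MTL::Device *device, MTL::Library *library) {
  uint32_t R = 1024, C = 1024;            // example sequence lengths
  uint32_t Hq = 32, HHkRatio = 32 / 8;    // example: 32 query heads, 8 K/V heads
  uint32_t qBatchStride = 0;              // example stride; 0 when unused

  auto *constants = MTL::FunctionConstantValues::alloc()->init();
  constants->setConstantValue(&R, MTL::DataTypeUInt, NS::UInteger(0));
  constants->setConstantValue(&C, MTL::DataTypeUInt, NS::UInteger(1));
  constants->setConstantValue(&Hq, MTL::DataTypeUInt, NS::UInteger(2));
  constants->setConstantValue(&HHkRatio, MTL::DataTypeUInt, NS::UInteger(3));
  // Batch strides now start at index 4 (here assuming Q's bufferIndex() is 0).
  constants->setConstantValue(&qBatchStride, MTL::DataTypeUInt, NS::UInteger(4));

  NS::Error *error = nullptr;
  MTL::Function *function = library->newFunction(
      NS::String::string("attention", NS::UTF8StringEncoding), constants, &error);
  return device->newComputePipelineState(function, &error);
}
```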

lib/nnc/mfa/v2/AttentionKernel.cpp

Lines changed: 9 additions & 11 deletions
@@ -17,8 +17,6 @@ AttentionKernel::AttentionKernel(AttentionKernelDescriptor descriptor, MTL::Devi
 
   blockDimensions = descriptor.blockDimensions;
   headDimension = descriptor.headDimension;
-  Hq = descriptor.Hq;
-  Hk = descriptor.Hk;
   leadingDimensions = descriptor.leadingDimensions;
   scale = descriptor.scale;
   disableAsyncCopy = false;

@@ -498,10 +496,10 @@ std::string AttentionKernel::createConstants() const noexcept {
     operands = {AttentionOperand::Q, AttentionOperand::K, AttentionOperand::V, AttentionOperand::O, AttentionOperand::dO, AttentionOperand::dV, AttentionOperand::dK};
     break;
   }
-  std::string output = "#define Hq (" + std::to_string(Hq) + ")\n";
+  std::string output = "";
   for (const auto& operand : operands) {
     output += " constant uint " + operand.name() + "_batch_stride [[function_constant(";
-    output += std::to_string(operand.bufferIndex() + 2) + ")]];\n";
+    output += std::to_string(operand.bufferIndex() + 4) + ")]];\n";
   }
   return R"(

@@ -511,6 +509,9 @@ std::string AttentionKernel::createConstants() const noexcept {
 constant uint R [[function_constant(0)]];
 constant uint C [[function_constant(1)]];
 
+constant uint Hq [[function_constant(2)]];
+constant uint H_Hk_ratio [[function_constant(3)]];
+
 )" + output;
 }

@@ -542,15 +543,14 @@ std::string AttentionKernel::operandLocationWithHeadOffsetValue(AttentionOperand
   source.SetValue("OPERAND", operand.name());
   if (operand.value == AttentionOperand::L || operand.value == AttentionOperand::D) {
     source += "{{OPERAND}} + (gid.z * Hq + gid.y) * R\\";
-  } else if (Hq > 1) {
+  } else {
     source.SetValue("HEAD_DIMENSION", std::to_string(headDimension));
-    if (Hq != Hk && (operand.value == AttentionOperand::K || operand.value == AttentionOperand::V || operand.value == AttentionOperand::dK || operand.value == AttentionOperand::dV)) {
-      source.SetValue("H_HK_RATIO", std::to_string(Hq / Hk));
+    if (operand.value == AttentionOperand::K || operand.value == AttentionOperand::V || operand.value == AttentionOperand::dK || operand.value == AttentionOperand::dV) {
       if (!transposed(operand)) {
-        source += "{{OPERAND}} + gid.z * {{OPERAND}}_batch_stride + gid.y / {{H_HK_RATIO}} * {{HEAD_DIMENSION}}\\";
+        source += "{{OPERAND}} + gid.z * {{OPERAND}}_batch_stride + gid.y / H_Hk_ratio * {{HEAD_DIMENSION}}\\";
       } else {
         source.SetValue("SEQUENCE_LENGTH", sequenceLength(operand));
-        source += "{{OPERAND}} + gid.z * {{OPERAND}}_batch_stride + gid.y / {{H_HK_RATIO}} * {{HEAD_DIMENSION}} * {{SEQUENCE_LENGTH}}\\";
+        source += "{{OPERAND}} + gid.z * {{OPERAND}}_batch_stride + gid.y / H_Hk_ratio * {{HEAD_DIMENSION}} * {{SEQUENCE_LENGTH}}\\";
       }
     } else {
       if (!transposed(operand)) {

@@ -560,8 +560,6 @@ std::string AttentionKernel::operandLocationWithHeadOffsetValue(AttentionOperand
         source += "{{OPERAND}} + gid.z * {{OPERAND}}_batch_stride + gid.y * {{HEAD_DIMENSION}} * {{SEQUENCE_LENGTH}}\\";
       }
     }
-  } else {
-    source += "{{OPERAND}} + gid.z * {{OPERAND}}_batch_stride\\";
   }
   return source.ToString();
 }
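The hunks above drop the compile-time {{H_HK_RATIO}} substitution in favor of the H_Hk_ratio function constant, so the grouped-query mapping from query head to K/V head now happens when the pipeline is specialized rather than when the source is generated. A small worked example of the arithmetic the generated kernel performs (the values are hypothetical):

```cpp
#include <cassert>
#include <cstdint>

int main() {
  // Hypothetical grouped-query attention: 32 query heads sharing 8 K/V heads.
  const uint32_t Hq = 32, Hk = 8;
  const uint32_t H_Hk_ratio = Hq / Hk;             // 4, passed as function_constant(3)
  const uint32_t headDimension = 64;

  // gid.y indexes the query head in the generated kernel.
  const uint32_t gid_y = 13;
  const uint32_t kvHead = gid_y / H_Hk_ratio;      // query head 13 -> K/V head 3
  const uint32_t kOffset = kvHead * headDimension; // column offset into an untransposed K

  assert(kvHead == 3);
  assert(kOffset == 192);
  return 0;
}
```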

lib/nnc/mfa/v2/AttentionKernel.hpp

Lines changed: 0 additions & 4 deletions
@@ -39,10 +39,6 @@ struct AttentionKernel {
 
   unsigned short headDimension;
 
-  unsigned short Hq;
-
-  unsigned short Hk;
-
   bool disableAsyncCopy;
 
   unsigned short threadgroupMemoryAllocation;

lib/nnc/mfa/v2/AttentionKernelDescriptor.cpp

Lines changed: 1 addition & 5 deletions
@@ -9,7 +9,6 @@ bool AttentionKernelDescriptor::operator==(const AttentionKernelDescriptor& rhs)
   simd_all(blockDimensions == rhs.blockDimensions) &&
   cacheState == rhs.cacheState &&
   headDimension == rhs.headDimension &&
-  Hq == rhs.Hq && Hk == rhs.Hk &&
   memoryPrecisions == rhs.memoryPrecisions &&
   (preferAsyncCache == rhs.preferAsyncCache) &&
   (preferAsyncLoad == rhs.preferAsyncLoad) &&

@@ -25,19 +24,16 @@ std::size_t std::hash<AttentionKernelDescriptor>::operator()(const AttentionKern
   using namespace ccv::nnc::mfa::hash;
   combine_64(seed, pack_64(simd_make_ushort4(hash.blockDimensions, 0)));
   combine_32(seed, pack_32(simd::ushort2 { hash.headDimension, hash.type.value }));
-  combine_32(seed, pack_32(simd::ushort2 { hash.Hq, hash.Hk }));
   combine_32(seed, pack_32(simd::uchar4 { hash.preferAsyncCache, hash.preferAsyncLoad, 0, 0 }));
   return seed;
 }
 
 // MARK: - Initializer
 
-AttentionKernelDescriptor::AttentionKernelDescriptor(simd::ushort3 blockDimensions, AttentionOperands<bool> cacheState, unsigned short headDimension, unsigned short Hq, unsigned short Hk, AttentionOperands<GEMMOperandPrecision> memoryPrecisions, bool preferAsyncCache, bool preferAsyncLoad, AttentionOperands<GEMMOperandPrecision> registerPrecisions, AttentionOperands<bool> transposeState, AttentionOperands<unsigned short> leadingDimensions, AttentionKernelType type, float scale) noexcept {
+AttentionKernelDescriptor::AttentionKernelDescriptor(simd::ushort3 blockDimensions, AttentionOperands<bool> cacheState, unsigned short headDimension, AttentionOperands<GEMMOperandPrecision> memoryPrecisions, bool preferAsyncCache, bool preferAsyncLoad, AttentionOperands<GEMMOperandPrecision> registerPrecisions, AttentionOperands<bool> transposeState, AttentionOperands<unsigned short> leadingDimensions, AttentionKernelType type, float scale) noexcept {
   this->blockDimensions = blockDimensions;
   this->cacheState = cacheState;
   this->headDimension = headDimension;
-  this->Hq = Hq;
-  this->Hk = Hk;
   this->memoryPrecisions = memoryPrecisions;
   this->preferAsyncCache = preferAsyncCache;
   this->preferAsyncLoad = preferAsyncLoad;
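A consequence of dropping Hq / Hk from operator== and the hash above: two attention problems that differ only in head counts now produce identical kernel descriptors, so a descriptor-keyed cache (presumably what the std::hash specialization serves) can reuse one generated kernel and vary only the function constants. A simplified, self-contained illustration; the struct below is a stand-in, not the real descriptor:

```cpp
#include <cstddef>
#include <functional>
#include <string>
#include <unordered_map>

// Stand-in key: after this commit, head counts are no longer part of it.
struct KernelKey {
  unsigned short headDimension;
  bool operator==(const KernelKey &rhs) const { return headDimension == rhs.headDimension; }
};
struct KernelKeyHash {
  std::size_t operator()(const KernelKey &k) const { return std::hash<unsigned short>()(k.headDimension); }
};

int main() {
  std::unordered_map<KernelKey, std::string, KernelKeyHash> cache;
  cache[{64}] = "generated attention kernel, D = 64";  // first built for, say, Hq = 32, Hk = 8

  // A later problem with Hq = 16, Hk = 16 but the same head dimension hits the
  // same entry; its head counts are supplied via function constants instead.
  return cache.count({64}) == 1 ? 0 : 1;
}
```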

lib/nnc/mfa/v2/AttentionKernelDescriptor.hpp

Lines changed: 1 addition & 5 deletions
@@ -20,10 +20,6 @@ struct AttentionKernelDescriptor {
   /// Required. The problem size along the head dimension.
   unsigned short headDimension;
 
-  unsigned short Hq;
-
-  unsigned short Hk;
-
   AttentionOperands<GEMMOperandPrecision> memoryPrecisions;
 
   /// Reads with a one-to-one mapping to threads (like GEMM store) and writes.

@@ -62,7 +58,7 @@ struct AttentionKernelDescriptor {
   AttentionKernelDescriptor() = delete;
 
   /// Initialize the kernel descriptor.
-  AttentionKernelDescriptor(simd::ushort3 blockDimensions, AttentionOperands<bool> cacheState, unsigned short headDimension, unsigned short Hq, unsigned short Hk, AttentionOperands<GEMMOperandPrecision> memoryPrecisions, bool preferAsyncCache, bool preferAsyncLoad, AttentionOperands<GEMMOperandPrecision> registerPrecisions, AttentionOperands<bool> transposeState, AttentionOperands<unsigned short> leadingDimensions, AttentionKernelType type, float scale) noexcept;
+  AttentionKernelDescriptor(simd::ushort3 blockDimensions, AttentionOperands<bool> cacheState, unsigned short headDimension, AttentionOperands<GEMMOperandPrecision> memoryPrecisions, bool preferAsyncCache, bool preferAsyncLoad, AttentionOperands<GEMMOperandPrecision> registerPrecisions, AttentionOperands<bool> transposeState, AttentionOperands<unsigned short> leadingDimensions, AttentionKernelType type, float scale) noexcept;
 
   bool operator==(const AttentionKernelDescriptor& rhs) const;
 };
