fix: replace cute.arch.fmin with cutlass.min to resolve CUTLASS compatibility for DeepSeek-V4 init

Signed-off-by: khluu <khluu000@gmail.com>
2026-06-06 00:16:14 +00:00 · 2026-06-03 17:11:47 -07:00
parent fd56c57bde
commit 0decac0d96
1 changed file with 4 additions and 4 deletions
@@ -320,11 +320,11 @@ class SparseAttnCompressNormRopeStoreC4Kernel:
                bits = _recast_val(scale_raw, Uint32)
                ue8m0 = ((bits + Uint32(0x7FFFFF)) >> Uint32(23)) & Uint32(0xFF)
                inv_scale = _recast_val((Uint32(254) - ue8m0) << Uint32(23), Float32)
-                y0 = cute.arch.fmin(
+                y0 = cutlass.min(
                    cute.arch.fmax(q0 * inv_scale, Float32(-self.fp8_max)),
                    Float32(self.fp8_max),
                )
-                y1 = cute.arch.fmin(
+                y1 = cutlass.min(
                    cute.arch.fmax(q1 * inv_scale, Float32(-self.fp8_max)),
                    Float32(self.fp8_max),
                )
@@ -978,11 +978,11 @@ class SparseAttnNormRopeStoreKernel:
                bits = _recast_val(scale_raw, Uint32)
                ue8m0 = ((bits + Uint32(0x7FFFFF)) >> Uint32(23)) & Uint32(0xFF)
                inv_scale = _recast_val((Uint32(254) - ue8m0) << Uint32(23), Float32)
-                y0 = cute.arch.fmin(
+                y0 = cutlass.min(
                    cute.arch.fmax(q0 * inv_scale, Float32(-self.fp8_max)),
                    Float32(self.fp8_max),
                )
-                y1 = cute.arch.fmin(
+                y1 = cutlass.min(
                    cute.arch.fmax(q1 * inv_scale, Float32(-self.fp8_max)),
                    Float32(self.fp8_max),
                )