fix: replace cute.arch.fmin with cutlass.min for CUTLASS compatibility during DeepSeek-V4 init

Signed-off-by: khluu <khluu000@gmail.com>
This commit is contained in:
khluu
2026-06-03 17:11:47 -07:00
parent fd56c57bde
commit 0decac0d96
@@ -320,11 +320,11 @@ class SparseAttnCompressNormRopeStoreC4Kernel:
bits = _recast_val(scale_raw, Uint32)
ue8m0 = ((bits + Uint32(0x7FFFFF)) >> Uint32(23)) & Uint32(0xFF)
inv_scale = _recast_val((Uint32(254) - ue8m0) << Uint32(23), Float32)
-y0 = cute.arch.fmin(
+y0 = cutlass.min(
cute.arch.fmax(q0 * inv_scale, Float32(-self.fp8_max)),
Float32(self.fp8_max),
)
-y1 = cute.arch.fmin(
+y1 = cutlass.min(
cute.arch.fmax(q1 * inv_scale, Float32(-self.fp8_max)),
Float32(self.fp8_max),
)
@@ -978,11 +978,11 @@ class SparseAttnNormRopeStoreKernel:
bits = _recast_val(scale_raw, Uint32)
ue8m0 = ((bits + Uint32(0x7FFFFF)) >> Uint32(23)) & Uint32(0xFF)
inv_scale = _recast_val((Uint32(254) - ue8m0) << Uint32(23), Float32)
-y0 = cute.arch.fmin(
+y0 = cutlass.min(
cute.arch.fmax(q0 * inv_scale, Float32(-self.fp8_max)),
Float32(self.fp8_max),
)
-y1 = cute.arch.fmin(
+y1 = cutlass.min(
cute.arch.fmax(q1 * inv_scale, Float32(-self.fp8_max)),
Float32(self.fp8_max),
)