[Kernel] Batch invariant NVFP4 linear using cutlass (#39912)

Signed-off-by: Jakub Zakrzewski <jzakrzewski@nvidia.com>
Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
Co-authored-by: Yongye Zhu <zyy1102000@gmail.com>
2026-06-06 00:16:14 +00:00 · 2026-05-23 15:41:12 +02:00
parent 3f3e862681
commit 5bb8d2767a
5 changed files with 211 additions and 50 deletions
@@ -367,6 +367,7 @@ steps:
    - VLLM_TEST_MODEL=deepseek-ai/DeepSeek-V2-Lite-Chat pytest -v -s v1/determinism/test_batch_invariance.py::test_v1_generation_is_deterministic_across_batch_sizes_with_needle[TRITON_MLA]
    - VLLM_TEST_MODEL=Qwen/Qwen3-30B-A3B-Thinking-2507-FP8 pytest -v -s v1/determinism/test_batch_invariance.py::test_v1_generation_is_deterministic_across_batch_sizes_with_needle[FLASH_ATTN]
    - pytest -v -s v1/determinism/test_nvfp4_batch_invariant.py
+    - pytest -v -s v1/determinism/test_nvfp4_batch_invariant_scaled_mm.py
  
 - label: Acceptance Length Test (Large Models) # optional
  device: h200_35gb