[nvbugs/5336321][fix] Enable attention_dp=False test case; fix TRTLLM Gen MoE workspace allocation (#5463)

Signed-off-by: Yi Zhang <187001205+yizhang-nv@users.noreply.github.com>
Signed-off-by: yizhan <187001205+yizhang-nv@users.noreply.github.com>
Authored by Yi Zhang on 2025-07-04 22:23:41 +08:00; committed by Zhenhuan Chen
parent e5e87ecf34
commit 9cc4e5d50e
2 changed files with 2 additions and 2 deletions

@@ -86,6 +86,7 @@ std::vector<torch::Tensor> run_fp4_block_scale_moe_runner(torch::Tensor const& r
     TORCH_CHECK(num_experts % 4 == 0, "Routing kernel expects that num_experts must be divisible by 4");
     TORCH_CHECK(num_experts > top_k, "num_experts must be greater than top_k");
+    TORCH_CHECK(num_experts <= 256, "num_experts must be less than or equal to 256");
     tensorrt_llm::kernels::trtllmGenFp8BlockScaleMoe::MoE::MoERunnerArgs args;
     tensorrt_llm::kernels::trtllmGenFp8BlockScaleMoe::MoE::MoEWorkspace workspace;
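
For context, the TORCH_CHECK preconditions above reject invalid routing configurations before the MoE runner arguments and workspace are populated; the error message suggests the TRTLLM Gen routing path assumes at most 256 experts when sizing its buffers, so an out-of-range configuration should never reach allocation. Below is a minimal, self-contained sketch of that fail-fast precondition pattern in plain C++; the check helper and validateRoutingConfig wrapper are hypothetical stand-ins for TORCH_CHECK and the binding code, not the library's actual API.

    #include <stdexcept>
    #include <string>

    // Hypothetical stand-in for TORCH_CHECK: throw if a precondition fails.
    static void check(bool cond, std::string const& msg)
    {
        if (!cond)
            throw std::invalid_argument(msg);
    }

    // Sketch of the precondition pattern above: validate the routing
    // configuration before any runner args or workspace buffers are set up.
    void validateRoutingConfig(int num_experts, int top_k)
    {
        check(num_experts % 4 == 0, "num_experts must be divisible by 4");
        check(num_experts > top_k, "num_experts must be greater than top_k");
        check(num_experts <= 256, "num_experts must be less than or equal to 256");
    }

    int main()
    {
        validateRoutingConfig(256, 8); // passes: within every limit
        try
        {
            validateRoutingConfig(512, 8); // throws: exceeds the 256-expert ceiling
        }
        catch (std::invalid_argument const&)
        {
            // Expected: rejecting the config early keeps a mis-sized
            // workspace from ever being allocated.
        }
        return 0;
    }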

@@ -1088,8 +1088,7 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness):
pytest.skip("https://nvbugs/5252313")
if torch_compile and pp_size > 1:
pytest.skip("PP with torch.compile is not supported yet.")
if not attention_dp and (tp_size > 1 or ep_size > 1):
pytest.skip("https://nvbugs/5336321")
kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9)
# Picewise Cuda Graph cannot be enabled for nvfp4 attention dp.
torch_compile_config = TorchCompileConfig(