[nvbugs/5336321][fix] Enable attention_dp=False test case; fix TRTLLM Gen MoE workspace allocation (#5463)

Signed-off-by: Yi Zhang <187001205+yizhang-nv@users.noreply.github.com>
Signed-off-by: yizhan <187001205+yizhang-nv@users.noreply.github.com>
Authored by Yi Zhang on 2025-07-04 22:23:41 +08:00; committed by Zhenhuan Chen
parent e5e87ecf34
commit 9cc4e5d50e
2 changed files with 2 additions and 2 deletions

@@ -86,6 +86,7 @@ std::vector<torch::Tensor> run_fp4_block_scale_moe_runner(torch::Tensor const& r
     TORCH_CHECK(num_experts % 4 == 0, "Routing kernel expects that num_experts must be divisible by 4");
     TORCH_CHECK(num_experts > top_k, "num_experts must be greater than top_k");
+    TORCH_CHECK(num_experts <= 256, "num_experts must be less than or equal to 256");
     tensorrt_llm::kernels::trtllmGenFp8BlockScaleMoe::MoE::MoERunnerArgs args;
     tensorrt_llm::kernels::trtllmGenFp8BlockScaleMoe::MoE::MoEWorkspace workspace;
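
For context, the TORCH_CHECK preconditions above reject invalid routing configurations before the MoE runner arguments and workspace are populated; the error message suggests the TRTLLM Gen routing path assumes at most 256 experts when sizing its buffers, so an out-of-range configuration should never reach allocation. Below is a minimal, self-contained sketch of that fail-fast precondition pattern in plain C++; the check helper and validateRoutingConfig wrapper are hypothetical stand-ins for TORCH_CHECK and the binding code, not the library's actual API.

    #include <stdexcept>
    #include <string>

    // Hypothetical stand-in for TORCH_CHECK: throw if a precondition fails.
    static void check(bool cond, std::string const& msg)
    {
        if (!cond)
            throw std::invalid_argument(msg);
    }

    // Sketch of the precondition pattern above: validate the routing
    // configuration before any runner args or workspace buffers are set up.
    void validateRoutingConfig(int num_experts, int top_k)
    {
        check(num_experts % 4 == 0, "num_experts must be divisible by 4");
        check(num_experts > top_k, "num_experts must be greater than top_k");
        check(num_experts <= 256, "num_experts must be less than or equal to 256");
    }

    int main()
    {
        validateRoutingConfig(256, 8); // passes: within every limit
        try
        {
            validateRoutingConfig(512, 8); // throws: exceeds the 256-expert ceiling
        }
        catch (std::invalid_argument const&)
        {
            // Expected: rejecting the config early keeps a mis-sized
            // workspace from ever being allocated.
        }
        return 0;
    }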

@@ -1088,8 +1088,7 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness):
pytest.skip("https://nvbugs/5252313")
if torch_compile and pp_size > 1:
pytest.skip("PP with torch.compile is not supported yet.")
if not attention_dp and (tp_size > 1 or ep_size > 1):
pytest.skip("https://nvbugs/5336321")
kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9)
# Picewise Cuda Graph cannot be enabled for nvfp4 attention dp.
torch_compile_config = TorchCompileConfig(