mirror of
https://github.com/NVIDIA/TensorRT-LLM.git
synced 2026-01-13 22:18:36 +08:00
[nvbugs/5336321][fix] Enable attention dp = False test case, Fix TRTLLM Gen Moe workspace allocation (#5463)
Signed-off-by: Yi Zhang <187001205+yizhang-nv@users.noreply.github.com> Signed-off-by: yizhan <187001205+yizhang-nv@users.noreply.github.com>
This commit is contained in:
parent
e5e87ecf34
commit
9cc4e5d50e
@ -86,6 +86,7 @@ std::vector<torch::Tensor> run_fp4_block_scale_moe_runner(torch::Tensor const& r
|
||||
|
||||
TORCH_CHECK(num_experts % 4 == 0, "Routing kernel expects that num_experts must be divisible by 4");
|
||||
TORCH_CHECK(num_experts > top_k, "num_experts must be greater than top_k");
|
||||
TORCH_CHECK(num_experts <= 256, "num_experts must be less than or equal to 256");
|
||||
|
||||
tensorrt_llm::kernels::trtllmGenFp8BlockScaleMoe::MoE::MoERunnerArgs args;
|
||||
tensorrt_llm::kernels::trtllmGenFp8BlockScaleMoe::MoE::MoEWorkspace workspace;
|
||||
|
||||
@ -1088,8 +1088,7 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness):
|
||||
pytest.skip("https://nvbugs/5252313")
|
||||
if torch_compile and pp_size > 1:
|
||||
pytest.skip("PP with torch.compile is not supported yet.")
|
||||
if not attention_dp and (tp_size > 1 or ep_size > 1):
|
||||
pytest.skip("https://nvbugs/5336321")
|
||||
|
||||
kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9)
|
||||
# Piecewise Cuda Graph cannot be enabled for nvfp4 attention dp.
|
||||
torch_compile_config = TorchCompileConfig(
|
||||
|
||||
Loading…
Reference in New Issue
Block a user