diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/blockScaleMoe/DevKernel.cu b/cpp/tensorrt_llm/kernels/trtllmGenKernels/blockScaleMoe/DevKernel.cu
index 65cd5f3c59..0115f3b018 100644
--- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/blockScaleMoe/DevKernel.cu
+++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/blockScaleMoe/DevKernel.cu
@@ -186,14 +186,14 @@ void run(Data const& data, void* stream)
     if (data.mUseDeepSeekFp8)
     {
         int const numThreads = 128;
-        const dim3 grid(data.innerDim / 128, data.topK, data.numTokens);
+        const dim3 grid(data.innerDim / 128, data.topK, std::min(8192, data.numTokens));
         LAUNCH(data, activationDeepSeekKernel, grid, numThreads, 0, stream);
     }
     else
     {
         int const numThreads = 256;
-        const dim3 grid(data.innerDim / 128, data.topK, data.numTokens);
+        const dim3 grid(data.innerDim / 128, data.topK, std::min(8192, data.numTokens));
         LAUNCH(data, activationKernel, grid, numThreads, 0, stream);
     }
@@ -371,7 +371,7 @@ void run(Data const& data, void* stream)
     constexpr int VecSize = 4;
     int const numThreads = 128;
     int const numBlocksX = (data.hiddenDimSf / VecSize - 1 + numThreads) / numThreads;
-    int const numBlocksY = data.numTokens;
+    int const numBlocksY = std::min(8192, data.numTokens);
     dim3 numBlocks(numBlocksX, numBlocksY);
 #define CONVERT_FP4_SF_LAUNCH(LayoutSrc, LayoutDst) \
     if (data.sfLayoutSrc == tg::SfLayout::LayoutSrc && data.sfLayoutDst == tg::SfLayout::LayoutDst) \
@@ -457,7 +457,7 @@ void run(Data const& data, void* stream)
     {
         int const numThreads = 256;
         int const numBlocksX = (data.hiddenDim - 1 + numThreads) / numThreads;
-        int const numBlocksY = data.numTokens;
+        int const numBlocksY = std::min(8192, data.numTokens);
         dim3 numBlocks(numBlocksX, numBlocksY);
 
         LAUNCH(data, permuteKernel, numBlocks, numThreads, 0, stream);
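Note on the std::min(8192, ...) cap above: CUDA allows gridDim.y and gridDim.z up to 65535, so 8192 is a launch-size bound (presumably to limit block count for very large token batches) rather than a hardware limit. The cap is only correct if each kernel strides over tokens instead of assuming one block per token. The kernel bodies are not part of this diff; the sketch below is a minimal illustration of the assumed pattern, with a hypothetical tokenStrideKernel standing in for the real activationKernel/permuteKernel.

// Minimal sketch (assumption, not the real kernels): a grid whose y
// dimension is capped at 8192 still covers every token because each
// block strides over tokens in steps of gridDim.y. The activation
// launches cap the z dimension instead; the pattern is identical
// with blockIdx.z/gridDim.z.
__global__ void tokenStrideKernel(float* out, float const* in, int numTokens, int hiddenDim)
{
    int const col = blockIdx.x * blockDim.x + threadIdx.x;
    if (col >= hiddenDim)
    {
        return;
    }
    // Grid-stride loop over the token (y) dimension.
    for (int token = blockIdx.y; token < numTokens; token += gridDim.y)
    {
        out[token * hiddenDim + col] = in[token * hiddenDim + col];
    }
}

// Hypothetical launch mirroring the capped grid dimensions above:
//   int const numThreads = 256;
//   dim3 grid((hiddenDim + numThreads - 1) / numThreads, std::min(8192, numTokens));
//   tokenStrideKernel<<<grid, numThreads, 0, stream>>>(out, in, numTokens, hiddenDim);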
- "GB200-8_GPUs-2_Nodes-PyTorch-1": ["gb200-multi-node", "l0_gb200_multi_nodes", 1, 4, 8, 2], - "GB200-8_GPUs-2_Nodes-PyTorch-2": ["gb200-multi-node", "l0_gb200_multi_nodes", 2, 4, 8, 2], - "GB200-8_GPUs-2_Nodes-PyTorch-3": ["gb200-multi-node", "l0_gb200_multi_nodes", 3, 4, 8, 2], - "GB200-8_GPUs-2_Nodes-PyTorch-4": ["gb200-multi-node", "l0_gb200_multi_nodes", 4, 4, 8, 2], + "GB200-8_GPUs-2_Nodes-PyTorch-1": ["gb200-multi-node", "l0_gb200_multi_nodes", 1, 5, 8, 2], + "GB200-8_GPUs-2_Nodes-PyTorch-2": ["gb200-multi-node", "l0_gb200_multi_nodes", 2, 5, 8, 2], + "GB200-8_GPUs-2_Nodes-PyTorch-3": ["gb200-multi-node", "l0_gb200_multi_nodes", 3, 5, 8, 2], + "GB200-8_GPUs-2_Nodes-PyTorch-4": ["gb200-multi-node", "l0_gb200_multi_nodes", 4, 5, 8, 2], + "GB200-8_GPUs-2_Nodes-PyTorch-5": ["gb200-multi-node", "l0_gb200_multi_nodes", 5, 5, 8, 2], "GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-1": ["gb200-multi-node", "l0_gb200_multi_nodes", 1, 5, 8, 2], "GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-2": ["gb200-multi-node", "l0_gb200_multi_nodes", 2, 5, 8, 2], "GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-3": ["gb200-multi-node", "l0_gb200_multi_nodes", 3, 5, 8, 2], diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py index 2d8f80f54c..be178923c9 100644 --- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py +++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py @@ -1918,18 +1918,25 @@ class TestDeepSeekR1(LlmapiAccuracyTestHarness): @pytest.mark.skip_less_mpi_world_size(8) @skip_pre_hopper @pytest.mark.parametrize( - "tp_size,pp_size,ep_size,mtp_nextn,fp8kv,attention_dp,cuda_graph,overlap_scheduler,max_batch_size", - [(8, 1, 4, 3, False, False, True, True, 1), - (8, 1, 8, 0, True, True, True, True, 24), - (8, 1, 8, 1, True, True, True, True, 24)], - ids=["latency", "throughput", "throughput_mtp"]) + "tp_size,pp_size,ep_size,mtp_nextn,fp8kv,attention_dp,cuda_graph,overlap_scheduler,max_batch_size,moe_backend", + [(8, 1, 4, 3, False, False, True, True, 1, "_DEFAULT"), + (8, 1, 8, 0, True, True, True, True, 24, "_DEFAULT"), + (8, 1, 8, 1, True, True, True, True, 24, "_DEFAULT"), + (8, 1, 8, 1, True, True, True, True, 24, "TRTLLM")], + ids=[ + "latency", "throughput", "throughput_mtp", "throughput_mtp_trtllm" + ]) def test_fp8_blockscale(self, tp_size, pp_size, ep_size, mtp_nextn, fp8kv, attention_dp, cuda_graph, overlap_scheduler, - max_batch_size): + max_batch_size, moe_backend): + if get_sm_version() == 100: - moe_config = MoeConfig(backend="DEEPGEMM", max_num_tokens=16384) + moe_backend = "DEEPGEMM" if moe_backend == "_DEFAULT" else moe_backend + moe_config = MoeConfig(backend=moe_backend, max_num_tokens=16384) kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6) else: + if moe_backend != "_DEFAULT": + pytest.skip("Not supported MoE backend!") moe_config = MoeConfig() kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9) diff --git a/tests/integration/test_lists/test-db/l0_dgx_b200.yml b/tests/integration/test_lists/test-db/l0_dgx_b200.yml index 5d05d14159..924f3026cf 100644 --- a/tests/integration/test_lists/test-db/l0_dgx_b200.yml +++ b/tests/integration/test_lists/test-db/l0_dgx_b200.yml @@ -71,6 +71,7 @@ l0_dgx_b200: - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_mtp] TIMEOUT (180) - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale[throughput] TIMEOUT (180) - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale[throughput_mtp] TIMEOUT 
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale[throughput_mtp_trtllm] TIMEOUT (180)
 - condition:
     ranges:
       system_gpu_count:
diff --git a/tests/integration/test_lists/test-db/l0_gb200_multi_nodes.yml b/tests/integration/test_lists/test-db/l0_gb200_multi_nodes.yml
index 7cab7535b8..95528a7258 100644
--- a/tests/integration/test_lists/test-db/l0_gb200_multi_nodes.yml
+++ b/tests/integration/test_lists/test-db/l0_gb200_multi_nodes.yml
@@ -17,6 +17,7 @@ l0_gb200_multi_nodes:
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_mtp] TIMEOUT (180)
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale[throughput] TIMEOUT (180)
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale[throughput_mtp] TIMEOUT (180)
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale[throughput_mtp_trtllm] TIMEOUT (180)
 - condition:
     ranges:
       # 2 nodes with each node has 4 GPUs