Mirror of https://github.com/NVIDIA/TensorRT-LLM.git (synced 2026-01-13 22:18:36 +08:00)
[None][chore] Fix kernel launch param and add TRTLLM MoE backend test (#7524)
Signed-off-by: Pengbo Wang <221450789+pengbowang-nv@users.noreply.github.com>
Parent: ac0df0a393
Commit: ef0d06df58
@@ -186,14 +186,14 @@ void run(Data const& data, void* stream)
     if (data.mUseDeepSeekFp8)
     {
         int const numThreads = 128;
-        const dim3 grid(data.innerDim / 128, data.topK, data.numTokens);
+        const dim3 grid(data.innerDim / 128, data.topK, std::min(8192, data.numTokens));

         LAUNCH(data, activationDeepSeekKernel, grid, numThreads, 0, stream);
     }
     else
     {
         int const numThreads = 256;
-        const dim3 grid(data.innerDim / 128, data.topK, data.numTokens);
+        const dim3 grid(data.innerDim / 128, data.topK, std::min(8192, data.numTokens));

         LAUNCH(data, activationKernel, grid, numThreads, 0, stream);
     }
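Both launches previously passed data.numTokens straight through as the grid's z dimension. CUDA caps gridDim.y and gridDim.z at 65535, so a large enough batch would exceed the launch limit; clamping to 8192 keeps the launch valid, which implies the kernels walk any remaining tokens themselves. Below is a minimal sketch of that grid-stride pattern, assuming the real kernels loop the same way; the kernel name and the ReLU body are illustrative stand-ins, not TensorRT-LLM code.

#include <algorithm>
#include <cstdio>
#include <cuda_runtime.h>

// Illustrative kernel: each block strides over the token dimension, so a
// grid capped at 8192 in z still covers every token.
__global__ void activationSketch(float* out, float const* in, int numTokens, int innerDim)
{
    int const col = blockIdx.x * blockDim.x + threadIdx.x;
    if (col >= innerDim)
        return;
    // Block z handles tokens z, z + gridDim.z, z + 2 * gridDim.z, ...
    for (int token = blockIdx.z; token < numTokens; token += gridDim.z)
    {
        float const v = in[size_t(token) * innerDim + col];
        // ReLU stands in for the real activation.
        out[size_t(token) * innerDim + col] = v > 0.f ? v : 0.f;
    }
}

int main()
{
    int const numTokens = 20000; // deliberately larger than the 8192 cap
    int const innerDim = 256;
    size_t const count = size_t(numTokens) * innerDim;
    float *in = nullptr, *out = nullptr;
    cudaMallocManaged(&in, count * sizeof(float));
    cudaMallocManaged(&out, count * sizeof(float));
    for (size_t i = 0; i < count; ++i)
        in[i] = (i % 2 == 0) ? 1.f : -1.f;

    int const numThreads = 128;
    // Same clamp as the patched launches above.
    dim3 const grid((innerDim + numThreads - 1) / numThreads, 1, std::min(8192, numTokens));
    activationSketch<<<grid, numThreads>>>(out, in, numTokens, innerDim);
    cudaDeviceSynchronize();
    printf("out[last token, col 0] = %f\n", out[size_t(numTokens - 1) * innerDim]);

    cudaFree(in);
    cudaFree(out);
    return 0;
}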
@@ -371,7 +371,7 @@ void run(Data const& data, void* stream)
     constexpr int VecSize = 4;
     int const numThreads = 128;
     int const numBlocksX = (data.hiddenDimSf / VecSize - 1 + numThreads) / numThreads;
-    int const numBlocksY = data.numTokens;
+    int const numBlocksY = std::min(8192, data.numTokens);
     dim3 numBlocks(numBlocksX, numBlocksY);
 #define CONVERT_FP4_SF_LAUNCH(LayoutSrc, LayoutDst) \
     if (data.sfLayoutSrc == tg::SfLayout::LayoutSrc && data.sfLayoutDst == tg::SfLayout::LayoutDst) \
@@ -457,7 +457,7 @@ void run(Data const& data, void* stream)
 {
     int const numThreads = 256;
     int const numBlocksX = (data.hiddenDim - 1 + numThreads) / numThreads;
-    int const numBlocksY = data.numTokens;
+    int const numBlocksY = std::min(8192, data.numTokens);
     dim3 numBlocks(numBlocksX, numBlocksY);

     LAUNCH(data, permuteKernel, numBlocks, numThreads, 0, stream);
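The two hunks above apply the same std::min(8192, data.numTokens) clamp to numBlocksY, so presumably the scale-factor conversion and permute kernels stride over tokens in blockIdx.y the same way the sketch above strides in blockIdx.z.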
@@ -2093,10 +2093,11 @@ def launchTestJobs(pipeline, testFilter)

     multiNodesSBSAConfigs = [
         // Each stage test 1 testcase with 8 GPUs and 2 nodes.
-        "GB200-8_GPUs-2_Nodes-PyTorch-1": ["gb200-multi-node", "l0_gb200_multi_nodes", 1, 4, 8, 2],
-        "GB200-8_GPUs-2_Nodes-PyTorch-2": ["gb200-multi-node", "l0_gb200_multi_nodes", 2, 4, 8, 2],
-        "GB200-8_GPUs-2_Nodes-PyTorch-3": ["gb200-multi-node", "l0_gb200_multi_nodes", 3, 4, 8, 2],
-        "GB200-8_GPUs-2_Nodes-PyTorch-4": ["gb200-multi-node", "l0_gb200_multi_nodes", 4, 4, 8, 2],
+        "GB200-8_GPUs-2_Nodes-PyTorch-1": ["gb200-multi-node", "l0_gb200_multi_nodes", 1, 5, 8, 2],
+        "GB200-8_GPUs-2_Nodes-PyTorch-2": ["gb200-multi-node", "l0_gb200_multi_nodes", 2, 5, 8, 2],
+        "GB200-8_GPUs-2_Nodes-PyTorch-3": ["gb200-multi-node", "l0_gb200_multi_nodes", 3, 5, 8, 2],
+        "GB200-8_GPUs-2_Nodes-PyTorch-4": ["gb200-multi-node", "l0_gb200_multi_nodes", 4, 5, 8, 2],
+        "GB200-8_GPUs-2_Nodes-PyTorch-5": ["gb200-multi-node", "l0_gb200_multi_nodes", 5, 5, 8, 2],
         "GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-1": ["gb200-multi-node", "l0_gb200_multi_nodes", 1, 5, 8, 2],
         "GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-2": ["gb200-multi-node", "l0_gb200_multi_nodes", 2, 5, 8, 2],
         "GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-3": ["gb200-multi-node", "l0_gb200_multi_nodes", 3, 5, 8, 2],
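Reading the config tuples positionally, the fields appear to be [queue label, test list, stage index, stage count, GPU count, node count]; that interpretation is inferred from the naming, not documented in this diff. The pre-merge split grows from 4 stages to 5, with a new stage 5, making room for the extra test case this commit adds.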
@@ -1918,18 +1918,25 @@ class TestDeepSeekR1(LlmapiAccuracyTestHarness):
     @pytest.mark.skip_less_mpi_world_size(8)
     @skip_pre_hopper
     @pytest.mark.parametrize(
-        "tp_size,pp_size,ep_size,mtp_nextn,fp8kv,attention_dp,cuda_graph,overlap_scheduler,max_batch_size",
-        [(8, 1, 4, 3, False, False, True, True, 1),
-         (8, 1, 8, 0, True, True, True, True, 24),
-         (8, 1, 8, 1, True, True, True, True, 24)],
-        ids=["latency", "throughput", "throughput_mtp"])
+        "tp_size,pp_size,ep_size,mtp_nextn,fp8kv,attention_dp,cuda_graph,overlap_scheduler,max_batch_size,moe_backend",
+        [(8, 1, 4, 3, False, False, True, True, 1, "_DEFAULT"),
+         (8, 1, 8, 0, True, True, True, True, 24, "_DEFAULT"),
+         (8, 1, 8, 1, True, True, True, True, 24, "_DEFAULT"),
+         (8, 1, 8, 1, True, True, True, True, 24, "TRTLLM")],
+        ids=[
+            "latency", "throughput", "throughput_mtp", "throughput_mtp_trtllm"
+        ])
     def test_fp8_blockscale(self, tp_size, pp_size, ep_size, mtp_nextn, fp8kv,
                             attention_dp, cuda_graph, overlap_scheduler,
-                            max_batch_size):
+                            max_batch_size, moe_backend):

         if get_sm_version() == 100:
-            moe_config = MoeConfig(backend="DEEPGEMM", max_num_tokens=16384)
+            moe_backend = "DEEPGEMM" if moe_backend == "_DEFAULT" else moe_backend
+            moe_config = MoeConfig(backend=moe_backend, max_num_tokens=16384)
             kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6)
         else:
+            if moe_backend != "_DEFAULT":
+                pytest.skip("Not supported MoE backend!")
             moe_config = MoeConfig()
             kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9)
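The net effect: a moe_backend parameter threads through the existing parametrization, with the sentinel "_DEFAULT" preserving the old behavior. On SM 100 (Blackwell) the default resolves to DEEPGEMM and the new throughput_mtp_trtllm case runs with the TRTLLM backend explicitly; on other architectures any explicit backend is skipped, so the new case only executes on Blackwell-class hardware.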
@@ -71,6 +71,7 @@ l0_dgx_b200:
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_mtp] TIMEOUT (180)
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale[throughput] TIMEOUT (180)
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale[throughput_mtp] TIMEOUT (180)
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale[throughput_mtp_trtllm] TIMEOUT (180)
 - condition:
     ranges:
       system_gpu_count:
@@ -17,6 +17,7 @@ l0_gb200_multi_nodes:
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_mtp] TIMEOUT (180)
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale[throughput] TIMEOUT (180)
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale[throughput_mtp] TIMEOUT (180)
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale[throughput_mtp_trtllm] TIMEOUT (180)
 - condition:
     ranges:
     # 2 nodes with each node has 4 GPUs
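Both test lists (the single-node l0_dgx_b200 list and the multi-node l0_gb200_multi_nodes list) register the new throughput_mtp_trtllm case with the same 180-minute TIMEOUT as the neighboring fp8_blockscale entries; the multi-node addition is what the Jenkins stage-count increase from 4 to 5 accommodates.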