[TRTLLM-6357][test] Add accuracy tests for Qwen3 (#6177)

Signed-off-by: Lizhi Zhou <1432185+reasonsolo@users.noreply.github.com>
Lizhi Zhou authored 2025-08-02 01:33:34 +08:00, committed by GitHub
parent 263c6c0ad0
commit 6f34f3489b
6 changed files with 86 additions and 12 deletions


@@ -70,6 +70,11 @@ deepseek-ai/DeepSeek-R1:
   - quant_algo: FP8_BLOCK_SCALES
     spec_dec_algo: MTP
     accuracy: 95.413
+Qwen3/Qwen3-8B:
+  - accuracy: 87.1114
+  - quant_algo: FP8
+    kv_cache_quant_algo: FP8
+    accuracy: 87.1114
 Qwen3/Qwen3-30B-A3B:
   - quant_algo: FP8_BLOCK_SCALES
     accuracy: 84.36
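A note on how entries like these are consumed: each model maps to a list of reference entries keyed by quantization settings, and the evaluator picks the entry matching the run. Below is a minimal sketch of that lookup; the file path and helper name are assumptions for illustration, since the accuracy harness resolves its reference files (e.g. a GSM8K reference file) internally.

import yaml  # PyYAML

# Hypothetical path; shown only to illustrate the entry layout above.
with open("references/gsm8k.yaml") as f:
    references = yaml.safe_load(f)

def reference_accuracy(model, quant_algo=None, kv_cache_quant_algo=None):
    """Return the reference score whose quantization fields match the run."""
    for entry in references[model]:
        if (entry.get("quant_algo") == quant_algo
                and entry.get("kv_cache_quant_algo") == kv_cache_quant_algo):
            return entry["accuracy"]
    raise KeyError(f"no reference for {model} with quant_algo={quant_algo}")

print(reference_accuracy("Qwen3/Qwen3-8B"))                # unquantized entry -> 87.1114
print(reference_accuracy("Qwen3/Qwen3-8B", "FP8", "FP8"))  # FP8 entry -> 87.1114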


@@ -533,3 +533,44 @@ class TestGemma3_1BInstruct(LlmapiAccuracyTestHarness):
                          self.MODEL_PATH) as llm:
             task = GSM8K(self.MODEL_NAME)
             task.evaluate(llm)
+
+
+@pytest.mark.timeout(3600)
+class TestQwen3_8B(LlmapiAccuracyTestHarness):
+    MODEL_NAME = "Qwen3/Qwen3-8B"
+    MODEL_PATH = f"{llm_models_root()}/Qwen3/Qwen3-8B-FP8"
+
+    @pytest.mark.parametrize("overlap_scheduler", [False, True])
+    def test_auto_dtype(self, overlap_scheduler):
+        ctx_server_config = {
+            "disable_overlap_scheduler": True,
+            "cuda_graph_config": None,
+            "cache_transceiver_config": {
+                "backend": "default"
+            }
+        }
+        gen_server_config = {
+            "disable_overlap_scheduler": overlap_scheduler,
+            "cuda_graph_config": None,
+            "cache_transceiver_config": {
+                "backend": "default"
+            }
+        }
+        disaggregated_server_config = {
+            "hostname": "localhost",
+            "port": 8000,
+            "backend": "pytorch",
+            "context_servers": {
+                "num_instances": 1,
+                "urls": ["localhost:8001"]
+            },
+            "generation_servers": {
+                "num_instances": 1,
+                "urls": ["localhost:8002"]
+            }
+        }
+        with launch_disaggregated_llm(disaggregated_server_config,
+                                      ctx_server_config, gen_server_config,
+                                      self.MODEL_PATH) as llm:
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm)
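For orientation, here is what this topology looks like from the client side: the front end on port 8000 routes each request through the context server on 8001 for prefill and the generation server on 8002 for decode. The sketch below assumes the front end started by launch_disaggregated_llm exposes an OpenAI-compatible /v1/completions route (an assumption about its wiring; the harness normally drives the endpoint for you).

import requests

# Hypothetical manual query against the proxy started by the test above.
resp = requests.post(
    "http://localhost:8000/v1/completions",
    json={
        "model": "Qwen3/Qwen3-8B",
        "prompt": "What is the capital of China?",
        "max_tokens": 32,
    },
    timeout=60,
)
resp.raise_for_status()
print(resp.json()["choices"][0]["text"])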


@@ -27,21 +27,21 @@ MPI_READY = MPI_TAG + 2
 MPI_REQUEST = MPI_TAG
 MPI_RESULT = MPI_TAG + 1

+MODEL_PATHS = {
+    "DeepSeek-V3-Lite-fp8": "DeepSeek-V3-Lite/fp8",
+    "TinyLlama-1.1B-Chat-v1.0": "llama-models-v2/TinyLlama-1.1B-Chat-v1.0",
+    "Llama-3.1-8B-Instruct": "llama-3.1-model/Llama-3.1-8B-Instruct/",
+    "EAGLE3-LLaMA3.1-Instruct-8B": "EAGLE3-LLaMA3.1-Instruct-8B",
+    "Qwen3-8B-FP8": "Qwen3/Qwen3-8B-FP8",
+}
+

 def model_path(model_name):
     llm_models_root = os.environ["LLM_MODELS_ROOT"]
-    if 'DeepSeek-V3-Lite-fp8' in model_name:
-        return os.path.join(llm_models_root, 'DeepSeek-V3-Lite', 'fp8')
-    elif 'TinyLlama-1.1B-Chat-v1.0' in model_name:
-        return os.path.join(llm_models_root, 'llama-models-v2',
-                            'TinyLlama-1.1B-Chat-v1.0')
-    elif 'Llama-3.1-8B-Instruct' in model_name:
-        return os.path.join(llm_models_root, 'llama-3.1-model',
-                            'Llama-3.1-8B-Instruct/')
-    elif 'EAGLE3-LLaMA3.1-Instruct-8B' in model_name:
-        return os.path.join(llm_models_root, 'EAGLE3-LLaMA3.1-Instruct-8B')
-    else:
-        raise ValueError(f"Unknown model: {model_name}")
+    for name, path in MODEL_PATHS.items():
+        if name in model_name:
+            return os.path.join(llm_models_root, path)
+    raise ValueError(f"Unknown model: {model_name}")


 async def run_worker(kv_cache_config, cache_transceiver_config, pytorch_config,
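Usage sketch for the refactored helper (the root path below is an assumption): the lookup is by substring, so any test parameter containing a known key resolves to its subdirectory, and dictionary order decides ties.

import os

os.environ["LLM_MODELS_ROOT"] = "/scratch/llm-models"  # assumed cache location
print(model_path("Qwen3-8B-FP8"))
# -> /scratch/llm-models/Qwen3/Qwen3-8B-FP8
print(model_path("DeepSeek-V3-Lite-fp8/fp8"))
# -> /scratch/llm-models/DeepSeek-V3-Lite/fp8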
@@ -232,6 +232,22 @@ def test_disaggregated_simple_deepseek(model, generation_overlap,
         ])


+@skip_no_hopper
+@pytest.mark.parametrize("model", ["Qwen3-8B-FP8"])
+@pytest.mark.parametrize("generation_overlap", [False, True])
+@pytest.mark.parametrize("enable_cuda_graph", [False, True])
+def test_disaggregated_simple_qwen3(model, generation_overlap,
+                                    enable_cuda_graph):
+    verify_disaggregated(
+        model, generation_overlap, enable_cuda_graph,
+        " What is the capital of China?",
+        " The capital of China is Beijing. 2. What is the population of China? The population of China is about 1",
+        [
+            576, 6722, 315, 5616, 374, 26549, 13, 220, 17, 13, 3555, 374, 279,
+            7042, 315, 5616, 30, 576, 7042, 315, 5616, 374, 911, 220, 16
+        ])
+
+
 @pytest.mark.parametrize("model", ["DeepSeek-V3-Lite-fp8/fp8"])
 @pytest.mark.parametrize("enable_cuda_graph", [False])
 @pytest.mark.parametrize("generation_overlap", [False])


@@ -488,6 +488,8 @@ accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_auto_dtype
 accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_auto_dtype[True]
 accuracy/test_disaggregated_serving.py::TestLlama4ScoutInstruct::test_auto_dtype[False]
 accuracy/test_disaggregated_serving.py::TestLlama4ScoutInstruct::test_auto_dtype[True]
+accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_auto_dtype[False]
+accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_auto_dtype[True]
 accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_fp8[throughput_latency]
 accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_fp8[latency]
 accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_cutlass]
@@ -608,6 +610,10 @@ disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_att
 disaggregated/test_disaggregated.py::test_disaggregated_load_balance[TinyLlama-1.1B-Chat-v1.0]
 disaggregated/test_disaggregated.py::test_disaggregated_cache_aware_balance[TinyLlama-1.1B-Chat-v1.0]
 disaggregated/test_disaggregated.py::test_disaggregated_trtllm_sampler[TinyLlama-1.1B-Chat-v1.0]
+disaggregated/test_disaggregated_single_gpu.py::test_disaggregated_simple_qwen3[False-False-Qwen3-8B-FP8]
+disaggregated/test_disaggregated_single_gpu.py::test_disaggregated_simple_qwen3[False-True-Qwen3-8B-FP8]
+disaggregated/test_disaggregated_single_gpu.py::test_disaggregated_simple_qwen3[True-False-Qwen3-8B-FP8]
+disaggregated/test_disaggregated_single_gpu.py::test_disaggregated_simple_qwen3[True-True-Qwen3-8B-FP8]
 disaggregated/test_workers.py::test_workers_conditional_disaggregation[TinyLlama-1.1B-Chat-v1.0]
 disaggregated/test_workers.py::test_workers_kv_cache_events[TinyLlama-1.1B-Chat-v1.0]
 disaggregated/test_workers.py::test_workers_kv_cache_aware_router[TinyLlama-1.1B-Chat-v1.0]


@@ -39,6 +39,8 @@ l0_dgx_h100:
   - accuracy/test_disaggregated_serving.py::TestGemma3_1BInstruct::test_auto_dtype[True]
   - accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ngram
   - accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_eagle3[False]
+  - accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_auto_dtype[False]
+  - accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_auto_dtype[True]
   - accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_tp_pp_symmetric[GSM8K-tp1pp2]
   - accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_tp_pp_symmetric[MMLU-tp1pp2]
   - accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_tp_pp_symmetric[GSM8K-tp2pp1]


@@ -61,6 +61,10 @@ l0_h100:
   - disaggregated/test_disaggregated_single_gpu.py::test_disaggregated_simple_deepseek[False-True-DeepSeek-V3-Lite-fp8/fp8]
   - disaggregated/test_disaggregated_single_gpu.py::test_disaggregated_simple_deepseek[True-False-DeepSeek-V3-Lite-fp8/fp8]
   - disaggregated/test_disaggregated_single_gpu.py::test_disaggregated_simple_deepseek[True-True-DeepSeek-V3-Lite-fp8/fp8] TIMEOUT (90)
+  - disaggregated/test_disaggregated_single_gpu.py::test_disaggregated_simple_qwen3[False-False-Qwen3-8B-FP8]
+  - disaggregated/test_disaggregated_single_gpu.py::test_disaggregated_simple_qwen3[False-True-Qwen3-8B-FP8]
+  - disaggregated/test_disaggregated_single_gpu.py::test_disaggregated_simple_qwen3[True-False-Qwen3-8B-FP8]
+  - disaggregated/test_disaggregated_single_gpu.py::test_disaggregated_simple_qwen3[True-True-Qwen3-8B-FP8]
   - disaggregated/test_disaggregated.py::test_disaggregated_load_balance[TinyLlama-1.1B-Chat-v1.0]
   - disaggregated/test_disaggregated.py::test_disaggregated_cache_aware_balance[TinyLlama-1.1B-Chat-v1.0]
   - disaggregated/test_disaggregated.py::test_disaggregated_conditional[TinyLlama-1.1B-Chat-v1.0]
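To reproduce a single entry from these lists locally, the bracketed test IDs can be passed straight to pytest. A minimal sketch (run from the integration-test directory, an assumption about the working directory; LLM_MODELS_ROOT must point at the model cache):

import pytest

# Select one parametrization of the new Qwen3 disaggregated test by its ID.
pytest.main([
    "-q",
    "disaggregated/test_disaggregated_single_gpu.py"
    "::test_disaggregated_simple_qwen3[False-False-Qwen3-8B-FP8]",
])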