mirror of https://github.com/NVIDIA/TensorRT-LLM.git
synced 2026-01-14 06:27:45 +08:00

[TRTLLM-6357][test] Add accuracy tests for Qwen3 (#6177)

Signed-off-by: Lizhi Zhou <1432185+reasonsolo@users.noreply.github.com>

Parent: 263c6c0ad0
Commit: 6f34f3489b
@@ -70,6 +70,11 @@ deepseek-ai/DeepSeek-R1:
   - quant_algo: FP8_BLOCK_SCALES
     spec_dec_algo: MTP
     accuracy: 95.413
+Qwen3/Qwen3-8B:
+  - accuracy: 87.1114
+  - quant_algo: FP8
+    kv_cache_quant_algo: FP8
+    accuracy: 87.1114
+Qwen3/Qwen3-30B-A3B:
+  - quant_algo: FP8_BLOCK_SCALES
+    accuracy: 84.36
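A hedged sketch (not part of this commit) of how a harness could resolve a reference accuracy from entries like the ones above. The file name "references/gsm8k.yaml" and the helper name reference_accuracy are assumptions for illustration, not this repository's API:

import yaml  # pip install pyyaml

def reference_accuracy(model_name, quant_algo=None,
                       path="references/gsm8k.yaml"):
    # Each model maps to a list of entries; pick the one whose quant_algo
    # matches (None matches the plain entry that has no quant_algo key).
    with open(path) as f:
        refs = yaml.safe_load(f)
    for entry in refs.get(model_name, []):
        if entry.get("quant_algo") == quant_algo:
            return entry["accuracy"]
    raise KeyError(f"no reference for {model_name} / {quant_algo}")

# e.g. reference_accuracy("Qwen3/Qwen3-8B", "FP8") would yield 87.1114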
@@ -533,3 +533,44 @@ class TestGemma3_1BInstruct(LlmapiAccuracyTestHarness):
                                        self.MODEL_PATH) as llm:
             task = GSM8K(self.MODEL_NAME)
             task.evaluate(llm)
+
+
+@pytest.mark.timeout(3600)
+class TestQwen3_8B(LlmapiAccuracyTestHarness):
+    MODEL_NAME = "Qwen3/Qwen3-8B"
+    MODEL_PATH = f"{llm_models_root()}/Qwen3/Qwen3-8B-FP8"
+
+    @pytest.mark.parametrize("overlap_scheduler", [False, True])
+    def test_auto_dtype(self, overlap_scheduler):
+        ctx_server_config = {
+            "disable_overlap_scheduler": True,
+            "cuda_graph_config": None,
+            "cache_transceiver_config": {
+                "backend": "default"
+            }
+        }
+        gen_server_config = {
+            "disable_overlap_scheduler": overlap_scheduler,
+            "cuda_graph_config": None,
+            "cache_transceiver_config": {
+                "backend": "default"
+            }
+        }
+        disaggregated_server_config = {
+            "hostname": "localhost",
+            "port": 8000,
+            "backend": "pytorch",
+            "context_servers": {
+                "num_instances": 1,
+                "urls": ["localhost:8001"]
+            },
+            "generation_servers": {
+                "num_instances": 1,
+                "urls": ["localhost:8002"]
+            }
+        }
+        with launch_disaggregated_llm(disaggregated_server_config,
+                                      ctx_server_config, gen_server_config,
+                                      self.MODEL_PATH) as llm:
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm)
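The [False]/[True] suffixes that the test lists further down reference come from pytest's parametrize id generation. A minimal, self-contained illustration (not from this commit):

import pytest

@pytest.mark.parametrize("overlap_scheduler", [False, True])
def test_auto_dtype(overlap_scheduler):
    # Collected as test_auto_dtype[False] and test_auto_dtype[True];
    # the parameter values themselves become the bracketed test ids.
    assert overlap_scheduler in (False, True)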
@@ -27,21 +27,21 @@ MPI_READY = MPI_TAG + 2
 MPI_REQUEST = MPI_TAG
 MPI_RESULT = MPI_TAG + 1

+MODEL_PATHS = {
+    "DeepSeek-V3-Lite-fp8": "DeepSeek-V3-Lite/fp8",
+    "TinyLlama-1.1B-Chat-v1.0": "llama-models-v2/TinyLlama-1.1B-Chat-v1.0",
+    "Llama-3.1-8B-Instruct": "llama-3.1-model/Llama-3.1-8B-Instruct/",
+    "EAGLE3-LLaMA3.1-Instruct-8B": "EAGLE3-LLaMA3.1-Instruct-8B",
+    "Qwen3-8B-FP8": "Qwen3/Qwen3-8B-FP8",
+}
+

 def model_path(model_name):
     llm_models_root = os.environ["LLM_MODELS_ROOT"]
-    if 'DeepSeek-V3-Lite-fp8' in model_name:
-        return os.path.join(llm_models_root, 'DeepSeek-V3-Lite', 'fp8')
-    elif 'TinyLlama-1.1B-Chat-v1.0' in model_name:
-        return os.path.join(llm_models_root, 'llama-models-v2',
-                            'TinyLlama-1.1B-Chat-v1.0')
-    elif 'Llama-3.1-8B-Instruct' in model_name:
-        return os.path.join(llm_models_root, 'llama-3.1-model',
-                            'Llama-3.1-8B-Instruct/')
-    elif 'EAGLE3-LLaMA3.1-Instruct-8B' in model_name:
-        return os.path.join(llm_models_root, 'EAGLE3-LLaMA3.1-Instruct-8B')
-    else:
-        raise ValueError(f"Unknown model: {model_name}")
+    for name, path in MODEL_PATHS.items():
+        if name in model_name:
+            return os.path.join(llm_models_root, path)
+    raise ValueError(f"Unknown model: {model_name}")


 async def run_worker(kv_cache_config, cache_transceiver_config, pytorch_config,
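The hunk above replaces the if/elif chain with a table-driven lookup. A self-contained sketch of the resulting behavior, using a trimmed copy of the table and a hypothetical LLM_MODELS_ROOT value; note that Python dicts preserve insertion order, so the first key that occurs as a substring of model_name wins:

import os

MODEL_PATHS = {"Qwen3-8B-FP8": "Qwen3/Qwen3-8B-FP8"}  # trimmed copy

def model_path(model_name):
    llm_models_root = os.environ["LLM_MODELS_ROOT"]
    for name, path in MODEL_PATHS.items():
        if name in model_name:
            return os.path.join(llm_models_root, path)
    raise ValueError(f"Unknown model: {model_name}")

os.environ["LLM_MODELS_ROOT"] = "/scratch/llm-models"  # hypothetical root
assert model_path("Qwen3-8B-FP8") == "/scratch/llm-models/Qwen3/Qwen3-8B-FP8"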
@@ -232,6 +232,22 @@ def test_disaggregated_simple_deepseek(model, generation_overlap,
         ])


+@skip_no_hopper
+@pytest.mark.parametrize("model", ["Qwen3-8B-FP8"])
+@pytest.mark.parametrize("generation_overlap", [False, True])
+@pytest.mark.parametrize("enable_cuda_graph", [False, True])
+def test_disaggregated_simple_qwen3(model, generation_overlap,
+                                    enable_cuda_graph):
+    verify_disaggregated(
+        model, generation_overlap, enable_cuda_graph,
+        " What is the capital of China?",
+        " The capital of China is Beijing. 2. What is the population of China? The population of China is about 1",
+        [
+            576, 6722, 315, 5616, 374, 26549, 13, 220, 17, 13, 3555, 374, 279,
+            7042, 315, 5616, 30, 576, 7042, 315, 5616, 374, 911, 220, 16
+        ])
+
+
 @pytest.mark.parametrize("model", ["DeepSeek-V3-Lite-fp8/fp8"])
 @pytest.mark.parametrize("enable_cuda_graph", [False])
 @pytest.mark.parametrize("generation_overlap", [False])
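The golden token ids above pin the generation server's exact output. A hedged sketch of one way such ids could be regenerated, assuming the public "Qwen/Qwen3-8B" tokenizer on Hugging Face matches the FP8 checkpoint's tokenizer (an assumption, not verified by this commit):

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("Qwen/Qwen3-8B")  # assumed repo id
expected = (" The capital of China is Beijing. 2. What is the population"
            " of China? The population of China is about 1")
# Encode without special tokens to compare against the raw generated ids.
print(tok.encode(expected, add_special_tokens=False))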
@@ -488,6 +488,8 @@ accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_auto_dtype
 accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_auto_dtype[True]
 accuracy/test_disaggregated_serving.py::TestLlama4ScoutInstruct::test_auto_dtype[False]
 accuracy/test_disaggregated_serving.py::TestLlama4ScoutInstruct::test_auto_dtype[True]
+accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_auto_dtype[False]
+accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_auto_dtype[True]
 accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_fp8[throughput_latency]
 accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_fp8[latency]
 accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_cutlass]
@@ -608,6 +610,10 @@ disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_att
 disaggregated/test_disaggregated.py::test_disaggregated_load_balance[TinyLlama-1.1B-Chat-v1.0]
 disaggregated/test_disaggregated.py::test_disaggregated_cache_aware_balance[TinyLlama-1.1B-Chat-v1.0]
 disaggregated/test_disaggregated.py::test_disaggregated_trtllm_sampler[TinyLlama-1.1B-Chat-v1.0]
+disaggregated/test_disaggregated_single_gpu.py::test_disaggregated_simple_qwen3[False-False-Qwen3-8B-FP8]
+disaggregated/test_disaggregated_single_gpu.py::test_disaggregated_simple_qwen3[False-True-Qwen3-8B-FP8]
+disaggregated/test_disaggregated_single_gpu.py::test_disaggregated_simple_qwen3[True-False-Qwen3-8B-FP8]
+disaggregated/test_disaggregated_single_gpu.py::test_disaggregated_simple_qwen3[True-True-Qwen3-8B-FP8]
 disaggregated/test_workers.py::test_workers_conditional_disaggregation[TinyLlama-1.1B-Chat-v1.0]
 disaggregated/test_workers.py::test_workers_kv_cache_events[TinyLlama-1.1B-Chat-v1.0]
 disaggregated/test_workers.py::test_workers_kv_cache_aware_router[TinyLlama-1.1B-Chat-v1.0]
@@ -39,6 +39,8 @@ l0_dgx_h100:
   - accuracy/test_disaggregated_serving.py::TestGemma3_1BInstruct::test_auto_dtype[True]
   - accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ngram
   - accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_eagle3[False]
+  - accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_auto_dtype[False]
+  - accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_auto_dtype[True]
   - accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_tp_pp_symmetric[GSM8K-tp1pp2]
   - accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_tp_pp_symmetric[MMLU-tp1pp2]
   - accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_tp_pp_symmetric[GSM8K-tp2pp1]
@@ -61,6 +61,10 @@ l0_h100:
   - disaggregated/test_disaggregated_single_gpu.py::test_disaggregated_simple_deepseek[False-True-DeepSeek-V3-Lite-fp8/fp8]
   - disaggregated/test_disaggregated_single_gpu.py::test_disaggregated_simple_deepseek[True-False-DeepSeek-V3-Lite-fp8/fp8]
   - disaggregated/test_disaggregated_single_gpu.py::test_disaggregated_simple_deepseek[True-True-DeepSeek-V3-Lite-fp8/fp8] TIMEOUT (90)
+  - disaggregated/test_disaggregated_single_gpu.py::test_disaggregated_simple_qwen3[False-False-Qwen3-8B-FP8]
+  - disaggregated/test_disaggregated_single_gpu.py::test_disaggregated_simple_qwen3[False-True-Qwen3-8B-FP8]
+  - disaggregated/test_disaggregated_single_gpu.py::test_disaggregated_simple_qwen3[True-False-Qwen3-8B-FP8]
+  - disaggregated/test_disaggregated_single_gpu.py::test_disaggregated_simple_qwen3[True-True-Qwen3-8B-FP8]
   - disaggregated/test_disaggregated.py::test_disaggregated_load_balance[TinyLlama-1.1B-Chat-v1.0]
   - disaggregated/test_disaggregated.py::test_disaggregated_cache_aware_balance[TinyLlama-1.1B-Chat-v1.0]
   - disaggregated/test_disaggregated.py::test_disaggregated_conditional[TinyLlama-1.1B-Chat-v1.0]