mirror of
https://github.com/NVIDIA/TensorRT-LLM.git
synced 2026-01-14 06:27:45 +08:00
[https://nvbugs/5698434][test] add qwen3-4b accuracy test case (#10382)
Signed-off-by: Ivy Zhang <25222398+crazydemo@users.noreply.github.com>
This commit is contained in:
parent
6095c80e56
commit
4a1b2e23b3
@ -106,6 +106,9 @@ deepseek-ai/DeepSeek-V3.2-Exp:
|
||||
- quant_algo: NVFP4
|
||||
spec_dec_algo: MTP
|
||||
accuracy: 95.6
|
||||
Qwen3/Qwen3-4B:
|
||||
- spec_dec_algo: Eagle
|
||||
accuracy: 85.823
|
||||
Qwen3/Qwen3-8B:
|
||||
- accuracy: 87.1114
|
||||
- spec_dec_algo: Eagle
|
||||
|
||||
@ -3327,6 +3327,35 @@ class TestQwen2_7BInstruct(LlmapiAccuracyTestHarness):
|
||||
extra_evaluator_kwargs=self.EXTRA_EVALUATOR_KWARGS)
|
||||
|
||||
|
||||
class TestQwen3_4B(LlmapiAccuracyTestHarness):
    """Accuracy test cases for the Qwen3/Qwen3-4B model."""

    MODEL_NAME = "Qwen3/Qwen3-4B"

    def test_eagle3(self):
        """Run the GSM8K task on Qwen3-4B with an Eagle speculative config.

        RCCA: https://nvbugspro.nvidia.com/bug/5698434
        """
        # Draft (speculative) checkpoint and target checkpoint both live
        # under the shared models root.
        draft_dir = f"{llm_models_root()}/Qwen3/Qwen3-4B_eagle3/"
        target_dir = f"{llm_models_root()}/Qwen3/Qwen3-4B"

        # Everything except ``model`` that is forwarded to ``LLM``; the key
        # order mirrors the keyword order of the constructor call.
        llm_kwargs = {
            "disable_overlap_scheduler": True,
            "cuda_graph_config": CudaGraphConfig(),
            "kv_cache_config": KvCacheConfig(
                enable_block_reuse=False,
                free_gpu_memory_fraction=0.6,
            ),
            "speculative_config": EagleDecodingConfig(
                max_draft_len=3,
                speculative_model_dir=draft_dir,
            ),
        }

        with LLM(model=target_dir, **llm_kwargs) as llm:
            GSM8K(self.MODEL_NAME).evaluate(llm)
|
||||
|
||||
|
||||
class TestQwen3_8B(LlmapiAccuracyTestHarness):
|
||||
MODEL_NAME = "Qwen3/Qwen3-8B"
|
||||
|
||||
|
||||
@ -525,6 +525,7 @@ accuracy/test_llm_api_pytorch.py::TestGLM4_6::test_nvfp4_multi_gpus[throughput]
|
||||
accuracy/test_llm_api_pytorch.py::TestGLM4_6::test_nvfp4_multi_gpus[throughput_trtllm]
|
||||
accuracy/test_llm_api_pytorch.py::TestGLM4_6::test_nvfp4_2_model_mtp[2model]
|
||||
accuracy/test_llm_api_pytorch.py::TestGLM4_6::test_nvfp4_2_model_mtp[2model_trtllm]
|
||||
accuracy/test_llm_api_pytorch.py::TestQwen3_4B::test_eagle3
|
||||
accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_fp8_block_scales[latency]
|
||||
accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_bf16[multi_gpus_no_cache]
|
||||
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_fp8_block_scales[latency-torch_compile=False]
|
||||
|
||||
@ -229,6 +229,7 @@ accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[mxfp8-laten
|
||||
accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_fp8_block_scales[latency]
|
||||
accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_w4a8_mxfp4[fp8-latency]
|
||||
accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_w4a8_mxfp4[mxfp8-latency]
|
||||
accuracy/test_llm_api_pytorch.py::TestQwen3_4B::test_eagle3
|
||||
accuracy/test_llm_api_pytorch.py::TestSeedOss_36B::test_auto_dtype
|
||||
|
||||
accuracy/test_llm_api_pytorch_multimodal.py::TestQwen2_VL_7B::test_auto_dtype
|
||||
|
||||
Loading…
Reference in New Issue
Block a user