[TRTLLM-6357][test] Add accuracy tests for Qwen3 (#6177)

Signed-off-by: Lizhi Zhou <1432185+reasonsolo@users.noreply.github.com>
Lizhi Zhou authored 2025-08-02 01:33:34 +08:00, committed by GitHub
parent 263c6c0ad0
commit 6f34f3489b
6 changed files with 86 additions and 12 deletions


@@ -70,6 +70,11 @@ deepseek-ai/DeepSeek-R1:
   - quant_algo: FP8_BLOCK_SCALES
     spec_dec_algo: MTP
     accuracy: 95.413
+Qwen3/Qwen3-8B:
+  - accuracy: 87.1114
+  - quant_algo: FP8
+    kv_cache_quant_algo: FP8
+    accuracy: 87.1114
 Qwen3/Qwen3-30B-A3B:
   - quant_algo: FP8_BLOCK_SCALES
     accuracy: 84.36
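A note on how entries like these are consumed: each model maps to a list of reference entries keyed by quantization settings, and the evaluator picks the entry matching the run. Below is a minimal sketch of that lookup; the file path and helper name are assumptions for illustration, since the accuracy harness resolves its reference files (e.g. a GSM8K reference file) internally.

import yaml  # PyYAML

# Hypothetical path; shown only to illustrate the entry layout above.
with open("references/gsm8k.yaml") as f:
    references = yaml.safe_load(f)

def reference_accuracy(model, quant_algo=None, kv_cache_quant_algo=None):
    """Return the reference score whose quantization fields match the run."""
    for entry in references[model]:
        if (entry.get("quant_algo") == quant_algo
                and entry.get("kv_cache_quant_algo") == kv_cache_quant_algo):
            return entry["accuracy"]
    raise KeyError(f"no reference for {model} with quant_algo={quant_algo}")

print(reference_accuracy("Qwen3/Qwen3-8B"))                # unquantized entry -> 87.1114
print(reference_accuracy("Qwen3/Qwen3-8B", "FP8", "FP8"))  # FP8 entry -> 87.1114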


@@ -533,3 +533,44 @@ class TestGemma3_1BInstruct(LlmapiAccuracyTestHarness):
                          self.MODEL_PATH) as llm:
             task = GSM8K(self.MODEL_NAME)
             task.evaluate(llm)
+
+
+@pytest.mark.timeout(3600)
+class TestQwen3_8B(LlmapiAccuracyTestHarness):
+    MODEL_NAME = "Qwen3/Qwen3-8B"
+    MODEL_PATH = f"{llm_models_root()}/Qwen3/Qwen3-8B-FP8"
+
+    @pytest.mark.parametrize("overlap_scheduler", [False, True])
+    def test_auto_dtype(self, overlap_scheduler):
+        ctx_server_config = {
+            "disable_overlap_scheduler": True,
+            "cuda_graph_config": None,
+            "cache_transceiver_config": {
+                "backend": "default"
+            }
+        }
+        gen_server_config = {
+            "disable_overlap_scheduler": overlap_scheduler,
+            "cuda_graph_config": None,
+            "cache_transceiver_config": {
+                "backend": "default"
+            }
+        }
+        disaggregated_server_config = {
+            "hostname": "localhost",
+            "port": 8000,
+            "backend": "pytorch",
+            "context_servers": {
+                "num_instances": 1,
+                "urls": ["localhost:8001"]
+            },
+            "generation_servers": {
+                "num_instances": 1,
+                "urls": ["localhost:8002"]
+            }
+        }
+        with launch_disaggregated_llm(disaggregated_server_config,
+                                      ctx_server_config, gen_server_config,
+                                      self.MODEL_PATH) as llm:
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm)
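For orientation, here is what this topology looks like from the client side: the front end on port 8000 routes each request through the context server on 8001 for prefill and the generation server on 8002 for decode. The sketch below assumes the front end started by launch_disaggregated_llm exposes an OpenAI-compatible /v1/completions route (an assumption about its wiring; the harness normally drives the endpoint for you).

import requests

# Hypothetical manual query against the proxy started by the test above.
resp = requests.post(
    "http://localhost:8000/v1/completions",
    json={
        "model": "Qwen3/Qwen3-8B",
        "prompt": "What is the capital of China?",
        "max_tokens": 32,
    },
    timeout=60,
)
resp.raise_for_status()
print(resp.json()["choices"][0]["text"])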


@@ -27,21 +27,21 @@ MPI_READY = MPI_TAG + 2
 MPI_REQUEST = MPI_TAG
 MPI_RESULT = MPI_TAG + 1

+MODEL_PATHS = {
+    "DeepSeek-V3-Lite-fp8": "DeepSeek-V3-Lite/fp8",
+    "TinyLlama-1.1B-Chat-v1.0": "llama-models-v2/TinyLlama-1.1B-Chat-v1.0",
+    "Llama-3.1-8B-Instruct": "llama-3.1-model/Llama-3.1-8B-Instruct/",
+    "EAGLE3-LLaMA3.1-Instruct-8B": "EAGLE3-LLaMA3.1-Instruct-8B",
+    "Qwen3-8B-FP8": "Qwen3/Qwen3-8B-FP8",
+}
+

 def model_path(model_name):
     llm_models_root = os.environ["LLM_MODELS_ROOT"]
-    if 'DeepSeek-V3-Lite-fp8' in model_name:
-        return os.path.join(llm_models_root, 'DeepSeek-V3-Lite', 'fp8')
-    elif 'TinyLlama-1.1B-Chat-v1.0' in model_name:
-        return os.path.join(llm_models_root, 'llama-models-v2',
-                            'TinyLlama-1.1B-Chat-v1.0')
-    elif 'Llama-3.1-8B-Instruct' in model_name:
-        return os.path.join(llm_models_root, 'llama-3.1-model',
-                            'Llama-3.1-8B-Instruct/')
-    elif 'EAGLE3-LLaMA3.1-Instruct-8B' in model_name:
-        return os.path.join(llm_models_root, 'EAGLE3-LLaMA3.1-Instruct-8B')
-    else:
-        raise ValueError(f"Unknown model: {model_name}")
+    for name, path in MODEL_PATHS.items():
+        if name in model_name:
+            return os.path.join(llm_models_root, path)
+    raise ValueError(f"Unknown model: {model_name}")


 async def run_worker(kv_cache_config, cache_transceiver_config, pytorch_config,
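Usage sketch for the refactored helper (the root path below is an assumption): the lookup is by substring, so any test parameter containing a known key resolves to its subdirectory, and dictionary order decides ties.

import os

os.environ["LLM_MODELS_ROOT"] = "/scratch/llm-models"  # assumed cache location
print(model_path("Qwen3-8B-FP8"))
# -> /scratch/llm-models/Qwen3/Qwen3-8B-FP8
print(model_path("DeepSeek-V3-Lite-fp8/fp8"))
# -> /scratch/llm-models/DeepSeek-V3-Lite/fp8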
@@ -232,6 +232,22 @@ def test_disaggregated_simple_deepseek(model, generation_overlap,
         ])


+@skip_no_hopper
+@pytest.mark.parametrize("model", ["Qwen3-8B-FP8"])
+@pytest.mark.parametrize("generation_overlap", [False, True])
+@pytest.mark.parametrize("enable_cuda_graph", [False, True])
+def test_disaggregated_simple_qwen3(model, generation_overlap,
+                                    enable_cuda_graph):
+    verify_disaggregated(
+        model, generation_overlap, enable_cuda_graph,
+        " What is the capital of China?",
+        " The capital of China is Beijing. 2. What is the population of China? The population of China is about 1",
+        [
+            576, 6722, 315, 5616, 374, 26549, 13, 220, 17, 13, 3555, 374, 279,
+            7042, 315, 5616, 30, 576, 7042, 315, 5616, 374, 911, 220, 16
+        ])
+
+
 @pytest.mark.parametrize("model", ["DeepSeek-V3-Lite-fp8/fp8"])
 @pytest.mark.parametrize("enable_cuda_graph", [False])
 @pytest.mark.parametrize("generation_overlap", [False])


@@ -488,6 +488,8 @@ accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_auto_dtype
 accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_auto_dtype[True]
 accuracy/test_disaggregated_serving.py::TestLlama4ScoutInstruct::test_auto_dtype[False]
 accuracy/test_disaggregated_serving.py::TestLlama4ScoutInstruct::test_auto_dtype[True]
+accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_auto_dtype[False]
+accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_auto_dtype[True]
 accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_fp8[throughput_latency]
 accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_fp8[latency]
 accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_cutlass]
@@ -608,6 +610,10 @@ disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_att
 disaggregated/test_disaggregated.py::test_disaggregated_load_balance[TinyLlama-1.1B-Chat-v1.0]
 disaggregated/test_disaggregated.py::test_disaggregated_cache_aware_balance[TinyLlama-1.1B-Chat-v1.0]
 disaggregated/test_disaggregated.py::test_disaggregated_trtllm_sampler[TinyLlama-1.1B-Chat-v1.0]
+disaggregated/test_disaggregated_single_gpu.py::test_disaggregated_simple_qwen3[False-False-Qwen3-8B-FP8]
+disaggregated/test_disaggregated_single_gpu.py::test_disaggregated_simple_qwen3[False-True-Qwen3-8B-FP8]
+disaggregated/test_disaggregated_single_gpu.py::test_disaggregated_simple_qwen3[True-False-Qwen3-8B-FP8]
+disaggregated/test_disaggregated_single_gpu.py::test_disaggregated_simple_qwen3[True-True-Qwen3-8B-FP8]
 disaggregated/test_workers.py::test_workers_conditional_disaggregation[TinyLlama-1.1B-Chat-v1.0]
 disaggregated/test_workers.py::test_workers_kv_cache_events[TinyLlama-1.1B-Chat-v1.0]
 disaggregated/test_workers.py::test_workers_kv_cache_aware_router[TinyLlama-1.1B-Chat-v1.0]


@@ -39,6 +39,8 @@ l0_dgx_h100:
   - accuracy/test_disaggregated_serving.py::TestGemma3_1BInstruct::test_auto_dtype[True]
   - accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ngram
   - accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_eagle3[False]
+  - accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_auto_dtype[False]
+  - accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_auto_dtype[True]
   - accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_tp_pp_symmetric[GSM8K-tp1pp2]
   - accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_tp_pp_symmetric[MMLU-tp1pp2]
   - accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_tp_pp_symmetric[GSM8K-tp2pp1]


@@ -61,6 +61,10 @@ l0_h100:
   - disaggregated/test_disaggregated_single_gpu.py::test_disaggregated_simple_deepseek[False-True-DeepSeek-V3-Lite-fp8/fp8]
   - disaggregated/test_disaggregated_single_gpu.py::test_disaggregated_simple_deepseek[True-False-DeepSeek-V3-Lite-fp8/fp8]
   - disaggregated/test_disaggregated_single_gpu.py::test_disaggregated_simple_deepseek[True-True-DeepSeek-V3-Lite-fp8/fp8] TIMEOUT (90)
+  - disaggregated/test_disaggregated_single_gpu.py::test_disaggregated_simple_qwen3[False-False-Qwen3-8B-FP8]
+  - disaggregated/test_disaggregated_single_gpu.py::test_disaggregated_simple_qwen3[False-True-Qwen3-8B-FP8]
+  - disaggregated/test_disaggregated_single_gpu.py::test_disaggregated_simple_qwen3[True-False-Qwen3-8B-FP8]
+  - disaggregated/test_disaggregated_single_gpu.py::test_disaggregated_simple_qwen3[True-True-Qwen3-8B-FP8]
   - disaggregated/test_disaggregated.py::test_disaggregated_load_balance[TinyLlama-1.1B-Chat-v1.0]
   - disaggregated/test_disaggregated.py::test_disaggregated_cache_aware_balance[TinyLlama-1.1B-Chat-v1.0]
   - disaggregated/test_disaggregated.py::test_disaggregated_conditional[TinyLlama-1.1B-Chat-v1.0]
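To reproduce a single entry from these lists locally, the bracketed test IDs can be passed straight to pytest. A minimal sketch (run from the integration-test directory, an assumption about the working directory; LLM_MODELS_ROOT must point at the model cache):

import pytest

# Select one parametrization of the new Qwen3 disaggregated test by its ID.
pytest.main([
    "-q",
    "disaggregated/test_disaggregated_single_gpu.py"
    "::test_disaggregated_simple_qwen3[False-False-Qwen3-8B-FP8]",
])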