[None][infra] Enable accuracy test for eagle3 and chunked prefill (#6386)

Signed-off-by: leslie-fang25 <leslief@nvidia.com>
Author: Leslie Fang, 2025-08-04 13:45:24 +08:00 (committed by GitHub)
parent 4763e94156
commit a60190836c
3 changed files with 12 additions and 5 deletions

File 1 of 3: feature support matrix (documentation table)

@@ -8,8 +8,8 @@
 | Disaggregated Serving | Yes | Yes | Yes | --- | | | | | | | | | | |
 | Chunked Prefill | Yes | Yes | Yes | Untested | --- | | | | | | | | | |
 | MTP | Yes | Yes | Yes | Yes | Untested | --- | | | | | | | | |
-| EAGLE-3(One Model Engine) | Yes | Yes | Yes | No | Untested | No | --- | | | | | | | |
-| EAGLE-3(Two Model Engine) | NO | Yes | Yes | No | Untested | No | No | --- | | | | | | |
+| EAGLE-3(One Model Engine) | Yes | Yes | Yes | No | Yes | No | --- | | | | | | | |
+| EAGLE-3(Two Model Engine) | NO | Yes | Yes | No | Yes | No | No | --- | | | | | | |
 | Torch Sampler | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | --- | | | | | |
 | TLLM C++ Sampler | Yes | Yes | Yes | Yes | Yes | No | No | No | No | --- | | | | |
 | KV Cache Reuse | Yes | Yes | Yes | Untested | Yes | Untested | Yes | No | Yes | Yes | --- | | | |

File 2 of 3: accuracy/test_llm_api_pytorch.py

@@ -1955,7 +1955,9 @@ class TestQwen3_8B(LlmapiAccuracyTestHarness):
         task = MMLU(self.MODEL_NAME)
         task.evaluate(llm)
 
-    def test_eagle3(self):
+    @parametrize_with_ids("eagle3_one_model", [True, False])
+    @parametrize_with_ids("enable_chunked_prefill", [False, True])
+    def test_eagle3(self, enable_chunked_prefill, eagle3_one_model):
         pytorch_config = dict(
             disable_overlap_scheduler=True,
             cuda_graph_config=CudaGraphConfig(batch_sizes=[1]),
@@ -1967,11 +1969,13 @@ class TestQwen3_8B(LlmapiAccuracyTestHarness):
         draft_len = 4
         spec_config = EagleDecodingConfig(max_draft_len=draft_len,
-                                          speculative_model_dir=eagle_model_dir)
+                                          speculative_model_dir=eagle_model_dir,
+                                          eagle3_one_model=eagle3_one_model)
         llm = LLM(model=target_model_dir,
                   **pytorch_config,
                   kv_cache_config=kv_cache_config,
+                  enable_chunked_prefill=enable_chunked_prefill,
                   speculative_config=spec_config,
                   build_config=None)
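Taken together, the two hunks let the EAGLE-3 accuracy test cover the one-model and two-model engines, each with chunked prefill on and off. For reference, a minimal standalone sketch of the configuration the new variants exercise; the keyword arguments follow the diff above, while the paths are placeholders and the import locations and KvCacheConfig value are assumptions:

# Sketch only: reconstructs the test's LLM configuration outside the harness.
from tensorrt_llm import LLM
from tensorrt_llm.llmapi import (CudaGraphConfig, EagleDecodingConfig,
                                 KvCacheConfig)

spec_config = EagleDecodingConfig(
    max_draft_len=4,                            # draft_len in the test
    speculative_model_dir="<eagle_model_dir>",  # placeholder path
    eagle3_one_model=True,                      # new knob: one- vs. two-model engine
)

llm = LLM(
    model="<target_model_dir>",                 # placeholder path
    disable_overlap_scheduler=True,             # from pytorch_config in the test
    cuda_graph_config=CudaGraphConfig(batch_sizes=[1]),
    kv_cache_config=KvCacheConfig(free_gpu_memory_fraction=0.8),  # assumed value
    enable_chunked_prefill=True,                # new knob exercised by the test
    speculative_config=spec_config,
    build_config=None,
)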

File 3 of 3: l0_h100 test list (YAML)

@@ -43,7 +43,10 @@ l0_h100:
   - accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_fp8_block_scales[latency]
   - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_fp8[latency-torch_compile=False]
   - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_fp8[latency-torch_compile=True]
-  - accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_eagle3
+  - accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_eagle3[enable_chunked_prefill=False-eagle3_one_model=False]
+  - accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_eagle3[enable_chunked_prefill=True-eagle3_one_model=True]
+  - accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_eagle3[enable_chunked_prefill=False-eagle3_one_model=True]
+  - accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_eagle3[enable_chunked_prefill=True-eagle3_one_model=False]
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_cuda_graph_padding[mtp_nextn=0]
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_cuda_graph_padding[mtp_nextn=2]
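Any of the four new IDs can also be run individually, e.g. pytest "accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_eagle3[enable_chunked_prefill=True-eagle3_one_model=True]" (assuming pytest is invoked from a directory where this relative path resolves).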