From 11d79aa875e6ff964357b10fea3abb251f040292 Mon Sep 17 00:00:00 2001 From: JennyLiu <141791095+JennyLiu-nv@users.noreply.github.com> Date: Thu, 12 Feb 2026 18:33:39 +0800 Subject: [PATCH] [https://nvbugs/5832481][test] Add gpt-oss-120b-Eagle3-throughput case on DGX-Spark (#11419) Signed-off-by: Jenny Liu Co-authored-by: Jenny Liu --- .../defs/perf/pytorch_model_config.py | 33 ++++++++++++++++--- tests/integration/defs/perf/test_perf.py | 3 +- .../test_lists/qa/llm_spark_perf.yml | 6 ++-- 3 files changed, 34 insertions(+), 8 deletions(-) diff --git a/tests/integration/defs/perf/pytorch_model_config.py b/tests/integration/defs/perf/pytorch_model_config.py index db2d6a12b5..25337faa8a 100644 --- a/tests/integration/defs/perf/pytorch_model_config.py +++ b/tests/integration/defs/perf/pytorch_model_config.py @@ -320,10 +320,10 @@ def get_model_yaml_config(model_label: str, 'num_postprocess_workers': 4 } }, - # GPT-OSS 120B speculative decoding (Eagle3 draft) + # GPT-OSS 120B speculative decoding with Eagle3 { 'patterns': [ - 'gpt_oss_120b_fp4-bench-pytorch-streaming-float4-maxbs:1-maxnt:4096-input_output_len:2048,128-reqs:1-con:1', + 'gpt_oss_120b_eagle3-bench-pytorch', ], 'config': { 'enable_attention_dp': False, @@ -337,9 +337,34 @@ def get_model_yaml_config(model_label: str, 'decoding_type': 'Eagle', 'max_draft_len': - 5, + 3, 'speculative_model_dir': - f"{llm_models_root()}/gpt_oss/gpt-oss-120b-Eagle3", + f'{llm_models_root()}/gpt_oss/gpt-oss-120b-Eagle3', + }, + 'kv_cache_config': { + 'enable_block_reuse': False, + }, + } + }, + # GPT-OSS 120B speculative decoding with Eagle3-throughput (https://nvbugspro.nvidia.com/bug/5832481) + { + 'patterns': [ + 'gpt_oss_120b_eagle3_throughput-bench-pytorch', + ], + 'config': { + 'enable_attention_dp': False, + 'disable_overlap_scheduler': True, + 'enable_autotuner': False, + 'cuda_graph_config': { + 'enable_padding': True, + }, + 'speculative_config': { + 'decoding_type': + 'Eagle', + 'max_draft_len': + 3, + 'speculative_model_dir': + f'{llm_models_root()}/gpt_oss/gpt-oss-120b-Eagle3-throughput', }, 'kv_cache_config': { 'enable_block_reuse': False, diff --git a/tests/integration/defs/perf/test_perf.py b/tests/integration/defs/perf/test_perf.py index 3695cf7e29..120df0f439 100644 --- a/tests/integration/defs/perf/test_perf.py +++ b/tests/integration/defs/perf/test_perf.py @@ -173,7 +173,8 @@ MODEL_PATH_DICT = { "mistral_small_v3.1_24b": "Mistral-Small-3.1-24B-Instruct-2503", "gpt_oss_120b_fp4": "gpt_oss/gpt-oss-120b", "gpt_oss_20b_fp4": "gpt_oss/gpt-oss-20b", - "gpt_oss_120b_eagle3": "gpt_oss/gpt-oss-120b-Eagle3", + "gpt_oss_120b_eagle3": "gpt_oss/gpt-oss-120b", + "gpt_oss_120b_eagle3_throughput": "gpt_oss/gpt-oss-120b", "nemotron_nano_3_30b_fp8": "Nemotron-Nano-3-30B-A3.5B-FP8-KVFP8-dev", "nemotron_nano_12b_v2": "NVIDIA-Nemotron-Nano-12B-v2", "nvidia_nemotron_nano_9b_v2_nvfp4": "NVIDIA-Nemotron-Nano-9B-v2-NVFP4", diff --git a/tests/integration/test_lists/qa/llm_spark_perf.yml b/tests/integration/test_lists/qa/llm_spark_perf.yml index 713192e93c..904bb30f17 100644 --- a/tests/integration/test_lists/qa/llm_spark_perf.yml +++ b/tests/integration/test_lists/qa/llm_spark_perf.yml @@ -10,10 +10,10 @@ llm_spark_perf: lte: 1 tests: - perf/test_perf.py::test_perf[gpt_oss_20b_fp4-bench-pytorch-streaming-float4-maxbs:1-input_output_len:2048,128-reqs:1-con:1] - # GPT-OSS 120B normal case (no spec dec) - perf/test_perf.py::test_perf[gpt_oss_120b_fp4-bench-pytorch-streaming-float4-maxbs:1-input_output_len:2048,128-reqs:1-con:1] - # GPT-OSS 120B spec dec case (Eagle3) - - perf/test_perf.py::test_perf[gpt_oss_120b_fp4-bench-pytorch-streaming-float4-maxbs:1-maxnt:4096-input_output_len:2048,128-reqs:1-con:1] + - perf/test_perf.py::test_perf[gpt_oss_120b_eagle3-bench-pytorch-streaming-float4-maxbs:1-input_output_len:2048,128-reqs:1-con:1] + # GPT-OSS 120B spec dec case (Eagle3-throughput) - https://nvbugspro.nvidia.com/bug/5832481 + - perf/test_perf.py::test_perf[gpt_oss_120b_eagle3_throughput-bench-pytorch-streaming-float4-maxbs:1-input_output_len:2048,128-reqs:1-con:1] - perf/test_perf.py::test_perf[nvidia_nemotron_nano_9b_v2_nvfp4-bench-pytorch-streaming-float4-maxbs:1-input_output_len:2048,128-reqs:1-con:1] - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp8-bench-pytorch-streaming-float8-maxbs:1-input_output_len:2048,128-reqs:1-con:1] - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp4-bench-pytorch-streaming-float4-maxbs:1-input_output_len:2048,128-reqs:1-con:1]