[chore] Clean up quickstart_advanced.py (#6021)

Signed-off-by: Mike Iovine <6158008+mikeiovine@users.noreply.github.com>
Mike Iovine, 2025-07-21 15:00:59 -04:00, committed by GitHub
parent d7f0b0ab68
commit 9645814bdf
5 changed files with 16 additions and 19 deletions

View File

@@ -40,7 +40,7 @@ python3 quickstart_multimodal.py --model_dir Efficient-Large-Model/NVILA-8B --mo
python3 quickstart_advanced.py \
--model_dir meta-llama/Llama-3.1-8B-Instruct \
--spec_decode_algo NGRAM \
- --spec_decode_nextn 4 \
+ --spec_decode_max_draft_len 4 \
--max_matching_ngram_size 2 \
--disable_overlap_scheduler \
--disable_kv_cache_reuse
@@ -51,7 +51,7 @@ python3 quickstart_advanced.py \
python3 quickstart_advanced.py \
--model_dir meta-llama/Llama-3.1-8B-Instruct \
--spec_decode_algo draft_target \
- --spec_decode_nextn 5 \
+ --spec_decode_max_draft_len 5 \
--draft_model_dir meta-llama/Llama-3.2-1B-Instruct \
--disable_overlap_scheduler \
--disable_kv_cache_reuse
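
For reference, the renamed flag corresponds one-to-one to `max_draft_len` in the LLM API configs the script builds. A minimal programmatic sketch of the two commands above, assuming the config classes are exported from `tensorrt_llm.llmapi` (as `quickstart_advanced.py` uses them) and that `LLM` accepts a `speculative_config` argument:

```python
from tensorrt_llm import LLM
from tensorrt_llm.llmapi import DraftTargetDecodingConfig, NGramDecodingConfig

# NGRAM: mirrors --spec_decode_max_draft_len 4 --max_matching_ngram_size 2
ngram_config = NGramDecodingConfig(
    max_draft_len=4,
    max_matching_ngram_size=2,
    is_keep_all=True,
    is_use_oldest=True,
)

# Draft/target: mirrors --spec_decode_max_draft_len 5 plus a separate draft model
draft_target_config = DraftTargetDecodingConfig(
    max_draft_len=5,
    speculative_model_dir="meta-llama/Llama-3.2-1B-Instruct",
)

llm = LLM(
    model="meta-llama/Llama-3.1-8B-Instruct",
    speculative_config=ngram_config,  # or draft_target_config
)
# Both CLI examples also pass --disable_overlap_scheduler and
# --disable_kv_cache_reuse; the corresponding engine options are left out
# here to keep the sketch minimal.
```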

View File

@@ -108,11 +108,8 @@ def add_llm_args(parser):
# Speculative decoding
parser.add_argument('--spec_decode_algo', type=str, default=None)
- parser.add_argument('--spec_decode_nextn', type=int, default=1)
- parser.add_argument('--draft_model_dir',
-                     '--eagle_model_dir',
-                     type=str,
-                     default=None)
+ parser.add_argument('--spec_decode_max_draft_len', type=int, default=1)
+ parser.add_argument('--draft_model_dir', type=str, default=None)
parser.add_argument('--max_matching_ngram_size', type=int, default=5)
parser.add_argument('--use_one_model', default=False, action='store_true')
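
The removed lines relied on argparse accepting several option strings for one destination, which is how `--eagle_model_dir` used to alias `--draft_model_dir`. A standalone sketch of that mechanism (hypothetical script, not part of this commit), in case downstream callers want a deprecation shim:

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--spec_decode_max_draft_len', type=int, default=1)
# Several option strings, one destination: argparse derives the dest from
# the first long option, so --eagle_model_dir fills args.draft_model_dir too.
parser.add_argument('--draft_model_dir', '--eagle_model_dir',
                    type=str, default=None)

args = parser.parse_args(['--eagle_model_dir', '/models/eagle3'])
assert args.draft_model_dir == '/models/eagle3'
```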
@@ -162,23 +159,23 @@ def setup_llm(args, **kwargs):
)
spec_config = MTPDecodingConfig(
- num_nextn_predict_layers=args.spec_decode_nextn,
+ num_nextn_predict_layers=args.spec_decode_max_draft_len,
use_relaxed_acceptance_for_thinking=args.
use_relaxed_acceptance_for_thinking,
relaxed_topk=args.relaxed_topk,
relaxed_delta=args.relaxed_delta)
elif spec_decode_algo == "EAGLE3":
spec_config = EagleDecodingConfig(
- max_draft_len=args.spec_decode_nextn,
+ max_draft_len=args.spec_decode_max_draft_len,
speculative_model_dir=args.draft_model_dir,
eagle3_one_model=args.use_one_model)
elif spec_decode_algo == "DRAFT_TARGET":
spec_config = DraftTargetDecodingConfig(
- max_draft_len=args.spec_decode_nextn,
+ max_draft_len=args.spec_decode_max_draft_len,
speculative_model_dir=args.draft_model_dir)
elif spec_decode_algo == "NGRAM":
spec_config = NGramDecodingConfig(
- max_draft_len=args.spec_decode_nextn,
+ max_draft_len=args.spec_decode_max_draft_len,
max_matching_ngram_size=args.max_matching_ngram_size,
is_keep_all=True,
is_use_oldest=True,
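
Two details this hunk leaves implicit: the algorithm string is upper-cased before the dispatch (the README passes `draft_target`, the tests pass `eagle3`), and MTP keeps its own parameter name for the quantity the CLI now calls `--spec_decode_max_draft_len`. A condensed, hypothetical sketch of that shape:

```python
from tensorrt_llm.llmapi import MTPDecodingConfig, NGramDecodingConfig

def build_spec_config(args):
    # Hypothetical helper; the real setup_llm also handles EAGLE3,
    # draft/target, sampling, and KV-cache options.
    if args.spec_decode_algo is None:
        return None
    algo = args.spec_decode_algo.upper()  # "eagle3" and "EAGLE3" both match
    if algo == "MTP":
        # Same CLI value, MTP-specific parameter name.
        return MTPDecodingConfig(
            num_nextn_predict_layers=args.spec_decode_max_draft_len)
    if algo == "NGRAM":
        return NGramDecodingConfig(
            max_draft_len=args.spec_decode_max_draft_len,
            max_matching_ngram_size=args.max_matching_ngram_size,
            is_keep_all=True,
            is_use_oldest=True)
    raise ValueError(f"Unsupported spec_decode_algo: {algo}")
```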

View File

@@ -97,7 +97,7 @@ Prompt: 'The future of AI is', Generated text: ' a topic of great interest and s
To run with MTP, use [examples/llm-api/quickstart_advanced.py](../pytorch/quickstart_advanced.py) with the additional options below:
```bash
cd examples/llm-api
- python quickstart_advanced.py --model_dir <YOUR_MODEL_DIR> --spec_decode_algo MTP --spec_decode_nextn N
+ python quickstart_advanced.py --model_dir <YOUR_MODEL_DIR> --spec_decode_algo MTP --spec_decode_max_draft_len N
```
`N` is the number of MTP modules. When `N` is `0` (the default), MTP is not used; when `N` is greater than `0`, `N` MTP modules are enabled. In the current implementation, all MTP modules share the same weights.
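
The same run expressed through the LLM API, as a minimal sketch (assumes `MTPDecodingConfig` is importable from `tensorrt_llm.llmapi` and that `LLM` takes `speculative_config`):

```python
from tensorrt_llm import LLM
from tensorrt_llm.llmapi import MTPDecodingConfig

# N MTP modules => up to N draft tokens per step; N=0 would disable MTP.
llm = LLM(
    model="<YOUR_MODEL_DIR>",  # placeholder, as in the command above
    speculative_config=MTPDecodingConfig(num_nextn_predict_layers=4),
)
```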
@@ -124,7 +124,7 @@ When verifying and receiving draft tokens, there are two ways:
```bash
cd examples/llm-api
- python quickstart_advanced.py --model_dir <YOUR_MODEL_DIR> --spec_decode_algo MTP --spec_decode_nextn N --use_relaxed_acceptance_for_thinking --relaxed_topk 15 --relaxed_delta 0.5
+ python quickstart_advanced.py --model_dir <YOUR_MODEL_DIR> --spec_decode_algo MTP --spec_decode_max_draft_len N --use_relaxed_acceptance_for_thinking --relaxed_topk 15 --relaxed_delta 0.5
```
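
The three relaxed-acceptance flags map onto config fields of the same names (all three appear in the quickstart's `setup_llm` above); a sketch under the same import assumptions:

```python
from tensorrt_llm.llmapi import MTPDecodingConfig

# Values mirror the command above; relaxed acceptance loosens draft-token
# verification during the thinking phase (see the surrounding docs).
spec_config = MTPDecodingConfig(
    num_nextn_predict_layers=4,  # N
    use_relaxed_acceptance_for_thinking=True,
    relaxed_topk=15,
    relaxed_delta=0.5,
)
```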
### Long context support

View File

@@ -90,7 +90,7 @@ python examples/summarize.py \
```bash
python3 examples/llm-api/quickstart_advanced.py \
- --spec_decode_nextn 4 \
+ --spec_decode_max_draft_len 4 \
--max_matching_ngram_size 2 \
--disable_overlap_scheduler \
--disable_kv_cache_reuse

View File

@@ -1641,7 +1641,7 @@ def test_ptp_quickstart_advanced_mtp(llm_root, llm_venv, model_name,
[
str(example_root / "quickstart_advanced.py"),
"--use_cuda_graph",
"--spec_decode_nextn",
"--spec_decode_max_draft_len",
"1", # test 1 MTP module
"--spec_decode_algo",
"MTP",
@@ -1720,13 +1720,13 @@ def test_ptp_quickstart_advanced_eagle3(llm_root, llm_venv, model_name,
delete_on_close=True) as running_log:
llm_venv.run_cmd([
str(example_root / "quickstart_advanced.py"),
"--spec_decode_nextn",
"--spec_decode_max_draft_len",
"4",
"--spec_decode_algo",
"eagle3",
"--model_dir",
f"{llm_models_root()}/{model_path}",
"--eagle_model_dir",
"--draft_model_dir",
f"{llm_models_root()}/{eagle_model_path}",
"--disable_kv_cache_reuse",
"--disable_overlap_scheduler",
@@ -1753,7 +1753,7 @@ def test_ptp_quickstart_advanced_ngram(llm_root, llm_venv, model_name,
f"{llm_models_root()}/{model_path}",
"--spec_decode_algo",
"NGRAM",
"--spec_decode_nextn",
"--spec_decode_max_draft_len",
"4",
"--max_matching_ngram_size",
"2",
@@ -1829,7 +1829,7 @@ def test_relaxed_acceptance_quickstart_advanced_deepseek_r1_8gpus(
"--disable_kv_cache_reuse",
"--spec_decode_algo",
"MTP",
"--spec_decode_nextn",
"--spec_decode_max_draft_len",
"5",
"--use_relaxed_acceptance_for_thinking",
"--relaxed_topk=10",