[chore] Clean up quickstart_advanced.py (#6021)

Signed-off-by: Mike Iovine <6158008+mikeiovine@users.noreply.github.com>
Mike Iovine, 2025-07-21 15:00:59 -04:00, committed by GitHub
parent d7f0b0ab68
commit 9645814bdf
5 changed files with 16 additions and 19 deletions

View File

@@ -40,7 +40,7 @@ python3 quickstart_multimodal.py --model_dir Efficient-Large-Model/NVILA-8B --mo
python3 quickstart_advanced.py \
--model_dir meta-llama/Llama-3.1-8B-Instruct \
--spec_decode_algo NGRAM \
- --spec_decode_nextn 4 \
+ --spec_decode_max_draft_len 4 \
--max_matching_ngram_size 2 \
--disable_overlap_scheduler \
--disable_kv_cache_reuse
@@ -51,7 +51,7 @@ python3 quickstart_advanced.py \
python3 quickstart_advanced.py \
--model_dir meta-llama/Llama-3.1-8B-Instruct \
--spec_decode_algo draft_target \
- --spec_decode_nextn 5 \
+ --spec_decode_max_draft_len 5 \
--draft_model_dir meta-llama/Llama-3.2-1B-Instruct \
--disable_overlap_scheduler \
--disable_kv_cache_reuse
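
For reference, the renamed flag corresponds one-to-one to `max_draft_len` in the LLM API configs the script builds. A minimal programmatic sketch of the two commands above, assuming the config classes are exported from `tensorrt_llm.llmapi` (as `quickstart_advanced.py` uses them) and that `LLM` accepts a `speculative_config` argument:

```python
from tensorrt_llm import LLM
from tensorrt_llm.llmapi import DraftTargetDecodingConfig, NGramDecodingConfig

# NGRAM: mirrors --spec_decode_max_draft_len 4 --max_matching_ngram_size 2
ngram_config = NGramDecodingConfig(
    max_draft_len=4,
    max_matching_ngram_size=2,
    is_keep_all=True,
    is_use_oldest=True,
)

# Draft/target: mirrors --spec_decode_max_draft_len 5 plus a separate draft model
draft_target_config = DraftTargetDecodingConfig(
    max_draft_len=5,
    speculative_model_dir="meta-llama/Llama-3.2-1B-Instruct",
)

llm = LLM(
    model="meta-llama/Llama-3.1-8B-Instruct",
    speculative_config=ngram_config,  # or draft_target_config
)
# Both CLI examples also pass --disable_overlap_scheduler and
# --disable_kv_cache_reuse; the corresponding engine options are left out
# here to keep the sketch minimal.
```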

View File

@@ -108,11 +108,8 @@ def add_llm_args(parser):
# Speculative decoding
parser.add_argument('--spec_decode_algo', type=str, default=None)
- parser.add_argument('--spec_decode_nextn', type=int, default=1)
- parser.add_argument('--draft_model_dir',
-                     '--eagle_model_dir',
-                     type=str,
-                     default=None)
+ parser.add_argument('--spec_decode_max_draft_len', type=int, default=1)
+ parser.add_argument('--draft_model_dir', type=str, default=None)
parser.add_argument('--max_matching_ngram_size', type=int, default=5)
parser.add_argument('--use_one_model', default=False, action='store_true')
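
The removed lines relied on argparse accepting several option strings for one destination, which is how `--eagle_model_dir` used to alias `--draft_model_dir`. A standalone sketch of that mechanism (hypothetical script, not part of this commit), in case downstream callers want a deprecation shim:

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--spec_decode_max_draft_len', type=int, default=1)
# Several option strings, one destination: argparse derives the dest from
# the first long option, so --eagle_model_dir fills args.draft_model_dir too.
parser.add_argument('--draft_model_dir', '--eagle_model_dir',
                    type=str, default=None)

args = parser.parse_args(['--eagle_model_dir', '/models/eagle3'])
assert args.draft_model_dir == '/models/eagle3'
```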
@@ -162,23 +159,23 @@ def setup_llm(args, **kwargs):
)
spec_config = MTPDecodingConfig(
- num_nextn_predict_layers=args.spec_decode_nextn,
+ num_nextn_predict_layers=args.spec_decode_max_draft_len,
use_relaxed_acceptance_for_thinking=args.
use_relaxed_acceptance_for_thinking,
relaxed_topk=args.relaxed_topk,
relaxed_delta=args.relaxed_delta)
elif spec_decode_algo == "EAGLE3":
spec_config = EagleDecodingConfig(
- max_draft_len=args.spec_decode_nextn,
+ max_draft_len=args.spec_decode_max_draft_len,
speculative_model_dir=args.draft_model_dir,
eagle3_one_model=args.use_one_model)
elif spec_decode_algo == "DRAFT_TARGET":
spec_config = DraftTargetDecodingConfig(
- max_draft_len=args.spec_decode_nextn,
+ max_draft_len=args.spec_decode_max_draft_len,
speculative_model_dir=args.draft_model_dir)
elif spec_decode_algo == "NGRAM":
spec_config = NGramDecodingConfig(
- max_draft_len=args.spec_decode_nextn,
+ max_draft_len=args.spec_decode_max_draft_len,
max_matching_ngram_size=args.max_matching_ngram_size,
is_keep_all=True,
is_use_oldest=True,
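
Two details this hunk leaves implicit: the algorithm string is upper-cased before the dispatch (the README passes `draft_target`, the tests pass `eagle3`), and MTP keeps its own parameter name for the quantity the CLI now calls `--spec_decode_max_draft_len`. A condensed, hypothetical sketch of that shape:

```python
from tensorrt_llm.llmapi import MTPDecodingConfig, NGramDecodingConfig

def build_spec_config(args):
    # Hypothetical helper; the real setup_llm also handles EAGLE3,
    # draft/target, sampling, and KV-cache options.
    if args.spec_decode_algo is None:
        return None
    algo = args.spec_decode_algo.upper()  # "eagle3" and "EAGLE3" both match
    if algo == "MTP":
        # Same CLI value, MTP-specific parameter name.
        return MTPDecodingConfig(
            num_nextn_predict_layers=args.spec_decode_max_draft_len)
    if algo == "NGRAM":
        return NGramDecodingConfig(
            max_draft_len=args.spec_decode_max_draft_len,
            max_matching_ngram_size=args.max_matching_ngram_size,
            is_keep_all=True,
            is_use_oldest=True)
    raise ValueError(f"Unsupported spec_decode_algo: {algo}")
```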

View File

@@ -97,7 +97,7 @@ Prompt: 'The future of AI is', Generated text: ' a topic of great interest and s
To run with MTP, use [examples/llm-api/quickstart_advanced.py](../pytorch/quickstart_advanced.py) with the additional options below:
```bash
cd examples/llm-api
- python quickstart_advanced.py --model_dir <YOUR_MODEL_DIR> --spec_decode_algo MTP --spec_decode_nextn N
+ python quickstart_advanced.py --model_dir <YOUR_MODEL_DIR> --spec_decode_algo MTP --spec_decode_max_draft_len N
```
`N` is the number of MTP modules. When `N` is `0` (the default), MTP is not used; when `N` is greater than `0`, `N` MTP modules are enabled. In the current implementation, all MTP modules share the same weights.
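
The same run expressed through the LLM API, as a minimal sketch (assumes `MTPDecodingConfig` is importable from `tensorrt_llm.llmapi` and that `LLM` takes `speculative_config`):

```python
from tensorrt_llm import LLM
from tensorrt_llm.llmapi import MTPDecodingConfig

# N MTP modules => up to N draft tokens per step; N=0 would disable MTP.
llm = LLM(
    model="<YOUR_MODEL_DIR>",  # placeholder, as in the command above
    speculative_config=MTPDecodingConfig(num_nextn_predict_layers=4),
)
```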
@@ -124,7 +124,7 @@ When verifying and receiving draft tokens, there are two ways:
```bash
cd examples/llm-api
- python quickstart_advanced.py --model_dir <YOUR_MODEL_DIR> --spec_decode_algo MTP --spec_decode_nextn N --use_relaxed_acceptance_for_thinking --relaxed_topk 15 --relaxed_delta 0.5
+ python quickstart_advanced.py --model_dir <YOUR_MODEL_DIR> --spec_decode_algo MTP --spec_decode_max_draft_len N --use_relaxed_acceptance_for_thinking --relaxed_topk 15 --relaxed_delta 0.5
```
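
The three relaxed-acceptance flags map onto config fields of the same names (all three appear in the quickstart's `setup_llm` above); a sketch under the same import assumptions:

```python
from tensorrt_llm.llmapi import MTPDecodingConfig

# Values mirror the command above; relaxed acceptance loosens draft-token
# verification during the thinking phase (see the surrounding docs).
spec_config = MTPDecodingConfig(
    num_nextn_predict_layers=4,  # N
    use_relaxed_acceptance_for_thinking=True,
    relaxed_topk=15,
    relaxed_delta=0.5,
)
```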
### Long context support

View File

@@ -90,7 +90,7 @@ python examples/summarize.py \
```bash
python3 examples/llm-api/quickstart_advanced.py \
- --spec_decode_nextn 4 \
+ --spec_decode_max_draft_len 4 \
--max_matching_ngram_size 2 \
--disable_overlap_scheduler \
--disable_kv_cache_reuse

View File

@@ -1641,7 +1641,7 @@ def test_ptp_quickstart_advanced_mtp(llm_root, llm_venv, model_name,
[
str(example_root / "quickstart_advanced.py"),
"--use_cuda_graph",
"--spec_decode_nextn",
"--spec_decode_max_draft_len",
"1", # test 1 MTP module
"--spec_decode_algo",
"MTP",
@@ -1720,13 +1720,13 @@ def test_ptp_quickstart_advanced_eagle3(llm_root, llm_venv, model_name,
delete_on_close=True) as running_log:
llm_venv.run_cmd([
str(example_root / "quickstart_advanced.py"),
"--spec_decode_nextn",
"--spec_decode_max_draft_len",
"4",
"--spec_decode_algo",
"eagle3",
"--model_dir",
f"{llm_models_root()}/{model_path}",
"--eagle_model_dir",
"--draft_model_dir",
f"{llm_models_root()}/{eagle_model_path}",
"--disable_kv_cache_reuse",
"--disable_overlap_scheduler",
@@ -1753,7 +1753,7 @@ def test_ptp_quickstart_advanced_ngram(llm_root, llm_venv, model_name,
f"{llm_models_root()}/{model_path}",
"--spec_decode_algo",
"NGRAM",
"--spec_decode_nextn",
"--spec_decode_max_draft_len",
"4",
"--max_matching_ngram_size",
"2",
@@ -1829,7 +1829,7 @@ def test_relaxed_acceptance_quickstart_advanced_deepseek_r1_8gpus(
"--disable_kv_cache_reuse",
"--spec_decode_algo",
"MTP",
"--spec_decode_nextn",
"--spec_decode_max_draft_len",
"5",
"--use_relaxed_acceptance_for_thinking",
"--relaxed_topk=10",