Mirror of https://github.com/NVIDIA/TensorRT-LLM.git (synced 2026-02-11 13:33:40 +08:00)
Merge branch 'main' into fix_spec_gate
Signed-off-by: Zheyu Fu <zheyuf@nvidia.com>
Commit 1d61d74c4e
@@ -287,7 +287,7 @@ def submit_job(config, log_dir, dry_run):
     f"--container-image {env_config['container_image']}",
     f"--container-name {container_name}",
     f"--container-mounts {env_config['container_mount']}",
-    "--mpi=pmix --overlap",
+    "--no-container-mount-home --mpi=pmix --overlap",
     f"bash {os.path.join(env_config['work_dir'], 'start_worker.sh')}",
     server_type,
     str(server_id),
@@ -313,7 +313,7 @@ def submit_job(config, log_dir, dry_run):
     f"--container-name={container_name}",
     f"--container-image={env_config['container_image']}",
     f"--container-mounts={env_config['container_mount']}",
-    f"--mpi=pmix --overlap -N 1 -n 1",
+    f"--no-container-mount-home --mpi=pmix --overlap -N 1 -n 1",
     f"bash {env_config['work_dir']}/start_server.sh {os.path.join(log_dir, 'server_config.yaml')} \"{server_env_var}\"",
     f"&> {log_dir}/4_output_server.log &",
 ]
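Both hunks above append `--no-container-mount-home` to the srun argument lists that `submit_job` builds; with pyxis/enroot this should keep the host home directory from being auto-mounted into the container. A minimal sketch of how such a command line is assembled, with placeholder values standing in for the job's real `env_config`:

```python
# Sketch only: the config values below are placeholders, not the benchmark's real settings.
import os

env_config = {
    "container_image": "nvcr.io/example/tensorrt-llm:latest",  # placeholder
    "container_mount": "/scratch:/scratch",                    # placeholder
    "work_dir": "/scratch/disagg",                             # placeholder
}
container_name = "trtllm-worker"

worker_cmd = " ".join([
    "srun",
    f"--container-image {env_config['container_image']}",
    f"--container-name {container_name}",
    f"--container-mounts {env_config['container_mount']}",
    # new flag: do not mount the submitting user's $HOME into the container
    "--no-container-mount-home --mpi=pmix --overlap",
    f"bash {os.path.join(env_config['work_dir'], 'start_worker.sh')}",
])
print(worker_cmd)
```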
@@ -3256,12 +3256,13 @@ def launchTestJobs(pipeline, testFilter)
"DGX_H100-4_GPUs-PyTorch-GptOss-1": ["dgx-h100-x4-oci", "l0_dgx_h100", 1, 1, 4],
"DGX_H100-4_GPUs-PyTorch-Others-1": ["dgx-h100-x4-oci", "l0_dgx_h100", 1, 1, 4],
"DGX_H100-4_GPUs-PyTorch-Ray-1": ["dgx-h100-x4-oci", "l0_dgx_h100", 1, 1, 4],
"B300-PyTorch-1": ["b300-single", "l0_b300", 1, 1],
"DGX_B200-4_GPUs-PyTorch-1": ["b200-x4-lbd", "l0_dgx_b200", 1, 1, 4, 1, true],
"DGX_B200-4_GPUs-PyTorch-Ray-1": ["b200-x4-lbd", "l0_dgx_b200", 1, 1, 4, 1, true],
"DGX_B200-8_GPUs-PyTorch-1": ["b200-x8-lbd", "l0_dgx_b200", 1, 1, 8, 1, true],
"DGX_B200-4_GPUs-PyTorch-Post-Merge-1": ["b200-x4-lbd", "l0_dgx_b200", 1, 2, 4, 1, true],
"DGX_B200-4_GPUs-PyTorch-Post-Merge-2": ["b200-x4-lbd", "l0_dgx_b200", 2, 2, 4, 1, true],
"B300-PyTorch-1": ["b300-single", "l0_b300", 1, 1],
"DGX_B300-4_GPUs-PyTorch-1": ["b300-x4", "l0_dgx_b300", 1, 1, 4],
"DGX_B300-4_GPUs-PyTorch-Post-Merge-1": ["b300-x4", "l0_dgx_b300", 1, 2, 4],
"DGX_B300-4_GPUs-PyTorch-Post-Merge-2": ["b300-x4", "l0_dgx_b300", 2, 2, 4],
// PerfSanity post-merge tests
@@ -813,13 +813,14 @@ class DSAtrtllmAttentionMetadata(TrtllmAttentionMetadata):
 # Expand schedule metadata buffer (only generation)
 kv_lens_expanded = self.kv_lens_expanded_cuda[:num_tokens]
 scheduler_metadata_buffer_expanded = get_paged_mqa_logits_metadata(
-    kv_lens_expanded, tokens_per_block, self.num_sms)
+    kv_lens_expanded, self.kv_cache_manager.tokens_per_block,
+    self.num_sms)
 self.scheduler_metadata_buffer_expanded.copy_(
     scheduler_metadata_buffer_expanded, non_blocking=True)
 elif self.max_draft_tokens == 3:
 scheduler_metadata_buffer_mtp3 = get_paged_mqa_logits_metadata(
     self.kv_lens_cuda[self.num_contexts:self.num_seqs],
-    tokens_per_block, self.num_sms // 2)
+    self.kv_cache_manager.tokens_per_block, self.num_sms // 2)
 self.scheduler_metadata_buffer_mtp3.copy_(
     scheduler_metadata_buffer_mtp3, non_blocking=True)
 self.prepare_dense_topk_indices(self.kv_lens_cuda, device=True)
@@ -18,7 +18,7 @@ from torch.cuda import device_count
 from tensorrt_llm import LLM as PyTorchLLM
 from tensorrt_llm import MultimodalEncoder
 from tensorrt_llm._tensorrt_engine import LLM
-from tensorrt_llm._utils import get_free_port, mpi_rank
+from tensorrt_llm._utils import mpi_rank
 from tensorrt_llm.executor.utils import LlmLauncherEnvs
 from tensorrt_llm.inputs.multimodal import MultimodalServerConfig
 from tensorrt_llm.llmapi import (BuildConfig, CapacitySchedulerPolicy,
@@ -189,25 +189,12 @@ def launch_server(
     with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
         # If disagg cluster config is provided and port is not specified, try to find a free port, otherwise try to bind to the specified port
         assert port > 0 or disagg_cluster_config is not None, "Port must be specified if disagg cluster config is not provided"
-        if port > 0:
-            port_retries = 1
-        else:
-            port_retries = 100
-            port = get_free_port()
-        while port_retries > 0:
-            try:
-                s.bind((host, port))
-                break
-            except OSError as e:
-                port_retries -= 1
-                if port_retries == 0:
-                    raise RuntimeError(
-                        f"Failed to bind socket to {host}:{port}: {e}")
-                else:
-                    logger.warning(
-                        f"Failed to bind socket to {host}:{port}: {e}, retrying {port_retries}..."
-                    )
-                    port = get_free_port()
+        try:
+            s.bind((host, port))
+            if port == 0:
+                port = s.getsockname()[1]
+        except OSError as e:
+            raise RuntimeError(f"Failed to bind socket to {host}:{port}: {e}")

     if backend == 'pytorch':
         llm_args.pop("build_config", None)
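The new `launch_server` path drops the retry-with-`get_free_port` loop in favor of the standard ephemeral-port idiom: binding to port 0 asks the OS for an unused port, which is then read back from the socket. A standalone sketch of that pattern, stdlib only and not the server's actual helper:

```python
# Minimal sketch of the bind-to-port-0 pattern, assuming nothing beyond the standard library.
import socket

def bind_socket(host: str, port: int) -> tuple[socket.socket, int]:
    """Bind to an explicit port, or let the kernel assign one when port == 0."""
    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        s.bind((host, port))
    except OSError as e:
        s.close()
        raise RuntimeError(f"Failed to bind socket to {host}:{port}: {e}")
    if port == 0:
        port = s.getsockname()[1]  # the port the OS actually picked
    return s, port

if __name__ == "__main__":
    sock, chosen = bind_socket("127.0.0.1", 0)
    print(f"reserved an OS-assigned port: {chosen}")
    sock.close()
```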
@@ -8,3 +8,14 @@ deepseek-ai/DeepSeek-V3-Lite:
   - accuracy: 77.00
   - spec_dec_algo: MTP
     accuracy: 77.00
+google/gemma-3-1b-it:
+  - quant_algo: FP8
+    kv_cache_quant_algo: FP8
+    accuracy: 61.00
+GPT-OSS/120B-MXFP4:
+  - quant_algo: W4A16_MXFP4
+    spec_dec_algo: Eagle
+    accuracy: 62.00
+  - quant_algo: W4A8_MXFP4_MXFP8
+    spec_dec_algo: Eagle
+    accuracy: 62.00
@@ -1105,6 +1105,37 @@ class TestGemma3_1BInstruct(LlmapiAccuracyTestHarness):
         task = MMLU(self.MODEL_NAME)
         task.evaluate(llm)

+    def test_fp8_vswa_reuse(self):
+        # NOTE: Test with VSWA kv cache config.
+        kv_cache_config = KvCacheConfig(
+            enable_block_reuse=True,
+            max_attention_window=[512, 512, 512, 512, 512, 32768],
+        )
+        prequantized_model_path = f"{llm_models_root()}/gemma/gemma-3-1b-it-fp8/"
+        with LLM(prequantized_model_path,
+                 kv_cache_config=kv_cache_config) as llm:
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
+
+    @pytest.mark.parametrize("backend", ["xgrammar"])
+    def test_fp8_guided_decoding_vswa_reuse(self, backend: str, mocker):
+        mocker.patch.dict(os.environ, {"TRTLLM_XGUIDANCE_LENIENT": "1"})
+        prequantized_model_path = f"{llm_models_root()}/gemma/gemma-3-1b-it-fp8/"
+        kv_cache_config = KvCacheConfig(
+            enable_block_reuse=True,
+            max_attention_window=[512, 512, 512, 512, 512, 32768],
+        )
+        cuda_graph_config = CudaGraphConfig(enable_padding=True)
+        llm = LLM(prequantized_model_path,
+                  guided_decoding_backend=backend,
+                  kv_cache_config=kv_cache_config,
+                  cuda_graph_config=cuda_graph_config)
+        with llm:
+            task = JsonModeEval(self.MODEL_NAME)
+            task.evaluate(llm)
+
     def test_auto_dtype_vswa_without_reuse(self):
         # NOTE: Test with VSWA kv cache config.
         kv_cache_config = KvCacheConfig(
@@ -2269,6 +2300,7 @@ class TestDeepSeekR1(LlmapiAccuracyTestHarness):
         torch.cuda.empty_cache()

     @skip_pre_blackwell
+    @pytest.mark.skip_less_device_memory(95000)
     @pytest.mark.parametrize(
         "tp_size,pp_size,ep_size,mtp_nextn,fp8kv,attention_dp,cuda_graph,overlap_scheduler,max_batch_size,moe_backend",
         [
@@ -4460,6 +4492,114 @@ class TestGPTOSS(LlmapiAccuracyTestHarness):
                       sampling_params=sampling_params,
                       extra_evaluator_kwargs=extra_evaluator_kwargs)

+    @pytest.mark.skip_less_device(4)
+    @pytest.mark.parametrize("one_model", [True, False],
+                             ids=["one_model", "two_model"])
+    def test_eagle3_vswa_reuse_4gpus(self, one_model, mocker):
+        MAX_OUTPUT_LEN = 128179
+        MAX_INPUT_LEN = 32768
+
+        mocker.patch.object(GSM8K, "MAX_OUTPUT_LEN", 8192)
+        mocker.patch.dict(GSM8K.EVALUATE_KWARGS,
+                          {"scores_filter": "exact_match,flexible-extract"})
+
+        mocker.patch.object(GPQADiamond, "MAX_OUTPUT_LEN", MAX_OUTPUT_LEN)
+        mocker.patch.object(GPQADiamond, "MAX_INPUT_LEN", MAX_INPUT_LEN)
+
+        pytorch_config = dict(cuda_graph_config=CudaGraphConfig())
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.4,
+                                        dtype="auto",
+                                        enable_block_reuse=True,
+                                        max_attention_window=[128, 32768])
+
+        eagle_model_dir = f"{llm_models_root()}/gpt_oss/gpt-oss-120b-Eagle3"
+        draft_len = 3
+        spec_config = EagleDecodingConfig(max_draft_len=draft_len,
+                                          speculative_model_dir=eagle_model_dir,
+                                          eagle3_one_model=one_model,
+                                          allow_advanced_sampling=True)
+
+        max_seq_len = MAX_INPUT_LEN + MAX_OUTPUT_LEN
+        llm = LLM(self.MODEL_PATH,
+                  tensor_parallel_size=4,
+                  pipeline_parallel_size=1,
+                  moe_expert_parallel_size=1,
+                  kv_cache_config=kv_cache_config,
+                  max_seq_len=max_seq_len,
+                  speculative_config=spec_config,
+                  **pytorch_config,
+                  enable_attention_dp=False)
+
+        with llm:
+            model_name = "GPT-OSS/120B-MXFP4"
+
+            # GSM8K
+            task = GSM8K(model_name)
+            task.evaluate(llm,
+                          extra_evaluator_kwargs=self.extra_evaluator_kwargs)
+
+            # GPQA Medium Reasoning
+            task = GPQADiamond(model_name)
+
+            chat_template_kwargs = dict(reasoning_effort="medium")
+            extra_evaluator_kwargs = {
+                **self.extra_evaluator_kwargs, "chat_template_kwargs":
+                chat_template_kwargs
+            }
+
+            sampling_params = SamplingParams(
+                temperature=1.0,
+                top_p=1.0,
+                max_tokens=MAX_OUTPUT_LEN,
+                truncate_prompt_tokens=MAX_INPUT_LEN)
+
+            task.evaluate(llm,
+                          sampling_params=sampling_params,
+                          extra_evaluator_kwargs=extra_evaluator_kwargs)
+
+    @pytest.mark.skip_less_device(4)
+    @pytest.mark.parametrize("one_model", [True, False],
+                             ids=["one_model", "two_model"])
+    def test_eagle3_guided_decoding_4gpus(self, one_model, mocker):
+        MAX_OUTPUT_LEN = 128179
+        MAX_INPUT_LEN = 32768
+
+        mocker.patch.dict(os.environ, {"TRTLLM_XGUIDANCE_LENIENT": "1"})
+        mocker.patch.object(GSM8K, "MAX_OUTPUT_LEN", 8192)
+        mocker.patch.dict(GSM8K.EVALUATE_KWARGS,
+                          {"scores_filter": "exact_match,flexible-extract"})
+
+        mocker.patch.object(GPQADiamond, "MAX_OUTPUT_LEN", MAX_OUTPUT_LEN)
+        mocker.patch.object(GPQADiamond, "MAX_INPUT_LEN", MAX_INPUT_LEN)
+
+        pytorch_config = dict(cuda_graph_config=CudaGraphConfig())
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.4,
+                                        dtype="auto")
+
+        eagle_model_dir = f"{llm_models_root()}/gpt_oss/gpt-oss-120b-Eagle3"
+        draft_len = 3
+        spec_config = EagleDecodingConfig(max_draft_len=draft_len,
+                                          speculative_model_dir=eagle_model_dir,
+                                          eagle3_one_model=one_model,
+                                          allow_advanced_sampling=True)
+
+        max_seq_len = MAX_INPUT_LEN + MAX_OUTPUT_LEN
+        llm = LLM(self.MODEL_PATH,
+                  tensor_parallel_size=4,
+                  pipeline_parallel_size=1,
+                  moe_expert_parallel_size=1,
+                  guided_decoding_backend="xgrammar",
+                  kv_cache_config=kv_cache_config,
+                  max_seq_len=max_seq_len,
+                  speculative_config=spec_config,
+                  **pytorch_config,
+                  enable_attention_dp=False)
+
+        with llm:
+            model_name = "GPT-OSS/120B-MXFP4"
+            task = JsonModeEval(model_name)
+            task.evaluate(llm)
+
     @pytest.mark.skip_less_device(2)
     @pytest.mark.timeout(14400)
     @pytest.mark.parametrize("overlap_scheduler", [True, False],
@@ -15,6 +15,7 @@
 import copy
 import os
 import platform
+import random
 import re
 import socket
 import tempfile
@@ -1162,28 +1163,33 @@ def get_free_port_in_ci(max_attempts=100):
Get a free port in the range [CONTAINER_PORT_START, CONTAINER_PORT_START + CONTAINER_PORT_NUM - 1]
If CONTAINER_PORT_START and CONTAINER_PORT_NUM are not set or all ports are already in use, fallback to get_free_port
"""
global PORTS_IN_USE

container_port_start = int(os.environ.get("CONTAINER_PORT_START", -1))
container_port_num = int(os.environ.get("CONTAINER_PORT_NUM", -1))
if container_port_start != -1 and container_port_num != -1:
for i in range(container_port_num):
port = container_port_start + i
if port in PORTS_IN_USE:
continue
available_ports = [
port for port in range(container_port_start, container_port_start +
container_port_num)
if port not in PORTS_IN_USE
]

for _ in range(len(available_ports)):
# Get a random port from the available ports
port = random.choice(available_ports)

# Check if the port is free
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
try:
s.bind(("localhost", port))

# Port is free, add it to the set of used ports
PORTS_IN_USE.add(port)
return port
except OSError:
# Port is not free, try the next port
available_ports.remove(port)
continue

# No port found in the range, try to get a random free port from the system
for i in range(max_attempts):
for _ in range(max_attempts):
port = get_free_port()
if port not in PORTS_IN_USE:
PORTS_IN_USE.add(port)
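The updated `get_free_port_in_ci` samples the container port range at random instead of scanning it sequentially, which should reduce collisions when several test processes race for ports at the same time. A self-contained sketch of the same idea under the CONTAINER_PORT_START/CONTAINER_PORT_NUM convention (PORTS_IN_USE here is a stand-in for the module-level registry, and the fallback to an OS-assigned port is left to the caller):

```python
# Sketch only: mirrors the random-selection strategy with stdlib code.
import os
import random
import socket

PORTS_IN_USE: set[int] = set()  # stand-in for the module-level registry

def pick_free_port_in_range() -> int | None:
    """Randomly probe CONTAINER_PORT_START .. CONTAINER_PORT_START+CONTAINER_PORT_NUM-1."""
    start = int(os.environ.get("CONTAINER_PORT_START", -1))
    num = int(os.environ.get("CONTAINER_PORT_NUM", -1))
    if start == -1 or num == -1:
        return None  # range not configured; caller falls back to an OS-assigned port
    candidates = [p for p in range(start, start + num) if p not in PORTS_IN_USE]
    while candidates:
        port = random.choice(candidates)  # random pick instead of a sequential scan
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
            try:
                s.bind(("localhost", port))
                PORTS_IN_USE.add(port)
                return port
            except OSError:
                candidates.remove(port)  # taken by another process; try another
    return None
```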
@@ -430,6 +430,8 @@ accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_eagle3_tp8[
 accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_eagle3_tp8[eagle3_one_model=False-torch_compile=True]
 accuracy/test_llm_api_pytorch.py::TestMistral7B::test_auto_dtype
 accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_auto_dtype
+accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_fp8_vswa_reuse
+accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_fp8_guided_decoding_vswa_reuse[xgrammar]
 accuracy/test_llm_api_pytorch.py::TestGemma3_27BInstruct::test_fp8_prequantized
 accuracy/test_llm_api_pytorch.py::TestGemma3_27BInstruct::test_auto_dtype
 accuracy/test_llm_api_pytorch.py::TestMistralSmall24B::test_auto_dtype
@@ -613,6 +615,10 @@ accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[triton-one_model
 accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[triton-one_model-no_overlap_scheduler]
 accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[triton-two_model-overlap_scheduler]
 accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[triton-two_model-no_overlap_scheduler]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_vswa_reuse_4gpus[one_model]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_vswa_reuse_4gpus[two_model]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_guided_decoding_4gpus[one_model]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_guided_decoding_4gpus[two_model]
 accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_auto_dtype[False-False-False]
 accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_auto_dtype[True-True-True]
 accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ngram
@@ -151,6 +151,9 @@ accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-cutlass-auto]
 accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-cutlass-auto]
 accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4a16[dp4-auto]
 accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_chunked_prefill[cutlass-auto]
+accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm]
+accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm_eagle3]
+accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus_sm120[throughput_tp8]
 accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[cutlass-one_model-overlap_scheduler]
 accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[cutlass-one_model-no_overlap_scheduler]
 accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[cutlass-two_model-overlap_scheduler]
@@ -204,6 +207,18 @@ accuracy/test_disaggregated_serving.py::TestLlama4ScoutInstruct::test_auto_dtype
 accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_auto_dtype[False]
 accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_auto_dtype[True]

+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[trtllm-one_model-overlap_scheduler]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[trtllm-one_model-no_overlap_scheduler]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[trtllm-two_model-overlap_scheduler]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[trtllm-two_model-no_overlap_scheduler]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[triton-one_model-overlap_scheduler]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[triton-one_model-no_overlap_scheduler]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[triton-two_model-overlap_scheduler]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[triton-two_model-no_overlap_scheduler]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_vswa_reuse_4gpus[one_model]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_vswa_reuse_4gpus[two_model]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_guided_decoding_4gpus[one_model]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_guided_decoding_4gpus[two_model]
 test_e2e.py::test_ptp_quickstart_advanced_mixed_precision
 test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-8B-BF16-llama-3.1-model/Meta-Llama-3.1-8B]
 test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-8B-FP8-llama-3.1-model/Llama-3.1-8B-Instruct-FP8]
@@ -31,11 +31,9 @@ l0_dgx_b300:
 - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_4gpus[pp4-fp8kv=False-attn_backend=TRTLLM-torch_compile=False]
 - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp4-mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True]
 - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp4-mtp_nextn=2-attention_dp=False-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
-- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[ep4-mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
 - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[ep4-mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=True]
 - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2-mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True]
 - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2-mtp_nextn=2-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True]
-- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2-mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
 - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[pp4-mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True]
 - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[pp4-mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
 - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-tp4-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True]
@@ -44,15 +42,12 @@ l0_dgx_b300:
 - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-tp2pp2-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
 - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-pp4-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
 - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-pp4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=True]
-- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=TRTLLM-mtp_nextn=0-tp4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
 - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=TRTLLM-mtp_nextn=0-ep4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
 - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-tp4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
 - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-ep4-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
 - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-tp2pp2-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
-- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-tp2pp2-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
 - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-pp4-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
 - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=TRTLLM-mtp_nextn=2-ep4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
-- accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[tep4_latency_moe_trtllm-torch_compile=False]
 - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[tep4_latency_moe_trtllm-torch_compile=True]
 - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[tep4_latency_moe_cutlass-torch_compile=True]
 - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[dep4_latency_moe_trtllm-torch_compile=False]
@@ -61,11 +56,9 @@ l0_dgx_b300:
 - disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_ucx[DeepSeek-V3-Lite-fp8]
 - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-cutlass-fp8]
 - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-cutlass-auto]
-- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-trtllm-auto]
 - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-triton-auto]
 - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4a16[dp4-auto]
 - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4a16[dp4-fp8]
-- disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_nixl[DeepSeek-V3-Lite-fp8]
 - disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[DeepSeek-V3-Lite-bf16]
 - disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[llama-v3-8b-hf]
 - disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[llama-3.1-8b-instruct-hf-fp8]
@@ -75,3 +68,25 @@ l0_dgx_b300:
 - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_tp4] TIMEOUT (180)
 - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_pp4_mtp] TIMEOUT (180)
 # ------------- AutoDeploy tests ---------------
+- condition:
+    ranges:
+      system_gpu_count:
+        gte: 4
+        lte: 4
+    wildcards:
+      gpu:
+        - '*gb110*'
+        - '*b300*'
+      linux_distribution_name: ubuntu*
+      cpu: x86_64
+    terms:
+      stage: pre_merge
+      backend: pytorch
+  tests:
+    - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[ep4-mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
+    - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2-mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
+    - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[tep4_latency_moe_trtllm-torch_compile=False]
+    - disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_nixl[DeepSeek-V3-Lite-fp8]
+    - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-trtllm-auto]
+    - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=TRTLLM-mtp_nextn=0-tp4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
+    - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-tp2pp2-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
@@ -345,7 +345,6 @@ accuracy/test_llm_api_pytorch_multimodal.py::TestNemotron_Nano_12B_V2_VL::test_a
 accuracy/test_llm_api_pytorch.py::TestNemotronNas::test_auto_dtype_tp8 SKIP (https://nvbugs/5673527)
 unittest/_torch/auto_deploy/unit/multigpu/test_ad_build_small_multi.py::test_build_ad[meta-llama/Meta-Llama-3.1-8B-Instruct-llm_extra_args0-2] SKIP (https://nvbugs/5680755)
 full:H100_PCIe/unittest/llmapi/test_llm_pytorch.py::test_llama_7b_multi_lora_evict_and_reload_lora_gpu_cache SKIP (https://nvbugs/5682551)
-unittest/_torch/speculative/test_eagle3.py::test_qwen3_eagle3[True-True-True-True] SKIP (https://nvbugspro.nvidia.com/bug/5749988)
 accuracy/test_llm_api_pytorch.py::TestSeedOss_36B::test_auto_dtype SKIP (https://nvbugs/5612438)
 accuracy/test_llm_api_autodeploy.py::TestNemotronH::test_auto_dtype[True] SKIP (https://nvbugs/5688721)
 accuracy/test_llm_api_autodeploy.py::TestLlama3_1_8B::test_auto_dtype[False-2] SKIP (https://nvbugs/5769712)
@@ -419,7 +418,6 @@ accuracy/test_cli_flow.py::TestPhi3_5MiniInstruct::test_auto_dtype SKIP (https:/
 unittest/_torch/auto_deploy/unit/singlegpu/models/test_llama4_vlm_patch.py::test_build_run_llama4_vlm SKIP (https://nvbugs/5747878)
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus_online_eplb[fp8kv=True-moe_backend=TRTLLM] SKIP (https://nvbugs/5740377)
 cpp/test_multi_gpu.py::TestDisagg::test_symmetric_executor[gpt-2proc-mpi_kvcache-90] SKIP (https://nvbugs/5755941)
-accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_guided_decoding[xgrammar-mtp_nextn=0] SKIP (https://nvbugs/5748600)
 examples/test_bert.py::test_llm_bert_general[compare_hf-enable_remove_input_padding-use_attention_plugin-enable_context_fmha-tp:1-pp:1-float16-BertForQuestionAnswering-bert/bert-base-cased-squad2] SKIP (https://nvbugs/5608979)
 examples/test_bert.py::test_llm_bert_general[compare_hf-enable_remove_input_padding-use_attention_plugin-enable_context_fmha-tp:1-pp:1-float16-RobertaForQuestionAnswering-bert/roberta-base-squad2] SKIP (https://nvbugs/5608979)
 examples/test_bert.py::test_llm_bert_general[compare_hf-disable_remove_input_padding-use_attention_plugin-disable_context_fmha-tp:2-pp:1-float16-BertForQuestionAnswering-bert/bert-base-cased-squad2] SKIP (https://nvbugs/5608979)
@@ -471,15 +469,12 @@ accuracy/test_llm_api_pytorch_multimodal.py::TestMistralLarge3_675B::test_nvfp4_
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-trtllm-fp8] SKIP (https://nvbugs/5772396)
full:sm100/accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-trtllm-auto] SKIP (https://nvbugs/5772396)
accuracy/test_llm_api_pytorch.py::TestGLM4_6::test_nvfp4_2_model_mtp[2model_trtllm] SKIP (https://nvbugs/5772360)
accuracy/test_llm_api_pytorch.py::TestMistralSmall24B::test_fp8 SKIP (https://nvbugs/5772361)
accuracy/test_llm_api_pytorch.py::TestGLM4_6::test_nvfp4_2_model_mtp[2model] SKIP (https://nvbugs/5772993)
test_e2e.py::test_ptp_quickstart_multimodal_chunked_prefill[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-0.6-image] SKIP (https://nvbugs/5772363)
accuracy/test_llm_api.py::TestLlama3_1_8BInstruct::test_gather_generation_logits_cuda_graph SKIP (https://nvbugs/5772995)
test_e2e.py::test_eagle3_output_consistency_4gpus[Qwen3/Qwen3-30B-A3B-Qwen3/Qwen3-30B-eagle3] SKIP (https://nvbugs/5685010)
full:sm89/accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_tp_pp_symmetric[GSM8K-tp2pp2] SKIP (https://nvbugs/5773047)
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[enable_configurable_moe-dp4-trtllm-fp8] SKIP (https://nvbugs/5773201)
unittest/_torch/auto_deploy/unit/multigpu/transformations/library/test_tp_sharding.py::test_sharding[GQA_Block-torch_dist_all_reduce-True-False-2] SKIP (https://nvbugs/5766982)
test_e2e.py::test_ptp_quickstart_multimodal_kv_cache_reuse[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-0.6-image] SKIP (https://nvbugs/5773195)
accuracy/test_cli_flow.py::TestVicuna7B::test_eagle_2[cuda_graph=True-chunked_context=True] SKIP (https://nvbugs/5773185)
accuracy/test_cli_flow.py::TestVicuna7B::test_eagle_2[cuda_graph=True-chunked_context=False] SKIP (https://nvbugs/5773185)
accuracy/test_disaggregated_serving.py::TestGPTOSS::test_auto_dtype[True] SKIP (https://nvbugs/5596343)
@@ -504,3 +499,18 @@ unittest/_torch/attention/test_flashinfer_star_attn.py::TestStarAttention::test_
 unittest/_torch/ray_orchestrator/multi_gpu/test_ops.py::test_reducescatter_pg_op[var_len:True-seqlen:16-hidden:128] SKIP (https://nvbugs/5781383)
 cpp/test_e2e.py::test_model[-mamba-86] SKIP (https://nvbugs/5781665)
 unittest/llmapi/test_llm_multi_gpu_pytorch.py::test_tinyllama_logits_processor_tp2pp2 SKIP (https://nvbugs/5781731)
+accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_tp_pp_symmetric[GSM8K-tp2pp2] SKIP (https://nvbugs/5756008)
+accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_guided_decoding_4gpus[llguidance] SKIP (https://nvbugs/5756008)
+accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_4gpus[tp2pp2-fp8kv=False-attn_backend=TRTLLM-torch_compile=False] SKIP (https://nvbugs/5784526)
+unittest/_torch/modules/test_fused_moe.py::test_fused_moe_multi_gpu[1-CUTLASS] SKIP (https://nvbugs/5784543)
+accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_4gpus[tp4-fp8kv=False-attn_backend=FLASHINFER-torch_compile=False] SKIP (https://nvbugs/5707359)
+accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[GSM8K-gen_tp=1-ctx_pp=2] SKIP (https://nvbugs/5673559)
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[ep4-mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/5701445)
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[pp4-mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5740075)
+accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_guided_decoding[llguidance-mtp_nextn=0] SKIP (https://nvbugs/5748600)
+unittest/_torch/ray_orchestrator/multi_gpu/test_multi_instance.py::test_multi_instance[tp2_2instances] SKIP (https://nvbugs/5784566)
+disaggregated/test_auto_scaling.py::test_worker_restart[etcd-round_robin] SKIP (https://nvbugs/5776445)
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_vswa_reuse_4gpus[one_model] SKIP (https://nvbugs/5756028)
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_vswa_reuse_4gpus[two_model] SKIP (https://nvbugs/5756028)
+accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_fp8[latency-torch_compile=False] SKIP (https://nvbugs/5785206)
+examples/test_gpt.py::test_llm_gpt2_parallel_embedding_2gpu[float16-0] SKIP (https://nvbugs/5784518)
@@ -780,117 +780,5 @@ def test_eagle3_cdl_sampling(disable_overlap_scheduler: bool):
     llm_spec.shutdown()


-@pytest.mark.parametrize(
-    "enable_block_reuse,use_one_model,enable_chunked_prefill,fp8_target", [
-        [True, True, True, True],
-    ])
-@pytest.mark.high_cuda_memory
-def test_qwen3_eagle3(enable_block_reuse: bool, use_one_model: bool,
-                      enable_chunked_prefill: bool, fp8_target: bool):
-    # Eagle3 one model works with overlap scheduler and block reuse.
-    total_mem_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
-    if total_mem_gb < 35:
-        pytest.skip("Not enough memory to load target + draft model")
-
-    use_cuda_graph = True
-    attn_backend = "TRTLLM"
-    disable_overlap_scheduler = False
-    use_chain_drafter = True
-    multi_batch = False
-    attention_dp = False
-
-    models_path = llm_models_root()
-    eagle_model_dir = f"{models_path}/Zhi-Create-Qwen3-32B-Eagle3"
-    target_model_dir = f"{models_path}/Qwen3/Qwen3-32B"
-    if fp8_target:
-        target_model_dir = f"{models_path}/Qwen3/Qwen3-32B-FP8/"
-
-    # bs > 1 gives non-deterministic when doing IFB. There are slight chances
-    # that ref and spec does not match 100%
-    max_batch_size = 4 if multi_batch else 1
-    max_draft_len = 3
-    kv_cache_config = KvCacheConfig(enable_block_reuse=enable_block_reuse,
-                                    max_tokens=8192)
-    if fp8_target:
-        kv_cache_config.dtype = 'fp8'
-    cuda_graph_config = CudaGraphConfig(
-        batch_sizes=[i for i in range(1, max_batch_size +
-                                      1)]) if use_cuda_graph else None
-
-    llm_common_config = dict(
-        model=target_model_dir,
-        attn_backend=attn_backend,
-        disable_overlap_scheduler=disable_overlap_scheduler,
-        cuda_graph_config=cuda_graph_config,
-        max_batch_size=max_batch_size,
-        kv_cache_config=kv_cache_config,
-        enable_attention_dp=attention_dp,
-        max_seq_len=8192,
-        enable_chunked_prefill=enable_chunked_prefill,
-    )
-    if enable_chunked_prefill:
-        # Use a small max_num_tokens so that the chunked prefill path gets exercised.
-        llm_common_config['max_num_tokens'] = 64
-
-    spec_config = EagleDecodingConfig(
-        max_draft_len=max_draft_len,
-        speculative_model_dir=eagle_model_dir,
-        eagle3_one_model=use_one_model,
-    )
-    spec_config._allow_chain_drafter = use_chain_drafter
-
-    # Create the LLM instance
-    llm_spec = LLM(**llm_common_config, speculative_config=spec_config)
-
-    # Acceptance rate tests
-    if enable_chunked_prefill:
-        # Use a long prompt for chunked prefill tests.
-        prompts = [
-            "The capital of France is a city of romance, art, fashion, and cuisine. Paris is a must-visit destination for anyone who loves history, architecture, and culture. From the iconic Eiffel Tower to the world-famous Louvre Museum, Paris has something to offer for every interest and age.\nThe city is divided into 20 arrondissements, each with its own unique character and charm. The Latin Quarter is a popular area for students and young travelers, while the Champs-Élysées is a hub for shopping and dining. The Montmartre neighborhood is famous for its bohemian vibe and stunning views of the city.\nParis is also known for its beautiful parks and gardens, such as the Luxembourg Gardens and the Tuileries Garden. The city has a rich history, with landmarks like the Notre-Dame Cathedral and the Arc de Triomphe. Visitors can also explore the city's many museums, including the Musée d'Orsay and the Musée Rodin.\nIn addition to its cultural and historical attractions, Paris is also a great destination for foodies. The city is famous for its cuisine, including croissants, baguettes, and cheese. Visitors can sample the city's famous dishes at one of the many restaurants, cafes, and "
-        ]
-        tok_ids = [llm_spec.tokenizer.encode(prompts[0])]
-    else:
-        prompts = [
-            "The capital of France is",
-            "The president of the United States is",
-        ]
-        tok_ids = [llm_spec.tokenizer.encode("The future of AI is")]
-        if multi_batch:
-            tok_ids.append(llm_spec.tokenizer.encode(prompts))
-
-    sampling_params = SamplingParams(max_tokens=128, temperature=0)
-    for i in range(len(tok_ids)):
-        num_tokens = 0
-        num_drafted = 0
-        num_accepted = 0
-
-        for output in llm_spec.generate_async(tok_ids[i],
-                                              sampling_params,
-                                              streaming=True):
-            new_tokens = output.outputs[0].token_ids
-            num_drafted += max_draft_len
-            num_accepted += len(new_tokens) - num_tokens - 1
-            num_tokens = len(new_tokens)
-
-        accept_rate = num_accepted / num_drafted
-        assert accept_rate > 0.10
-
-    # Output tests
-    sampling_params = SamplingParams(max_tokens=10, temperature=0)
-
-    results_spec = llm_spec.generate(prompts, sampling_params)
-    generated_text_spec = [result.outputs[0].text for result in results_spec]
-    llm_spec.shutdown()
-
-    llm_ref = LLM(**llm_common_config)
-    results_ref = llm_ref.generate(prompts, sampling_params)
-    generated_text_ref = [result.outputs[0].text for result in results_ref]
-    llm_ref.shutdown()
-
-    for text_spec, text_ref in zip(generated_text_spec, generated_text_ref):
-        # The spec decode algorithm currently guarantees identical results
-        assert text_spec == text_ref
-
-
 if __name__ == "__main__":
     unittest.main()
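For reference, the removed test_qwen3_eagle3 estimated the draft acceptance rate from streaming output: each step drafts max_draft_len tokens, and the accepted count per step is the number of new tokens minus the single token the target model always emits itself. A standalone sketch of that bookkeeping with made-up per-step token counts:

```python
# Illustrative only: the step sizes below are invented to show the arithmetic.
max_draft_len = 3
steps = [4, 2, 4, 1]  # hypothetical number of new tokens produced at each streaming step

num_tokens = num_drafted = num_accepted = 0
for new_total in [sum(steps[:i + 1]) for i in range(len(steps))]:
    num_drafted += max_draft_len                 # every step proposes max_draft_len drafts
    num_accepted += new_total - num_tokens - 1   # minus the one token the target always adds
    num_tokens = new_total

accept_rate = num_accepted / num_drafted         # (3 + 1 + 3 + 0) / 12 ≈ 0.58 for these steps
assert accept_rate > 0.10
```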