Mirror of https://github.com/NVIDIA/TensorRT-LLM.git, synced 2026-01-13 22:18:36 +08:00
[TRTLLM-9896][test] add vswa test cases coverage (#10146)
Signed-off-by: Ivy Zhang <25222398+crazydemo@users.noreply.github.com>
This commit is contained in:
parent 5108a69fc0
commit 1e828587e5
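For context, the new coverage exercises the variable sliding-window attention (VSWA) KV-cache path: a per-layer max_attention_window list combined with enable_block_reuse=True. Below is a minimal standalone sketch of that pattern with the LLM API, assuming the import paths shown and a placeholder checkpoint directory (the tests themselves resolve real model paths via llm_models_root()):

from tensorrt_llm import LLM
from tensorrt_llm.llmapi import KvCacheConfig

# VSWA: one attention-window size per layer in the repeating pattern; the
# small entries are the sliding-window layers, the large entry is the
# full-attention layer, and block reuse is enabled on top of it.
kv_cache_config = KvCacheConfig(
    enable_block_reuse=True,
    max_attention_window=[512, 512, 512, 512, 512, 32768],
)

# "/path/to/gemma-3-1b-it-fp8" is a placeholder for a local checkpoint.
with LLM("/path/to/gemma-3-1b-it-fp8", kv_cache_config=kv_cache_config) as llm:
    for output in llm.generate(["The capital of France is"]):
        print(output.outputs[0].text)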
@@ -8,3 +8,14 @@ deepseek-ai/DeepSeek-V3-Lite:
  - accuracy: 77.00
  - spec_dec_algo: MTP
    accuracy: 77.00
google/gemma-3-1b-it:
  - quant_algo: FP8
    kv_cache_quant_algo: FP8
    accuracy: 61.00
GPT-OSS/120B-MXFP4:
  - quant_algo: W4A16_MXFP4
    spec_dec_algo: Eagle
    accuracy: 62.00
  - quant_algo: W4A8_MXFP4_MXFP8
    spec_dec_algo: Eagle
    accuracy: 62.00

@@ -1105,6 +1105,37 @@ class TestGemma3_1BInstruct(LlmapiAccuracyTestHarness):
        task = MMLU(self.MODEL_NAME)
        task.evaluate(llm)

    def test_fp8_vswa_reuse(self):
        # NOTE: Test with VSWA kv cache config.
        kv_cache_config = KvCacheConfig(
            enable_block_reuse=True,
            max_attention_window=[512, 512, 512, 512, 512, 32768],
        )
        prequantized_model_path = f"{llm_models_root()}/gemma/gemma-3-1b-it-fp8/"
        with LLM(prequantized_model_path,
                 kv_cache_config=kv_cache_config) as llm:
            task = GSM8K(self.MODEL_NAME)
            task.evaluate(llm)
            task = MMLU(self.MODEL_NAME)
            task.evaluate(llm)

    @pytest.mark.parametrize("backend", ["xgrammar"])
    def test_fp8_guided_decoding_vswa_reuse(self, backend: str, mocker):
        mocker.patch.dict(os.environ, {"TRTLLM_XGUIDANCE_LENIENT": "1"})
        prequantized_model_path = f"{llm_models_root()}/gemma/gemma-3-1b-it-fp8/"
        kv_cache_config = KvCacheConfig(
            enable_block_reuse=True,
            max_attention_window=[512, 512, 512, 512, 512, 32768],
        )
        cuda_graph_config = CudaGraphConfig(enable_padding=True)
        llm = LLM(prequantized_model_path,
                  guided_decoding_backend=backend,
                  kv_cache_config=kv_cache_config,
                  cuda_graph_config=cuda_graph_config)
        with llm:
            task = JsonModeEval(self.MODEL_NAME)
            task.evaluate(llm)

    def test_auto_dtype_vswa_without_reuse(self):
        # NOTE: Test with VSWA kv cache config.
        kv_cache_config = KvCacheConfig(

@@ -4461,6 +4492,114 @@ class TestGPTOSS(LlmapiAccuracyTestHarness):
                          sampling_params=sampling_params,
                          extra_evaluator_kwargs=extra_evaluator_kwargs)

    @pytest.mark.skip_less_device(4)
    @pytest.mark.parametrize("one_model", [True, False],
                             ids=["one_model", "two_model"])
    def test_eagle3_vswa_reuse_4gpus(self, one_model, mocker):
        MAX_OUTPUT_LEN = 128179
        MAX_INPUT_LEN = 32768

        mocker.patch.object(GSM8K, "MAX_OUTPUT_LEN", 8192)
        mocker.patch.dict(GSM8K.EVALUATE_KWARGS,
                          {"scores_filter": "exact_match,flexible-extract"})

        mocker.patch.object(GPQADiamond, "MAX_OUTPUT_LEN", MAX_OUTPUT_LEN)
        mocker.patch.object(GPQADiamond, "MAX_INPUT_LEN", MAX_INPUT_LEN)

        pytorch_config = dict(cuda_graph_config=CudaGraphConfig())
        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.4,
                                        dtype="auto",
                                        enable_block_reuse=True,
                                        max_attention_window=[128, 32768])

        eagle_model_dir = f"{llm_models_root()}/gpt_oss/gpt-oss-120b-Eagle3"
        draft_len = 3
        spec_config = EagleDecodingConfig(max_draft_len=draft_len,
                                          speculative_model_dir=eagle_model_dir,
                                          eagle3_one_model=one_model,
                                          allow_advanced_sampling=True)

        max_seq_len = MAX_INPUT_LEN + MAX_OUTPUT_LEN
        llm = LLM(self.MODEL_PATH,
                  tensor_parallel_size=4,
                  pipeline_parallel_size=1,
                  moe_expert_parallel_size=1,
                  kv_cache_config=kv_cache_config,
                  max_seq_len=max_seq_len,
                  speculative_config=spec_config,
                  **pytorch_config,
                  enable_attention_dp=False)

        with llm:
            model_name = "GPT-OSS/120B-MXFP4"

            # GSM8K
            task = GSM8K(model_name)
            task.evaluate(llm,
                          extra_evaluator_kwargs=self.extra_evaluator_kwargs)

            # GPQA Medium Reasoning
            task = GPQADiamond(model_name)

            chat_template_kwargs = dict(reasoning_effort="medium")
            extra_evaluator_kwargs = {
                **self.extra_evaluator_kwargs,
                "chat_template_kwargs": chat_template_kwargs
            }

            sampling_params = SamplingParams(
                temperature=1.0,
                top_p=1.0,
                max_tokens=MAX_OUTPUT_LEN,
                truncate_prompt_tokens=MAX_INPUT_LEN)

            task.evaluate(llm,
                          sampling_params=sampling_params,
                          extra_evaluator_kwargs=extra_evaluator_kwargs)

    @pytest.mark.skip_less_device(4)
    @pytest.mark.parametrize("one_model", [True, False],
                             ids=["one_model", "two_model"])
    def test_eagle3_guided_decoding_4gpus(self, one_model, mocker):
        MAX_OUTPUT_LEN = 128179
        MAX_INPUT_LEN = 32768

        mocker.patch.dict(os.environ, {"TRTLLM_XGUIDANCE_LENIENT": "1"})
        mocker.patch.object(GSM8K, "MAX_OUTPUT_LEN", 8192)
        mocker.patch.dict(GSM8K.EVALUATE_KWARGS,
                          {"scores_filter": "exact_match,flexible-extract"})

        mocker.patch.object(GPQADiamond, "MAX_OUTPUT_LEN", MAX_OUTPUT_LEN)
        mocker.patch.object(GPQADiamond, "MAX_INPUT_LEN", MAX_INPUT_LEN)

        pytorch_config = dict(cuda_graph_config=CudaGraphConfig())
        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.4,
                                        dtype="auto")

        eagle_model_dir = f"{llm_models_root()}/gpt_oss/gpt-oss-120b-Eagle3"
        draft_len = 3
        spec_config = EagleDecodingConfig(max_draft_len=draft_len,
                                          speculative_model_dir=eagle_model_dir,
                                          eagle3_one_model=one_model,
                                          allow_advanced_sampling=True)

        max_seq_len = MAX_INPUT_LEN + MAX_OUTPUT_LEN
        llm = LLM(self.MODEL_PATH,
                  tensor_parallel_size=4,
                  pipeline_parallel_size=1,
                  moe_expert_parallel_size=1,
                  guided_decoding_backend="xgrammar",
                  kv_cache_config=kv_cache_config,
                  max_seq_len=max_seq_len,
                  speculative_config=spec_config,
                  **pytorch_config,
                  enable_attention_dp=False)

        with llm:
            model_name = "GPT-OSS/120B-MXFP4"
            task = JsonModeEval(model_name)
            task.evaluate(llm)

    @pytest.mark.skip_less_device(2)
    @pytest.mark.timeout(14400)
    @pytest.mark.parametrize("overlap_scheduler", [True, False],

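As a usage note, the new GPT-OSS tests exercise Eagle3 speculative decoding with a VSWA KV cache (test_eagle3_vswa_reuse_4gpus) and with xgrammar guided decoding (test_eagle3_guided_decoding_4gpus). The following is a condensed, hedged sketch of the guided-decoding variant outside the test harness, assuming the import locations shown; the model directories are placeholders, and GuidedDecodingParams is assumed to be the LLM-API mechanism for requesting schema-constrained output:

from tensorrt_llm import LLM, SamplingParams
from tensorrt_llm.llmapi import EagleDecodingConfig, KvCacheConfig
from tensorrt_llm.sampling_params import GuidedDecodingParams

# Eagle3 draft model on top of the target model (paths are placeholders).
spec_config = EagleDecodingConfig(max_draft_len=3,
                                  speculative_model_dir="/path/to/gpt-oss-120b-Eagle3",
                                  eagle3_one_model=True)

llm = LLM("/path/to/gpt-oss-120b",
          tensor_parallel_size=4,
          guided_decoding_backend="xgrammar",
          kv_cache_config=KvCacheConfig(free_gpu_memory_fraction=0.4,
                                        dtype="auto"),
          speculative_config=spec_config)

# Constrain generation to a simple JSON schema via guided decoding.
schema = '{"type": "object", "properties": {"answer": {"type": "string"}}, "required": ["answer"]}'
params = SamplingParams(max_tokens=256,
                        guided_decoding=GuidedDecodingParams(json=schema))

with llm:
    for output in llm.generate(["Answer in JSON: what is 2 + 2?"], params):
        print(output.outputs[0].text)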
@@ -430,6 +430,8 @@ accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_eagle3_tp8[
accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_eagle3_tp8[eagle3_one_model=False-torch_compile=True]
accuracy/test_llm_api_pytorch.py::TestMistral7B::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_fp8_vswa_reuse
accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_fp8_guided_decoding_vswa_reuse[xgrammar]
accuracy/test_llm_api_pytorch.py::TestGemma3_27BInstruct::test_fp8_prequantized
accuracy/test_llm_api_pytorch.py::TestGemma3_27BInstruct::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestMistralSmall24B::test_auto_dtype

@@ -613,6 +615,10 @@ accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[triton-one_model
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[triton-one_model-no_overlap_scheduler]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[triton-two_model-overlap_scheduler]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[triton-two_model-no_overlap_scheduler]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_vswa_reuse_4gpus[one_model]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_vswa_reuse_4gpus[two_model]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_guided_decoding_4gpus[one_model]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_guided_decoding_4gpus[two_model]
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_auto_dtype[False-False-False]
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_auto_dtype[True-True-True]
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ngram

@@ -151,6 +151,9 @@ accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-cutlass-auto]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-cutlass-auto]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4a16[dp4-auto]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_chunked_prefill[cutlass-auto]
accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm]
accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm_eagle3]
accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus_sm120[throughput_tp8]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[cutlass-one_model-overlap_scheduler]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[cutlass-one_model-no_overlap_scheduler]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[cutlass-two_model-overlap_scheduler]

@@ -204,6 +207,18 @@ accuracy/test_disaggregated_serving.py::TestLlama4ScoutInstruct::test_auto_dtype
accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_auto_dtype[False]
accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_auto_dtype[True]

accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[trtllm-one_model-overlap_scheduler]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[trtllm-one_model-no_overlap_scheduler]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[trtllm-two_model-overlap_scheduler]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[trtllm-two_model-no_overlap_scheduler]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[triton-one_model-overlap_scheduler]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[triton-one_model-no_overlap_scheduler]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[triton-two_model-overlap_scheduler]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[triton-two_model-no_overlap_scheduler]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_vswa_reuse_4gpus[one_model]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_vswa_reuse_4gpus[two_model]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_guided_decoding_4gpus[one_model]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_guided_decoding_4gpus[two_model]
test_e2e.py::test_ptp_quickstart_advanced_mixed_precision
test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-8B-BF16-llama-3.1-model/Meta-Llama-3.1-8B]
test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-8B-FP8-llama-3.1-model/Llama-3.1-8B-Instruct-FP8]

@@ -508,3 +508,5 @@ unittest/llmapi/test_llm_multi_gpu_pytorch.py::test_tinyllama_logits_processor_t
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_tp_pp_symmetric[GSM8K-tp2pp2] SKIP (https://nvbugs/5756008)
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_guided_decoding_4gpus[llguidance] SKIP (https://nvbugs/5756008)
disaggregated/test_auto_scaling.py::test_worker_restart[etcd-round_robin] SKIP (https://nvbugs/5776445)
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_vswa_reuse_4gpus[one_model] SKIP (https://nvbugs/5756028)
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_vswa_reuse_4gpus[two_model] SKIP (https://nvbugs/5756028)