[TRTLLM-8991][test] Add Llama 3.3 70B model with different performance config (#8753)

Signed-off-by: yufeiwu-nv <230315618+yufeiwu-nv@users.noreply.github.com>
Co-authored-by: Larry Xu <197874197+LarryXFly@users.noreply.github.com>
yufeiwu-nv 2025-11-03 13:34:06 +08:00 committed by GitHub
parent f57dc01e6f
commit b4d17d1a4c
4 changed files with 58 additions and 16 deletions


@@ -105,7 +105,7 @@ def get_llm(runtime_config: RuntimeConfig, kwargs: dict):
     """
     llm_cls = LLM
-    if runtime_config.backend != "tensorrt":
+    if runtime_config.backend != None:
         ignore_trt_only_args(kwargs, runtime_config.backend)
     if runtime_config.backend == 'pytorch':
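For reference, the gate above changes when TRT-only kwargs get stripped: previously they were dropped for every backend other than "tensorrt" (including the default backend of None); now they are dropped only when a backend is explicitly set. A minimal sketch of the old vs. new condition, where should_drop_trt_only is a hypothetical stand-in for the call site of ignore_trt_only_args:

def should_drop_trt_only(backend, new_behavior=True):
    """Hypothetical helper mirroring the gate around ignore_trt_only_args."""
    if new_behavior:
        # New check: strip TRT-only kwargs whenever a backend is explicitly set.
        return backend != None  # noqa: E711 -- mirrors the committed comparison
    # Old check: strip them for anything other than "tensorrt", which also
    # stripped them when backend was left as None.
    return backend != "tensorrt"

assert should_drop_trt_only(None, new_behavior=False) is True   # old: stripped
assert should_drop_trt_only(None, new_behavior=True) is False   # new: kept
assert should_drop_trt_only("pytorch") is True                   # unchanged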


@@ -14,7 +14,7 @@
 # limitations under the License.
 # -*- coding: utf-8 -*-
 """
-Model pytorch yaml config for trtllm-bench perf tests
+Model pytorch/TRT yaml config for trtllm-bench perf tests
 """
@@ -36,12 +36,18 @@ def get_model_yaml_config(model_label: str,
     Returns:
         dict: yaml config
     """
-    base_config = {
-        'print_iter_log': True,
-        'cuda_graph_config': {
-            'enable_padding': True,
-        },
-    }
+    if 'pytorch' in model_label:
+        # Pytorch backend config
+        base_config = {
+            'print_iter_log': True,
+            'cuda_graph_config': {
+                'enable_padding': True,
+            },
+        }
+    else:
+        # TRT backend config
+        base_config = {}
     if 'kv_cache_dtype' in model_label:
         base_config.update({
             'kv_cache_dtype':
@@ -241,6 +247,19 @@ def get_model_yaml_config(model_label: str,
             'config': {
                 'enable_chunked_prefill': True,
             }
         },
+        # Llama-v3.3 models with xgrammar guided decoding
+        {
+            'patterns': [
+                "llama_v3.3_70b_instruct_fp8-bench-float8-maxbs:512-maxnt:2048-input_output_len:500,2000-reqs:400-con:200-gpus:8-extra"
+            ],
+            'config': {
+                'extended_runtime_perf_knob_config': {
+                    'cuda_graph_cache_size': 1.0,
+                    'cuda_graph_mode': True,
+                },
+                'guided_decoding_backend': 'xgrammar'
+            }
+        }
     ]
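For the new "-extra" TRT test label there is no 'pytorch' or 'kv_cache_dtype' component, so base_config starts out empty and the merged result is just this pattern's config. A rough sketch of the YAML that would then be written, assuming no other pattern matches:

import yaml

config = {}
config.update({
    'extended_runtime_perf_knob_config': {
        'cuda_graph_cache_size': 1.0,
        'cuda_graph_mode': True,
    },
    'guided_decoding_backend': 'xgrammar',
})
print(yaml.dump(config, default_flow_style=False))
# extended_runtime_perf_knob_config:
#   cuda_graph_cache_size: 1.0
#   cuda_graph_mode: true
# guided_decoding_backend: xgrammar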
@@ -251,7 +270,8 @@ def get_model_yaml_config(model_label: str,
             patterns = [patterns]
         for pattern in patterns:
             if pattern in model_label.lower():
-                recursive_update(base_config, pattern_config['config'])
+                if pattern_config.get('config'):
+                    recursive_update(base_config, pattern_config['config'])
                 break  # Stop checking other patterns for this config once we find a match
     # lora-specific change for pytorch
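recursive_update, used in the loop above, is defined elsewhere in this module; a minimal sketch of the nested-merge semantics the loop relies on (not the actual implementation):

def recursive_update(base: dict, update: dict) -> dict:
    # Merge `update` into `base`, descending into nested dicts instead of
    # overwriting them wholesale.
    for key, value in update.items():
        if isinstance(value, dict) and isinstance(base.get(key), dict):
            recursive_update(base[key], value)
        else:
            base[key] = value
    return base

base = {'cuda_graph_config': {'enable_padding': True}}
recursive_update(base, {'cuda_graph_config': {'max_batch_size': 512}})
assert base == {'cuda_graph_config': {'enable_padding': True,
                                      'max_batch_size': 512}}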


@@ -57,7 +57,6 @@ MODEL_PATH_DICT = {
     "modelopt-hf-model-hub/Llama-3.3-70B-Instruct-fp8",
     "llama_v3.3_70b_instruct_fp4":
     "modelopt-hf-model-hub/Llama-3.3-70B-Instruct-fp4",
-    "llama_v3.3_70b_instruct": "llama-3.3-models/Llama-3.3-70B-Instruct",
     "llama_v3.1_405b_instruct_fp8":
     "llama-3.1-model/Llama-3.1-405B-Instruct-FP8",
     "llama_v3.1_405b_instruct_fp4":
@@ -783,6 +782,8 @@ class PerfTestConfig:
                  tp_size: int = 1,
                  pp_size: int = 1,
                  num_gpus: int = 1,
+                 # only for torch-backend currently
+                 extra: bool = False,
                  # _autodeploy backend specific parameters
                  ad_compile_backend: str = "torch-opt",
                  free_mem_ratio: float = 0.9,
@@ -841,6 +842,8 @@ class PerfTestConfig:
         self.pp_size = pp_size
         # Number of GPUs.
         self.num_gpus = num_gpus
+        # Extra flag to enable pytorch_model_config reading for TRT backend
+        self.extra = extra
         # _autodeploy backend specific parameters
         self.ad_compile_backend = ad_compile_backend
         self.free_mem_ratio = free_mem_ratio
@@ -1016,6 +1019,10 @@ class PerfTestConfig:
         if self.num_gpus > 1:
             entries.append(f"gpus:{self.num_gpus}")
+        # Add extra flag for llm-api-config.yml.
+        if self.extra:
+            entries.append("extra")
         # Concatenate labels with "-".
         return "-".join(entries)
@@ -1180,6 +1187,11 @@ class PerfTestConfig:
         self.num_gpus = 1 if not labels[0].startswith("gpus:") else int(
             labels.pop(0).replace("gpus:", ""))
+        if len(labels) > 0:
+            self.extra = True if labels[0] == "extra" else False
+            if self.extra:
+                labels.pop(0)
         assert len(
             labels
         ) == 0, f"Invalid test name! Some labels cannot be parsed: {labels}"
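The "extra" token is appended last by to_string and consumed last when the test name is parsed back, so it simply rides at the end of labels such as ...-gpus:8-extra. A simplified round-trip sketch, with the other label components elided:

def append_extra(entries, extra):
    if extra:
        entries.append("extra")
    return "-".join(entries)

def parse_extra(labels):
    # Mirror of the committed parsing: consume a trailing "extra" token if present.
    extra = len(labels) > 0 and labels[0] == "extra"
    if extra:
        labels.pop(0)
    return extra, labels

name = append_extra(["gpus:8"], extra=True)   # "gpus:8-extra"
labels = name.split("-")[1:]                  # labels left after "gpus:8"
extra, rest = parse_extra(labels)
assert extra is True and rest == []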
@@ -1644,18 +1656,26 @@ class MultiMetricPerfTest(AbstractPerfScriptTestClass):
             benchmark_cmd += [f"--pp={self._config.pp_size}"]
         if self._config.streaming == "streaming":
             benchmark_cmd += [f"--streaming"]
-        #use default yaml config
-        if self._config.backend == "pytorch":
+        #Add extra-llm-api-config.yml for pytorch backend and tensorrt backend with extra flag
+        if self._config.backend == "pytorch" or (self._config.backend == ""
+                                                 and self._config.extra):
             pytorch_config_path = os.path.join(engine_dir,
                                                "extra-llm-api-config.yml")
             if not os.path.exists(pytorch_config_path):
                 os.makedirs(os.path.dirname(pytorch_config_path), exist_ok=True)
             config = get_model_yaml_config(self._config.to_string(),
                                            lora_dirs=self.lora_dirs)
-            print_info(f"pytorch model config: {config}")
-            with open(pytorch_config_path, 'w') as f:
-                yaml.dump(config, f, default_flow_style=False)
-            benchmark_cmd += [f"--extra_llm_api_options={pytorch_config_path}"]
+            if config:
+                print_info(f"pytorch/TRT model config: {config}")
+                with open(pytorch_config_path, 'w') as f:
+                    yaml.dump(config, f, default_flow_style=False)
+                benchmark_cmd += [
+                    f"--extra_llm_api_options={pytorch_config_path}"
+                ]
+                # If guided_decoding_backend is set, we need to initialize tokenizer
+                if config.get('guided_decoding_backend') is not None:
+                    benchmark_cmd += ["--no_skip_tokenizer_init"]
         elif self._config.backend == "_autodeploy":
             autodeploy_config_path = os.path.join(engine_dir,
                                                   "extra_llm_api_options.yaml")


@@ -392,6 +392,8 @@ llm_perf_nim:
   #trt backend
   - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-float8-input_output_len:128,128-gpus:8]
   - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-float8-input_output_len:512,32-gpus:8]
+  - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-float8-maxbs:512-maxnt:2048-input_output_len:500,2000-reqs:400-con:200-gpus:8]
+  - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-float8-maxbs:512-maxnt:2048-input_output_len:500,2000-reqs:400-con:200-gpus:8-extra]
   - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-streaming-float8-maxbs:16-input_output_len:512,32-gpus:8]
   - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct-bench-bfloat16-maxbs:16-maxnt:5000-input_output_len:5000,500-quant:fp8-reqs:8-con:1-gpus:8]
   - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct-bench-bfloat16-maxbs:16-maxnt:5000-input_output_len:5000,500-quant:fp8-reqs:64-con:250-gpus:8]