Mirror of https://github.com/NVIDIA/TensorRT-LLM.git (synced 2026-01-13 22:18:36 +08:00)

[TRTLLM-8991][test] Add Llama 3.3 70B model with different performance config (#8753)

Signed-off-by: yufeiwu-nv <230315618+yufeiwu-nv@users.noreply.github.com>
Co-authored-by: Larry Xu <197874197+LarryXFly@users.noreply.github.com>

This commit is contained in:
parent f57dc01e6f
commit b4d17d1a4c
@@ -105,7 +105,7 @@ def get_llm(runtime_config: RuntimeConfig, kwargs: dict):
     """
     llm_cls = LLM

-    if runtime_config.backend != "tensorrt":
+    if runtime_config.backend != None:
         ignore_trt_only_args(kwargs, runtime_config.backend)

     if runtime_config.backend == 'pytorch':
@@ -14,7 +14,7 @@
 # limitations under the License.
 # -*- coding: utf-8 -*-
 """
-Model pytorch yaml config for trtllm-bench perf tests
+Model pytorch/TRT yaml config for trtllm-bench perf tests
 """

@@ -36,12 +36,18 @@ def get_model_yaml_config(model_label: str,
     Returns:
         dict: yaml config
     """
-    base_config = {
-        'print_iter_log': True,
-        'cuda_graph_config': {
-            'enable_padding': True,
-        },
-    }
+    if 'pytorch' in model_label:
+        # Pytorch backend config
+        base_config = {
+            'print_iter_log': True,
+            'cuda_graph_config': {
+                'enable_padding': True,
+            },
+        }
+    else:
+        # TRT backend config
+        base_config = {}

     if 'kv_cache_dtype' in model_label:
         base_config.update({
             'kv_cache_dtype':
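For reference, a minimal sketch of how the new branch chooses the base config from the raw test label; pick_base_config is a hypothetical stand-in for the logic above, and the labels are made up for illustration:

# Hypothetical stand-in for the backend split added above (illustration only).
def pick_base_config(model_label: str) -> dict:
    if 'pytorch' in model_label:
        # Pytorch backend config
        return {
            'print_iter_log': True,
            'cuda_graph_config': {
                'enable_padding': True,
            },
        }
    # TRT backend config starts empty and is filled in by later overrides
    return {}

print(pick_base_config("llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-gpus:8"))
print(pick_base_config("llama_v3.3_70b_instruct_fp8-bench-float8-gpus:8"))  # -> {}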
@@ -241,6 +247,19 @@ def get_model_yaml_config(model_label: str,
             'config': {
                 'enable_chunked_prefill': True,
             }
         },
+        # Llama-v3.3 models with xgrammar guided decoding
+        {
+            'patterns': [
+                "llama_v3.3_70b_instruct_fp8-bench-float8-maxbs:512-maxnt:2048-input_output_len:500,2000-reqs:400-con:200-gpus:8-extra"
+            ],
+            'config': {
+                'extended_runtime_perf_knob_config': {
+                    'cuda_graph_cache_size': 1.0,
+                    'cuda_graph_mode': True,
+                },
+                'guided_decoding_backend': 'xgrammar'
+            }
+        }
     ]

@@ -251,7 +270,8 @@ def get_model_yaml_config(model_label: str,
             patterns = [patterns]
         for pattern in patterns:
             if pattern in model_label.lower():
-                recursive_update(base_config, pattern_config['config'])
+                if pattern_config.get('config'):
+                    recursive_update(base_config, pattern_config['config'])
                 break  # Stop checking other patterns for this config once we find a match

     # lora-specific change for pytorch
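The guard added above only merges a pattern's 'config' block when one is present. A minimal sketch of the merge this relies on, assuming recursive_update does a nested dict update; the real helper in this file may differ in details:

# Sketch of a recursive dict merge with the behavior the loop above relies on.
def recursive_update(base: dict, update: dict) -> dict:
    for key, value in update.items():
        if isinstance(value, dict) and isinstance(base.get(key), dict):
            recursive_update(base[key], value)
        else:
            base[key] = value
    return base

base_config = {}
xgrammar_override = {
    'extended_runtime_perf_knob_config': {
        'cuda_graph_cache_size': 1.0,
        'cuda_graph_mode': True,
    },
    'guided_decoding_backend': 'xgrammar',
}
recursive_update(base_config, xgrammar_override)
# base_config now carries the TRT perf knobs plus the xgrammar backend.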
@@ -57,7 +57,6 @@ MODEL_PATH_DICT = {
         "modelopt-hf-model-hub/Llama-3.3-70B-Instruct-fp8",
     "llama_v3.3_70b_instruct_fp4":
         "modelopt-hf-model-hub/Llama-3.3-70B-Instruct-fp4",
     "llama_v3.3_70b_instruct": "llama-3.3-models/Llama-3.3-70B-Instruct",
     "llama_v3.1_405b_instruct_fp8":
         "llama-3.1-model/Llama-3.1-405B-Instruct-FP8",
     "llama_v3.1_405b_instruct_fp4":
@@ -783,6 +782,8 @@ class PerfTestConfig:
         tp_size: int = 1,
         pp_size: int = 1,
         num_gpus: int = 1,
+        # only for torch-backend currently
+        extra: bool = False,
         # _autodeploy backend specific parameters
         ad_compile_backend: str = "torch-opt",
         free_mem_ratio: float = 0.9,
@@ -841,6 +842,8 @@ class PerfTestConfig:
         self.pp_size = pp_size
         # Number of GPUs.
         self.num_gpus = num_gpus
+        # Extra flag to enable pytorch_model_config reading for TRT backend
+        self.extra = extra
         # _autodeploy backend specific parameters
         self.ad_compile_backend = ad_compile_backend
         self.free_mem_ratio = free_mem_ratio
@@ -1016,6 +1019,10 @@ class PerfTestConfig:
         if self.num_gpus > 1:
             entries.append(f"gpus:{self.num_gpus}")

+        # Add extra flag for llm-api-config.yml.
+        if self.extra:
+            entries.append("extra")
+
         # Concatenate labels with "-".
         return "-".join(entries)
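With the new flag, to_string appends one more label after the GPU count. A small illustrative helper, not the actual method, showing the resulting suffix:

# Hypothetical sketch of how the trailing label entries are assembled;
# field names mirror the snippet above, the helper itself is illustrative.
def trailing_entries(num_gpus: int, extra: bool) -> str:
    entries = []
    if num_gpus > 1:
        entries.append(f"gpus:{num_gpus}")
    if extra:
        entries.append("extra")
    return "-".join(entries)

print(trailing_entries(8, True))   # -> "gpus:8-extra"
print(trailing_entries(8, False))  # -> "gpus:8"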
@@ -1180,6 +1187,11 @@ class PerfTestConfig:
         self.num_gpus = 1 if not labels[0].startswith("gpus:") else int(
             labels.pop(0).replace("gpus:", ""))

+        if len(labels) > 0:
+            self.extra = True if labels[0] == "extra" else False
+            if self.extra:
+                labels.pop(0)
+
         assert len(
             labels
         ) == 0, f"Invalid test name! Some labels cannot be parsed: {labels}"
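Going the other way, the parser now consumes an optional trailing "extra" token before asserting that nothing is left over. A standalone illustration of that branch, with a made-up label list:

# Illustrative parse of the optional trailing token, mirroring the added branch.
labels = ["gpus:8", "extra"]

num_gpus = 1 if not labels[0].startswith("gpus:") else int(
    labels.pop(0).replace("gpus:", ""))

extra = False
if len(labels) > 0:
    extra = labels[0] == "extra"
    if extra:
        labels.pop(0)

assert len(labels) == 0, f"Invalid test name! Some labels cannot be parsed: {labels}"
print(num_gpus, extra)  # -> 8 True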
@@ -1644,18 +1656,26 @@ class MultiMetricPerfTest(AbstractPerfScriptTestClass):
             benchmark_cmd += [f"--pp={self._config.pp_size}"]
         if self._config.streaming == "streaming":
             benchmark_cmd += [f"--streaming"]
-        #use default yaml config
-        if self._config.backend == "pytorch":
+
+        #Add extra-llm-api-config.yml for pytorch backend and tensorrt backend with extra flag
+        if self._config.backend == "pytorch" or (self._config.backend == ""
+                                                 and self._config.extra):
             pytorch_config_path = os.path.join(engine_dir,
                                                "extra-llm-api-config.yml")
             if not os.path.exists(pytorch_config_path):
                 os.makedirs(os.path.dirname(pytorch_config_path), exist_ok=True)
             config = get_model_yaml_config(self._config.to_string(),
                                            lora_dirs=self.lora_dirs)
-            print_info(f"pytorch model config: {config}")
-            with open(pytorch_config_path, 'w') as f:
-                yaml.dump(config, f, default_flow_style=False)
-            benchmark_cmd += [f"--extra_llm_api_options={pytorch_config_path}"]
+            if config:
+                print_info(f"pytorch/TRT model config: {config}")
+                with open(pytorch_config_path, 'w') as f:
+                    yaml.dump(config, f, default_flow_style=False)
+                benchmark_cmd += [
+                    f"--extra_llm_api_options={pytorch_config_path}"
+                ]
+                # If guided_decoding_backend is set, we need to initialize tokenizer
+                if config.get('guided_decoding_backend') is not None:
+                    benchmark_cmd += ["--no_skip_tokenizer_init"]
         elif self._config.backend == "_autodeploy":
             autodeploy_config_path = os.path.join(engine_dir,
                                                   "extra_llm_api_options.yaml")
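Taken together, a TRT run carrying the extra flag now gets an extra-llm-api-config.yml just like the pytorch path. A rough, self-contained sketch of that flow; the engine_dir path and the trtllm-bench command are placeholders, and the config shown is the xgrammar override from the pattern entry added above:

# Rough sketch, not the test harness itself: dump a config like the xgrammar
# override above and extend the benchmark command the way the new branch does.
import os
import yaml

engine_dir = "/tmp/engine"  # placeholder path for illustration
config = {
    'extended_runtime_perf_knob_config': {
        'cuda_graph_cache_size': 1.0,
        'cuda_graph_mode': True,
    },
    'guided_decoding_backend': 'xgrammar',
}

benchmark_cmd = ["trtllm-bench"]  # truncated; the real command carries many more flags
config_path = os.path.join(engine_dir, "extra-llm-api-config.yml")
os.makedirs(os.path.dirname(config_path), exist_ok=True)
if config:
    with open(config_path, 'w') as f:
        yaml.dump(config, f, default_flow_style=False)
    benchmark_cmd += [f"--extra_llm_api_options={config_path}"]
    # Guided decoding needs the tokenizer, so keep tokenizer init enabled.
    if config.get('guided_decoding_backend') is not None:
        benchmark_cmd += ["--no_skip_tokenizer_init"]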
@@ -392,6 +392,8 @@ llm_perf_nim:
 #trt backend
 - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-float8-input_output_len:128,128-gpus:8]
 - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-float8-input_output_len:512,32-gpus:8]
+- perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-float8-maxbs:512-maxnt:2048-input_output_len:500,2000-reqs:400-con:200-gpus:8]
+- perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-float8-maxbs:512-maxnt:2048-input_output_len:500,2000-reqs:400-con:200-gpus:8-extra]
 - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-streaming-float8-maxbs:16-input_output_len:512,32-gpus:8]
 - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct-bench-bfloat16-maxbs:16-maxnt:5000-input_output_len:5000,500-quant:fp8-reqs:8-con:1-gpus:8]
 - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct-bench-bfloat16-maxbs:16-maxnt:5000-input_output_len:5000,500-quant:fp8-reqs:64-con:250-gpus:8]
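The two new list entries differ only by the trailing "-extra". A quick illustration of how that suffix separates the default TRT run from the one that reads the generated yaml config:

# Illustrative only: how the "-extra" suffix distinguishes the two new TRT entries.
new_entries = [
    "llama_v3.3_70b_instruct_fp8-bench-float8-maxbs:512-maxnt:2048-input_output_len:500,2000-reqs:400-con:200-gpus:8",
    "llama_v3.3_70b_instruct_fp8-bench-float8-maxbs:512-maxnt:2048-input_output_len:500,2000-reqs:400-con:200-gpus:8-extra",
]
for name in new_entries:
    uses_extra_config = name.endswith("-extra")
    print("extra-llm-api-config.yml" if uses_extra_config else "no extra yaml config")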