[TRTLLM-8991][test] Add Llama 3.3 70B model with different performance config (#8753)

Signed-off-by: yufeiwu-nv <230315618+yufeiwu-nv@users.noreply.github.com>
Co-authored-by: Larry Xu <197874197+LarryXFly@users.noreply.github.com>
yufeiwu-nv 2025-11-03 13:34:06 +08:00 committed by GitHub
parent f57dc01e6f
commit b4d17d1a4c
4 changed files with 58 additions and 16 deletions


@@ -105,7 +105,7 @@ def get_llm(runtime_config: RuntimeConfig, kwargs: dict):
     """
     llm_cls = LLM
-    if runtime_config.backend != "tensorrt":
+    if runtime_config.backend != None:
         ignore_trt_only_args(kwargs, runtime_config.backend)
     if runtime_config.backend == 'pytorch':
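For reference, the gate above changes when TRT-only kwargs get stripped: previously they were dropped for every backend other than "tensorrt" (including the default backend of None); now they are dropped only when a backend is explicitly set. A minimal sketch of the old vs. new condition, where should_drop_trt_only is a hypothetical stand-in for the call site of ignore_trt_only_args:

def should_drop_trt_only(backend, new_behavior=True):
    """Hypothetical helper mirroring the gate around ignore_trt_only_args."""
    if new_behavior:
        # New check: strip TRT-only kwargs whenever a backend is explicitly set.
        return backend != None  # noqa: E711 -- mirrors the committed comparison
    # Old check: strip them for anything other than "tensorrt", which also
    # stripped them when backend was left as None.
    return backend != "tensorrt"

assert should_drop_trt_only(None, new_behavior=False) is True   # old: stripped
assert should_drop_trt_only(None, new_behavior=True) is False   # new: kept
assert should_drop_trt_only("pytorch") is True                   # unchanged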


@@ -14,7 +14,7 @@
 # limitations under the License.
 # -*- coding: utf-8 -*-
 """
-Model pytorch yaml config for trtllm-bench perf tests
+Model pytorch/TRT yaml config for trtllm-bench perf tests
 """
@@ -36,12 +36,18 @@ def get_model_yaml_config(model_label: str,
     Returns:
         dict: yaml config
     """
-    base_config = {
-        'print_iter_log': True,
-        'cuda_graph_config': {
-            'enable_padding': True,
-        },
-    }
+    if 'pytorch' in model_label:
+        # Pytorch backend config
+        base_config = {
+            'print_iter_log': True,
+            'cuda_graph_config': {
+                'enable_padding': True,
+            },
+        }
+    else:
+        # TRT backend config
+        base_config = {}
     if 'kv_cache_dtype' in model_label:
         base_config.update({
             'kv_cache_dtype':
@@ -241,6 +247,19 @@ def get_model_yaml_config(model_label: str,
             'config': {
                 'enable_chunked_prefill': True,
             }
         },
+        # Llama-v3.3 models with xgrammar guided decoding
+        {
+            'patterns': [
+                "llama_v3.3_70b_instruct_fp8-bench-float8-maxbs:512-maxnt:2048-input_output_len:500,2000-reqs:400-con:200-gpus:8-extra"
+            ],
+            'config': {
+                'extended_runtime_perf_knob_config': {
+                    'cuda_graph_cache_size': 1.0,
+                    'cuda_graph_mode': True,
+                },
+                'guided_decoding_backend': 'xgrammar'
+            }
+        }
     ]
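For the new "-extra" TRT test label there is no 'pytorch' or 'kv_cache_dtype' component, so base_config starts out empty and the merged result is just this pattern's config. A rough sketch of the YAML that would then be written, assuming no other pattern matches:

import yaml

config = {}
config.update({
    'extended_runtime_perf_knob_config': {
        'cuda_graph_cache_size': 1.0,
        'cuda_graph_mode': True,
    },
    'guided_decoding_backend': 'xgrammar',
})
print(yaml.dump(config, default_flow_style=False))
# extended_runtime_perf_knob_config:
#   cuda_graph_cache_size: 1.0
#   cuda_graph_mode: true
# guided_decoding_backend: xgrammar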
@@ -251,7 +270,8 @@ def get_model_yaml_config(model_label: str,
             patterns = [patterns]
         for pattern in patterns:
             if pattern in model_label.lower():
-                recursive_update(base_config, pattern_config['config'])
+                if pattern_config.get('config'):
+                    recursive_update(base_config, pattern_config['config'])
                 break  # Stop checking other patterns for this config once we find a match
     # lora-specific change for pytorch
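recursive_update, used in the loop above, is defined elsewhere in this module; a minimal sketch of the nested-merge semantics the loop relies on (not the actual implementation):

def recursive_update(base: dict, update: dict) -> dict:
    # Merge `update` into `base`, descending into nested dicts instead of
    # overwriting them wholesale.
    for key, value in update.items():
        if isinstance(value, dict) and isinstance(base.get(key), dict):
            recursive_update(base[key], value)
        else:
            base[key] = value
    return base

base = {'cuda_graph_config': {'enable_padding': True}}
recursive_update(base, {'cuda_graph_config': {'max_batch_size': 512}})
assert base == {'cuda_graph_config': {'enable_padding': True,
                                      'max_batch_size': 512}}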


@@ -57,7 +57,6 @@ MODEL_PATH_DICT = {
     "modelopt-hf-model-hub/Llama-3.3-70B-Instruct-fp8",
     "llama_v3.3_70b_instruct_fp4":
     "modelopt-hf-model-hub/Llama-3.3-70B-Instruct-fp4",
-    "llama_v3.3_70b_instruct": "llama-3.3-models/Llama-3.3-70B-Instruct",
     "llama_v3.1_405b_instruct_fp8":
     "llama-3.1-model/Llama-3.1-405B-Instruct-FP8",
     "llama_v3.1_405b_instruct_fp4":
@@ -783,6 +782,8 @@ class PerfTestConfig:
                  tp_size: int = 1,
                  pp_size: int = 1,
                  num_gpus: int = 1,
+                 # only for torch-backend currently
+                 extra: bool = False,
                  # _autodeploy backend specific parameters
                  ad_compile_backend: str = "torch-opt",
                  free_mem_ratio: float = 0.9,
@@ -841,6 +842,8 @@ class PerfTestConfig:
         self.pp_size = pp_size
         # Number of GPUs.
         self.num_gpus = num_gpus
+        # Extra flag to enable pytorch_model_config reading for TRT backend
+        self.extra = extra
         # _autodeploy backend specific parameters
         self.ad_compile_backend = ad_compile_backend
         self.free_mem_ratio = free_mem_ratio
@@ -1016,6 +1019,10 @@ class PerfTestConfig:
         if self.num_gpus > 1:
             entries.append(f"gpus:{self.num_gpus}")
+        # Add extra flag for llm-api-config.yml.
+        if self.extra:
+            entries.append("extra")
         # Concatenate labels with "-".
         return "-".join(entries)
@@ -1180,6 +1187,11 @@ class PerfTestConfig:
         self.num_gpus = 1 if not labels[0].startswith("gpus:") else int(
             labels.pop(0).replace("gpus:", ""))
+        if len(labels) > 0:
+            self.extra = True if labels[0] == "extra" else False
+            if self.extra:
+                labels.pop(0)
         assert len(
             labels
         ) == 0, f"Invalid test name! Some labels cannot be parsed: {labels}"
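The "extra" token is appended last by to_string and consumed last when the test name is parsed back, so it simply rides at the end of labels such as ...-gpus:8-extra. A simplified round-trip sketch, with the other label components elided:

def append_extra(entries, extra):
    if extra:
        entries.append("extra")
    return "-".join(entries)

def parse_extra(labels):
    # Mirror of the committed parsing: consume a trailing "extra" token if present.
    extra = len(labels) > 0 and labels[0] == "extra"
    if extra:
        labels.pop(0)
    return extra, labels

name = append_extra(["gpus:8"], extra=True)   # "gpus:8-extra"
labels = name.split("-")[1:]                  # labels left after "gpus:8"
extra, rest = parse_extra(labels)
assert extra is True and rest == []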
@@ -1644,18 +1656,26 @@ class MultiMetricPerfTest(AbstractPerfScriptTestClass):
             benchmark_cmd += [f"--pp={self._config.pp_size}"]
         if self._config.streaming == "streaming":
             benchmark_cmd += [f"--streaming"]
-        #use default yaml config
-        if self._config.backend == "pytorch":
+        #Add extra-llm-api-config.yml for pytorch backend and tensorrt backend with extra flag
+        if self._config.backend == "pytorch" or (self._config.backend == ""
+                                                 and self._config.extra):
             pytorch_config_path = os.path.join(engine_dir,
                                                "extra-llm-api-config.yml")
             if not os.path.exists(pytorch_config_path):
                 os.makedirs(os.path.dirname(pytorch_config_path), exist_ok=True)
             config = get_model_yaml_config(self._config.to_string(),
                                            lora_dirs=self.lora_dirs)
-            print_info(f"pytorch model config: {config}")
-            with open(pytorch_config_path, 'w') as f:
-                yaml.dump(config, f, default_flow_style=False)
-            benchmark_cmd += [f"--extra_llm_api_options={pytorch_config_path}"]
+            if config:
+                print_info(f"pytorch/TRT model config: {config}")
+                with open(pytorch_config_path, 'w') as f:
+                    yaml.dump(config, f, default_flow_style=False)
+                benchmark_cmd += [
+                    f"--extra_llm_api_options={pytorch_config_path}"
+                ]
+                # If guided_decoding_backend is set, we need to initialize tokenizer
+                if config.get('guided_decoding_backend') is not None:
+                    benchmark_cmd += ["--no_skip_tokenizer_init"]
         elif self._config.backend == "_autodeploy":
             autodeploy_config_path = os.path.join(engine_dir,
                                                   "extra_llm_api_options.yaml")


@@ -392,6 +392,8 @@ llm_perf_nim:
   #trt backend
   - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-float8-input_output_len:128,128-gpus:8]
   - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-float8-input_output_len:512,32-gpus:8]
+  - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-float8-maxbs:512-maxnt:2048-input_output_len:500,2000-reqs:400-con:200-gpus:8]
+  - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-float8-maxbs:512-maxnt:2048-input_output_len:500,2000-reqs:400-con:200-gpus:8-extra]
   - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-streaming-float8-maxbs:16-input_output_len:512,32-gpus:8]
   - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct-bench-bfloat16-maxbs:16-maxnt:5000-input_output_len:5000,500-quant:fp8-reqs:8-con:1-gpus:8]
   - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct-bench-bfloat16-maxbs:16-maxnt:5000-input_output_len:5000,500-quant:fp8-reqs:64-con:250-gpus:8]