Mirror of https://github.com/NVIDIA/TensorRT-LLM.git (synced 2026-01-13 22:18:36 +08:00)
test: Deprecate gpt_model_type "v1" static batching from triton_backend L0_backend_trtllm (#5229)
Signed-off-by: Yingge He <yinggeh@nvidia.com>
This commit is contained in:
parent e05b3ff427
commit 109f28ed3f
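
Here, "v1" is the static-batching mode that the Triton tensorrt_llm backend selects through its gpt_model_type parameter, while the "IFB" tests kept below exercise in-flight batching. A hedged illustration only; the exact accepted value strings are an assumption based on the backend's documented config.pbtxt values, not something this diff shows:

# Illustration only -- not part of this commit; value strings are assumptions.
DEPRECATED_GPT_MODEL_TYPE = "V1"  # static batching; requesting it should now fail at startup
INFLIGHT_GPT_MODEL_TYPES = ("inflight_batching", "inflight_fused_batching")  # the "IFB" cases kept below
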
@@ -82,7 +82,7 @@ class CustomMetricsTest(unittest.TestCase):
         return json.loads(json_string)
 
-    def _parse_triton_metrics(self, filename, is_v1):
+    def _parse_triton_metrics(self, filename):
         curl_counts = {}
         with open(filename) as metrics_file:
             for line in metrics_file:
@@ -91,12 +91,11 @@ class CustomMetricsTest(unittest.TestCase):
                 metric_output = re.sub(r"^.*?{", "{", line).split()
                 metric_key = metric_output[0]
                 metric_value = metric_output[1]
-                key = self._convert_metric_key_to_stats_key(
-                    metric_key, is_v1)
+                key = self._convert_metric_key_to_stats_key(metric_key)
                 curl_counts[key] = metric_value
         return curl_counts
 
-    def _convert_metric_key_to_stats_key(self, metric_output, is_v1):
+    def _convert_metric_key_to_stats_key(self, metric_output):
         # Converts:
         # '{model="tensorrt_llm",request_type="context",version="1"}'
         # to:
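
As a reading aid, a minimal standalone sketch of what the parsing loop above does to one Prometheus-style metrics line; the metric name in the sample line is invented, while the label block is the one quoted in the test's own comment:

import re

# Hypothetical metrics line (metric name invented for illustration):
line = 'nv_metric_example{model="tensorrt_llm",request_type="context",version="1"} 42'

# Same transformation as the loop above: drop everything before the first "{",
# then split into the label block and the numeric value.
metric_output = re.sub(r"^.*?{", "{", line).split()
metric_key = metric_output[0]    # '{model="tensorrt_llm",request_type="context",version="1"}'
metric_value = metric_output[1]  # '42'
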
@@ -107,15 +106,12 @@ class CustomMetricsTest(unittest.TestCase):
             if not i.startswith('model') and not i.startswith('version')
         ][0]
         self.assertIn(key, metric_to_stat_dict)
-        if (is_v1):
-            self.assertNotIn("inflight_batcher_specific_metric", key)
-        else:
-            self.assertNotIn("v1_specific_metric", key)
+        self.assertNotIn("v1_specific_metric", key)
         return metric_to_stat_dict[key]
 
-    def _base_test(self, stats_file, metrics_file, is_v1):
+    def _base_test(self, stats_file, metrics_file):
         stats = self._parse_log_file(stats_file)
-        metrics = self._parse_triton_metrics(metrics_file, is_v1)
+        metrics = self._parse_triton_metrics(metrics_file)
         self.assertEqual(len(stats.keys()), len(metrics.keys()))
         self.assertEqual(list(stats.keys()).sort(), list(metrics.keys()).sort())
         for metric_key in stats.keys():
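
The conversion described by the comment in the hunk above (label block in, stats key out) can be sketched on its own as follows. The metric_to_stat_dict entries here are invented stand-ins; only the filtering logic mirrors the test:

# Standalone sketch of the post-change behaviour; mapping values are invented.
metric_to_stat_dict = {
    'request_type="context"': "num_context_requests",
    'request_type="scheduled"': "num_scheduled_requests",
}

def convert_metric_key_to_stats_key(metric_output):
    # metric_output looks like '{model="tensorrt_llm",request_type="context",version="1"}'
    labels = metric_output.strip("{}").split(",")
    key = [
        i for i in labels
        if not i.startswith('model') and not i.startswith('version')
    ][0]
    assert "v1_specific_metric" not in key  # mirrors the single remaining assertNotIn
    return metric_to_stat_dict[key]

# -> 'num_context_requests'
print(convert_metric_key_to_stats_key(
    '{model="tensorrt_llm",request_type="context",version="1"}'))
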
@@ -140,45 +136,33 @@ class CustomMetricsTest(unittest.TestCase):
             timedelta(seconds=-1) <= difference, difference
             <= timedelta(seconds=1))
 
-    def test_1_gpu_v1(self):
-        self._base_test("1gpu_v1_no_streaming_server.log",
-                        "1gpu_v1_no_stream_metrics.out", True)
-
     def test_1_gpu_IFB_no_stream(self):
         self._base_test("1gpu_IFB_no_streaming_server.log",
-                        "1gpu_IFB_no_stream_metrics.out", False)
+                        "1gpu_IFB_no_stream_metrics.out")
 
     def test_1_gpu_IFB_stream(self):
         self._base_test("1gpu_IFB_streaming_server.log",
-                        "1gpu_IFB_stream_metrics.out", False)
+                        "1gpu_IFB_stream_metrics.out")
 
     if AVAILABLE_GPUS >= 2:
 
-        def test_2_gpu_v1(self):
-            self._base_test("2gpu_v1_no_streaming_server.log",
-                            "2gpu_v1_no_stream_metrics.out", True)
-
         def test_2_gpu_IFB_no_stream(self):
             self._base_test("2gpu_IFB_no_streaming_server.log",
-                            "2gpu_IFB_no_stream_metrics.out", False)
+                            "2gpu_IFB_no_stream_metrics.out")
 
         def test_2_gpu_IFB_stream(self):
             self._base_test("2gpu_IFB_streaming_server.log",
-                            "2gpu_IFB_stream_metrics.out", False)
+                            "2gpu_IFB_stream_metrics.out")
 
     if AVAILABLE_GPUS >= 4:
 
-        def test_4_gpu_v1(self):
-            self._base_test("4gpu_v1_no_streaming_server.log",
-                            "4gpu_v1_no_stream_metrics.out", True)
-
         def test_4_gpu_IFB_no_stream(self):
             self._base_test("4gpu_IFB_no_streaming_server.log",
-                            "4gpu_IFB_no_stream_metrics.out", False)
+                            "4gpu_IFB_no_stream_metrics.out")
 
         def test_4_gpu_IFB_stream(self):
             self._base_test("4gpu_IFB_streaming_server.log",
-                            "4gpu_IFB_stream_metrics.out", False)
+                            "4gpu_IFB_stream_metrics.out")
 
 
 if __name__ == "__main__":
@@ -228,50 +228,14 @@ for NUM_GPU in "${NUM_GPUS_TO_TEST[@]}"; do
    run_server "${SERVER_ARGS}"
    wait_for_server_ready ${SERVER_TIMEOUT} ${SERVER_PID[@]}
    if [ "$WAIT_RET" != "0" ]; then
        # Cleanup
        kill $SERVER_PID > /dev/null 2>&1 || true
        echo -e "\n***\n*** Failed to start $SERVER\n***"

        # Expect invalid GPT model type error to be gracefully handled
        if [ `grep -c "Static batching type is deprecated" $SERVER_LOG` == "0" ]; then
            echo -e "\n***\n*** GPT model type error not handled gracefully: line ${LINENO}\n***"
            cat $SERVER_LOG
            exit 1
        fi

    set -e
    python3 ${TOOLS_DIR}/inflight_batcher_llm/benchmark_core_model.py \
        --max-input-len=500 \
        dataset --dataset=${DATASET} \
        --tokenizer-dir=${TOKENIZER_DIR}

    if [ $? -ne 0 ]; then
        cat $SERVER_LOG
        echo -e "\n***\n*** Error executing v1 benchmark_core_model test with ${NUM_GPU}GPU(s): line ${LINENO}\n***"
        kill_server
        wait_for_server_terminated ${SERVER_TIMEOUT} ${SERVER_PID[@]}
        RET=1
    fi
    set +e

    set -e
    python3 ${TOOLS_DIR}/inflight_batcher_llm/end_to_end_test.py \
        --max-input-len=500 \
        --dataset=${DATASET}

    if [ $? -ne 0 ]; then
        cat $SERVER_LOG
        echo -e "\n***\n*** Error executing v1 end-to-end test with ${NUM_GPU}GPU(s): line ${LINENO}\n***"
        kill_server
        wait_for_server_terminated ${SERVER_TIMEOUT} ${SERVER_PID[@]}
        RET=1
    fi
    set +e

    # Make sure the metrics is retrieved after the server has updated the metrics internally
    sleep ${SLEEP_DURATION}
    curl localhost:8002/metrics -o ${NUM_GPU}gpu_v1_no_stream_metrics.out

    kill_server
    wait_for_server_terminated ${SERVER_TIMEOUT} ${SERVER_PID[@]}

    # inflight batching ON
    # streaming OFF
    SERVER_LOG="./${NUM_GPU}gpu_IFB_no_streaming_server.log"
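
For readers following the log-checking step, the grep-based expectation added in this hunk (the server refuses to start with the v1 model type and the log says why) could be expressed in Python roughly as below; the log filename is a placeholder taken from the file names used elsewhere in these tests:

import sys

SERVER_LOG = "1gpu_v1_no_streaming_server.log"  # placeholder; the script builds this per GPU count

# Same expectation as the shell check above: the server is expected to refuse
# to start, and its log must contain the deprecation message.
with open(SERVER_LOG) as f:
    log = f.read()

if "Static batching type is deprecated" not in log:
    print("*** GPT model type error not handled gracefully ***")
    sys.exit(1)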