test: Deprecate gpt_model_type "v1" static batching from triton_backend L0_backend_trtllm (#5229)

Signed-off-by: Yingge He <yinggeh@nvidia.com>
Author: Yingge He
Date: 2025-06-16 23:47:03 -07:00 (committed by GitHub)
Parent: e05b3ff427
Commit: 109f28ed3f
2 changed files with 16 additions and 68 deletions


@@ -82,7 +82,7 @@ class CustomMetricsTest(unittest.TestCase):
         return json.loads(json_string)
 
-    def _parse_triton_metrics(self, filename, is_v1):
+    def _parse_triton_metrics(self, filename):
         curl_counts = {}
         with open(filename) as metrics_file:
             for line in metrics_file:
@@ -91,12 +91,11 @@ class CustomMetricsTest(unittest.TestCase):
                 metric_output = re.sub(r"^.*?{", "{", line).split()
                 metric_key = metric_output[0]
                 metric_value = metric_output[1]
-                key = self._convert_metric_key_to_stats_key(
-                    metric_key, is_v1)
+                key = self._convert_metric_key_to_stats_key(metric_key)
                 curl_counts[key] = metric_value
         return curl_counts
 
-    def _convert_metric_key_to_stats_key(self, metric_output, is_v1):
+    def _convert_metric_key_to_stats_key(self, metric_output):
         # Converts:
         # '{model="tensorrt_llm",request_type="context",version="1"}'
         # to:
@@ -107,15 +106,12 @@ class CustomMetricsTest(unittest.TestCase):
             if not i.startswith('model') and not i.startswith('version')
         ][0]
         self.assertIn(key, metric_to_stat_dict)
-        if (is_v1):
-            self.assertNotIn("inflight_batcher_specific_metric", key)
-        else:
-            self.assertNotIn("v1_specific_metric", key)
+        self.assertNotIn("v1_specific_metric", key)
         return metric_to_stat_dict[key]
 
-    def _base_test(self, stats_file, metrics_file, is_v1):
+    def _base_test(self, stats_file, metrics_file):
         stats = self._parse_log_file(stats_file)
-        metrics = self._parse_triton_metrics(metrics_file, is_v1)
+        metrics = self._parse_triton_metrics(metrics_file)
         self.assertEqual(len(stats.keys()), len(metrics.keys()))
         self.assertEqual(list(stats.keys()).sort(), list(metrics.keys()).sort())
         for metric_key in stats.keys():
@@ -140,45 +136,33 @@ class CustomMetricsTest(unittest.TestCase):
             timedelta(seconds=-1) <= difference, difference
             <= timedelta(seconds=1))
 
-    def test_1_gpu_v1(self):
-        self._base_test("1gpu_v1_no_streaming_server.log",
-                        "1gpu_v1_no_stream_metrics.out", True)
-
     def test_1_gpu_IFB_no_stream(self):
         self._base_test("1gpu_IFB_no_streaming_server.log",
-                        "1gpu_IFB_no_stream_metrics.out", False)
+                        "1gpu_IFB_no_stream_metrics.out")
 
     def test_1_gpu_IFB_stream(self):
         self._base_test("1gpu_IFB_streaming_server.log",
-                        "1gpu_IFB_stream_metrics.out", False)
+                        "1gpu_IFB_stream_metrics.out")
 
     if AVAILABLE_GPUS >= 2:
 
-        def test_2_gpu_v1(self):
-            self._base_test("2gpu_v1_no_streaming_server.log",
-                            "2gpu_v1_no_stream_metrics.out", True)
-
         def test_2_gpu_IFB_no_stream(self):
             self._base_test("2gpu_IFB_no_streaming_server.log",
-                            "2gpu_IFB_no_stream_metrics.out", False)
+                            "2gpu_IFB_no_stream_metrics.out")
 
         def test_2_gpu_IFB_stream(self):
             self._base_test("2gpu_IFB_streaming_server.log",
-                            "2gpu_IFB_stream_metrics.out", False)
+                            "2gpu_IFB_stream_metrics.out")
 
     if AVAILABLE_GPUS >= 4:
 
-        def test_4_gpu_v1(self):
-            self._base_test("4gpu_v1_no_streaming_server.log",
-                            "4gpu_v1_no_stream_metrics.out", True)
-
         def test_4_gpu_IFB_no_stream(self):
             self._base_test("4gpu_IFB_no_streaming_server.log",
-                            "4gpu_IFB_no_stream_metrics.out", False)
+                            "4gpu_IFB_no_stream_metrics.out")
 
         def test_4_gpu_IFB_stream(self):
             self._base_test("4gpu_IFB_streaming_server.log",
-                            "4gpu_IFB_stream_metrics.out", False)
+                            "4gpu_IFB_stream_metrics.out")
 
 
 if __name__ == "__main__":
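
For reference, the conversion that _convert_metric_key_to_stats_key performs (per the "Converts: ... to: ..." comment in the hunk above) can be sketched as follows. This is a minimal, standalone illustration only: the label-splitting step and the metric_to_stat_dict entry shown here are assumptions made for the example, not the test's actual table.

# Hypothetical placeholder mapping; the real metric_to_stat_dict lives in the
# test module and maps Prometheus label strings to statistics-log keys.
metric_to_stat_dict = {
    'request_type="context"': "context_requests",
}


def convert_metric_key_to_stats_key(metric_output):
    # Strip the braces, split the label set, and drop the model/version
    # labels, keeping the single remaining label as the lookup key.
    key = [
        label
        for label in metric_output.strip("{}").split(",")
        if not label.startswith("model") and not label.startswith("version")
    ][0]
    return metric_to_stat_dict[key]


# Example: the label set from the comment above resolves to the assumed key.
print(convert_metric_key_to_stats_key(
    '{model="tensorrt_llm",request_type="context",version="1"}'))
# -> context_requests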


@@ -228,50 +228,14 @@ for NUM_GPU in "${NUM_GPUS_TO_TEST[@]}"; do
     run_server "${SERVER_ARGS}"
     wait_for_server_ready ${SERVER_TIMEOUT} ${SERVER_PID[@]}
     if [ "$WAIT_RET" != "0" ]; then
         # Cleanup
         kill $SERVER_PID > /dev/null 2>&1 || true
         echo -e "\n***\n*** Failed to start $SERVER\n***"
+        # Expect invalid GPT model type error to be gracefully handled
+        if [ `grep -c "Static batching type is deprecated" $SERVER_LOG` == "0" ]; then
+            echo -e "\n***\n*** GPT model type error not handled gracefully: line ${LINENO}\n***"
         cat $SERVER_LOG
         exit 1
+        fi
     fi
 
-    set -e
-    python3 ${TOOLS_DIR}/inflight_batcher_llm/benchmark_core_model.py \
-        --max-input-len=500 \
-        dataset --dataset=${DATASET} \
-        --tokenizer-dir=${TOKENIZER_DIR}
-    if [ $? -ne 0 ]; then
-        cat $SERVER_LOG
-        echo -e "\n***\n*** Error executing v1 benchmark_core_model test with ${NUM_GPU}GPU(s): line ${LINENO}\n***"
-        kill_server
-        wait_for_server_terminated ${SERVER_TIMEOUT} ${SERVER_PID[@]}
-        RET=1
-    fi
-    set +e
-
-    set -e
-    python3 ${TOOLS_DIR}/inflight_batcher_llm/end_to_end_test.py \
-        --max-input-len=500 \
-        --dataset=${DATASET}
-    if [ $? -ne 0 ]; then
-        cat $SERVER_LOG
-        echo -e "\n***\n*** Error executing v1 end-to-end test with ${NUM_GPU}GPU(s): line ${LINENO}\n***"
-        kill_server
-        wait_for_server_terminated ${SERVER_TIMEOUT} ${SERVER_PID[@]}
-        RET=1
-    fi
-    set +e
-
-    # Make sure the metrics is retrieved after the server has updated the metrics internally
-    sleep ${SLEEP_DURATION}
-    curl localhost:8002/metrics -o ${NUM_GPU}gpu_v1_no_stream_metrics.out
-
-    kill_server
-    wait_for_server_terminated ${SERVER_TIMEOUT} ${SERVER_PID[@]}
-
     # inflight batching ON
     # streaming OFF
     SERVER_LOG="./${NUM_GPU}gpu_IFB_no_streaming_server.log"