Mirror of https://github.com/NVIDIA/TensorRT-LLM.git (synced 2026-01-13 22:18:36 +08:00)
test: Deprecate gpt_model_type "v1" static batching from triton_backend L0_backend_trtllm (#5229)
Signed-off-by: Yingge He <yinggeh@nvidia.com>
This commit is contained in:
parent e05b3ff427
commit 109f28ed3f
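
Here, "v1" is the static-batching mode that the Triton tensorrt_llm backend selects through its gpt_model_type parameter, while the "IFB" tests kept below exercise in-flight batching. A hedged illustration only; the exact accepted value strings are an assumption based on the backend's documented config.pbtxt values, not something this diff shows:

# Illustration only -- not part of this commit; value strings are assumptions.
DEPRECATED_GPT_MODEL_TYPE = "V1"  # static batching; requesting it should now fail at startup
INFLIGHT_GPT_MODEL_TYPES = ("inflight_batching", "inflight_fused_batching")  # the "IFB" cases kept below
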
@@ -82,7 +82,7 @@ class CustomMetricsTest(unittest.TestCase):
         return json.loads(json_string)
 
-    def _parse_triton_metrics(self, filename, is_v1):
+    def _parse_triton_metrics(self, filename):
         curl_counts = {}
         with open(filename) as metrics_file:
             for line in metrics_file:
@@ -91,12 +91,11 @@ class CustomMetricsTest(unittest.TestCase):
                 metric_output = re.sub(r"^.*?{", "{", line).split()
                 metric_key = metric_output[0]
                 metric_value = metric_output[1]
-                key = self._convert_metric_key_to_stats_key(
-                    metric_key, is_v1)
+                key = self._convert_metric_key_to_stats_key(metric_key)
                 curl_counts[key] = metric_value
         return curl_counts
 
-    def _convert_metric_key_to_stats_key(self, metric_output, is_v1):
+    def _convert_metric_key_to_stats_key(self, metric_output):
         # Converts:
         # '{model="tensorrt_llm",request_type="context",version="1"}'
         # to:
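
As a reading aid, a minimal standalone sketch of what the parsing loop above does to one Prometheus-style metrics line; the metric name in the sample line is invented, while the label block is the one quoted in the test's own comment:

import re

# Hypothetical metrics line (metric name invented for illustration):
line = 'nv_metric_example{model="tensorrt_llm",request_type="context",version="1"} 42'

# Same transformation as the loop above: drop everything before the first "{",
# then split into the label block and the numeric value.
metric_output = re.sub(r"^.*?{", "{", line).split()
metric_key = metric_output[0]    # '{model="tensorrt_llm",request_type="context",version="1"}'
metric_value = metric_output[1]  # '42'
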
@@ -107,15 +106,12 @@ class CustomMetricsTest(unittest.TestCase):
             if not i.startswith('model') and not i.startswith('version')
         ][0]
         self.assertIn(key, metric_to_stat_dict)
-        if (is_v1):
-            self.assertNotIn("inflight_batcher_specific_metric", key)
-        else:
-            self.assertNotIn("v1_specific_metric", key)
+        self.assertNotIn("v1_specific_metric", key)
         return metric_to_stat_dict[key]
 
-    def _base_test(self, stats_file, metrics_file, is_v1):
+    def _base_test(self, stats_file, metrics_file):
         stats = self._parse_log_file(stats_file)
-        metrics = self._parse_triton_metrics(metrics_file, is_v1)
+        metrics = self._parse_triton_metrics(metrics_file)
         self.assertEqual(len(stats.keys()), len(metrics.keys()))
         self.assertEqual(list(stats.keys()).sort(), list(metrics.keys()).sort())
         for metric_key in stats.keys():
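
The conversion described by the comment in the hunk above (label block in, stats key out) can be sketched on its own as follows. The metric_to_stat_dict entries here are invented stand-ins; only the filtering logic mirrors the test:

# Standalone sketch of the post-change behaviour; mapping values are invented.
metric_to_stat_dict = {
    'request_type="context"': "num_context_requests",
    'request_type="scheduled"': "num_scheduled_requests",
}

def convert_metric_key_to_stats_key(metric_output):
    # metric_output looks like '{model="tensorrt_llm",request_type="context",version="1"}'
    labels = metric_output.strip("{}").split(",")
    key = [
        i for i in labels
        if not i.startswith('model') and not i.startswith('version')
    ][0]
    assert "v1_specific_metric" not in key  # mirrors the single remaining assertNotIn
    return metric_to_stat_dict[key]

# -> 'num_context_requests'
print(convert_metric_key_to_stats_key(
    '{model="tensorrt_llm",request_type="context",version="1"}'))
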
@@ -140,45 +136,33 @@ class CustomMetricsTest(unittest.TestCase):
             timedelta(seconds=-1) <= difference, difference
             <= timedelta(seconds=1))
 
-    def test_1_gpu_v1(self):
-        self._base_test("1gpu_v1_no_streaming_server.log",
-                        "1gpu_v1_no_stream_metrics.out", True)
-
     def test_1_gpu_IFB_no_stream(self):
         self._base_test("1gpu_IFB_no_streaming_server.log",
-                        "1gpu_IFB_no_stream_metrics.out", False)
+                        "1gpu_IFB_no_stream_metrics.out")
 
     def test_1_gpu_IFB_stream(self):
         self._base_test("1gpu_IFB_streaming_server.log",
-                        "1gpu_IFB_stream_metrics.out", False)
+                        "1gpu_IFB_stream_metrics.out")
 
     if AVAILABLE_GPUS >= 2:
 
-        def test_2_gpu_v1(self):
-            self._base_test("2gpu_v1_no_streaming_server.log",
-                            "2gpu_v1_no_stream_metrics.out", True)
-
         def test_2_gpu_IFB_no_stream(self):
             self._base_test("2gpu_IFB_no_streaming_server.log",
-                            "2gpu_IFB_no_stream_metrics.out", False)
+                            "2gpu_IFB_no_stream_metrics.out")
 
         def test_2_gpu_IFB_stream(self):
             self._base_test("2gpu_IFB_streaming_server.log",
-                            "2gpu_IFB_stream_metrics.out", False)
+                            "2gpu_IFB_stream_metrics.out")
 
     if AVAILABLE_GPUS >= 4:
 
-        def test_4_gpu_v1(self):
-            self._base_test("4gpu_v1_no_streaming_server.log",
-                            "4gpu_v1_no_stream_metrics.out", True)
-
         def test_4_gpu_IFB_no_stream(self):
             self._base_test("4gpu_IFB_no_streaming_server.log",
-                            "4gpu_IFB_no_stream_metrics.out", False)
+                            "4gpu_IFB_no_stream_metrics.out")
 
         def test_4_gpu_IFB_stream(self):
             self._base_test("4gpu_IFB_streaming_server.log",
-                            "4gpu_IFB_stream_metrics.out", False)
+                            "4gpu_IFB_stream_metrics.out")
 
 
 if __name__ == "__main__":
@@ -228,50 +228,14 @@ for NUM_GPU in "${NUM_GPUS_TO_TEST[@]}"; do
    run_server "${SERVER_ARGS}"
    wait_for_server_ready ${SERVER_TIMEOUT} ${SERVER_PID[@]}
    if [ "$WAIT_RET" != "0" ]; then
        # Cleanup
        kill $SERVER_PID > /dev/null 2>&1 || true
        echo -e "\n***\n*** Failed to start $SERVER\n***"

        # Expect invalid GPT model type error to be gracefully handled
        if [ `grep -c "Static batching type is deprecated" $SERVER_LOG` == "0" ]; then
            echo -e "\n***\n*** GPT model type error not handled gracefully: line ${LINENO}\n***"
            cat $SERVER_LOG
            exit 1
        fi

    set -e
    python3 ${TOOLS_DIR}/inflight_batcher_llm/benchmark_core_model.py \
        --max-input-len=500 \
        dataset --dataset=${DATASET} \
        --tokenizer-dir=${TOKENIZER_DIR}

    if [ $? -ne 0 ]; then
        cat $SERVER_LOG
        echo -e "\n***\n*** Error executing v1 benchmark_core_model test with ${NUM_GPU}GPU(s): line ${LINENO}\n***"
        kill_server
        wait_for_server_terminated ${SERVER_TIMEOUT} ${SERVER_PID[@]}
        RET=1
    fi
    set +e

    set -e
    python3 ${TOOLS_DIR}/inflight_batcher_llm/end_to_end_test.py \
        --max-input-len=500 \
        --dataset=${DATASET}

    if [ $? -ne 0 ]; then
        cat $SERVER_LOG
        echo -e "\n***\n*** Error executing v1 end-to-end test with ${NUM_GPU}GPU(s): line ${LINENO}\n***"
        kill_server
        wait_for_server_terminated ${SERVER_TIMEOUT} ${SERVER_PID[@]}
        RET=1
    fi
    set +e

    # Make sure the metrics is retrieved after the server has updated the metrics internally
    sleep ${SLEEP_DURATION}
    curl localhost:8002/metrics -o ${NUM_GPU}gpu_v1_no_stream_metrics.out

    kill_server
    wait_for_server_terminated ${SERVER_TIMEOUT} ${SERVER_PID[@]}

    # inflight batching ON
    # streaming OFF
    SERVER_LOG="./${NUM_GPU}gpu_IFB_no_streaming_server.log"
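
For readers following the log-checking step, the grep-based expectation added in this hunk (the server refuses to start with the v1 model type and the log says why) could be expressed in Python roughly as below; the log filename is a placeholder taken from the file names used elsewhere in these tests:

import sys

SERVER_LOG = "1gpu_v1_no_streaming_server.log"  # placeholder; the script builds this per GPU count

# Same expectation as the shell check above: the server is expected to refuse
# to start, and its log must contain the deprecation message.
with open(SERVER_LOG) as f:
    log = f.read()

if "Static batching type is deprecated" not in log:
    print("*** GPT model type error not handled gracefully ***")
    sys.exit(1)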