[Chore] Replace MODEL_CACHE_DIR with LLM_MODELS_ROOT and unwaive triton_server/test_triton.py::test_gpt_ib[gpt-ib] (#5859)

Signed-off-by: Simeng Liu <simengl@nvidia.com>
Simeng Liu 2025-07-21 15:46:37 -07:00 committed by GitHub
parent 9645814bdf
commit 4a0951f85c
3 changed files with 45 additions and 33 deletions

View File

@@ -64,9 +64,9 @@ def model_path(test_name):
         "llava": "llava-1.5-7b-hf",
         "llava_fp8": "llava-1.5-7b-hf"
     }
-    model_cache_dir = os.environ.get("MODEL_CACHE_DIR",
-                                     "/scratch.trt_llm_data/llm-models")
-    return os.path.join(model_cache_dir, model_mapping.get(test_name, ""))
+    model_cache_root = os.environ.get("LLM_MODELS_ROOT",
+                                      "/scratch.trt_llm_data/llm-models")
+    return os.path.join(model_cache_root, model_mapping.get(test_name, ""))


 @pytest.fixture
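
The rename is behavior-preserving: only the environment variable consulted changes, and the scratch-path fallback stays the same. A minimal standalone sketch of the new lookup, assuming a toy mapping (resolve_model_path and the sample entry are illustrative, not part of the diff):

    import os

    SAMPLE_MAPPING = {"gpt": "gpt2"}  # illustrative entry, not the real table

    def resolve_model_path(test_name, mapping=SAMPLE_MAPPING):
        # LLM_MODELS_ROOT now takes precedence; the shared scratch
        # location remains the default when the variable is unset.
        root = os.environ.get("LLM_MODELS_ROOT",
                              "/scratch.trt_llm_data/llm-models")
        return os.path.join(root, mapping.get(test_name, ""))

    os.environ["LLM_MODELS_ROOT"] = "/data/llm-models"
    assert resolve_model_path("gpt") == "/data/llm-models/gpt2"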

View File

@@ -382,7 +382,7 @@ examples/test_multimodal.py::test_llm_multimodal_general[Llama-3.2-11B-Vision-pp
 triton_server/test_triton.py::test_mllama[mllama] SKIP (https://nvbugs/5333818)
 examples/test_multimodal.py::test_llm_multimodal_general[Llama-3.2-11B-Vision-pp:1-tp:2-bfloat16-bs:1-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5333818)
 accuracy/test_cli_flow.py::TestGpt2::test_weight_streaming_ootb SKIP (https://nvbugs/5338552)
-triton_server/test_triton.py::test_gpt_ib[gpt-ib] SKIP (https://nvbugs/5348963)
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-mtp_nextn=0-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5345215)
 unittest/llmapi/test_llm_multi_gpu.py -m "gpu4 and part0" SKIP (https://nvbugs/5348958)
 accuracy/test_llm_api.py::TestLlama3_1_8BInstruct::test_guided_decoding_4gpus[xgrammar] SKIP (https://nvbugs/5346443)
 examples/test_multimodal.py::test_llm_multimodal_general[kosmos-2-pp:1-tp:1-float16-bs:1-cpp_e2e:True-nb:1] SKIP (https://nvbugs/5354936)

View File

@@ -838,28 +838,37 @@ if __name__ == "__main__":
         with open(FLAGS.output_tokens_csv) as csv_file:
             csv_reader = csv.reader(csv_file, delimiter=",")
             for row in csv_reader:
-                expected_output_ids = [int(val) for val in row]
+                expected_output_ids = [[int(val) for val in row]]
                 break
     else:
-        expected_output_ids = ([] if FLAGS.exclude_input_in_output else
-                               input_ids[0]) + [
-                                   21221,
-                                   290,
-                                   373,
-                                   257,
-                                   2888,
-                                   286,
-                                   262,
-                                   4141,
-                                   2351,
-                                   10006,
-                                   13,
-                                   679,
-                                   373,
-                                   7018,
-                                   284,
-                                   262,
-                               ]
+        # expected_output_ids holds a list of lists; each list is one version
+        # of the expected output ids, since the output can vary across GPUs.
+        expected_output_ids = []
+        expected_output_ids.append(
+            ([] if FLAGS.exclude_input_in_output else input_ids[0]) + [
+                21221,
+                290,
+                373,
+                257,
+                2888,
+                286,
+                262,
+                4141,
+                2351,
+                10006,
+                13,
+                679,
+                373,
+                7018,
+                284,
+                262,
+            ])
+        # Add a second expected output id list for testing on A100 GPUs.
+        expected_output_ids.append(
+            ([] if FLAGS.exclude_input_in_output else input_ids[0]) + [
+                21221, 290, 257, 4255, 379, 262, 1957, 7072, 11, 4689, 347,
+                2852, 2564, 494, 13, 679
+            ])

     if FLAGS.num_return_sequences is None:
         num_generations = FLAGS.beam_width
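
Since expected_output_ids is now a list of accepted token-id sequences, an exact-match check reduces to a membership test, as the next hunk shows. A toy sketch of the semantics (token values borrowed from the two variants above, truncated for brevity):

    # Two accepted references, e.g. one per GPU family.
    expected_output_ids = [
        [21221, 290, 373, 257],   # prefix of the default variant
        [21221, 290, 257, 4255],  # prefix of the A100 variant
    ]

    actual = [21221, 290, 257, 4255]
    # Exact match passes if the output equals any accepted variant.
    assert actual in expected_output_ids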
@@ -1186,16 +1195,19 @@ if __name__ == "__main__":
         if FLAGS.check_output and seq_idx == 0:
             passed = False
             if FLAGS.correctness_threshold == 1.0:
-                passed = (output_ids_w_prompt == expected_output_ids)
+                passed = (output_ids_w_prompt in expected_output_ids)
             else:
                 # Compare the output tokens one by one
-                num_same_output_id = 0
-                expected_len = len(expected_output_ids)
-                for i in range(min(len(output_ids_w_prompt), expected_len)):
-                    if output_ids_w_prompt[i] == expected_output_ids[i]:
-                        num_same_output_id += 1
+                num_same_output_id = [0] * len(expected_output_ids)
+                for i, expect_output in enumerate(expected_output_ids):
+                    for output, expected in zip(output_ids_w_prompt,
+                                                expect_output):
+                        if output == expected:
+                            num_same_output_id[i] += 1
                 # Calculate the match rate
-                match_rate = num_same_output_id / expected_len
+                match_rate = max(num_same_output_id) / len(
+                    output_ids_w_prompt)
                 print(f"Output token matching rate: {match_rate}")
                 passed = (match_rate > FLAGS.correctness_threshold)
                 print("expected_output_ids = ", expected_output_ids)
@@ -1208,10 +1220,10 @@ if __name__ == "__main__":
         if FLAGS.check_output and non_deterministic_sampling and seq_idx > 0:
             # Skip the correctness check under non-deterministic sampling.
             # Generated sequences should not be identical.
-            passed = output_ids_w_prompt[seq_idx] != expected_output_ids
+            passed = output_ids_w_prompt[seq_idx] not in expected_output_ids
             if not passed:
                 print(f"Output tokens of sequence {seq_idx} is identical "
-                      f"to the first sequence.")
+                      f"to the expected sequence.")

         if FLAGS.return_log_probs:
             print('cum_log_probs:', expand_and_vstack(cum_log_probs))
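
For completeness, the non-deterministic branch above inverts the membership test: sequences after the first pass only if they do not reproduce any reference. A toy illustration (hypothetical token ids):

    expected_output_ids = [[1, 2, 3], [1, 2, 4]]

    # seq_idx > 0 under non-deterministic sampling: the sampled
    # continuation must diverge from every reference sequence.
    assert [1, 2, 5] not in expected_output_ids        # diverged: passes
    assert not ([1, 2, 3] not in expected_output_ids)  # identical: fails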