[Infra] - Minor clean-up and test Ubuntu mirrors (#4829)

Signed-off-by: Yanchao Lu <yanchaol@nvidia.com>
Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com>
Co-authored-by: QI JUN <22017000+QiJune@users.noreply.github.com>
Author: Yanchao Lu <yanchaol@nvidia.com>
Date: 2025-06-02 20:18:20 +08:00 (committed by GitHub)
Parent: eb2d51a429
Commit: 8166649d03
7 changed files with 55 additions and 42 deletions

View File

@@ -314,7 +314,7 @@ def main(args):
theta=pretrain_config.rotary_base,
)
-if batch_size == 0:
+if batch_size == 0 or len(batch_input_ids) == 0:
    return [], [], [], {}
input_lengths = [x.size(0) for x in batch_input_ids]
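
The new guard also catches batches where batch_size is nonzero but batch_input_ids is empty, which would otherwise blow up on the first reduction over input_lengths. A minimal runnable sketch of the failure mode — the function name and the max() consumer are hypothetical stand-ins, not the repo's actual code:

import torch

def prepare_inputs(batch_size, batch_input_ids):
    # Early-out on an empty batch; checking batch_size alone misses the
    # case where the id list itself is empty.
    if batch_size == 0 or len(batch_input_ids) == 0:
        return [], [], [], {}
    input_lengths = [x.size(0) for x in batch_input_ids]
    # Without the len() guard an empty list would reach max() here and
    # raise "ValueError: max() arg is an empty sequence".
    return batch_input_ids, input_lengths, [max(input_lengths)], {}

print(prepare_inputs(4, []))  # ([], [], [], {}) instead of a ValueError
print(prepare_inputs(1, [torch.tensor([1, 2, 3])])[1])  # [3]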

View File

@@ -1006,6 +1006,12 @@ def launchStages(pipeline, reuseBuild, testFilter, enableFailFast, globalVars)
]
}
+if (env.testPhase2StageName) {
+    parameters += [
+        'testPhase2StageName': env.testPhase2StageName,
+    ]
+}
echo "trigger SBSA test job, params: ${parameters}"
def status = triggerJob(
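
The added block forwards testPhase2StageName to the downstream job only when the environment variable is set, so the triggered job keeps its own default otherwise. The same optional-parameter pattern as a rough Python sketch (build_parameters is a hypothetical stand-in, not a Jenkins API):

import os

def build_parameters(parameters):
    # Copy so the caller's dict is not mutated.
    params = dict(parameters)
    # Mirror of `if (env.testPhase2StageName)`: forward the stage name
    # only when it is actually set.
    stage = os.environ.get("testPhase2StageName")
    if stage:
        params["testPhase2StageName"] = stage
    return params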

View File

@@ -1581,13 +1581,11 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
"H100_PCIe-TensorRT-[Post-Merge]-1": ["h100-cr", "l0_h100", 1, 2],
"H100_PCIe-TensorRT-[Post-Merge]-2": ["h100-cr", "l0_h100", 2, 2],
"B200_PCIe-Triton-Python-[Post-Merge]-1": ["b100-ts2", "l0_b200", 1, 1],
"DGX_H100-4_GPUs-TensorRT-[Post-Merge]-1": ["dgx-h100-x4", "l0_dgx_h100", 1, 1, 4],
// "A100_80GB_PCIE-TensorRT-Perf-1": ["a100-80gb-pcie", "l0_perf", 1, 1],
"H100_PCIe-TensorRT-Perf-1": ["h100-cr", "l0_perf", 1, 1],
"H100_PCIe-PyTorch-Perf-1": ["h100-cr", "l0_perf", 1, 1],
"DGX_H200-8_GPUs-PyTorch-[Post-Merge]-1": ["dgx-h200-x8", "l0_dgx_h200", 1, 1, 8],
-"DGX_H200-4_GPUs-PyTorch-[Post-Merge]-1": ["dgx-h200-x4", "l0_dgx_h200", 1, 2, 4],
-"DGX_H200-4_GPUs-PyTorch-[Post-Merge]-2": ["dgx-h200-x4", "l0_dgx_h200", 2, 2, 4],
+"DGX_H200-4_GPUs-PyTorch-[Post-Merge]-1": ["dgx-h200-x4", "l0_dgx_h200", 1, 1, 4],
"DGX_H200-4_GPUs-TensorRT-[Post-Merge]-1": ["dgx-h200-x4", "l0_dgx_h200", 1, 1, 4],
]
parallelJobs = x86TestConfigs.collectEntries{key, values -> [key, [createKubernetesPodConfig(LLM_DOCKER_IMAGE, values[0], "amd64", values[4] ?: 1, key.contains("Perf")), {
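
Each config tuple above reads as [node label, test list, shard index, shard count, GPU count], with `values[4] ?: 1` defaulting the GPU count to 1; the change collapses the two DGX H200 4-GPU PyTorch shards (1 of 2 and 2 of 2) into a single stage (1 of 1). A sketch of how such shard pairs can partition a test list — the round-robin split is an assumption for illustration, not necessarily the repo's actual scheme:

def shard(tests, index, total):
    # 1-based shard index, matching pairs like (1, 2) and (2, 2) above.
    return [t for i, t in enumerate(tests) if i % total == index - 1]

tests = ["t1", "t2", "t3", "t4", "t5"]
print(shard(tests, 1, 2))  # ['t1', 't3', 't5']
print(shard(tests, 2, 2))  # ['t2', 't4']
print(shard(tests, 1, 1))  # the whole list, i.e. the merged stage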
@@ -1636,7 +1634,7 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
fullSet += SBSASlurmTestConfigs.keySet()
if (env.targetArch == AARCH64_TRIPLE) {
-parallelJobs = SBSASlurmTestConfigs.collectEntries{key, values -> [key, [createKubernetesPodConfig(LLM_DOCKER_IMAGE, values[0], "arm64"), {
+parallelJobs = SBSATestConfigs.collectEntries{key, values -> [key, [createKubernetesPodConfig(LLM_DOCKER_IMAGE, values[0], "arm64"), {
runLLMTestlistOnPlatform(pipeline, values[0], values[1], LINUX_AARCH64_CONFIG, false, key, values[2], values[3])
}]]}
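
collectEntries maps each config entry to a [pod config, closure] pair keyed by stage name, which the pipeline later executes in parallel. A hypothetical Python analogue of that fan-out (the helper names are stand-ins for the Groovy functions):

def build_parallel_jobs(test_configs, make_pod_config, run_testlist):
    # Groovy: configs.collectEntries{ key, values -> [key, [pod, closure]] }
    return {
        key: (
            make_pod_config(values[0], "arm64"),
            # Default args freeze key/values per entry, like the Groovy
            # closure capturing its loop variables.
            lambda v=values, k=key: run_testlist(v[0], v[1], k, v[2], v[3]),
        )
        for key, values in test_configs.items()
    }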

View File

@@ -210,39 +210,3 @@ l0_dgx_h100:
  - examples/test_multimodal.py::test_llm_multimodal_general[Llama-3.2-11B-Vision-pp:1-tp:2-bfloat16-bs:1-cpp_e2e:False-nb:1]
  - examples/test_multimodal.py::test_llm_multimodal_general[llava-v1.6-mistral-7b-hf-vision-trtllm-pp:1-tp:2-float16-bs:1-cpp_e2e:False-nb:1]
  - deterministic/test_mixtral_deterministic.py::test_llm_mixtral_4gpus_deterministic[Mixtral-8x7B-Instruct-v0.1-float16]
-- condition:
-    ranges:
-      system_gpu_count:
-        gte: 4
-        lte: 4
-    wildcards:
-      gpu:
-      - '*h100*'
-      linux_distribution_name: ubuntu*
-  terms:
-    stage: post_merge
-    backend: tensorrt
-    auto_trigger: others
-  tests:
-  # ------------- TRT tests ---------------
-  - examples/test_bert.py::test_llm_bert_general[compare_hf-enable_remove_input_padding-use_attention_plugin-enable_context_fmha-tp:2-pp:1-float16-BertForSequenceClassification-bert/bert-base-uncased-yelp-polarity]
-  - examples/test_bert.py::test_llm_bert_general[compare_hf-enable_remove_input_padding-use_attention_plugin-enable_context_fmha-tp:2-pp:1-float16-RobertaForQuestionAnswering-bert/roberta-base-squad2]
-  - examples/test_bert.py::test_llm_bert_general[compare_hf-enable_remove_input_padding-disable_attention_plugin-disable_context_fmha-tp:2-pp:1-float16-RobertaForSequenceClassification-bert/twitter-roberta-base-emotion]
-  - examples/test_recurrentgemma.py::test_llm_recurrentgemma_2gpu[recurrentgemma-2b]
-  - examples/test_mamba.py::test_llm_mamba2_2gpu[mamba-codestral-7B-v0.1]
-  - accuracy/test_cli_flow.py::TestLlama2_7B::test_smooth_quant_ootb_tp2
-  - accuracy/test_cli_flow.py::TestLlama2_7B::test_int4_awq_tp2
-  - accuracy/test_cli_flow.py::TestLlama2_7B::test_int4_awq_prequantized_tp2
-  - accuracy/test_cli_flow.py::TestLlama2_7B::test_int4_gptq_prequantized_tp2
-  - accuracy/test_cli_flow.py::TestLlama3_1_8B::test_tp4[disable_gemm_allreduce_plugin]
-  - accuracy/test_cli_flow.py::TestLlama3_1_8B::test_tp4[enable_gemm_allreduce_plugin]
-  - accuracy/test_cli_flow.py::TestLlama3_1_8B::test_fp8_rowwise_tp4[disable_gemm_allreduce_plugin]
-  - accuracy/test_cli_flow.py::TestLlama3_2_1B::test_fp8_tp2[disable_reduce_fusion-enable_fp8_context_fmha]
-  - accuracy/test_cli_flow.py::TestLlama3_2_1B::test_fp8_tp2[enable_reduce_fusion-disable_fp8_context_fmha]
-  - accuracy/test_cli_flow.py::TestPhi2::test_tp2
-  - llmapi/test_llm_e2e.py::test_llmapi_quant_llama_70b
-  - llmapi/test_llm_examples.py::test_llmapi_example_distributed_autopp_tp2
-  - examples/test_enc_dec.py::test_llm_enc_dec_general[compare_hf-t5-small-float16-enable_gemm_plugin-enable_attention_plugin-disable_paged_kv_cache-tp:2-pp:2-nb:1-disable_fp8]
-  - examples/test_gpt.py::test_llm_gpt2_next_prompt_tuning[use_py_session-tp2]
-  - unittest/llmapi/apps/_test_openai_multi_gpu.py -m "part0"
-  - test_e2e.py::test_trtllm_bench_mgmn
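
The removed block is a test-db condition: a machine qualifies when it satisfies every range and wildcard, and a run qualifies when it matches every term (here: exactly 4 H100 GPUs on Ubuntu, post-merge, TensorRT backend). The same test set reappears under l0_dgx_h200 in a later hunk. A hypothetical matcher, assuming the semantics the field names suggest:

from fnmatch import fnmatch

def condition_matches(cond, machine, run):
    # ranges: numeric bounds, e.g. system_gpu_count gte/lte.
    for key, bounds in cond.get("ranges", {}).items():
        value = machine[key]
        if value < bounds.get("gte", value) or value > bounds.get("lte", value):
            return False
    # wildcards: glob patterns; a list means "any pattern may match".
    for key, pattern in cond.get("wildcards", {}).items():
        patterns = pattern if isinstance(pattern, list) else [pattern]
        if not any(fnmatch(str(machine[key]), p) for p in patterns):
            return False
    # terms: exact attributes of the run (stage, backend, ...).
    return all(run.get(k) == v for k, v in cond.get("terms", {}).items())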

View File

@@ -9,6 +9,7 @@ l0_dgx_h200:
      gpu:
      - '*h200*'
      linux_distribution_name: ubuntu*
+      cpu: x86_64
  terms:
    stage: post_merge
    backend: pytorch
@@ -31,6 +32,7 @@ l0_dgx_h200:
      gpu:
      - '*h200*'
      linux_distribution_name: ubuntu*
+      cpu: x86_64
  terms:
    stage: post_merge
    backend: pytorch
@@ -89,3 +91,39 @@ l0_dgx_h200:
  - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_4gpus[tp2pp2-fp8kv=True-attn_backend=FLASHINFER-torch_compile=False]
  - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_4gpus[tp2pp2-fp8kv=True-attn_backend=FLASHINFER-torch_compile=True]
  - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_4gpus[pp4-fp8kv=True-attn_backend=FLASHINFER-torch_compile=False]
+- condition:
+    ranges:
+      system_gpu_count:
+        gte: 4
+        lte: 4
+    wildcards:
+      gpu:
+      - '*h200*'
+      linux_distribution_name: ubuntu*
+      cpu: x86_64
+  terms:
+    stage: post_merge
+    backend: tensorrt
+  tests:
+  # ------------- TRT tests ---------------
+  - examples/test_bert.py::test_llm_bert_general[compare_hf-enable_remove_input_padding-use_attention_plugin-enable_context_fmha-tp:2-pp:1-float16-BertForSequenceClassification-bert/bert-base-uncased-yelp-polarity]
+  - examples/test_bert.py::test_llm_bert_general[compare_hf-enable_remove_input_padding-use_attention_plugin-enable_context_fmha-tp:2-pp:1-float16-RobertaForQuestionAnswering-bert/roberta-base-squad2]
+  - examples/test_bert.py::test_llm_bert_general[compare_hf-enable_remove_input_padding-disable_attention_plugin-disable_context_fmha-tp:2-pp:1-float16-RobertaForSequenceClassification-bert/twitter-roberta-base-emotion]
+  - examples/test_recurrentgemma.py::test_llm_recurrentgemma_2gpu[recurrentgemma-2b]
+  - examples/test_mamba.py::test_llm_mamba2_2gpu[mamba-codestral-7B-v0.1]
+  - accuracy/test_cli_flow.py::TestLlama2_7B::test_smooth_quant_ootb_tp2
+  - accuracy/test_cli_flow.py::TestLlama2_7B::test_int4_awq_tp2
+  - accuracy/test_cli_flow.py::TestLlama2_7B::test_int4_awq_prequantized_tp2
+  - accuracy/test_cli_flow.py::TestLlama2_7B::test_int4_gptq_prequantized_tp2
+  - accuracy/test_cli_flow.py::TestLlama3_1_8B::test_tp4[disable_gemm_allreduce_plugin]
+  - accuracy/test_cli_flow.py::TestLlama3_1_8B::test_tp4[enable_gemm_allreduce_plugin]
+  - accuracy/test_cli_flow.py::TestLlama3_1_8B::test_fp8_rowwise_tp4[disable_gemm_allreduce_plugin]
+  - accuracy/test_cli_flow.py::TestLlama3_2_1B::test_fp8_tp2[disable_reduce_fusion-enable_fp8_context_fmha]
+  - accuracy/test_cli_flow.py::TestLlama3_2_1B::test_fp8_tp2[enable_reduce_fusion-disable_fp8_context_fmha]
+  - accuracy/test_cli_flow.py::TestPhi2::test_tp2
+  - llmapi/test_llm_e2e.py::test_llmapi_quant_llama_70b
+  - llmapi/test_llm_examples.py::test_llmapi_example_distributed_autopp_tp2
+  - examples/test_enc_dec.py::test_llm_enc_dec_general[compare_hf-t5-small-float16-enable_gemm_plugin-enable_attention_plugin-disable_paged_kv_cache-tp:2-pp:2-nb:1-disable_fp8]
+  - examples/test_gpt.py::test_llm_gpt2_next_prompt_tuning[use_py_session-tp2]
+  - unittest/llmapi/apps/_test_openai_multi_gpu.py -m "part0"
+  - test_e2e.py::test_trtllm_bench_mgmn
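
This added condition reinstates, on 4x H200 x86_64 machines, the TensorRT post-merge set removed from l0_dgx_h100 above. Feeding it to the hypothetical matcher sketched earlier, with made-up machine values:

cond = {
    "ranges": {"system_gpu_count": {"gte": 4, "lte": 4}},
    "wildcards": {"gpu": ["*h200*"], "linux_distribution_name": "ubuntu*",
                  "cpu": "x86_64"},
    "terms": {"stage": "post_merge", "backend": "tensorrt"},
}
machine = {"system_gpu_count": 4, "gpu": "h200-sxm",
           "linux_distribution_name": "ubuntu22.04", "cpu": "x86_64"}
print(condition_matches(cond, machine,
                        {"stage": "post_merge", "backend": "tensorrt"}))  # True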

View File

@@ -9,6 +9,7 @@ l0_gh200:
      gpu:
      - '*h200*'
      linux_distribution_name: ubuntu*
+      cpu: aarch64
  terms:
    stage: pre_merge
    backend: tensorrt
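
The cpu wildcard added here (aarch64) and in the l0_dgx_h200 hunks above (x86_64) is what keeps the two lists apart: a DGX H200 and a GH200 both match '*h200*' on the GPU name alone. Continuing the matcher sketch with made-up machine descriptors:

cond = {"wildcards": {"gpu": ["*h200*"], "linux_distribution_name": "ubuntu*",
                      "cpu": "aarch64"},
        "terms": {"stage": "pre_merge", "backend": "tensorrt"}}
run = {"stage": "pre_merge", "backend": "tensorrt"}
dgx_h200 = {"gpu": "h200-sxm", "cpu": "x86_64",
            "linux_distribution_name": "ubuntu22.04"}
gh200 = {"gpu": "gh200", "cpu": "aarch64",
         "linux_distribution_name": "ubuntu22.04"}
print(condition_matches(cond, dgx_h200, run))  # False: cpu filters it out
print(condition_matches(cond, gh200, run))     # True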

View File

@@ -461,3 +461,9 @@ test_e2e.py::test_trtllm_bench_llmapi_launch[trt_backend-llama-v3-llama3-8b] SKI
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_auto_dtype[False] SKIP (https://nvbugs/5303573)
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_auto_dtype[True] SKIP (https://nvbugs/5303573)
test_e2e.py::test_openai_multi_chat_example SKIP (https://nvbugs/5236980)
+test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-70B-FP8-llama-3.1-model/Llama-3.1-70B-Instruct-FP8] SKIP (https://nvbugs/5318059)
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[mtp_nextn=0-fp8kv=False-attention_dp=True-cuda_graph=False-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/5318087)
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/5318087)
+unittest/_torch/auto_deploy/integration/test_ad_build.py SKIP (https://nvbugs/5318103)
+accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_bfloat16_4gpus[tp2pp2-attn_backend=TRTLLM-torch_compile=False] SKIP (https://nvbugs/5318143)
+accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_bfloat16_4gpus[tp2pp2-attn_backend=TRTLLM-torch_compile=True] SKIP (https://nvbugs/5318143)
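
Waive entries follow a one-line format: test ID, the keyword SKIP, and a tracking bug in parentheses. A small parser for that format, inferred from the lines above (not the repo's actual tooling):

import re

WAIVE_LINE = re.compile(r"^(?P<test>\S.*?)\s+SKIP\s+\((?P<reason>[^)]*)\)\s*$")

def parse_waives(text):
    # Map each waived test ID to its tracking URL.
    waives = {}
    for line in text.splitlines():
        m = WAIVE_LINE.match(line.strip())
        if m:
            waives[m.group("test")] = m.group("reason")
    return waives

line = 'test_e2e.py::test_openai_multi_chat_example SKIP (https://nvbugs/5236980)'
print(parse_waives(line))  # {'test_e2e.py::...': 'https://nvbugs/5236980'}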