[ROCm][CI] Remove benchmarks test group and shard long test groups (#41669)

Signed-off-by: Andreas Karatzas <akaratza@amd.com>
2026-06-06 00:16:14 +00:00 · 2026-05-23 10:31:46 -05:00
parent 5bb8d2767a
commit 2a7d5b7324
1 changed file with 47 additions and 35 deletions
@@ -139,19 +139,6 @@ steps:
  - pytest models/multimodal -v -s -m 'distributed(num_gpus=2)' --ignore models/multimodal/generation/test_whisper.py
  - VLLM_WORKER_MULTIPROC_METHOD=spawn pytest models/multimodal/generation/test_whisper.py -v -s -m 'distributed(num_gpus=2)'

-#--------------------------------------------------------  mi250 · benchmarks  ---------------------------------------------------------#
-
- label: Benchmarks # TBD
-  timeout_in_minutes: 180
-  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250]
-  agent_pool: mi250_1
-  working_dir: "/vllm-workspace/.buildkite"
-  source_file_dependencies:
-  - benchmarks/
-  - vllm/platforms/rocm.py
-  commands:
-  - bash scripts/run-benchmarks.sh
-
 #----------------------------------------------------------  mi250 · compile  ----------------------------------------------------------#

 - label: PyTorch Compilation Unit Tests # TBD
@@ -485,7 +472,7 @@ steps:
  - pytest -v -s model_executor -m '(not slow_test)'
  - pytest -v -s entrypoints/openai/completion/test_tensorizer_entrypoint.py

-#----------------------------------------------------------  mi250 · models  -----------------------------------------------------------#
+#------------------------------------------------------  mi250 · models / basic  -------------------------------------------------------#

 - label: Basic Models Test (Other CPU) # TBD
  timeout_in_minutes: 180
@@ -546,6 +533,8 @@ steps:
  commands:
  - pytest -v -s models/test_terratorch.py models/test_transformers.py models/test_registry.py

+#-----------------------------------------------------  mi250 · models / language  -----------------------------------------------------#
+
 - label: Language Models Test (MTEB) # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250]
@@ -590,6 +579,8 @@ steps:
  - pip freeze | grep -E 'torch'
  - pytest -v -s models/language -m 'core_model and slow_test' --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB

+#----------------------------------------------------  mi250 · models / multimodal  ----------------------------------------------------#
+
 - label: Multi-Modal Models (Extended Generation 2) # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250]
@@ -976,18 +967,6 @@ steps:

 #--------------------------------------------------------  mi300 · benchmarks  ---------------------------------------------------------#

- label: Benchmarks # TBD
-  timeout_in_minutes: 180
-  mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300]
-  agent_pool: mi300_1
-  optional: true
-  working_dir: "/vllm-workspace/.buildkite"
-  source_file_dependencies:
-  - benchmarks/
-  - vllm/platforms/rocm.py
-  commands:
-  - bash scripts/run-benchmarks.sh
-
 - label: Benchmarks CLI Test # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300]
@@ -1759,7 +1738,7 @@ steps:
  - pytest -v -s -x lora/test_gptoss_tp.py
  - pytest -v -s -x lora/test_qwen35_densemodel_lora.py

-#----------------------------------------------------------  mi300 · models  -----------------------------------------------------------#
+#-----------------------------------------------------  mi300 · models / language  -----------------------------------------------------#

 - label: Language Models Test (Extended Pooling)  # TBD
  timeout_in_minutes: 180
@@ -1787,6 +1766,8 @@ steps:
  - pip freeze | grep -E 'torch'
  - pytest -v -s models/language -m 'core_model and (not slow_test)'

+#----------------------------------------------------  mi300 · models / multimodal  ----------------------------------------------------#
+
 - label: Multi-Modal Models (Extended Generation 1) # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300]
@@ -1892,10 +1873,11 @@ steps:
  - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
  - pytest -v -s models/multimodal/processing/test_tensor_schema.py

- label: Multi-Modal Processor (CPU) # TBD
+- label: Multi-Modal Processor (CPU) %N # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300]
  agent_pool: mi300_1
+  parallelism: 4
  no_gpu: true
  optional: true
  working_dir: "/vllm-workspace/tests"
@@ -1905,7 +1887,9 @@ steps:
  - tests/models/registry.py
  commands:
  - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
-  - pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py
+  - pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB
+
+#-----------------------------------------------------  mi300 · models / quantized  -----------------------------------------------------#

 - label: Quantized Models Test # TBD
  timeout_in_minutes: 180
@@ -1921,7 +1905,31 @@ steps:
  commands:
  - pytest -v -s models/quantization

- label: Transformers Nightly Models # TBD
+#--------------------------------------------------  mi300 · models / transformers  ---------------------------------------------------#
+
+- label: Transformers Nightly Models (Shardable) %N # TBD
+  timeout_in_minutes: 180
+  mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300]
+  agent_pool: mi300_1
+  parallelism: 4  # pairs with the --num-shards/--shard-id flags on the pytest commands below
+  optional: true
+  working_dir: "/vllm-workspace/"  # workspace root (not .../tests), so the pytest paths below carry the tests/ prefix
+  source_file_dependencies:
+  - vllm/model_executor/models/
+  - vllm/model_executor/model_loader/
+  - vllm/multimodal/
+  - vllm/model_executor/layers/
+  - vllm/v1/attention/backends/
+  - vllm/v1/attention/selector.py
+  - vllm/_aiter_ops.py
+  - vllm/platforms/rocm.py
+  - tests/models/
+  commands:
+  - pip install --upgrade git+https://github.com/huggingface/transformers  # no ref pinned in the URL: installs whatever the repository's default branch currently holds
+  - pytest -v -s tests/models/test_initialization.py --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB  # shard count/index come from the BUILDKITE_PARALLEL_* variables; the doubled $ presumably defers expansion to job runtime (confirm)
+  - pytest -v -s tests/models/multimodal/processing/ --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB  # sharded independently of the command above, using the same shard count/index
+
+- label: Transformers Nightly Models (Single) # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300]
  agent_pool: mi300_1
@@ -1940,9 +1948,7 @@ steps:
  - examples/
  commands:
  - pip install --upgrade git+https://github.com/huggingface/transformers
-  - pytest -v -s tests/models/test_initialization.py
  - pytest -v -s tests/models/test_transformers.py
-  - pytest -v -s tests/models/multimodal/processing/
  - pytest -v -s tests/models/multimodal/test_mapping.py
  - python3 examples/basic/offline_inference/chat.py
  - python3 examples/generate/multimodal/vision_language_offline.py --model-type qwen2_5_vl
@@ -2391,7 +2397,7 @@ steps:
  - uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt
  - DP_EP=1 ROCM_ATTN=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh

- label: Hyrbid SSM NixlConnector PD accuracy tests (4 GPUs) # TBD
+- label: Hybrid SSM NixlConnector PD accuracy tests (4 GPUs) # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300]
  agent_pool: mi300_4
@@ -2593,7 +2599,7 @@ steps:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-rocm.txt --tp-size=8

-#----------------------------------------------------------  mi325 · models  -----------------------------------------------------------#
+#-----------------------------------------------------  mi325 · models / language  -----------------------------------------------------#

 - label: Language Models Test (Extended Generation) # TBD
  timeout_in_minutes: 180
@@ -2624,6 +2630,8 @@ steps:
  - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.6.0'
  - pytest -v -s models/language/generation -m hybrid_model --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB

+#----------------------------------------------------  mi325 · models / multimodal  ----------------------------------------------------#
+
 - label: Multi-Modal Models (Extended Pooling) # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325]
@@ -3043,7 +3051,7 @@ steps:
  commands:
    - pytest -v -s kernels/moe/test_deepep_moe.py

-#----------------------------------------------------------  mi355 · models  -----------------------------------------------------------#
+#-----------------------------------------------------  mi355 · models / language  -----------------------------------------------------#

 - label: Language Models Test (Extended Generation) # TBD
  timeout_in_minutes: 180
@@ -3111,6 +3119,8 @@ steps:
  - pip freeze | grep -E 'torch'
  - pytest -v -s models/language -m 'core_model and (not slow_test)'

+#----------------------------------------------------  mi355 · models / multimodal  ----------------------------------------------------#
+
 - label: Multi-Modal Models (Extended Generation 1) # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355]
@@ -3182,6 +3192,8 @@ steps:
  - pytest -v -s models/multimodal/generation/test_memory_leak.py -m core_model
  - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model

+#-----------------------------------------------------  mi355 · models / quantized  -----------------------------------------------------#
+
 - label: Quantized Models Test # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355]