[ROCm][CI] Cleaning and restructuring amd-ci legacy pipeline (#34839)

Signed-off-by: Andreas Karatzas <akaratza@amd.com>
2026-06-06 00:16:14 +00:00 · 2026-03-19 14:30:58 -05:00
parent 9279c59a0e
commit 040a505ff5
15 changed files with 2486 additions and 3554 deletions
@@ -0,0 +1 @@
+Qwen3-235B-A22B-Instruct-2507-FP8.yaml
@@ -16,6 +16,23 @@ RAY_BASE_URL="https://raw.githubusercontent.com/ray-project/ray/master/python"
 WORK_DIR=$(mktemp -d)
 trap 'rm -rf "$WORK_DIR"' EXIT

+# ── Detect PyTorch index URL ─────────────────────────────────────────────
+
+if python3 -c "import torch; assert torch.version.hip" 2>/dev/null; then  # true only if torch imports and torch.version.hip is truthy
+    ROCM_VER=$(python3 -c "import torch; print(torch.version.hip.rsplit('.', 1)[0])")  # HIP version minus its last dot-separated component
+    CANDIDATE_URL="https://download.pytorch.org/whl/rocm${ROCM_VER}"
+    if curl -fsSL --head "${CANDIDATE_URL}/" >/dev/null 2>&1; then  # HEAD probe: use the index only if the request succeeds
+        TORCH_INDEX_URL="${CANDIDATE_URL}"
+    else
+        echo ">>> WARNING: ROCm ${ROCM_VER} wheel index not found at ${CANDIDATE_URL}"
+        echo ">>>          Falling back to default PyPI (resolution may be incomplete)"
+        TORCH_INDEX_URL=""  # empty value is reported below as "PyPI default"
+    fi
+else
+    TORCH_INDEX_URL="https://download.pytorch.org/whl/cu129"  # taken when torch is not importable or reports no HIP version
+fi
+echo ">>> Using PyTorch index: ${TORCH_INDEX_URL:-PyPI default}"
+
 # Fetch all Ray requirement files used in the LLM depset pipeline
 echo ">>> Fetching Ray requirement files"
 RAY_FILES=(
@@ -116,6 +133,11 @@ echo "============================================================"
 echo ">>> Resolving: Can Ray generate compatible lock files?"
 echo "============================================================"

+EXTRA_INDEX_ARGS=()  # stays empty when TORCH_INDEX_URL is empty, so no --extra-index-url is added
+if [[ -n "${TORCH_INDEX_URL}" ]]; then
+    EXTRA_INDEX_ARGS+=(--extra-index-url "${TORCH_INDEX_URL}")
+fi
+
 set +e
 uv pip compile \
    "${WORK_DIR}/requirements.txt" \
@@ -126,7 +148,7 @@ uv pip compile \
    -c "${WORK_DIR}/vllm-constraints.txt" \
    --python-version 3.12 \
    --python-platform x86_64-manylinux_2_31 \
-    --extra-index-url https://download.pytorch.org/whl/cu129 \
+    ${EXTRA_INDEX_ARGS[@]+"${EXTRA_INDEX_ARGS[@]}"} \
    --index-strategy unsafe-best-match \
    --unsafe-package setuptools \
    --unsafe-package ray \
@@ -1,11 +1,14 @@
 #!/usr/bin/env bash
 set -euxo pipefail
-
 # Nightly e2e test for prefetch offloading with a MoE model.
 # Runs DeepSeek-V2-Lite with prefetch offloading of MoE expert weights
 # and validates GSM8K accuracy matches baseline (no offloading).
 #
 # args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT]
+#
+# Environment variables:
+#   ATTENTION_BACKEND   - attention backend to use (e.g., FLASH_ATTN,
+#                         ROCM_ATTN, FLASHINFER). If unset or empty, uses vllm default.
 THRESHOLD=${1:-0.25}
 NUM_Q=${2:-1319}
 PORT=${3:-8030}
@@ -22,6 +25,14 @@ wait_for_server() {

 MODEL="deepseek-ai/DeepSeek-V2-Lite"

+# ── Build optional vllm serve flags ─────────────────────────────────────
+
+EXTRA_ARGS=()  # array so the flag and its value stay separate words when expanded
+if [[ -n "${ATTENTION_BACKEND:-}" ]]; then
+  echo "Using attention backend: ${ATTENTION_BACKEND}"
+  EXTRA_ARGS+=(--attention-backend "${ATTENTION_BACKEND}")
+fi
+
 cleanup() {
  if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then
    kill "${SERVER_PID}" 2>/dev/null || true
@@ -40,7 +51,8 @@ vllm serve "$MODEL" \
  --offload-num-in-group 2 \
  --offload-prefetch-step 1 \
  --offload-params w13_weight w2_weight \
-  --port "$PORT" &
+  --port "$PORT" \
+  ${EXTRA_ARGS+"${EXTRA_ARGS[@]}"} &
 SERVER_PID=$!
 wait_for_server "$PORT"

@@ -59,7 +59,7 @@ steps:
  - export VLLM_TEST_CLEAN_GPU_MEMORY=1
  - pytest -s -v tests/compile/passes/distributed

- label: Fusion and Compile Unit Tests (B200)
+- label: Fusion and Compile Unit Tests (2xB200)
  timeout_in_minutes: 20
  working_dir: "/vllm-workspace/"
  device: b200
@@ -1,3 +1,3 @@
 # GFX942 model configurations for GPQA evaluation
 # Tests different environment variable combinations
-gpt-oss-20b-rocm-baseline.yaml
+gpt-oss-20b-rocm-baseline.yaml
@@ -0,0 +1,12 @@
+model_name: "deepseek-ai/DeepSeek-R1"
+accuracy_threshold: 0.95
+num_questions: 1319
+num_fewshot: 5
+startup_max_wait_seconds: 1200
+server_args: >-
+  --enforce-eager
+  --max-model-len 4096
+  --data-parallel-size 8
+  --enable-expert-parallel
+  --attention-backend=TRITON_ATTN
+  --speculative-config '{"method":"mtp","num_speculative_tokens":3}'
@@ -0,0 +1,12 @@
+model_name: "deepseek-ai/DeepSeek-R1"
+accuracy_threshold: 0.95
+num_questions: 1319
+num_fewshot: 5
+startup_max_wait_seconds: 1200
+server_args: >-
+  --enforce-eager
+  --max-model-len 4096
+  --tensor-parallel-size 8
+  --enable-expert-parallel
+  --attention-backend=TRITON_ATTN
+  --speculative-config '{"method":"mtp","num_speculative_tokens":3}'
@@ -0,0 +1,12 @@
+model_name: "deepseek-ai/DeepSeek-V3.2"
+accuracy_threshold: 0.95
+num_questions: 1319
+num_fewshot: 5
+startup_max_wait_seconds: 1200
+server_args: >-
+  --enforce-eager
+  --max-model-len 4096
+  --data-parallel-size 8
+  --enable-expert-parallel
+  --attention-backend=TRITON_ATTN
+  --speculative-config '{"method":"mtp","num_speculative_tokens":3}'
@@ -0,0 +1,12 @@
+model_name: "deepseek-ai/DeepSeek-V3.2"
+accuracy_threshold: 0.95
+num_questions: 1319
+num_fewshot: 5
+startup_max_wait_seconds: 1200
+server_args: >-
+  --enforce-eager
+  --max-model-len 4096
+  --tensor-parallel-size 8
+  --enable-expert-parallel
+  --attention-backend=TRITON_ATTN
+  --speculative-config '{"method":"mtp","num_speculative_tokens":3}'
@@ -0,0 +1,4 @@
+DeepSeek-R1-TP_MI325.yaml
+DeepSeek-R1-DP_MI325.yaml
+DeepSeek-V3.2-TP_MI325.yaml
+DeepSeek-V3.2-DP_MI325.yaml
@@ -64,6 +64,16 @@ def test_gsm8k_correctness(config_filename):
            "Marlin kernels are not supported."
        )

+    # TODO(akaratza): Enable DeepSeek-V3.2 and DeepSeek-R1 on ROCm platforms
+    if current_platform.is_rocm() and (
+        "deepseek-ai/DeepSeek-V3.2" in eval_config["model_name"]
+        or "deepseek-ai/DeepSeek-R1" in eval_config["model_name"]
+    ):
+        pytest.skip(
+            "Skipping DeepSeek-V3.2 and DeepSeek-R1 on ROCm platforms "
+            "due to agent pool disk space issues and pod evictions."
+        )
+
    # Parse server arguments from config (use shlex to handle quoted strings)
    server_args_str = eval_config.get("server_args", "")
    server_args = shlex.split(server_args_str) if server_args_str else []
@@ -0,0 +1,6 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+
+def test_mi3xx_moe():  # placeholder: only prints, so it always passes
+    print("TODO: add tests for Mi3xx MoE quantization")
@@ -31,7 +31,7 @@ class TestAiterMlaFp8SupportCheck:

        # Should return False without raising
        with patch(
-            "vllm._aiter_ops.inspect.signature",
+            "inspect.signature",
            side_effect=ImportError("No module"),
        ):
            result = _check_aiter_mla_fp8_support()
@@ -46,7 +46,7 @@ class TestAiterMlaFp8SupportCheck:
        aiter_ops._AITER_MLA_SUPPORTS_FP8 = None

        with patch(
-            "vllm._aiter_ops.inspect.signature",
+            "inspect.signature",
            side_effect=ModuleNotFoundError("Module not found"),
        ):
            # Should return False without raising
@@ -63,7 +63,7 @@ class TestAiterMlaFp8SupportCheck:
        aiter_ops._AITER_MLA_SUPPORTS_FP8 = None

        with patch(
-            "vllm._aiter_ops.inspect.signature",
+            "inspect.signature",
            side_effect=AttributeError("No attribute"),
        ):
            assert _check_aiter_mla_fp8_support() is False
@@ -78,7 +78,7 @@ class TestAiterMlaFp8SupportCheck:
        aiter_ops._AITER_MLA_SUPPORTS_FP8 = None

        with patch(
-            "vllm._aiter_ops.inspect.signature",
+            "inspect.signature",
            side_effect=ValueError("No signature"),
        ):
            assert _check_aiter_mla_fp8_support() is False
@@ -93,7 +93,7 @@ class TestAiterMlaFp8SupportCheck:
        aiter_ops._AITER_MLA_SUPPORTS_FP8 = None

        with patch(
-            "vllm._aiter_ops.inspect.signature",
+            "inspect.signature",
            side_effect=TypeError("Not a callable"),
        ):
            assert _check_aiter_mla_fp8_support() is False