diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index c7338b4828d..97fc9c2bb91 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -1299,12 +1299,11 @@ steps: source_file_dependencies: - vllm/ - tests/entrypoints/llm - - tests/entrypoints/offline_mode commands: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py - - pytest -v -s entrypoints/llm/test_generate.py - - pytest -v -s entrypoints/offline_mode + - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py --ignore=entrypoints/llm/offline_mode + - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process + - pytest -v -s entrypoints/llm/offline_mode # Needs to avoid interference with other tests - label: Entrypoints Integration (Pooling) # TBD timeout_in_minutes: 180 @@ -1346,7 +1345,7 @@ steps: - vllm/platforms/rocm.py commands: - pytest -v -s entrypoints/openai/tool_parsers - - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/offline_mode --ignore=entrypoints/openai --ignore=entrypoints/serve --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling --ignore=entrypoints/speech_to_text --ignore=tests/entrypoints/generate + - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/serve --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling --ignore=entrypoints/speech_to_text --ignore=tests/entrypoints/generate - label: OpenAI API correctness # TBD timeout_in_minutes: 180 diff --git a/.buildkite/test_areas/entrypoints.yaml b/.buildkite/test_areas/entrypoints.yaml index 548174ed748..613cb76eb4e 100644 --- a/.buildkite/test_areas/entrypoints.yaml +++ b/.buildkite/test_areas/entrypoints.yaml @@ -11,7 +11,7 @@ steps: - tests/entrypoints/ commands: - pytest -v -s entrypoints/openai/tool_parsers - - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/offline_mode --ignore=entrypoints/openai --ignore=entrypoints/serve --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling --ignore=entrypoints/speech_to_text --ignore=tests/entrypoints/generate + - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/serve --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling --ignore=entrypoints/speech_to_text --ignore=tests/entrypoints/generate - label: Entrypoints Integration (LLM) key: entrypoints-integration-llm @@ -20,12 +20,11 @@ steps: source_file_dependencies: - vllm/ - tests/entrypoints/llm - - tests/entrypoints/offline_mode commands: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py + - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py --ignore=entrypoints/llm/offline_mode - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process - - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests + - pytest -v -s entrypoints/llm/offline_mode # Needs to avoid interference with other tests mirror: amd: device: mi325_1 diff --git a/.dockerignore b/.dockerignore index fb010600db9..66447272e95 100644 --- a/.dockerignore +++ b/.dockerignore @@ -33,10 +33,3 @@ share/python-wheels/ *.egg MANIFEST rust/target/ -# Not needed in Docker builds -docs/ -.github/ -.pre-commit-config.yaml -.clang-format -.gitattributes -format.sh diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index beaaa5d8642..a8947fe2324 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -34,10 +34,11 @@ /vllm/entrypoints/speech_to_text/realtime @njhill /vllm/entrypoints/speech_to_text @NickLucche /vllm/entrypoints/pooling @noooop -/vllm/entrypoints/sagemaker @DarkLight1337 +/vllm/entrypoints/serve/sagemaker @DarkLight1337 /vllm/entrypoints/serve @njhill /vllm/entrypoints/*.py @njhill /vllm/entrypoints/chat_utils.py @DarkLight1337 +/vllm/entrypoints/offline_utils.py @DarkLight1337 /vllm/entrypoints/llm.py @DarkLight1337 # Rust Frontend diff --git a/.github/workflows/stale.yml b/.github/workflows/stale.yml index 44bf71db5e9..ba807fab7c3 100644 --- a/.github/workflows/stale.yml +++ b/.github/workflows/stale.yml @@ -15,7 +15,7 @@ jobs: actions: write runs-on: ubuntu-latest steps: - - uses: actions/stale@997185467fa4f803885201cee163a9f38240193d # v10.1.1 + - uses: actions/stale@eb5cf3af3ac0a1aa4c9c45633dd1ae542a27a899 # v10.3.0 with: # Increasing this value ensures that changes to this workflow # propagate to all issues and PRs in days rather than months diff --git a/csrc/libtorch_stable/fused_deepseek_v4_qnorm_rope_kv_insert_kernel.cu b/csrc/libtorch_stable/fused_deepseek_v4_qnorm_rope_kv_insert_kernel.cu index a5f3f03de00..4d34b4b6b50 100644 --- a/csrc/libtorch_stable/fused_deepseek_v4_qnorm_rope_kv_insert_kernel.cu +++ b/csrc/libtorch_stable/fused_deepseek_v4_qnorm_rope_kv_insert_kernel.cu @@ -102,6 +102,35 @@ constexpr float NUM_TOKEN_CUTOFF = 1024; constexpr int kNumLanes = 32; constexpr int kElemsPerLane = kHeadDim / kNumLanes; // 16 +// Pack this lane's 16 fp32 elements into per-tensor E4M3 FP8 (one uint4 = 16 +// B), scaling by `scale` (a reciprocal scale) and saturating to ±448. Used by +// the FlashInfer full-cache path for both the Q and KV stores. +__device__ __forceinline__ uint4 packFp8E4M3x16(float const* values, + float const scale) { +#ifndef USE_ROCM + uint4 out; + auto* out2 = reinterpret_cast<__nv_fp8x2_storage_t*>(&out); + #pragma unroll + for (int i = 0; i < kElemsPerLane / 2; i++) { + float2 scaled = + make_float2(values[2 * i] * scale, values[2 * i + 1] * scale); + scaled.x = fminf(fmaxf(scaled.x, -kFp8Max), kFp8Max); + scaled.y = fminf(fmaxf(scaled.y, -kFp8Max), kFp8Max); + out2[i] = __nv_cvt_float2_to_fp8x2(scaled, __NV_SATFINITE, __NV_E4M3); + } + return out; +#else + uint8_t out_bytes[kElemsPerLane]; + #pragma unroll + for (int i = 0; i < kElemsPerLane; i++) { + float scaled = values[i] * scale; + scaled = fminf(fmaxf(scaled, -kFp8Max), kFp8Max); + out_bytes[i] = rocm_cvt_float_to_fp8_e4m3(scaled); + } + return *reinterpret_cast(out_bytes); +#endif +} + // ──────────────────────────────────────────────────────────────────────────── // Small inline helpers // ──────────────────────────────────────────────────────────────────────────── @@ -649,6 +678,257 @@ void launchFusedDeepseekV4QNormRopeKVRopeQuantInsert( #undef DISPATCH } +// ──────────────────────────────────────────────────────────────────────────── +// FlashInfer full-cache kernel +// ──────────────────────────────────────────────────────────────────────────── +// +// Sibling to the FlashMLA kernel above, used by the FlashInfer V4 sparse-MLA +// backend. Differences from the legacy path: +// * No Q head padding — output Q layout matches the input num_heads_q. +// * KV is written as a *contiguous* 512-wide row per token (token-strided), +// not the legacy UE8M0 paged layout with a separate scale tail. +// * Q/KV are stored either as bf16 or as per-tensor E4M3 FP8 (one global +// scale), selected by the STORE_Q_FP8 / STORE_KV_FP8 template flags. +// +// Grid: 1D, gridDim.x = ceil(num_tokens_full * (num_heads_q + 1) / warps). +// Each warp handles one (token, slot): slot < num_heads_q → Q, slot == +// num_heads_q → KV. +template +__global__ void fusedDeepseekV4FullCacheKernel( + scalar_t_in* __restrict__ q_inout, // [N, H, 512], in place (bf16) + uint8_t* __restrict__ q_fp8_out, // [N, H, 512] fp8, optional + int64_t const q_fp8_stride0, // elements (fp8 == bytes) + int64_t const q_fp8_stride1, // elements (fp8 == bytes) + scalar_t_in const* __restrict__ kv_in, // [N, 512] bf16 + uint8_t* __restrict__ k_cache, // contiguous bf16 or fp8 cache + int64_t const* __restrict__ slot_mapping, // [num_tokens_insert] i64 + int64_t const* __restrict__ position_ids, // [N] i64 + float const* __restrict__ cos_sin_cache, // [max_pos, 64] fp32 + float const* __restrict__ fp8_scale_ptr, // scalar, KV fp8 only + float const* __restrict__ q_fp8_scale_inv, // scalar, Q fp8 only + float const eps, + int const num_tokens_full, // = q.size(0) = kv.size(0) + int const num_tokens_insert, // = slot_mapping.size(0) + int const num_heads_q, // H (no padding) + int const cache_block_size, // tokens per cache block + int64_t const kv_block_stride, // bytes per cache block + int64_t const kv_token_stride) { // bytes per cache token +#if (!defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 800) && !defined(USE_ROCM) + if constexpr (std::is_same_v) { + return; + } else { +#endif + using Converter = vllm::_typeConvert; + int const warpsPerBlock = blockDim.x / 32; + int const warpId = threadIdx.x / 32; + int const laneId = threadIdx.x % 32; + int const globalWarpIdx = blockIdx.x * warpsPerBlock + warpId; + + int const slotsPerToken = num_heads_q + 1; + int const tokenIdx = globalWarpIdx / slotsPerToken; + int const slotIdx = globalWarpIdx % slotsPerToken; + if (tokenIdx >= num_tokens_full) return; + bool const isKV = (slotIdx == num_heads_q); + // KV branch: skip DP-padded tokens (no slot reserved for them). + if (isKV && tokenIdx >= num_tokens_insert) return; + +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + cudaGridDependencySynchronize(); +#endif + + int const dim_base = laneId * kElemsPerLane; // in [0, 512) step 16 + scalar_t_in const* src_ptr; + if (isKV) { + src_ptr = kv_in + static_cast(tokenIdx) * kHeadDim + dim_base; + } else { + src_ptr = q_inout + + (static_cast(tokenIdx) * num_heads_q + slotIdx) * + kHeadDim + + dim_base; + } + uint4 const v0 = *reinterpret_cast(src_ptr); + uint4 const v1 = *reinterpret_cast(src_ptr + 8); + + // ── Decode bf16 → 16 fp32 registers ─────────────────────────────────── + float elements[kElemsPerLane]; + { + auto const* p0 = + reinterpret_cast(&v0); + auto const* p1 = + reinterpret_cast(&v1); +#pragma unroll + for (int i = 0; i < 4; i++) { + float2 f2 = Converter::convert(p0[i]); + elements[2 * i] = f2.x; + elements[2 * i + 1] = f2.y; + } +#pragma unroll + for (int i = 0; i < 4; i++) { + float2 f2 = Converter::convert(p1[i]); + elements[8 + 2 * i] = f2.x; + elements[8 + 2 * i + 1] = f2.y; + } + } + + // ── Q branch: RMSNorm (no weight) ───────────────────────────────────── + if (!isKV) { + float sumOfSquares = 0.0f; +#pragma unroll + for (int i = 0; i < kElemsPerLane; i++) { + sumOfSquares += elements[i] * elements[i]; + } + sumOfSquares = warpSum(sumOfSquares); + float const rms_rcp = + rsqrtf(sumOfSquares / static_cast(kHeadDim) + eps); +#pragma unroll + for (int i = 0; i < kElemsPerLane; i++) { + elements[i] = elements[i] * rms_rcp; + } + } + + // ── GPT-J RoPE on dims [NOPE_DIM, HEAD_DIM) ─────────────────────────── + bool const is_rope_lane = dim_base >= kNopeDim; + if (is_rope_lane) { + int64_t const pos = position_ids[tokenIdx]; + constexpr int kHalfRope = kRopeDim / 2; + float const* cos_ptr = cos_sin_cache + pos * kRopeDim; + float const* sin_ptr = cos_ptr + kHalfRope; + int const rope_local_base = dim_base - kNopeDim; + int const half_base = rope_local_base >> 1; + float4 const c0 = *reinterpret_cast(cos_ptr + half_base); + float4 const c1 = *reinterpret_cast(cos_ptr + half_base + 4); + float4 const s0 = *reinterpret_cast(sin_ptr + half_base); + float4 const s1 = *reinterpret_cast(sin_ptr + half_base + 4); + float const cos_arr[8] = {c0.x, c0.y, c0.z, c0.w, c1.x, c1.y, c1.z, c1.w}; + float const sin_arr[8] = {s0.x, s0.y, s0.z, s0.w, s1.x, s1.y, s1.z, s1.w}; +#pragma unroll + for (int p = 0; p < kElemsPerLane / 2; p++) { + float const x_even = elements[2 * p]; + float const x_odd = elements[2 * p + 1]; + elements[2 * p] = x_even * cos_arr[p] - x_odd * sin_arr[p]; + elements[2 * p + 1] = x_even * sin_arr[p] + x_odd * cos_arr[p]; + } + } + + // ── Store ───────────────────────────────────────────────────────────── + if (!isKV) { + if constexpr (STORE_Q_FP8) { + float const scale_inv = VLLM_LDG(q_fp8_scale_inv); + uint4 const out = packFp8E4M3x16(elements, scale_inv); + uint8_t* dst = q_fp8_out + + static_cast(tokenIdx) * q_fp8_stride0 + + static_cast(slotIdx) * q_fp8_stride1 + dim_base; + *reinterpret_cast(dst) = out; + } else { + uint4 out0, out1; + auto* po0 = reinterpret_cast(&out0); + auto* po1 = reinterpret_cast(&out1); +#pragma unroll + for (int i = 0; i < 4; i++) { + po0[i] = Converter::convert( + make_float2(elements[2 * i], elements[2 * i + 1])); + } +#pragma unroll + for (int i = 0; i < 4; i++) { + po1[i] = Converter::convert( + make_float2(elements[8 + 2 * i], elements[8 + 2 * i + 1])); + } + scalar_t_in* dst = + q_inout + + (static_cast(tokenIdx) * num_heads_q + slotIdx) * kHeadDim + + dim_base; + *reinterpret_cast(dst) = out0; + *reinterpret_cast(dst + 8) = out1; + } + } else { + int64_t const slot_id = slot_mapping[tokenIdx]; + if (slot_id >= 0) { + int64_t const block_idx = slot_id / cache_block_size; + int64_t const pos_in_block = slot_id % cache_block_size; + uint8_t* cache_row = + k_cache + block_idx * kv_block_stride + pos_in_block * kv_token_stride; + if constexpr (STORE_KV_FP8) { + float const inv_scale = 1.0f / VLLM_LDG(fp8_scale_ptr); + uint4 const out = packFp8E4M3x16(elements, inv_scale); + *reinterpret_cast(cache_row + dim_base) = out; + } else { + uint4 out0, out1; + auto* po0 = + reinterpret_cast(&out0); + auto* po1 = + reinterpret_cast(&out1); +#pragma unroll + for (int i = 0; i < 4; i++) { + po0[i] = Converter::convert( + make_float2(elements[2 * i], elements[2 * i + 1])); + } +#pragma unroll + for (int i = 0; i < 4; i++) { + po1[i] = Converter::convert( + make_float2(elements[8 + 2 * i], elements[8 + 2 * i + 1])); + } + scalar_t_in* dst = reinterpret_cast(cache_row) + dim_base; + *reinterpret_cast(dst) = out0; + *reinterpret_cast(dst + 8) = out1; + } + } + } + +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + cudaTriggerProgrammaticLaunchCompletion(); +#endif +#if (!defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 800) && !defined(USE_ROCM) + } +#endif +} + +// Configure + launch helper shared by the bf16 and fp8 full-cache launchers. +template +static void launchFullCacheKernel( + scalar_t_in* q_inout, uint8_t* q_fp8_out, int64_t q_fp8_stride0, + int64_t q_fp8_stride1, scalar_t_in const* kv_in, uint8_t* k_cache, + int64_t const* slot_mapping, int64_t const* position_ids, + float const* cos_sin_cache, float const* fp8_scale, + float const* q_fp8_scale_inv, float const eps, int const num_tokens_full, + int const num_tokens_insert, int const num_heads_q, + int const cache_block_size, int64_t const kv_block_stride, + int64_t const kv_token_stride, char const* op_name, cudaStream_t stream) { + constexpr int kBlockSize = 256; + constexpr int kWarpsPerBlock = kBlockSize / 32; + int64_t const total_warps = + static_cast(num_tokens_full) * (num_heads_q + 1); + int const grid = + static_cast((total_warps + kWarpsPerBlock - 1) / kWarpsPerBlock); + auto* kernel = + fusedDeepseekV4FullCacheKernel; +#ifndef USE_ROCM + static int const sm_version = getSMVersion(); + STD_TORCH_CHECK(sm_version >= 80, op_name, + " requires sm_80+ (Ampere or newer); got sm_", sm_version); + cudaLaunchConfig_t config; + config.gridDim = dim3(grid); + config.blockDim = dim3(kBlockSize); + config.dynamicSmemBytes = 0; + config.stream = stream; + cudaLaunchAttribute attrs[1]; + attrs[0].id = cudaLaunchAttributeProgrammaticStreamSerialization; + attrs[0].val.programmaticStreamSerializationAllowed = 1; + config.attrs = attrs; + config.numAttrs = (sm_version >= 90) ? 1 : 0; + cudaLaunchKernelEx(&config, kernel, q_inout, q_fp8_out, q_fp8_stride0, + q_fp8_stride1, kv_in, k_cache, slot_mapping, position_ids, + cos_sin_cache, fp8_scale, q_fp8_scale_inv, eps, + num_tokens_full, num_tokens_insert, num_heads_q, + cache_block_size, kv_block_stride, kv_token_stride); +#else + kernel<<>>( + q_inout, q_fp8_out, q_fp8_stride0, q_fp8_stride1, kv_in, k_cache, + slot_mapping, position_ids, cos_sin_cache, fp8_scale, q_fp8_scale_inv, + eps, num_tokens_full, num_tokens_insert, num_heads_q, cache_block_size, + kv_block_stride, kv_token_stride); +#endif +} + } // namespace deepseek_v4_fused_ops } // namespace vllm @@ -735,3 +1015,167 @@ torch::stable::Tensor fused_deepseek_v4_qnorm_rope_kv_rope_quant_insert( }); return q_out; } + +// ──────────────────────────────────────────────────────────────────────────── +// FlashInfer full-cache torch ops +// ──────────────────────────────────────────────────────────────────────────── +void fused_deepseek_v4_qnorm_rope_kv_rope_full_cache_bf16_insert( + torch::stable::Tensor& q, // [N, H, 512] bf16, in place + torch::stable::Tensor const& kv, // [N, 512] bf16, read-only + torch::stable::Tensor& k_cache, // [num_blocks, bs, 512] bf16 + torch::stable::Tensor const& slot_mapping, // [num_tokens_insert] int64 + torch::stable::Tensor const& position_ids, // [N] int64 + torch::stable::Tensor const& cos_sin_cache, // [max_pos, 64] float32 + double eps, int64_t cache_block_size) { + using torch::headeronly::ScalarType; + STD_TORCH_CHECK(q.device().is_cuda() && q.is_contiguous(), + "q must be contiguous CUDA"); + STD_TORCH_CHECK(kv.device().is_cuda() && kv.is_contiguous(), + "kv must be contiguous CUDA"); + STD_TORCH_CHECK(k_cache.device().is_cuda(), "k_cache must be CUDA"); + STD_TORCH_CHECK(slot_mapping.device().is_cuda() && + slot_mapping.scalar_type() == ScalarType::Long, + "slot_mapping must be int64 CUDA"); + STD_TORCH_CHECK(position_ids.device().is_cuda() && + position_ids.scalar_type() == ScalarType::Long, + "position_ids must be int64 CUDA"); + STD_TORCH_CHECK(cos_sin_cache.device().is_cuda() && + cos_sin_cache.scalar_type() == ScalarType::Float && + cos_sin_cache.dim() == 2 && cos_sin_cache.size(1) == 64, + "cos_sin_cache shape [max_pos, 64] float32"); + STD_TORCH_CHECK(q.dim() == 3 && q.size(2) == 512, "q shape [N, H, 512]"); + STD_TORCH_CHECK(kv.dim() == 2 && kv.size(1) == 512, "kv shape [N, 512]"); + STD_TORCH_CHECK(q.scalar_type() == ScalarType::BFloat16 && + kv.scalar_type() == ScalarType::BFloat16, + "q and kv must be bfloat16"); + STD_TORCH_CHECK(k_cache.dim() == 3 && k_cache.size(1) == cache_block_size && + k_cache.size(2) == 512 && k_cache.stride(2) == 1, + "k_cache shape [num_blocks, cache_block_size, 512] contiguous"); + STD_TORCH_CHECK(k_cache.scalar_type() == ScalarType::BFloat16, + "k_cache must be bfloat16"); + + int const num_tokens_full = static_cast(q.size(0)); + int const num_tokens_insert = static_cast(slot_mapping.size(0)); + STD_TORCH_CHECK(static_cast(kv.size(0)) == num_tokens_full && + static_cast(position_ids.size(0)) == num_tokens_full, + "q/kv/position_ids row counts must match"); + STD_TORCH_CHECK(num_tokens_insert <= num_tokens_full, + "slot_mapping must not exceed q row count"); + int const num_heads_q = static_cast(q.size(1)); + + const torch::stable::accelerator::DeviceGuard device_guard( + q.get_device_index()); + const cudaStream_t stream = get_current_cuda_stream(q.get_device_index()); + + // bf16 cache: 2 bytes/element -> byte strides for the uint8-addressed kernel. + int64_t const kv_block_stride = k_cache.stride(0) * 2; + int64_t const kv_token_stride = k_cache.stride(1) * 2; + + VLLM_STABLE_DISPATCH_HALF_TYPES( + q.scalar_type(), + "fused_deepseek_v4_qnorm_rope_kv_rope_full_cache_bf16_insert", [&] { + vllm::deepseek_v4_fused_ops::launchFullCacheKernel( + reinterpret_cast(q.mutable_data_ptr()), nullptr, 0, 0, + reinterpret_cast(kv.const_data_ptr()), + reinterpret_cast(k_cache.mutable_data_ptr()), + slot_mapping.const_data_ptr(), + position_ids.const_data_ptr(), + cos_sin_cache.const_data_ptr(), nullptr, nullptr, + static_cast(eps), num_tokens_full, num_tokens_insert, + num_heads_q, static_cast(cache_block_size), kv_block_stride, + kv_token_stride, + "fused_deepseek_v4_qnorm_rope_kv_rope_full_cache_bf16_insert", + stream); + }); +} + +void fused_deepseek_v4_qnorm_rope_kv_rope_full_cache_fp8_insert( + torch::stable::Tensor const& q, // [N, H, 512] bf16, read-only + torch::stable::Tensor const& kv, // [N, 512] bf16, read-only + torch::stable::Tensor& q_fp8, // [N, H, 512] fp8 e4m3 + torch::stable::Tensor& k_cache, // [num_blocks, bs, 512] fp8 + torch::stable::Tensor const& slot_mapping, // [num_tokens_insert] int64 + torch::stable::Tensor const& position_ids, // [N] int64 + torch::stable::Tensor const& cos_sin_cache, // [max_pos, 64] float32 + torch::stable::Tensor const& fp8_scale, // scalar float32 (KV scale) + torch::stable::Tensor const& q_fp8_scale_inv, // scalar float32 (1 / Q scale) + double eps, int64_t cache_block_size) { + using torch::headeronly::ScalarType; + STD_TORCH_CHECK(q.device().is_cuda() && q.is_contiguous(), + "q must be contiguous CUDA"); + STD_TORCH_CHECK(kv.device().is_cuda() && kv.is_contiguous(), + "kv must be contiguous CUDA"); + STD_TORCH_CHECK(q_fp8.device().is_cuda() && q_fp8.is_contiguous() && + q_fp8.scalar_type() == ScalarType::Float8_e4m3fn && + q_fp8.dim() == 3 && q_fp8.size(0) == q.size(0) && + q_fp8.size(1) == q.size(1) && q_fp8.size(2) == q.size(2), + "q_fp8 must be a contiguous float8_e4m3fn tensor matching q"); + STD_TORCH_CHECK(k_cache.device().is_cuda(), "k_cache must be CUDA"); + STD_TORCH_CHECK(slot_mapping.device().is_cuda() && + slot_mapping.scalar_type() == ScalarType::Long, + "slot_mapping must be int64 CUDA"); + STD_TORCH_CHECK(position_ids.device().is_cuda() && + position_ids.scalar_type() == ScalarType::Long, + "position_ids must be int64 CUDA"); + STD_TORCH_CHECK(cos_sin_cache.device().is_cuda() && + cos_sin_cache.scalar_type() == ScalarType::Float && + cos_sin_cache.dim() == 2 && cos_sin_cache.size(1) == 64, + "cos_sin_cache shape [max_pos, 64] float32"); + STD_TORCH_CHECK(fp8_scale.device().is_cuda() && + fp8_scale.scalar_type() == ScalarType::Float && + fp8_scale.size(0) == 1, + "fp8_scale must be a scalar float32 CUDA tensor"); + STD_TORCH_CHECK(q_fp8_scale_inv.device().is_cuda() && + q_fp8_scale_inv.scalar_type() == ScalarType::Float && + q_fp8_scale_inv.size(0) == 1, + "q_fp8_scale_inv must be a scalar float32 CUDA tensor"); + STD_TORCH_CHECK(q.dim() == 3 && q.size(2) == 512, "q shape [N, H, 512]"); + STD_TORCH_CHECK(kv.dim() == 2 && kv.size(1) == 512, "kv shape [N, 512]"); + STD_TORCH_CHECK(q.scalar_type() == kv.scalar_type(), + "q and kv dtype must match"); + STD_TORCH_CHECK(k_cache.dim() == 3 && k_cache.size(1) == cache_block_size && + k_cache.size(2) == 512 && k_cache.stride(2) == 1, + "k_cache shape [num_blocks, cache_block_size, 512] contiguous"); + STD_TORCH_CHECK(k_cache.scalar_type() == ScalarType::Float8_e4m3fn, + "k_cache must be float8_e4m3fn"); + + int const num_tokens_full = static_cast(q.size(0)); + int const num_tokens_insert = static_cast(slot_mapping.size(0)); + STD_TORCH_CHECK(static_cast(kv.size(0)) == num_tokens_full && + static_cast(position_ids.size(0)) == num_tokens_full, + "q/kv/position_ids row counts must match"); + STD_TORCH_CHECK(num_tokens_insert <= num_tokens_full, + "slot_mapping must not exceed q row count"); + int const num_heads_q = static_cast(q.size(1)); + + const torch::stable::accelerator::DeviceGuard device_guard( + q.get_device_index()); + const cudaStream_t stream = get_current_cuda_stream(q.get_device_index()); + + VLLM_STABLE_DISPATCH_HALF_TYPES( + q.scalar_type(), + "fused_deepseek_v4_qnorm_rope_kv_rope_full_cache_fp8_insert", [&] { + vllm::deepseek_v4_fused_ops::launchFullCacheKernel( + // q is read-only in the fp8 path (the kernel writes q_fp8); the + // launcher signature is non-const, so cast away const on the ptr. + reinterpret_cast( + const_cast(q.const_data_ptr())), + reinterpret_cast(q_fp8.mutable_data_ptr()), + q_fp8.stride(0), q_fp8.stride(1), + reinterpret_cast(kv.const_data_ptr()), + reinterpret_cast(k_cache.mutable_data_ptr()), + slot_mapping.const_data_ptr(), + position_ids.const_data_ptr(), + cos_sin_cache.const_data_ptr(), + fp8_scale.const_data_ptr(), + q_fp8_scale_inv.const_data_ptr(), static_cast(eps), + num_tokens_full, num_tokens_insert, num_heads_q, + static_cast(cache_block_size), + // fp8 cache: 1 byte/element -> stride already in bytes. + k_cache.stride(0), k_cache.stride(1), + "fused_deepseek_v4_qnorm_rope_kv_rope_full_cache_fp8_insert", + stream); + }); +} diff --git a/csrc/libtorch_stable/ops.h b/csrc/libtorch_stable/ops.h index dd27a6968d0..0a991de76ff 100644 --- a/csrc/libtorch_stable/ops.h +++ b/csrc/libtorch_stable/ops.h @@ -238,6 +238,23 @@ torch::stable::Tensor fused_deepseek_v4_qnorm_rope_kv_rope_quant_insert( torch::stable::Tensor const& cos_sin_cache, int64_t q_head_padded, double eps, int64_t cache_block_size); +void fused_deepseek_v4_qnorm_rope_kv_rope_full_cache_bf16_insert( + torch::stable::Tensor& q, torch::stable::Tensor const& kv, + torch::stable::Tensor& k_cache, torch::stable::Tensor const& slot_mapping, + torch::stable::Tensor const& position_ids, + torch::stable::Tensor const& cos_sin_cache, double eps, + int64_t cache_block_size); + +void fused_deepseek_v4_qnorm_rope_kv_rope_full_cache_fp8_insert( + torch::stable::Tensor const& q, torch::stable::Tensor const& kv, + torch::stable::Tensor& q_fp8, torch::stable::Tensor& k_cache, + torch::stable::Tensor const& slot_mapping, + torch::stable::Tensor const& position_ids, + torch::stable::Tensor const& cos_sin_cache, + torch::stable::Tensor const& fp8_scale, + torch::stable::Tensor const& q_fp8_scale_inv, double eps, + int64_t cache_block_size); + #ifndef USE_ROCM torch::stable::Tensor minimax_allreduce_rms( torch::stable::Tensor const& input, diff --git a/csrc/libtorch_stable/torch_bindings.cpp b/csrc/libtorch_stable/torch_bindings.cpp index e9a62a8666c..511a788eeae 100644 --- a/csrc/libtorch_stable/torch_bindings.cpp +++ b/csrc/libtorch_stable/torch_bindings.cpp @@ -343,6 +343,20 @@ STABLE_TORCH_LIBRARY_FRAGMENT(_C, ops) { "Tensor slot_mapping, Tensor position_ids, Tensor cos_sin_cache, " "int q_head_padded, float eps, int cache_block_size) -> Tensor"); + // FlashInfer V4 full-cache variants: write Q in place (bf16) or to a separate + // FP8 tensor, and KV into a contiguous 512-wide token-strided cache. + ops.def( + "fused_deepseek_v4_qnorm_rope_kv_rope_full_cache_bf16_insert(" + "Tensor! q, Tensor kv, Tensor! k_cache, Tensor slot_mapping, " + "Tensor position_ids, Tensor cos_sin_cache, float eps, " + "int cache_block_size) -> ()"); + ops.def( + "fused_deepseek_v4_qnorm_rope_kv_rope_full_cache_fp8_insert(" + "Tensor q, Tensor kv, Tensor! q_fp8, Tensor! k_cache, " + "Tensor slot_mapping, Tensor position_ids, Tensor cos_sin_cache, " + "Tensor fp8_scale, Tensor q_fp8_scale_inv, float eps, " + "int cache_block_size) -> ()"); + #ifndef USE_ROCM ops.def( "minimax_allreduce_rms(" @@ -591,6 +605,12 @@ STABLE_TORCH_LIBRARY_IMPL(_C, CUDA, ops) { ops.impl("fused_qk_norm_rope", TORCH_BOX(&fused_qk_norm_rope)); ops.impl("fused_deepseek_v4_qnorm_rope_kv_rope_quant_insert", TORCH_BOX(&fused_deepseek_v4_qnorm_rope_kv_rope_quant_insert)); + ops.impl( + "fused_deepseek_v4_qnorm_rope_kv_rope_full_cache_bf16_insert", + TORCH_BOX(&fused_deepseek_v4_qnorm_rope_kv_rope_full_cache_bf16_insert)); + ops.impl( + "fused_deepseek_v4_qnorm_rope_kv_rope_full_cache_fp8_insert", + TORCH_BOX(&fused_deepseek_v4_qnorm_rope_kv_rope_full_cache_fp8_insert)); #ifndef USE_ROCM ops.impl("minimax_allreduce_rms", TORCH_BOX(&minimax_allreduce_rms)); ops.impl("minimax_allreduce_rms_qk", TORCH_BOX(&minimax_allreduce_rms_qk)); diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp index c078222bca0..3351638f574 100644 --- a/csrc/torch_bindings.cpp +++ b/csrc/torch_bindings.cpp @@ -55,7 +55,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { // Horizontally-fused DeepseekV4-MLA: per-head RMSNorm + GPT-J RoPE for Q, and // GPT-J RoPE + UE8M0 FP8 quant + paged cache insert for KV, all in one - // kernel launch. Registered in _C_stable_libtorch. + // kernel launch. Registered in _C_stable_libtorch (incl. the FlashInfer V4 + // full-cache bf16/fp8 variants). // Quantization ops #ifndef USE_ROCM diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm index 1e39306e39f..545765ea856 100644 --- a/docker/Dockerfile.rocm +++ b/docker/Dockerfile.rocm @@ -98,7 +98,6 @@ RUN if [ "$USE_SCCACHE" = "1" ]; then \ ARG USE_SCCACHE ENV SCCACHE_BUCKET=${USE_SCCACHE:+${SCCACHE_BUCKET_NAME}} ENV SCCACHE_REGION=${USE_SCCACHE:+${SCCACHE_REGION_NAME}} -ENV SCCACHE_ENDPOINT=${USE_SCCACHE:+${SCCACHE_ENDPOINT}} ENV SCCACHE_S3_NO_CREDENTIALS=${USE_SCCACHE:+${SCCACHE_S3_NO_CREDENTIALS}} ENV SCCACHE_IDLE_TIMEOUT=${USE_SCCACHE:+0} diff --git a/docs/design/attention_backends.md b/docs/design/attention_backends.md index 329a4aacfb6..bd3eed7d55b 100644 --- a/docs/design/attention_backends.md +++ b/docs/design/attention_backends.md @@ -228,3 +228,17 @@ MLA decode backends are selected using the standard | `TOKENSPEED_MLA` | fp16, bf16 | `fp8`, `fp8_e4m3` | 32, 64 | Any | ❌ | ❌ | ❌ | ❌ | ❌ | Decoder | 10.x | | `TRITON_MLA` | fp16, bf16 | `auto`, `float16`, `bfloat16`, `fp8`, `fp8_e4m3` | %16 | Any | ❌ | ❌ | ❌ | ❌ | ✅ | Decoder | Any | | `XPU_MLA_SPARSE` | fp16, bf16 | `auto`, `float16`, `bfloat16` | Any | 576 | ❌ | ❌ | ✅ | ❌ | ❌ | Decoder | Any | + +### DeepSeek V4 Decode Backends + +DeepSeek V4 sparse MLA uses its own decode backends, selected via +`--attention-backend=` (e.g., `FLASHMLA_SPARSE_DSV4`, +`FLASHINFER_MLA_SPARSE_DSV4`). They share the V4 sparse-index +pipeline (compressor + SWA + indexer, 256-token blocks, head 512); +default on NVIDIA is `FLASHMLA_SPARSE_DSV4`. + +| Backend | Dtypes | KV Dtypes | Block Sizes | Head Sizes | Sink | Non-Causal | Sparse | MM Prefix | DCP | Attention Types | Compute Cap. | +| ------- | ------ | --------- | ----------- | ---------- | ---- | ---------- | ------ | --------- | --- | --------------- | ------------ | +| `FLASHINFER_MLA_SPARSE_DSV4` | fp16, bf16 | `auto` | Any | Any | ❌ | ❌ | ❌ | ❌ | ❌ | Decoder | Any | +| `FLASHMLA_SPARSE_DSV4` | fp16, bf16 | `auto` | 256 | 512 | ❌ | ❌ | ❌ | ❌ | ❌ | Decoder | Any | +| `ROCM_FLASHMLA_SPARSE_DSV4` | fp16, bf16 | `auto` | Any | Any | ❌ | ❌ | ❌ | ❌ | ❌ | Decoder | N/A | diff --git a/docs/design/cuda_graphs_multimodal.md b/docs/design/cuda_graphs_multimodal.md index 1fb5c2ba651..5a9edc1ad93 100644 --- a/docs/design/cuda_graphs_multimodal.md +++ b/docs/design/cuda_graphs_multimodal.md @@ -82,6 +82,7 @@ Models opt-in to encoder CUDA Graphs by implementing the [SupportsEncoderCudaGra | Architecture | Models | CG for Image | CG for Video | | ------------ | ------ | ------------ | ------------ | +| `InternVLChatModel` | `InternVL3.5`, `InternVL3`, `InternVL2.5`, `InternVL2` | ✅︎ | ✅︎ | | `Qwen2VLForConditionalGeneration` | `Qwen2-VL` | ✅︎ | ✅︎ | | `Qwen2_5_VLForConditionalGeneration` | `Qwen2.5-VL` | ✅︎ | ✅︎ | | `Qwen3VLForConditionalGeneration` | `Qwen3-VL` | ✅︎ | ✅︎ | diff --git a/docs/features/quantization/README.md b/docs/features/quantization/README.md index 6c4aa7d8aaa..2be357d8860 100644 --- a/docs/features/quantization/README.md +++ b/docs/features/quantization/README.md @@ -3,7 +3,7 @@ Quantization trades off model precision for smaller memory footprint, allowing large models to be run on a wider range of devices. !!! tip - To get started with quantization, see [LLM Compressor](llm_compressor.md), a library for optimizing models for deployment with vLLM that supports FP8, INT8, INT4, and other quantization formats. + To get started with quantization, see [LLM Compressor](llm_compressor/README.md), a library for optimizing models for deployment with vLLM that supports FP8, INT8, INT4, and other quantization formats. The following are the supported quantization formats for vLLM: @@ -12,9 +12,11 @@ The following are the supported quantization formats for vLLM: - [GGUF](gguf.md) - [GPTQModel](gptqmodel.md) - [Intel Neural Compressor](inc.md) -- [INT4 W4A16](int4.md) -- [INT8 W8A8](int8.md) -- [FP8 W8A8](fp8.md) +- [LLM Compressor](llm_compressor/README.md) + - [FP8 W8A8](llm_compressor/fp8.md) + - [INT4 W4A16](llm_compressor/int4.md) + - [INT8 W4A8](llm_compressor/int8_w4a8.md) + - [INT8 W8A8](llm_compressor/int8_w8a8.md) - [NVIDIA Model Optimizer](modelopt.md) - [Online Quantization](online.md) - [AMD Quark](quark.md) @@ -46,16 +48,17 @@ th:not(:first-child) { } -| Implementation | Volta | Turing | Ampere | Ada | Hopper | AMD GPU | Intel GPU | x86 CPU | -| ------------------------- | ----- | ------ | ------ | --- | ------ | ------- | --------- | ------- | -| AWQ | ❌ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ✅︎ | ✅︎ | -| GPTQ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ✅︎ | ✅︎ | -| Marlin (GPTQ/AWQ/FP8/FP4) | ❌ | ✅︎* | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | -| INT8 (W8A8) | ❌ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ✅︎ | -| FP8 (W8A8) | ❌ | ❌ | ❌ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | -| bitsandbytes | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | -| DeepSpeedFP | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | -| GGUF | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | +| Implementation | Volta | Turing | Ampere | Ada | Hopper | AMD GPU | Intel GPU | x86 CPU | Arm CPU | +| ------------------------- | ----- | ------ | ------ | --- | ------ | ------- | --------- | ------- | ------- | +| AWQ | ❌ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ✅︎ | ✅︎ | ❌ | +| GPTQ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ✅︎ | ✅︎ | ❌ | +| Marlin (GPTQ/AWQ/FP8/FP4) | ❌ | ✅︎* | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | +| llm-compressor INT8 (W8A8)| ❌ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ✅︎ | ✅︎ | +| llm-compressor INT8 (W4A8)| ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ✅︎ | +| llm-compressor FP8 (W8A8) | ❌ | ❌ | ❌ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | +| bitsandbytes | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | +| DeepSpeedFP | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | +| GGUF | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | - Volta refers to SM 7.0, Turing to SM 7.5, Ampere to SM 8.0/8.6, Ada to SM 8.9, and Hopper to SM 9.0. - ✅︎ indicates that the quantization method is supported on the specified hardware. diff --git a/docs/features/quantization/llm_compressor.md b/docs/features/quantization/llm_compressor/README.md similarity index 100% rename from docs/features/quantization/llm_compressor.md rename to docs/features/quantization/llm_compressor/README.md diff --git a/docs/features/quantization/fp8.md b/docs/features/quantization/llm_compressor/fp8.md similarity index 86% rename from docs/features/quantization/fp8.md rename to docs/features/quantization/llm_compressor/fp8.md index 2de71ce8da1..5dc1a7d43a0 100644 --- a/docs/features/quantization/fp8.md +++ b/docs/features/quantization/llm_compressor/fp8.md @@ -21,9 +21,17 @@ The FP8 types typically supported in hardware have two distinct representations, To produce performant FP8 quantized models with vLLM, you'll need to install the [llm-compressor](https://github.com/vllm-project/llm-compressor/) library: ```bash -pip install llmcompressor +(venv-llm-compressor) pip install llmcompressor ``` +Additionally, install `vllm` and `lm-evaluation-harness` for evaluation: + +```bash +(venv-vllm) pip install vllm "lm-eval[api]>=0.4.12" +``` + +Please use separate environments for vLLM and llm-compressor as they might not work together. + ## Quantization Process The quantization process involves three main steps: @@ -57,36 +65,28 @@ For FP8 quantization, we can recover accuracy with simple RTN quantization. We r Since simple RTN does not require data for weight quantization and the activations are quantized dynamically, we do not need any calibration data for this quantization flow. -??? code +```python +from llmcompressor import oneshot +from llmcompressor.modifiers.quantization import QuantizationModifier - ```python - from llmcompressor import oneshot - from llmcompressor.modifiers.quantization import QuantizationModifier +# Configure the simple PTQ quantization +recipe = QuantizationModifier( + targets="Linear", + scheme="FP8_DYNAMIC", + ignore=["lm_head"], +) - # Configure the simple PTQ quantization - recipe = QuantizationModifier( - targets="Linear", - scheme="FP8_DYNAMIC", - ignore=["lm_head"], - ) +# Apply the quantization algorithm. +oneshot(model=model, recipe=recipe) - # Apply the quantization algorithm. - oneshot(model=model, recipe=recipe) - - # Save the model: Meta-Llama-3-8B-Instruct-FP8-Dynamic - SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic" - model.save_pretrained(SAVE_DIR) - tokenizer.save_pretrained(SAVE_DIR) - ``` +# Save the model: Meta-Llama-3-8B-Instruct-FP8-Dynamic +SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic" +model.save_pretrained(SAVE_DIR) +tokenizer.save_pretrained(SAVE_DIR) +``` ### 3. Evaluating Accuracy -Install `vllm` and `lm-evaluation-harness` for evaluation: - -```bash -pip install vllm "lm-eval[api]>=0.4.12" -``` - Load and run the model in `vllm`: ```python diff --git a/docs/features/quantization/int4.md b/docs/features/quantization/llm_compressor/int4.md similarity index 62% rename from docs/features/quantization/int4.md rename to docs/features/quantization/llm_compressor/int4.md index 41c4b40574f..0e54797397a 100644 --- a/docs/features/quantization/int4.md +++ b/docs/features/quantization/llm_compressor/int4.md @@ -12,15 +12,17 @@ Please visit the HF collection of [quantized INT4 checkpoints of popular LLMs re To use INT4 quantization with vLLM, you'll need to install the [llm-compressor](https://github.com/vllm-project/llm-compressor/) library: ```bash -pip install llmcompressor +(venv-llm-compressor) pip install llmcompressor ``` Additionally, install `vllm` and `lm-evaluation-harness` for evaluation: ```bash -pip install vllm "lm-eval[api]>=0.4.12" +(venv-vllm) pip install vllm "lm-eval[api]>=0.4.12" ``` +Please use separate environments for vLLM and llm-compressor as they might not work together. + ## Quantization Process The quantization process involves four main steps: @@ -52,55 +54,51 @@ When quantizing weights to INT4, you need sample data to estimate the weight upd It's best to use calibration data that closely matches your deployment data. For a general-purpose instruction-tuned model, you can use a dataset like `ultrachat`: -??? code +```python +from datasets import load_dataset - ```python - from datasets import load_dataset +NUM_CALIBRATION_SAMPLES = 512 +MAX_SEQUENCE_LENGTH = 2048 - NUM_CALIBRATION_SAMPLES = 512 - MAX_SEQUENCE_LENGTH = 2048 +# Load and preprocess the dataset +ds = load_dataset("HuggingFaceH4/ultrachat_200k", split="train_sft") +ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES)) - # Load and preprocess the dataset - ds = load_dataset("HuggingFaceH4/ultrachat_200k", split="train_sft") - ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES)) +def preprocess(example): + return {"text": tokenizer.apply_chat_template(example["messages"], tokenize=False)} +ds = ds.map(preprocess) - def preprocess(example): - return {"text": tokenizer.apply_chat_template(example["messages"], tokenize=False)} - ds = ds.map(preprocess) - - def tokenize(sample): - return tokenizer(sample["text"], padding=False, max_length=MAX_SEQUENCE_LENGTH, truncation=True, add_special_tokens=False) - ds = ds.map(tokenize, remove_columns=ds.column_names) - ``` +def tokenize(sample): + return tokenizer(sample["text"], padding=False, max_length=MAX_SEQUENCE_LENGTH, truncation=True, add_special_tokens=False) +ds = ds.map(tokenize, remove_columns=ds.column_names) +``` ### 3. Applying Quantization Now, apply the quantization algorithms: -??? code +```python +from llmcompressor import oneshot +from llmcompressor.modifiers.quantization import GPTQModifier +from llmcompressor.modifiers.smoothquant import SmoothQuantModifier - ```python - from llmcompressor import oneshot - from llmcompressor.modifiers.quantization import GPTQModifier - from llmcompressor.modifiers.smoothquant import SmoothQuantModifier +# Configure the quantization algorithms +recipe = GPTQModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"]) - # Configure the quantization algorithms - recipe = GPTQModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"]) +# Apply quantization +oneshot( + model=model, + dataset=ds, + recipe=recipe, + max_seq_length=MAX_SEQUENCE_LENGTH, + num_calibration_samples=NUM_CALIBRATION_SAMPLES, +) - # Apply quantization - oneshot( - model=model, - dataset=ds, - recipe=recipe, - max_seq_length=MAX_SEQUENCE_LENGTH, - num_calibration_samples=NUM_CALIBRATION_SAMPLES, - ) - - # Save the compressed model: Meta-Llama-3-8B-Instruct-W4A16-G128 - SAVE_DIR = MODEL_ID.split("/")[1] + "-W4A16-G128" - model.save_pretrained(SAVE_DIR, save_compressed=True) - tokenizer.save_pretrained(SAVE_DIR) - ``` +# Save the compressed model: Meta-Llama-3-8B-Instruct-W4A16-G128 +SAVE_DIR = MODEL_ID.split("/")[1] + "-W4A16-G128" +model.save_pretrained(SAVE_DIR, save_compressed=True) +tokenizer.save_pretrained(SAVE_DIR) +``` This process creates a W4A16 model with weights quantized to 4-bit integers. @@ -141,36 +139,34 @@ lm_eval --model vllm \ The following is an example of an expanded quantization recipe you can tune to your own use case: -??? code - - ```python - from compressed_tensors.quantization import ( - QuantizationArgs, - QuantizationScheme, - QuantizationStrategy, - QuantizationType, - ) - recipe = GPTQModifier( - targets="Linear", - config_groups={ - "config_group": QuantizationScheme( - targets=["Linear"], - weights=QuantizationArgs( - num_bits=4, - type=QuantizationType.INT, - strategy=QuantizationStrategy.GROUP, - group_size=128, - symmetric=True, - dynamic=False, - actorder="weight", - ), +```python +from compressed_tensors.quantization import ( + QuantizationArgs, + QuantizationScheme, + QuantizationStrategy, + QuantizationType, +) +recipe = GPTQModifier( + targets="Linear", + config_groups={ + "config_group": QuantizationScheme( + targets=["Linear"], + weights=QuantizationArgs( + num_bits=4, + type=QuantizationType.INT, + strategy=QuantizationStrategy.GROUP, + group_size=128, + symmetric=True, + dynamic=False, + actorder="weight", ), - }, - ignore=["lm_head"], - update_size=NUM_CALIBRATION_SAMPLES, - dampening_frac=0.01, - ) - ``` + ), + }, + ignore=["lm_head"], + update_size=NUM_CALIBRATION_SAMPLES, + dampening_frac=0.01, +) +``` ## Troubleshooting and Support diff --git a/docs/features/quantization/llm_compressor/int8_w4a8.md b/docs/features/quantization/llm_compressor/int8_w4a8.md new file mode 100644 index 00000000000..cc6a0982832 --- /dev/null +++ b/docs/features/quantization/llm_compressor/int8_w4a8.md @@ -0,0 +1,217 @@ +# INT8 W4A8 + +vLLM supports quantizing weights to INT4 and activations to INT8 for memory savings and inference acceleration. +This quantization method is particularly useful for reducing model size while maintaining good performance. + +## Prerequisites + +To use INT8 W4A8 quantization with vLLM, you'll need to install the [llm-compressor](https://github.com/vllm-project/llm-compressor/) library. + +```bash +(venv-llm-compressor) pip install llmcompressor +``` + +Additionally, install `vllm` and `lm-evaluation-harness` for evaluation: + +```bash +(venv-vllm) pip install vllm "lm-eval[api]>=0.4.12" +``` + +Please use separate environments for vLLM and llm-compressor as they might not work together. + +## Quantization Process + +The quantization process involves four main steps: + +1. Loading the model +2. Preparing calibration data +3. Applying quantization +4. Evaluating accuracy in vLLM + +### 1. Loading the Model + +Load your model and tokenizer using the standard `transformers` AutoModel classes: + +```python +from transformers import AutoTokenizer, AutoModelForCausalLM + +MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" +model = AutoModelForCausalLM.from_pretrained( + MODEL_ID, + dtype="auto", +) +tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) +``` + +### 2. Preparing Calibration Data + +When quantizing activations to INT8 and weights to INT4, you need sample data to estimate the activation scales. +It's best to use calibration data that closely matches your deployment data. +For a general-purpose instruction-tuned model, you can use a dataset like `ultrachat`: + +```python +from datasets import load_dataset + +NUM_CALIBRATION_SAMPLES = 512 +MAX_SEQUENCE_LENGTH = 2048 + +# Load and preprocess the dataset +ds = load_dataset("HuggingFaceH4/ultrachat_200k", split="train_sft") +ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES)) + +def preprocess(example): + return {"text": tokenizer.apply_chat_template(example["messages"], tokenize=False)} +ds = ds.map(preprocess) + +def tokenize(sample): + return tokenizer(sample["text"], padding=False, max_length=MAX_SEQUENCE_LENGTH, truncation=True, add_special_tokens=False) +ds = ds.map(tokenize, remove_columns=ds.column_names) +``` + +### 3. Applying Quantization + +Now, apply the quantization algorithms. + +The following recipes create W4A8 models (int4 weights, int8 activations). On Arm® CPUs, this is accelerated through [KleidiAI](https://github.com/ARM-software/kleidiai). + +Use groupwise for best accuracy, and channelwise for best inference performance. + +=== "Groupwise" + + ```python + from llmcompressor import oneshot + from llmcompressor.modifiers.quantization import GPTQModifier + + # Configure the quantization algorithms + recipe = [ + GPTQModifier( + targets="Linear", + scheme="W4A8", + ignore=["lm_head"], + dampening_frac=0.01 + ), + ] + + # Apply quantization + oneshot( + model=model, + dataset=ds, + recipe=recipe, + max_seq_length=MAX_SEQUENCE_LENGTH, + num_calibration_samples=NUM_CALIBRATION_SAMPLES, + ) + + # Save the compressed model: Meta-Llama-3-8B-Instruct-W4A8-G128-Dynamic-Per-Token + SAVE_DIR = MODEL_ID.split("/")[1] + "-W4A8-G128-Dynamic-Per-Token" + model.save_pretrained(SAVE_DIR, save_compressed=True) + tokenizer.save_pretrained(SAVE_DIR) + ``` + +=== "Channelwise" + + ```python + from llmcompressor import oneshot + from llmcompressor.modifiers.quantization import GPTQModifier + from compressed_tensors.quantization import QuantizationStrategy, QuantizationType + + scheme = { + "targets": ["Linear"], + "weights": { + "num_bits": 4, + "type": QuantizationType.INT, + "strategy": QuantizationStrategy.CHANNEL, + "symmetric": True, + "dynamic": False, + "group_size": None, + }, + "input_activations": { + "num_bits": 8, + "type": QuantizationType.INT, + "strategy": QuantizationStrategy.TOKEN, + "dynamic": True, + "symmetric": False, + "observer": None, + }, + "output_activations": None, + } + + recipe = [ + GPTQModifier( + targets="Linear", + config_groups={"group_0": scheme}, + ignore=["lm_head"], + dampening_frac=0.01, + ), + ] + + oneshot( + model=model, + dataset=ds, + recipe=recipe, + max_seq_length=MAX_SEQUENCE_LENGTH, + num_calibration_samples=NUM_CALIBRATION_SAMPLES, + ) + + # Save the compressed model: Meta-Llama-3-8B-Instruct-W4A8-Channelwise-Dynamic-Per-Token + SAVE_DIR = MODEL_ID.split("/")[1] + "-W4A8-Channelwise-Dynamic-Per-Token" + model.save_pretrained(SAVE_DIR, save_compressed=True) + tokenizer.save_pretrained(SAVE_DIR) + ``` + +### 4. Evaluating Accuracy + +=== "Groupwise" + + After quantization, you can load and run the model in vLLM: + + ```python + from vllm import LLM + + llm = LLM("./Meta-Llama-3-8B-Instruct-W4A8-G128-Dynamic-Per-Token") + ``` + + To evaluate accuracy, you can use `lm_eval`: + + ```bash + lm_eval --model vllm \ + --model_args pretrained="./Meta-Llama-3-8B-Instruct-W4A8-G128-Dynamic-Per-Token",add_bos_token=true \ + --tasks gsm8k \ + --num_fewshot 5 \ + --limit 250 \ + --batch_size 'auto' + ``` + +=== "Channelwise" + + After quantization, you can load and run the model in vLLM: + + ```python + from vllm import LLM + + llm = LLM("./Meta-Llama-3-8B-Instruct-W4A8-Channelwise-Dynamic-Per-Token") + ``` + + To evaluate accuracy, you can use `lm_eval`: + + ```bash + lm_eval --model vllm \ + --model_args pretrained="./Meta-Llama-3-8B-Instruct-W4A8-Channelwise-Dynamic-Per-Token",add_bos_token=true \ + --tasks gsm8k \ + --num_fewshot 5 \ + --limit 250 \ + --batch_size 'auto' + ``` + +!!! note + Quantized models can be sensitive to the presence of the `bos` token. Make sure to include the `add_bos_token=True` argument when running evaluations. + +## Best Practices + +- Start with 512 samples for calibration data (increase if accuracy drops) +- Use a sequence length of 2048 as a starting point +- Employ the chat template or instruction template that the model was trained with +- If you've fine-tuned a model, consider using a sample of your training data for calibration + +## Troubleshooting and Support + +If you encounter any issues or have feature requests, please open an issue on the [vllm-project/llm-compressor](https://github.com/vllm-project/llm-compressor/issues) GitHub repository. diff --git a/docs/features/quantization/int8.md b/docs/features/quantization/llm_compressor/int8_w8a8.md similarity index 66% rename from docs/features/quantization/int8.md rename to docs/features/quantization/llm_compressor/int8_w8a8.md index 547eb5aedc2..21ed00d1393 100644 --- a/docs/features/quantization/int8.md +++ b/docs/features/quantization/llm_compressor/int8_w8a8.md @@ -17,15 +17,17 @@ Please visit the HF collection of [quantized INT8 checkpoints of popular LLMs re To use INT8 quantization with vLLM, you'll need to install the [llm-compressor](https://github.com/vllm-project/llm-compressor/) library: ```bash -pip install llmcompressor +(venv-llm-compressor) pip install llmcompressor ``` Additionally, install `vllm` and `lm-evaluation-harness` for evaluation: ```bash -pip install vllm "lm-eval[api]>=0.4.12" +(venv-vllm) pip install vllm "lm-eval[api]>=0.4.12" ``` +Please use separate environments for vLLM and llm-compressor as they might not work together. + ## Quantization Process The quantization process involves four main steps: @@ -57,26 +59,24 @@ When quantizing activations to INT8, you need sample data to estimate the activa It's best to use calibration data that closely matches your deployment data. For a general-purpose instruction-tuned model, you can use a dataset like `ultrachat`: -??? code +```python +from datasets import load_dataset - ```python - from datasets import load_dataset +NUM_CALIBRATION_SAMPLES = 512 +MAX_SEQUENCE_LENGTH = 2048 - NUM_CALIBRATION_SAMPLES = 512 - MAX_SEQUENCE_LENGTH = 2048 +# Load and preprocess the dataset +ds = load_dataset("HuggingFaceH4/ultrachat_200k", split="train_sft") +ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES)) - # Load and preprocess the dataset - ds = load_dataset("HuggingFaceH4/ultrachat_200k", split="train_sft") - ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES)) +def preprocess(example): + return {"text": tokenizer.apply_chat_template(example["messages"], tokenize=False)} +ds = ds.map(preprocess) - def preprocess(example): - return {"text": tokenizer.apply_chat_template(example["messages"], tokenize=False)} - ds = ds.map(preprocess) - - def tokenize(sample): - return tokenizer(sample["text"], padding=False, max_length=MAX_SEQUENCE_LENGTH, truncation=True, add_special_tokens=False) - ds = ds.map(tokenize, remove_columns=ds.column_names) - ``` +def tokenize(sample): + return tokenizer(sample["text"], padding=False, max_length=MAX_SEQUENCE_LENGTH, truncation=True, add_special_tokens=False) +ds = ds.map(tokenize, remove_columns=ds.column_names) +``` @@ -84,33 +84,31 @@ For a general-purpose instruction-tuned model, you can use a dataset like `ultra Now, apply the quantization algorithms: -??? code +```python +from llmcompressor import oneshot +from llmcompressor.modifiers.quantization import GPTQModifier +from llmcompressor.modifiers.smoothquant import SmoothQuantModifier - ```python - from llmcompressor import oneshot - from llmcompressor.modifiers.quantization import GPTQModifier - from llmcompressor.modifiers.smoothquant import SmoothQuantModifier +# Configure the quantization algorithms +recipe = [ + SmoothQuantModifier(smoothing_strength=0.8), + GPTQModifier(targets="Linear", scheme="W8A8", ignore=["lm_head"]), +] - # Configure the quantization algorithms - recipe = [ - SmoothQuantModifier(smoothing_strength=0.8), - GPTQModifier(targets="Linear", scheme="W8A8", ignore=["lm_head"]), - ] +# Apply quantization +oneshot( + model=model, + dataset=ds, + recipe=recipe, + max_seq_length=MAX_SEQUENCE_LENGTH, + num_calibration_samples=NUM_CALIBRATION_SAMPLES, +) - # Apply quantization - oneshot( - model=model, - dataset=ds, - recipe=recipe, - max_seq_length=MAX_SEQUENCE_LENGTH, - num_calibration_samples=NUM_CALIBRATION_SAMPLES, - ) - - # Save the compressed model: Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Per-Token - SAVE_DIR = MODEL_ID.split("/")[1] + "-W8A8-Dynamic-Per-Token" - model.save_pretrained(SAVE_DIR, save_compressed=True) - tokenizer.save_pretrained(SAVE_DIR) - ``` +# Save the compressed model: Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Per-Token +SAVE_DIR = MODEL_ID.split("/")[1] + "-W8A8-Dynamic-Per-Token" +model.save_pretrained(SAVE_DIR, save_compressed=True) +tokenizer.save_pretrained(SAVE_DIR) +``` This process creates a W8A8 model with weights and activations quantized to 8-bit integers. diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index 19cccdc12f5..c0a034bd96a 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -569,6 +569,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen | `GlmOcrForConditionalGeneration` | GLM-OCR | T + IE+ | `zai-org/GLM-OCR`, etc. | ✅︎ | ✅︎ | | `Granite4VisionForConditionalGeneration` | Granite 4 Vision | T + IE+ | `ibm-granite/granite-4.1-3b-vision`, etc. | ✅︎ | ✅︎ | | `GraniteSpeechForConditionalGeneration` | Granite Speech | T + A | `ibm-granite/granite-speech-3.3-8b` | ✅︎ | ✅︎ | +| `GraniteSpeechPlusForConditionalGeneration` | Granite Speech Plus | T + A | `ibm-granite/granite-speech-4.1-2b-plus` | ✅︎ | ✅︎ | | `HCXVisionForCausalLM` | HyperCLOVAX-SEED-Vision-Instruct-3B | T + I+ + V+ | `naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B` | | | | `HCXVisionV2ForCausalLM` | HyperCLOVAX-SEED-Think-32B | T + I+ + V+ | `naver-hyperclovax/HyperCLOVAX-SEED-Think-32B` | | | | `H2OVLChatModel` | H2OVL | T + IE+ | `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc. | ✅︎ | ✅︎ | @@ -709,6 +710,7 @@ Speech2Text models trained specifically for Automatic Speech Recognition. | `Gemma3nForConditionalGeneration` | Gemma3n | `google/gemma-3n-E2B-it`, `google/gemma-3n-E4B-it`, etc. | | | | `GlmAsrForConditionalGeneration` | GLM-ASR | `zai-org/GLM-ASR-Nano-2512` | ✅︎ | ✅︎ | | `GraniteSpeechForConditionalGeneration` | Granite Speech | `ibm-granite/granite-4.0-1b-speech`, `ibm-granite/granite-speech-3.3-2b`, etc. | ✅︎ | ✅︎ | +| `GraniteSpeechPlusForConditionalGeneration` | Granite Speech Plus | `ibm-granite/granite-speech-4.1-2b-plus` | ✅︎ | ✅︎ | | `Qwen3ASRForConditionalGeneration` | Qwen3-ASR | `Qwen/Qwen3-ASR-1.7B`, etc. | ✅︎ | ✅︎ | | `Qwen3OmniMoeThinkerForConditionalGeneration` | Qwen3-Omni | `Qwen/Qwen3-Omni-30B-A3B-Instruct`, etc. | | ✅︎ | | `VoxtralForConditionalGeneration` | Voxtral (Mistral format) | `mistralai/Voxtral-Mini-3B-2507`, `mistralai/Voxtral-Small-24B-2507`, etc. | ✅︎ | ✅︎ | diff --git a/docs/pre_run_check.sh b/docs/pre_run_check.sh index 464766c42ec..4228e4954fe 100644 --- a/docs/pre_run_check.sh +++ b/docs/pre_run_check.sh @@ -24,8 +24,14 @@ echo "Checking pre-commit/pre-run-check status..." MAX_WAIT=300 INTERVAL=60 ELAPSED=0 +# Use a GitHub token if provided to raise the API rate limit (60 -> 5000 +# requests/hour). Set GITHUB_TOKEN in the Read the Docs environment variables. +CURL_AUTH=() +if [ -n "$GITHUB_TOKEN" ]; then + CURL_AUTH=(-H "Authorization: Bearer $GITHUB_TOKEN") +fi while :; do - RAW=$(curl -sS -w "\n%{http_code}" "https://api.github.com/repos/vllm-project/vllm/commits/${READTHEDOCS_GIT_COMMIT_HASH}/check-runs?check_name=pre-run-check&filter=latest") + RAW=$(curl -sS "${CURL_AUTH[@]}" -w "\n%{http_code}" "https://api.github.com/repos/vllm-project/vllm/commits/${READTHEDOCS_GIT_COMMIT_HASH}/check-runs?check_name=pre-run-check&filter=latest") HTTP_CODE=$(printf %s "$RAW" | tail -n1) BODY=$(printf %s "$RAW" | sed '$d') if [ "$HTTP_CODE" != "200" ]; then diff --git a/examples/generate/multimodal/vision_language_offline.py b/examples/generate/multimodal/vision_language_offline.py index b4e34bd6438..4d47d9f8b45 100644 --- a/examples/generate/multimodal/vision_language_offline.py +++ b/examples/generate/multimodal/vision_language_offline.py @@ -2554,6 +2554,7 @@ MODELS_NEED_VIDEO_METADATA = [ MODELS_SUPPORT_VIT_CUDA_GRAPH = [ + "internvl_chat", "qwen2_5_vl", "qwen3_vl", "qwen3_vl_moe", diff --git a/mkdocs.yaml b/mkdocs.yaml index 097f7497fb2..1fee824f3b2 100644 --- a/mkdocs.yaml +++ b/mkdocs.yaml @@ -110,6 +110,9 @@ plugins: redirect_maps: features/spec_decode/README.md: features/speculative_decoding/README.md features/spec_decode/speculators.md: features/speculative_decoding/speculators.md + features/quantization/fp8.md: features/quantization/llm_compressor/fp8.md + features/quantization/int4.md: features/quantization/llm_compressor/int4.md + features/quantization/int8.md: features/quantization/llm_compressor/int8_w8a8.md serving/openai_compatible_server.md: serving/online_serving/README.md markdown_extensions: diff --git a/requirements/common.txt b/requirements/common.txt index d37ef1f1fed..8141dc8ea6b 100644 --- a/requirements/common.txt +++ b/requirements/common.txt @@ -38,7 +38,7 @@ pyyaml six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12 setuptools>=77.0.3,<81.0.0; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12 einops # Required for Qwen2-VL. -compressed-tensors == 0.15.0.1 # required for compressed-tensors +compressed-tensors == 0.17.0 # required for compressed-tensors depyf==0.20.0 # required for profiling and debugging with compilation config cloudpickle # allows pickling lambda functions in model_executor/models/registry.py watchfiles # required for http server to monitor the updates of TLS files diff --git a/requirements/cuda.txt b/requirements/cuda.txt index b0e16d11c75..618f8ae0a37 100644 --- a/requirements/cuda.txt +++ b/requirements/cuda.txt @@ -18,7 +18,7 @@ tilelang==0.1.9 nvidia-cudnn-frontend>=1.13.0,<1.19.0 # Required for faster safetensors model loading -fastsafetensors >= 0.2.2 +fastsafetensors >= 0.3.2 # QuACK and Cutlass DSL for FA4 (cute-DSL implementation) nvidia-cutlass-dsl[cu13]==4.5.2 @@ -28,4 +28,4 @@ quack-kernels>=0.3.3 tokenspeed-mla==0.1.2 # Humming kernels for quantization gemm -humming-kernels[cu13]==0.1.2 +humming-kernels[cu13]==0.1.4 diff --git a/requirements/rocm.txt b/requirements/rocm.txt index 0520f4ca1e9..4ca70738303 100644 --- a/requirements/rocm.txt +++ b/requirements/rocm.txt @@ -19,7 +19,10 @@ setuptools-rust>=1.9.0 runai-model-streamer[s3,gcs,azure]==0.15.7 conch-triton-kernels==1.2.1 timm>=1.0.17 -# amd-quark: required for Quark quantization on ROCm +# amd-quark: required for Quark quantization on ROCm # To be consistent with test_quark.py amd-quark>=0.8.99 tilelang==0.1.10 + +# Required for faster safetensors model loading +fastsafetensors >= 0.3.2 diff --git a/requirements/test/cuda.in b/requirements/test/cuda.in index 6c786491603..344a58ec1bb 100644 --- a/requirements/test/cuda.in +++ b/requirements/test/cuda.in @@ -57,7 +57,7 @@ arctic-inference == 0.1.1; platform_machine == "x86_64" # Required for suffix de numba == 0.65.0 # Required for N-gram speculative decoding numpy runai-model-streamer[s3,gcs,azure]==0.15.7 -fastsafetensors>=0.2.2; platform_machine == "x86_64" # 0.2.2 contains important fixes for multi-GPU mem usage +fastsafetensors>=0.3.2 instanttensor>=0.1.5; platform_machine == "x86_64" pydantic>=2.12 # 2.11 leads to error on python 3.13 decord==0.6.0; platform_machine == "x86_64" diff --git a/requirements/test/cuda.txt b/requirements/test/cuda.txt index 245a86f93be..7d847d10577 100644 --- a/requirements/test/cuda.txt +++ b/requirements/test/cuda.txt @@ -191,7 +191,7 @@ fastparquet==2024.11.0 # via genai-perf fastrlock==0.8.2 # via cupy-cuda12x -fastsafetensors==0.2.2 +fastsafetensors==0.3.2 # via # -c requirements/cuda.txt # -r requirements/test/cuda.in diff --git a/requirements/test/nightly-torch.txt b/requirements/test/nightly-torch.txt index 9c70aa8b90e..89fd4ea9b43 100644 --- a/requirements/test/nightly-torch.txt +++ b/requirements/test/nightly-torch.txt @@ -43,6 +43,6 @@ tritonclient>=2.51.0 numba == 0.65.0 # Required for N-gram speculative decoding numpy runai-model-streamer[s3,gcs,azure]==0.15.7 -fastsafetensors>=0.2.2 +fastsafetensors>=0.3.2 instanttensor>=0.1.5 pydantic>=2.12 # 2.11 leads to error on python 3.13 diff --git a/requirements/test/rocm.in b/requirements/test/rocm.in index 97e0658fb10..0a615831774 100644 --- a/requirements/test/rocm.in +++ b/requirements/test/rocm.in @@ -56,7 +56,7 @@ arctic-inference==0.1.1 # Required for suffix decoding test numba==0.65.0 # Required for N-gram speculative decoding numpy runai-model-streamer[s3,gcs,azure]==0.15.7 -fastsafetensors @ git+https://github.com/foundation-model-stack/fastsafetensors.git@0.2.2 # PyPI only ships CUDA wheels +fastsafetensors>=0.3.2 instanttensor>=0.1.5 pydantic>=2.12 # 2.11 leads to error on python 3.13 decord==0.6.0 diff --git a/requirements/test/rocm.txt b/requirements/test/rocm.txt index c39f268709b..e0232d8b6d3 100644 --- a/requirements/test/rocm.txt +++ b/requirements/test/rocm.txt @@ -143,7 +143,7 @@ colorful==0.5.8 # via ray colorlog==6.10.1 # via optuna -compressed-tensors==0.15.0.1 +compressed-tensors==0.17.0 # via # -c requirements/common.txt # -r requirements/test/../common.txt @@ -240,8 +240,10 @@ fastar==0.10.0 # via fastapi-cloud-cli fastparquet==2026.3.0 # via genai-perf -fastsafetensors @ git+https://github.com/foundation-model-stack/fastsafetensors.git@65d80088fca7a8f567fba30415fbcc80f7d2259c - # via -r requirements/test/rocm.in +fastsafetensors==0.3.2 + # via + # -c requirements/rocm.txt + # -r requirements/test/rocm.in filelock==3.25.2 # via # -c requirements/common.txt diff --git a/setup.py b/setup.py index 07374807bee..b674d55a14a 100644 --- a/setup.py +++ b/setup.py @@ -1168,7 +1168,7 @@ setup( "zen": ["zentorch==2.11.0.0"], "bench": ["pandas", "matplotlib", "seaborn", "datasets", "scipy", "plotly"], "tensorizer": ["tensorizer==2.10.1"], - "fastsafetensors": ["fastsafetensors >= 0.2.2"], + "fastsafetensors": ["fastsafetensors >= 0.3.2"], "instanttensor": ["instanttensor >= 0.1.5"], "runai": ["runai-model-streamer[s3,gcs,azure] >= 0.15.7"], "audio": [ diff --git a/tests/compile/passes/distributed/test_fusion_all_reduce.py b/tests/compile/passes/distributed/test_fusion_all_reduce.py index 1a175b8dd33..4805863057d 100644 --- a/tests/compile/passes/distributed/test_fusion_all_reduce.py +++ b/tests/compile/passes/distributed/test_fusion_all_reduce.py @@ -14,6 +14,7 @@ from vllm.compilation.passes.fusion.allreduce_rms_fusion import ( AllReduceFusionPass, RocmAiterAllReduceFusionPass, ) +from vllm.compilation.passes.fx_utils import find_op_nodes from vllm.compilation.passes.utility.fix_functionalization import ( FixFunctionalizationPass, ) @@ -33,7 +34,7 @@ from vllm.distributed.parallel_state import ( init_distributed_environment, initialize_model_parallel, ) -from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.layernorm import GemmaRMSNorm, RMSNorm from vllm.model_executor.layers.quantization.utils.quant_utils import ( kFp8StaticTensorSym, ) @@ -91,6 +92,49 @@ class TestAllReduceRMSNormModel(torch.nn.Module): return [torch.ops.vllm.flashinfer_trtllm_fused_allreduce_norm.default] +class TestAllReduceGemmaRMSNormModel(torch.nn.Module): + def __init__( + self, + hidden_size=16, + token_num=16, + eps=1e-6, + dtype: torch.dtype = torch.float16, + ): + super().__init__() + self.hidden_size = hidden_size + self.eps = eps + self.norm = [GemmaRMSNorm(hidden_size, eps) for _ in range(4)] + # Non-trivial weight (~Gemma range) so (1 + w) exercises the scale path. + for n in self.norm: + n.weight.data.normal_(mean=0.0, std=0.1) + self.w = [torch.rand(hidden_size, hidden_size) for _ in range(3)] + + def forward(self, x): + # avoid having graph input be an arg to a pattern directly + z = torch.relu(x) + x = resid = tensor_model_parallel_all_reduce(z) + y = self.norm[0](x) + + z2 = torch.mm(y, self.w[0]) + x2 = tensor_model_parallel_all_reduce(z2) + y2, resid = self.norm[1](x2, resid) + + z3 = torch.mm(y2, self.w[1]) + x3 = tensor_model_parallel_all_reduce(z3) + y3, resid = self.norm[2](x3, resid) + + z4 = torch.mm(y3, self.w[2]) + x4 = tensor_model_parallel_all_reduce(z4) + y4, resid = self.norm[3](x4, resid) + return y4 + + def ops_in_model_before(self): + return [torch.ops.vllm.all_reduce.default] + + def ops_in_model_after(self): + return [torch.ops.vllm.flashinfer_trtllm_fused_allreduce_norm.default] + + class TestAllReduceRMSNormStaticQuantFP8Model(torch.nn.Module): quant_key = kFp8StaticTensorSym @@ -209,6 +253,15 @@ class TestAllReduceFusedAddRMSNormStaticQuantFP4Model(torch.nn.Module): "test_model, enable_quant_fp8_custom_op, use_aiter", [ (TestAllReduceRMSNormModel, False, IS_AITER_FOUND), + pytest.param( + TestAllReduceGemmaRMSNormModel, + False, + False, + marks=pytest.mark.skipif( + current_platform.is_rocm(), + reason="Not supported on ROCm platform", + ), + ), pytest.param( TestAllReduceRMSNormStaticQuantFP8Model, True, @@ -404,4 +457,9 @@ def all_reduce_fusion_pass_on_test_model( ) backend.check_before_ops(model.ops_in_model_before(), fully_replaced=False) backend.check_after_ops(model.ops_in_model_after()) + if test_model_cls is TestAllReduceGemmaRMSNormModel: + fused_op = torch.ops.vllm.flashinfer_trtllm_fused_allreduce_norm.default + fused_nodes = list(find_op_nodes(fused_op, backend.graph_post_pass)) + assert fused_nodes + assert all(n.kwargs.get("weight_bias") == 1.0 for n in fused_nodes) del all_reduce_fusion_pass diff --git a/tests/compile/test_inductor_fallback_allow_list_patch.py b/tests/compile/test_inductor_fallback_allow_list_patch.py new file mode 100644 index 00000000000..29fe9962e34 --- /dev/null +++ b/tests/compile/test_inductor_fallback_allow_list_patch.py @@ -0,0 +1,250 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Tests for the Inductor FALLBACK_ALLOW_LIST patch in env_override.py. + +The patch wraps ``torch._inductor.lowering.FALLBACK_ALLOW_LIST`` in a thin +proxy that auto-allows any custom op in the ``vllm::`` or ``vllm_aiter::`` +namespaces. This routes those ops through Inductor's fast-path +``make_fallback(target, warn=False, override_decomp=True)`` and avoids the +expensive ``error.operator_str(target, args, kwargs)`` formatting that +recursively stringifies every input ``TensorBox``. + +The slow path is what made ``torch.compile`` effectively hang on Kimi-K2.6 +TP=8 (deep MoE/TP IR provenance trees). These tests cover both the proxy's +semantics in isolation and the membership-check fast-path that Inductor's +``GraphLowering.call_function`` actually performs, so we can validate the +optimization without needing a full GPU compile. +""" + +import time + +import pytest + +from vllm.env_override import ( + _patch_inductor_fallback_allow_list, + _VllmFallbackAllowList, +) + + +class TestVllmFallbackAllowListProxy: + """Unit tests for the membership-proxy semantics.""" + + def test_vllm_namespace_auto_allowed(self): + proxy = _VllmFallbackAllowList(set()) + assert "vllm::all_reduce" in proxy + assert "vllm::fused_add_rms_norm" in proxy + assert "vllm::all_reduce.default" in proxy + + def test_vllm_aiter_namespace_auto_allowed(self): + proxy = _VllmFallbackAllowList(set()) + assert "vllm_aiter::fused_add_rms_norm" in proxy + assert "vllm_aiter::rocm_aiter_fused_moe" in proxy + + def test_unknown_namespace_falls_through(self): + proxy = _VllmFallbackAllowList({"torchvision::roi_align"}) + assert "torchvision::roi_align" in proxy + assert "made_up_ns::nonexistent_op" not in proxy + + def test_non_string_falls_through_to_inner(self): + sentinel = object() + inner = {sentinel} + proxy = _VllmFallbackAllowList(inner) + assert sentinel in proxy + assert object() not in proxy + + def test_prefix_only_match_not_substring(self): + proxy = _VllmFallbackAllowList(set()) + assert "not_vllm::something" not in proxy + assert " vllm::space_prefixed" not in proxy + + def test_standard_entries_preserved(self): + base = {"torchvision::roi_align", "aten::index_add"} + proxy = _VllmFallbackAllowList(base) + assert "torchvision::roi_align" in proxy + assert "aten::index_add" in proxy + assert "aten::__not_present__" not in proxy + + def test_add_and_discard_delegate_to_inner(self): + inner: set[str] = set() + proxy = _VllmFallbackAllowList(inner) + proxy.add("custom::op") + assert "custom::op" in inner + proxy.discard("custom::op") + assert "custom::op" not in inner + + def test_iter_len_repr(self): + base = {"torchvision::roi_align", "aten::index_add"} + proxy = _VllmFallbackAllowList(base) + assert set(iter(proxy)) == base + assert len(proxy) == len(base) + assert "torchvision::roi_align" in repr(proxy) + + def test_getattr_delegates_to_inner(self): + class _Inner: + sentinel = "i_am_inner" + + def some_method(self): + return 42 + + inner = _Inner() + proxy = _VllmFallbackAllowList(inner) + assert proxy.sentinel == "i_am_inner" + assert proxy.some_method() == 42 + + def test_sentinel_attribute(self): + proxy = _VllmFallbackAllowList(set()) + assert proxy._vllm_patched is True + + +class TestPatchApplication: + """Integration tests verifying the patch reaches ``torch._inductor``.""" + + def test_patch_applied_to_lowering(self): + import torch._inductor.lowering as _lowering + + assert getattr(_lowering.FALLBACK_ALLOW_LIST, "_vllm_patched", False), ( + "env_override._patch_inductor_fallback_allow_list did not run" + ) + + def test_graph_module_local_binding_rebound(self): + # ``torch/_inductor/graph.py`` does: + # from torch._inductor.lowering import FALLBACK_ALLOW_LIST + # so the patch has to overwrite the graph module's local binding too, + # otherwise the fast-path check in GraphLowering.call_function still + # sees the original (unwrapped) OrderedSet. + import torch._inductor.graph as _graph + import torch._inductor.lowering as _lowering + + if not hasattr(_graph, "FALLBACK_ALLOW_LIST"): + pytest.skip( + "torch._inductor.graph no longer imports FALLBACK_ALLOW_LIST " + "as a module-level symbol; nothing to rebind." + ) + + assert _graph.FALLBACK_ALLOW_LIST is _lowering.FALLBACK_ALLOW_LIST + + def test_patch_is_idempotent(self): + import torch._inductor.lowering as _lowering + + first = _lowering.FALLBACK_ALLOW_LIST + _patch_inductor_fallback_allow_list() + _patch_inductor_fallback_allow_list() + assert _lowering.FALLBACK_ALLOW_LIST is first + + def test_real_vllm_ops_in_real_allow_list(self): + # End-to-end membership check using the live (already-patched) object. + import torch._inductor.lowering as _lowering + + allow_list = _lowering.FALLBACK_ALLOW_LIST + assert "vllm::all_reduce" in allow_list + assert "vllm::fused_add_rms_norm" in allow_list + assert "vllm_aiter::fused_add_rms_norm" in allow_list + + +class TestInductorFallbackFastPath: + """Emulates ``GraphLowering.call_function``'s FALLBACK_ALLOW_LIST check. + + The relevant snippet in ``torch/_inductor/graph.py`` is roughly:: + + base_name = target.name() + if base_name not in FALLBACK_ALLOW_LIST: + log.info( + "Creating implicit fallback for:\\n%s", + error.operator_str(target, args, kwargs), + ) + out = make_fallback(target, ...) + + On a deep MoE/TP graph (Kimi-K2.6 at TP=4/8) ``operator_str`` recurses + through every input ``TensorBox.__str__`` and ends up taking many minutes + of CPU per encountered op. The patch ensures the membership test + short-circuits for ``vllm::*``/``vllm_aiter::*`` ops so the slow path is + never entered. These tests pin that behaviour without needing a real + GPU compile. + """ + + def _simulate_graph_lowering(self, target_names: list[str]): + """Returns the set of target names that would have hit the slow + operator_str() path under the patched FALLBACK_ALLOW_LIST. + """ + import torch._inductor.lowering as _lowering + + allow_list = _lowering.FALLBACK_ALLOW_LIST + slow_path_hits: list[str] = [] + for name in target_names: + if name not in allow_list: + slow_path_hits.append(name) + return slow_path_hits + + def test_vllm_ops_skip_slow_path(self): + slow = self._simulate_graph_lowering( + [ + "vllm::all_reduce", + "vllm::fused_add_rms_norm", + "vllm_aiter::rocm_aiter_fused_moe", + "vllm_aiter::asm_moe", + ] + ) + assert slow == [], ( + "Patched FALLBACK_ALLOW_LIST must short-circuit for all " + f"vllm::*/vllm_aiter::* ops; got slow-path hits: {slow}" + ) + + def test_non_vllm_ops_still_hit_slow_path(self): + # Without the patch this is also what would happen; with the patch + # the behaviour for non-vllm namespaces must be unchanged. + slow = self._simulate_graph_lowering( + ["my_user_ns::custom_op", "fancy_ns::something_else"] + ) + assert "my_user_ns::custom_op" in slow + assert "fancy_ns::something_else" in slow + + def test_kimi_k2_6_style_op_stream(self): + """Emulates one decoder layer's worth of fallback hits. + + Kimi-K2.6 at TP=4 lowers a stream of ``vllm::all_reduce`` + + ``vllm_aiter::fused_add_rms_norm`` calls (one per residual block) + plus a handful of fused-MoE ops. Pre-patch every one of these would + invoke ``operator_str`` and stringify a hundreds-deep IR provenance + tree; post-patch they must all short-circuit. + """ + n_layers = 64 # Kimi-K2.6 has ~64 decoder layers per replica + op_stream: list[str] = [] + for _ in range(n_layers): + op_stream.extend( + [ + "vllm::all_reduce", + "vllm_aiter::fused_add_rms_norm", + "vllm_aiter::rocm_aiter_fused_moe", + ] + ) + + start = time.perf_counter() + slow = self._simulate_graph_lowering(op_stream) + elapsed_s = time.perf_counter() - start + + assert slow == [], ( + f"Expected all {len(op_stream)} vllm/vllm_aiter ops to take " + f"the fast path; got {len(slow)} slow-path hits." + ) + # ``__contains__`` is O(1) per call, so a Kimi-sized stream should + # complete in well under a second even on a slow runner. The + # pre-patch slow path took many minutes per op on Kimi-K2.6 TP=8. + assert elapsed_s < 1.0, ( + f"FALLBACK_ALLOW_LIST membership check is unexpectedly slow: " + f"{elapsed_s:.3f}s for {len(op_stream)} ops" + ) + + def test_inner_set_membership_still_works_for_standard_ops(self): + """The patch must not break Inductor's existing fallback decisions + for non-vllm ops such as ``torchvision::roi_align``.""" + import torch._inductor.lowering as _lowering + + allow_list = _lowering.FALLBACK_ALLOW_LIST + # ``torchvision::roi_align`` has been a member of the upstream + # FALLBACK_ALLOW_LIST since the original Inductor implementation. + # If the proxy ever broke pass-through, this would regress. + if "torchvision::roi_align" not in allow_list: + pytest.skip( + "Upstream FALLBACK_ALLOW_LIST no longer ships " + "torchvision::roi_align; nothing to verify." + ) diff --git a/tests/distributed/test_eplb_execute.py b/tests/distributed/test_eplb_execute.py index d9e6a739b01..0b87477950f 100644 --- a/tests/distributed/test_eplb_execute.py +++ b/tests/distributed/test_eplb_execute.py @@ -277,12 +277,15 @@ def assert_verification_synced(local_ok: bool, msg: str) -> None: assert bool(ok_tensor.item()), msg -def create_eplb_communicator_or_raise(*, group_coordinator, backend, expert_weights): +def create_eplb_communicator_or_raise( + *, group_coordinator, backend, expert_weights, expert_buffer +): try: return create_eplb_communicator( group_coordinator=group_coordinator, backend=backend, expert_weights=expert_weights, + expert_buffer=expert_buffer, ) except Exception as exc: raise RuntimeError( @@ -355,7 +358,8 @@ def _test_async_transfer_layer_without_mtp_worker( communicator = create_eplb_communicator_or_raise( group_coordinator=ep_group_coordinator, backend=eplb_communicator, - expert_weights=expert_weights[0], + expert_weights=expert_weights, + expert_buffer=expert_buffer, ) communicator.set_stream(cuda_stream) @@ -368,6 +372,7 @@ def _test_async_transfer_layer_without_mtp_worker( ep_group=ep_group, communicator=communicator, cuda_stream=cuda_stream, + layer_idx=layer_idx, ) cuda_stream.synchronize() move_from_buffer( @@ -460,10 +465,12 @@ def _test_rearrange_expert_weights_with_redundancy( num_layers, num_local_experts, hidden_sizes, ep_rank, device, old_indices ) + expert_buffer = [torch.empty_like(w) for w in expert_weights[0]] communicator = create_eplb_communicator_or_raise( group_coordinator=ep_group_coordinator, backend=eplb_communicator, - expert_weights=expert_weights[0], + expert_weights=expert_weights, + expert_buffer=expert_buffer, ) # Execute weight rearrangement @@ -471,9 +478,9 @@ def _test_rearrange_expert_weights_with_redundancy( old_indices, new_indices, expert_weights, + expert_buffer, ep_group, - is_profile=False, - communicator=communicator, + communicator, ) # Verify the rearrangement result @@ -593,10 +600,12 @@ def _test_rearrange_expert_weights_no_change(env, world_size) -> None: layer_copy.append(weight.clone()) original_weights.append(layer_copy) + expert_buffer = [torch.empty_like(w) for w in expert_weights[0]] communicator = create_eplb_communicator_or_raise( group_coordinator=ep_group_coordinator, backend="torch_nccl", - expert_weights=expert_weights[0], + expert_weights=expert_weights, + expert_buffer=expert_buffer, ) # Execute rearrangement (should be no change) @@ -604,9 +613,9 @@ def _test_rearrange_expert_weights_no_change(env, world_size) -> None: indices, indices, # Same indices expert_weights, + expert_buffer, ep_group, communicator, - is_profile=False, ) # Verify that the weights have not changed @@ -726,10 +735,12 @@ def _test_rearrange_expert_weights_profile_mode(env, world_size) -> None: layer_copy.append(weight.clone()) original_weights.append(layer_copy) + expert_buffer = [torch.empty_like(w) for w in expert_weights[0]] communicator = create_eplb_communicator_or_raise( group_coordinator=ep_group_coordinator, backend="torch_nccl", - expert_weights=expert_weights[0], + expert_weights=expert_weights, + expert_buffer=expert_buffer, ) # Execute profile mode rearrangement @@ -737,9 +748,10 @@ def _test_rearrange_expert_weights_profile_mode(env, world_size) -> None: old_indices, new_indices, expert_weights, + expert_buffer, ep_group, communicator, - is_profile=True, # Profile mode + is_profile=True, ) # In profile mode, the weights should remain unchanged diff --git a/tests/distributed/test_eplb_fused_moe_layer.py b/tests/distributed/test_eplb_fused_moe_layer.py index eacdb3abc36..87ed4485d3d 100644 --- a/tests/distributed/test_eplb_fused_moe_layer.py +++ b/tests/distributed/test_eplb_fused_moe_layer.py @@ -9,9 +9,11 @@ import pytest import torch from vllm.config import VllmConfig, set_current_vllm_config +from vllm.distributed.eplb.eplb_communicator import create_eplb_communicator from vllm.distributed.eplb.rebalance_execute import rearrange_expert_weights_inplace from vllm.distributed.parallel_state import ( ensure_model_parallel_initialized, + get_eplb_group, get_tp_group, ) from vllm.model_executor.layers.fused_moe.layer import FusedMoE @@ -213,12 +215,20 @@ def _test_eplb_fml(env, world_size: int, test_config: TestConfig): for lidx in range(test_config.num_layers): shuffled_indices[lidx] = torch.randperm(test_config.num_experts) + expert_buffer = [torch.empty_like(w) for w in rank_expert_weights[0]] + communicator = create_eplb_communicator( + group_coordinator=get_eplb_group(), + backend="torch_nccl", + expert_weights=rank_expert_weights, + expert_buffer=expert_buffer, + ) rearrange_expert_weights_inplace( indices, shuffled_indices, rank_expert_weights, + expert_buffer, ep_group, - is_profile=False, + communicator, ) num_local_experts = test_config.num_local_experts diff --git a/tests/distributed/test_eplb_fused_moe_layer_dep_nvfp4.py b/tests/distributed/test_eplb_fused_moe_layer_dep_nvfp4.py index 9ab785af313..4818f8a2c8c 100644 --- a/tests/distributed/test_eplb_fused_moe_layer_dep_nvfp4.py +++ b/tests/distributed/test_eplb_fused_moe_layer_dep_nvfp4.py @@ -10,11 +10,13 @@ import torch from tests.kernels.moe.utils import make_test_quant_config from vllm.config import VllmConfig, set_current_vllm_config +from vllm.distributed.eplb.eplb_communicator import create_eplb_communicator from vllm.distributed.eplb.eplb_state import EplbLayerState from vllm.distributed.eplb.rebalance_execute import rearrange_expert_weights_inplace from vllm.distributed.parallel_state import ( ensure_model_parallel_initialized, get_dp_group, + get_eplb_group, ) from vllm.forward_context import set_forward_context from vllm.model_executor.layers.fused_moe.layer import FusedMoE @@ -171,12 +173,20 @@ def _test_eplb_fml(env, world_size: int, test_config: TestConfig): for lidx in range(test_config.num_layers): shuffled_indices[lidx] = torch.randperm(test_config.num_experts) + expert_buffer = [torch.empty_like(w) for w in rank_expert_weights[0]] + communicator = create_eplb_communicator( + group_coordinator=get_eplb_group(), + backend="torch_nccl", + expert_weights=rank_expert_weights, + expert_buffer=expert_buffer, + ) rearrange_expert_weights_inplace( indices, shuffled_indices, rank_expert_weights, + expert_buffer, ep_group, - is_profile=False, + communicator, ) num_global_experts = test_config.num_experts diff --git a/tests/entrypoints/offline_mode/__init__.py b/tests/entrypoints/llm/offline_mode/__init__.py similarity index 100% rename from tests/entrypoints/offline_mode/__init__.py rename to tests/entrypoints/llm/offline_mode/__init__.py diff --git a/tests/entrypoints/offline_mode/test_offline_mode.py b/tests/entrypoints/llm/offline_mode/test_offline_mode.py similarity index 100% rename from tests/entrypoints/offline_mode/test_offline_mode.py rename to tests/entrypoints/llm/offline_mode/test_offline_mode.py diff --git a/tests/entrypoints/serve/lora/test_serving_models.py b/tests/entrypoints/serve/lora/test_serving_models.py index ce9fdcc2bfb..0cab3fd42cf 100644 --- a/tests/entrypoints/serve/lora/test_serving_models.py +++ b/tests/entrypoints/serve/lora/test_serving_models.py @@ -6,6 +6,7 @@ from unittest.mock import MagicMock import pytest +from vllm import PoolingParams from vllm.config import ModelConfig from vllm.engine.protocol import EngineClient from vllm.entrypoints.openai.engine.protocol import ( @@ -13,10 +14,13 @@ from vllm.entrypoints.openai.engine.protocol import ( ) from vllm.entrypoints.openai.models.protocol import BaseModelPath from vllm.entrypoints.openai.models.serving import OpenAIServingModels +from vllm.entrypoints.pooling.base.serving import PoolingServingBase +from vllm.entrypoints.pooling.typing import PoolingServeContext from vllm.entrypoints.serve.lora.protocol import ( LoadLoRAAdapterRequest, UnloadLoRAAdapterRequest, ) +from vllm.exceptions import VLLMNotFoundError from vllm.lora.request import LoRARequest MODEL_NAME = "hmellor/tiny-random-LlamaForCausalLM" @@ -130,3 +134,60 @@ async def test_unload_lora_adapter_not_found(): assert isinstance(response, ErrorResponse) assert response.error.type == "NotFoundError" assert response.error.code == HTTPStatus.NOT_FOUND + + +class _ConcretePoolingServing(PoolingServingBase): + """Minimal concrete subclass used only in these unit tests.""" + + request_id_prefix = "test" + + def get_io_processor(self, request): + raise NotImplementedError + + def _build_response(self, ctx): + raise NotImplementedError + + +def _make_pooling_serving(lora_name: str) -> _ConcretePoolingServing: + lora_request = LoRARequest( + lora_name=lora_name, lora_int_id=1, lora_path="/path/to/lora" + ) + mock_models = MagicMock() + mock_models.lora_requests = {lora_name: lora_request} + mock_models.is_base_model.side_effect = lambda name: name == MODEL_NAME + + serving = object.__new__(_ConcretePoolingServing) + serving.models = mock_models + return serving + + +def _make_pooling_ctx(model_name: str) -> PoolingServeContext: + mock_request = MagicMock() + mock_request.model = model_name + return PoolingServeContext( + request=mock_request, + model_name=MODEL_NAME, + request_id="test-id", + pooling_params=PoolingParams(), + ) + + +def test_pooling_maybe_get_adapters_lora_name_sets_lora_request(): + """LoRA adapter name must populate ctx.lora_request without raising.""" + lora_name = "bot-embed-lora" + serving = _make_pooling_serving(lora_name) + ctx = _make_pooling_ctx(lora_name) + + serving._maybe_get_adapters(ctx) + + assert ctx.lora_request is not None + assert ctx.lora_request.lora_name == lora_name + + +def test_pooling_maybe_get_adapters_unknown_model_raises(): + """An unrecognised model name must still raise VLLMNotFoundError.""" + serving = _make_pooling_serving("some-lora") + ctx = _make_pooling_ctx("unknown-model") + + with pytest.raises(VLLMNotFoundError): + serving._maybe_get_adapters(ctx) diff --git a/tests/entrypoints/sagemaker/__init__.py b/tests/entrypoints/serve/sagemaker/__init__.py similarity index 100% rename from tests/entrypoints/sagemaker/__init__.py rename to tests/entrypoints/serve/sagemaker/__init__.py diff --git a/tests/entrypoints/sagemaker/conftest.py b/tests/entrypoints/serve/sagemaker/conftest.py similarity index 97% rename from tests/entrypoints/sagemaker/conftest.py rename to tests/entrypoints/serve/sagemaker/conftest.py index 1c34d738fa7..d36c20ccd9a 100644 --- a/tests/entrypoints/sagemaker/conftest.py +++ b/tests/entrypoints/serve/sagemaker/conftest.py @@ -6,7 +6,7 @@ import pytest import pytest_asyncio -from ...utils import RemoteOpenAIServer +from tests.utils import RemoteOpenAIServer # Model name constants used across tests MODEL_NAME_SMOLLM = "HuggingFaceTB/SmolLM2-135M-Instruct" diff --git a/tests/entrypoints/sagemaker/test_sagemaker_handler_overrides.py b/tests/entrypoints/serve/sagemaker/test_sagemaker_handler_overrides.py similarity index 99% rename from tests/entrypoints/sagemaker/test_sagemaker_handler_overrides.py rename to tests/entrypoints/serve/sagemaker/test_sagemaker_handler_overrides.py index 0d4f8e88582..ebc51056bb3 100644 --- a/tests/entrypoints/sagemaker/test_sagemaker_handler_overrides.py +++ b/tests/entrypoints/serve/sagemaker/test_sagemaker_handler_overrides.py @@ -22,7 +22,8 @@ import tempfile import pytest import requests -from ...utils import RemoteOpenAIServer +from tests.utils import RemoteOpenAIServer + from .conftest import ( MODEL_NAME_SMOLLM, ) diff --git a/tests/entrypoints/sagemaker/test_sagemaker_lora_adapters.py b/tests/entrypoints/serve/sagemaker/test_sagemaker_lora_adapters.py similarity index 99% rename from tests/entrypoints/sagemaker/test_sagemaker_lora_adapters.py rename to tests/entrypoints/serve/sagemaker/test_sagemaker_lora_adapters.py index 01b3e650222..4a7d8640366 100644 --- a/tests/entrypoints/sagemaker/test_sagemaker_lora_adapters.py +++ b/tests/entrypoints/serve/sagemaker/test_sagemaker_lora_adapters.py @@ -4,7 +4,8 @@ import openai # use the official async_client for correctness check import pytest import requests -from ...utils import RemoteOpenAIServer +from tests.utils import RemoteOpenAIServer + from .conftest import MODEL_NAME_SMOLLM diff --git a/tests/entrypoints/sagemaker/test_sagemaker_middleware_integration.py b/tests/entrypoints/serve/sagemaker/test_sagemaker_middleware_integration.py similarity index 99% rename from tests/entrypoints/sagemaker/test_sagemaker_middleware_integration.py rename to tests/entrypoints/serve/sagemaker/test_sagemaker_middleware_integration.py index f1ed0c7e289..bc7574d6503 100644 --- a/tests/entrypoints/sagemaker/test_sagemaker_middleware_integration.py +++ b/tests/entrypoints/serve/sagemaker/test_sagemaker_middleware_integration.py @@ -12,7 +12,8 @@ import tempfile import pytest import requests -from ...utils import RemoteOpenAIServer +from tests.utils import RemoteOpenAIServer + from .conftest import ( MODEL_NAME_SMOLLM, ) diff --git a/tests/entrypoints/sagemaker/test_sagemaker_stateful_sessions.py b/tests/entrypoints/serve/sagemaker/test_sagemaker_stateful_sessions.py similarity index 99% rename from tests/entrypoints/sagemaker/test_sagemaker_stateful_sessions.py rename to tests/entrypoints/serve/sagemaker/test_sagemaker_stateful_sessions.py index 6206000385b..7267b4265cc 100644 --- a/tests/entrypoints/sagemaker/test_sagemaker_stateful_sessions.py +++ b/tests/entrypoints/serve/sagemaker/test_sagemaker_stateful_sessions.py @@ -6,7 +6,8 @@ import openai # use the official client for correctness check import pytest import requests -from ...utils import RemoteOpenAIServer +from tests.utils import RemoteOpenAIServer + from .conftest import ( HEADER_SAGEMAKER_CLOSED_SESSION_ID, HEADER_SAGEMAKER_NEW_SESSION_ID, diff --git a/tests/entrypoints/serve/utils/__init__.py b/tests/entrypoints/serve/utils/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/entrypoints/test_utils.py b/tests/entrypoints/serve/utils/test_api_utils.py similarity index 98% rename from tests/entrypoints/test_utils.py rename to tests/entrypoints/serve/utils/test_api_utils.py index ff65066ffd2..2dc6f76da6d 100644 --- a/tests/entrypoints/test_utils.py +++ b/tests/entrypoints/serve/utils/test_api_utils.py @@ -4,7 +4,7 @@ import pytest from vllm.entrypoints.openai.engine.protocol import StreamOptions -from vllm.entrypoints.utils import ( +from vllm.entrypoints.serve.utils.api_utils import ( get_max_tokens, sanitize_message, should_include_usage, diff --git a/tests/entrypoints/openai/test_fingerprint.py b/tests/entrypoints/serve/utils/test_fingerprint.py similarity index 97% rename from tests/entrypoints/openai/test_fingerprint.py rename to tests/entrypoints/serve/utils/test_fingerprint.py index b78ed38636c..46ec6255f4e 100644 --- a/tests/entrypoints/openai/test_fingerprint.py +++ b/tests/entrypoints/serve/utils/test_fingerprint.py @@ -6,7 +6,7 @@ from types import SimpleNamespace import pytest -from vllm.entrypoints.openai import fingerprint as fp +from vllm.entrypoints.serve.utils import fingerprint as fp def _cfg(tp=1, pp=1, dp=1, ep=False, digest="a3b21f94deadbeef"): diff --git a/tests/entrypoints/serve/utils/test_request_logger.py b/tests/entrypoints/serve/utils/test_request_logger.py new file mode 100644 index 00000000000..c17f2471e48 --- /dev/null +++ b/tests/entrypoints/serve/utils/test_request_logger.py @@ -0,0 +1,248 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from unittest.mock import MagicMock, patch + +from vllm.entrypoints.serve.utils.request_logger import RequestLogger + + +def test_request_logger_log_outputs(): + """Test the new log_outputs functionality.""" + # Create a mock logger to capture log calls + mock_logger = MagicMock() + + with patch("vllm.entrypoints.serve.utils.request_logger.logger", mock_logger): + request_logger = RequestLogger(max_log_len=None) + + # Test basic output logging + request_logger.log_outputs( + request_id="test-123", + outputs="Hello, world!", + output_token_ids=[1, 2, 3, 4], + finish_reason="stop", + is_streaming=False, + delta=False, + ) + + mock_logger.info.assert_called_once() + call_args = mock_logger.info.call_args.args + assert "Generated response %s%s" in call_args[0] + assert call_args[1] == "test-123" + assert call_args[3] == "Hello, world!" + assert call_args[4] == [1, 2, 3, 4] + assert call_args[5] == "stop" + + +def test_request_logger_log_outputs_streaming_delta(): + """Test log_outputs with streaming delta mode.""" + mock_logger = MagicMock() + + with patch("vllm.entrypoints.serve.utils.request_logger.logger", mock_logger): + request_logger = RequestLogger(max_log_len=None) + + # Test streaming delta logging + request_logger.log_outputs( + request_id="test-456", + outputs="Hello", + output_token_ids=[1], + finish_reason=None, + is_streaming=True, + delta=True, + ) + + mock_logger.info.assert_called_once() + call_args = mock_logger.info.call_args.args + assert "Generated response %s%s" in call_args[0] + assert call_args[1] == "test-456" + assert call_args[2] == " (streaming delta)" + assert call_args[3] == "Hello" + assert call_args[4] == [1] + assert call_args[5] is None + + +def test_request_logger_log_outputs_streaming_complete(): + """Test log_outputs with streaming complete mode.""" + mock_logger = MagicMock() + + with patch("vllm.entrypoints.serve.utils.request_logger.logger", mock_logger): + request_logger = RequestLogger(max_log_len=None) + + # Test streaming complete logging + request_logger.log_outputs( + request_id="test-789", + outputs="Complete response", + output_token_ids=[1, 2, 3], + finish_reason="length", + is_streaming=True, + delta=False, + ) + + mock_logger.info.assert_called_once() + call_args = mock_logger.info.call_args.args + assert "Generated response %s%s" in call_args[0] + assert call_args[1] == "test-789" + assert call_args[2] == " (streaming complete)" + assert call_args[3] == "Complete response" + assert call_args[4] == [1, 2, 3] + assert call_args[5] == "length" + + +def test_request_logger_log_outputs_with_truncation(): + """Test log_outputs respects max_log_len setting.""" + mock_logger = MagicMock() + + with patch("vllm.entrypoints.serve.utils.request_logger.logger", mock_logger): + # Set max_log_len to 10 + request_logger = RequestLogger(max_log_len=10) + + # Test output truncation + long_output = "This is a very long output that should be truncated" + long_token_ids = list(range(20)) # 20 tokens + + request_logger.log_outputs( + request_id="test-truncate", + outputs=long_output, + output_token_ids=long_token_ids, + finish_reason="stop", + is_streaming=False, + delta=False, + ) + + mock_logger.info.assert_called_once() + call_args = mock_logger.info.call_args + + # Check that output was truncated to first 10 characters + logged_output = call_args[0][3] + assert logged_output == "This is a " + assert len(logged_output) == 10 + + # Check that token IDs were truncated to first 10 tokens + logged_token_ids = call_args[0][4] + assert logged_token_ids == list(range(10)) + assert len(logged_token_ids) == 10 + + +def test_request_logger_log_outputs_none_values(): + """Test log_outputs handles None values correctly.""" + mock_logger = MagicMock() + + with patch("vllm.entrypoints.serve.utils.request_logger.logger", mock_logger): + request_logger = RequestLogger(max_log_len=None) + + # Test with None output_token_ids + request_logger.log_outputs( + request_id="test-none", + outputs="Test output", + output_token_ids=None, + finish_reason="stop", + is_streaming=False, + delta=False, + ) + + mock_logger.info.assert_called_once() + call_args = mock_logger.info.call_args.args + assert "Generated response %s%s" in call_args[0] + assert call_args[1] == "test-none" + assert call_args[3] == "Test output" + assert call_args[4] is None + assert call_args[5] == "stop" + + +def test_request_logger_log_outputs_empty_output(): + """Test log_outputs handles empty output correctly.""" + mock_logger = MagicMock() + + with patch("vllm.entrypoints.serve.utils.request_logger.logger", mock_logger): + request_logger = RequestLogger(max_log_len=5) + + # Test with empty output + request_logger.log_outputs( + request_id="test-empty", + outputs="", + output_token_ids=[], + finish_reason="stop", + is_streaming=False, + delta=False, + ) + + mock_logger.info.assert_called_once() + call_args = mock_logger.info.call_args.args + assert "Generated response %s%s" in call_args[0] + assert call_args[1] == "test-empty" + assert call_args[3] == "" + assert call_args[4] == [] + assert call_args[5] == "stop" + + +def test_request_logger_log_outputs_integration(): + """Test that log_outputs can be called alongside log_inputs.""" + mock_logger = MagicMock() + + with patch("vllm.entrypoints.serve.utils.request_logger.logger", mock_logger): + request_logger = RequestLogger(max_log_len=None) + + # Test that both methods can be called without interference + request_logger.log_inputs( + request_id="test-integration", + prompt="Test prompt", + prompt_token_ids=[1, 2, 3], + prompt_embeds=None, + params=None, + lora_request=None, + ) + + request_logger.log_outputs( + request_id="test-integration", + outputs="Test output", + output_token_ids=[4, 5, 6], + finish_reason="stop", + is_streaming=False, + delta=False, + ) + + # Should have been called twice - once for inputs, once for outputs + assert mock_logger.info.call_count == 2 + + # Check that the calls were made with correct patterns + input_call = mock_logger.info.call_args_list[0][0] + output_call = mock_logger.info.call_args_list[1][0] + + assert "Received request %s" in input_call[0] + assert input_call[1] == "test-integration" + + assert "Generated response %s%s" in output_call[0] + assert output_call[1] == "test-integration" + + +def test_streaming_complete_logs_full_text_content(): + """Test that streaming complete logging includes + full accumulated text, not just token count.""" + mock_logger = MagicMock() + + with patch("vllm.entrypoints.serve.utils.request_logger.logger", mock_logger): + request_logger = RequestLogger(max_log_len=None) + + # Test with actual content instead of token count format + full_response = "This is a complete response from streaming" + request_logger.log_outputs( + request_id="test-streaming-full-text", + outputs=full_response, + output_token_ids=None, + finish_reason="streaming_complete", + is_streaming=True, + delta=False, + ) + + mock_logger.info.assert_called_once() + call_args = mock_logger.info.call_args.args + + # Verify the logged output is the full text, not a token count format + logged_output = call_args[3] + assert logged_output == full_response + assert "tokens>" not in logged_output + assert "streaming_complete" not in logged_output + + # Verify other parameters + assert call_args[1] == "test-streaming-full-text" + assert call_args[2] == " (streaming complete)" + assert call_args[5] == "streaming_complete" diff --git a/tests/entrypoints/test_ssl_cert_refresher.py b/tests/entrypoints/serve/utils/test_ssl_cert_refresher.py similarity index 96% rename from tests/entrypoints/test_ssl_cert_refresher.py rename to tests/entrypoints/serve/utils/test_ssl_cert_refresher.py index b56fbd9fee7..57a856ce118 100644 --- a/tests/entrypoints/test_ssl_cert_refresher.py +++ b/tests/entrypoints/serve/utils/test_ssl_cert_refresher.py @@ -7,7 +7,7 @@ from ssl import SSLContext import pytest -from vllm.entrypoints.ssl import SSLCertRefresher +from vllm.entrypoints.serve.utils.ssl import SSLCertRefresher class MockSSLContext(SSLContext): diff --git a/tests/evals/gsm8k/configs/DeepSeek-V2-Lite-Instruct-FP8.yaml b/tests/evals/gsm8k/configs/DeepSeek-V2-Lite-Instruct-FP8.yaml index 72fa7e8a38c..dde67727bc6 100644 --- a/tests/evals/gsm8k/configs/DeepSeek-V2-Lite-Instruct-FP8.yaml +++ b/tests/evals/gsm8k/configs/DeepSeek-V2-Lite-Instruct-FP8.yaml @@ -2,4 +2,5 @@ model_name: "RedHatAI/DeepSeek-Coder-V2-Lite-Instruct-FP8" accuracy_threshold: 0.72 num_questions: 1319 num_fewshot: 5 +rocm_request_timeout_seconds: 1800 server_args: "--enforce-eager --max-model-len 4096" diff --git a/tests/evals/gsm8k/configs/Qwen1.5-MoE-W4A16-CT.yaml b/tests/evals/gsm8k/configs/Qwen1.5-MoE-W4A16-CT.yaml index 4a1b1948aca..027b4ba5622 100644 --- a/tests/evals/gsm8k/configs/Qwen1.5-MoE-W4A16-CT.yaml +++ b/tests/evals/gsm8k/configs/Qwen1.5-MoE-W4A16-CT.yaml @@ -2,4 +2,5 @@ model_name: "nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16" accuracy_threshold: 0.45 num_questions: 1319 num_fewshot: 5 +rocm_request_timeout_seconds: 1800 server_args: "--enforce-eager --max-model-len 4096" diff --git a/tests/evals/gsm8k/gsm8k_eval.py b/tests/evals/gsm8k/gsm8k_eval.py index 647c149ef5f..ff0718cd2aa 100644 --- a/tests/evals/gsm8k/gsm8k_eval.py +++ b/tests/evals/gsm8k/gsm8k_eval.py @@ -106,7 +106,7 @@ async def call_vllm_api( completion_tokens = result.get("usage", {}).get("completion_tokens", 0) return text, completion_tokens except Exception as e: - print(f"Error calling vLLM API: {e}") + print(f"Error calling vLLM API ({type(e).__name__}): {e}") return "", 0 @@ -177,6 +177,7 @@ def evaluate_gsm8k( port: int = 8000, temperature: float = 0.0, seed: int | None = 42, + request_timeout_seconds: float = 600, ) -> dict[str, float | int]: """ Evaluate GSM8K accuracy using vLLM serve endpoint. @@ -205,9 +206,8 @@ def evaluate_gsm8k( output_tokens[i] = tokens return answer, tokens - async with aiohttp.ClientSession( - timeout=aiohttp.ClientTimeout(total=600) - ) as session: + timeout = aiohttp.ClientTimeout(total=request_timeout_seconds) + async with aiohttp.ClientSession(timeout=timeout) as session: tasks = [get_answer(session, i) for i in range(num_questions)] await tqdm.gather(*tasks, desc="Evaluating") diff --git a/tests/evals/gsm8k/test_gsm8k_correctness.py b/tests/evals/gsm8k/test_gsm8k_correctness.py index 57513e18aba..e7a254e760f 100644 --- a/tests/evals/gsm8k/test_gsm8k_correctness.py +++ b/tests/evals/gsm8k/test_gsm8k_correctness.py @@ -39,11 +39,18 @@ def run_gsm8k_eval(eval_config: dict, server_url: str) -> dict: host = f"http://{host}" # Run GSM8K evaluation + request_timeout_seconds = eval_config.get("request_timeout_seconds", 600) + if current_platform.is_rocm(): + request_timeout_seconds = eval_config.get( + "rocm_request_timeout_seconds", request_timeout_seconds + ) + results = evaluate_gsm8k( num_questions=eval_config["num_questions"], num_shots=eval_config["num_fewshot"], host=host, port=port, + request_timeout_seconds=request_timeout_seconds, ) return results @@ -90,6 +97,12 @@ def test_gsm8k_correctness(config_filename): print(f"Expected metric threshold: {eval_config['accuracy_threshold']}") print(f"Number of questions: {eval_config['num_questions']}") print(f"Number of few-shot examples: {eval_config['num_fewshot']}") + request_timeout_seconds = eval_config.get("request_timeout_seconds", 600) + if current_platform.is_rocm(): + request_timeout_seconds = eval_config.get( + "rocm_request_timeout_seconds", request_timeout_seconds + ) + print(f"Request timeout: {request_timeout_seconds}s") print(f"Server args: {' '.join(server_args)}") print(f"Environment variables: {env_dict}") diff --git a/tests/kernels/attention/test_rocm_aiter_unified_attn.py b/tests/kernels/attention/test_rocm_aiter_unified_attn.py new file mode 100644 index 00000000000..9e33f24ea28 --- /dev/null +++ b/tests/kernels/attention/test_rocm_aiter_unified_attn.py @@ -0,0 +1,339 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""ROCm kernel correctness tests for AITER unified attention. + +Compares ``aiter.ops.triton.unified_attention`` against ``ref_paged_attn`` under +decode, prefill, and mixed batches with varied shapes. +""" + +from typing import Any, Literal + +import pytest +import torch + +from tests.kernels.attention.test_triton_unified_attention import ref_paged_attn +from vllm.platforms import current_platform +from vllm.utils.torch_utils import set_random_seed + +_SKIP_NON_MI3XX = True +if current_platform.is_rocm(): + from vllm.platforms.rocm import on_mi3xx + + _SKIP_NON_MI3XX = not on_mi3xx() + +pytestmark = [ + pytest.mark.skipif(not current_platform.is_rocm(), reason="ROCm-specific tests"), + pytest.mark.skipif(_SKIP_NON_MI3XX, reason="MI300/MI350 ROCm only"), +] + +NUM_Q_HEADS = 8 +NUM_KV_HEADS = 8 +HEAD_SIZES = [128, 256] +BLOCK_SIZES = [16, 64] +DTYPES = [torch.bfloat16, torch.float16] +FP8_DTYPE = current_platform.fp8_dtype() + +# (query_len, kv_len) per sequence +MIXED_SEQ_LENS = [ + [(1, 128), (5, 18), (129, 463)], + [(10, 256), (5, 64), (32, 128)], + [(1, 1024), (5, 18), (129, 1328)], +] +DECODE_SEQ_LENS = [ + [(1, 128), (1, 256), (1, 384), (1, 512)], + [(1, 1024), (1, 1536), (1, 2048)], +] +PREFILL_SEQ_LENS = [ + [(256, 256), (128, 512)], + [(64, 128), (32, 256), (16, 512)], + [(256, 1024), (128, 2048)], +] + +DEFAULT_ATOL, DEFAULT_RTOL = 1.5e-2, 1e-2 +FP8_ATOL, FP8_RTOL = 1.5e-1, 1.5e-1 +# Non-unity scale so q_descale handling is exercised explicitly. +Q_SCALE = 0.75 +K_SCALE, V_SCALE = 0.5, 0.25 + +Fp8Variant = Literal["fp8_kv", "fp8_query", "fp8_query_kv"] + +FP8_VARIANTS = [ + pytest.param("fp8_kv", id="fp8_kv"), + pytest.param("fp8_query", id="fp8_query"), + pytest.param("fp8_query_kv", id="fp8_query_kv"), +] + +FP8_SEQ_LENS = [ + MIXED_SEQ_LENS[0], + DECODE_SEQ_LENS[0], + DECODE_SEQ_LENS[1], + PREFILL_SEQ_LENS[0], + PREFILL_SEQ_LENS[2], +] + + +def _require_aiter() -> None: + from vllm._aiter_ops import is_aiter_found_and_supported + + if not is_aiter_found_and_supported(): + pytest.skip("aiter is required on supported ROCm hardware for this test") + + +def _make_case( + *, + seq_lens: list[tuple[int, int]], + head_size: int, + block_size: int, + dtype: torch.dtype, + num_blocks: int = 2048, + kv_cache_dtype: torch.dtype | None = None, + k_scale: float = 1.0, + v_scale: float = 1.0, + q_dtype: torch.dtype | None = None, + q_scale: float = Q_SCALE, +) -> dict[str, Any]: + torch.set_default_device("cuda") + + query_lens = [q for q, _ in seq_lens] + kv_lens = [k for _, k in seq_lens] + num_seqs = len(seq_lens) + max_query_len = max(query_lens) + max_kv_len = max(kv_lens) + scale = head_size**-0.5 + + query = torch.randn(sum(query_lens), NUM_Q_HEADS, head_size, dtype=dtype) + if kv_cache_dtype is None: + key_cache = torch.randn( + num_blocks, block_size, NUM_KV_HEADS, head_size, dtype=dtype + ) + value_cache = torch.randn_like(key_cache) + else: + key_cache = torch.clamp( + torch.randn(num_blocks, block_size, NUM_KV_HEADS, head_size), + -1.0, + 1.0, + ).to(kv_cache_dtype) + value_cache = torch.clamp( + torch.randn(num_blocks, block_size, NUM_KV_HEADS, head_size), + -1.0, + 1.0, + ).to(kv_cache_dtype) + + cu_seqlens_q = torch.tensor([0] + query_lens, dtype=torch.int32).cumsum( + dim=0, dtype=torch.int32 + ) + seq_lens_tensor = torch.tensor(kv_lens, dtype=torch.int32) + + max_num_blocks = (max_kv_len + block_size - 1) // block_size + block_tables = torch.randint( + 0, num_blocks, (num_seqs, max_num_blocks), dtype=torch.int32 + ) + + descale_shape = (num_seqs, NUM_KV_HEADS) + k_descale = torch.full(descale_shape, k_scale, dtype=torch.float32, device="cuda") + v_descale = torch.full(descale_shape, v_scale, dtype=torch.float32, device="cuda") + + kernel_query = query + q_descale = None + if q_dtype is not None: + q_descale = torch.tensor(q_scale, dtype=torch.float32, device="cuda") + kernel_query = (query / q_scale).to(q_dtype) + + return { + "query": query, + "kernel_query": kernel_query, + "key_cache": key_cache, + "value_cache": value_cache, + "block_tables": block_tables, + "query_lens": query_lens, + "kv_lens": kv_lens, + "seq_lens_tensor": seq_lens_tensor, + "cu_seqlens_q": cu_seqlens_q, + "q_descale": q_descale, + "k_descale": k_descale, + "v_descale": v_descale, + "scale": scale, + "max_query_len": max_query_len, + "max_kv_len": max_kv_len, + "query_dtype": dtype, + "k_scale": k_scale, + "v_scale": v_scale, + } + + +def _make_fp8_case( + *, + seq_lens: list[tuple[int, int]], + head_size: int, + block_size: int, + variant: Fp8Variant, +) -> dict[str, Any]: + use_fp8_kv = variant in ("fp8_kv", "fp8_query_kv") + use_fp8_query = variant in ("fp8_query", "fp8_query_kv") + return _make_case( + seq_lens=seq_lens, + head_size=head_size, + block_size=block_size, + dtype=torch.bfloat16, + kv_cache_dtype=FP8_DTYPE if use_fp8_kv else None, + k_scale=K_SCALE if use_fp8_kv else 1.0, + v_scale=V_SCALE if use_fp8_kv else 1.0, + q_dtype=FP8_DTYPE if use_fp8_query else None, + ) + + +def _run_aiter_unified_attention(case: dict[str, Any]) -> torch.Tensor: + from aiter.ops.triton.unified_attention import unified_attention + + kernel_query = case["kernel_query"] + # Kernel writes high-precision output even when Q is FP8 (matches vLLM usage). + output = torch.empty_like(case["query"]) + unified_attention( + q=kernel_query, + k=case["key_cache"], + v=case["value_cache"], + out=output, + cu_seqlens_q=case["cu_seqlens_q"], + max_seqlen_q=case["max_query_len"], + seqused_k=case["seq_lens_tensor"], + max_seqlen_k=case["max_kv_len"], + softmax_scale=case["scale"], + causal=True, + alibi_slopes=None, + window_size=(-1, -1), + block_table=case["block_tables"], + softcap=0, + q_descale=case["q_descale"], + k_descale=case["k_descale"], + v_descale=case["v_descale"], + sinks=None, + output_scale=None, + ) + return output + + +def _ref_output(case: dict[str, Any]) -> torch.Tensor: + key_cache = case["key_cache"] + value_cache = case["value_cache"] + if key_cache.dtype != case["query_dtype"]: + key_cache = key_cache.to(case["query_dtype"]) * case["k_scale"] + value_cache = value_cache.to(case["query_dtype"]) * case["v_scale"] + + return ref_paged_attn( + query=case["query"], + key_cache=key_cache, + value_cache=value_cache, + query_lens=case["query_lens"], + kv_lens=case["kv_lens"], + block_tables=case["block_tables"], + scale=case["scale"], + ) + + +def _assert_matches_reference( + case: dict[str, Any], + *, + atol: float = DEFAULT_ATOL, + rtol: float = DEFAULT_RTOL, +) -> None: + output = _run_aiter_unified_attention(case) + output_ref = _ref_output(case) + torch.testing.assert_close(output, output_ref, atol=atol, rtol=rtol) + + +@pytest.mark.parametrize("seq_lens", MIXED_SEQ_LENS) +@pytest.mark.parametrize("head_size", HEAD_SIZES) +@pytest.mark.parametrize("block_size", BLOCK_SIZES) +@pytest.mark.parametrize("dtype", DTYPES) +@torch.inference_mode() +def test_aiter_unified_attn_mixed_batch( + seq_lens: list[tuple[int, int]], + head_size: int, + block_size: int, + dtype: torch.dtype, +) -> None: + """Decode + prefill sequences in one batch (native dtypes).""" + _require_aiter() + set_random_seed(0) + + case = _make_case( + seq_lens=seq_lens, + head_size=head_size, + block_size=block_size, + dtype=dtype, + ) + _assert_matches_reference(case) + + +@pytest.mark.parametrize("seq_lens", DECODE_SEQ_LENS) +@pytest.mark.parametrize("head_size", HEAD_SIZES) +@pytest.mark.parametrize("block_size", BLOCK_SIZES) +@pytest.mark.parametrize("dtype", [torch.bfloat16]) +@torch.inference_mode() +def test_aiter_unified_attn_decode( + seq_lens: list[tuple[int, int]], + head_size: int, + block_size: int, + dtype: torch.dtype, +) -> None: + """Single-token decode (native dtypes).""" + _require_aiter() + set_random_seed(0) + + case = _make_case( + seq_lens=seq_lens, + head_size=head_size, + block_size=block_size, + dtype=dtype, + ) + _assert_matches_reference(case) + + +@pytest.mark.parametrize("seq_lens", PREFILL_SEQ_LENS) +@pytest.mark.parametrize("head_size", [128]) +@pytest.mark.parametrize("block_size", [16]) +@torch.inference_mode() +def test_aiter_unified_attn_prefill( + seq_lens: list[tuple[int, int]], + head_size: int, + block_size: int, +) -> None: + """Prefill-only batches with query_len > 1 (native dtypes).""" + _require_aiter() + set_random_seed(0) + + case = _make_case( + seq_lens=seq_lens, + head_size=head_size, + block_size=block_size, + dtype=torch.bfloat16, + ) + _assert_matches_reference(case) + + +@pytest.mark.skipif( + not current_platform.supports_fp8(), + reason="FP8 not supported on this hardware", +) +@pytest.mark.parametrize("variant", FP8_VARIANTS) +@pytest.mark.parametrize("seq_lens", FP8_SEQ_LENS) +@pytest.mark.parametrize("head_size", [128]) +@pytest.mark.parametrize("block_size", [16, 64]) +@torch.inference_mode() +def test_aiter_unified_attn_fp8( + variant: Fp8Variant, + seq_lens: list[tuple[int, int]], + head_size: int, + block_size: int, +) -> None: + """FP8 KV cache, FP8 query, or both; compared at bf16 reference precision.""" + _require_aiter() + set_random_seed(0) + + case = _make_fp8_case( + seq_lens=seq_lens, + head_size=head_size, + block_size=block_size, + variant=variant, + ) + _assert_matches_reference(case, atol=FP8_ATOL, rtol=FP8_RTOL) diff --git a/tests/kernels/moe/test_cutlass_moe.py b/tests/kernels/moe/test_cutlass_moe.py index 1380281bb2e..e3315142a9b 100644 --- a/tests/kernels/moe/test_cutlass_moe.py +++ b/tests/kernels/moe/test_cutlass_moe.py @@ -205,7 +205,10 @@ def run_with_expert_maps( w2 = kwargs["w2"] a = kwargs["hidden_states"] moe_config = make_dummy_moe_config( - num_experts=w2.shape[0], + max_num_tokens=kwargs.get("hidden_states").shape[0], + experts_per_token=kwargs.get("topk_ids").shape[1], + num_experts=num_experts, + num_local_experts=num_local_experts, hidden_dim=w2.shape[1], intermediate_size_per_partition=w2.shape[2], in_dtype=a.dtype, @@ -258,23 +261,27 @@ def run_8_bit( a1_scale=None, ) + num_experts = moe_tensors.w1.size(0) # type: ignore[attr-defined] + with_ep = num_local_experts is not None or num_local_experts == num_experts + kwargs = { "hidden_states": moe_tensors.a, "w1": moe_tensors.w1_q, # type: ignore[union-attr] "w2": moe_tensors.w2_q, # type: ignore[union-attr] "topk_weights": topk_weights, "topk_ids": topk_ids, - "global_num_experts": moe_tensors.w1_q.shape[0], # type: ignore[union-attr] + "global_num_experts": num_experts, "activation": MoEActivation.SILU, "expert_map": None, "apply_router_weight_on_input": False, } - num_experts = moe_tensors.w1.size(0) # type: ignore[attr-defined] - with_ep = num_local_experts is not None or num_local_experts == num_experts if not with_ep: moe_config = make_dummy_moe_config( - num_experts=moe_tensors.w2_q.shape[0], # type: ignore[union-attr] + max_num_tokens=moe_tensors.a.shape[0], + experts_per_token=topk_ids.shape[1], + num_experts=num_experts, + num_local_experts=num_local_experts, hidden_dim=moe_tensors.w2_q.shape[1], # type: ignore[union-attr] intermediate_size_per_partition=moe_tensors.w2_q.shape[2], # type: ignore[union-attr] in_dtype=moe_tensors.a.dtype, @@ -581,6 +588,7 @@ def test_run_cutlass_moe_fp8( per_out_channel, False, topk_weights, + None, ) workspace13.random_() diff --git a/tests/kernels/moe/test_moe_layer.py b/tests/kernels/moe/test_moe_layer.py index e0f73cd657e..188f4448137 100644 --- a/tests/kernels/moe/test_moe_layer.py +++ b/tests/kernels/moe/test_moe_layer.py @@ -1287,10 +1287,12 @@ def _test_body_eplb( expert_weights = [list(eplb_moe_layer.get_expert_weights())] + expert_buffer = [torch.empty_like(w) for w in expert_weights[0]] communicator = create_eplb_communicator( group_coordinator=get_eplb_group(), backend=vllm_config.parallel_config.eplb_config.communicator, - expert_weights=expert_weights[0], + expert_weights=expert_weights, + expert_buffer=expert_buffer, ) # Rearrange expert weights across EP ranks @@ -1298,6 +1300,7 @@ def _test_body_eplb( old_global_expert_indices=initial_indices.unsqueeze(0), new_global_expert_indices=shuffled_indices.unsqueeze(0), expert_weights=expert_weights, + expert_buffer=expert_buffer, ep_group=cpu_group, communicator=communicator, ) diff --git a/tests/kernels/moe/utils.py b/tests/kernels/moe/utils.py index 3503ce4cdeb..ebb99576756 100644 --- a/tests/kernels/moe/utils.py +++ b/tests/kernels/moe/utils.py @@ -49,10 +49,12 @@ def shuffle_weight(w: torch.Tensor) -> torch.Tensor: def make_dummy_moe_config( num_experts: int = 1, + num_local_experts: int | None = None, experts_per_token: int = 1, hidden_dim: int = 1, intermediate_size_per_partition: int = 1, in_dtype: torch.dtype = torch.bfloat16, + max_num_tokens: int = 512, ) -> FusedMoEConfig: """ This is a dummy config for the mk constructor interface @@ -66,14 +68,16 @@ def make_dummy_moe_config( experts_per_token=experts_per_token, hidden_dim=hidden_dim, intermediate_size_per_partition=intermediate_size_per_partition, - num_local_experts=num_experts, + num_local_experts=num_local_experts + if num_local_experts is not None + else num_experts, num_logical_experts=num_experts, moe_parallel_config=FusedMoEParallelConfig.make_no_parallel(), activation=MoEActivation.SILU, in_dtype=in_dtype, device="cuda", routing_method=RoutingMethodType.TopK, - max_num_tokens=512, + max_num_tokens=max_num_tokens, ) diff --git a/tests/kernels/quantization/test_quantized_embedding.py b/tests/kernels/quantization/test_quantized_embedding.py new file mode 100644 index 00000000000..0e4af0a0c1a --- /dev/null +++ b/tests/kernels/quantization/test_quantized_embedding.py @@ -0,0 +1,67 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Tests for the Triton dequant-gather kernel used by +``CompressedTensorsEmbeddingWNA16Int`` (quantized embedding lookup).""" + +import pytest +import torch +from compressed_tensors.compressors.pack_quantized.helpers import unpack_from_int32 + +from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors_embedding import ( # noqa: E501 + _dequant_gather_triton, +) +from vllm.platforms import current_platform + + +def _dequant_gather_torch( + ids: torch.Tensor, + weight_packed: torch.Tensor, + weight_scale: torch.Tensor, + hidden: int, + num_bits: int, +) -> torch.Tensor: + """Reference: gather packed rows by id, unpack int32-packed INT, dequant.""" + n = ids.shape[0] + int8 = unpack_from_int32(weight_packed[ids], num_bits, torch.Size([n, hidden])) + scale_rows = weight_scale[ids] + w = int8.to(scale_rows.dtype) + if scale_rows.shape[1] == 1: + return w * scale_rows + ng = scale_rows.shape[1] + return (w.view(n, ng, hidden // ng) * scale_rows.unsqueeze(-1)).view(n, hidden) + + +@pytest.mark.skipif( + not current_platform.is_cuda(), reason="Triton dequant kernel requires CUDA" +) +@pytest.mark.parametrize("num_bits", [2, 4, 8]) +@pytest.mark.parametrize("group_size", [0, 256]) # 0 -> channel +@pytest.mark.parametrize("dtype", [torch.float32, torch.bfloat16]) +@pytest.mark.parametrize("num_ids", [1, 17, 4096]) +def test_dequant_gather(num_bits, group_size, dtype, num_ids): + torch.manual_seed(0) + device = "cuda" + vocab, hidden = 1000, 2048 + pack_factor = 32 // num_bits + + # Random full-range int32 packed weights (covers the sign bit -> exercises the + # arithmetic-shift + mask unpack path). + weight_packed = torch.randint( + -(2**31), + 2**31, + (vocab, hidden // pack_factor), + dtype=torch.int32, + device=device, + ) + + num_groups = 1 if group_size == 0 else hidden // group_size + weight_scale = torch.rand(vocab, num_groups, dtype=dtype, device=device) + 0.01 + + ids = torch.randint(0, vocab, (num_ids,), dtype=torch.long, device=device) + + out = _dequant_gather_triton(ids, weight_packed, weight_scale, hidden, num_bits) + ref = _dequant_gather_torch(ids, weight_packed, weight_scale, hidden, num_bits) + + assert out.shape == (num_ids, hidden) + assert out.dtype == dtype + torch.testing.assert_close(out, ref) diff --git a/tests/kernels/test_compressor_kv_cache.py b/tests/kernels/test_compressor_kv_cache.py index c6daab2d86b..74dc01472a8 100644 --- a/tests/kernels/test_compressor_kv_cache.py +++ b/tests/kernels/test_compressor_kv_cache.py @@ -468,6 +468,7 @@ def _reference_kv_compress_norm_rope( use_fp4: bool = False, rms_eps: float = 1e-6, fp8_max: float = 448.0, + return_full_cache: bool = False, ): """Compress → RMSNorm → GPT-J RoPE → quantize. @@ -521,6 +522,12 @@ def _reference_kv_compress_norm_rope( results.append(torch.cat([nope, rope]).to(state_cache.dtype)) result = torch.stack(results) + if return_full_cache: + # Contiguous 512-wide bf16 row (nope unrotated + rope rotated), matching + # the FlashInfer full-cache layout before any per-tensor fp8 quant. The + # kernel rounds the fp32 result to bf16 once at the store. + return result.to(torch.bfloat16) + if use_fp4: return quantize_to_mxfp4(result) else: @@ -667,3 +674,145 @@ def test_fused_kv_insert_indexer(num_tokens: int, kv_block_size: int, use_fp4: b assert torch.equal(actual_scale, scale[i : i + 1]), ( f"token {i}: scale {actual_scale.item()} != {scale[i].item()}" ) + + +@pytest.mark.parametrize("compress_ratio", [4, 128]) +@pytest.mark.parametrize("store_fp8", [False, True]) +def test_cutedsl_full_cache_store(compress_ratio: int, store_fp8: bool): + """CuTeDSL compressor full-cache (FlashInfer) store parity for head=512. + + Exercises the contiguous bf16 / per-tensor fp8 store branch of both the C4 + fused kernel and the C128 split kernel against the PyTorch reference. + """ + cutedsl = pytest.importorskip("cutlass") # noqa: F841 + from vllm.models.deepseek_v4.nvidia.ops.sparse_attn_compress_cutedsl import ( + fused_kv_compress_norm_rope_insert_sparse_attn_cutedsl, + split_kv_compress_norm_rope_insert_sparse_attn_cutedsl, + ) + + HEAD_DIM = 512 + ROPE_DIM = 64 + RMS_EPS = 1e-6 + FP8_MAX = 448.0 + # C128 compress (Block8 kernel) requires state-cache block_size=8; C4 uses 16. + BLOCK_SIZE = 8 if compress_ratio == 128 else 16 + KV_BLOCK_SIZE = 64 + device = "cuda" + torch.manual_seed(7) + + overlap = 1 if compress_ratio == 4 else 0 + coff = 1 + overlap + num_tokens = 8 + + num_pages = (compress_ratio * num_tokens - 1) // BLOCK_SIZE + 2 + # The production CompressorStateCache is fp32. + state_cache = torch.randn( + num_pages, BLOCK_SIZE, 2 * coff * HEAD_DIM, dtype=torch.float32, device=device + ) + block_table = torch.arange(num_pages, dtype=torch.int32, device=device).unsqueeze(0) + token_to_req = torch.zeros(num_tokens, dtype=torch.int32, device=device) + slot_mapping = torch.arange(num_tokens, dtype=torch.int64, device=device) + positions = torch.arange( + compress_ratio - 1, + compress_ratio * num_tokens, + compress_ratio, + dtype=torch.int64, + device=device, + ) + rms_weight = torch.randn(HEAD_DIM, dtype=torch.bfloat16, device=device) + cos_sin_cache = torch.randn( + compress_ratio * num_tokens, ROPE_DIM, dtype=torch.float32, device=device + ) + + dtype = torch.float8_e4m3fn if store_fp8 else torch.bfloat16 + kv_n_blocks = (num_tokens + KV_BLOCK_SIZE - 1) // KV_BLOCK_SIZE + 1 + k_cache = torch.zeros( + kv_n_blocks, KV_BLOCK_SIZE, HEAD_DIM, dtype=dtype, device=device + ) + fp8_scale = torch.tensor( + [0.5 if store_fp8 else 1.0], dtype=torch.float32, device=device + ) + + if compress_ratio == 4: + fused_kv_compress_norm_rope_insert_sparse_attn_cutedsl( + state_cache, + token_to_req, + positions, + slot_mapping, + block_table, + BLOCK_SIZE, + rms_weight, + RMS_EPS, + cos_sin_cache, + k_cache, + slot_mapping, + KV_BLOCK_SIZE, + k_cache.stride(0), + head_size=HEAD_DIM, + state_width=coff * HEAD_DIM, + rope_head_dim=ROPE_DIM, + fp8_max=FP8_MAX, + quant_block=64, + token_stride=576, + scale_dim=8, + compress_ratio=compress_ratio, + overlap=True, + store_full_kv=True, + store_full_fp8=store_fp8, + fp8_scale=fp8_scale, + ) + else: + compressed_kv = torch.empty( + (num_tokens, HEAD_DIM), dtype=torch.float32, device=device + ) + split_kv_compress_norm_rope_insert_sparse_attn_cutedsl( + state_cache, + token_to_req, + positions, + slot_mapping, + block_table, + BLOCK_SIZE, + compressed_kv, + rms_weight, + RMS_EPS, + cos_sin_cache, + k_cache, + slot_mapping, + KV_BLOCK_SIZE, + k_cache.stride(0), + head_size=HEAD_DIM, + state_width=coff * HEAD_DIM, + rope_head_dim=ROPE_DIM, + fp8_max=FP8_MAX, + quant_block=64, + token_stride=576, + scale_dim=8, + compress_ratio=compress_ratio, + overlap=bool(overlap), + store_full_kv=True, + store_full_fp8=store_fp8, + fp8_scale=fp8_scale, + ) + + ref = _reference_kv_compress_norm_rope( + state_cache, + block_table, + positions, + rms_weight, + cos_sin_cache, + compress_ratio, + overlap, + rms_eps=RMS_EPS, + return_full_cache=True, + ) # [num_tokens, HEAD_DIM] bf16 + + actual = torch.stack( + [k_cache[i // KV_BLOCK_SIZE, i % KV_BLOCK_SIZE] for i in range(num_tokens)] + ) + if store_fp8: + ref_fp8 = torch.clamp(ref.float() / fp8_scale, -FP8_MAX, FP8_MAX).to( + torch.float8_e4m3fn + ) + torch.testing.assert_close(actual.float(), ref_fp8.float(), rtol=0.0, atol=0.3) + else: + torch.testing.assert_close(actual.float(), ref.float(), rtol=3e-2, atol=3e-2) diff --git a/tests/kernels/test_fused_deepseek_v4_qnorm_rope_kv_insert.py b/tests/kernels/test_fused_deepseek_v4_qnorm_rope_kv_insert.py index a49ea498e5e..e568ce57638 100644 --- a/tests/kernels/test_fused_deepseek_v4_qnorm_rope_kv_insert.py +++ b/tests/kernels/test_fused_deepseek_v4_qnorm_rope_kv_insert.py @@ -67,7 +67,7 @@ def apply_rope_gptj_last_k( head_dim = x.shape[-1] nope_dim = head_dim - rope_dim - cs = cos_sin_cache[positions].to(torch.float32) + cs = cos_sin_cache[positions.long()].to(torch.float32) cos = cs[..., :half] sin = cs[..., half:] @@ -114,6 +114,18 @@ def _op_available() -> bool: return hasattr(torch.ops._C, "fused_deepseek_v4_qnorm_rope_kv_rope_quant_insert") +def _full_cache_fp8_op_available() -> bool: + return hasattr( + torch.ops._C, "fused_deepseek_v4_qnorm_rope_kv_rope_full_cache_fp8_insert" + ) + + +def _full_cache_bf16_op_available() -> bool: + return hasattr( + torch.ops._C, "fused_deepseek_v4_qnorm_rope_kv_rope_full_cache_bf16_insert" + ) + + pytestmark = pytest.mark.skipif( not torch.cuda.is_available() or not _op_available(), reason="CUDA not available or fused DeepseekV4 op not built in", @@ -415,3 +427,238 @@ def test_combined_q_and_kv( "padded head slots must be exact zero" ) torch.testing.assert_close(k_cache_fused, k_cache_ref, rtol=0, atol=0) + + +# ── Full-cache (FlashInfer) path parity ────────────────────────────────────── + + +def _call_full_cache_fp8_fused( + q, + kv, + q_fp8, + k_cache, + slot_mapping, + positions, + cos_sin_cache, + fp8_scale, + q_fp8_scale_inv, + eps, + bs, +): + torch.ops._C.fused_deepseek_v4_qnorm_rope_kv_rope_full_cache_fp8_insert( + q, + kv, + q_fp8, + k_cache, + slot_mapping, + positions.long(), + cos_sin_cache, + fp8_scale, + q_fp8_scale_inv, + eps, + bs, + ) + + +def _call_full_cache_bf16_fused( + q, + kv, + k_cache, + slot_mapping, + positions, + cos_sin_cache, + eps, + bs, +): + torch.ops._C.fused_deepseek_v4_qnorm_rope_kv_rope_full_cache_bf16_insert( + q, + kv, + k_cache, + slot_mapping, + positions.long(), + cos_sin_cache, + eps, + bs, + ) + + +def _fp8_full_cache_reference( + q, + kv, + k_cache, + q_fp8, + slot_mapping, + positions, + cos_sin_cache, + eps, + block_size, + fp8_scale, + q_fp8_scale_inv, +): + q_ref = rmsnorm_no_weight(q, eps) + q_ref = apply_rope_gptj_last_k(q_ref, positions, cos_sin_cache) + q_fp8.copy_( + torch.clamp(q_ref.float() * q_fp8_scale_inv, -FP8_MAX, FP8_MAX).to( + torch.float8_e4m3fn + ) + ) + + kv_ref = apply_rope_gptj_last_k(kv, positions, cos_sin_cache) + valid = slot_mapping >= 0 + slots = slot_mapping[valid] + block_idx = slots // block_size + pos_in_block = slots % block_size + k_cache[block_idx, pos_in_block] = torch.clamp( + kv_ref[valid].float() / fp8_scale, -FP8_MAX, FP8_MAX + ).to(torch.float8_e4m3fn) + + +def _bf16_full_cache_reference( + q, + kv, + k_cache, + slot_mapping, + positions, + cos_sin_cache, + eps, + block_size, +): + q_ref = rmsnorm_no_weight(q, eps) + # Kernel keeps RMSNorm+RoPE in fp32 and rounds to bf16 once at the store. + q_ref = apply_rope_gptj_last_k(q_ref, positions, cos_sin_cache).to(q.dtype) + + kv_ref = apply_rope_gptj_last_k(kv, positions, cos_sin_cache) + valid = slot_mapping >= 0 + slots = slot_mapping[valid] + block_idx = slots // block_size + pos_in_block = slots % block_size + k_cache[block_idx, pos_in_block] = kv_ref[valid] + return q_ref + + +@pytest.mark.skipif( + not _full_cache_fp8_op_available(), + reason="full-cache per-tensor FP8 DeepseekV4 op not built in", +) +@pytest.mark.parametrize("num_tokens", [4, 17]) +@pytest.mark.parametrize("n_heads", [8, 17]) +@pytest.mark.parametrize("positions_dtype", [torch.int32, torch.int64]) +def test_full_cache_per_tensor_fp8_matches_reference( + num_tokens: int, + n_heads: int, + positions_dtype: torch.dtype, +): + torch.manual_seed(4) + device = "cuda" + dtype = torch.bfloat16 + eps = 1e-6 + block_size = 16 + max_pos = 4096 + + q = torch.randn(num_tokens, n_heads, HEAD_DIM, dtype=dtype, device=device) + kv = torch.randn(num_tokens, HEAD_DIM, dtype=dtype, device=device) + positions = torch.arange(num_tokens, dtype=positions_dtype, device=device) + cos_sin_cache = make_cos_sin_cache(max_pos, ROPE_DIM, torch.float32, device) + + num_blocks = (num_tokens + block_size - 1) // block_size + 1 + slot_mapping = torch.arange(num_tokens, dtype=torch.int64, device=device) + fp8_scale = torch.tensor([1.0], dtype=torch.float32, device=device) + q_fp8_scale_inv = torch.tensor([1.0], dtype=torch.float32, device=device) + + q_fp8_ref = torch.empty_like(q, dtype=torch.float8_e4m3fn) + q_fp8_fused = torch.empty_like(q, dtype=torch.float8_e4m3fn) + k_cache_ref = torch.zeros( + num_blocks, block_size, HEAD_DIM, dtype=torch.float8_e4m3fn, device=device + ) + k_cache_fused = torch.zeros_like(k_cache_ref) + + _fp8_full_cache_reference( + q, + kv, + k_cache_ref, + q_fp8_ref, + slot_mapping, + positions, + cos_sin_cache, + eps, + block_size, + fp8_scale, + q_fp8_scale_inv, + ) + _call_full_cache_fp8_fused( + q.clone(), + kv, + q_fp8_fused, + k_cache_fused, + slot_mapping, + positions, + cos_sin_cache, + fp8_scale, + q_fp8_scale_inv, + eps, + block_size, + ) + + torch.testing.assert_close( + q_fp8_fused.float(), q_fp8_ref.float(), rtol=0, atol=0.25 + ) + torch.testing.assert_close( + k_cache_fused.float(), k_cache_ref.float(), rtol=0, atol=0.25 + ) + + +@pytest.mark.skipif( + not _full_cache_bf16_op_available(), + reason="full-cache BF16 DeepseekV4 op not built in", +) +@pytest.mark.parametrize("num_tokens", [4, 17]) +@pytest.mark.parametrize("n_heads", [8, 17]) +@pytest.mark.parametrize("positions_dtype", [torch.int32, torch.int64]) +def test_full_cache_bf16_matches_reference( + num_tokens: int, + n_heads: int, + positions_dtype: torch.dtype, +): + torch.manual_seed(5) + device = "cuda" + dtype = torch.bfloat16 + eps = 1e-6 + block_size = 16 + max_pos = 4096 + + q = torch.randn(num_tokens, n_heads, HEAD_DIM, dtype=dtype, device=device) + kv = torch.randn(num_tokens, HEAD_DIM, dtype=dtype, device=device) + positions = torch.arange(num_tokens, dtype=positions_dtype, device=device) + cos_sin_cache = make_cos_sin_cache(max_pos, ROPE_DIM, torch.float32, device) + + num_blocks = (num_tokens + block_size - 1) // block_size + 1 + slot_mapping = torch.arange(num_tokens, dtype=torch.int64, device=device) + + q_fused = q.clone() + k_cache_ref = torch.zeros( + num_blocks, block_size, HEAD_DIM, dtype=torch.bfloat16, device=device + ) + k_cache_fused = torch.zeros_like(k_cache_ref) + q_ref = _bf16_full_cache_reference( + q, + kv, + k_cache_ref, + slot_mapping, + positions, + cos_sin_cache, + eps, + block_size, + ) + _call_full_cache_bf16_fused( + q_fused, + kv, + k_cache_fused, + slot_mapping, + positions, + cos_sin_cache, + eps, + block_size, + ) + + torch.testing.assert_close(q_fused, q_ref, rtol=1e-2, atol=1e-2) + torch.testing.assert_close(k_cache_fused, k_cache_ref, rtol=0, atol=0) diff --git a/tests/model_executor/layers/test_pooler_heads.py b/tests/model_executor/layers/test_pooler_heads.py new file mode 100644 index 00000000000..99097636f94 --- /dev/null +++ b/tests/model_executor/layers/test_pooler_heads.py @@ -0,0 +1,481 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Unit tests for sequence and token pooler head classes.""" + +import torch +import torch.nn as nn + +from vllm.model_executor.layers.pooler.activations import PoolerNormalize +from vllm.model_executor.layers.pooler.seqwise.heads import ( + ClassifierPoolerHead, + EmbeddingPoolerHead, +) +from vllm.model_executor.layers.pooler.tokwise.heads import ( + TokenClassifierPoolerHead, + TokenEmbeddingPoolerHead, +) +from vllm.pooling_params import PoolingParams +from vllm.v1.pool.metadata import PoolingMetadata, PoolingStates + +_HIDDEN = 16 +_BATCH = 3 + + +def _make_params( + n: int, + *, + task: str = "embed", + dimensions: int | None = None, + use_activation: bool | None = None, +) -> list[PoolingParams]: + return [ + PoolingParams(task=task, dimensions=dimensions, use_activation=use_activation) + for _ in range(n) + ] + + +def _make_metadata(pooling_params: list[PoolingParams]) -> PoolingMetadata: + n = len(pooling_params) + return PoolingMetadata( + prompt_lens=torch.ones(n, dtype=torch.long), + prompt_token_ids=None, + prompt_token_ids_cpu=None, + pooling_params=pooling_params, + pooling_states=[PoolingStates() for _ in range(n)], + ) + + +def _linear(in_f: int, out_f: int) -> nn.Linear: + torch.manual_seed(42) + return nn.Linear(in_f, out_f, bias=False) + + +# --------------------------------------------------------------------------- +# EmbeddingPoolerHead +# --------------------------------------------------------------------------- +class TestEmbeddingPoolerHead: + def test_supported_tasks(self): + head = EmbeddingPoolerHead() + assert head.get_supported_tasks() == {"embed"} + + def test_passthrough(self): + head = EmbeddingPoolerHead() + x = torch.randn(_BATCH, _HIDDEN) + meta = _make_metadata(_make_params(_BATCH)) + out = head(x, meta) + assert torch.equal(out, x) + + def test_head_dtype(self): + head = EmbeddingPoolerHead(head_dtype=torch.float16) + x = torch.randn(_BATCH, _HIDDEN) + meta = _make_metadata(_make_params(_BATCH)) + out = head(x, meta) + assert out.dtype == torch.float16 + + def test_projector(self): + proj = _linear(_HIDDEN, 8) + head = EmbeddingPoolerHead(projector=proj) + x = torch.randn(_BATCH, _HIDDEN) + meta = _make_metadata(_make_params(_BATCH)) + out = head(x, meta) + assert out.shape == (_BATCH, 8) + assert torch.allclose(out, proj(x)) + + def test_matryoshka_uniform(self): + head = EmbeddingPoolerHead() + x = torch.randn(_BATCH, _HIDDEN) + params = _make_params(_BATCH, dimensions=4) + meta = _make_metadata(params) + out = head(x, meta) + assert out.shape == (_BATCH, 4) + assert torch.equal(out, x[..., :4]) + + def test_matryoshka_mixed(self): + head = EmbeddingPoolerHead() + x = torch.randn(2, _HIDDEN) + params = [ + PoolingParams(task="embed", dimensions=4), + PoolingParams(task="embed", dimensions=8), + ] + meta = _make_metadata(params) + out = head(x, meta) + assert isinstance(out, list) + assert len(out) == 2 + assert out[0].shape[-1] == 4 + assert out[1].shape[-1] == 8 + + def test_matryoshka_mixed_with_none(self): + head = EmbeddingPoolerHead() + x = torch.randn(2, _HIDDEN) + params = [ + PoolingParams(task="embed", dimensions=4), + PoolingParams(task="embed", dimensions=None), + ] + meta = _make_metadata(params) + out = head(x, meta) + assert isinstance(out, list) + assert out[0].shape[-1] == 4 + assert torch.equal(out[1], x[1]) + + def test_activation_uniform_true(self): + head = EmbeddingPoolerHead(activation=PoolerNormalize()) + x = torch.randn(_BATCH, _HIDDEN) + params = _make_params(_BATCH, use_activation=True) + meta = _make_metadata(params) + out = head(x, meta) + norms = torch.linalg.norm(out, dim=-1) + assert torch.allclose(norms, torch.ones(_BATCH), atol=1e-5) + + def test_activation_uniform_false(self): + head = EmbeddingPoolerHead(activation=PoolerNormalize()) + x = torch.randn(_BATCH, _HIDDEN) + params = _make_params(_BATCH, use_activation=False) + meta = _make_metadata(params) + out = head(x, meta) + assert torch.equal(out, x) + + def test_activation_mixed_flags(self): + head = EmbeddingPoolerHead(activation=PoolerNormalize()) + x = torch.randn(2, _HIDDEN) + params = [ + PoolingParams(task="embed", use_activation=True), + PoolingParams(task="embed", use_activation=False), + ] + meta = _make_metadata(params) + out = head(x, meta) + assert isinstance(out, list) + norm_0 = torch.linalg.norm(out[0], dim=-1) + assert torch.allclose(norm_0, torch.ones(1), atol=1e-5) + assert torch.equal(out[1], x[1]) + + def test_list_input_gets_stacked(self): + head = EmbeddingPoolerHead() + tensors = [torch.randn(_HIDDEN) for _ in range(_BATCH)] + meta = _make_metadata(_make_params(_BATCH)) + out = head(tensors, meta) + assert out.shape == (_BATCH, _HIDDEN) + expected = torch.stack(tensors) + assert torch.equal(out, expected) + + def test_projector_then_matryoshka(self): + proj = _linear(_HIDDEN, 8) + head = EmbeddingPoolerHead(projector=proj) + x = torch.randn(_BATCH, _HIDDEN) + params = _make_params(_BATCH, dimensions=4) + meta = _make_metadata(params) + out = head(x, meta) + assert out.shape == (_BATCH, 4) + assert torch.equal(out, proj(x)[..., :4]) + + def test_matryoshka_then_activation(self): + head = EmbeddingPoolerHead(activation=PoolerNormalize()) + x = torch.randn(_BATCH, _HIDDEN) + params = _make_params(_BATCH, dimensions=4, use_activation=True) + meta = _make_metadata(params) + out = head(x, meta) + assert out.shape == (_BATCH, 4) + norms = torch.linalg.norm(out, dim=-1) + assert torch.allclose(norms, torch.ones(_BATCH), atol=1e-5) + + def test_empty_batch(self): + head = EmbeddingPoolerHead() + x = torch.randn(0, _HIDDEN) + meta = _make_metadata([]) + out = head(x, meta) + assert out.shape == (0, _HIDDEN) + + +# --------------------------------------------------------------------------- +# ClassifierPoolerHead +# --------------------------------------------------------------------------- +class TestClassifierPoolerHead: + def test_supported_tasks(self): + head = ClassifierPoolerHead() + assert head.get_supported_tasks() == {"classify"} + + def test_passthrough(self): + head = ClassifierPoolerHead() + x = torch.randn(_BATCH, _HIDDEN) + meta = _make_metadata(_make_params(_BATCH, task="classify")) + out = head(x, meta) + assert torch.equal(out, x) + + def test_head_dtype(self): + head = ClassifierPoolerHead(head_dtype=torch.float16) + x = torch.randn(_BATCH, _HIDDEN) + meta = _make_metadata(_make_params(_BATCH, task="classify")) + out = head(x, meta) + assert out.dtype == torch.float16 + + def test_classifier(self): + clf = _linear(_HIDDEN, 3) + head = ClassifierPoolerHead(classifier=clf) + x = torch.randn(_BATCH, _HIDDEN) + meta = _make_metadata(_make_params(_BATCH, task="classify")) + out = head(x, meta) + assert out.shape == (_BATCH, 3) + assert torch.allclose(out, clf(x)) + + def test_logit_mean(self): + head = ClassifierPoolerHead(logit_mean=2.0) + x = torch.randn(_BATCH, _HIDDEN) + meta = _make_metadata(_make_params(_BATCH, task="classify")) + out = head(x, meta) + assert torch.allclose(out, x - 2.0) + + def test_logit_sigma(self): + head = ClassifierPoolerHead(logit_sigma=0.5) + x = torch.randn(_BATCH, _HIDDEN) + meta = _make_metadata(_make_params(_BATCH, task="classify")) + out = head(x, meta) + assert torch.allclose(out, x / 0.5) + + def test_platt_scaling_combined(self): + head = ClassifierPoolerHead(logit_mean=1.0, logit_sigma=2.0) + x = torch.randn(_BATCH, _HIDDEN) + meta = _make_metadata(_make_params(_BATCH, task="classify")) + out = head(x, meta) + assert torch.allclose(out, (x - 1.0) / 2.0) + + def test_activation_uniform_true(self): + head = ClassifierPoolerHead(activation=PoolerNormalize()) + x = torch.randn(_BATCH, _HIDDEN) + params = _make_params(_BATCH, task="classify", use_activation=True) + meta = _make_metadata(params) + out = head(x, meta) + norms = torch.linalg.norm(out, dim=-1) + assert torch.allclose(norms, torch.ones(_BATCH), atol=1e-5) + + def test_activation_uniform_false(self): + head = ClassifierPoolerHead(activation=PoolerNormalize()) + x = torch.randn(_BATCH, _HIDDEN) + params = _make_params(_BATCH, task="classify", use_activation=False) + meta = _make_metadata(params) + out = head(x, meta) + assert torch.equal(out, x) + + def test_activation_mixed_flags(self): + head = ClassifierPoolerHead(activation=PoolerNormalize()) + x = torch.randn(2, _HIDDEN) + params = [ + PoolingParams(task="classify", use_activation=True), + PoolingParams(task="classify", use_activation=False), + ] + meta = _make_metadata(params) + out = head(x, meta) + assert isinstance(out, list) + norm_0 = torch.linalg.norm(out[0], dim=-1) + assert torch.allclose(norm_0, torch.ones(1), atol=1e-5) + assert torch.equal(out[1], x[1]) + + def test_list_input_gets_stacked(self): + head = ClassifierPoolerHead() + tensors = [torch.randn(_HIDDEN) for _ in range(_BATCH)] + meta = _make_metadata(_make_params(_BATCH, task="classify")) + out = head(tensors, meta) + assert out.shape == (_BATCH, _HIDDEN) + expected = torch.stack(tensors) + assert torch.equal(out, expected) + + def test_classifier_then_platt_scaling(self): + clf = _linear(_HIDDEN, 3) + head = ClassifierPoolerHead(classifier=clf, logit_mean=1.0, logit_sigma=2.0) + x = torch.randn(_BATCH, _HIDDEN) + meta = _make_metadata(_make_params(_BATCH, task="classify")) + out = head(x, meta) + expected = (clf(x) - 1.0) / 2.0 + assert torch.allclose(out, expected) + + def test_empty_batch(self): + head = ClassifierPoolerHead() + x = torch.randn(0, _HIDDEN) + meta = _make_metadata([]) + out = head(x, meta) + assert out.shape == (0, _HIDDEN) + + +# --------------------------------------------------------------------------- +# TokenEmbeddingPoolerHead +# --------------------------------------------------------------------------- +class TestTokenEmbeddingPoolerHead: + def test_supported_tasks(self): + head = TokenEmbeddingPoolerHead() + assert head.get_supported_tasks() == {"token_embed"} + + def test_passthrough(self): + head = TokenEmbeddingPoolerHead() + x = torch.randn(5, _HIDDEN) + param = PoolingParams(task="token_embed") + out = head.forward_chunk(x, param) + assert torch.equal(out, x) + + def test_none_chunked_prefill(self): + head = TokenEmbeddingPoolerHead() + param = PoolingParams(task="token_embed") + out = head.forward_chunk(None, param) + assert out is None + + def test_head_dtype(self): + head = TokenEmbeddingPoolerHead(head_dtype=torch.float16) + x = torch.randn(5, _HIDDEN) + param = PoolingParams(task="token_embed") + out = head.forward_chunk(x, param) + assert out.dtype == torch.float16 + + def test_projector(self): + proj = _linear(_HIDDEN, 8) + head = TokenEmbeddingPoolerHead(projector=proj) + x = torch.randn(5, _HIDDEN) + param = PoolingParams(task="token_embed") + out = head.forward_chunk(x, param) + assert out.shape == (5, 8) + assert torch.allclose(out, proj(x)) + + def test_matryoshka_truncation(self): + head = TokenEmbeddingPoolerHead() + x = torch.randn(5, _HIDDEN) + param = PoolingParams(task="token_embed", dimensions=4) + out = head.forward_chunk(x, param) + assert out.shape == (5, 4) + assert torch.equal(out, x[..., :4]) + + def test_activation_true(self): + head = TokenEmbeddingPoolerHead(activation=PoolerNormalize()) + x = torch.randn(5, _HIDDEN) + param = PoolingParams(task="token_embed", use_activation=True) + out = head.forward_chunk(x, param) + norms = torch.linalg.norm(out, dim=-1) + assert torch.allclose(norms, torch.ones(5), atol=1e-5) + + def test_activation_false(self): + head = TokenEmbeddingPoolerHead(activation=PoolerNormalize()) + x = torch.randn(5, _HIDDEN) + param = PoolingParams(task="token_embed", use_activation=False) + out = head.forward_chunk(x, param) + assert torch.equal(out, x) + + def test_projector_then_matryoshka(self): + proj = _linear(_HIDDEN, 8) + head = TokenEmbeddingPoolerHead(projector=proj) + x = torch.randn(5, _HIDDEN) + param = PoolingParams(task="token_embed", dimensions=4) + out = head.forward_chunk(x, param) + assert out.shape == (5, 4) + assert torch.equal(out, proj(x)[..., :4]) + + def test_matryoshka_then_activation(self): + head = TokenEmbeddingPoolerHead(activation=PoolerNormalize()) + x = torch.randn(5, _HIDDEN) + param = PoolingParams(task="token_embed", dimensions=4, use_activation=True) + out = head.forward_chunk(x, param) + assert out.shape == (5, 4) + norms = torch.linalg.norm(out, dim=-1) + assert torch.allclose(norms, torch.ones(5), atol=1e-5) + + def test_forward_mixed_batch_chunked_prefill(self): + head = TokenEmbeddingPoolerHead() + pooled_data = [torch.randn(5, _HIDDEN), None, torch.randn(3, _HIDDEN)] + params = _make_params(3, task="token_embed") + meta = _make_metadata(params) + out = head(pooled_data, meta) + assert len(out) == 3 + assert torch.equal(out[0], pooled_data[0]) + assert out[1] is None + assert torch.equal(out[2], pooled_data[2]) + + def test_forward_empty_batch(self): + head = TokenEmbeddingPoolerHead() + meta = _make_metadata([]) + out = head([], meta) + assert out == [] + + +# --------------------------------------------------------------------------- +# TokenClassifierPoolerHead +# --------------------------------------------------------------------------- +class TestTokenClassifierPoolerHead: + def test_supported_tasks(self): + head = TokenClassifierPoolerHead() + assert head.get_supported_tasks() == {"token_classify"} + + def test_passthrough(self): + head = TokenClassifierPoolerHead() + x = torch.randn(5, _HIDDEN) + param = PoolingParams(task="token_classify") + out = head.forward_chunk(x, param) + assert torch.equal(out, x) + + def test_none_chunked_prefill(self): + head = TokenClassifierPoolerHead() + param = PoolingParams(task="token_classify") + out = head.forward_chunk(None, param) + assert out is None + + def test_head_dtype(self): + head = TokenClassifierPoolerHead(head_dtype=torch.float16) + x = torch.randn(5, _HIDDEN) + param = PoolingParams(task="token_classify") + out = head.forward_chunk(x, param) + assert out.dtype == torch.float16 + + def test_classifier(self): + clf = _linear(_HIDDEN, 3) + head = TokenClassifierPoolerHead(classifier=clf) + x = torch.randn(5, _HIDDEN) + param = PoolingParams(task="token_classify") + out = head.forward_chunk(x, param) + assert out.shape == (5, 3) + assert torch.allclose(out, clf(x)) + + def test_logit_mean(self): + head = TokenClassifierPoolerHead(logit_mean=2.0) + x = torch.randn(5, _HIDDEN) + param = PoolingParams(task="token_classify") + out = head.forward_chunk(x, param) + assert torch.allclose(out, x - 2.0) + + def test_logit_sigma(self): + head = TokenClassifierPoolerHead(logit_sigma=0.5) + x = torch.randn(5, _HIDDEN) + param = PoolingParams(task="token_classify") + out = head.forward_chunk(x, param) + assert torch.allclose(out, x / 0.5) + + def test_platt_scaling_combined(self): + head = TokenClassifierPoolerHead(logit_mean=1.0, logit_sigma=2.0) + x = torch.randn(5, _HIDDEN) + param = PoolingParams(task="token_classify") + out = head.forward_chunk(x, param) + assert torch.allclose(out, (x - 1.0) / 2.0) + + def test_activation_true(self): + head = TokenClassifierPoolerHead(activation=PoolerNormalize()) + x = torch.randn(5, _HIDDEN) + param = PoolingParams(task="token_classify", use_activation=True) + out = head.forward_chunk(x, param) + norms = torch.linalg.norm(out, dim=-1) + assert torch.allclose(norms, torch.ones(5), atol=1e-5) + + def test_activation_false(self): + head = TokenClassifierPoolerHead(activation=PoolerNormalize()) + x = torch.randn(5, _HIDDEN) + param = PoolingParams(task="token_classify", use_activation=False) + out = head.forward_chunk(x, param) + assert torch.equal(out, x) + + def test_forward_mixed_batch_chunked_prefill(self): + head = TokenClassifierPoolerHead() + pooled_data = [torch.randn(5, _HIDDEN), None, torch.randn(3, _HIDDEN)] + params = _make_params(3, task="token_classify") + meta = _make_metadata(params) + out = head(pooled_data, meta) + assert len(out) == 3 + assert torch.equal(out[0], pooled_data[0]) + assert out[1] is None + assert torch.equal(out[2], pooled_data[2]) + + def test_forward_empty_batch(self): + head = TokenClassifierPoolerHead() + meta = _make_metadata([]) + out = head([], meta) + assert out == [] diff --git a/tests/models/language/generation/test_hybrid.py b/tests/models/language/generation/test_hybrid.py index e410daf2fcd..7d22278cf1f 100644 --- a/tests/models/language/generation/test_hybrid.py +++ b/tests/models/language/generation/test_hybrid.py @@ -2,11 +2,12 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Callable +from contextlib import contextmanager, nullcontext import pytest from tests.models.registry import HF_EXAMPLE_MODELS -from tests.utils import multi_gpu_test +from tests.utils import multi_gpu_test, wait_for_gpu_memory_to_clear from vllm import LLM from vllm.engine.arg_utils import EngineArgs from vllm.platforms import current_platform @@ -404,6 +405,30 @@ def _get_vllm_runner_params( } +def _wait_for_rocm_memory_to_settle() -> None: + if not current_platform.is_rocm(): + return + + num_gpus = current_platform.device_count() + if num_gpus == 0: + return + + wait_for_gpu_memory_to_clear( + devices=list(range(num_gpus)), + threshold_ratio=0.01, + timeout_s=120, + ) + + +@contextmanager +def _owned_vLLM_runner(vllm_runner, kwargs): + try: + with vllm_runner(**kwargs) as runner: + yield runner + finally: + _wait_for_rocm_memory_to_settle() + + def _get_vLLM_output( vllm_runner, kwargs, @@ -413,17 +438,21 @@ def _get_vLLM_output( num_repetitions=1, vllm_model=None, ): - outs = [] - if vllm_model is None: - vllm_model = vllm_runner(**kwargs) - for _ in range(num_repetitions): - if num_logprobs < 0: - vllm_output = vllm_model.generate_greedy(prompts, max_tokens) - else: - vllm_output = vllm_model.generate_greedy_logprobs( - prompts, max_tokens, num_logprobs - ) - outs.append(vllm_output) + runner_context = ( + _owned_vLLM_runner(vllm_runner, kwargs) + if vllm_model is None + else nullcontext(vllm_model) + ) + with runner_context as runner: + outs = [] + for _ in range(num_repetitions): + if num_logprobs < 0: + vllm_output = runner.generate_greedy(prompts, max_tokens) + else: + vllm_output = runner.generate_greedy_logprobs( + prompts, max_tokens, num_logprobs + ) + outs.append(vllm_output) return outs, vllm_model @@ -772,38 +801,44 @@ def test_apc_multiple_prompts_partial_cached_outputs( # Cache only part of all the prompts vllm_runner_kwargs["enable_prefix_caching"] = True - vllm_outputs_partial_cache, vllm_model = _get_vLLM_output( - vllm_runner, vllm_runner_kwargs, generated_prompts[:3], max_tokens, num_logprobs - ) - - compare_operator( - outputs_0_lst=vllm_outputs_no_cache[0][:3], - outputs_1_lst=vllm_outputs_partial_cache[0], - name_0="vllm_no_cache", - name_1="vllm_partial_cache", - ) - - vllm_outputs_cache_rep, _ = _get_vLLM_output( - vllm_runner, - vllm_runner_kwargs, - generated_prompts, - max_tokens, - num_logprobs, - n_repetitions, - vllm_model=vllm_model, - ) - - for r_idx, vllm_outputs_cache_itn in enumerate(vllm_outputs_cache_rep): - # In the first repetition, the caches are filled - # In the second repetition, these caches are reused + with _owned_vLLM_runner(vllm_runner, vllm_runner_kwargs) as vllm_model: + vllm_outputs_partial_cache, _ = _get_vLLM_output( + vllm_runner, + vllm_runner_kwargs, + generated_prompts[:3], + max_tokens, + num_logprobs, + vllm_model=vllm_model, + ) compare_operator( - outputs_0_lst=vllm_outputs_no_cache[0], - outputs_1_lst=vllm_outputs_cache_itn, + outputs_0_lst=vllm_outputs_no_cache[0][:3], + outputs_1_lst=vllm_outputs_partial_cache[0], name_0="vllm_no_cache", - name_1=f"vllm_cache_it_{r_idx + 1}", + name_1="vllm_partial_cache", ) + vllm_outputs_cache_rep, _ = _get_vLLM_output( + vllm_runner, + vllm_runner_kwargs, + generated_prompts, + max_tokens, + num_logprobs, + n_repetitions, + vllm_model=vllm_model, + ) + + for r_idx, vllm_outputs_cache_itn in enumerate(vllm_outputs_cache_rep): + # In the first repetition, the caches are filled + # In the second repetition, these caches are reused + + compare_operator( + outputs_0_lst=vllm_outputs_no_cache[0], + outputs_1_lst=vllm_outputs_cache_itn, + name_0="vllm_no_cache", + name_1=f"vllm_cache_it_{r_idx + 1}", + ) + # Test that outputs match whether prefix caching is enabled or not for mamba. @pytest.mark.parametrize("model", ["tiiuae/falcon-mamba-7b"]) @@ -826,7 +861,7 @@ def test_same_mamba_output_apc_on_vs_off( # No prefix caching kwargs_no_apc = {**base_kwargs, "enable_prefix_caching": False} - with vllm_runner(**kwargs_no_apc) as vllm_model: + with _owned_vLLM_runner(vllm_runner, kwargs_no_apc) as vllm_model: outputs_no_apc, _ = _get_vLLM_output( vllm_runner, kwargs_no_apc, @@ -841,7 +876,7 @@ def test_same_mamba_output_apc_on_vs_off( "enable_prefix_caching": True, "mamba_block_size": 16, } - with vllm_runner(**kwargs_with_apc) as vllm_model: + with _owned_vLLM_runner(vllm_runner, kwargs_with_apc) as vllm_model: outputs_with_apc, _ = _get_vLLM_output( vllm_runner, kwargs_with_apc, diff --git a/tests/models/multimodal/generation/test_granite_speech.py b/tests/models/multimodal/generation/test_granite_speech.py index 038a15d057c..3019f5f22d4 100644 --- a/tests/models/multimodal/generation/test_granite_speech.py +++ b/tests/models/multimodal/generation/test_granite_speech.py @@ -30,11 +30,14 @@ def vllm_to_hf_output( MODEL_NAME = "ibm-granite/granite-speech-3.3-2b" MODEL_NAME_4_0 = "ibm-granite/granite-4.0-1b-speech" +# "plus" variant of granite speech (uses GraniteSpeechPlusForConditionalGeneration). +MODEL_NAME_4_1_PLUS = "ibm-granite/granite-speech-4.1-2b-plus" # Audio lora co-exists directly in the 3.3 model directory, -# the 4.0 model has adapters merged into the weights. +# the 4.0 and 4.1-plus models have adapters merged into the weights. models: dict[str, str | None] = { MODEL_NAME: MODEL_NAME, MODEL_NAME_4_0: None, + MODEL_NAME_4_1_PLUS: None, } diff --git a/tests/models/multimodal/generation/test_vit_cudagraph.py b/tests/models/multimodal/generation/test_vit_cudagraph.py index 18630e3559a..cbdc5e878ae 100644 --- a/tests/models/multimodal/generation/test_vit_cudagraph.py +++ b/tests/models/multimodal/generation/test_vit_cudagraph.py @@ -43,6 +43,10 @@ def qwen_vl_chat_template(content: str) -> str: return f"<|im_start|>user\n{content}<|im_end|>\n<|im_start|>assistant\n" +def internvl_chat_template(content: str) -> str: + return f"<|im_start|>user\n{content}<|im_end|>\n<|im_start|>assistant\n" + + def step3_vl_chat_template(content: str) -> str: return ( "<|begin▁of▁sentence|> You are a helpful assistant.<|BOT|>user\n " @@ -51,6 +55,17 @@ def step3_vl_chat_template(content: str) -> str: MODEL_CONFIGS: dict[str, VitCudagraphTestConfig] = { + "internvl": VitCudagraphTestConfig( + model="OpenGVLab/InternVL3-1B", + num_video_frames=8, + image_prompt=internvl_chat_template("\nWhat is in this image?"), + video_prompt=internvl_chat_template( + "