[TRTLLM-9766][feat] Integration of the KVCacheManager V2 to TRTLLM Runtime (#10659)

Signed-off-by: yizhang-nv <187001205+yizhang-nv@users.noreply.github.com>
Yi Zhang 2026-02-02 14:29:02 +08:00 committed by GitHub
parent d3df3f6feb
commit 0306c0f12c
32 changed files with 1467 additions and 187 deletions

.gitignore
View File

@ -6,6 +6,8 @@ __pycache__/
*.cache
*.nsys-rep
*.npy
*.so
*.whl
.VSCodeCounter
cpp/build*
cpp/Release

View File

@ -17,6 +17,7 @@
#include "tensorrt_llm/batch_manager/kvCacheManagerV2Utils.h"
#include "tensorrt_llm/common/logger.h"
#include "tensorrt_llm/common/memoryUtils.h"
#include <cassert>
#include <cstdio>
#include <cuda.h>
@ -25,6 +26,9 @@
#include <unistd.h>
#include <vector>
namespace tc = tensorrt_llm::common;
using namespace tensorrt_llm::runtime;
namespace tensorrt_llm::batch_manager::kv_cache_manager_v2
{
@ -160,4 +164,76 @@ CUresult copyHostToHost(std::vector<Task<MemAddress, MemAddress>> tasks, ssize_t
return cuLaunchHostFunc(stream, hostFnHostToHostCopy, data.release());
}
SizeType32 IndexMapper::addNewSequence(LlmRequest::RequestIdType requestId)
{
TLLM_CHECK(indexMap_.find(requestId) == indexMap_.end());
auto iter = freeIndices_.begin();
TLLM_CHECK_WITH_INFO(iter != freeIndices_.end(), "No free index found");
auto index = *iter;
freeIndices_.erase(iter);
indexMap_[requestId] = index;
return index;
}
SizeType32 IndexMapper::getIndex(LlmRequest::RequestIdType requestId)
{
auto iter = indexMap_.find(requestId);
TLLM_CHECK_WITH_INFO(iter != indexMap_.end(), "Request ID not found in IndexMapper");
return iter->second;
}
void IndexMapper::removeSequence(LlmRequest::RequestIdType requestId)
{
auto iter = indexMap_.find(requestId);
TLLM_CHECK(iter != indexMap_.end());
auto index = iter->second;
freeIndices_.insert(index);
indexMap_.erase(iter);
}
at::Tensor IndexMapper::getCopyIndex(
std::vector<LlmRequest::RequestIdType> const& requestIds, SizeType32 numContext, SizeType32 beamWidth)
{
int numSeqs = numContext + beamWidth * (requestIds.size() - numContext);
SizeType32 batchSize = static_cast<SizeType32>(requestIds.size());
SizeType32 idx = 0;
for (SizeType32 i = 0; i < batchSize; i++)
{
if (i < numContext)
{
copyIndex_[idx++] = this->getIndex(requestIds[i]) * maxBeamWidth_;
}
else
{
for (SizeType32 j = 0; j < beamWidth; j++)
{
copyIndex_[idx++] = this->getIndex(requestIds[i]) * maxBeamWidth_ + j;
}
}
}
TLLM_CHECK_WITH_INFO(idx == numSeqs, "Index mapper failed to generate copy index");
return copyIndex_.slice(0, 0, numSeqs);
}
IndexMapper::IndexMapper(SizeType32 maxBatchSize, SizeType32 maxBeamWidth)
: maxBeamWidth_(maxBeamWidth)
{
indexMap_.reserve(maxBatchSize);
for (SizeType32 i = 0; i < maxBatchSize; i++)
{
freeIndices_.insert(i);
}
// Allocate copyIndex_ memory as pinned (page-locked) host memory
copyIndex_
= at::empty({maxBatchSize * maxBeamWidth}, at::TensorOptions().dtype(at::ScalarType::Int).pinned_memory(true));
}
IndexMapper::~IndexMapper()
{
indexMap_.clear();
freeIndices_.clear();
}
} // namespace tensorrt_llm::batch_manager::kv_cache_manager_v2
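For illustration, a minimal Python sketch of the slot-reuse contract implemented by IndexMapper above, exercised through the nanobind binding added later in this commit; the request ids below are arbitrary examples, not values used by the runtime:

from tensorrt_llm.bindings.internal.batch_manager.kv_cache_manager_v2_utils import IndexMapper

mapper = IndexMapper(max_batch_size=4, max_beam_width=1)
slot_a = mapper.add_new_sequence(100)   # takes the smallest free slot (freeIndices_ is an ordered std::set)
slot_b = mapper.add_new_sequence(101)
assert mapper.get_index(100) == slot_a
mapper.remove_sequence(100)             # slot_a returns to the free pool
assert mapper.add_new_sequence(102) == slot_a   # the freed slot is handed out again
# Map batch positions to slot-major rows: 2 context requests, beam width 1.
copy_idx = mapper.get_copy_index([101, 102], 2, 1)   # int32 tensor of length 2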

View File

@ -18,15 +18,20 @@
#include "kvCacheManagerV2Utils.h"
#include "tensorrt_llm/common/assert.h"
#include "tensorrt_llm/common/cudaUtils.h"
#include "tensorrt_llm/common/envUtils.h"
#include "tensorrt_llm/common/memoryUtils.h"
#include <algorithm>
#include <array>
#include <cassert>
#include <cuda_runtime.h>
#include <vector>
namespace tensorrt_llm::batch_manager::kv_cache_manager_v2
{
using Grain = uint4;
constexpr uint32_t ctaSize = 128;
constexpr uint32_t copyBlockCtaSize = 128;
constexpr uint32_t copyBlocknbBufs = 2;
constexpr uint32_t nbBufs = 4;
constexpr uint32_t grainBytes = sizeof(Grain);
@ -179,4 +184,122 @@ CUresult copyDeviceToDevice(std::vector<MMTask> const& tasks, ssize_t numBytes,
return launchBatchedCopy(false, tasks, numBytes, stream);
}
// dst_tensor[:, :num_seqs, 0] = src_tensor[:, copy_idx]
// dst_tensor[:, :num_seqs, 1] = dst_tensor[:, :num_seqs, 0] + 1
template <bool COPY_V_IDX = true>
__global__ void copyBatchBlockOffsetsToDeviceKernel(SizeType32 const* __restrict__ srcPtr,
SizeType32* __restrict__ dstPtr, SizeType32 const srcMaxNumSequences, SizeType32 const dstMaxNumSequences,
SizeType32 numBlocksPerSeq, SizeType32 const* __restrict__ copyIndex)
{
constexpr uint32_t kvFactor = 2;
constexpr auto elemPerAccess = sizeof(PackedInt) / sizeof(SizeType32);
__shared__ PackedInt data[copyBlocknbBufs][copyBlockCtaSize];
auto const iterPerSeq = divUp(numBlocksPerSeq * sizeof(SizeType32), sizeof(PackedInt) * copyBlockCtaSize);
auto const tid = threadIdx.x;
auto const poolIdx = blockIdx.x;
auto const seqIdx = blockIdx.y;
auto const seqDimStride = kvFactor * numBlocksPerSeq;
uint32_t const srcIdxBeg = tid * elemPerAccess + (poolIdx * srcMaxNumSequences + copyIndex[seqIdx]) * seqDimStride;
uint32_t const dstIdxKBeg = tid * elemPerAccess + (poolIdx * dstMaxNumSequences + seqIdx) * seqDimStride;
uint32_t const dstIdxVBeg = dstIdxKBeg + numBlocksPerSeq;
uint32_t const srcIdxEnd = (poolIdx * srcMaxNumSequences + copyIndex[seqIdx]) * seqDimStride + numBlocksPerSeq;
for (uint32_t i = 0; i < iterPerSeq + copyBlocknbBufs; i++)
{
uint32_t const idxBuf = i % copyBlocknbBufs;
if (i >= copyBlocknbBufs)
{
uint32_t const stIter = i - copyBlocknbBufs;
assert(idxBuf == (stIter % copyBlocknbBufs));
auto const offset = copyBlockCtaSize * stIter * elemPerAccess;
SizeType32 const srcIdx = srcIdxBeg + offset;
SizeType32 const dstIdxK = dstIdxKBeg + offset;
SizeType32 const dstIdxV = dstIdxVBeg + offset;
PackedInt const& src = data[idxBuf][tid];
PackedInt& dstK = *reinterpret_cast<PackedInt*>(dstPtr + dstIdxK);
PackedInt& dstV = *reinterpret_cast<PackedInt*>(dstPtr + dstIdxV);
asm volatile("cp.async.wait_group %0;\n" ::"n"(copyBlocknbBufs - 1) : "memory");
if (srcIdx < srcIdxEnd)
{
dstK = src;
if (COPY_V_IDX)
{
dstV = src;
}
else
{
#pragma unroll
for (uint32_t j = 0; j < elemPerAccess; j++)
{
auto const val = src.unpacked[j];
dstV.unpacked[j] = (val == BAD_PAGE_INDEX) ? val : (val + 1);
}
}
}
}
uint32_t const ldIter = i;
PackedInt* const dst = &data[idxBuf][tid];
uint32_t const srcIdx = srcIdxBeg + copyBlockCtaSize * ldIter * elemPerAccess;
PackedInt const* const src = reinterpret_cast<PackedInt const*>(srcPtr + srcIdx);
if (srcIdx < srcIdxEnd)
{
uint32_t const size = sizeof(PackedInt);
asm volatile("cp.async.cg.shared.global [%0], [%1], %2, %3;\n" ::"l"(__cvta_generic_to_shared(dst)),
"l"(src), "n"(size), "r"(size)
: "memory");
}
asm volatile("cp.async.commit_group;\n" : : : "memory");
}
}
// Host-side launcher
void copyBatchBlockOffsetsToDevice(
ITensor const& input, ITensor& output, ITensor const& copyIndex, bool copyVIdx, CUstream stream) noexcept
{
using namespace tensorrt_llm::runtime;
auto const* srcPtr = bufferCast<tk::KVCacheIndex::UnderlyingType const>(input);
auto* dstPtr = bufferCast<tk::KVCacheIndex::UnderlyingType>(
output); // [numPools, maxNumSequences, kvFactor, numBlocksPerSeq]
auto const* copyIndexPtr = bufferCast<SizeType32 const>(copyIndex);
auto const& srcShape = input.getShape();
auto const& dstShape = output.getShape();
auto const& copyIndexShape = copyIndex.getShape();
TLLM_CHECK(srcShape.nbDims == 4); // [numPools, srcMaxNumSequences, kvFactor, numBlocksPerSeq]
TLLM_CHECK(dstShape.nbDims == 4); // [numPools, dstMaxNumSequences, kvFactor, numBlocksPerSeq]
SizeType32 numPools = srcShape.d[0];
SizeType32 srcMaxNumSequences = srcShape.d[1];
SizeType32 dstMaxNumSequences = dstShape.d[1];
SizeType32 numBlocksPerSeq = srcShape.d[3];
SizeType32 numSeqs = copyIndexShape.d[0];
if (numSeqs == 0)
{
return;
}
TLLM_CHECK_WITH_INFO((numBlocksPerSeq * sizeof(SizeType32)) % sizeof(PackedInt) == 0,
"Not implemented case: numBlocksPerSeq * sizeof(SizeType32) = %zu must be a multiple of %zu.",
static_cast<size_t>(numBlocksPerSeq * sizeof(SizeType32)), static_cast<size_t>(sizeof(PackedInt)));
dim3 gridDim(numPools, numSeqs, 1);
dim3 blockDim(copyBlockCtaSize);
if (copyVIdx)
{
copyBatchBlockOffsetsToDeviceKernel<true><<<gridDim, blockDim, 0, stream>>>(
srcPtr, dstPtr, srcMaxNumSequences, dstMaxNumSequences, numBlocksPerSeq, copyIndexPtr);
}
else
{
copyBatchBlockOffsetsToDeviceKernel<false><<<gridDim, blockDim, 0, stream>>>(
srcPtr, dstPtr, srcMaxNumSequences, dstMaxNumSequences, numBlocksPerSeq, copyIndexPtr);
}
}
} // namespace tensorrt_llm::batch_manager::kv_cache_manager_v2
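For clarity, a pure-PyTorch sketch of what copyBatchBlockOffsetsToDevice above computes; the runtime keeps the custom CUDA kernel, and this reference is only an assumed equivalent for reasoning about or testing the semantics:

import torch

BAD_PAGE_INDEX = -1  # must match kvCacheManagerV2Utils.h / _common.py

def copy_batch_block_offsets_reference(src: torch.Tensor, dst: torch.Tensor,
                                       copy_index: torch.Tensor, copy_v_idx: bool) -> None:
    # src: [num_pools, src_max_num_sequences, 2, num_blocks_per_seq] (host, pinned)
    # dst: [num_pools, dst_max_num_sequences, 2, num_blocks_per_seq] (device)
    num_seqs = copy_index.shape[0]
    k = src[:, copy_index.long(), 0].to(dst.device)   # gather K page offsets of the selected sequences
    dst[:, :num_seqs, 0] = k
    if copy_v_idx:
        dst[:, :num_seqs, 1] = k                      # V reuses the K offsets
    else:
        dst[:, :num_seqs, 1] = torch.where(k == BAD_PAGE_INDEX, k, k + 1)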

View File

@ -17,10 +17,21 @@
#pragma once
#include "tensorrt_llm/batch_manager/llmRequest.h"
#include "tensorrt_llm/kernels/kvCacheIndex.h"
#include "tensorrt_llm/runtime/iBuffer.h"
#include "tensorrt_llm/runtime/iTensor.h"
#include <ATen/ATen.h>
#include <cstdint>
#include <cuda.h>
#include <set>
#include <unordered_map>
#include <vector>
namespace tk = tensorrt_llm::kernels;
using SizeType32 = tensorrt_llm::runtime::SizeType32;
using ITensor = tensorrt_llm::runtime::ITensor;
namespace tensorrt_llm::batch_manager::kv_cache_manager_v2
{
struct DiskAddress
@ -31,6 +42,9 @@ struct DiskAddress
using MemAddress = std::uintptr_t;
// Please make sure to align with the definition in tensorrt_llm/runtime/kv_cache_manager_v2/_common.py
constexpr tk::KVCacheIndex::UnderlyingType BAD_PAGE_INDEX = -1;
template <typename DstAddr, typename SrcAddr>
struct Task
{
@ -38,6 +52,38 @@ struct Task
SrcAddr src;
};
using PackedInt = union
{
int4 packed;
tk::KVCacheIndex::UnderlyingType unpacked[4];
};
class IndexMapper
{
public:
IndexMapper(SizeType32 maxBatchSize, SizeType32 maxBeamWidth);
~IndexMapper();
IndexMapper(IndexMapper const&) = delete;
IndexMapper& operator=(IndexMapper const&) = delete;
SizeType32 addNewSequence(LlmRequest::RequestIdType requestId);
SizeType32 getIndex(LlmRequest::RequestIdType requestId);
void removeSequence(LlmRequest::RequestIdType requestId);
at::Tensor getCopyIndex(
std::vector<LlmRequest::RequestIdType> const& requestIds, SizeType32 numContext, SizeType32 beamWidth);
private:
std::unordered_map<LlmRequest::RequestIdType, SizeType32> indexMap_;
std::set<SizeType32> freeIndices_;
SizeType32 maxBeamWidth_;
at::Tensor copyIndex_;
};
CUresult copyDiskToDisk(std::vector<Task<DiskAddress, DiskAddress>> tasks, ssize_t numBytes, CUstream stream) noexcept;
CUresult copyDiskToHost(std::vector<Task<MemAddress, DiskAddress>> tasks, ssize_t numBytes, CUstream stream) noexcept;
CUresult copyHostToDisk(std::vector<Task<DiskAddress, MemAddress>> tasks, ssize_t numBytes, CUstream stream) noexcept;
@ -48,4 +94,8 @@ CUresult copyDeviceToHost(
std::vector<Task<MemAddress, MemAddress>> const& tasks, ssize_t numBytes, CUstream stream) noexcept;
CUresult copyDeviceToDevice(
std::vector<Task<MemAddress, MemAddress>> const& tasks, ssize_t numBytes, CUstream stream) noexcept;
void copyBatchBlockOffsetsToDevice(
ITensor const& input, ITensor& output, ITensor const& copyIndex, bool copyVIdx, CUstream stream) noexcept;
} // namespace tensorrt_llm::batch_manager::kv_cache_manager_v2

View File

@ -17,14 +17,32 @@
#include "kvCacheManagerV2Utils.h"
#include "tensorrt_llm/batch_manager/kvCacheManagerV2Utils.h"
#include "tensorrt_llm/nanobind/common/customCasters.h"
#include "tensorrt_llm/runtime/iTensor.h"
#include "tensorrt_llm/runtime/torchView.h"
#include <ATen/ATen.h>
#include <nanobind/nanobind.h>
#include <nanobind/stl/optional.h>
#include <nanobind/stl/vector.h>
#include <torch/extension.h>
namespace tr = tensorrt_llm::runtime;
namespace nb = nanobind;
using SizeType32 = tensorrt_llm::runtime::SizeType32;
namespace tensorrt_llm::batch_manager::kv_cache_manager_v2
{
std::optional<tensorrt_llm::runtime::ITensor::UniquePtr> from_torch(std::optional<at::Tensor> torchPtr)
{
if (torchPtr)
{
return tr::TorchView::of(torchPtr.value());
}
return std::nullopt;
}
void KVCacheManagerV2UtilsBindings::initBindings(nb::module_& module)
{
// Bind DiskAddress struct
@ -54,6 +72,13 @@ void KVCacheManagerV2UtilsBindings::initBindings(nb::module_& module)
.def_rw("dst", &Task<MemAddress, MemAddress>::dst)
.def_rw("src", &Task<MemAddress, MemAddress>::src);
nb::class_<IndexMapper>(module, "IndexMapper")
.def(nb::init<SizeType32, SizeType32>(), nb::arg("max_batch_size"), nb::arg("max_beam_width"))
.def("add_new_sequence", &IndexMapper::addNewSequence)
.def("get_index", &IndexMapper::getIndex)
.def("remove_sequence", &IndexMapper::removeSequence)
.def("get_copy_index", &IndexMapper::getCopyIndex);
// Bind copy functions
module.def(
"copy_disk_to_disk",
@ -103,6 +128,22 @@ void KVCacheManagerV2UtilsBindings::initBindings(nb::module_& module)
{ return copyDeviceToDevice(tasks, numBytes, reinterpret_cast<CUstream>(stream)); },
nb::arg("tasks"), nb::arg("num_bytes"), nb::arg("stream"), nb::call_guard<nb::gil_scoped_release>(),
"Copy data from device to device using CUDA kernels");
module.def(
"copy_batch_block_offsets_to_device",
[](at::Tensor input, at::Tensor output, at::Tensor copyIndex, bool copyVIdx, uintptr_t stream)
{
auto _input = from_torch(input);
auto _output = from_torch(output);
auto _copyIndex = from_torch(copyIndex);
TLLM_CHECK_WITH_INFO(_input.has_value(), "Invalid input tensor.");
TLLM_CHECK_WITH_INFO(_output.has_value(), "Invalid output tensor.");
TLLM_CHECK_WITH_INFO(_copyIndex.has_value(), "Invalid copy index tensor.");
copyBatchBlockOffsetsToDevice(*(_input.value()), *(_output.value()), *(_copyIndex.value()), copyVIdx,
reinterpret_cast<CUstream>(stream));
},
nb::arg("input"), nb::arg("output"), nb::arg("copy_index"), nb::arg("copy_v_idx"), nb::arg("stream"),
nb::call_guard<nb::gil_scoped_release>(), "Copy batch block indices to device");
}
} // namespace tensorrt_llm::batch_manager::kv_cache_manager_v2
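For illustration, a hedged Python sketch of calling the new copy_batch_block_offsets_to_device binding the way resource_manager.py does later in this commit (pinned host source, device destination, num_blocks_per_seq padded to a multiple of 4); the shapes are illustrative only:

import torch
from tensorrt_llm.bindings.internal.batch_manager.kv_cache_manager_v2_utils import (
    IndexMapper, copy_batch_block_offsets_to_device)

num_pools, max_seqs, blocks_per_seq = 1, 4, 8   # blocks_per_seq must be a multiple of 4
host_offsets = torch.zeros(num_pools, max_seqs, 2, blocks_per_seq,
                           dtype=torch.int32, pin_memory=True)
device_offsets = torch.empty(num_pools, max_seqs, 2, blocks_per_seq,
                             dtype=torch.int32, device="cuda")

mapper = IndexMapper(max_batch_size=max_seqs, max_beam_width=1)
mapper.add_new_sequence(0)                      # request id 0 gets a slot
copy_idx = mapper.get_copy_index([0], 1, 1)     # one context request, beam width 1

copy_batch_block_offsets_to_device(
    host_offsets, device_offsets, copy_idx,
    False,                                      # copy_v_idx=False: V offsets become K offsets + 1
    torch.cuda.current_stream().cuda_stream)
torch.cuda.current_stream().synchronize()       # the copy is asynchronous on the given stream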

View File

@ -63,15 +63,15 @@ void initBindings(nb::module_& m)
new (&self) tle::DecodingMode(nb::cast<tle::DecodingMode::UnderlyingType>(state[0]));
};
nb::class_<tle::DecodingMode>(m, "DecodingMode")
.def("Auto", &tle::DecodingMode::Auto)
.def("TopK", &tle::DecodingMode::TopK)
.def("TopP", &tle::DecodingMode::TopP)
.def("TopKTopP", &tle::DecodingMode::TopKTopP)
.def("BeamSearch", &tle::DecodingMode::BeamSearch)
.def("Medusa", &tle::DecodingMode::Medusa)
.def("Lookahead", &tle::DecodingMode::Lookahead)
.def("ExplicitDraftTokens", &tle::DecodingMode::ExplicitDraftTokens)
.def("Eagle", &tle::DecodingMode::Eagle)
.def_static("Auto", &tle::DecodingMode::Auto)
.def_static("TopK", &tle::DecodingMode::TopK)
.def_static("TopP", &tle::DecodingMode::TopP)
.def_static("TopKTopP", &tle::DecodingMode::TopKTopP)
.def_static("BeamSearch", &tle::DecodingMode::BeamSearch)
.def_static("Medusa", &tle::DecodingMode::Medusa)
.def_static("Lookahead", &tle::DecodingMode::Lookahead)
.def_static("ExplicitDraftTokens", &tle::DecodingMode::ExplicitDraftTokens)
.def_static("Eagle", &tle::DecodingMode::Eagle)
.def("isAuto", &tle::DecodingMode::isAuto)
.def("isTopK", &tle::DecodingMode::isTopK)
.def("isTopP", &tle::DecodingMode::isTopP)

View File

@ -109,6 +109,12 @@ def add_llm_args(parser):
parser.add_argument('--log_kv_cache_events',
default=False,
action='store_true')
parser.add_argument(
'--use_kv_cache_manager_v2',
default=False,
action='store_true',
help='Use KVCacheManagerV2 for KV cache management (PyTorch backend).',
)
# Runtime
parser.add_argument('--disable_overlap_scheduler',
@ -214,6 +220,7 @@ def setup_llm(args, **kwargs):
free_gpu_memory_fraction=args.kv_cache_fraction,
dtype=args.kv_cache_dtype,
tokens_per_block=args.tokens_per_block,
use_kv_cache_manager_v2=args.use_kv_cache_manager_v2,
mamba_ssm_cache_dtype=args.mamba_ssm_cache_dtype,
event_buffer_max_size=1024 if args.log_kv_cache_events else 0)

View File

@ -21,7 +21,7 @@ from tensorrt_llm.models.modeling_utils import QuantConfig
from ..memory_buffer_utils import Buffers
from ..metadata import KVCacheParams
from ..pyexecutor.resource_manager import KVCacheManager
from ..pyexecutor.resource_manager import KVCacheManager, KVCacheManagerV2
from ..utils import get_model_extra_attrs
try:
@ -63,7 +63,7 @@ class AttentionMetadata:
# The max number of sequences in a single batch.
max_num_sequences: Optional[int] = None
# The KV cache manager.
kv_cache_manager: KVCacheManager
kv_cache_manager: Union[KVCacheManager, KVCacheManagerV2]
mapping: Optional[Mapping] = None
enable_flash_mla: bool = False

View File

@ -34,12 +34,14 @@ from .llm_request import ExecutorResponse
from .mamba_cache_manager import MambaHybridCacheManager
from .model_engine import PyTorchModelEngine
from .py_executor import PyExecutor
from .resource_manager import (KVCacheManager, PeftCacheManager,
ResourceManager, ResourceManagerType)
from .resource_manager import (KVCacheManager, KVCacheManagerV2,
PeftCacheManager, ResourceManager,
ResourceManagerType)
from .sampler import (EarlyStopSampler, EarlyStopWithMMResult, TorchSampler,
TRTLLMSampler)
from .scheduler import (BindCapacityScheduler, BindMicroBatchScheduler,
SimpleScheduler, SimpleUnifiedScheduler)
KVCacheV2DummyScheduler, SimpleScheduler,
SimpleUnifiedScheduler)
from .seq_slot_manager import SeqSlotManager
GB = 1 << 30
@ -99,6 +101,8 @@ class KvCacheCreator:
self._kv_cache_manager_cls = get_kv_cache_manager_cls(
model_engine.model.model_config)
self._execution_stream = execution_stream
if self._kv_cache_manager_cls == KVCacheManager and kv_cache_config.use_kv_cache_manager_v2:
self._kv_cache_manager_cls = KVCacheManagerV2
def _get_kv_size_per_token(self):
model_config = self._model_engine.model.model_config
@ -583,6 +587,7 @@ def _create_kv_cache_manager(
mapping=mapping,
dtype=kv_cache_dtype,
spec_config=spec_config,
vocab_size=config.vocab_size,
max_beam_width=max_beam_width,
is_draft=model_engine.is_draft_model,
kv_connector_manager=kv_connector_manager
@ -704,6 +709,7 @@ def _create_kv_cache_manager(
mapping=mapping,
dtype=kv_cache_dtype,
spec_config=spec_config,
vocab_size=config.vocab_size,
max_num_tokens=max_num_tokens,
model_config=binding_model_config,
max_beam_width=max_beam_width,
@ -855,7 +861,8 @@ def create_py_executor_instance(
scheduler_capacity += 1
use_python_scheduler = os.getenv("TLLM_USE_PYTHON_SCHEDULER", "0") == "1"
if use_python_scheduler:
if use_python_scheduler and not isinstance(kv_cache_manager,
KVCacheManagerV2):
scheduler = SimpleUnifiedScheduler(
max_batch_size=max_batch_size,
max_num_tokens=max_num_tokens,
@ -868,12 +875,19 @@ def create_py_executor_instance(
two_step_lookahead=mapping.has_pp(),
scheduler_capacity=scheduler_capacity)
else:
capacity_scheduler = BindCapacityScheduler(
scheduler_capacity,
kv_cache_manager.impl if kv_cache_manager is not None else None,
peft_cache_manager.impl if peft_cache_manager is not None else None,
scheduler_config.capacity_scheduler_policy,
two_step_lookahead=mapping.has_pp())
if isinstance(kv_cache_manager, KVCacheManagerV2):
capacity_scheduler = KVCacheV2DummyScheduler(
scheduler_capacity,
kv_cache_manager if kv_cache_manager is not None else None)
else:
capacity_scheduler = BindCapacityScheduler(
scheduler_capacity,
kv_cache_manager.impl if kv_cache_manager is not None else None,
peft_cache_manager.impl
if peft_cache_manager is not None else None,
scheduler_config.capacity_scheduler_policy,
two_step_lookahead=mapping.has_pp())
mb_scheduler = BindMicroBatchScheduler(max_batch_size, max_num_tokens,
ctx_chunk_config)
scheduler = SimpleScheduler(capacity_scheduler, mb_scheduler)

View File

@ -434,17 +434,17 @@ class CUDAGraphRunner:
# This is not strictly required, but we should probably
# respect the requirement just in case that changes in the future.
if self.padding_dummy_request is None:
available_blocks = kv_cache_manager.get_num_free_blocks()
# No padding if not enough KV cache space
if available_blocks < 1:
return 0
self.padding_dummy_request = kv_cache_manager.add_dummy_requests(
[CUDA_GRAPH_DUMMY_REQUEST_ID],
is_gen=True,
max_num_draft_tokens=runtime_draft_len,
use_mrope=self.config.use_mrope,
max_beam_width=self.config.max_beam_width)[0]
max_beam_width=self.config.max_beam_width)
if self.padding_dummy_request is None:
return 0
else:
self.padding_dummy_request = self.padding_dummy_request[0]
self.padding_dummy_request.is_cuda_graph_dummy = True
spec_res_mgr = resource_manager.get_resource_manager(
ResourceManagerType.SPEC_RESOURCE_MANAGER)

View File

@ -8,7 +8,7 @@ import os
import weakref
from abc import ABC, abstractmethod
from contextlib import contextmanager
from typing import Any, Callable, Dict, List, Optional, Tuple
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
import torch
import torch._dynamo.config
@ -64,8 +64,8 @@ from .layerwise_nvtx_marker import LayerwiseNvtxMarker
from .llm_request import LlmRequest, get_draft_token_length
from .model_loader import ModelLoader, _construct_checkpoint_loader
from .resource_manager import (BaseResourceManager, KVCacheManager,
PeftCacheManager, ResourceManager,
ResourceManagerType)
KVCacheManagerV2, PeftCacheManager,
ResourceManager, ResourceManagerType)
from .sampler import SampleStateTensors
from .scheduler import ScheduledRequests
@ -668,8 +668,8 @@ class PyTorchModelEngine(ModelEngine):
self.kv_cache_manager_key)
curr_max_num_tokens = min(
kv_cache_manager.get_num_available_tokens(
self.original_max_draft_len), self.max_num_tokens,
self.batch_size * (self.max_seq_len - 1))
max_num_draft_tokens=self.original_max_draft_len),
self.max_num_tokens, self.batch_size * (self.max_seq_len - 1))
max_batch_size = min(
self.batch_size,
curr_max_num_tokens // (1 + self.runtime_draft_len))
@ -720,8 +720,8 @@ class PyTorchModelEngine(ModelEngine):
self.kv_cache_manager_key)
curr_max_num_tokens = min(
kv_cache_manager.get_num_available_tokens(
self.original_max_draft_len), self.max_num_tokens,
self.batch_size * (self.max_seq_len - 1))
max_num_draft_tokens=self.original_max_draft_len),
self.max_num_tokens, self.batch_size * (self.max_seq_len - 1))
cache_path = os.environ.get("TLLM_AUTOTUNER_CACHE_PATH", None)
with self.no_cuda_graph(), autotune(cache_path=cache_path):
@ -945,7 +945,7 @@ class PyTorchModelEngine(ModelEngine):
ResourceManagerType.SPEC_RESOURCE_MANAGER)
available_tokens = kv_cache_manager.get_num_available_tokens(
self.runtime_draft_len)
max_num_draft_tokens=self.runtime_draft_len)
available_blocks = kv_cache_manager.get_num_free_blocks()
if num_tokens > self.max_num_tokens or num_tokens > available_tokens:
return None
@ -998,7 +998,8 @@ class PyTorchModelEngine(ModelEngine):
num_left_over_tokens /
kv_cache_manager.tokens_per_block) + num_gen_requests
if blocks_to_use > available_blocks:
if blocks_to_use > available_blocks and isinstance(
kv_cache_manager, KVCacheManager):
return None
if num_ctx_tokens > 0:
@ -1014,6 +1015,9 @@ class PyTorchModelEngine(ModelEngine):
use_mrope=self.use_mrope,
num_extra_decoding_steps=num_extra_decoding_steps)
if ctx_requests is None:
return None
if spec_resource_manager is not None:
spec_resource_manager.add_dummy_requests(
request_ids=list(range(num_ctx_requests)))
@ -1029,6 +1033,12 @@ class PyTorchModelEngine(ModelEngine):
use_mrope=self.use_mrope,
max_beam_width=self.max_beam_width,
num_extra_decoding_steps=num_extra_decoding_steps)
if gen_requests is None:
for r in ctx_requests:
kv_cache_manager.free_resources(r)
return None
if spec_resource_manager is not None:
spec_resource_manager.add_dummy_requests(request_ids=list(
range(num_ctx_requests, num_ctx_requests +
@ -1069,7 +1079,11 @@ class PyTorchModelEngine(ModelEngine):
max_beam_width=self.max_beam_width,
num_extra_decoding_steps=num_extra_decoding_steps)
available_tokens = kv_cache_manager.get_num_available_tokens(draft_len)
if requests is None:
return None
available_tokens = kv_cache_manager.get_num_available_tokens(
batch_size=batch_size, max_num_draft_tokens=draft_len)
# Add one dummy request with the maximum possible sequence length.
max_seq_len = min(
@ -1098,7 +1112,14 @@ class PyTorchModelEngine(ModelEngine):
max_num_draft_tokens=draft_len,
use_mrope=self.use_mrope,
max_beam_width=self.max_beam_width,
num_extra_decoding_steps=num_extra_decoding_steps)[0]
num_extra_decoding_steps=num_extra_decoding_steps)
if max_seq_len_request is None:
for r in requests:
kv_cache_manager.free_resources(r)
return None
else:
max_seq_len_request = max_seq_len_request[0]
# Insert the longest request first to simulate padding for the CUDA graph.
requests.insert(0, max_seq_len_request)
@ -1122,7 +1143,8 @@ class PyTorchModelEngine(ModelEngine):
req.py_is_first_draft = True
req.py_draft_tokens = []
def _set_up_attn_metadata(self, kv_cache_manager: KVCacheManager):
def _set_up_attn_metadata(self, kv_cache_manager: Union[KVCacheManager,
KVCacheManagerV2]):
enable_context_mla_with_cached_kv = is_mla(
self.model.model_config.pretrained_config) and (
self.attn_runtime_features.cache_reuse
@ -1529,7 +1551,7 @@ class PyTorchModelEngine(ModelEngine):
def _apply_incremental_update(
self,
scheduled_requests: ScheduledRequests,
kv_cache_manager: KVCacheManager,
kv_cache_manager: Union[KVCacheManager, KVCacheManagerV2],
attn_metadata: AttentionMetadata,
spec_metadata: Optional[SpecMetadata] = None,
new_tensors_device: Optional[SampleStateTensors] = None,
@ -1961,7 +1983,7 @@ class PyTorchModelEngine(ModelEngine):
def _prepare_tp_inputs(
self,
scheduled_requests: ScheduledRequests,
kv_cache_manager: KVCacheManager,
kv_cache_manager: Union[KVCacheManager, KVCacheManagerV2],
attn_metadata: AttentionMetadata,
spec_metadata: Optional[SpecMetadata] = None,
new_tensors_device: Optional[SampleStateTensors] = None,
@ -3306,7 +3328,7 @@ class PyTorchModelEngine(ModelEngine):
def _prepare_inputs(
self,
scheduled_requests: ScheduledRequests,
kv_cache_manager: KVCacheManager,
kv_cache_manager: Union[KVCacheManager, KVCacheManagerV2],
attn_metadata: AttentionMetadata,
spec_metadata: Optional[SpecMetadata] = None,
new_tensors_device: Optional[SampleStateTensors] = None,

View File

@ -3,20 +3,40 @@ import enum
import math
from abc import ABC, abstractmethod
from collections import OrderedDict, defaultdict, deque
from typing import (TYPE_CHECKING, Dict, Iterable, List, Optional, Set, Tuple,
Union)
from typing import (TYPE_CHECKING, Dict, Iterable, List, Optional, Sequence,
Set, Tuple, Union)
import numpy as np
import torch
import tensorrt_llm
import tensorrt_llm.bindings
from tensorrt_llm._torch.distributed.communicator import Distributed, ReduceOp
from tensorrt_llm._utils import (TensorWrapper, convert_to_torch_tensor,
get_size_in_bytes)
from tensorrt_llm.bindings.internal.batch_manager.kv_cache_manager_v2_utils import (
IndexMapper, copy_batch_block_offsets_to_device)
from tensorrt_llm.bindings.internal.runtime import TaskLayerModuleConfig
from tensorrt_llm.llmapi.llm_args import (KvCacheConfig, PeftCacheConfig,
PybindMirror)
from tensorrt_llm.lora_helper import LoraConfig
from tensorrt_llm.lora_manager import LoraManager, LoraModelConfig
from tensorrt_llm.math_utils import ceil_div
from tensorrt_llm.runtime import ModelConfig as ModelConfigPython
from tensorrt_llm.runtime.kv_cache_manager_v2 import (AttentionLayerConfig,
BufferConfig,
GpuCacheTierConfig,
HostCacheTierConfig)
from tensorrt_llm.runtime.kv_cache_manager_v2 import \
KVCacheManager as KVCacheManagerPy
from tensorrt_llm.runtime.kv_cache_manager_v2 import \
KVCacheManagerConfig as KVCacheManagerConfigPy
from tensorrt_llm.runtime.kv_cache_manager_v2 import (LayerId, TokenIdExt,
_KVCache)
from tensorrt_llm.runtime.kv_cache_manager_v2._common import GPU_LEVEL
from tensorrt_llm.runtime.kv_cache_manager_v2._config import DataRole
from tensorrt_llm.runtime.kv_cache_manager_v2._utils import (exact_div,
typed_range)
from tensorrt_llm.sampling_params import SamplingParams
from ..._utils import (binding_to_str_dtype, get_size_in_bytes, mpi_rank,
@ -56,6 +76,14 @@ class ResourceManagerType(enum.Enum):
SPEC_RESOURCE_MANAGER = "SPEC_RESOURCE_MANAGER"
class Role:
KEY = DataRole("key")
VALUE = DataRole("value")
KEY_BLOCK_QUANT = DataRole("key_block_quant")
VALUE_BLOCK_QUANT = DataRole("value_block_quant")
ALL = DataRole("all")
def compute_page_count(token_count: int, tokens_per_page: int) -> int:
return (token_count + tokens_per_page) // tokens_per_page
@ -523,6 +551,11 @@ class KVCacheManager(BaseResourceManager):
# occur.
num_extra_decoding_steps: int = 0,
):
available_blocks = self.get_num_free_blocks()
# No padding if not enough KV cache space
if available_blocks < 1:
return None
beam_width = max_beam_width
requests = []
for i, req_id in enumerate(request_ids):
@ -870,12 +903,16 @@ class KVCacheManager(BaseResourceManager):
def get_batch_cache_indices(
self,
request_ids: List[int],
window_size: Optional[int] = None,
layer_idx: Optional[int] = None,
) -> List[List[int]]:
if window_size is None:
if layer_idx is None:
if len(self.max_attention_window_vec) > 1:
raise ValueError("window_size must be provided for VSWA")
raise ValueError("layer_idx must be provided for VSWA")
window_size = self.max_attention_window_vec[0]
else:
layer_offset = self.layer_offsets[layer_idx]
window_size = self.max_attention_window_vec[layer_offset % len(
self.max_attention_window_vec)]
result = self.impl.get_batch_cache_block_ids(request_ids, window_size)
for i in range(len(result)):
@ -896,7 +933,9 @@ class KVCacheManager(BaseResourceManager):
def get_num_kv_blocks(self, num_tokens: int) -> int:
return (num_tokens + self.tokens_per_block - 1) // self.tokens_per_block
def get_num_available_tokens(self, max_num_draft_tokens: int = 0) -> int:
def get_num_available_tokens(self,
max_num_draft_tokens: int = 0,
**kwargs) -> int:
return (self.get_num_free_blocks() * self.tokens_per_block -
self.num_extra_kv_tokens - max_num_draft_tokens)
@ -1326,6 +1365,722 @@ class KVCacheManager(BaseResourceManager):
self.impl.reset_reuse_state()
class KVCacheManagerV2(BaseResourceManager):
def __init__(
self,
kv_cache_config: KvCacheConfig,
kv_cache_type: CacheTypeCpp,
*,
num_layers: int,
num_kv_heads: Union[int, List[Optional[int]]],
head_dim: int,
tokens_per_block: int,
# Note that max_seq_len is not necessarily equal to kv_cache_config.num_tokens.
# It's derived from the model's BuildConfig for consistency with the C++ backend.
max_seq_len: int,
max_batch_size: int,
mapping: Mapping,
dtype: DataType = DataType.HALF,
spec_config=None,
layer_mask: Optional[List[bool]] = None,
vocab_size: int = None,
max_num_tokens: int = 8192,
model_config: Optional[ModelConfigCpp] = None,
max_beam_width: int = 1,
is_draft: bool = False,
kv_connector_manager: Optional[KvCacheConnectorManager] = None,
**kwargs,
) -> None:
self.mapping = mapping
self.dtype = dtype
assert self.dtype != DataType.NVFP4, "NVFP4 is not supported for KVCacheManagerV2"
assert kv_connector_manager is None, "kv_connector_manager is not supported for KVCacheManagerV2"
assert max_beam_width == 1, "max_beam_width must be 1 for KVCacheManagerV2"
self.kv_cache_type = kv_cache_type
self.pp_layers, self.num_layers = get_pp_layers(
num_layers,
mapping,
spec_config=spec_config,
layer_mask=layer_mask,
)
self.is_draft = is_draft
self.num_local_layers = len(self.pp_layers)
self.layer_offsets = {
idx: offset
for offset, idx in enumerate(self.pp_layers)
}
self.max_beam_width = max_beam_width
tp_size = mapping.tp_size
if mapping.enable_attention_dp:
tp_size = 1
self.num_kv_heads = num_kv_heads
self.head_dim = head_dim
self.tokens_per_block = tokens_per_block
self.max_seq_len = max_seq_len
self.max_batch_size = max_batch_size
self.kv_factor = 1 if kv_cache_type == CacheTypeCpp.SELFKONLY else 2
from ..speculative import get_num_extra_kv_tokens
self.num_extra_kv_tokens = get_num_extra_kv_tokens(spec_config)
self.event_buffer_max_size = kv_cache_config.event_buffer_max_size
assert self.event_buffer_max_size == 0, "event_buffer_max_size must be 0"
# Determine max_attention_window_vec
if kv_cache_config.max_attention_window is not None:
self.max_attention_window_vec = kv_cache_config.max_attention_window.copy(
) # Make a copy to avoid modifying original
# Clamp all window sizes to max_seq_len before calculating the
# number of KV cache blocks. This prevents the KV cache pool from
# being skewed by the largest window values.
self.max_attention_window_vec = [
min(max_seq_len, w) for w in self.max_attention_window_vec
]
self.max_attention_window_vec = [
None if w == max_seq_len else w
for w in self.max_attention_window_vec
]
else:
self.max_attention_window_vec = [None]
if isinstance(num_kv_heads, int):
self.num_kv_heads_per_layer = [
(num_kv_heads + tp_size - 1) // tp_size
for _ in range(self.num_local_layers)
]
self.total_num_kv_heads_per_layer = [
(num_kv_heads + tp_size - 1) // tp_size
for _ in range(self.num_layers)
]
else:
assert len(num_kv_heads) == self.num_layers
def append_to_kv_heads_per_layer(num_kv_heads_per_layer: List[int],
kv_head: Optional[int]):
if kv_head is not None:
num_kv_heads_per_layer.append(
(kv_head + tp_size - 1) // tp_size)
else:
num_kv_heads_per_layer.append(0)
self.num_kv_heads_per_layer = []
if self.num_local_layers > 0:
for i in self.pp_layers:
kv_head = num_kv_heads[i]
append_to_kv_heads_per_layer(self.num_kv_heads_per_layer,
kv_head)
self.total_num_kv_heads_per_layer = []
for i in range(self.num_layers):
kv_head = num_kv_heads[i]
append_to_kv_heads_per_layer(self.total_num_kv_heads_per_layer,
kv_head)
self.is_vswa = len(set(self.max_attention_window_vec)) > 1
self.kv_connector_manager = kv_connector_manager
quota = float('inf')
if kv_cache_config.max_tokens is not None:
quota = int(
ceil_div(
kv_cache_config.max_tokens *
self.get_cache_bytes_per_token(),
kv_cache_config.max_util_for_resume))
if kv_cache_config.free_gpu_memory_fraction is not None:
logger.warning(
f"Both max_tokens ({kv_cache_config.max_tokens}) and free_gpu_memory_fraction ({kv_cache_config.free_gpu_memory_fraction}) are set; the smaller resulting value will be used."
)
if kv_cache_config.max_gpu_total_bytes is not None and kv_cache_config.max_gpu_total_bytes > 0:
if quota > int(kv_cache_config.max_gpu_total_bytes):
logger.warning(
f"max_gpu_total_bytes {kv_cache_config.max_gpu_total_bytes / (1 << 30)}GiB is smaller than the calculated quota {quota / (1 << 30)}GiB; clamping the quota to {kv_cache_config.max_gpu_total_bytes / (1 << 30)}GiB"
)
quota = min(quota, int(kv_cache_config.max_gpu_total_bytes))
assert quota != float(
'inf'
), "Quota not set. Check kv_cache_config.max_tokens or kv_cache_config.max_gpu_total_bytes"
logger.info(
f"KV cache manager v2 device quota set to {quota / (1 << 30)}GiB")
cache_tiers = [GpuCacheTierConfig(quota=quota)]
if kv_cache_config.host_cache_size is not None and kv_cache_config.host_cache_size > 0:
cache_tiers.append(
HostCacheTierConfig(quota=kv_cache_config.host_cache_size))
logger.info(
f"KV cache manager v2 host cache quota set to {kv_cache_config.host_cache_size / (1 << 30)}GiB"
)
buffer_type = [Role.KEY]
if kv_cache_type != CacheTypeCpp.SELFKONLY:
buffer_type.append(Role.VALUE)
config = KVCacheManagerConfigPy(
tokens_per_block=tokens_per_block,
vocab_size=vocab_size,
cache_tiers=cache_tiers,
max_util_for_resume=kv_cache_config.max_util_for_resume,
layers=[
AttentionLayerConfig(
layer_id=layer_id,
buffers=[
BufferConfig(
role=role,
size=self.get_cache_bytes_per_token(
local_layer_idx=layer_id, data_role=role) *
tokens_per_block,
) for role in buffer_type
],
sliding_window_size=self.max_attention_window_vec[
layer_id % len(self.max_attention_window_vec)],
num_sink_tokens=None,
) for layer_id in typed_range(LayerId(self.num_local_layers))
],
)
self.kv_cache_manager_py_config = config
self.impl = KVCacheManagerPy(config)
self.num_pools = len(self.impl.layer_grouping)
self.layer_to_pool_mapping_dict: dict[int, int] = {
layer_id: self.impl.get_layer_group_id(layer_id)
for layer_id in typed_range(LayerId(self.num_local_layers))
}
self.kv_cache_pool_pointers = torch.tensor([[
self.impl.get_mem_pool_base_address(
self.impl.layer_grouping[pool_id][0], Role.KEY), 0
] for pool_id in range(self.num_pools)],
dtype=torch.int64,
device="cpu",
pin_memory=True)
kv_cache_pool_mapping_list = []
for layer_id in typed_range(LayerId(self.num_local_layers)):
layer_group_id = self.impl.get_layer_group_id(layer_id)
offset = exact_div(
self.impl.get_mem_pool_base_address(layer_id, Role.KEY) -
int(self.kv_cache_pool_pointers[layer_group_id][0]),
self.get_cache_bytes_per_token(layer_id, Role.KEY) *
self.kv_factor * self.tokens_per_block)
kv_cache_pool_mapping_list.append([layer_group_id, offset])
self.kv_cache_pool_mapping = torch.tensor(kv_cache_pool_mapping_list,
dtype=torch.int32,
device="cpu",
pin_memory=True)
# Pad max_blocks_per_seq to next multiple of 4 for copy_block_offsets kernel
self.max_blocks_per_seq = (max_seq_len + tokens_per_block -
1) // tokens_per_block
if self.max_blocks_per_seq % 4 != 0:
self.max_blocks_per_seq = ((self.max_blocks_per_seq + 3) // 4) * 4
self.kv_cache_map: dict[int, _KVCache] = {}
max_num_tokens = self.get_num_available_tokens()
if max_seq_len > max_num_tokens:
logger.warning(
f"max_seq_len {max_seq_len} is greater than the max_num_tokens {max_num_tokens} that the KV cache manager can allocate; setting max_seq_len to {max_num_tokens}"
)
self.max_seq_len = max_num_tokens
self.enable_block_reuse = kv_cache_config.enable_block_reuse
# Plus 1 for cuda graph dummy request
self.index_mapper = IndexMapper(max_batch_size + 1, max_beam_width)
self.host_kv_cache_block_offsets = torch.empty(
self.num_pools,
(max_batch_size + 1) * max_beam_width,
2, # key and value
self.max_blocks_per_seq,
dtype=torch.int32,
pin_memory=True,
device='cpu')
@property
def blocks_in_primary_pool(self) -> int:
"""
Get the number of blocks in the primary pool.
"""
return self.impl.get_page_index_upper_bound(0, Role.KEY)
def get_buffers(self,
layer_idx: int,
kv_layout: str = "NHD") -> Optional[torch.Tensor]:
layer_offset = self.layer_offsets[layer_idx]
addr_key = self.impl.get_mem_pool_base_address(layer_offset, Role.KEY)
if self.kv_cache_type != CacheTypeCpp.SELFKONLY:
addr_value = self.impl.get_mem_pool_base_address(
layer_offset, Role.VALUE)
page_size_key = self.impl.get_page_stride(layer_offset, Role.KEY)
page_size_value = self.impl.get_page_stride(layer_offset,
Role.VALUE)
assert addr_key + page_size_value == addr_value and page_size_key == page_size_value
assert kv_layout in ["NHD",
"HND"], f"Unsupported kv_layout: {kv_layout}"
if kv_layout == "NHD":
shape = [
self.impl.get_page_index_upper_bound(layer_offset, Role.KEY) //
self.kv_factor,
self.kv_factor,
self.tokens_per_block,
self.num_kv_heads_per_layer[layer_offset],
self.head_dim,
]
else:
shape = [
self.impl.get_page_index_upper_bound(layer_offset, Role.KEY) //
self.kv_factor,
self.kv_factor,
self.num_kv_heads_per_layer[layer_offset],
self.tokens_per_block,
self.head_dim,
]
return convert_to_torch_tensor(
TensorWrapper(
addr_key,
self.dtype,
shape,
))
def get_num_available_tokens(self,
*,
batch_size: int = 1,
max_num_draft_tokens: int = 0) -> int:
if max_num_draft_tokens > 0:
raise ValueError(
"max_num_draft_tokens is not supported for KVCacheManagerV2")
return int(
self.impl.clamp_max_seq_len_for_mem(batch_size) *
self.kv_cache_manager_py_config.max_util_for_resume
) - self.num_extra_kv_tokens - max_num_draft_tokens
def get_num_free_blocks(self) -> int:
# NOTE: This method returns the number of blocks in the primary pool, not the number of FREE blocks.
# However, since this function is only called when the KV cache manager is empty, it is safe to do so.
assert len(
self.kv_cache_map
) == 0, "get_num_free_blocks is only used when the kv cache manager is empty"
max_num_pages = max([
self.impl.get_page_index_upper_bound(layer_id, Role.KEY)
for layer_id in typed_range(LayerId(self.num_local_layers))
])
return max_num_pages // self.kv_factor
@nvtx_range("prepare_resources_kv_cache_manager_v2")
def prepare_resources(self, scheduled_batch: ScheduledRequests):
with request_context(self.is_draft, scheduled_batch):
context_batch = scheduled_batch.context_requests
generation_batch = scheduled_batch.generation_requests
# allocate KV Cache
for req in context_batch:
beam_width = req.sampling_config.beam_width
if 'cp_type' in self.mapping.cp_config and CpType.STAR == self.mapping.cp_config[
'cp_type']:
raise RuntimeError(
"Star attention is not supported for kv cache manager v2"
)
else:
if req.is_first_context_chunk and self._kv_connector_should_add_sequence(
req):
# The last token cannot be recovered, so we don't include it in the input tokens used to look up blocks that can be reused.
kv_cache = self._create_kv_cache(
req.py_request_id, req.lora_task_id,
req.get_tokens(0)[:-1]
if self.enable_block_reuse else None)
assert beam_width == 1, "Currently, KVCacheManagerV2 only supports beam width 1"
if not self.enable_block_reuse:
assert kv_cache.num_committed_tokens == 0
kv_cache.stop_committing()
else:
req.context_current_position = kv_cache.num_committed_tokens
chunk_size = req.context_chunk_size
if req.context_current_position + req.context_chunk_size < req.prompt_len:
floored_end_position = (
req.context_current_position +
req.context_chunk_size
) // self.tokens_per_block * self.tokens_per_block
chunk_size = floored_end_position - req.context_current_position
req.context_chunk_size = min(
chunk_size,
req.prompt_len - req.context_current_position)
success = kv_cache.resume(
torch.cuda.current_stream().cuda_stream)
assert success
kv_cache.resize(req.prompt_len)
if self.kv_connector_manager is not None:
block_ids = self.get_cache_indices(req)
self.kv_connector_manager.update_state_after_alloc(
req, block_ids)
for req in generation_batch:
kv_cache = self.kv_cache_map[req.py_request_id]
kv_cache.resize(kv_cache.capacity + 1)
if self.kv_connector_manager is not None:
self.kv_connector_manager.build_scheduler_output(
scheduled_batch, self)
def _kv_connector_should_add_sequence(self, request: LlmRequest) -> bool:
return self.kv_connector_manager is None or self.kv_connector_manager.should_add_sequence(
request)
def get_kv_cache_stats(self):
class KVCacheStatus:
def __init__(self, allocated_bytes: int):
self.allocated_bytes = allocated_bytes
return KVCacheStatus(allocated_bytes=self.impl.get_quota(GPU_LEVEL))
def add_dummy_requests(
self,
request_ids: List[int],
# Note that token_nums should be past_kv_len + input_len (without
# spec decoding). The draft tokens will be added in this function,
# so we don't need to take care of it in the caller. When preparing
# token_nums, we should not take the draft tokens into account, so
# don't use the kv_cache_manager.max_seq_len, which includes both
# extra tokens and draft tokens.
token_nums: Optional[List[int]] = None,
is_gen: bool = False,
prepare_resource: bool = True,
max_num_draft_tokens: int = 0,
use_mrope: bool = False,
max_beam_width: int = 1,
num_extra_decoding_steps:
int = 0, # TODO: support num_extra_decoding_steps
):
beam_width = max_beam_width
requests = []
for i, req_id in enumerate(request_ids):
# exact choice of n can be ignored for dummy requests
sampling_params = SamplingParams(n=beam_width,
best_of=beam_width,
use_beam_search=beam_width > 1)
# Here 1+max_num_draft_tokens is used to extend the prompt length to
# a non-zero value to avoid an illegal memory access in the MLA kernel
# during warmup.
token_num = token_nums[
i] if token_nums is not None else 1 + max_num_draft_tokens
# TODO: support cross attention
encoder_input_tokens = None
# Using 1 instead of 0 prevents NaN during warmup in e.g. Deepseek
input_tokens = [1 for _ in range(token_num)]
req = LlmRequest(request_id=req_id,
max_new_tokens=1,
input_tokens=input_tokens,
sampling_config=SamplingConfig(
sampling_params._get_sampling_config()),
is_streaming=False,
encoder_input_tokens=encoder_input_tokens)
req.is_dummy_request = True
req.paged_kv_block_ids = []
if prepare_resource:
kv_cache = self._create_kv_cache(req.py_request_id,
req.lora_task_id, input_tokens)
assert kv_cache.num_committed_tokens == 0
success = kv_cache.resume(
torch.cuda.current_stream().cuda_stream)
if not success:
for r in requests:
self.free_resources(r)
self.free_resources(req)
return None
kv_cache.stop_committing()
kv_cache.resize(token_num)
if is_gen:
req.state = LlmRequestState.GENERATION_IN_PROGRESS
req.prompt_len = token_num - 1
req.py_prompt_len = req.prompt_len
# TODO: Planning to get dummy_data from each model. Before that, we need to add a dummy mrope_config to the request here.
if use_mrope:
dummy_mrope_position_ids = torch.arange(
0, token_num, dtype=torch.int32).expand(3, 1, -1).clone()
req.py_multimodal_data = {
"mrope_config": {
"mrope_position_ids": dummy_mrope_position_ids
}
}
if is_gen:
dummy_mrope_position_deltas = torch.zeros(
1, dtype=torch.int32).unsqueeze(0)
req.py_multimodal_data["mrope_config"][
"mrope_position_deltas"] = dummy_mrope_position_deltas
requests.append(req)
return requests
def free_resources(self, request: LlmRequest, pin_on_release: bool = False):
kv_cache = self.kv_cache_map.pop(request.py_request_id)
kv_cache.close()
self.index_mapper.remove_sequence(request.py_request_id)
def get_batch_cache_indices(self,
request_ids: List[int],
layer_id: int = 0) -> List[List[int]]:
return self._get_batch_cache_indices_by_pool_id(
request_ids,
pool_id=self.layer_to_pool_mapping_dict[layer_id],
is_kv_aggregate=True)
def _get_batch_cache_indices_by_pool_id(
self,
request_ids: List[int],
*,
pool_id: int = 0,
is_kv_aggregate: bool = True) -> List[List[int]]:
if is_kv_aggregate:
# Div by kv_factor to index kv cache with size [num_blocks, kv_factor, tokens_per_block, num_kv_heads, head_dim]
div_factor = self.kv_factor
else:
div_factor = 1
return [
(np.asarray(self.kv_cache_map[req_id].get_page_indices(pool_id)) //
div_factor).tolist() for req_id in request_ids
]
def get_cache_bytes_per_token(
self,
local_layer_idx: Optional[int] = None,
data_role: Role = Role.ALL): # None means all layers/data_roles
if self.dtype not in (
DataType.FP8,
DataType.HALF,
DataType.BF16,
DataType.FLOAT,
DataType.NVFP4,
):
raise ValueError(f"Cannot support {self.dtype} KV cache.")
if data_role == Role.ALL:
kv_factor = self.kv_factor
elif data_role in [
Role.KEY, Role.VALUE, Role.KEY_BLOCK_QUANT,
Role.VALUE_BLOCK_QUANT
]:
if data_role in [Role.KEY_BLOCK_QUANT, Role.VALUE_BLOCK_QUANT]:
assert self.dtype == DataType.NVFP4, "NVFP4 is the only supported dtype for block quant data roles"
if data_role == Role.VALUE:
assert self.kv_cache_type != CacheTypeCpp.SELFKONLY, "The VALUE data role is not available for the SELFKONLY cache type"
kv_factor = 1
else:
raise ValueError(f"Invalid data role: {data_role}")
if local_layer_idx is None:
cache_size_per_token = (kv_factor *
sum(self.num_kv_heads_per_layer) *
self.head_dim)
else:
cache_size_per_token = (
kv_factor * self.num_kv_heads_per_layer[local_layer_idx] *
self.head_dim)
cache_size_bytes_per_token = get_size_in_bytes(cache_size_per_token,
self.dtype)
if data_role in [Role.KEY, Role.VALUE]:
return cache_size_bytes_per_token
quant_size_per_token = 0
if self.dtype == DataType.NVFP4:
quant_size_per_token = self.calculate_scaling_factor_size_bytes(
cache_size_per_token,
quant_vector_size=16,
scaling_factor_dtype=DataType.FP8,
)
if data_role in [Role.KEY_BLOCK_QUANT, Role.VALUE_BLOCK_QUANT]:
return quant_size_per_token
return cache_size_bytes_per_token + quant_size_per_token
@staticmethod
def calculate_scaling_factor_size_bytes(
cache_size: int, quant_vector_size: int,
scaling_factor_dtype: DataType) -> int:
assert cache_size % quant_vector_size == 0, "NVFP4 cache size must be divisible by quant vector size"
return get_size_in_bytes(cache_size // quant_vector_size,
scaling_factor_dtype)
def check_invalid_values_in_kv_cache(self,
fill_with_zero: bool = False) -> bool:
some_checks_unavailable = False
has_invalid_values = torch.tensor([False],
dtype=torch.bool,
device=torch.cuda.current_device())
pool_handled = set()
# Handle each layer from start to end to traverse the whole KV cache.
for layer_id in typed_range(LayerId(self.num_local_layers)):
pool_id = self.layer_to_pool_mapping_dict[layer_id]
if pool_id in pool_handled:
continue
buffer = self.get_buffers(layer_id)
# process in chunks of 256 pages to avoid OoM
for i in range(0, buffer.shape[0], 256):
buffer_slice = buffer[i:i + 256]
try:
has_invalid_values.logical_or_(
torch.isnan(buffer_slice).any())
has_invalid_values.logical_or_(
torch.isinf(buffer_slice).any())
except NotImplementedError:
some_checks_unavailable = True
if fill_with_zero:
buffer.zero_()
pool_handled.add(pool_id)
torch.cuda.synchronize()
if some_checks_unavailable:
logger.warning(
"`torch.isnan` or `torch.isinf` is not implemented for the current KV cache dtype; the related checks are skipped"
)
return bool(has_invalid_values)
def shutdown(self):
for kv_cache in self.kv_cache_map.values():
kv_cache.close()
self.kv_cache_map.clear()
self.impl.clear_reusable_blocks()
def get_max_resource_count(self) -> int:
# TODO: implement this
return 1
def get_needed_resource_to_completion(self, request: LlmRequest) -> int:
# TODO: implement this
# context_token_count = request.orig_prompt_len
# num_context_blocks = context_token_count // self.tokens_per_block
# remaining_tokens = context_token_count + request.max_new_tokens - num_context_blocks * self.tokens_per_block
# need_blocks = num_context_blocks + math.ceil(
# remaining_tokens / self.tokens_per_block)
# return need_blocks
return 0
# TODO: refactor get_cache_size_per_token and get_cache_bytes_per_token to use the same logic
@staticmethod
def get_cache_size_per_token(model_config: ModelConfigPython,
mapping: Mapping, **kwargs):
# get kv cache dtype bytes
mem_per_token = 2
quant_config = model_config.quant_config
if quant_config is not None and quant_config.quant_mode.has_fp8_kv_cache(
):
mem_per_token = 1
# get num key value heads
config = model_config.pretrained_config
num_key_value_heads = getattr(config, 'num_key_value_heads',
config.num_attention_heads)
if isinstance(num_key_value_heads, Iterable):
num_key_value_heads = sum(num_key_value_heads) / len(
num_key_value_heads)
# get head dim
mla = hasattr(config, "kv_lora_rank")
if mla:
head_dim = config.kv_lora_rank + config.qk_rope_head_dim
kv_factor = 1
else:
tp_size = 1 if mapping.enable_attention_dp else mapping.tp_size
head_dim = getattr(config, "head_dim", None)
if not isinstance(head_dim, int):
head_dim = config.hidden_size // config.num_attention_heads
head_dim = head_dim * num_key_value_heads // tp_size
kv_factor = 2
# provide at least 1 layer to prevent division by zero cache size
num_attention_layers = max(
len(mapping.pp_layers(model_config.get_num_attention_layers())), 1)
mem_per_token *= num_attention_layers * head_dim
# K and V
mem_per_token *= kv_factor
return mem_per_token
def update_resources(self,
scheduled_batch: ScheduledRequests,
attn_metadata: "AttentionMetadata" = None,
kv_cache_dtype_byte_size: float = None):
for req in scheduled_batch.context_requests:
if req.py_request_id not in self.kv_cache_map:
continue
kv_cache = self.kv_cache_map[req.py_request_id]
if self.enable_block_reuse and not req.is_dummy_request:
if req.context_current_position > kv_cache.num_committed_tokens:
kv_cache.commit(
req.get_tokens(0)[kv_cache.num_committed_tokens:req.
context_current_position])
kv_cache.stop_committing()
else:
kv_cache.resize(None, req.context_current_position)
for req in scheduled_batch.generation_requests:
if req.py_request_id not in self.kv_cache_map:
continue
kv_cache = self.kv_cache_map[req.py_request_id]
kv_cache.resize(None, req.max_beam_num_tokens - 1)
def copy_batch_block_offsets(self, dst_tensor: torch.Tensor,
request_ids: List[int], beam_width: int,
num_contexts: int, num_seqs: int):
assert beam_width == 1, "beam_width must be 1 for KVCacheManagerV2"
copy_idx = self.index_mapper.get_copy_index(request_ids, num_contexts,
beam_width)
assert copy_idx.shape[0] == num_seqs
copy_batch_block_offsets_to_device(
self.host_kv_cache_block_offsets, dst_tensor, copy_idx,
self.kv_cache_type == CacheTypeCpp.SELFKONLY,
torch.cuda.current_stream().cuda_stream)
def _create_kv_cache(self, request_id: int, lora_task_id: int | None,
input_tokens: Sequence[TokenIdExt] | None):
assert request_id not in self.kv_cache_map, f"KV cache for request {request_id} already exists"
kv_cache = self.impl.create_kv_cache(lora_task_id, input_tokens)
self.kv_cache_map[request_id] = kv_cache
index = self.index_mapper.add_new_sequence(request_id)
for i in range(self.max_beam_width):
for pool_idx in range(self.num_pools):
buffer: torch.Tensor = self.host_kv_cache_block_offsets[
pool_idx, index * self.max_beam_width + i, 0]
kv_cache.set_page_index_buf(i, pool_idx,
memoryview(buffer.numpy()))
return kv_cache
class SlotManager:
def __init__(self, max_num_requests: int):

View File

@ -168,6 +168,68 @@ class BindCapacityScheduler(CapacityScheduler):
self.peft_cache_manager)
class KVCacheV2DummyScheduler(CapacityScheduler):
# only schedule requests that have no_schedule_until_state <= state < no_schedule_after_state
no_schedule_until_state = LlmRequestState.CONTEXT_INIT
no_schedule_after_state = LlmRequestState.GENERATION_COMPLETE
def __init__(self, max_num_requests: int, kv_cache_manager):
super(KVCacheV2DummyScheduler, self).__init__()
self.max_num_requests = max_num_requests
self.kv_cache_manager = kv_cache_manager
def schedule_request(
self, active_requests: RequestList
) -> tuple[list[LlmRequest], list[LlmRequest], list[LlmRequest]]:
scheduled_requests = []
scheduled_disagg_gen_init_requests = []
pending_requests = []
reserved_blocks = 0
max_blocks = self.kv_cache_manager.get_max_resource_count()
for request in active_requests:
req_state = request.state
# if request cannot be scheduled yet or request should no longer be scheduled, skip
if not req_state == LlmRequestState.DISAGG_GENERATION_INIT and (
req_state.value < self.no_schedule_until_state.value
or req_state.value >= self.no_schedule_after_state.value):
continue
if len(scheduled_requests
) >= self.max_num_requests or reserved_blocks >= max_blocks:
break
elif req_state == LlmRequestState.GENERATION_IN_PROGRESS or req_state == LlmRequestState.GENERATION_TO_COMPLETE:
scheduled_requests.append(request)
reserved_blocks += self.kv_cache_manager.get_needed_resource_to_completion(
request)
elif req_state == LlmRequestState.DISAGG_GENERATION_INIT:
scheduled_disagg_gen_init_requests.append(request)
reserved_blocks += self.kv_cache_manager.get_needed_resource_to_completion(
request)
else:
pending_requests.append(request)
available_blocks = max_blocks - reserved_blocks
for request in pending_requests:
req_state = request.state
if len(scheduled_requests) >= self.max_num_requests:
break
elif req_state == LlmRequestState.CONTEXT_INIT:
needed_blocks = self.kv_cache_manager.get_needed_resource_to_completion(
request)
if needed_blocks <= available_blocks:
scheduled_requests.append(request)
available_blocks -= needed_blocks
elif needed_blocks > available_blocks:
# If one request fails to be scheduled, break
break
assert len(scheduled_requests) + len(
scheduled_disagg_gen_init_requests) > 0, (
"no pending request can get enough resources to complete; "
"please increase the KV cache pool size.")
return scheduled_requests, scheduled_disagg_gen_init_requests, []
class MicroBatchScheduler(ABC):
@abstractmethod

View File

@ -210,6 +210,12 @@ def binding_to_str_dtype(binding_dtype) -> str:
return ret
def binding_to_torch_dtype(binding_dtype) -> torch.dtype:
ret = _binding_to_str_dtype.get(binding_dtype)
assert ret is not None, f'Unsupported binding dtype: {binding_dtype}'
return str_dtype_to_torch(ret)
def binding_dtype_size(dtype: DataType):
return _binding_dtype_size[dtype]
@ -989,7 +995,7 @@ class TensorWrapper:
def __init__(
self,
data_ptr: int,
dtype: Union[torch.dtype, str, np.dtype, trt.DataType],
dtype: Union[torch.dtype, str, np.dtype, trt.DataType, DataType],
shape: Sequence[int],
strides: Optional[Sequence[int]] = None,
):
@ -1011,7 +1017,8 @@ class TensorWrapper:
return getattr(self, "_shape", None)
@dtype.setter
def dtype(self, dtype: Union[torch.dtype, str, np.dtype, trt.DataType]):
def dtype(self, dtype: Union[torch.dtype, str, np.dtype, trt.DataType,
DataType]):
if isinstance(dtype, torch.dtype):
self._dtype = dtype
elif isinstance(dtype, str):
@ -1020,6 +1027,8 @@ class TensorWrapper:
self._dtype = np_dtype_to_torch(dtype)
elif isinstance(dtype, trt.DataType):
self._dtype = trt_dtype_to_torch(dtype)
elif isinstance(dtype, DataType):
self._dtype = binding_to_torch_dtype(dtype)
else:
raise TypeError(f"Unsupported dtype: {dtype}")
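For illustration, a hedged sketch of the new DataType path in TensorWrapper; the backing allocation below is a plain torch tensor, whereas in this commit the address comes from the v2 pool via get_mem_pool_base_address:

import torch
from tensorrt_llm._utils import TensorWrapper, convert_to_torch_tensor
from tensorrt_llm.bindings import DataType

backing = torch.zeros(2, 8, dtype=torch.float16, device="cuda")
view = convert_to_torch_tensor(
    TensorWrapper(backing.data_ptr(), DataType.HALF, shape=[2, 8]))
assert view.dtype == torch.float16 and view.data_ptr() == backing.data_ptr()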

View File

@ -1760,6 +1760,18 @@ class KvCacheConfig(StrictBaseModel, PybindMirror):
tokens_per_block: int = Field(default=32,
description="The number of tokens per block.")
use_kv_cache_manager_v2: bool = Field(
default=False,
status="prototype",
description="Whether to use the KV cache manager v2 (experimental).")
max_util_for_resume: float = Field(
default=0.95,
status="prototype",
description=
"The maximum utilization of the KV cache allowed for resume. Default is 95%. Only used by the KV cache manager v2 (experimental)."
)
def _to_pybind(self):
return _KvCacheConfig(
enable_block_reuse=self.enable_block_reuse,
@ -1820,6 +1832,14 @@ class KvCacheConfig(StrictBaseModel, PybindMirror):
)
return v
@field_validator('max_util_for_resume')
@classmethod
def validate_max_util_for_resume(cls, v: float):
if not 0 <= v <= 1:
raise ValueError(
"kv_cache_config.max_util_for_resume must be between 0 and 1")
return v
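A hedged usage sketch of the two new prototype fields from the LLM API side; the numeric values below are placeholders.

```python
from tensorrt_llm.llmapi.llm_args import KvCacheConfig

# Opt in to the experimental V2 manager and cap the KV cache utilization at
# which a sequence may be resumed (both fields are prototype-status).
kv_cache_config = KvCacheConfig(
    free_gpu_memory_fraction=0.7,   # placeholder value
    use_kv_cache_manager_v2=True,
    max_util_for_resume=0.9,        # must lie in [0, 1], validated above
)
```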
@PybindMirror.mirror_pybind_fields(_ExtendedRuntimePerfKnobConfig)
class ExtendedRuntimePerfKnobConfig(StrictBaseModel, PybindMirror):

View File

@ -258,4 +258,4 @@ class KVCacheManager:
def get_aggregated_pages(
self, buffers: Iterable[BufferSlice]
) -> Iterator[AggregatedPageDesc]: ...
def clamp_max_seq_len_for_mem(self, batch_size: int, model_max_seq_len: int) -> int: ...
def clamp_max_seq_len_for_mem(self, batch_size: int) -> int: ...

View File

@ -295,7 +295,7 @@ class KVCacheManager:
)
# @TODO: needs updating when dynamic resizing is supported.
def clamp_max_seq_len_for_mem(self, batch_size: int, model_max_seq_len: int) -> int:
def clamp_max_seq_len_for_mem(self, batch_size: int) -> int:
"Get the max possible sequence length limited by the GPU memory pools."
assert batch_size > 0
tokens_per_block = self.tokens_per_block
@ -330,13 +330,14 @@ class KVCacheManager:
assert is_enough(1)
lb = 1
ub = div_up(model_max_seq_len, tokens_per_block)
if is_enough(ub):
return model_max_seq_len
while lb < ub:
ub = lb
while is_enough(ub):
lb = ub
ub *= 2
while lb < ub - 1:
mid = (lb + ub) // 2
if is_enough(mid):
lb = mid
else:
ub = mid - 1
return min(lb * tokens_per_block, model_max_seq_len)
ub = mid
return lb * tokens_per_block
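The revised clamp_max_seq_len_for_mem drops the model_max_seq_len argument: it grows the upper bound geometrically until the memory check fails, then binary-searches the largest feasible block count. A minimal sketch of that search, with is_enough as a stand-in for the real memory predicate:

```python
def max_feasible_blocks(is_enough):
    """Largest n >= 1 with is_enough(n) True, assuming is_enough(1) holds
    and is_enough is monotonically non-increasing in n (sketch)."""
    lb = ub = 1
    while is_enough(ub):        # exponential growth: invariant is_enough(lb)
        lb = ub
        ub *= 2
    while lb < ub - 1:          # invariant: is_enough(lb) and not is_enough(ub)
        mid = (lb + ub) // 2
        if is_enough(mid):
            lb = mid
        else:
            ub = mid
    return lb


# Example: memory admits at most 37 blocks.
assert max_feasible_blocks(lambda n: n <= 37) == 37
```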

View File

@ -256,11 +256,16 @@
"accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus_online_eplb[fp8kv=True-moe_backend=WIDEEP]": 360.0002855450729839504,
"accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_batch_waiting[batch_wait_timeout_iters=10-batch_wait_max_tokens_ratio=0.75-mtp_nextn=0-fp8kv=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]": 360.0003064870252273977,
"accuracy/test_llm_api_pytorch.py::TestEXAONE4::test_auto_dtype": 3600.0004039629711769521,
"accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[True-True-cutlass-auto]": 360.00032637204276397824,
"accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[True-True-cutlass-fp8]": 360.0003586999955587089,
"accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[True-True-triton-auto]": 360.6586053780047223,
"accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[True-True-trtllm-auto]": 360.0003633099840953946,
"accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[True-True-trtllm-fp8]": 360.00036422599805518985,
"accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[v1_kv_cache-True-True-cutlass-auto]": 360.00032637204276397824,
"accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[v1_kv_cache-True-True-cutlass-fp8]": 360.0003586999955587089,
"accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[v1_kv_cache-True-True-triton-auto]": 360.6586053780047223,
"accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[v1_kv_cache-True-True-trtllm-auto]": 360.0003633099840953946,
"accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[v1_kv_cache-True-True-trtllm-fp8]": 360.00036422599805518985,
"accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[v2_kv_cache-True-True-cutlass-auto]": 360.00032637204276397824,
"accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[v2_kv_cache-True-True-cutlass-fp8]": 360.0003586999955587089,
"accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[v2_kv_cache-True-True-triton-auto]": 360.6586053780047223,
"accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[v2_kv_cache-True-True-trtllm-auto]": 360.0003633099840953946,
"accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[v2_kv_cache-True-True-trtllm-fp8]": 360.00036422599805518985,
"accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_2gpus[dp2-cutlass-auto]": 360.0003378289984539151,
"accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_2gpus[dp2-triton-auto]": 360.9436147869564593,
"accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_2gpus[dp2-trtllm-auto]": 360.0003398499684408307,
@ -273,18 +278,30 @@
"accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_2gpus[tp2-triton-auto]": 360.8670774899655953,
"accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_2gpus[tp2-trtllm-auto]": 360.00040231598541140556,
"accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_2gpus[tp2-trtllm-fp8]": 360.0003254589391872287,
"accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-cutlass-auto]": 745.8583740849863,
"accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-triton-auto]": 745.9345730679342523,
"accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-trtllm-auto]": 745.0004936959594488144,
"accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-trtllm-fp8]": 745.00031642295653000474,
"accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-cutlass-auto]": 658.1757711600512,
"accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-triton-auto]": 745.9436021829606034,
"accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-trtllm-auto]": 745.0004371170070953667,
"accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-trtllm-fp8]": 745.0004142870311625302,
"accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-cutlass-auto]": 676.3980704760179,
"accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-triton-auto]": 745.0292645250447094,
"accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-trtllm-auto]": 745.0003769229515455663,
"accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-trtllm-fp8]": 677.000331886054482311,
"accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v1_kv_cache-dp4-cutlass-auto]": 745.8583740849863,
"accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v1_kv_cache-dp4-triton-auto]": 745.9345730679342523,
"accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v1_kv_cache-dp4-trtllm-auto]": 745.0004936959594488144,
"accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v1_kv_cache-dp4-trtllm-fp8]": 745.00031642295653000474,
"accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v1_kv_cache-ep4-cutlass-auto]": 658.1757711600512,
"accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v1_kv_cache-ep4-triton-auto]": 745.9436021829606034,
"accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v1_kv_cache-ep4-trtllm-auto]": 745.0004371170070953667,
"accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v1_kv_cache-ep4-trtllm-fp8]": 745.0004142870311625302,
"accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v1_kv_cache-tp4-cutlass-auto]": 676.3980704760179,
"accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v1_kv_cache-tp4-triton-auto]": 745.0292645250447094,
"accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v1_kv_cache-tp4-trtllm-auto]": 745.0003769229515455663,
"accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v1_kv_cache-tp4-trtllm-fp8]": 677.000331886054482311,
"accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v2_kv_cache-dp4-cutlass-auto]": 745.8583740849863,
"accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v2_kv_cache-dp4-triton-auto]": 745.9345730679342523,
"accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v2_kv_cache-dp4-trtllm-auto]": 745.0004936959594488144,
"accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v2_kv_cache-dp4-trtllm-fp8]": 745.00031642295653000474,
"accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v2_kv_cache-ep4-cutlass-auto]": 658.1757711600512,
"accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v2_kv_cache-ep4-triton-auto]": 745.9436021829606034,
"accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v2_kv_cache-ep4-trtllm-auto]": 745.0004371170070953667,
"accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v2_kv_cache-ep4-trtllm-fp8]": 745.0004142870311625302,
"accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v2_kv_cache-tp4-cutlass-auto]": 676.3980704760179,
"accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v2_kv_cache-tp4-triton-auto]": 745.0292645250447094,
"accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v2_kv_cache-tp4-trtllm-auto]": 745.0003769229515455663,
"accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v2_kv_cache-tp4-trtllm-fp8]": 677.000331886054482311,
"accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_chunked_prefill[cutlass-auto]": 643.3513998010312,
"accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_chunked_prefill[triton-auto]": 764.9216735750087537,
"accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_chunked_prefill[trtllm-auto]": 764.0002969659981317818,

View File

@ -4472,8 +4472,10 @@ class TestGPTOSS(LlmapiAccuracyTestHarness):
@pytest.mark.parametrize("cuda_graph,overlap_scheduler", [
(True, True),
])
@pytest.mark.parametrize("v2_kv_cache", [True, False],
ids=["v2_kv_cache", "v1_kv_cache"])
def test_w4_1gpu(self, kv_cache_dtype, moe_backend, cuda_graph,
overlap_scheduler, mocker):
overlap_scheduler, mocker, v2_kv_cache):
mocker.patch.object(GSM8K, "MAX_OUTPUT_LEN", 8192)
mocker.patch.dict(GSM8K.EVALUATE_KWARGS,
{"scores_filter": "exact_match,flexible-extract"})
@ -4482,14 +4484,16 @@ class TestGPTOSS(LlmapiAccuracyTestHarness):
disable_overlap_scheduler=not overlap_scheduler,
cuda_graph_config=CudaGraphConfig() if cuda_graph else None)
kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.5,
dtype=kv_cache_dtype)
kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7,
dtype=kv_cache_dtype,
use_kv_cache_manager_v2=v2_kv_cache)
llm = LLM(self.MODEL_PATH,
tensor_parallel_size=1,
pipeline_parallel_size=1,
moe_expert_parallel_size=1,
kv_cache_config=kv_cache_config,
max_batch_size=720,
**pytorch_config,
moe_config=MoeConfig(backend=moe_backend))
@ -4526,26 +4530,11 @@ class TestGPTOSS(LlmapiAccuracyTestHarness):
(4, 1, 4, True, True, True),
],
ids=["tp4", "ep4", "dp4"])
@pytest.mark.parametrize("enable_configurable_moe", [0, 1],
ids=lambda x: ""
if x == 0 else "enable_configurable_moe")
@pytest.mark.parametrize("v2_kv_cache", [True, False],
ids=["v2_kv_cache", "v1_kv_cache"])
def test_w4_4gpus(self, kv_cache_dtype, moe_backend, tp_size, pp_size,
ep_size, attention_dp, cuda_graph, overlap_scheduler,
enable_configurable_moe, mocker):
# Handle ENABLE_CONFIGURABLE_MOE environment variable
if enable_configurable_moe == 1 and moe_backend not in [
"TRTLLM", "CUTLASS"
]:
pytest.skip(
f"ENABLE_CONFIGURABLE_MOE=1 is only supported with TRTLLM and CUTLASS backend, "
f"current backend is {moe_backend}")
# Patch MpiPoolSession to propagate env vars to MPI worker processes
env_value = "1" if enable_configurable_moe == 1 and moe_backend in [
"TRTLLM", "CUTLASS"
] else "0"
patch_mpi_pool_session_for_env(mocker,
{"ENABLE_CONFIGURABLE_MOE": env_value})
mocker, v2_kv_cache):
MAX_OUTPUT_LEN = 128179
MAX_INPUT_LEN = 32768
@ -4563,7 +4552,8 @@ class TestGPTOSS(LlmapiAccuracyTestHarness):
moe_config=MoeConfig(backend=moe_backend))
kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7,
dtype=kv_cache_dtype)
dtype=kv_cache_dtype,
use_kv_cache_manager_v2=v2_kv_cache)
max_seq_len = MAX_INPUT_LEN + MAX_OUTPUT_LEN
llm = LLM(self.MODEL_PATH,

View File

@ -159,11 +159,13 @@ accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_cu
accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm]
accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm_attention_dp]
accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4_4gpus[latency_moe_trtllm_eagle3]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[True-True-cutlass-auto]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[True-True-cutlass-fp8]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[True-True-triton-auto]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[True-True-trtllm-auto]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[True-True-trtllm-fp8]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[v1_kv_cache-True-True-cutlass-auto]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[v1_kv_cache-True-True-cutlass-fp8]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[v1_kv_cache-True-True-triton-auto]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[v1_kv_cache-True-True-trtllm-auto]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[v1_kv_cache-True-True-trtllm-fp8]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[v2_kv_cache-True-True-trtllm-auto]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[v2_kv_cache-True-True-trtllm-fp8]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_2gpus[tp2-cutlass-auto]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_2gpus[tp2-trtllm-auto]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_2gpus[tp2-trtllm-fp8]
@ -176,18 +178,22 @@ accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_2gpus[dp2-cutlass-auto]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_2gpus[dp2-trtllm-auto]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_2gpus[dp2-trtllm-fp8]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_2gpus[dp2-triton-auto]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-cutlass-auto]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-triton-auto]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-trtllm-auto]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-trtllm-fp8]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-cutlass-auto]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-triton-auto]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-trtllm-auto]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-trtllm-fp8]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-cutlass-auto]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-triton-auto]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-trtllm-auto]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-trtllm-fp8]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v1_kv_cache-dp4-cutlass-auto]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v1_kv_cache-dp4-triton-auto]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v1_kv_cache-dp4-trtllm-auto]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v1_kv_cache-dp4-trtllm-fp8]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v1_kv_cache-ep4-cutlass-auto]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v1_kv_cache-ep4-triton-auto]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v1_kv_cache-ep4-trtllm-auto]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v1_kv_cache-ep4-trtllm-fp8]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v1_kv_cache-tp4-cutlass-auto]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v1_kv_cache-tp4-triton-auto]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v1_kv_cache-tp4-trtllm-auto]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v1_kv_cache-tp4-trtllm-fp8]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v2_kv_cache-dp4-trtllm-auto]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v2_kv_cache-dp4-trtllm-fp8]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v2_kv_cache-ep4-trtllm-auto]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v2_kv_cache-ep4-trtllm-fp8]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4a16[dp4-auto]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4a16[dp4-fp8]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_chunked_prefill[cutlass-auto]

View File

@ -49,11 +49,13 @@ accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[trtllm-one_model
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[trtllm-one_model-overlap_scheduler]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[trtllm-two_model-no_overlap_scheduler]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[trtllm-two_model-overlap_scheduler]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[True-True-cutlass-auto]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[True-True-cutlass-fp8]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[True-True-triton-auto]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[True-True-trtllm-auto]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[True-True-trtllm-fp8]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[v1_kv_cache-True-True-cutlass-auto]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[v1_kv_cache-True-True-cutlass-fp8]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[v1_kv_cache-True-True-triton-auto]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[v1_kv_cache-True-True-trtllm-auto]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[v1_kv_cache-True-True-trtllm-fp8]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[v2_kv_cache-True-True-trtllm-auto]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[v2_kv_cache-True-True-trtllm-fp8]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_2gpus[dp2-cutlass-auto]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_2gpus[dp2-triton-auto]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_2gpus[dp2-trtllm-auto]
@ -66,18 +68,22 @@ accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_2gpus[tp2-cutlass-auto]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_2gpus[tp2-triton-auto]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_2gpus[tp2-trtllm-auto]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_2gpus[tp2-trtllm-fp8]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-cutlass-auto]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-triton-auto]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-trtllm-auto]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-trtllm-fp8]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-cutlass-auto]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-triton-auto]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-trtllm-auto]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-trtllm-fp8]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-cutlass-auto]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-triton-auto]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-trtllm-auto]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-trtllm-fp8]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v1_kv_cache-dp4-cutlass-auto]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v1_kv_cache-dp4-triton-auto]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v1_kv_cache-dp4-trtllm-auto]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v1_kv_cache-dp4-trtllm-fp8]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v1_kv_cache-ep4-cutlass-auto]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v1_kv_cache-ep4-triton-auto]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v1_kv_cache-ep4-trtllm-auto]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v1_kv_cache-ep4-trtllm-fp8]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v1_kv_cache-tp4-cutlass-auto]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v1_kv_cache-tp4-triton-auto]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v1_kv_cache-tp4-trtllm-auto]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v1_kv_cache-tp4-trtllm-fp8]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v2_kv_cache-dp4-trtllm-auto]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v2_kv_cache-dp4-trtllm-fp8]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v2_kv_cache-ep4-trtllm-auto]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v2_kv_cache-ep4-trtllm-fp8]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_chunked_prefill[cutlass-auto]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_chunked_prefill[triton-auto]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_chunked_prefill[trtllm-auto]

View File

@ -144,13 +144,15 @@ accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_cutl
accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_fp8[throughput_latency]
accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_fp8[latency]
accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_cutlass]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[True-True-cutlass-auto]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[v1_kv_cache-True-True-cutlass-auto]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_2gpus[tp2-cutlass-auto]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_2gpus[ep2-cutlass-auto]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_2gpus[dp2-cutlass-auto]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-cutlass-auto]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-cutlass-auto]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-cutlass-auto]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v1_kv_cache-dp4-cutlass-auto]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v1_kv_cache-ep4-cutlass-auto]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v1_kv_cache-tp4-cutlass-auto]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v2_kv_cache-dp4-cutlass-auto]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v2_kv_cache-ep4-cutlass-auto]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4a16[dp4-auto]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_chunked_prefill[cutlass-auto]
accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm]

View File

@ -38,10 +38,12 @@ l0_b200:
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_chunked_prefill[quant_dtype=nvfp4-kv_cache_reuse=True-fp8kv=True-overlap_scheduler=True]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_guided_decoding[xgrammar-mtp_nextn=2]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_dummy_load_format
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[True-True-cutlass-auto]
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[True-True-trtllm-auto]
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[True-True-trtllm-fp8]
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[True-True-triton-auto]
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[v1_kv_cache-True-True-cutlass-auto]
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[v1_kv_cache-True-True-trtllm-auto]
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[v1_kv_cache-True-True-trtllm-fp8]
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[v1_kv_cache-True-True-triton-auto]
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[v2_kv_cache-True-True-trtllm-auto]
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[v2_kv_cache-True-True-trtllm-fp8]
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_dummy_load_format
- accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_fp8_block_scales[latency] # Cover nvbugs 5461712 and 5505402
- accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_w4a8_mxfp4[fp8-latency]

View File

@ -181,12 +181,16 @@ l0_dgx_b200:
- accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp8_chunked_prefill[tp4ep4-cuda_graph=True]
- accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_guided_decoding[xgrammar-mtp_nextn=0]
- accuracy/test_disaggregated_serving.py::TestQwen3_30B_A3B::test_mixed_ctx_gen_model[ctxpp2gentp2]
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-cutlass-auto]
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-triton-auto]
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-trtllm-auto]
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-trtllm-fp8]
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-cutlass-auto]
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-triton-auto]
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v1_kv_cache-tp4-cutlass-auto]
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v1_kv_cache-tp4-triton-auto]
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v1_kv_cache-ep4-trtllm-auto]
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v1_kv_cache-ep4-trtllm-fp8]
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v1_kv_cache-dp4-cutlass-auto]
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v1_kv_cache-dp4-triton-auto]
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v2_kv_cache-ep4-trtllm-auto]
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v2_kv_cache-ep4-trtllm-fp8]
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v2_kv_cache-dp4-cutlass-auto]
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v2_kv_cache-dp4-triton-auto]
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[trtllm-one_model-overlap_scheduler]
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[trtllm-one_model-no_overlap_scheduler]
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[trtllm-two_model-no_overlap_scheduler]

View File

@ -54,9 +54,11 @@ l0_dgx_b300:
- accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[dep4_latency_moe_cutlass-torch_compile=True]
- accuracy/test_disaggregated_serving.py::TestQwen3_30B_A3B::test_mixed_ctx_gen_model[ctxpp2gentp2]
- disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_ucx[DeepSeek-V3-Lite-fp8]
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-cutlass-fp8]
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-cutlass-auto]
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-triton-auto]
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v1_kv_cache-tp4-cutlass-fp8]
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v1_kv_cache-tp4-cutlass-auto]
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v1_kv_cache-dp4-triton-auto]
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v2_kv_cache-tp4-cutlass-auto]
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v2_kv_cache-dp4-triton-auto]
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4a16[dp4-auto]
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4a16[dp4-fp8]
- disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[DeepSeek-V3-Lite-bf16]
@ -86,6 +88,7 @@ l0_dgx_b300:
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2-mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
- accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[tep4_latency_moe_trtllm-torch_compile=False]
- disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_nixl[DeepSeek-V3-Lite-fp8]
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-trtllm-auto]
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v1_kv_cache-ep4-trtllm-auto]
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v2_kv_cache-ep4-trtllm-auto]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=TRTLLM-mtp_nextn=0-tp4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-tp2pp2-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False]

View File

@ -210,12 +210,16 @@ l0_dgx_h100:
auto_trigger: gpt_oss
orchestrator: mpi
tests:
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-cutlass-auto]
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-triton-auto]
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-cutlass-auto]
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-triton-auto]
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-cutlass-auto]
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-triton-auto]
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v1_kv_cache-tp4-cutlass-auto]
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v1_kv_cache-tp4-triton-auto]
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v1_kv_cache-ep4-cutlass-auto]
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v1_kv_cache-ep4-triton-auto]
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v1_kv_cache-dp4-cutlass-auto]
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v1_kv_cache-dp4-triton-auto]
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v2_kv_cache-ep4-cutlass-auto]
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v2_kv_cache-ep4-triton-auto]
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v2_kv_cache-dp4-cutlass-auto]
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v2_kv_cache-dp4-triton-auto]
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4a16[dp4-auto]
- condition:
ranges:

View File

@ -53,13 +53,19 @@ l0_gb200_multi_gpus:
- accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_nvfp4_tp4[torch_compile=False]
- accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_nvfp4_tp4[torch_compile=True]
- accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp4_tp2pp2[torch_compile=True-enable_gemm_allreduce_fusion=False]
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-trtllm-auto]
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-cutlass-auto]
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v1_kv_cache-tp4-trtllm-auto]
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v1_kv_cache-dp4-cutlass-auto]
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v2_kv_cache-tp4-trtllm-auto]
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v2_kv_cache-dp4-cutlass-auto]
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus_online_eplb[fp8]
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-cutlass-auto]
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-triton-auto]
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-trtllm-auto]
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-trtllm-fp8]
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v1_kv_cache-ep4-cutlass-auto]
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v1_kv_cache-ep4-triton-auto]
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v1_kv_cache-dp4-trtllm-auto]
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v1_kv_cache-dp4-trtllm-fp8]
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v2_kv_cache-ep4-cutlass-auto]
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v2_kv_cache-ep4-triton-auto]
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v2_kv_cache-dp4-trtllm-auto]
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v2_kv_cache-dp4-trtllm-fp8]
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4a16[dp4-auto]
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4a16[dp4-fp8]
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[trtllm-two_model-overlap_scheduler]

View File

@ -17,6 +17,7 @@ l0_gb300:
tests:
# ------------- PyTorch tests ---------------
- accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_w4a8_mxfp4[fp8-latency]
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[True-True-cutlass-auto]
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[v1_kv_cache-True-True-cutlass-auto]
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[v2_kv_cache-True-True-cutlass-auto]
- accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_fp8_block_scales[latency] # Cover nvbugs 5461712 and 5505402
- unittest/_torch/thop/parallel TIMEOUT (90)

View File

@ -39,7 +39,7 @@ l0_rtx_pro_6000:
- test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[phi4-multimodal-instruct-fp8-multimodals/Phi-4-multimodal-instruct-FP8-image_audio]
- accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_cutlass-torch_compile=False] # 8mins
- accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_cutlass-torch_compile=True] # 8 mins
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[True-True-cutlass-auto]
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[v1_kv_cache-True-True-cutlass-auto]
- accuracy/test_llm_api_pytorch.py::TestPhi4MM::test_fp4
- accuracy/test_llm_api_pytorch.py::TestPhi4MM::test_fp8
- accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[tp1-cutlass]

View File

@ -181,8 +181,10 @@ full:L40S/accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_
unittest/llmapi/test_memory_profiling.py::test_profile_kvcache SKIP (https://nvbugs/5580781)
triton_server/test_triton.py::test_llava[llava] SKIP (https://nvbugs/5547414)
full:RTX/accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_auto_dtype SKIP (https://nvbugs/5569696)
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-cutlass-auto] SKIP (https://nvbugs/5596343)
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-cutlass-auto] SKIP (https://nvbugs/5596343)
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v1_kv_cache-ep4-cutlass-auto] SKIP (https://nvbugs/5596343)
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v1_kv_cache-tp4-cutlass-auto] SKIP (https://nvbugs/5596343)
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v2_kv_cache-ep4-cutlass-auto] SKIP (https://nvbugs/5596343)
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v2_kv_cache-tp4-cutlass-auto] SKIP (https://nvbugs/5596343)
examples/test_phi.py::test_llm_phi_lora_1gpu[Phi-3-mini-4k-instruct-ru-lora-Phi-3-mini-4k-instruct-lora_fp16-base_fp16] SKIP (https://nvbugs/5612313)
triton_server/test_triton.py::test_cpp_unit_tests[cpp-unit-tests] SKIP (https://nvbugs/5619359)
triton_server/test_triton_rcca.py::test_rcca_bug_4934893[Temperature:0.5-TOP_P:0.95-TOP_K:10-False-1---False-True-False-0-2048-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap--max_utilization---1-1-1-False-ensemble] SKIP (https://nvbugs/5619369)
@ -312,7 +314,8 @@ accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_guided_decoding_
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_guided_decoding_with_eagle3[llguidance-eagle3_one_model=True] SKIP (https://nvbugs/5821053)
accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_tp4[torch_compile=True] SKIP (https://nvbugs/5821415)
accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_nvfp4_tp4[torch_compile=True] SKIP (https://nvbugs/5821415)
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-trtllm-auto] SKIP (https://nvbugs/5651865)
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v1_kv_cache-ep4-trtllm-auto] SKIP (https://nvbugs/5651865)
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v2_kv_cache-ep4-trtllm-auto] SKIP (https://nvbugs/5651865)
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype[mtp_nextn=0-overlap_scheduler=True] SKIP (https://nvbugs/5822983)
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-mtp_nextn=2-fp8kv=True-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False-sampler_async_worker=False] SKIP (https://nvbugs/5701445)
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_guided_decoding[xgrammar-mtp_nextn=0] SKIP (https://nvbugs/5748600)
@ -325,7 +328,8 @@ perf/test_perf_sanity.py::test_e2e[disagg_upload-deepseek-r1-fp4_1k1k_ctx1_gen1_
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_bfloat16_4gpus[tp4-attn_backend=TRTLLM-torch_compile=True] SKIP (https://nvbugs/5826604)
disaggregated/test_disaggregated.py::test_disaggregated_mixed[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/5834212)
accuracy/test_llm_api_pytorch.py::TestGLM4_5Air::test_nvfp4_multi_gpus[throughput_trtllm] SKIP (https://nvbugs/5837275)
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-trtllm-fp8] SKIP (https://nvbugs/5640697)
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v1_kv_cache-ep4-trtllm-fp8] SKIP (https://nvbugs/5640697)
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v2_kv_cache-ep4-trtllm-fp8] SKIP (https://nvbugs/5640697)
accuracy/test_llm_api_pytorch.py::TestGLM4_5Air::test_nvfp4_multi_gpus[throughput] SKIP (https://nvbugs/5837275)
test_e2e.py::test_ptp_quickstart_advanced_deepseek_r1_w4afp8_8gpus[DeepSeek-R1-W4AFP8-DeepSeek-R1/DeepSeek-R1-W4AFP8] SKIP (https://nvbugs/5836830)
accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_auto_dtype_4gpus[4-4-True-False-False] SKIP (https://nvbugs/5823587)
@ -342,7 +346,8 @@ perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwel
examples/test_llama.py::test_llama_3_x_with_bf16_lora_torch[llama-3.2-1b-instruct] SKIP (https://nvbugs/5838178)
accuracy/test_llm_api_autodeploy.py::TestNemotronSuperV3::test_bf16 SKIP (https://nvbugs/5838184)
cpp/test_multi_gpu.py::test_cache_transceiver[8proc-mooncake_kvcache-90] SKIP (https://nvbugs/5838199)
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-cutlass-auto] SKIP (https://nvbugs/5838211)
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v1_kv_cache-dp4-cutlass-auto] SKIP (https://nvbugs/5838211)
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v2_kv_cache-dp4-cutlass-auto] SKIP (https://nvbugs/5838211)
test_e2e.py::test_ptp_quickstart_advanced_ngram[Llama-3.1-8B-Instruct-llama-3.1-model/Llama-3.1-8B-Instruct] SKIP (https://nvbugs/5843112)
accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale[throughput_mtp] SKIP (https://nvbugs/5839028)
full:A10/unittest/kv_cache_manager_v2_tests/ SKIP (https://nvbugs/5841954)
@ -366,7 +371,8 @@ full:RTXPro6000D/accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp
full:RTXPro6000D/accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-tp4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5846154)
perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_dep4_mtp1_1k8k] SKIP (https://nvbugs/5846166)
accuracy/test_disaggregated_serving.py::TestDeepSeekV32Exp::test_auto_dtype[False] SKIP (https://nvbugs/5847284)
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-trtllm-fp8] SKIP (https://nvbugs/5850183)
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v1_kv_cache-dp4-trtllm-fp8] SKIP (https://nvbugs/5850183)
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v2_kv_cache-dp4-trtllm-fp8] SKIP (https://nvbugs/5850183)
examples/test_ad_speculative_decoding.py::test_autodeploy_eagle3_acceptance_rate SKIP (https://nvbugs/5855540)
unittest/_torch/auto_deploy/unit/singlegpu/test_ad_speculative_decoding.py::test_ad_speculative_decoding_smoke[False] SKIP (https://nvbugs/5859869)
unittest/_torch/thop/serial/test_moe.py::TestMoeFp4::test_autotune_fp8_fp4[RoutingDSlite-384-1024-1] SKIP (https://nvbugs/5859881)

View File

@ -14,8 +14,9 @@ from tensorrt_llm._torch.attention_backend import (AttentionBackend,
from tensorrt_llm._torch.attention_backend.interface import \
PredefinedAttentionMask
from tensorrt_llm._torch.metadata import KVCacheParams
from tensorrt_llm._torch.pyexecutor.resource_manager import KVCacheManager
from tensorrt_llm.bindings.executor import KvCacheConfig
from tensorrt_llm._torch.pyexecutor.resource_manager import (KVCacheManager,
KVCacheManagerV2)
from tensorrt_llm.llmapi.llm_args import KvCacheConfig
from tensorrt_llm.mapping import Mapping
from tensorrt_llm.models.modeling_utils import QuantConfig
from tensorrt_llm.quantization.mode import QuantAlgo
@ -131,8 +132,13 @@ paged_backends = {
}
def kv_cache_manager_from(Attention: type[AttentionBackend], s: Scenario,
kv_cache: torch.Tensor) -> KVCacheManager:
def kv_cache_manager_from(
Attention: type[AttentionBackend],
s: Scenario,
kv_cache: torch.Tensor,
request_ids: list[int],
token_nums: list[int],
use_kv_cache_manager_v2: bool = False) -> KVCacheManager:
paged = paged_backends[Attention]
num_blocks = s.max_num_pages if paged else s.batch_size
@ -158,7 +164,12 @@ def kv_cache_manager_from(Attention: type[AttentionBackend], s: Scenario,
cache_type = tensorrt_llm.bindings.internal.batch_manager.CacheType.CROSS if s.cross else tensorrt_llm.bindings.internal.batch_manager.CacheType.SELF
result = KVCacheManager(
if use_kv_cache_manager_v2:
kv_cache_manager_cls = KVCacheManagerV2
else:
kv_cache_manager_cls = KVCacheManager
kv_cache_manager = kv_cache_manager_cls(
kv_cache_config,
cache_type,
num_layers=num_layers,
@ -171,9 +182,19 @@ def kv_cache_manager_from(Attention: type[AttentionBackend], s: Scenario,
dtype=kv_cache_dtype,
)
kv_cache_manager.add_dummy_requests(request_ids, token_nums)
for i in range(s.num_layers):
result.get_buffers(i).view_as(kv_cache[i]).copy_(kv_cache[i])
return result
buffer = kv_cache_manager.get_buffers(i)
block_ids = [
block_id
for req_block_ids in kv_cache_manager.get_batch_cache_indices(
request_ids, i) for block_id in req_block_ids
if block_id != -1
]
for idx, block_id in enumerate(block_ids):
buffer[block_id].view_as(kv_cache[i][idx]).copy_(kv_cache[i][idx])
return kv_cache_manager
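A hedged sketch of the per-block scatter the updated helper performs: each block of a layer's reference cache is copied into the pooled buffer at the block id the manager assigned, skipping unassigned slots (tensor shapes below are illustrative).

```python
import torch


def scatter_reference_cache(buffer: torch.Tensor, reference: torch.Tensor,
                            block_ids: list[int]) -> None:
    """Copy reference[i] into buffer[block_ids[i]] for every assigned block.

    buffer:    pooled KV storage, shape [num_pool_blocks, *block_shape]
    reference: per-request cache laid out block-contiguously,
               shape [len(block_ids), *block_shape]
    """
    for idx, block_id in enumerate(block_ids):
        if block_id == -1:      # unassigned slot, nothing to copy
            continue
        buffer[block_id].copy_(reference[idx])


# Illustrative shapes: 8-block pool, 3 assigned blocks of shape [2, 4, 16].
pool = torch.zeros(8, 2, 4, 16)
ref = torch.randn(3, 2, 4, 16)
scatter_reference_cache(pool, ref, block_ids=[5, 0, 2])
assert torch.equal(pool[2], ref[2])
```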
def produce_outputs(
@ -181,6 +202,7 @@ def produce_outputs(
q_at_layer: torch.Tensor,
kv: Optional[torch.Tensor],
s: Scenario,
use_kv_cache_manager_v2: bool = False,
*,
kv_cache: torch.Tensor,
num_cached_tokens: Callable[[int], int] | int,
@ -197,12 +219,13 @@ def produce_outputs(
kv_cache_params = KVCacheParams(
use_cache=True, num_cached_tokens_per_seq=num_cached_tokens_per_seq)
kv_cache_manager = kv_cache_manager_from(Attention, s, kv_cache)
request_ids = list(range(s.batch_size))
seq_lens_append = seq_lens_kv if seq_lens_kv is not None else seq_lens
token_nums = (torch.tensor(num_cached_tokens_per_seq) +
seq_lens_append).tolist()
kv_cache_manager.add_dummy_requests(request_ids, token_nums)
kv_cache_manager = kv_cache_manager_from(Attention, s, kv_cache,
request_ids, token_nums,
use_kv_cache_manager_v2)
metadata = Attention.Metadata(
num_contexts=num_contexts if num_contexts is not None else s.batch_size,
@ -414,7 +437,9 @@ def test_flashinfer_prefill():
Scenario(num_layers=1, qo_len=32, kv_len=64, causal=False)
],
ids=["typical", "non-causal", "cross", "cross-diff-kv-len"])
def test_attention_backend(s: Scenario):
@pytest.mark.parametrize("use_kv_cache_manager_v2", [True, False],
ids=["v2_kv_cache", "v1_kv_cache"])
def test_attention_backend(s: Scenario, use_kv_cache_manager_v2: bool):
dtype = s.dtype
num_layers = s.num_layers
num_heads = s.num_heads
@ -457,6 +482,7 @@ def test_attention_backend(s: Scenario):
q_at_layer,
kv,
s,
use_kv_cache_manager_v2=use_kv_cache_manager_v2,
kv_cache=kv_cache,
num_cached_tokens=past_kv_len,
seq_lens=torch.full((batch_size, ), qo_len).int(),
@ -559,7 +585,9 @@ def generate_causal_mask(seq_lens, qo_lens, batch_size, dtype):
kvcache_dtype=torch.float8_e4m3fn),
],
ids=["fp16", "fp16-cross", "fp8", "fp8-cross"])
def test_attention_backend_ifb(s: PagedScenario):
@pytest.mark.parametrize("use_kv_cache_manager_v2", [True, False],
ids=["v2_kv_cache", "v1_kv_cache"])
def test_attention_backend_ifb(s: PagedScenario, use_kv_cache_manager_v2: bool):
dtype = s.dtype
is_fp8 = s.kvcache_dtype == torch.float8_e4m3fn
if is_fp8 and getSMVersion() < 89:
@ -625,6 +653,7 @@ def test_attention_backend_ifb(s: PagedScenario):
q_at_layer,
kv,
s,
use_kv_cache_manager_v2=use_kv_cache_manager_v2,
kv_cache=kv_cache,
num_cached_tokens=lambda i: num_cached_tokens_prefill
if i < num_contexts else num_cached_tokens_decode,

View File

@ -5,6 +5,7 @@ from typing import List
import pytest
import torch
from utils.util import getSMVersion
import tensorrt_llm
from tensorrt_llm._torch.attention_backend.interface import (
@ -14,10 +15,11 @@ from tensorrt_llm._torch.metadata import KVCacheParams
from tensorrt_llm._torch.pyexecutor.llm_request import (LlmRequest,
LlmRequestState,
SamplingConfig)
from tensorrt_llm._torch.pyexecutor.resource_manager import KVCacheManager
from tensorrt_llm._torch.pyexecutor.resource_manager import (KVCacheManager,
KVCacheManagerV2)
from tensorrt_llm._utils import str_dtype_to_binding, torch_dtype_to_str
from tensorrt_llm.bindings.executor import KvCacheConfig
from tensorrt_llm.functional import PositionEmbeddingType, RopeEmbeddingUtils
from tensorrt_llm.llmapi.llm_args import KvCacheConfig
from tensorrt_llm.mapping import Mapping
from tensorrt_llm.models.modeling_utils import QuantConfig
from tensorrt_llm.quantization.mode import QuantAlgo
@ -359,10 +361,17 @@ accuracy_dict = {
@pytest.mark.parametrize("num_generation_steps",
num_generation_steps,
ids=lambda x: f"num_generation_steps: {x}")
@pytest.mark.parametrize("v2_kv_cache", [True, False],
ids=["v2_kv_cache", "v1_kv_cache"])
def test_attention_mla(scenario: Scenario, context_sequence_lengths: List[int],
generation_seq_len_q: int,
num_generation_steps: List[int]):
num_generation_steps: List[int], v2_kv_cache: bool):
"""Test MLA computation for both context and generation phases"""
if v2_kv_cache and getSMVersion() != 100:
pytest.skip(
"v2_kv_cache is only supported for MLA on Blackwell architectures")
num_heads = scenario.num_heads
num_kv_heads = scenario.num_kv_heads
q_lora_rank = scenario.q_lora_rank
@ -403,7 +412,8 @@ def test_attention_mla(scenario: Scenario, context_sequence_lengths: List[int],
qk_rope_head_dim, v_head_dim, rope_config,
kv_cache_tokens_per_block, device, dtype,
kv_cache_dtype, context_sequence_lengths,
generation_seq_len_q, num_generation_steps)
generation_seq_len_q, num_generation_steps,
v2_kv_cache)
def _run_test_for_backend(backend_name, num_heads, num_kv_heads, num_layers,
@ -411,7 +421,8 @@ def _run_test_for_backend(backend_name, num_heads, num_kv_heads, num_layers,
qk_rope_head_dim, v_head_dim, rope_config,
kv_cache_tokens_per_block, device, dtype,
kv_cache_dtype, context_sequence_lengths,
generation_seq_len_q, num_generation_steps):
generation_seq_len_q, num_generation_steps,
v2_kv_cache):
AttentionCls = get_attention_backend(backend_name)
qk_head_dim = qk_nope_head_dim + qk_rope_head_dim
@ -597,7 +608,8 @@ def _run_test_for_backend(backend_name, num_heads, num_kv_heads, num_layers,
(num_generation_steps + 1) * generation_seq_len_q +
kv_cache_tokens_per_block - 1
) // kv_cache_tokens_per_block * kv_cache_tokens_per_block * max_num_contexts
kv_cache_manager = KVCacheManager(
kv_cache_cls = KVCacheManagerV2 if v2_kv_cache else KVCacheManager
kv_cache_manager = kv_cache_cls(
KvCacheConfig(
max_tokens=max_tokens,
enable_block_reuse=False,
@ -625,8 +637,14 @@ def _run_test_for_backend(backend_name, num_heads, num_kv_heads, num_layers,
)
req.paged_kv_block_ids = []
beam_width = 1
kv_cache_manager.impl.add_sequence(req_id, ctx_len, beam_width, req)
request_list.append(req)
if v2_kv_cache:
kv_cache = kv_cache_manager._create_kv_cache(req_id, None, None)
success = kv_cache.resume(torch.cuda.current_stream().cuda_stream)
assert success, f"Failed to resume KV cache for request {req_id}"
kv_cache.capacity = ctx_len
else:
kv_cache_manager.impl.add_sequence(req_id, ctx_len, beam_width, req)
attn_metadata = AttentionCls.Metadata(
seq_lens=torch.tensor(context_sequence_lengths, dtype=torch.int),
request_ids=list(range(len(context_sequence_lengths))),
@ -649,7 +667,11 @@ def _run_test_for_backend(backend_name, num_heads, num_kv_heads, num_layers,
if step > 0:
for req_id in range(len(context_sequence_lengths)):
for _ in range(generation_seq_len_q):
kv_cache_manager.impl.add_token(req_id)
if v2_kv_cache:
kv_cache = kv_cache_manager.kv_cache_map[req_id]
kv_cache.capacity += 1
else:
kv_cache_manager.impl.add_token(req_id)
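A hedged outline of the V2 per-request lifecycle this test exercises: create the per-request cache, resume it onto a CUDA stream, set capacity to the prompt length, then grow capacity by one per generated token (the V1 path calls kv_cache_manager.impl.add_token instead). The tiny class below only mirrors that state machine; it is not the real KVCacheManagerV2 interface.

```python
class _FakeRequestKvCache:
    """Illustrative stand-in: the cache starts suspended, resume() pins it to
    device memory, and capacity tracks how many tokens it must hold."""

    def __init__(self) -> None:
        self.resumed = False
        self.capacity = 0

    def resume(self) -> bool:
        # The real call takes a CUDA stream and may fail when utilization
        # exceeds max_util_for_resume; always succeed in this sketch.
        self.resumed = True
        return True


# Context phase: allocate, resume, and size the cache to the prompt length.
cache = _FakeRequestKvCache()
assert cache.resume()
cache.capacity = 128            # ctx_len placeholder

# Generation phase: grow capacity by one per generated token.
for _ in range(4):
    cache.capacity += 1
assert cache.capacity == 132
```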
attn_metadata = AttentionCls.Metadata(
seq_lens=torch.tensor([generation_seq_len_q] *
len(context_sequence_lengths),