diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 35181f4f3d..91062b138a 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -1,5 +1,18 @@ # This file defines code ownership rules for the repository. +## TensorRT-LLM QA +### Integration Tests +/tests/integration/test_lists/qa @NVIDIA/trt-llm-qa +/tests/integration/defs/examples/test_ray.py @NVIDIA/trt-llm-qa-function +/tests/integration/defs/examples/test_redrafter.py @NVIDIA/trt-llm-qa-function +/tests/integration/defs/accuracy @NVIDIA/trt-llm-qa-function +/tests/integration/defs/stress_test @NVIDIA/trt-llm-qa-function +/tests/integration/defs/triton_server @NVIDIA/trt-llm-qa-function +/tests/integration/defs/test_e2e.py @NVIDIA/trt-llm-qa-function +/tests/integration/defs/disaggregated @NVIDIA/trt-llm-qa-serving +/tests/integration/defs/sysinfo @NVIDIA/trt-llm-qa-perf +/tests/integration/defs/perf @NVIDIA/trt-llm-qa-perf +/tests/integration/defs/perf/disagg @NVIDIA/trt-llm-qa-serving ## TensorRT-LLM Infra ### CI diff --git a/.gitignore b/.gitignore index 130ea9837b..7f7ffd18c6 100644 --- a/.gitignore +++ b/.gitignore @@ -56,7 +56,7 @@ tensorrt_llm/scripts docs/source/**/*.rst !docs/source/examples/index.rst !docs/source/deployment-guide/config_table.rst -!docs/source/deployment-guide/note_sections.rst +!docs/source/_includes/note_sections.rst *.swp # Testing diff --git a/cpp/include/tensorrt_llm/executor/executor.h b/cpp/include/tensorrt_llm/executor/executor.h index dda8f52cc8..787fa0bb7e 100644 --- a/cpp/include/tensorrt_llm/executor/executor.h +++ b/cpp/include/tensorrt_llm/executor/executor.h @@ -1468,7 +1468,8 @@ public: DEFAULT = 0, MPI = 1, UCX = 2, - NIXL = 3 + NIXL = 3, + MOONCAKE = 4 }; explicit CacheTransceiverConfig(std::optional backendType = std::nullopt, std::optional maxNumTokens = std::nullopt, std::optional kvTransferTimeoutMs = std::nullopt, diff --git a/cpp/include/tensorrt_llm/executor/transferAgent.h b/cpp/include/tensorrt_llm/executor/transferAgent.h index ac469fcb40..5f4ff1f061 100644 --- a/cpp/include/tensorrt_llm/executor/transferAgent.h +++ b/cpp/include/tensorrt_llm/executor/transferAgent.h @@ -391,6 +391,14 @@ template "libtensorrt_llm_nixl_wrapper.so", "createNixlTransferAgent"); return func(std::forward(args)...); } + if (backend == "mooncake") + { + auto& loader = DynLibLoader::getInstance(); + using CreateMooncakeFuncType = std::unique_ptr (*)(BaseAgentConfig const*); + auto* func = loader.getFunctionPointer( + "libtensorrt_llm_mooncake_wrapper.so", "createMooncakeTransferAgent"); + return func(std::forward(args)...); + } TLLM_THROW("Unknown backend name."); } diff --git a/cpp/tensorrt_llm/CMakeLists.txt b/cpp/tensorrt_llm/CMakeLists.txt index a9e4a00729..76604ec229 100644 --- a/cpp/tensorrt_llm/CMakeLists.txt +++ b/cpp/tensorrt_llm/CMakeLists.txt @@ -159,6 +159,10 @@ if(NIXL_ROOT) set(NIXL_WRAPPER_TARGET tensorrt_llm_nixl_wrapper) endif() +if(MOONCAKE_ROOT) + set(MOONCAKE_WRAPPER_TARGET tensorrt_llm_mooncake_wrapper) +endif() + add_subdirectory(executor) find_package(Threads REQUIRED) @@ -272,6 +276,11 @@ if(TARGET ${NIXL_WRAPPER_TARGET}) add_dependencies(${SHARED_TARGET} ${NIXL_WRAPPER_TARGET}) endif() +if(TARGET ${MOONCAKE_WRAPPER_TARGET}) + target_link_libraries(${MOONCAKE_WRAPPER_TARGET} INTERFACE ${SHARED_TARGET}) + add_dependencies(${SHARED_TARGET} ${MOONCAKE_WRAPPER_TARGET}) +endif() + if(NOT WIN32) # Load libraries at $PREFIX/lib from # $PREFIX/lib/python3.12/site-packages/tensorrt_llm/libs diff --git a/cpp/tensorrt_llm/batch_manager/cacheTransceiver.cpp 
b/cpp/tensorrt_llm/batch_manager/cacheTransceiver.cpp index bb253c969f..7e4c26bfd7 100644 --- a/cpp/tensorrt_llm/batch_manager/cacheTransceiver.cpp +++ b/cpp/tensorrt_llm/batch_manager/cacheTransceiver.cpp @@ -81,6 +81,11 @@ std::unique_ptr CacheTransceiverFactory::createCacheTransc backendType = executor::CacheTransceiverConfig::BackendType::NIXL; TLLM_LOG_INFO("Enable NIXL KV cache transport."); } + else if (common::getEnvUseMooncakeKvCache()) + { + backendType = executor::CacheTransceiverConfig::BackendType::MOONCAKE; + TLLM_LOG_INFO("Enable MOONCAKE KV cache transport."); + } else if (common::getEnvUseMPIKvCache()) { backendType = executor::CacheTransceiverConfig::BackendType::MPI; @@ -203,9 +208,15 @@ CacheTransceiver::CacheTransceiver(kv_cache_manager::BaseKVCacheManager* cacheMa else if (backendType.value() == executor::CacheTransceiverConfig::BackendType::NIXL) { mManager = std::make_unique( - mCacheTransBufferManagerPtrs, *mCacheState); + mCacheTransBufferManagerPtrs, *mCacheState, "nixl"); TLLM_LOG_INFO("NIXL Connection Manager created"); } + else if (backendType.value() == executor::CacheTransceiverConfig::BackendType::MOONCAKE) + { + mManager = std::make_unique( + mCacheTransBufferManagerPtrs, *mCacheState, "mooncake"); + TLLM_LOG_INFO("MOONCAKE Connection Manager created"); + } else if (backendType.value() == executor::CacheTransceiverConfig::BackendType::MPI) { mMpiWorldComm = std::addressof(tensorrt_llm::mpi::MpiComm::world()); diff --git a/cpp/tensorrt_llm/common/envUtils.cpp b/cpp/tensorrt_llm/common/envUtils.cpp index fc85975acb..4a082a4ff3 100644 --- a/cpp/tensorrt_llm/common/envUtils.cpp +++ b/cpp/tensorrt_llm/common/envUtils.cpp @@ -281,6 +281,12 @@ bool getEnvUseNixlKvCache() return useNixlKvCache; } +bool getEnvUseMooncakeKvCache() +{ + static bool const useMooncakeKvCache = getBoolEnv("TRTLLM_USE_MOONCAKE_KVCACHE"); + return useMooncakeKvCache; +} + bool getEnvUseRoundRobinBlockDistForCP() { static bool const useRoundRobinBlockDistForCP = getBoolEnv("TRTLLM_USE_ROUND_ROBIN_BLOCK_DIST_FOR_CP"); @@ -343,6 +349,23 @@ std::string getEnvNixlBackend() return nixlBackend; } +std::string getEnvMooncakeInterface() +{ + static std::once_flag flag; + static std::string mooncakeInterface; + + std::call_once(flag, + [&]() + { + char const* mooncake_interface = std::getenv("TRTLLM_MOONCAKE_INTERFACE"); + if (mooncake_interface) + { + mooncakeInterface = mooncake_interface; + } + }); + return mooncakeInterface; +} + bool getEnvDisaggLayerwise() { static bool const disaggLayerwise = getBoolEnv("TRTLLM_DISAGG_LAYERWISE"); diff --git a/cpp/tensorrt_llm/common/envUtils.h b/cpp/tensorrt_llm/common/envUtils.h index 8a3af2458d..f838f0e9ae 100644 --- a/cpp/tensorrt_llm/common/envUtils.h +++ b/cpp/tensorrt_llm/common/envUtils.h @@ -83,8 +83,11 @@ inline void launchWithPdlWhenEnabled(char const* name, KernelFn kernelFn, dim3 g bool getEnvUseUCXKvCache(); bool getEnvUseMPIKvCache(); + bool getEnvUseNixlKvCache(); +bool getEnvUseMooncakeKvCache(); + bool getEnvUseRoundRobinBlockDistForCP(); std::string getEnvUCXInterface(); @@ -93,6 +96,8 @@ std::string getEnvNixlInterface(); std::string getEnvNixlBackend(); +std::string getEnvMooncakeInterface(); + bool getEnvDisaggLayerwise(); bool getEnvParallelCacheSend(); diff --git a/cpp/tensorrt_llm/common/ipUtils.cpp b/cpp/tensorrt_llm/common/ipUtils.cpp new file mode 100644 index 0000000000..e4e9767194 --- /dev/null +++ b/cpp/tensorrt_llm/common/ipUtils.cpp @@ -0,0 +1,226 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & 
AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "ipUtils.h" +#include "tensorrt_llm/common/logger.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +TRTLLM_NAMESPACE_BEGIN + +namespace common +{ + +std::string getLocalIpByNic(std::string const& interface, int rank) +{ + struct ifaddrs* ifaddr = nullptr; + if (getifaddrs(&ifaddr) == -1) + { + TLLM_LOG_ERROR(rank, + "getLocalIpByNic: Can't get local ip from NIC Interface. Please check whether corresponding INTERFACE is " + "set " + "correctly."); + return std::string{}; + } + + for (struct ifaddrs* ifa = ifaddr; ifa != nullptr; ifa = ifa->ifa_next) + { + if (ifa->ifa_addr == nullptr) + { + continue; + } + + if (ifa->ifa_name == interface) + { + if (ifa->ifa_addr->sa_family == AF_INET) + { + char ip[INET_ADDRSTRLEN]{}; + void* addr = &((reinterpret_cast(ifa->ifa_addr))->sin_addr); + if ((inet_ntop(AF_INET, addr, ip, sizeof(ip)) != nullptr) && std::strcmp(ip, "0.0.0.0") != 0) + { + freeifaddrs(ifaddr); + return std::string(ip); + } + } + else if (ifa->ifa_addr->sa_family == AF_INET6) + { + char ip[INET6_ADDRSTRLEN]{}; + void* addr = &((reinterpret_cast(ifa->ifa_addr))->sin6_addr); + if ((inet_ntop(AF_INET6, addr, ip, sizeof(ip)) != nullptr) && std::strncmp(ip, "fe80::", 6) != 0 + && std::strcmp(ip, "::1") != 0) + { + freeifaddrs(ifaddr); + return std::string(ip); + } + } + } + } + + freeifaddrs(ifaddr); + TLLM_LOG_ERROR( + rank, "Can't get local ip from NIC Interface. 
Please check whether corresponding INTERFACE is set correctly."); + return std::string{}; +} + +std::string getLocalIpByHostname(int rank) +{ + char hostname[256]{}; + if (gethostname(hostname, sizeof(hostname)) == -1) + { + TLLM_LOG_ERROR(rank, "getLocalIpByHostname: Can't get hostname"); + return std::string{}; + } + + struct addrinfo hints = {}; + hints.ai_family = AF_UNSPEC; + hints.ai_socktype = SOCK_STREAM; + hints.ai_flags = AI_CANONNAME; + + struct addrinfo* res = nullptr; + if (getaddrinfo(hostname, nullptr, &hints, &res) != 0) + { + TLLM_LOG_WARNING(rank, "getLocalIpByHostname: Can't get address info for hostname"); + return std::string{}; + } + + for (struct addrinfo* p = res; p != nullptr; p = p->ai_next) + { + + if (p->ai_family == AF_INET) + { // IPv4 + char ip[INET_ADDRSTRLEN]{}; + struct sockaddr_in* ipv4 = reinterpret_cast(p->ai_addr); + void* addr = &(ipv4->sin_addr); + if ((inet_ntop(AF_INET, addr, ip, sizeof(ip)) != nullptr) && std::strcmp(ip, "127.0.0.1") != 0 + && std::strcmp(ip, "0.0.0.0") != 0) + { + freeaddrinfo(res); + return std::string(ip); + } + } + else if (p->ai_family == AF_INET6) + { // IPv6 + char ip[INET6_ADDRSTRLEN]{}; + struct sockaddr_in6* ipv6 = reinterpret_cast(p->ai_addr); + void* addr = &(ipv6->sin6_addr); + if ((inet_ntop(AF_INET6, addr, ip, sizeof(ip)) != nullptr) && std::strncmp(ip, "fe80::", 6) != 0 + && std::strcmp(ip, "::1") != 0) + { + freeaddrinfo(res); + return std::string(ip); + } + } + } + + freeaddrinfo(res); + TLLM_LOG_WARNING(rank, "getLocalIpByHostname: Can't get local ip from hostname"); + return std::string{}; +} + +std::string getLocalIpByRemoteOrHostName(int rank) +{ + + // Try IPv4 + struct sockaddr_in addr + { + }; + + addr.sin_family = AF_INET; + addr.sin_port = htons(80); + // using google's public dns server to get the local ip which can be accessed from remote + char const* dns_ip_v4 = "8.8.8.8"; + inet_pton(AF_INET, dns_ip_v4, &addr.sin_addr); + + int sock = socket(AF_INET, SOCK_DGRAM, 0); + if (sock != -1) + { + if (connect(sock, reinterpret_cast(&addr), sizeof(addr)) != -1) + { + socklen_t addr_len = sizeof(addr); + if (getsockname(sock, reinterpret_cast(&addr), &addr_len) != -1) + { + char ip[INET_ADDRSTRLEN]{}; + inet_ntop(AF_INET, &addr.sin_addr, ip, sizeof(ip)); + close(sock); + return std::string(ip); + } + } + close(sock); + } + + // Try IPv6 + struct sockaddr_in6 addr6 + { + }; + + addr6.sin6_family = AF_INET6; + addr6.sin6_port = htons(80); + // using google's public dns server + char const* dns_ipv6 = "2001:4860:4860::8888"; + inet_pton(AF_INET6, dns_ipv6, &addr6.sin6_addr); + + sock = socket(AF_INET6, SOCK_DGRAM, 0); + if (sock != -1) + { + if (connect(sock, reinterpret_cast(&addr6), sizeof(addr6)) != -1) + { + socklen_t addr_len = sizeof(addr6); + if (getsockname(sock, reinterpret_cast(&addr6), &addr_len) != -1) + { + char ip[INET6_ADDRSTRLEN]{}; + inet_ntop(AF_INET6, &addr6.sin6_addr, ip, sizeof(ip)); + close(sock); + return std::string(ip); + } + } + close(sock); + } + + // Try hostname + return getLocalIpByHostname(rank); +} + +std::string getLocalIp(std::string interface, int rank) +{ + std::string localIP = {}; + if (!interface.empty()) + { + localIP = getLocalIpByNic(interface, rank); + } + if (localIP.empty()) + { + localIP = getLocalIpByRemoteOrHostName(rank); + } + // check whether the localIP is valid + if (localIP.empty()) + { + TLLM_THROW("getLocalIp: Can't get local ip"); + } + return localIP; +} +} // namespace common + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/common/ipUtils.h 
b/cpp/tensorrt_llm/common/ipUtils.h new file mode 100644 index 0000000000..9e8081683d --- /dev/null +++ b/cpp/tensorrt_llm/common/ipUtils.h @@ -0,0 +1,28 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "tensorrt_llm/common/config.h" +#include + +TRTLLM_NAMESPACE_BEGIN + +namespace common +{ +std::string getLocalIp(std::string interface, int rank); +} // namespace common + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/executor/CMakeLists.txt b/cpp/tensorrt_llm/executor/CMakeLists.txt index e0e91d4b99..6639b58275 100644 --- a/cpp/tensorrt_llm/executor/CMakeLists.txt +++ b/cpp/tensorrt_llm/executor/CMakeLists.txt @@ -91,3 +91,4 @@ target_compile_definitions(${EXECUTOR_STATIC_TARGET} add_subdirectory(cache_transmission/ucx_utils) add_subdirectory(cache_transmission/nixl_utils) +add_subdirectory(cache_transmission/mooncake_utils) diff --git a/cpp/tensorrt_llm/executor/cache_transmission/agent_utils/connection.cpp b/cpp/tensorrt_llm/executor/cache_transmission/agent_utils/connection.cpp index 9a3bb98a91..ee8e8e21b3 100644 --- a/cpp/tensorrt_llm/executor/cache_transmission/agent_utils/connection.cpp +++ b/cpp/tensorrt_llm/executor/cache_transmission/agent_utils/connection.cpp @@ -236,7 +236,7 @@ bool AgentConnection::recvReadySignal(DataContext const& ctx) const AgentConnectionManager::AgentConnectionManager( std::vector cacheTransBufferManagers, - CacheState cacheState) + CacheState cacheState, std::string const& backendType) : mCacheState(std::move(cacheState)) , mCacheTransBufferManagers(std::move(cacheTransBufferManagers)) , mRegMemDescs(MemoryType::kVRAM, {}) @@ -247,7 +247,7 @@ AgentConnectionManager::AgentConnectionManager( mAgentName = genUniqueAgentName(); // Create Agent BaseAgentConfig config{mAgentName, true}; - m_Agent = makeTransferAgent("nixl", &config); + m_Agent = makeTransferAgent(backendType, &config); TLLM_CHECK(!mCacheTransBufferManagers.empty()); std::vector memDescs; for (auto* cacheTransBufferManager : mCacheTransBufferManagers) diff --git a/cpp/tensorrt_llm/executor/cache_transmission/agent_utils/connection.h b/cpp/tensorrt_llm/executor/cache_transmission/agent_utils/connection.h index d5a780bf45..6b8bd875e4 100644 --- a/cpp/tensorrt_llm/executor/cache_transmission/agent_utils/connection.h +++ b/cpp/tensorrt_llm/executor/cache_transmission/agent_utils/connection.h @@ -277,7 +277,7 @@ class AgentConnectionManager : public ConnectionManager public: AgentConnectionManager( std::vector cacheTransBufferManagers, - CacheState cacheState); + CacheState cacheState, std::string const& backendType); ~AgentConnectionManager(); AgentConnection* recvConnect(DataContext const& ctx, void* data, size_t size) override; [[nodiscard]] std::vector getConnections(CommState const& state) override; diff --git a/cpp/tensorrt_llm/executor/cache_transmission/mooncake_utils/CMakeLists.txt 
b/cpp/tensorrt_llm/executor/cache_transmission/mooncake_utils/CMakeLists.txt new file mode 100644 index 0000000000..105d3b93f1 --- /dev/null +++ b/cpp/tensorrt_llm/executor/cache_transmission/mooncake_utils/CMakeLists.txt @@ -0,0 +1,40 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023-2025 NVIDIA CORPORATION & +# AFFILIATES. All rights reserved. SPDX-License-Identifier: NVIDIA TensorRT +# Source Code License Agreement +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related documentation +# and any modifications thereto. Any use, reproduction, disclosure or +# distribution of this material and related documentation without an express +# license agreement from NVIDIA CORPORATION or its affiliates is strictly +# prohibited. + +# MOONCAKE is not supported on Rocky8 for now +set(IS_ROCKY8 FALSE) +if(EXISTS "/etc/redhat-release") + set(IS_ROCKY8 TRUE) +endif() + +if(MOONCAKE_ROOT AND NOT IS_ROCKY8) + find_library(TRANSFER_ENGINE_LIB transfer_engine ${MOONCAKE_ROOT}/lib) + find_path(TRANSFER_ENGINE_INCLUDE_DIR transfer_engine_c.h + ${MOONCAKE_ROOT}/include) + + message(STATUS "Find transfer engine results:") + message(STATUS " TRANSFER_ENGINE_LIB = ${TRANSFER_ENGINE_LIB}") + message( + STATUS " TRANSFER_ENGINE_INCLUDE_DIR = ${TRANSFER_ENGINE_INCLUDE_DIR}") + + if(TRANSFER_ENGINE_LIB AND TRANSFER_ENGINE_INCLUDE_DIR) + set(MOONCAKE_WRAPPER_TARGET "tensorrt_llm_mooncake_wrapper") + + add_library(${MOONCAKE_WRAPPER_TARGET} SHARED transferAgent.cpp) + target_compile_options(${MOONCAKE_WRAPPER_TARGET} PRIVATE -Wno-error) + + target_include_directories(${MOONCAKE_WRAPPER_TARGET} + PRIVATE ${TRANSFER_ENGINE_INCLUDE_DIR}) + + target_link_libraries(${MOONCAKE_WRAPPER_TARGET} + PRIVATE ${TRANSFER_ENGINE_LIB} CUDA::cudart) + endif() +endif() diff --git a/cpp/tensorrt_llm/executor/cache_transmission/mooncake_utils/transferAgent.cpp b/cpp/tensorrt_llm/executor/cache_transmission/mooncake_utils/transferAgent.cpp new file mode 100644 index 0000000000..eabbca98c3 --- /dev/null +++ b/cpp/tensorrt_llm/executor/cache_transmission/mooncake_utils/transferAgent.cpp @@ -0,0 +1,546 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "tensorrt_llm/executor/cache_transmission/mooncake_utils/transferAgent.h" +#include "tensorrt_llm/common/envUtils.h" +#include "tensorrt_llm/common/ipUtils.h" +#include "tensorrt_llm/common/logger.h" +#include "tensorrt_llm/executor/transferAgent.h" +#include "tensorrt_llm/runtime/utils/mpiUtils.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace tensorrt_llm::executor::kv_cache +{ + +MooncakeTransferStatus::MooncakeTransferStatus(transfer_engine_t engine, uint64_t batchId, size_t requestCount) + : mEngine{engine} + , mBatchId{batchId} + , mRequestCount{requestCount} +{ + TLLM_CHECK(mEngine); +} + +void MooncakeTransferStatus::wait() const +{ + while (!isCompleted()) + { + std::this_thread::sleep_for(std::chrono::milliseconds(1)); + } +} + +[[nodiscard]] bool MooncakeTransferStatus::isCompleted() const +{ + if (mBatchFreed) + { + return true; + } + + bool has_failed = false; + for (size_t index = 0; index < mRequestCount; ++index) + { + transfer_status_t status; + int rc = getTransferStatus(mEngine, mBatchId, index, &status); + if (rc || status.status == STATUS_FAILED) + { + has_failed = true; + if (rc) + { + TLLM_LOG_ERROR( + "Failed to get transfer status for batch %lu, task %zu: error code %d", mBatchId, index, rc); + } + else + { + TLLM_LOG_ERROR("Transfer failed for batch %lu, task %zu: status %d", mBatchId, index, status.status); + } + } + else if (status.status == STATUS_PENDING || status.status == STATUS_WAITING) + { + TLLM_LOG_DEBUG("Transfer is pending for batch %lu, task %zu", mBatchId, index); + return false; + } + } + if (!has_failed) + { + // Each batchId has the batch size, and cannot process more requests + // than the batch size. So, free the batch id here to workaround the issue + // where the same batchId could be used to post multiple transfer. + freeBatchID(mEngine, mBatchId); + mBatchFreed = true; + TLLM_LOG_DEBUG("Batch ID %lu freed, future calls will return true directly", mBatchId); + } + // Currently, we cannot distinguish between failed and completed from return value. 
+ TLLM_LOG_DEBUG("Transfer is completed for batch %lu", mBatchId); + return true; +} + +const std::string MooncakeBase64Helper::STANDARD_CHARS + = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" + "abcdefghijklmnopqrstuvwxyz" + "0123456789+/"; + +std::string MooncakeBase64Helper::encode(std::vector const& data) +{ + return encodeInternal(data, STANDARD_CHARS); +} + +std::string MooncakeBase64Helper::encode(std::string const& data) +{ + std::vector vec(data.begin(), data.end()); + return encode(vec); +} + +std::vector MooncakeBase64Helper::decode(std::string const& encoded) +{ + return decodeInternal(encoded, STANDARD_CHARS); +} + +std::string MooncakeBase64Helper::decodeToString(std::string const& encoded) +{ + auto vec = decode(encoded); + return std::string(vec.begin(), vec.end()); +} + +std::string MooncakeBase64Helper::encodeInternal(std::vector const& data, std::string const& chars) +{ + std::string encoded; + size_t i = 0; + size_t j = 0; + std::array charArray3{}; + std::array charArray4{}; + size_t dataLen = data.size(); + uint8_t const* bytes = data.data(); + + while (dataLen--) + { + charArray3[i++] = *(bytes++); + if (i == 3) + { + charArray4[0] = (charArray3[0] & 0xfc) >> 2; + charArray4[1] = ((charArray3[0] & 0x03) << 4) + ((charArray3[1] & 0xf0) >> 4); + charArray4[2] = ((charArray3[1] & 0x0f) << 2) + ((charArray3[2] & 0xc0) >> 6); + charArray4[3] = charArray3[2] & 0x3f; + + for (i = 0; i < 4; i++) + { + encoded += chars[charArray4[i]]; + } + i = 0; + } + } + + if (i > 0) + { + for (j = i; j < 3; j++) + { + charArray3[j] = '\0'; + } + + charArray4[0] = (charArray3[0] & 0xfc) >> 2; + charArray4[1] = ((charArray3[0] & 0x03) << 4) + ((charArray3[1] & 0xf0) >> 4); + charArray4[2] = ((charArray3[1] & 0x0f) << 2) + ((charArray3[2] & 0xc0) >> 6); + charArray4[3] = charArray3[2] & 0x3f; + + for (j = 0; j < i + 1; j++) + { + encoded += chars[charArray4[j]]; + } + + while (i++ < 3) + { + encoded += '='; + } + } + + return encoded; +} + +std::vector MooncakeBase64Helper::decodeInternal(std::string const& encoded, std::string const& chars) +{ + size_t encodedLen = encoded.size(); + size_t i = 0; + size_t j = 0; + size_t in_ = 0; + std::array charArray3{}; + std::array charArray4{}; + std::vector decoded; + + std::string cleanEncoded; + for (char c : encoded) + { + if (!isWhitespace(c)) + { + cleanEncoded += c; + } + } + + encodedLen = cleanEncoded.size(); + + while (encodedLen-- && cleanEncoded[in_] != '=' && isBase64(cleanEncoded[in_], chars)) + { + charArray4[i++] = cleanEncoded[in_]; + in_++; + if (i == 4) + { + for (i = 0; i < 4; i++) + { + charArray4[i] = chars.find(charArray4[i]); + } + + charArray3[0] = (charArray4[0] << 2) + ((charArray4[1] & 0x30) >> 4); + charArray3[1] = ((charArray4[1] & 0xf) << 4) + ((charArray4[2] & 0x3c) >> 2); + charArray3[2] = ((charArray4[2] & 0x3) << 6) + charArray4[3]; + + for (i = 0; i < 3; i++) + { + decoded.push_back(charArray3[i]); + } + i = 0; + } + } + + if (i > 0) + { + for (j = i; j < 4; j++) + { + charArray4[j] = 0; + } + + for (j = 0; j < 4; j++) + { + charArray4[j] = chars.find(charArray4[j]); + } + + charArray3[0] = (charArray4[0] << 2) + ((charArray4[1] & 0x30) >> 4); + charArray3[1] = ((charArray4[1] & 0xf) << 4) + ((charArray4[2] & 0x3c) >> 2); + charArray3[2] = ((charArray4[2] & 0x3) << 6) + charArray4[3]; + + for (j = 0; j < i - 1; j++) + { + decoded.push_back(charArray3[j]); + } + } + + return decoded; +} + +bool MooncakeBase64Helper::isBase64(uint8_t c, std::string const& chars) +{ + return (isalnum(c) || (c == chars[62]) || (c == chars[63])); +} + 
+bool MooncakeBase64Helper::isWhitespace(uint8_t c) +{ + return (c == ' ' || c == '\n' || c == '\r' || c == '\t'); +} + +MooncakeTransferAgent::MooncakeTransferAgent(BaseAgentConfig const& config) +{ + mLocalAgentName = config.mName; + std::string segmentName = "127.0.0.1"; + + if (getenv("TLLM_MOONCAKE_IP_ADDR")) + { + segmentName = std::string(getenv("TLLM_MOONCAKE_IP_ADDR")); + } + else + { + auto ip = common::getLocalIp(common::getEnvMooncakeInterface(), mpi::MpiComm::session().getRank()); + if (!ip.empty()) + segmentName = ip; + } + + mEngine = createTransferEngine("P2PHANDSHAKE", segmentName.c_str(), "", 0, true); +} + +void MooncakeTransferAgent::registerMemory(RegisterDescs const& descs) +{ + TLLM_LOG_DEBUG("MooncakeTransferAgent::registerMemory"); + + std::lock_guard lock(mMutex); + for (auto const& desc : descs.getDescs()) + { + auto it = mMemRegInfo.find(desc.getAddr()); + if (it != mMemRegInfo.end()) + { + it->second->addRef(); + continue; + } + + int err = registerLocalMemory(mEngine, reinterpret_cast(desc.getAddr()), desc.getLen(), "*", 1); + + TLLM_CHECK_WITH_INFO(err == 0, "registerLocalMemory failed, addr: %p, len: %lu", + reinterpret_cast(desc.getAddr()), desc.getLen()); + + auto mooncakeDesc = std::make_shared(desc); + mMemRegInfo[desc.getAddr()] = std::move(mooncakeDesc); + } +} + +void MooncakeTransferAgent::deregisterMemory(RegisterDescs const& descs) +{ + TLLM_LOG_DEBUG("MooncakeTransferAgent::deregisterMemory"); + + std::lock_guard lock(mMutex); + for (auto const& desc : descs.getDescs()) + { + auto it = mMemRegInfo.find(desc.getAddr()); + if (it != mMemRegInfo.end()) + { + auto const& mooncakeDesc = it->second; + mooncakeDesc->releaseRef(); + if (mooncakeDesc->getRefCount()) + continue; + + int err = unregisterLocalMemory(mEngine, reinterpret_cast(desc.getAddr())); + + TLLM_CHECK_WITH_INFO( + err == 0, "unregisterLocalMemory failed, addr: %p", reinterpret_cast(desc.getAddr())); + + mMemRegInfo.erase(desc.getAddr()); + } + } +} + +void MooncakeTransferAgent::loadRemoteAgent(std::string const& name, AgentDesc const& agentDesc) +{ + TLLM_LOG_DEBUG("MooncakeTransferAgent::loadRemoteAgent"); + + // Do the same thing as loadRemoteAgent(std::string const& name, ConnectionInfoType const& connectionInfo) + loadRemoteAgent(name, std::move(agentDesc.getBackendAgentDesc())); +} + +void MooncakeTransferAgent::loadRemoteAgent(std::string const& name, ConnectionInfoType const& connectionInfo) +{ + TLLM_LOG_DEBUG(mpi::MpiComm::world().getRank(), + "MooncakeTransferAgent::loadRemoteAgent loadRemoteAgent to %s remoteagent name: %s", connectionInfo.c_str(), + name.c_str()); + + std::lock_guard lock(mMutex); + auto segmentId = openSegment(mEngine, connectionInfo.c_str()); + + TLLM_CHECK_WITH_INFO( + segmentId >= 0, "loadRemoteAgent openSegment failed, connectionInfo: %s", connectionInfo.c_str()); + + mConnectedAgents[name].segmentId = segmentId; +} + +void MooncakeTransferAgent::invalidateRemoteAgent(std::string const& name) +{ + TLLM_LOG_DEBUG("MooncakeTransferAgent::invalidateRemoteAgent"); +} + +AgentDesc MooncakeTransferAgent::getLocalAgentDesc() +{ + TLLM_LOG_DEBUG("MooncakeTransferAgent::getLocalAgentDesc"); + + // Using connection info as agent desc + const static size_t kBufLen = 64; + char connectionInfo[kBufLen]; + + int ret = getLocalIpAndPort(mEngine, connectionInfo, kBufLen); + + TLLM_CHECK_WITH_INFO(ret == 0, "MooncakeTransferAgent::getLocalAgentDesc::getLocalIpAndPort failed"); + + return AgentDesc{std::string(connectionInfo)}; +} + +ConnectionInfoType 
MooncakeTransferAgent::getLocalConnectionInfo() +{ + TLLM_LOG_DEBUG("MooncakeTransferAgent::getLocalConnectionInfo"); + + const static size_t kBufLen = 64; + char connectionInfo[kBufLen]; + + int ret = getLocalIpAndPort(mEngine, connectionInfo, kBufLen); + + TLLM_CHECK_WITH_INFO(ret == 0, "MooncakeTransferAgent::getLocalAgentDesc::getLocalConnectionInfo failed"); + + return std::string(connectionInfo); +} + +[[nodiscard]] std::unique_ptr MooncakeTransferAgent::submitTransferRequests( + TransferRequest const& request) +{ + TLLM_LOG_DEBUG("MooncakeTransferAgent::submitTransferRequests"); + + bool hasNotif = false; + std::string syncMessage; + + if (request.getSyncMessage().has_value()) + { + hasNotif = true; + syncMessage = request.getSyncMessage().value(); + } + + const static size_t kMaxRequestCount = 1024; + uint64_t batchId = allocateBatchID(mEngine, kMaxRequestCount); + + TLLM_CHECK_WITH_INFO(batchId != INVALID_BATCH, "allocateBatchID failed"); + + int segmentId; + { + std::lock_guard lock(mMutex); + std::string remoteName = request.getRemoteName(); + + auto it = mConnectedAgents.find(remoteName); + if (it == mConnectedAgents.end()) + { + std::string error = "Remote agent " + remoteName + "not found"; + TLLM_THROW(error); + } + + auto const& agentInfo = it->second; + segmentId = agentInfo.segmentId; + } + + auto localDescs = request.getSrcDescs().getDescs(); + auto remoteDescs = request.getDstDescs().getDescs(); + + TLLM_CHECK_WITH_INFO(localDescs.size() == remoteDescs.size(), "Number of local and remote memory must match"); + + size_t requestCount = localDescs.size(); + std::vector transferRequests(requestCount); + + for (size_t index = 0; index < requestCount; ++index) + { + TLLM_CHECK_WITH_INFO( + localDescs[index].getLen() == remoteDescs[index].getLen(), "Length of local and remote memory must match"); + + transferRequests[index].opcode = (request.getOp() == TransferOp::kREAD) ? 
OPCODE_READ : OPCODE_WRITE; + transferRequests[index].source = reinterpret_cast(localDescs[index].getAddr()); + transferRequests[index].target_offset = remoteDescs[index].getAddr(); + transferRequests[index].length = localDescs[index].getLen(); + transferRequests[index].target_id = segmentId; + } + + int rc = 0; + if (hasNotif) + { + notify_msg_t notifyMsg; + notifyMsg.name = const_cast(mLocalAgentName.c_str()); + notifyMsg.msg = const_cast(syncMessage.c_str()); + rc = submitTransferWithNotify(mEngine, batchId, transferRequests.data(), requestCount, notifyMsg); + } + else + { + rc = submitTransfer(mEngine, batchId, transferRequests.data(), requestCount); + } + + TLLM_CHECK_WITH_INFO(rc == 0, "submitTransfer failed with status: %d", rc); + + return std::make_unique(mEngine, batchId, requestCount); +} + +void MooncakeTransferAgent::notifySyncMessage(std::string const& name, SyncMessage const& syncMessage) +{ + TLLM_LOG_DEBUG("MooncakeTransferAgent::notifySyncMessage"); + int segmentId; + { + std::lock_guard lock(mMutex); + auto it = mConnectedAgents.find(name); + + if (it == mConnectedAgents.end()) + { + TLLM_LOG_WARNING("Remote agent %s not found", name.c_str()); + return; + } + + auto const& agentInfo = it->second; + segmentId = agentInfo.segmentId; + } + + notify_msg_t notifyMsg; + notifyMsg.name = const_cast(mLocalAgentName.c_str()); + std::string encoded = MooncakeBase64Helper::encode(syncMessage); + notifyMsg.msg = const_cast(encoded.c_str()); + + TLLM_LOG_DEBUG("MooncakeTransferAgent::notifySyncMessage notifyMsg.name: %s, notifyMsg.msg: %s", notifyMsg.name, + notifyMsg.msg); + + int ret = genNotifyInEngine(mEngine, segmentId, notifyMsg); + + TLLM_CHECK_WITH_INFO(ret == 0, "genNotifyInEngine failed with status: %d", ret); +} + +[[nodiscard]] std::unordered_map> MooncakeTransferAgent::getNotifiedSyncMessages() +{ + std::unordered_map> notifs; + int size = 0; + + notify_msg_t* notifyMsgs = getNotifsFromEngine(mEngine, &size); + + TLLM_CHECK_WITH_INFO(size >= 0, "getNotifsFromEngine returned negative size: %d", size); + + for (int i = 0; i < size; i++) + { + if (notifyMsgs[i].msg == nullptr) + { + TLLM_LOG_WARNING("Message pointer is null for: %s", notifyMsgs[i].name); + continue; + } + + std::string decoded = MooncakeBase64Helper::decodeToString(notifyMsgs[i].msg); + notifs[notifyMsgs[i].name].emplace_back(std::move(decoded)); + + TLLM_LOG_DEBUG("MooncakeTransferAgent::getNotifiedSyncMessages getNotifsFromEngine: %s, %s", notifyMsgs[i].name, + notifyMsgs[i].msg); + } + + freeNotifsMsgBuf(notifyMsgs, size); + return notifs; +} + +bool MooncakeTransferAgent::checkRemoteDescs(std::string const& name, MemoryDescs const& memoryDescs) +{ + TLLM_LOG_DEBUG("MooncakeTransferAgent::checkRemoteDescs"); + return true; +} + +MooncakeTransferAgent::~MooncakeTransferAgent() +{ + destroyTransferEngine(mEngine); + TLLM_LOG_DEBUG("MooncakeTransferAgent::~MooncakeTransferAgent"); +} + +#if defined(__clang__) +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wreturn-type-c-linkage" +#endif + +extern "C" +{ + std::unique_ptr createMooncakeTransferAgent(BaseAgentConfig const* config) + { + TLLM_CHECK(config); + return std::make_unique(*config); + } +} + +} // namespace tensorrt_llm::executor::kv_cache diff --git a/cpp/tensorrt_llm/executor/cache_transmission/mooncake_utils/transferAgent.h b/cpp/tensorrt_llm/executor/cache_transmission/mooncake_utils/transferAgent.h new file mode 100644 index 0000000000..0aeeedeae1 --- /dev/null +++ 
b/cpp/tensorrt_llm/executor/cache_transmission/mooncake_utils/transferAgent.h @@ -0,0 +1,165 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include + +#include "tensorrt_llm/executor/transferAgent.h" +#include "transfer_engine_c.h" + +namespace tensorrt_llm::executor::kv_cache +{ + +class MooncakeTransferStatus final : public TransferStatus +{ +public: + MooncakeTransferStatus(transfer_engine_t engine, uint64_t batchId, size_t requestCount); + + [[nodiscard]] bool isCompleted() const override; + + void wait() const override; + +private: + transfer_engine_t mEngine; + uint64_t mBatchId; + size_t mRequestCount; + mutable bool mBatchFreed = false; +}; + +class MooncakeMemoryDesc +{ +public: + MooncakeMemoryDesc(MemoryDesc desc) + : mDesc{std::move(desc)} + , mRefCnt{0} + { + } + + MooncakeMemoryDesc(MooncakeMemoryDesc const& other) + : mDesc{other.mDesc} + , mRefCnt{0} + { + } + + MooncakeMemoryDesc& operator=(MooncakeMemoryDesc const&) = delete; + + ~MooncakeMemoryDesc() = default; + + void addRef() noexcept + { + ++mRefCnt; + } + + int releaseRef() noexcept + { + return --mRefCnt; + } + + int getRefCount() const noexcept + { + return mRefCnt; + } + + MemoryDesc const& getDesc() const noexcept + { + return mDesc; + } + +private: + MemoryDesc mDesc; + int mRefCnt; +}; + +class MooncakeBase64Helper +{ +public: + static std::string encode(std::vector const& data); + static std::string encode(std::string const& data); + + static std::vector decode(std::string const& encoded); + static std::string decodeToString(std::string const& encoded); + +private: + static const std::string STANDARD_CHARS; + + static std::string encodeInternal(std::vector const& data, std::string const& chars); + static std::vector decodeInternal(std::string const& encoded, std::string const& chars); + + static inline bool isBase64(uint8_t c, std::string const& chars); + static inline bool isWhitespace(uint8_t c); +}; + +class MooncakeTransferAgent final : public BaseTransferAgent +{ +public: + MooncakeTransferAgent(BaseAgentConfig const& config); + ~MooncakeTransferAgent(); + + void registerMemory(RegisterDescs const& descs) override; + + void deregisterMemory(RegisterDescs const& descs) override; + + void loadRemoteAgent(std::string const& name, AgentDesc const& agentDesc) override; + + void loadRemoteAgent(std::string const& name, ConnectionInfoType const& connectionInfo) override; + + void invalidateRemoteAgent(std::string const& name) override; + + AgentDesc getLocalAgentDesc() override; + + ConnectionInfoType getLocalConnectionInfo() override; + + [[nodiscard]] std::unique_ptr submitTransferRequests(TransferRequest const& request) override; + + void notifySyncMessage(std::string const& name, SyncMessage const& syncMessage) override; + + [[nodiscard]] std::unordered_map> getNotifiedSyncMessages() override; + + 
bool checkRemoteDescs(std::string const& name, MemoryDescs const& memoryDescs) override; + +private: + struct AgentInfo + { + int segmentId; + }; + + mutable std::mutex mMutex; + transfer_engine_t mEngine; + std::unordered_map> mMemRegInfo; + std::unordered_map mConnectedAgents; + std::string mLocalAgentName; +}; + +#if defined(__clang__) +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wreturn-type-c-linkage" +#endif + +extern "C" +{ + [[nodiscard]] std::unique_ptr createMooncakeTransferAgent(BaseAgentConfig const* config); +} + +#if defined(__clang__) +#pragma clang diagnostic pop +#endif + +} // namespace tensorrt_llm::executor::kv_cache diff --git a/cpp/tensorrt_llm/kernels/indexerTopK.cu b/cpp/tensorrt_llm/kernels/indexerTopK.cu index 740e83f0bb..b9a3849494 100644 --- a/cpp/tensorrt_llm/kernels/indexerTopK.cu +++ b/cpp/tensorrt_llm/kernels/indexerTopK.cu @@ -606,8 +606,8 @@ static __global__ __launch_bounds__(kNumThreadsPerBlock) void topKPerRowPrefill( int rowEnd = rowEnds[rowIdx]; // Local pointers to this block - outIndices += rowIdx * topK; - logits += rowIdx * stride0; + outIndices += static_cast(rowIdx) * topK; + logits += static_cast(rowIdx) * stride0; topKPerRowJob( nullptr, logits, rowStart, rowEnd, outIndices, nullptr, stride1, topK); @@ -638,23 +638,23 @@ static __global__ __launch_bounds__(kNumThreadsPerBlock) void topKPerRowDecode(f // Local pointers to this block if constexpr (!multipleBlocksPerRow && !mergeBlocks) { - outIndices += rowIdx * topK; + outIndices += static_cast(rowIdx) * topK; } else if constexpr (multipleBlocksPerRow) { auto const blockSize = rowEnd / gridDim.y; // 16384 / 2 = 8192 rowStart = blockSize * blockIdx.y; // 8192 * 1 = 8192 rowEnd = gridDim.y == blockIdx.y + 1 ? rowEnd : rowStart + blockSize; - outIndices += rowIdx * gridDim.y * topK + blockIdx.y * topK; - outLogits += rowIdx * gridDim.y * topK + blockIdx.y * topK; + outIndices += static_cast(rowIdx) * gridDim.y * topK + blockIdx.y * topK; + outLogits += static_cast(rowIdx) * gridDim.y * topK + blockIdx.y * topK; } else if constexpr (mergeBlocks) { rowEnd = numBlocksToMerge * topK; - indices += rowIdx * numBlocksToMerge * topK; - outIndices += rowIdx * topK; + indices += static_cast(rowIdx) * numBlocksToMerge * topK; + outIndices += static_cast(rowIdx) * topK; } - logits += rowIdx * stride0; + logits += static_cast(rowIdx) * stride0; topKPerRowJob( indices, logits, rowStart, rowEnd, outIndices, outLogits, stride1, topK); diff --git a/cpp/tensorrt_llm/nanobind/executor/executorConfig.cpp b/cpp/tensorrt_llm/nanobind/executor/executorConfig.cpp index bed5db70f7..051586b7fe 100644 --- a/cpp/tensorrt_llm/nanobind/executor/executorConfig.cpp +++ b/cpp/tensorrt_llm/nanobind/executor/executorConfig.cpp @@ -449,6 +449,7 @@ void initConfigBindings(nb::module_& m) .value("MPI", tle::CacheTransceiverConfig::BackendType::MPI) .value("UCX", tle::CacheTransceiverConfig::BackendType::UCX) .value("NIXL", tle::CacheTransceiverConfig::BackendType::NIXL) + .value("MOONCAKE", tle::CacheTransceiverConfig::BackendType::MOONCAKE) .def("from_string", [](std::string const& str) { @@ -460,6 +461,8 @@ void initConfigBindings(nb::module_& m) return tle::CacheTransceiverConfig::BackendType::UCX; if (str == "NIXL" || str == "nixl") return tle::CacheTransceiverConfig::BackendType::NIXL; + if (str == "MOONCAKE" || str == "mooncake") + return tle::CacheTransceiverConfig::BackendType::MOONCAKE; throw std::runtime_error("Invalid backend type: " + str); }); diff --git 
a/cpp/tensorrt_llm/pybind/executor/executorConfig.cpp b/cpp/tensorrt_llm/pybind/executor/executorConfig.cpp index 7919423256..4fe20a6c66 100644 --- a/cpp/tensorrt_llm/pybind/executor/executorConfig.cpp +++ b/cpp/tensorrt_llm/pybind/executor/executorConfig.cpp @@ -431,6 +431,7 @@ void initConfigBindings(pybind11::module_& m) .value("MPI", tle::CacheTransceiverConfig::BackendType::MPI) .value("UCX", tle::CacheTransceiverConfig::BackendType::UCX) .value("NIXL", tle::CacheTransceiverConfig::BackendType::NIXL) + .value("MOONCAKE", tle::CacheTransceiverConfig::BackendType::MOONCAKE) .def("from_string", [](std::string const& str) { @@ -442,6 +443,8 @@ void initConfigBindings(pybind11::module_& m) return tle::CacheTransceiverConfig::BackendType::UCX; if (str == "NIXL" || str == "nixl") return tle::CacheTransceiverConfig::BackendType::NIXL; + if (str == "MOONCAKE" || str == "mooncake") + return tle::CacheTransceiverConfig::BackendType::MOONCAKE; throw std::runtime_error("Invalid backend type: " + str); }); diff --git a/cpp/tests/unit_tests/executor/CMakeLists.txt b/cpp/tests/unit_tests/executor/CMakeLists.txt index de3a694d21..069363c5ed 100644 --- a/cpp/tests/unit_tests/executor/CMakeLists.txt +++ b/cpp/tests/unit_tests/executor/CMakeLists.txt @@ -38,10 +38,31 @@ add_gtest(ucxCommTest ucxCommTest.cpp) target_link_libraries(ucxCommTest PRIVATE ${Python3_LIBRARIES}) target_link_libraries(serializeUtilsTest PRIVATE ${Python3_LIBRARIES}) -if(NIXL_ROOT) - add_gtest(transferAgentTest transferAgentTest.cpp) - add_gtest(agentCommTest agentCommTest.cpp) - target_link_libraries(transferAgentTest PRIVATE tensorrt_llm_nixl_wrapper) - target_link_libraries(agentCommTest PRIVATE tensorrt_llm_nixl_wrapper - ${Python3_LIBRARIES}) +# Skip MOONCAKE related tests on Rocky8 +set(IS_ROCKY8 FALSE) +if(EXISTS "/etc/redhat-release") + set(IS_ROCKY8 TRUE) +endif() + +if(NIXL_ROOT OR (MOONCAKE_ROOT AND NOT IS_ROCKY8)) + add_gtest(agentCommTest agentCommTest.cpp) + add_gtest(transferAgentTest transferAgentTest.cpp) + + if(NIXL_ROOT) + target_link_libraries(transferAgentTest PRIVATE tensorrt_llm_nixl_wrapper) + target_link_libraries(agentCommTest PRIVATE tensorrt_llm_nixl_wrapper + ${Python3_LIBRARIES}) + target_compile_definitions(transferAgentTest PRIVATE TEST_NIXL_BACKEND=1) + target_compile_definitions(agentCommTest PRIVATE TEST_NIXL_BACKEND=1) + endif() + + if(MOONCAKE_ROOT) + target_link_libraries(transferAgentTest + PRIVATE tensorrt_llm_mooncake_wrapper) + target_link_libraries(agentCommTest PRIVATE tensorrt_llm_mooncake_wrapper + ${Python3_LIBRARIES}) + target_compile_definitions(transferAgentTest + PRIVATE TEST_MOONCAKE_BACKEND=1) + target_compile_definitions(agentCommTest PRIVATE TEST_MOONCAKE_BACKEND=1) + endif() endif() diff --git a/cpp/tests/unit_tests/executor/agentCommTest.cpp b/cpp/tests/unit_tests/executor/agentCommTest.cpp index ccd54ab926..025a3a8bc6 100644 --- a/cpp/tests/unit_tests/executor/agentCommTest.cpp +++ b/cpp/tests/unit_tests/executor/agentCommTest.cpp @@ -22,22 +22,54 @@ using namespace tensorrt_llm::batch_manager::kv_cache_manager; using namespace tensorrt_llm::runtime; using namespace tensorrt_llm::executor::kv_cache; -bool needSkipTest(std::string& skipReason) +std::vector getAvailableBackends() +{ + std::vector backends; + +#ifdef TEST_NIXL_BACKEND + backends.push_back("nixl"); +#endif + +#ifdef TEST_MOONCAKE_BACKEND + backends.push_back("mooncake"); +#endif + + return backends; +} + +bool needSkipTest(std::string const& backend, std::string& skipReason) { bool skip = false; try { auto& 
loader = tensorrt_llm::executor::kv_cache::DynLibLoader::getInstance(); - using CreateNixlFuncType = std::unique_ptr (*)( - tensorrt_llm::executor::kv_cache::BaseAgentConfig const*); - auto* func = loader.getFunctionPointer( - "libtensorrt_llm_nixl_wrapper.so", "createNixlTransferAgent"); + if (backend == "nixl") + { + using CreateNixlFuncType = std::unique_ptr (*)( + tensorrt_llm::executor::kv_cache::BaseAgentConfig const*); + auto* func = loader.getFunctionPointer( + "libtensorrt_llm_nixl_wrapper.so", "createNixlTransferAgent"); + } + else if (backend == "mooncake") + { + using CreateMooncakeFuncType = std::unique_ptr (*)( + tensorrt_llm::executor::kv_cache::BaseAgentConfig const*); + auto* func = loader.getFunctionPointer( + "libtensorrt_llm_mooncake_wrapper.so", "createMooncakeTransferAgent"); + } + else + { + skip = true; + skipReason = "Unknown backend: " + backend; + } } catch (std::exception const& e) { std::string error = e.what(); - if (error.find("libtensorrt_llm_nixl_wrapper.so") != std::string::npos) + std::string libName + = (backend == "nixl") ? "libtensorrt_llm_nixl_wrapper.so" : "libtensorrt_llm_mooncake_wrapper.so"; + if (error.find(libName) != std::string::npos) { skip = true; skipReason = error; @@ -46,17 +78,26 @@ bool needSkipTest(std::string& skipReason) return skip; } -class AgentCommTest : public ::testing::Test +class AgentCommTest : public ::testing::TestWithParam { protected: void SetUp() override { + backend = GetParam(); std::string skipReason; - if (needSkipTest(skipReason)) + if (needSkipTest(backend, skipReason)) { GTEST_SKIP() << skipReason; } - setenv("TRTLLM_USE_NIXL_KVCACHE", "1", 1); + + if (backend == "nixl") + { + setenv("TRTLLM_USE_NIXL_KVCACHE", "1", 1); + } + else if (backend == "mooncake") + { + setenv("TRTLLM_USE_MOONCAKE_KVCACHE", "1", 1); + } auto constexpr numLayers = 8; auto constexpr numHeads = 16; @@ -106,15 +147,16 @@ protected: mCacheState.reset(); } + std::string backend; std::unique_ptr mTransBufferManager; std::unique_ptr mCacheManager; std::unique_ptr mCacheState; }; -TEST_F(AgentCommTest, AgentConnectionManagerBasic) +TEST_P(AgentCommTest, AgentConnectionManagerBasic) { std::vector bufferManagers{mTransBufferManager.get()}; - auto connectionManager = std::make_unique(bufferManagers, *mCacheState); + auto connectionManager = std::make_unique(bufferManagers, *mCacheState, backend); ASSERT_TRUE(connectionManager != nullptr); ASSERT_EQ(connectionManager->getCacheTransBufferManagers().size(), bufferManagers.size()); ASSERT_TRUE(connectionManager->getCacheTransBufferManagers().front() != nullptr); @@ -126,11 +168,11 @@ TEST_F(AgentCommTest, AgentConnectionManagerBasic) ASSERT_EQ(commState.getAgentState().size(), 1); } -TEST_F(AgentCommTest, AgentConnectionManagerConnect) +TEST_P(AgentCommTest, AgentConnectionManagerConnect) { std::vector bufferManagers{mTransBufferManager.get()}; - auto connectionManager0 = std::make_unique(bufferManagers, *mCacheState); - auto connectionManager1 = std::make_unique(bufferManagers, *mCacheState); + auto connectionManager0 = std::make_unique(bufferManagers, *mCacheState, backend); + auto connectionManager1 = std::make_unique(bufferManagers, *mCacheState, backend); auto agentName0 = connectionManager0->getAgentName(); auto agentName1 = connectionManager1->getAgentName(); ASSERT_TRUE(!agentName0.empty()); @@ -189,3 +231,6 @@ TEST_F(AgentCommTest, AgentConnectionManagerConnect) } TLLM_LOG_INFO("after finish"); } + +INSTANTIATE_TEST_SUITE_P(AvailableBackends, AgentCommTest, 
::testing::ValuesIn(getAvailableBackends()), + [](::testing::TestParamInfo const& info) { return info.param; }); diff --git a/cpp/tests/unit_tests/executor/transferAgentTest.cpp b/cpp/tests/unit_tests/executor/transferAgentTest.cpp index 0f21449f30..7218611a0e 100644 --- a/cpp/tests/unit_tests/executor/transferAgentTest.cpp +++ b/cpp/tests/unit_tests/executor/transferAgentTest.cpp @@ -22,11 +22,27 @@ #include #include +#include namespace fs = std::filesystem; using namespace tensorrt_llm::executor::kv_cache; +std::vector getAvailableBackends() +{ + std::vector backends; + +#ifdef TEST_NIXL_BACKEND + backends.push_back("nixl"); +#endif + +#ifdef TEST_MOONCAKE_BACKEND + backends.push_back("mooncake"); +#endif + + return backends; +} + class RegisteredHostMemory { public: @@ -54,100 +70,105 @@ private: BaseTransferAgent* mAgentPtr{}; }; -class TransferAgentTest : public ::testing::Test // NOLINT(cppcoreguidelines-pro-type-member-init) +class TransferAgentTest : public ::testing::TestWithParam // NOLINT(cppcoreguidelines-pro-type-member-init) { public: - void SetUp() override {} + void SetUp() override + { + backend = GetParam(); + } void TearDown() override {} [[nodiscard]] std::unique_ptr makeTransferAgent(BaseAgentConfig const& config) { - return tensorrt_llm::executor::kv_cache::makeTransferAgent("nixl", &config); + return tensorrt_llm::executor::kv_cache::makeTransferAgent(backend, &config); } + + std::string backend; }; -TEST_F(TransferAgentTest, Basic) +TEST_P(TransferAgentTest, Basic) { std::string const agent0{"agent0"}, agent1{"agent1"}; BaseAgentConfig config0{agent0, true}, config1{agent1, true}; - auto nixlAgent0 = makeTransferAgent(config0); - auto nixlAgent1 = makeTransferAgent(config1); + auto xferAgent0 = makeTransferAgent(config0); + auto xferAgent1 = makeTransferAgent(config1); - TLLM_CHECK(nixlAgent0); - TLLM_CHECK(nixlAgent1); + TLLM_CHECK(xferAgent0); + TLLM_CHECK(xferAgent1); std::vector memory0(100, 10); std::vector memory1(100, 1); - RegisteredHostMemory regMem0(MemoryDescs{MemoryType::kDRAM, {MemoryDesc{memory0}}}, nixlAgent0.get()); - RegisteredHostMemory regMem1(MemoryDescs{MemoryType::kDRAM, {MemoryDesc{memory1}}}, nixlAgent1.get()); + RegisteredHostMemory regMem0(MemoryDescs{MemoryType::kDRAM, {MemoryDesc{memory0}}}, xferAgent0.get()); + RegisteredHostMemory regMem1(MemoryDescs{MemoryType::kDRAM, {MemoryDesc{memory1}}}, xferAgent1.get()); - // nixlAgent0->loadRemoteAgent(agent1); - auto connectionInfo = nixlAgent1->getLocalConnectionInfo(); - nixlAgent0->loadRemoteAgent(agent1, connectionInfo); + // xferAgent0->loadRemoteAgent(agent1); + auto connectionInfo = xferAgent1->getLocalConnectionInfo(); + xferAgent0->loadRemoteAgent(agent1, connectionInfo); bool checked = false; do { - checked = nixlAgent0->checkRemoteDescs(agent1, regMem1.getDescs()); - // wait for regMem is unpacked by nixlAgent0 + checked = xferAgent0->checkRemoteDescs(agent1, regMem1.getDescs()); + // wait for regMem is unpacked by xferAgent0 } while (!checked); TransferRequest writeReq{TransferOp::kWRITE, regMem0.getDescs(), regMem1.getDescs(), agent1}; - auto status = nixlAgent0->submitTransferRequests(writeReq); + auto status = xferAgent0->submitTransferRequests(writeReq); status->wait(); TLLM_CHECK(memory0 == memory1); - nixlAgent0->invalidateRemoteAgent(agent1); + xferAgent0->invalidateRemoteAgent(agent1); } -TEST_F(TransferAgentTest, Basic2) +TEST_P(TransferAgentTest, Basic2) { std::string const agent0{"agent0"}, agent1{"agent1"}; BaseAgentConfig config0{agent0, true}, config1{agent1, true}; - 
auto nixlAgent0 = makeTransferAgent(config0); - auto nixlAgent1 = makeTransferAgent(config1); + auto xferAgent0 = makeTransferAgent(config0); + auto xferAgent1 = makeTransferAgent(config1); - TLLM_CHECK(nixlAgent0); - TLLM_CHECK(nixlAgent1); + TLLM_CHECK(xferAgent0); + TLLM_CHECK(xferAgent1); std::vector memory0(100, 10); std::vector memory1(100, 1); - RegisteredHostMemory regMem0(MemoryDescs{MemoryType::kDRAM, {MemoryDesc{memory0}}}, nixlAgent0.get()); - RegisteredHostMemory regMem1(MemoryDescs{MemoryType::kDRAM, {MemoryDesc{memory1}}}, nixlAgent1.get()); + RegisteredHostMemory regMem0(MemoryDescs{MemoryType::kDRAM, {MemoryDesc{memory0}}}, xferAgent0.get()); + RegisteredHostMemory regMem1(MemoryDescs{MemoryType::kDRAM, {MemoryDesc{memory1}}}, xferAgent1.get()); - // nixlAgent0->loadRemoteAgent(agent1); - auto connectionInfo = nixlAgent1->getLocalConnectionInfo(); - nixlAgent0->loadRemoteAgent(agent1, connectionInfo); + // xferAgent0->loadRemoteAgent(agent1); + auto connectionInfo = xferAgent1->getLocalConnectionInfo(); + xferAgent0->loadRemoteAgent(agent1, connectionInfo); bool checked = false; do { - checked = nixlAgent0->checkRemoteDescs(agent1, regMem1.getDescs()); + checked = xferAgent0->checkRemoteDescs(agent1, regMem1.getDescs()); } while (!checked); TransferRequest readReq{TransferOp::kREAD, regMem0.getDescs(), regMem1.getDescs(), agent1}; - auto status = nixlAgent0->submitTransferRequests(readReq); + auto status = xferAgent0->submitTransferRequests(readReq); status->wait(); TLLM_CHECK(memory0 == memory1); - nixlAgent0->invalidateRemoteAgent(agent1); + xferAgent0->invalidateRemoteAgent(agent1); } -TEST_F(TransferAgentTest, DeviceMemory) +TEST_P(TransferAgentTest, DeviceMemory) { std::string const agent0{"agent0"}, agent1{"agent1"}; BaseAgentConfig config0{agent0, true}, config1{agent1, true}; - auto nixlAgent0 = makeTransferAgent(config0); - auto nixlAgent1 = makeTransferAgent(config1); + auto xferAgent0 = makeTransferAgent(config0); + auto xferAgent1 = makeTransferAgent(config1); - TLLM_CHECK(nixlAgent0); - TLLM_CHECK(nixlAgent1); + TLLM_CHECK(xferAgent0); + TLLM_CHECK(xferAgent1); char* dev_ptr0; char* dev_ptr1; size_t size = 100; @@ -159,20 +180,20 @@ TEST_F(TransferAgentTest, DeviceMemory) cudaMemcpy(dev_ptr0, memory0.data(), size, cudaMemcpyHostToDevice); cudaMemcpy(dev_ptr1, memory1.data(), size, cudaMemcpyHostToDevice); RegisteredHostMemory regMem0( - MemoryDescs{MemoryType::kVRAM, {MemoryDesc{dev_ptr0, size, deviceId}}}, nixlAgent0.get()); + MemoryDescs{MemoryType::kVRAM, {MemoryDesc{dev_ptr0, size, deviceId}}}, xferAgent0.get()); RegisteredHostMemory regMem1( - MemoryDescs{MemoryType::kVRAM, {MemoryDesc{dev_ptr1, size, deviceId}}}, nixlAgent1.get()); + MemoryDescs{MemoryType::kVRAM, {MemoryDesc{dev_ptr1, size, deviceId}}}, xferAgent1.get()); - // nixlAgent0->loadRemoteAgent(agent1); - auto connectionInfo = nixlAgent1->getLocalConnectionInfo(); - nixlAgent0->loadRemoteAgent(agent1, connectionInfo); + // xferAgent0->loadRemoteAgent(agent1); + auto connectionInfo = xferAgent1->getLocalConnectionInfo(); + xferAgent0->loadRemoteAgent(agent1, connectionInfo); bool checked = false; do { - checked = nixlAgent0->checkRemoteDescs(agent1, regMem1.getDescs()); + checked = xferAgent0->checkRemoteDescs(agent1, regMem1.getDescs()); } while (!checked); TransferRequest writeReq{TransferOp::kWRITE, regMem0.getDescs(), regMem1.getDescs(), agent1}; - auto status = nixlAgent0->submitTransferRequests(writeReq); + auto status = xferAgent0->submitTransferRequests(writeReq); status->wait(); 
cudaMemcpy(memory0.data(), dev_ptr0, size, cudaMemcpyDeviceToHost); @@ -181,98 +202,99 @@ TEST_F(TransferAgentTest, DeviceMemory) TLLM_CHECK(memory0 == memory1); TLLM_CUDA_CHECK(cudaFree(dev_ptr0)); TLLM_CUDA_CHECK(cudaFree(dev_ptr1)); - nixlAgent0->invalidateRemoteAgent(agent1); + xferAgent0->invalidateRemoteAgent(agent1); } -TEST_F(TransferAgentTest, Connect) +TEST_P(TransferAgentTest, Connect) { std::string const agent0{"agent0"}, agent1{"agent1"}, agent2{"agent2"}; BaseAgentConfig config0{agent0, true}, config1{agent1, true}, config2{agent2, true}; - auto nixlAgent0 = makeTransferAgent(config0); - auto nixlAgent1 = makeTransferAgent(config1); - auto nixlAgent2 = makeTransferAgent(config2); + auto xferAgent0 = makeTransferAgent(config0); + auto xferAgent1 = makeTransferAgent(config1); + auto xferAgent2 = makeTransferAgent(config2); - TLLM_CHECK(nixlAgent0); - TLLM_CHECK(nixlAgent1); + TLLM_CHECK(xferAgent0); + TLLM_CHECK(xferAgent1); std::vector memory0(100, 10); std::vector memory1(100, 1); MemoryDescs memDescs0{MemoryType::kDRAM, {MemoryDesc{memory0}}}; MemoryDescs memDescs1{MemoryType::kDRAM, {MemoryDesc{memory1}}}; - nixlAgent0->registerMemory(memDescs0); - nixlAgent1->registerMemory(memDescs1); - nixlAgent2->registerMemory(memDescs0); + xferAgent0->registerMemory(memDescs0); + xferAgent1->registerMemory(memDescs1); + xferAgent2->registerMemory(memDescs0); - // nixlAgent0->loadRemoteAgent(agent1); - auto connectionInfo = nixlAgent1->getLocalConnectionInfo(); - nixlAgent0->loadRemoteAgent(agent1, connectionInfo); + // xferAgent0->loadRemoteAgent(agent1); + auto connectionInfo = xferAgent1->getLocalConnectionInfo(); + xferAgent0->loadRemoteAgent(agent1, connectionInfo); bool checked = false; do { - checked = nixlAgent0->checkRemoteDescs(agent1, memDescs1); + checked = xferAgent0->checkRemoteDescs(agent1, memDescs1); } while (!checked); TransferRequest writeReq{TransferOp::kWRITE, memDescs0, memDescs1, agent1}; - auto status = nixlAgent0->submitTransferRequests(writeReq); + auto status = xferAgent0->submitTransferRequests(writeReq); status->wait(); TLLM_CHECK(memory0 == memory1); - nixlAgent2->loadRemoteAgent(agent1, connectionInfo); + xferAgent2->loadRemoteAgent(agent1, connectionInfo); checked = false; do { - checked = nixlAgent2->checkRemoteDescs(agent1, memDescs1); + checked = xferAgent2->checkRemoteDescs(agent1, memDescs1); } while (!checked); TransferRequest writeReq2{TransferOp::kWRITE, memDescs0, memDescs1, agent1}; - auto status2 = nixlAgent2->submitTransferRequests(writeReq2); + auto status2 = xferAgent2->submitTransferRequests(writeReq2); status2->wait(); TLLM_CHECK(memory0 == memory1); - nixlAgent0->invalidateRemoteAgent(agent1); - nixlAgent2->invalidateRemoteAgent(agent1); - nixlAgent0->deregisterMemory(memDescs0); - nixlAgent1->deregisterMemory(memDescs1); - nixlAgent2->deregisterMemory(memDescs0); + xferAgent0->invalidateRemoteAgent(agent1); + xferAgent2->invalidateRemoteAgent(agent1); + xferAgent0->deregisterMemory(memDescs0); + xferAgent1->deregisterMemory(memDescs1); + xferAgent2->deregisterMemory(memDescs0); } -TEST_F(TransferAgentTest, SyncMessage) +TEST_P(TransferAgentTest, SyncMessage) { constexpr std::size_t MAX_QUERY_TIMES = std::numeric_limits::max(); std::string const agent0{"agent0"}, agent1{"agent1"}; BaseAgentConfig config0{agent0, true}, config1{agent1, true}; - auto nixlAgent0 = makeTransferAgent(config0); - auto nixlAgent1 = makeTransferAgent(config1); + auto xferAgent0 = makeTransferAgent(config0); + auto xferAgent1 = makeTransferAgent(config1); - 
TLLM_CHECK(nixlAgent0); - TLLM_CHECK(nixlAgent1); + TLLM_CHECK(xferAgent0); + TLLM_CHECK(xferAgent1); std::vector memory0(100, 10); std::vector memory1(100, 1); - RegisteredHostMemory regMem0(MemoryDescs{MemoryType::kDRAM, {MemoryDesc{memory0}}}, nixlAgent0.get()); - RegisteredHostMemory regMem1(MemoryDescs{MemoryType::kDRAM, {MemoryDesc{memory1}}}, nixlAgent0.get()); + RegisteredHostMemory regMem0(MemoryDescs{MemoryType::kDRAM, {MemoryDesc{memory0}}}, xferAgent0.get()); + RegisteredHostMemory regMem1(MemoryDescs{MemoryType::kDRAM, {MemoryDesc{memory1}}}, xferAgent0.get()); - RegisteredHostMemory regMem2(MemoryDescs{MemoryType::kDRAM, {MemoryDesc{memory0}}}, nixlAgent1.get()); - RegisteredHostMemory regMem3(MemoryDescs{MemoryType::kDRAM, {MemoryDesc{memory1}}}, nixlAgent1.get()); + RegisteredHostMemory regMem2(MemoryDescs{MemoryType::kDRAM, {MemoryDesc{memory0}}}, xferAgent1.get()); + RegisteredHostMemory regMem3(MemoryDescs{MemoryType::kDRAM, {MemoryDesc{memory1}}}, xferAgent1.get()); - // nixlAgent0->loadRemoteAgent(agent1); - auto connectionInfo = nixlAgent1->getLocalConnectionInfo(); - nixlAgent0->loadRemoteAgent(agent1, connectionInfo); + // xferAgent0->loadRemoteAgent(agent1); + auto connectionInfo = xferAgent1->getLocalConnectionInfo(); + xferAgent0->loadRemoteAgent(agent1, connectionInfo); bool checked = false; do { - checked = nixlAgent0->checkRemoteDescs(agent1, regMem3.getDescs()); + checked = xferAgent0->checkRemoteDescs(agent1, regMem3.getDescs()); } while (!checked); auto syncMessage = std::string("agent_sync_message"); TransferRequest writeReq{TransferOp::kWRITE, regMem0.getDescs(), regMem3.getDescs(), agent1}; - auto status = nixlAgent0->submitTransferRequests(writeReq); - nixlAgent0->notifySyncMessage(agent1, syncMessage); + auto status = xferAgent0->submitTransferRequests(writeReq); + xferAgent0->notifySyncMessage(agent1, syncMessage); - auto notif = nixlAgent1->getNotifiedSyncMessages(); + auto notif = xferAgent1->getNotifiedSyncMessages(); for (std::size_t i = 0; i < MAX_QUERY_TIMES && notif.size() == 0; i++) { - notif = nixlAgent1->getNotifiedSyncMessages(); + notif = xferAgent1->getNotifiedSyncMessages(); } + status->wait(); TLLM_CHECK(status->isCompleted()); TLLM_CHECK(notif.size() == 1); TLLM_CHECK(notif[agent0].size() == 1); @@ -281,25 +303,25 @@ TEST_F(TransferAgentTest, SyncMessage) TLLM_CHECK(memory0 == memory1); std::string syncMessage2 = "two_agent_sync_message"; - nixlAgent0->notifySyncMessage(agent1, syncMessage2); - auto notif2 = nixlAgent1->getNotifiedSyncMessages(); + xferAgent0->notifySyncMessage(agent1, syncMessage2); + auto notif2 = xferAgent1->getNotifiedSyncMessages(); for (std::size_t i = 0; i < MAX_QUERY_TIMES && notif2.size() == 0; i++) { - notif2 = nixlAgent1->getNotifiedSyncMessages(); + notif2 = xferAgent1->getNotifiedSyncMessages(); } TLLM_CHECK(notif2.size() == 1); TLLM_CHECK(notif2[agent0].size() == 1); TLLM_CHECK(notif2[agent0][0] == syncMessage2); - // nixlAgent1->loadRemoteAgent(agent0); - auto connectionInfo2 = nixlAgent0->getLocalConnectionInfo(); - nixlAgent1->loadRemoteAgent(agent0, connectionInfo2); + // xferAgent1->loadRemoteAgent(agent0); + auto connectionInfo2 = xferAgent0->getLocalConnectionInfo(); + xferAgent1->loadRemoteAgent(agent0, connectionInfo2); std::string syncMessage3 = "three_agent_sync_message"; - nixlAgent1->notifySyncMessage(agent0, syncMessage3); - auto notif3 = nixlAgent0->getNotifiedSyncMessages(); + xferAgent1->notifySyncMessage(agent0, syncMessage3); + auto notif3 = xferAgent0->getNotifiedSyncMessages(); for 
(std::size_t i = 0; i < MAX_QUERY_TIMES && notif3.size() == 0; i++) { - notif3 = nixlAgent0->getNotifiedSyncMessages(); + notif3 = xferAgent0->getNotifiedSyncMessages(); } TLLM_CHECK(notif3.size() == 1); TLLM_CHECK(notif3[agent1].size() == 1); @@ -308,19 +330,20 @@ TEST_F(TransferAgentTest, SyncMessage) bool checked2 = false; do { - checked2 = nixlAgent0->checkRemoteDescs(agent1, regMem1.getDescs()); + checked2 = xferAgent0->checkRemoteDescs(agent1, regMem1.getDescs()); } while (!checked2); std::string syncMessage4 = "four_agent_sync_message"; TransferRequest writeReq1{TransferOp::kWRITE, regMem2.getDescs(), regMem1.getDescs(), agent0}; - auto status1 = nixlAgent1->submitTransferRequests(writeReq1); - nixlAgent1->notifySyncMessage(agent0, syncMessage4); + auto status1 = xferAgent1->submitTransferRequests(writeReq1); + xferAgent1->notifySyncMessage(agent0, syncMessage4); - auto notif4 = nixlAgent0->getNotifiedSyncMessages(); + auto notif4 = xferAgent0->getNotifiedSyncMessages(); for (std::size_t i = 0; i < MAX_QUERY_TIMES && notif4.size() == 0; i++) { - notif4 = nixlAgent0->getNotifiedSyncMessages(); + notif4 = xferAgent0->getNotifiedSyncMessages(); } + status1->wait(); TLLM_CHECK(status1->isCompleted()); TLLM_CHECK(notif4.size() == 1); TLLM_CHECK(notif4[agent1].size() == 1); @@ -335,11 +358,11 @@ TEST_F(TransferAgentTest, SyncMessage) std::stringstream ss; Serialization::serialize(state, ss); std::string serializedState = ss.str(); - nixlAgent0->notifySyncMessage(agent1, serializedState); - auto notif5 = nixlAgent1->getNotifiedSyncMessages(); + xferAgent0->notifySyncMessage(agent1, serializedState); + auto notif5 = xferAgent1->getNotifiedSyncMessages(); for (size_t i = 0; i < MAX_QUERY_TIMES && notif5.size() == 0; i++) { - notif5 = nixlAgent1->getNotifiedSyncMessages(); + notif5 = xferAgent1->getNotifiedSyncMessages(); } TLLM_CHECK(notif5.size() == 1); TLLM_CHECK(notif5[agent0].size() == 1); @@ -348,10 +371,16 @@ TEST_F(TransferAgentTest, SyncMessage) auto state2 = Serialization::deserializeCommState(ss2); TLLM_CHECK(state2 == state); - nixlAgent0->invalidateRemoteAgent(agent1); - nixlAgent1->invalidateRemoteAgent(agent0); + xferAgent0->invalidateRemoteAgent(agent1); + xferAgent1->invalidateRemoteAgent(agent0); } +INSTANTIATE_TEST_SUITE_P(AvailableBackends, TransferAgentTest, ::testing::ValuesIn(getAvailableBackends()), + [](::testing::TestParamInfo const& info) { return info.param; }); + +// Skip LoopbackAgentTest for mooncake backend for now +#ifdef TEST_NIXL_BACKEND + class LoopbackAgentTest : public ::testing::Test, public ::testing::WithParamInterface // NOLINT(cppcoreguidelines-pro-type-member-init) { @@ -466,3 +495,5 @@ TEST_P(LoopbackAgentTest, GpuToFile) } INSTANTIATE_TEST_SUITE_P(, LoopbackAgentTest, ::testing::Values(true, false)); + +#endif // TEST_NIXL_BACKEND diff --git a/cpp/tests/unit_tests/multi_gpu/cacheTransceiverTest.cpp b/cpp/tests/unit_tests/multi_gpu/cacheTransceiverTest.cpp index 17ca989eee..41dd8e7a92 100644 --- a/cpp/tests/unit_tests/multi_gpu/cacheTransceiverTest.cpp +++ b/cpp/tests/unit_tests/multi_gpu/cacheTransceiverTest.cpp @@ -46,6 +46,7 @@ #include #include #include +#include #include #include #include @@ -713,7 +714,7 @@ protected: return; } else if (tensorrt_llm::common::getEnvUseMPIKvCache() || tensorrt_llm::common::getEnvUseUCXKvCache() - || tensorrt_llm::common::getEnvUseNixlKvCache()) + || tensorrt_llm::common::getEnvUseNixlKvCache() || tensorrt_llm::common::getEnvUseMooncakeKvCache()) { int maxNumTokens = 2048; mCacheTransBufferManagers.clear(); @@ 
-729,7 +730,15 @@ protected: } bool isUcx = tensorrt_llm::common::getEnvUseUCXKvCache(); bool isNixl = tensorrt_llm::common::getEnvUseNixlKvCache(); - TLLM_LOG_INFO("Enable %s KV cache transport.", isUcx ? "UCX" : isNixl ? "NIXL" : "MPI"); + bool isMooncake = tensorrt_llm::common::getEnvUseMooncakeKvCache(); + // Skip MOONCAKE tests on Red Hat-based systems (e.g., Rocky 8), detected via /etc/redhat-release + bool isRocky8 = std::filesystem::exists("/etc/redhat-release"); + isMooncake = isMooncake && !isRocky8; + TLLM_LOG_INFO("Enable %s KV cache transport.", + isUcx ? "UCX" + : isNixl ? "NIXL" + : isMooncake ? "MOONCAKE" + : "MPI"); if (isUcx) { @@ -756,7 +765,12 @@ protected: setenv("TRTLLM_NIXL_PORT", std::to_string(port).c_str(), 1); mConnectionManager - = std::make_unique(bufferManagers, *mCacheState); + = std::make_unique(bufferManagers, *mCacheState, "nixl"); + } + else if (isMooncake) + { + mConnectionManager = std::make_unique( + bufferManagers, *mCacheState, "mooncake"); + } else { @@ -783,7 +797,7 @@ protected: std::vector contextRankVec(mContextRankSize); std::iota(contextRankVec.begin(), contextRankVec.end(), 0); - if (isUcx || isNixl) + if (isUcx || isNixl || isMooncake) { auto commState = mConnectionManager->getCommState(); namespace su = tensorrt_llm::executor::serialize_utils; @@ -1286,9 +1300,9 @@ TEST_P(AsymmetricalCacheTest, TestCase) int indexerDimPerHead = std::get<17>(param); int indexerKCacheQuantBlockSize = std::get<18>(param); - if (genCp > 1 && tensorrt_llm::common::getEnvUseNixlKvCache()) + if (genCp > 1 && (tensorrt_llm::common::getEnvUseNixlKvCache() || tensorrt_llm::common::getEnvUseMooncakeKvCache())) { - GTEST_SKIP() << "Temporarily skipping cache transceiver tests with NIXL backend for CP."; + GTEST_SKIP() << "Temporarily skipping cache transceiver tests with the NIXL and MOONCAKE backends for CP."; } std::vector lenList = {30, 10, 60, 80}; if (genCp > 1) @@ -1410,9 +1424,9 @@ TEST_P(AsymmetricalCacheTestWithDP, TestCase) int indexerDimPerHead = std::get<17>(param); int indexerKCacheQuantBlockSize = std::get<18>(param); - if (genCp > 1 && tensorrt_llm::common::getEnvUseNixlKvCache()) + if (genCp > 1 && (tensorrt_llm::common::getEnvUseNixlKvCache() || tensorrt_llm::common::getEnvUseMooncakeKvCache())) { - GTEST_SKIP() << "Temporarily skipping cache transceiver tests with NIXL backend for CP."; + GTEST_SKIP() << "Temporarily skipping cache transceiver tests with the NIXL and MOONCAKE backends for CP."; } setUpCommunicator(contextTp, contextPp, contextCp, genTp, genPp, genCp, isMLA, contextDP, generationDP); diff --git a/docs/source/deployment-guide/note_sections.rst b/docs/source/_includes/note_sections.rst similarity index 75% rename from docs/source/deployment-guide/note_sections.rst rename to docs/source/_includes/note_sections.rst index 4cd0d1c41d..d0b1657638 100644 --- a/docs/source/deployment-guide/note_sections.rst +++ b/docs/source/_includes/note_sections.rst @@ -1,11 +1,20 @@ .. - Reusable note sections for deployment guides. + Reusable note sections for docs. Include specific notes using: - .. include:: note_sections.rst + .. include:: /_includes/note_sections.rst :start-after: .. start-note- :end-before: .. end-note- +.. start-note-config-flag-alias + +.. note:: + + **Non-breaking**: ``--config`` is the preferred flag for passing a :ref:`YAML configuration file <configuring-with-yaml-files>`. + Existing workflows using ``--extra_llm_api_options`` continue to work; it remains an equivalent alias. + +.. end-note-config-flag-alias + .. start-note-traffic-patterns ..
note:: diff --git a/docs/source/blogs/Best_perf_practice_on_DeepSeek-R1_in_TensorRT-LLM.md b/docs/source/blogs/Best_perf_practice_on_DeepSeek-R1_in_TensorRT-LLM.md index ad0e9975a1..7072f770bf 100644 --- a/docs/source/blogs/Best_perf_practice_on_DeepSeek-R1_in_TensorRT-LLM.md +++ b/docs/source/blogs/Best_perf_practice_on_DeepSeek-R1_in_TensorRT-LLM.md @@ -139,7 +139,7 @@ To do the benchmark, run the following command: ```bash YOUR_DATA_PATH= -cat >./extra-llm-api-config.yml<./config.yml<./extra-llm-api-config.yml <./config.yml <./extra-llm-api-config.yml <./config.yml < -cat >./extra-llm-api-config.yml<./config.yml<./extra-llm-api-config.yml<./config.yml< -cat >./extra-llm-api-config.yml<./config.yml< -cat >./extra-llm-api-config.yml<./config.yml< ./extra_llm_api_options.yaml < ./config.yaml < ./extra_llm_api_options_eplb.yaml < ./config_eplb.yaml < @@ -201,7 +201,7 @@ trtllm-serve \ --ep_size 4 \ --max_batch_size 640 \ --trust_remote_code \ - --extra_llm_api_options max_throughput.yaml \ + --config max_throughput.yaml \ --kv_cache_free_gpu_memory_fraction 0.9 ``` @@ -223,7 +223,7 @@ OpenAI ships a set of Triton kernels optimized for its MoE models. TensorRT LLM ### Selecting Triton as the MoE backend -To use the Triton MoE backend with **trtllm-serve** (or other similar commands) add this snippet to the YAML file passed via `--extra_llm_api_options`: +To use the Triton MoE backend with **trtllm-serve** (or other similar commands) add this snippet to the YAML file passed via `--config`: ```yaml moe_config: @@ -347,7 +347,7 @@ OpenAI ships a set of Triton kernels optimized for its MoE models. TensorRT-LLM ### Selecting Triton as the MoE backend -To use the Triton MoE backend with **trtllm-serve** (or other commands), add this snippet to the YAML file passed via `--extra_llm_api_options`: +To use the Triton MoE backend with **trtllm-serve** (or other commands), add this snippet to the YAML file passed via `--config`: ```yaml moe_config: diff --git a/docs/source/commands/trtllm-bench.rst b/docs/source/commands/trtllm-bench.rst index cd69874e0c..fee60a9ab7 100644 --- a/docs/source/commands/trtllm-bench.rst +++ b/docs/source/commands/trtllm-bench.rst @@ -3,9 +3,12 @@ trtllm-bench trtllm-bench is a comprehensive benchmarking tool for TensorRT LLM engines. It provides three main subcommands for different benchmarking scenarios: -**Common Options for All Commands:** +.. include:: ../_includes/note_sections.rst + :start-after: .. start-note-config-flag-alias + :end-before: .. end-note-config-flag-alias -**Usage:** +Syntax +------ .. click:: tensorrt_llm.commands.bench:main :prog: trtllm-bench @@ -14,8 +17,11 @@ trtllm-bench is a comprehensive benchmarking tool for TensorRT LLM engines. It p +Dataset preparation +------------------ + prepare_dataset.py -=========================== +^^^^^^^^^^^^^^^^^^ trtllm-bench is designed to work with the `prepare_dataset.py `_ script, which generates benchmark datasets in the required format. The prepare_dataset script supports: @@ -38,7 +44,7 @@ trtllm-bench is designed to work with the `prepare_dataset.py --help``. +.. include:: ../_includes/note_sections.rst + :start-after: .. start-note-config-flag-alias + :end-before: .. 
end-note-config-flag-alias + Syntax diff --git a/docs/source/commands/trtllm-serve/run-benchmark-with-trtllm-serve.md b/docs/source/commands/trtllm-serve/run-benchmark-with-trtllm-serve.md index 34a509f5a4..089426d9b7 100644 --- a/docs/source/commands/trtllm-serve/run-benchmark-with-trtllm-serve.md +++ b/docs/source/commands/trtllm-serve/run-benchmark-with-trtllm-serve.md @@ -3,30 +3,11 @@ TensorRT LLM provides the OpenAI-compatible API via `trtllm-serve` command. A complete reference for the API is available in the [OpenAI API Reference](https://platform.openai.com/docs/api-reference). -This step-by-step tutorial covers the following topics for running online serving benchmarking with Llama 3.1 70B and Qwen2.5-VL-7B for multimodal models: - * Methodology Introduction - * Launch the OpenAI-Compatible Server with NGC container - * Run the performance benchmark - * Using `extra_llm_api_options` - * Multimodal Serving and Benchmarking - -## Table of Contents -- [Run benchmarking with `trtllm-serve`](#run-benchmarking-with-trtllm-serve) - - [Table of Contents](#table-of-contents) - - [Methodology Introduction](#methodology-introduction) - - [Preparation](#preparation) - - [Launch the NGC container](#launch-the-ngc-container) - - [Start the trtllm-serve service](#start-the-trtllm-serve-service) - - [Benchmark using `tensorrt_llm.serve.scripts.benchmark_serving`](#benchmark-using-tensorrt_llmservescriptsbenchmark_serving) - - [Key Metrics](#key-metrics) - - [About `extra_llm_api_options`](#about-extra_llm_api_options) - - [`kv_cache_config`](#kv_cache_config) - - [`cuda_graph_config`](#cuda_graph_config) - - [`moe_config`](#moe_config) - - [`attention_backend`](#attention_backend) - - [Multimodal Serving and Benchmarking](#multimodal-serving-and-benchmarking) - - [Setting up Multimodal Serving](#setting-up-multimodal-serving) - - [Multimodal Benchmarking](#multimodal-benchmarking) +```{contents} +:Contents +:local: +:depth: 3 +``` ## Methodology Introduction @@ -57,9 +38,9 @@ For benchmarking purposes, first create a bash script using the following code a ```bash #! /bin/bash model_path=/path/to/llama3.1_70B -extra_llm_api_file=/tmp/extra-llm-api-config.yml +config_file=/tmp/config.yml -cat << EOF > ${extra_llm_api_file} +cat << EOF > ${config_file} enable_attention_dp: false print_iter_log: true cuda_graph_config: @@ -77,7 +58,7 @@ trtllm-serve ${model_path} \ --tp_size 1 \ --ep_size 1 \ --trust_remote_code \ - --extra_llm_api_options ${extra_llm_api_file} + --config ${config_file} ``` > [!NOTE] > The trtllm-llmapi-launch is a script that launches the LLM-API code on @@ -215,17 +196,24 @@ $$ To get more detailed metrics besides the key metrics above, there is an [experimental tool](https://github.com/NVIDIA/TensorRT-LLM/tree/main/tensorrt_llm/serve/scripts/time_breakdown) for request time breakdown. -## About `extra_llm_api_options` - trtllm-serve provides `extra_llm_api_options` knob to **overwrite** the parameters specified by trtllm-serve. - Generally, We create a YAML file that contains various performance switches. - e.g - ```yaml - cuda_graph_config: - padding_enabled: true - print_iter_log: true - kv_cache_dtype: fp8 - enable_attention_dp: true - ``` +## About `--config` + +```{eval-rst} +.. include:: ../../_includes/note_sections.rst + :start-after: .. start-note-config-flag-alias + :end-before: .. end-note-config-flag-alias +``` + +`trtllm-serve` provides `--config` to **overwrite** the parameters specified by `trtllm-serve`. 
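As a quick, illustrative sketch of the alias relationship described in the note above (the model name and config path are placeholders reused from examples elsewhere in this guide), the two invocations below are expected to behave identically:

```bash
# Preferred flag: pass the YAML options file via --config
trtllm-serve TinyLlama/TinyLlama-1.1B-Chat-v1.0 --config /tmp/config.yml

# Backward-compatible alias: the same file via --extra_llm_api_options
trtllm-serve TinyLlama/TinyLlama-1.1B-Chat-v1.0 --extra_llm_api_options /tmp/config.yml
```

Both spellings read the same YAML schema, so existing scripts can migrate to `--config` at their own pace.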
+Generally, we create a YAML file that contains various performance switches. For example: + +```yaml +cuda_graph_config: + padding_enabled: true +print_iter_log: true +kv_cache_dtype: fp8 +enable_attention_dp: true +``` The following is a list of common performance switches. #### `kv_cache_config` @@ -274,7 +262,7 @@ The following is a list of common performance switches.  **Default**: TRTLLM -See the [TorchLlmArgs class](https://nvidia.github.io/TensorRT-LLM/llm-api/reference.html#tensorrt_llm.llmapi.TorchLlmArgs) for the full list of options which can be used in the extra\_llm\_api\_options`.` +See the [TorchLlmArgs class](https://nvidia.github.io/TensorRT-LLM/llm-api/reference.html#tensorrt_llm.llmapi.TorchLlmArgs) for the full list of options that can be used in the `--config` file. ## Multimodal Serving and Benchmarking diff --git a/docs/source/commands/trtllm-serve/trtllm-serve.rst b/docs/source/commands/trtllm-serve/trtllm-serve.rst index 33bad7f1e5..7e09872a9b 100644 --- a/docs/source/commands/trtllm-serve/trtllm-serve.rst +++ b/docs/source/commands/trtllm-serve/trtllm-serve.rst @@ -98,7 +98,7 @@ First, create a configuration file: .. code-block:: bash - cat >./extra-llm-api-config.yml<./config.yml<`_ m .. code-block:: bash - echo -e "enable_attention_dp: true\npytorch_backend_config:\n enable_overlap_scheduler: true" > extra-llm-api-config.yml + echo -e "enable_attention_dp: true\npytorch_backend_config:\n enable_overlap_scheduler: true" > config.yml srun -N 2 -w [NODES] \ --output=benchmark_2node.log \ @@ -210,7 +210,7 @@ You can deploy `DeepSeek-V3 `_ m --container-image= \ --container-mounts=/workspace:/workspace \ --container-workdir /workspace \ - bash -c "trtllm-llmapi-launch trtllm-serve deepseek-ai/DeepSeek-V3 --max_batch_size 161 --max_num_tokens 1160 --tp_size 16 --ep_size 4 --kv_cache_free_gpu_memory_fraction 0.95 --extra_llm_api_options ./extra-llm-api-config.yml" + bash -c "trtllm-llmapi-launch trtllm-serve deepseek-ai/DeepSeek-V3 --max_batch_size 161 --max_num_tokens 1160 --tp_size 16 --ep_size 4 --kv_cache_free_gpu_memory_fraction 0.95 --config ./config.yml" See `the source code `_ of ``trtllm-llmapi-launch`` for more details. @@ -234,11 +234,11 @@ For the default PyTorch backend, iteration statistics logging is enabled by sett # extra_llm_config.yaml enable_iter_perf_stats: true -Start the server and specify the ``--extra_llm_api_options`` argument with the path to the YAML file: +Start the server and specify the ``--config`` argument with the path to the YAML file: .. code-block:: bash - trtllm-serve "TinyLlama/TinyLlama-1.1B-Chat-v1.0" --extra_llm_api_options extra_llm_config.yaml + trtllm-serve "TinyLlama/TinyLlama-1.1B-Chat-v1.0" --config extra_llm_config.yaml After sending at least one inference request to the server, you can fetch runtime iteration statistics by polling the ``/metrics`` endpoint. Since the statistics are stored in an internal queue and removed once retrieved, it's recommended to poll the endpoint shortly after each request and store the results if needed. @@ -272,10 +272,16 @@ Example output: } ] +.. _configuring-with-yaml-files: + Configuring with YAML Files ---------------------------- -You can configure various options of ``trtllm-serve`` using YAML files by setting the ``--extra_llm_api_options`` option to the path of a YAML file, the arguments in the file will override the corresponding command line arguments. +You can configure various options of ``trtllm-serve`` using YAML files by setting the ``--config`` option to the path of a YAML file.
The arguments in the file override the corresponding command line arguments. + +.. include:: ../../_includes/note_sections.rst + :start-after: .. start-note-config-flag-alias + :end-before: .. end-note-config-flag-alias The yaml file is configuration of `tensorrt_llm.llmapi.LlmArgs `_, the class has multiple levels of hierarchy, to configure the top level arguments like ``max_batch_size``, the yaml file should be like: diff --git a/docs/source/deployment-guide/config_table.rst b/docs/source/deployment-guide/config_table.rst index c2e1e5b55d..bb59b7505f 100644 --- a/docs/source/deployment-guide/config_table.rst +++ b/docs/source/deployment-guide/config_table.rst @@ -1,4 +1,4 @@ -.. include:: note_sections.rst +.. include:: ../_includes/note_sections.rst :start-after: .. start-note-traffic-patterns :end-before: .. end-note-traffic-patterns @@ -25,121 +25,121 @@ - 1024 / 1024 - 4 - `1k1k_tp8_conc4.yaml `_ - - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc4.yaml`` + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --config ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc4.yaml`` * - 8xB200_NVL - Low Latency - 1024 / 1024 - 8 - `1k1k_tp8_conc8.yaml `_ - - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc8.yaml`` + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --config ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc8.yaml`` * - 8xB200_NVL - Balanced - 1024 / 1024 - 16 - `1k1k_tp8_conc16.yaml `_ - - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc16.yaml`` + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --config ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc16.yaml`` * - 8xB200_NVL - High Throughput - 1024 / 1024 - 32 - `1k1k_tp8_conc32.yaml `_ - - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc32.yaml`` + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --config ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc32.yaml`` * - 8xB200_NVL - Max Throughput - 1024 / 1024 - 64 - `1k1k_tp8_conc64.yaml `_ - - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc64.yaml`` + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --config ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc64.yaml`` * - 8xB200_NVL - Min Latency - 8192 / 1024 - 4 - `8k1k_tp8_conc4.yaml `_ - - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc4.yaml`` + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --config ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc4.yaml`` * - 8xB200_NVL - Low Latency - 8192 / 1024 - 8 - `8k1k_tp8_conc8.yaml `_ - - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc8.yaml`` + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --config 
${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc8.yaml`` * - 8xB200_NVL - Balanced - 8192 / 1024 - 16 - `8k1k_tp8_conc16.yaml `_ - - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc16.yaml`` + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --config ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc16.yaml`` * - 8xB200_NVL - High Throughput - 8192 / 1024 - 32 - `8k1k_tp8_conc32.yaml `_ - - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc32.yaml`` + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --config ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc32.yaml`` * - 8xB200_NVL - Max Throughput - 8192 / 1024 - 64 - `8k1k_tp8_conc64.yaml `_ - - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc64.yaml`` + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --config ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc64.yaml`` * - 8xH200_SXM - Min Latency - 1024 / 1024 - 4 - `1k1k_tp8_conc4.yaml `_ - - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/1k1k_tp8_conc4.yaml`` + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --config ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/1k1k_tp8_conc4.yaml`` * - 8xH200_SXM - Low Latency - 1024 / 1024 - 8 - `1k1k_tp8_conc8.yaml `_ - - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/1k1k_tp8_conc8.yaml`` + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --config ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/1k1k_tp8_conc8.yaml`` * - 8xH200_SXM - Balanced - 1024 / 1024 - 16 - `1k1k_tp8_conc16.yaml `_ - - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/1k1k_tp8_conc16.yaml`` + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --config ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/1k1k_tp8_conc16.yaml`` * - 8xH200_SXM - High Throughput - 1024 / 1024 - 32 - `1k1k_tp8_conc32.yaml `_ - - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/1k1k_tp8_conc32.yaml`` + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --config ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/1k1k_tp8_conc32.yaml`` * - 8xH200_SXM - Max Throughput - 1024 / 1024 - 64 - `1k1k_tp8_conc64.yaml `_ - - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/1k1k_tp8_conc64.yaml`` + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --config ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/1k1k_tp8_conc64.yaml`` * - 8xH200_SXM - Min Latency - 8192 / 1024 - 4 - `8k1k_tp8_conc4.yaml `_ - - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc4.yaml`` + - ``trtllm-serve 
deepseek-ai/DeepSeek-R1-0528 --config ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc4.yaml`` * - 8xH200_SXM - Low Latency - 8192 / 1024 - 8 - `8k1k_tp8_conc8.yaml `_ - - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc8.yaml`` + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --config ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc8.yaml`` * - 8xH200_SXM - Balanced - 8192 / 1024 - 16 - `8k1k_tp8_conc16.yaml `_ - - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc16.yaml`` + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --config ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc16.yaml`` * - 8xH200_SXM - High Throughput - 8192 / 1024 - 32 - `8k1k_tp8_conc32.yaml `_ - - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc32.yaml`` + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --config ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc32.yaml`` * - 8xH200_SXM - Max Throughput - 8192 / 1024 - 64 - `8k1k_tp8_conc64.yaml `_ - - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc64.yaml`` + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --config ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc64.yaml`` .. end-deepseek-ai/DeepSeek-R1-0528 @@ -166,169 +166,169 @@ - 1024 / 1024 - 4 - `1k1k_tp4_conc4.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc4.yaml`` + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --config ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc4.yaml`` * - 4xB200_NVL - Low Latency - 1024 / 1024 - 8 - `1k1k_tp4_conc8.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc8.yaml`` + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --config ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc8.yaml`` * - 4xB200_NVL - Low Latency - 1024 / 1024 - 16 - `1k1k_tp4_conc16.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc16.yaml`` + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --config ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc16.yaml`` * - 4xB200_NVL - Balanced - 1024 / 1024 - 32 - `1k1k_tp4_conc32.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc32.yaml`` + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --config ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc32.yaml`` * - 4xB200_NVL - High Throughput - 1024 / 1024 - 64 - `1k1k_tp4_conc64.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options 
${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc64.yaml`` + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --config ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc64.yaml`` * - 4xB200_NVL - High Throughput - 1024 / 1024 - 128 - `1k1k_tp4_conc128.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc128.yaml`` + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --config ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc128.yaml`` * - 4xB200_NVL - Max Throughput - 1024 / 1024 - 256 - `1k1k_tp4_conc256.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc256.yaml`` + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --config ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc256.yaml`` * - 4xB200_NVL - Min Latency - 8192 / 1024 - 4 - `8k1k_tp4_conc4.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc4.yaml`` + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --config ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc4.yaml`` * - 4xB200_NVL - Low Latency - 8192 / 1024 - 8 - `8k1k_tp4_conc8.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc8.yaml`` + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --config ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc8.yaml`` * - 4xB200_NVL - Low Latency - 8192 / 1024 - 16 - `8k1k_tp4_conc16.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc16.yaml`` + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --config ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc16.yaml`` * - 4xB200_NVL - Balanced - 8192 / 1024 - 32 - `8k1k_tp4_conc32.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc32.yaml`` + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --config ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc32.yaml`` * - 4xB200_NVL - High Throughput - 8192 / 1024 - 64 - `8k1k_tp4_conc64.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc64.yaml`` + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --config ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc64.yaml`` * - 4xB200_NVL - High Throughput - 8192 / 1024 - 128 - `8k1k_tp4_conc128.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc128.yaml`` + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --config ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc128.yaml`` * - 4xB200_NVL - Max Throughput - 
8192 / 1024 - 256 - `8k1k_tp4_conc256.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc256.yaml`` + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --config ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc256.yaml`` * - 8xB200_NVL - Min Latency - 1024 / 1024 - 4 - `1k1k_tp8_conc4.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc4.yaml`` + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --config ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc4.yaml`` * - 8xB200_NVL - Low Latency - 1024 / 1024 - 8 - `1k1k_tp8_conc8.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc8.yaml`` + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --config ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc8.yaml`` * - 8xB200_NVL - Low Latency - 1024 / 1024 - 16 - `1k1k_tp8_conc16.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc16.yaml`` + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --config ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc16.yaml`` * - 8xB200_NVL - Balanced - 1024 / 1024 - 32 - `1k1k_tp8_conc32.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc32.yaml`` + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --config ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc32.yaml`` * - 8xB200_NVL - High Throughput - 1024 / 1024 - 64 - `1k1k_tp8_conc64.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc64.yaml`` + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --config ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc64.yaml`` * - 8xB200_NVL - High Throughput - 1024 / 1024 - 128 - `1k1k_tp8_conc128.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc128.yaml`` + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --config ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc128.yaml`` * - 8xB200_NVL - Max Throughput - 1024 / 1024 - 256 - `1k1k_tp8_conc256.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc256.yaml`` + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --config ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc256.yaml`` * - 8xB200_NVL - Min Latency - 8192 / 1024 - 4 - `8k1k_tp8_conc4.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc4.yaml`` + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --config 
${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc4.yaml`` * - 8xB200_NVL - Low Latency - 8192 / 1024 - 8 - `8k1k_tp8_conc8.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc8.yaml`` + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --config ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc8.yaml`` * - 8xB200_NVL - Low Latency - 8192 / 1024 - 16 - `8k1k_tp8_conc16.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc16.yaml`` + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --config ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc16.yaml`` * - 8xB200_NVL - Balanced - 8192 / 1024 - 32 - `8k1k_tp8_conc32.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc32.yaml`` + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --config ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc32.yaml`` * - 8xB200_NVL - High Throughput - 8192 / 1024 - 64 - `8k1k_tp8_conc64.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc64.yaml`` + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --config ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc64.yaml`` * - 8xB200_NVL - High Throughput - 8192 / 1024 - 128 - `8k1k_tp8_conc128.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc128.yaml`` + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --config ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc128.yaml`` * - 8xB200_NVL - Max Throughput - 8192 / 1024 - 256 - `8k1k_tp8_conc256.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc256.yaml`` + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --config ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc256.yaml`` .. 
end-nvidia/DeepSeek-R1-0528-FP4-v2 @@ -355,720 +355,720 @@ - 1024 / 1024 - 4 - `1k1k_tp1_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc4.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc4.yaml`` * - B200_NVL - Low Latency - 1024 / 1024 - 8 - `1k1k_tp1_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc8.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc8.yaml`` * - B200_NVL - Balanced - 1024 / 1024 - 16 - `1k1k_tp1_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc16.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc16.yaml`` * - B200_NVL - High Throughput - 1024 / 1024 - 32 - `1k1k_tp1_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc32.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc32.yaml`` * - B200_NVL - Max Throughput - 1024 / 1024 - 64 - `1k1k_tp1_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc64.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc64.yaml`` * - B200_NVL - Min Latency - 1024 / 8192 - 4 - `1k8k_tp1_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc4.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc4.yaml`` * - B200_NVL - Low Latency - 1024 / 8192 - 8 - `1k8k_tp1_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc8.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc8.yaml`` * - B200_NVL - Balanced - 1024 / 8192 - 16 - `1k8k_tp1_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc16.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc16.yaml`` * - B200_NVL - High Throughput - 1024 / 8192 - 32 - `1k8k_tp1_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc32.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc32.yaml`` * - B200_NVL - Max Throughput - 1024 / 8192 - 64 - `1k8k_tp1_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc64.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config 
${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc64.yaml`` * - B200_NVL - Min Latency - 8192 / 1024 - 4 - `8k1k_tp1_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc4.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc4.yaml`` * - B200_NVL - Low Latency - 8192 / 1024 - 8 - `8k1k_tp1_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc8.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc8.yaml`` * - B200_NVL - Balanced - 8192 / 1024 - 16 - `8k1k_tp1_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc16.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc16.yaml`` * - B200_NVL - High Throughput - 8192 / 1024 - 32 - `8k1k_tp1_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc32.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc32.yaml`` * - B200_NVL - Max Throughput - 8192 / 1024 - 64 - `8k1k_tp1_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc64.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc64.yaml`` * - 2xB200_NVL - Min Latency - 1024 / 1024 - 4 - `1k1k_tp2_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc4.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc4.yaml`` * - 2xB200_NVL - Low Latency - 1024 / 1024 - 8 - `1k1k_tp2_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc8.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc8.yaml`` * - 2xB200_NVL - Balanced - 1024 / 1024 - 16 - `1k1k_tp2_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc16.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc16.yaml`` * - 2xB200_NVL - High Throughput - 1024 / 1024 - 32 - `1k1k_tp2_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc32.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc32.yaml`` * - 2xB200_NVL - Max Throughput - 1024 / 1024 - 64 - `1k1k_tp2_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc64.yaml`` 
+ - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc64.yaml`` * - 2xB200_NVL - Min Latency - 1024 / 8192 - 4 - `1k8k_tp2_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc4.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc4.yaml`` * - 2xB200_NVL - Low Latency - 1024 / 8192 - 8 - `1k8k_tp2_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc8.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc8.yaml`` * - 2xB200_NVL - Balanced - 1024 / 8192 - 16 - `1k8k_tp2_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc16.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc16.yaml`` * - 2xB200_NVL - High Throughput - 1024 / 8192 - 32 - `1k8k_tp2_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc32.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc32.yaml`` * - 2xB200_NVL - Max Throughput - 1024 / 8192 - 64 - `1k8k_tp2_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc64.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc64.yaml`` * - 2xB200_NVL - Min Latency - 8192 / 1024 - 4 - `8k1k_tp2_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc4.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc4.yaml`` * - 2xB200_NVL - Low Latency - 8192 / 1024 - 8 - `8k1k_tp2_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc8.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc8.yaml`` * - 2xB200_NVL - Balanced - 8192 / 1024 - 16 - `8k1k_tp2_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc16.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc16.yaml`` * - 2xB200_NVL - High Throughput - 8192 / 1024 - 32 - `8k1k_tp2_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc32.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc32.yaml`` * - 2xB200_NVL - Max Throughput - 8192 / 1024 - 64 - `8k1k_tp2_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options 
${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc64.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc64.yaml`` * - 4xB200_NVL - Min Latency - 1024 / 1024 - 4 - `1k1k_tp4_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc4.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc4.yaml`` * - 4xB200_NVL - Low Latency - 1024 / 1024 - 8 - `1k1k_tp4_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc8.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc8.yaml`` * - 4xB200_NVL - Balanced - 1024 / 1024 - 16 - `1k1k_tp4_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc16.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc16.yaml`` * - 4xB200_NVL - High Throughput - 1024 / 1024 - 32 - `1k1k_tp4_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc32.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc32.yaml`` * - 4xB200_NVL - Max Throughput - 1024 / 1024 - 64 - `1k1k_tp4_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc64.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc64.yaml`` * - 4xB200_NVL - Min Latency - 1024 / 8192 - 4 - `1k8k_tp4_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc4.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc4.yaml`` * - 4xB200_NVL - Low Latency - 1024 / 8192 - 8 - `1k8k_tp4_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc8.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc8.yaml`` * - 4xB200_NVL - Balanced - 1024 / 8192 - 16 - `1k8k_tp4_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc16.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc16.yaml`` * - 4xB200_NVL - High Throughput - 1024 / 8192 - 32 - `1k8k_tp4_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc32.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc32.yaml`` * - 4xB200_NVL - Max Throughput - 1024 / 8192 - 64 - `1k8k_tp4_conc64.yaml `_ - - 
``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc64.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc64.yaml`` * - 4xB200_NVL - Min Latency - 8192 / 1024 - 4 - `8k1k_tp4_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc4.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc4.yaml`` * - 4xB200_NVL - Low Latency - 8192 / 1024 - 8 - `8k1k_tp4_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc8.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc8.yaml`` * - 4xB200_NVL - Balanced - 8192 / 1024 - 16 - `8k1k_tp4_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc16.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc16.yaml`` * - 4xB200_NVL - High Throughput - 8192 / 1024 - 32 - `8k1k_tp4_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc32.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc32.yaml`` * - 4xB200_NVL - Max Throughput - 8192 / 1024 - 64 - `8k1k_tp4_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc64.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc64.yaml`` * - 8xB200_NVL - Min Latency - 1024 / 1024 - 4 - `1k1k_tp8_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc4.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc4.yaml`` * - 8xB200_NVL - Low Latency - 1024 / 1024 - 8 - `1k1k_tp8_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc8.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc8.yaml`` * - 8xB200_NVL - Balanced - 1024 / 1024 - 16 - `1k1k_tp8_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc16.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc16.yaml`` * - 8xB200_NVL - High Throughput - 1024 / 1024 - 32 - `1k1k_tp8_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc32.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc32.yaml`` * - 8xB200_NVL - Max 
Throughput - 1024 / 1024 - 64 - `1k1k_tp8_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc64.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc64.yaml`` * - 8xB200_NVL - Min Latency - 1024 / 8192 - 4 - `1k8k_tp8_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc4.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc4.yaml`` * - 8xB200_NVL - Low Latency - 1024 / 8192 - 8 - `1k8k_tp8_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc8.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc8.yaml`` * - 8xB200_NVL - Balanced - 1024 / 8192 - 16 - `1k8k_tp8_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc16.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc16.yaml`` * - 8xB200_NVL - High Throughput - 1024 / 8192 - 32 - `1k8k_tp8_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc32.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc32.yaml`` * - 8xB200_NVL - Max Throughput - 1024 / 8192 - 64 - `1k8k_tp8_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc64.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc64.yaml`` * - 8xB200_NVL - Min Latency - 8192 / 1024 - 4 - `8k1k_tp8_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc4.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc4.yaml`` * - 8xB200_NVL - Low Latency - 8192 / 1024 - 8 - `8k1k_tp8_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc8.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc8.yaml`` * - 8xB200_NVL - Balanced - 8192 / 1024 - 16 - `8k1k_tp8_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc16.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc16.yaml`` * - 8xB200_NVL - High Throughput - 8192 / 1024 - 32 - `8k1k_tp8_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc32.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config 
${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc32.yaml`` * - 8xB200_NVL - Max Throughput - 8192 / 1024 - 64 - `8k1k_tp8_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc64.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc64.yaml`` * - H200_SXM - Min Latency - 1024 / 1024 - 4 - `1k1k_tp1_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc4.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc4.yaml`` * - H200_SXM - Low Latency - 1024 / 1024 - 8 - `1k1k_tp1_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc8.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc8.yaml`` * - H200_SXM - Balanced - 1024 / 1024 - 16 - `1k1k_tp1_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc16.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc16.yaml`` * - H200_SXM - High Throughput - 1024 / 1024 - 32 - `1k1k_tp1_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc32.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc32.yaml`` * - H200_SXM - Max Throughput - 1024 / 1024 - 64 - `1k1k_tp1_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc64.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc64.yaml`` * - H200_SXM - Min Latency - 1024 / 8192 - 4 - `1k8k_tp1_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc4.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc4.yaml`` * - H200_SXM - Low Latency - 1024 / 8192 - 8 - `1k8k_tp1_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc8.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc8.yaml`` * - H200_SXM - Balanced - 1024 / 8192 - 16 - `1k8k_tp1_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc16.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc16.yaml`` * - H200_SXM - High Throughput - 1024 / 8192 - 32 - `1k8k_tp1_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc32.yaml`` + - 
``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc32.yaml`` * - H200_SXM - Max Throughput - 1024 / 8192 - 64 - `1k8k_tp1_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc64.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc64.yaml`` * - H200_SXM - Min Latency - 8192 / 1024 - 4 - `8k1k_tp1_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc4.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc4.yaml`` * - H200_SXM - Low Latency - 8192 / 1024 - 8 - `8k1k_tp1_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc8.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc8.yaml`` * - H200_SXM - Balanced - 8192 / 1024 - 16 - `8k1k_tp1_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc16.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc16.yaml`` * - H200_SXM - High Throughput - 8192 / 1024 - 32 - `8k1k_tp1_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc32.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc32.yaml`` * - H200_SXM - Max Throughput - 8192 / 1024 - 64 - `8k1k_tp1_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc64.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc64.yaml`` * - 2xH200_SXM - Min Latency - 1024 / 1024 - 4 - `1k1k_tp2_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc4.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc4.yaml`` * - 2xH200_SXM - Low Latency - 1024 / 1024 - 8 - `1k1k_tp2_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc8.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc8.yaml`` * - 2xH200_SXM - Balanced - 1024 / 1024 - 16 - `1k1k_tp2_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc16.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc16.yaml`` * - 2xH200_SXM - High Throughput - 1024 / 1024 - 32 - `1k1k_tp2_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options 
${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc32.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc32.yaml`` * - 2xH200_SXM - Max Throughput - 1024 / 1024 - 64 - `1k1k_tp2_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc64.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc64.yaml`` * - 2xH200_SXM - Min Latency - 1024 / 8192 - 4 - `1k8k_tp2_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc4.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc4.yaml`` * - 2xH200_SXM - Low Latency - 1024 / 8192 - 8 - `1k8k_tp2_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc8.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc8.yaml`` * - 2xH200_SXM - Balanced - 1024 / 8192 - 16 - `1k8k_tp2_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc16.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc16.yaml`` * - 2xH200_SXM - High Throughput - 1024 / 8192 - 32 - `1k8k_tp2_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc32.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc32.yaml`` * - 2xH200_SXM - Max Throughput - 1024 / 8192 - 64 - `1k8k_tp2_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc64.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc64.yaml`` * - 2xH200_SXM - Min Latency - 8192 / 1024 - 4 - `8k1k_tp2_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc4.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc4.yaml`` * - 2xH200_SXM - Low Latency - 8192 / 1024 - 8 - `8k1k_tp2_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc8.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc8.yaml`` * - 2xH200_SXM - Balanced - 8192 / 1024 - 16 - `8k1k_tp2_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc16.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc16.yaml`` * - 2xH200_SXM - High Throughput - 8192 / 1024 - 32 - `8k1k_tp2_conc32.yaml `_ - - 
``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc32.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc32.yaml`` * - 2xH200_SXM - Max Throughput - 8192 / 1024 - 64 - `8k1k_tp2_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc64.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc64.yaml`` * - 4xH200_SXM - Min Latency - 1024 / 1024 - 4 - `1k1k_tp4_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc4.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc4.yaml`` * - 4xH200_SXM - Low Latency - 1024 / 1024 - 8 - `1k1k_tp4_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc8.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc8.yaml`` * - 4xH200_SXM - Balanced - 1024 / 1024 - 16 - `1k1k_tp4_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc16.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc16.yaml`` * - 4xH200_SXM - High Throughput - 1024 / 1024 - 32 - `1k1k_tp4_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc32.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc32.yaml`` * - 4xH200_SXM - Max Throughput - 1024 / 1024 - 64 - `1k1k_tp4_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc64.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc64.yaml`` * - 4xH200_SXM - Min Latency - 1024 / 8192 - 4 - `1k8k_tp4_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc4.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc4.yaml`` * - 4xH200_SXM - Low Latency - 1024 / 8192 - 8 - `1k8k_tp4_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc8.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc8.yaml`` * - 4xH200_SXM - Balanced - 1024 / 8192 - 16 - `1k8k_tp4_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc16.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc16.yaml`` * - 4xH200_SXM - High 
Throughput - 1024 / 8192 - 32 - `1k8k_tp4_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc32.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc32.yaml`` * - 4xH200_SXM - Max Throughput - 1024 / 8192 - 64 - `1k8k_tp4_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc64.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc64.yaml`` * - 4xH200_SXM - Min Latency - 8192 / 1024 - 4 - `8k1k_tp4_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc4.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc4.yaml`` * - 4xH200_SXM - Low Latency - 8192 / 1024 - 8 - `8k1k_tp4_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc8.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc8.yaml`` * - 4xH200_SXM - Balanced - 8192 / 1024 - 16 - `8k1k_tp4_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc16.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc16.yaml`` * - 4xH200_SXM - High Throughput - 8192 / 1024 - 32 - `8k1k_tp4_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc32.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc32.yaml`` * - 4xH200_SXM - Max Throughput - 8192 / 1024 - 64 - `8k1k_tp4_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc64.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc64.yaml`` * - 8xH200_SXM - Min Latency - 1024 / 1024 - 4 - `1k1k_tp8_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc4.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc4.yaml`` * - 8xH200_SXM - Low Latency - 1024 / 1024 - 8 - `1k1k_tp8_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc8.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc8.yaml`` * - 8xH200_SXM - Balanced - 1024 / 1024 - 16 - `1k1k_tp8_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc16.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config 
${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc16.yaml`` * - 8xH200_SXM - High Throughput - 1024 / 1024 - 32 - `1k1k_tp8_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc32.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc32.yaml`` * - 8xH200_SXM - Max Throughput - 1024 / 1024 - 64 - `1k1k_tp8_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc64.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc64.yaml`` * - 8xH200_SXM - Min Latency - 1024 / 8192 - 4 - `1k8k_tp8_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc4.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc4.yaml`` * - 8xH200_SXM - Low Latency - 1024 / 8192 - 8 - `1k8k_tp8_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc8.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc8.yaml`` * - 8xH200_SXM - Balanced - 1024 / 8192 - 16 - `1k8k_tp8_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc16.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc16.yaml`` * - 8xH200_SXM - High Throughput - 1024 / 8192 - 32 - `1k8k_tp8_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc32.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc32.yaml`` * - 8xH200_SXM - Max Throughput - 1024 / 8192 - 64 - `1k8k_tp8_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc64.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc64.yaml`` * - 8xH200_SXM - Min Latency - 8192 / 1024 - 4 - `8k1k_tp8_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc4.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc4.yaml`` * - 8xH200_SXM - Low Latency - 8192 / 1024 - 8 - `8k1k_tp8_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc8.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc8.yaml`` * - 8xH200_SXM - Balanced - 8192 / 1024 - 16 - `8k1k_tp8_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options 
${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc16.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc16.yaml`` * - 8xH200_SXM - High Throughput - 8192 / 1024 - 32 - `8k1k_tp8_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc32.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc32.yaml`` * - 8xH200_SXM - Max Throughput - 8192 / 1024 - 64 - `8k1k_tp8_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc64.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc64.yaml`` .. end-openai/gpt-oss-120b diff --git a/docs/source/deployment-guide/deployment-guide-for-deepseek-r1-on-trtllm.md b/docs/source/deployment-guide/deployment-guide-for-deepseek-r1-on-trtllm.md index e4165eac09..881f86eb12 100644 --- a/docs/source/deployment-guide/deployment-guide-for-deepseek-r1-on-trtllm.md +++ b/docs/source/deployment-guide/deployment-guide-for-deepseek-r1-on-trtllm.md @@ -115,7 +115,7 @@ append: EOF Below is an example command to launch the TensorRT LLM server with the DeepSeek-R1 model from within the container. The command is specifically configured for the 1024/1024 Input/Output Sequence Length test. The explanation of each flag is shown in the “LLM API Options (YAML Configuration)” section. ```shell -trtllm-serve deepseek-ai/DeepSeek-R1-0528 --host 0.0.0.0 --port 8000 --extra_llm_api_options ${EXTRA_LLM_API_FILE} +trtllm-serve deepseek-ai/DeepSeek-R1-0528 --host 0.0.0.0 --port 8000 --config ${EXTRA_LLM_API_FILE} ``` After the server is set up, the client can now send prompt requests to the server and receive results. @@ -124,7 +124,7 @@ After the server is set up, the client can now send prompt requests to the serve -These options provide control over TensorRT LLM's behavior and are set within the YAML file passed to the `trtllm-serve` command via the `--extra_llm_api_options` argument. +These options provide control over TensorRT LLM's behavior and are set within the YAML file passed to the `trtllm-serve` command via the `--config` argument. #### `tensor_parallel_size` @@ -200,7 +200,7 @@ These options provide control over TensorRT LLM's behavior and are set within th * **Default**: `TRTLLM` -See the [`TorchLlmArgs` class](https://nvidia.github.io/TensorRT-LLM/llm-api/reference.html#tensorrt_llm.llmapi.TorchLlmArgs) for the full list of options which can be used in the `extra_llm_api_options`. +See the [`TorchLlmArgs` class](https://nvidia.github.io/TensorRT-LLM/llm-api/reference.html#tensorrt_llm.llmapi.TorchLlmArgs) for the full list of options which can be used in the YAML configuration file. ### Wide Expert Parallelism @@ -435,7 +435,7 @@ $$ The following tables list recommended configurations from the comprehensive database for different performance profiles. ```{eval-rst} -.. include:: note_sections.rst +.. include:: ../_includes/note_sections.rst :start-after: .. start-note-traffic-patterns :end-before: .. 
end-note-traffic-patterns diff --git a/docs/source/deployment-guide/deployment-guide-for-gpt-oss-on-trtllm.md b/docs/source/deployment-guide/deployment-guide-for-gpt-oss-on-trtllm.md index 5a9f9f4c72..d28f3fa9f3 100644 --- a/docs/source/deployment-guide/deployment-guide-for-gpt-oss-on-trtllm.md +++ b/docs/source/deployment-guide/deployment-guide-for-gpt-oss-on-trtllm.md @@ -113,7 +113,7 @@ append: EOF Below is an example command to launch the TensorRT LLM server with the GPT-OSS model from within the container. The command is specifically configured for the 1024/1024 Input/Output Sequence Length test. The explanation of each flag is shown in the “LLM API Options (YAML Configuration)” section. ```shell -trtllm-serve openai/gpt-oss-120b --host 0.0.0.0 --port 8000 --extra_llm_api_options ${EXTRA_LLM_API_FILE} +trtllm-serve openai/gpt-oss-120b --host 0.0.0.0 --port 8000 --config ${EXTRA_LLM_API_FILE} ``` After the server is set up, the client can now send prompt requests to the server and receive results. @@ -122,7 +122,7 @@ After the server is set up, the client can now send prompt requests to the serve -These options provide control over TensorRT LLM's behavior and are set within the YAML file passed to the `trtllm-serve` command via the `--extra_llm_api_options` argument. +These options provide control over TensorRT LLM's behavior and are set within the YAML file passed to the `trtllm-serve` command via the `--config` argument. #### `tensor_parallel_size` @@ -178,7 +178,7 @@ These options provide control over TensorRT LLM's behavior and are set within th * `backend`: The backend to use for MoE operations. **Default**: `CUTLASS` -See the [`TorchLlmArgs` class](https://nvidia.github.io/TensorRT-LLM/llm-api/reference.html#tensorrt_llm.llmapi.TorchLlmArgs) for the full list of options which can be used in the `extra_llm_api_options`. +See the [`TorchLlmArgs` class](https://nvidia.github.io/TensorRT-LLM/llm-api/reference.html#tensorrt_llm.llmapi.TorchLlmArgs) for the full list of options which can be used in the YAML configuration file. ## Testing API Endpoint @@ -383,7 +383,7 @@ $$ The following table lists recommended configurations from the comprehensive database for different performance profiles. ```{eval-rst} -.. include:: note_sections.rst +.. include:: ../_includes/note_sections.rst :start-after: .. start-note-traffic-patterns :end-before: .. end-note-traffic-patterns diff --git a/docs/source/deployment-guide/deployment-guide-for-kimi-k2-thinking-on-trtllm.md b/docs/source/deployment-guide/deployment-guide-for-kimi-k2-thinking-on-trtllm.md index 391a72091d..8ae2dac147 100644 --- a/docs/source/deployment-guide/deployment-guide-for-kimi-k2-thinking-on-trtllm.md +++ b/docs/source/deployment-guide/deployment-guide-for-kimi-k2-thinking-on-trtllm.md @@ -60,7 +60,7 @@ With the `EXTRA_OPTIONS_YAML_FILE`, use the following example command to launch ```bash trtllm-serve nvidia/Kimi-K2-Thinking-NVFP4 \ --host 0.0.0.0 --port 8000 \ - --extra_llm_api_options ${EXTRA_OPTIONS_YAML_FILE} + --config ${EXTRA_OPTIONS_YAML_FILE} ``` TensorRT LLM will load weights and select the best kernels during startup. 
The server is successfully launched when the following log is shown: diff --git a/docs/source/deployment-guide/deployment-guide-for-llama3.3-70b-on-trtllm.md b/docs/source/deployment-guide/deployment-guide-for-llama3.3-70b-on-trtllm.md index d3e328d810..f58405e8be 100644 --- a/docs/source/deployment-guide/deployment-guide-for-llama3.3-70b-on-trtllm.md +++ b/docs/source/deployment-guide/deployment-guide-for-llama3.3-70b-on-trtllm.md @@ -83,7 +83,7 @@ append: EOF Below is an example command to launch the TensorRT LLM server with the Llama-3.3-70B-Instruct-FP8 model from within the container. The command is specifically configured for the 1024/1024 Input/Output Sequence Length test. The explanation of each flag is shown in the “LLM API Options (YAML Configuration)” section. ```shell -trtllm-serve nvidia/Llama-3.3-70B-Instruct-FP8 --host 0.0.0.0 --port 8000 --extra_llm_api_options ${EXTRA_LLM_API_FILE} +trtllm-serve nvidia/Llama-3.3-70B-Instruct-FP8 --host 0.0.0.0 --port 8000 --config ${EXTRA_LLM_API_FILE} ``` After the server is set up, the client can now send prompt requests to the server and receive results. @@ -92,7 +92,7 @@ After the server is set up, the client can now send prompt requests to the serve -These options provide control over TensorRT LLM's behavior and are set within the YAML file passed to the `trtllm-serve` command via the `--extra_llm_api_options` argument. +These options provide control over TensorRT LLM's behavior and are set within the YAML file passed to the `trtllm-serve` command via the `--config` argument. #### `tensor_parallel_size` @@ -170,7 +170,7 @@ These options provide control over TensorRT LLM's behavior and are set within th  **Default**: TRTLLM -See the [TorchLlmArgs](https://nvidia.github.io/TensorRT-LLM/llm-api/reference.html#tensorrt_llm.llmapi.TorchLlmArgs) class for the full list of options which can be used in the `extra_llm_api_options`. +See the [TorchLlmArgs](https://nvidia.github.io/TensorRT-LLM/llm-api/reference.html#tensorrt_llm.llmapi.TorchLlmArgs) class for the full list of options which can be used in the YAML configuration file. ## Testing API Endpoint diff --git a/docs/source/deployment-guide/deployment-guide-for-llama4-scout-on-trtllm.md b/docs/source/deployment-guide/deployment-guide-for-llama4-scout-on-trtllm.md index 7d69b7a8be..d279ab3716 100644 --- a/docs/source/deployment-guide/deployment-guide-for-llama4-scout-on-trtllm.md +++ b/docs/source/deployment-guide/deployment-guide-for-llama4-scout-on-trtllm.md @@ -82,7 +82,7 @@ append: EOF Below is an example command to launch the TensorRT LLM server with the Llama-4-Scout-17B-16E-Instruct-FP8 model from within the container. The command is specifically configured for the 1024/1024 Input/Output Sequence Length test. The explanation of each flag is shown in the “LLM API Options (YAML Configuration)” section. ```shell -trtllm-serve nvidia/Llama-4-Scout-17B-16E-Instruct-FP8 --host 0.0.0.0 --port 8000 --extra_llm_api_options ${EXTRA_LLM_API_FILE} +trtllm-serve nvidia/Llama-4-Scout-17B-16E-Instruct-FP8 --host 0.0.0.0 --port 8000 --config ${EXTRA_LLM_API_FILE} ``` After the server is set up, the client can now send prompt requests to the server and receive results. @@ -91,7 +91,7 @@ After the server is set up, the client can now send prompt requests to the serve -These options provide control over TensorRT LLM's behavior and are set within the YAML file passed to the `trtllm-serve` command via the `--extra_llm_api_options` argument. 
+These options provide control over TensorRT LLM's behavior and are set within the YAML file passed to the `trtllm-serve` command via the `--config` argument. #### `tensor_parallel_size` @@ -166,7 +166,7 @@ These options provide control over TensorRT LLM's behavior and are set within th * **Default**: `TRTLLM` -See the [TorchLlmArgs](https://nvidia.github.io/TensorRT-LLM/llm-api/reference.html#tensorrt_llm.llmapi.TorchLlmArgs) class for the full list of options which can be used in the `extra_llm_api_options`. +See the [TorchLlmArgs](https://nvidia.github.io/TensorRT-LLM/llm-api/reference.html#tensorrt_llm.llmapi.TorchLlmArgs) class for the full list of options which can be used in the YAML configuration file. ## Testing API Endpoint diff --git a/docs/source/deployment-guide/deployment-guide-for-qwen3-next-on-trtllm.md b/docs/source/deployment-guide/deployment-guide-for-qwen3-next-on-trtllm.md index 46bf724b71..3ff4432d1b 100644 --- a/docs/source/deployment-guide/deployment-guide-for-qwen3-next-on-trtllm.md +++ b/docs/source/deployment-guide/deployment-guide-for-qwen3-next-on-trtllm.md @@ -61,7 +61,7 @@ append: EOF Below is an example command to launch the TensorRT LLM server with the Qwen3-Next model from within the container. ```shell -trtllm-serve Qwen/Qwen3-Next-80B-A3B-Thinking --host 0.0.0.0 --port 8000 --extra_llm_api_options ${EXTRA_LLM_API_FILE} +trtllm-serve Qwen/Qwen3-Next-80B-A3B-Thinking --host 0.0.0.0 --port 8000 --config ${EXTRA_LLM_API_FILE} ``` After the server is set up, the client can now send prompt requests to the server and receive results. @@ -70,7 +70,7 @@ After the server is set up, the client can now send prompt requests to the serve -These options provide control over TensorRT LLM's behavior and are set within the YAML file passed to the `trtllm-serve` command via the `--extra_llm_api_options` argument. +These options provide control over TensorRT LLM's behavior and are set within the YAML file passed to the `trtllm-serve` command via the `--config` argument. #### `tensor_parallel_size` @@ -127,7 +127,7 @@ These options provide control over TensorRT LLM's behavior and are set within th * `backend`: The backend to use for MoE operations. **Default**: `CUTLASS` -See the [`TorchLlmArgs` class](https://nvidia.github.io/TensorRT-LLM/llm-api/reference.html#tensorrt_llm.llmapi.TorchLlmArgs) for the full list of options which can be used in the `extra_llm_api_options`. +See the [`TorchLlmArgs` class](https://nvidia.github.io/TensorRT-LLM/llm-api/reference.html#tensorrt_llm.llmapi.TorchLlmArgs) for the full list of options which can be used in the YAML configuration file. ## Testing API Endpoint @@ -220,7 +220,7 @@ If you want to save the results to a file add the following options. --result-filename "concurrency_${concurrency}.json" ``` -For more benchmarking options see [benchmark_serving.py](https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/serve/scripts/benchmark_serving.py) +For more benchmarking options see [benchmark_serving.py](https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/serve/scripts/benchmark_serving.py) Run `bench.sh` to begin a serving benchmark. This will take a long time if you run all the concurrencies mentioned in the above `bench.sh` script. 
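The deployment-guide hunks above all end with the same step: once `trtllm-serve` is up with the YAML passed via `--config`, "the client can now send prompt requests to the server and receive results" through its OpenAI-compatible endpoint. As a minimal sketch (not part of the patch), assuming one of the launch commands above is listening on `localhost:8000` and that the `openai` Python package is installed, a test request could look like the following; the dummy API key and the use of `client.models.list()` to discover the served model id are assumptions rather than anything mandated by these guides.

```python
from openai import OpenAI

# Assumes a trtllm-serve instance launched as in the guides above, e.g.
# `trtllm-serve openai/gpt-oss-120b --host 0.0.0.0 --port 8000 --config ${EXTRA_LLM_API_FILE}`.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="not-used")

# Ask the server which model id it registered instead of hard-coding one.
model_id = client.models.list().data[0].id

response = client.chat.completions.create(
    model=model_id,
    messages=[{"role": "user", "content": "Summarize what a KV cache is in one sentence."}],
    max_tokens=64,
)
print(response.choices[0].message.content)
```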
diff --git a/docs/source/deployment-guide/deployment-guide-for-qwen3-on-trtllm.md b/docs/source/deployment-guide/deployment-guide-for-qwen3-on-trtllm.md index 894c6a1e63..bda3e1a4c4 100644 --- a/docs/source/deployment-guide/deployment-guide-for-qwen3-on-trtllm.md +++ b/docs/source/deployment-guide/deployment-guide-for-qwen3-on-trtllm.md @@ -66,7 +66,7 @@ append: EOF Below is an example command to launch the TensorRT LLM server with the Qwen3 model from within the container. ```shell -trtllm-serve Qwen/Qwen3-30B-A3B --host 0.0.0.0 --port 8000 --extra_llm_api_options ${EXTRA_LLM_API_FILE} +trtllm-serve Qwen/Qwen3-30B-A3B --host 0.0.0.0 --port 8000 --config ${EXTRA_LLM_API_FILE} ``` After the server is set up, the client can now send prompt requests to the server and receive results. @@ -75,7 +75,7 @@ After the server is set up, the client can now send prompt requests to the serve -These options provide control over TensorRT LLM's behavior and are set within the YAML file passed to the `trtllm-serve` command via the `--extra_llm_api_options` argument. +These options provide control over TensorRT LLM's behavior and are set within the YAML file passed to the `trtllm-serve` command via the `--config` argument. #### `tensor_parallel_size` @@ -127,10 +127,10 @@ These options provide control over TensorRT LLM's behavior and are set within th * **Options**: * `backend`: The backend to use for MoE operations. - + **Default**: `CUTLASS` -See the [`TorchLlmArgs` class](https://nvidia.github.io/TensorRT-LLM/llm-api/reference.html#tensorrt_llm.llmapi.TorchLlmArgs) for the full list of options which can be used in the `extra_llm_api_options`. +See the [`TorchLlmArgs` class](https://nvidia.github.io/TensorRT-LLM/llm-api/reference.html#tensorrt_llm.llmapi.TorchLlmArgs) for the full list of options which can be used in the YAML configuration file. ## Testing API Endpoint @@ -247,7 +247,7 @@ If you want to save the results to a file add the following options. --result-filename "concurrency_${concurrency}.json" ``` -For more benchmarking options see [benchmark_serving.py](https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/serve/scripts/benchmark_serving.py) +For more benchmarking options see [benchmark_serving.py](https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/serve/scripts/benchmark_serving.py) Run `bench.sh` to begin a serving benchmark. This will take a long time if you run all the concurrencies mentioned in the above `bench.sh` script. diff --git a/docs/source/deployment-guide/index.rst b/docs/source/deployment-guide/index.rst index 644a9d9ae9..1d2df5e5b6 100644 --- a/docs/source/deployment-guide/index.rst +++ b/docs/source/deployment-guide/index.rst @@ -17,7 +17,7 @@ The TensorRT LLM Docker container makes these config files available at ``/app/t export TRTLLM_DIR="/app/tensorrt_llm" # path to the TensorRT LLM repo in your local environment -.. include:: note_sections.rst +.. include:: ../_includes/note_sections.rst :start-after: .. start-note-quick-start-isl-osl :end-before: .. 
end-note-quick-start-isl-osl @@ -36,52 +36,52 @@ This table is designed to provide a straightforward starting point; for detailed - H100, H200 - Max Throughput - `deepseek-r1-throughput.yaml `_ - - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/curated/deepseek-r1-throughput.yaml`` + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --config ${TRTLLM_DIR}/examples/configs/curated/deepseek-r1-throughput.yaml`` * - `DeepSeek-R1 `_ - B200, GB200 - Max Throughput - `deepseek-r1-deepgemm.yaml `_ - - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/curated/deepseek-r1-deepgemm.yaml`` + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --config ${TRTLLM_DIR}/examples/configs/curated/deepseek-r1-deepgemm.yaml`` * - `DeepSeek-R1 (NVFP4) `_ - B200, GB200 - Max Throughput - `deepseek-r1-throughput.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-FP4 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/curated/deepseek-r1-throughput.yaml`` + - ``trtllm-serve nvidia/DeepSeek-R1-FP4 --config ${TRTLLM_DIR}/examples/configs/curated/deepseek-r1-throughput.yaml`` * - `DeepSeek-R1 (NVFP4) `_ - B200, GB200 - Min Latency - `deepseek-r1-latency.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/curated/deepseek-r1-latency.yaml`` + - ``trtllm-serve nvidia/DeepSeek-R1-FP4-v2 --config ${TRTLLM_DIR}/examples/configs/curated/deepseek-r1-latency.yaml`` * - `gpt-oss-120b `_ - Any - Max Throughput - `gpt-oss-120b-throughput.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/curated/gpt-oss-120b-throughput.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/curated/gpt-oss-120b-throughput.yaml`` * - `gpt-oss-120b `_ - Any - Min Latency - `gpt-oss-120b-latency.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/curated/gpt-oss-120b-latency.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/curated/gpt-oss-120b-latency.yaml`` * - `Qwen3-Next-80B-A3B-Thinking `_ - Any - Max Throughput - `qwen3-next.yaml `_ - - ``trtllm-serve Qwen/Qwen3-Next-80B-A3B-Thinking --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/curated/qwen3-next.yaml`` + - ``trtllm-serve Qwen/Qwen3-Next-80B-A3B-Thinking --config ${TRTLLM_DIR}/examples/configs/curated/qwen3-next.yaml`` * - Qwen3 family (e.g. 
`Qwen3-30B-A3B `_) - Any - Max Throughput - `qwen3.yaml `_ - - ``trtllm-serve Qwen/Qwen3-30B-A3B --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/curated/qwen3.yaml`` (swap to another Qwen3 model name as needed) + - ``trtllm-serve Qwen/Qwen3-30B-A3B --config ${TRTLLM_DIR}/examples/configs/curated/qwen3.yaml`` (swap to another Qwen3 model name as needed) * - `Llama-3.3-70B (FP8) `_ - Any - Max Throughput - `llama-3.3-70b.yaml `_ - - ``trtllm-serve nvidia/Llama-3.3-70B-Instruct-FP8 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/curated/llama-3.3-70b.yaml`` + - ``trtllm-serve nvidia/Llama-3.3-70B-Instruct-FP8 --config ${TRTLLM_DIR}/examples/configs/curated/llama-3.3-70b.yaml`` * - `Llama 4 Scout (FP8) `_ - Any - Max Throughput - `llama-4-scout.yaml `_ - - ``trtllm-serve nvidia/Llama-4-Scout-17B-16E-Instruct-FP8 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/curated/llama-4-scout.yaml`` + - ``trtllm-serve nvidia/Llama-4-Scout-17B-16E-Instruct-FP8 --config ${TRTLLM_DIR}/examples/configs/curated/llama-4-scout.yaml`` Model-Specific Deployment Guides --------------------------------- diff --git a/docs/source/developer-guide/perf-benchmarking.md b/docs/source/developer-guide/perf-benchmarking.md index ab6feab7e3..e95e28c496 100644 --- a/docs/source/developer-guide/perf-benchmarking.md +++ b/docs/source/developer-guide/perf-benchmarking.md @@ -2,6 +2,13 @@ # TensorRT LLM Benchmarking + +```{eval-rst} +.. include:: ../_includes/note_sections.rst + :start-after: .. start-note-config-flag-alias + :end-before: .. end-note-config-flag-alias +``` + TensorRT LLM provides the `trtllm-bench` CLI, a packaged benchmarking utility that aims to make it easier for users to reproduce our officially published [performance overview](./perf-overview.md#throughput-measurements). `trtllm-bench` provides the follows: @@ -176,7 +183,7 @@ trtllm-bench --model meta-llama/Llama-3.1-8B prepare-dataset --output /tmp/synth To benchmark the PyTorch backend (`tensorrt_llm._torch`), use the following command with [dataset](#preparing-a-dataset) generated from previous steps. The `throughput` benchmark initializes the backend by tuning against the dataset provided via `--dataset` (or the other build mode settings described above). -Note that CUDA graph is enabled by default. You can add additional pytorch config with `--extra_llm_api_options` followed by the path to a YAML file. For more details, please refer to the help text by running the command with `--help`. +Note that CUDA graph is enabled by default. You can add additional pytorch config with `--config` followed by the path to a YAML file. For more details, please refer to the help text by running the command with `--help`. ```{tip} The command below specifies the `--model_path` option. The model path is optional and used only when you want to run a locally @@ -289,7 +296,7 @@ The generated dataset will include LoRA request metadata. 
Below is an example of **LoRA Configuration** -Create an `extra-llm-api-options.yaml` file with LoRA configuration: +Create a `config.yaml` file with LoRA configuration: ```yaml lora_config: @@ -314,7 +321,7 @@ trtllm-bench --model /path/to/base/model \ throughput \ --dataset synthetic_lora_data.json \ --backend pytorch \ - --extra_llm_api_options extra-llm-api-options.yaml + --config config.yaml ``` ```{note} diff --git a/docs/source/developer-guide/perf-overview.md b/docs/source/developer-guide/perf-overview.md index ae3a0072e9..8602ff1896 100644 --- a/docs/source/developer-guide/perf-overview.md +++ b/docs/source/developer-guide/perf-overview.md @@ -269,7 +269,7 @@ Testing was performed using the PyTorch backend - this workflow does not require | Stage | Description | Command | | :- | - | - | | [Dataset](#preparing-a-dataset) | Create a synthetic dataset | `python benchmarks/cpp/prepare_dataset.py --tokenizer=$model_name --stdout token-norm-dist --num-requests=$num_requests --input-mean=$isl --output-mean=$osl --input-stdev=0 --output-stdev=0 > $dataset_file` | -| [Run](#running-the-benchmark) | Run a benchmark with a dataset | `trtllm-bench --model $model_name throughput --dataset $dataset_file --backend pytorch --extra_llm_api_options $llm_options` | +| [Run](#running-the-benchmark) | Run a benchmark with a dataset | `trtllm-bench --model $model_name throughput --dataset $dataset_file --backend pytorch --config $llm_options` | ### Variables @@ -323,7 +323,7 @@ a model name (HuggingFace reference or path to a local model), a [generated data For dense / non-MoE models: ```shell -trtllm-bench --tp $tp_size --pp $pp_size --model $model_name throughput --dataset $dataset_file --backend pytorch --extra_llm_api_options $llm_options +trtllm-bench --tp $tp_size --pp $pp_size --model $model_name throughput --dataset $dataset_file --backend pytorch --config $llm_options ``` Llama 3.3 @@ -337,7 +337,7 @@ cuda_graph_config: For MoE models: ```shell -trtllm-bench --tp $tp_size --pp $pp_size --ep $ep_size --model $model_name throughput --dataset $dataset_file --backend pytorch --extra_llm_api_options $llm_options +trtllm-bench --tp $tp_size --pp $pp_size --ep $ep_size --model $model_name throughput --dataset $dataset_file --backend pytorch --config $llm_options ``` GPT-OSS: diff --git a/docs/source/features/auto_deploy/advanced/benchmarking_with_trtllm_bench.md b/docs/source/features/auto_deploy/advanced/benchmarking_with_trtllm_bench.md index d5e0cde8f2..84f8015889 100644 --- a/docs/source/features/auto_deploy/advanced/benchmarking_with_trtllm_bench.md +++ b/docs/source/features/auto_deploy/advanced/benchmarking_with_trtllm_bench.md @@ -24,7 +24,13 @@ As in the PyTorch workflow, AutoDeploy does not require a separate `trtllm-bench ## Advanced Configuration -For more granular control over AutoDeploy's behavior during benchmarking, use the `--extra_llm_api_options` flag with a YAML configuration file: +For more granular control over AutoDeploy's behavior during benchmarking, use the `--config` flag with a YAML configuration file: + +```{eval-rst} +.. include:: ../../../_includes/note_sections.rst + :start-after: .. start-note-config-flag-alias + :end-before: .. 
end-note-config-flag-alias +``` ```bash trtllm-bench \ @@ -32,7 +38,7 @@ trtllm-bench \ throughput \ --dataset /tmp/synthetic_128_128.txt \ --backend _autodeploy \ - --extra_llm_api_options autodeploy_config.yaml + --config autodeploy_config.yaml ``` ### Configuration Examples diff --git a/docs/source/features/disagg-serving.md b/docs/source/features/disagg-serving.md index ce52b9a3d5..b6eb4b17f9 100644 --- a/docs/source/features/disagg-serving.md +++ b/docs/source/features/disagg-serving.md @@ -1,4 +1,4 @@ -# Disaggregated Serving +# Disaggregated Serving - [Motivation](#Motivation) - [KV Cache Exchange](#KV-Cache-Exchange) @@ -100,6 +100,12 @@ For more information on how to use Dynamo with TensorRT-LLM, please refer to [th The second approach to evaluate disaggregated LLM inference with TensorRT LLM involves launching a separate OpenAI-compatible server per context and generation instance using `trtllm-serve`. An additional server, referred to as the "disaggregated" server, is also launched with `trtllm-serve` and acts as an orchestrator which receives client requests and dispatches them to the appropriate context and generation servers via OpenAI REST API. Figure 6 below illustrates the disaggregated serving workflow when using this approach. When a context instance is done generating the KV blocks associated with the prompt, it returns a response to the disaggregated server. This response includes the prompt tokens, the first generated token and metadata associated with the context request and context instance. This metadata is referred to as context parameters (`ctx_params` in Figure 6). These parameters are then used by the generation instances to establish communication with the context instance and retrieve the KV cache blocks associated with the request. +```{eval-rst} +.. include:: ../_includes/note_sections.rst + :start-after: .. start-note-config-flag-alias + :end-before: .. end-note-config-flag-alias +``` +
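The disaggregated-serving workflow described above (a context instance prefills the prompt, returns the first generated token plus `ctx_params`, and a generation instance then uses those parameters to pull the KV blocks and finish decoding) can be summarized as a small control-flow sketch. Everything below is illustrative only: the function names and payload fields are hypothetical stand-ins that mirror the Figure 6 message flow, not TensorRT LLM APIs.

```python
from dataclasses import dataclass
from typing import List

# Hypothetical stand-ins for the context / generation workers and the
# orchestrating "disaggregated" server; they only mirror the message flow
# described above, not any TensorRT LLM interface.

@dataclass
class CtxParams:
    """Metadata a generation instance needs to fetch KV blocks from a context instance."""
    context_endpoint: str
    kv_block_ids: List[int]

@dataclass
class ContextResponse:
    prompt_tokens: List[int]
    first_token: int
    ctx_params: CtxParams

def run_context_phase(prompt_tokens: List[int]) -> ContextResponse:
    # Placeholder context server: prefill the prompt, emit the first token, and
    # describe where its KV blocks live so a generation instance can retrieve them.
    return ContextResponse(
        prompt_tokens=prompt_tokens,
        first_token=42,  # dummy token id
        ctx_params=CtxParams(context_endpoint="ctx-0:8001", kv_block_ids=[0, 1, 2]),
    )

def run_generation_phase(resp: ContextResponse, max_new_tokens: int) -> List[int]:
    # Placeholder generation server: use resp.ctx_params to pull the KV blocks
    # from the context instance, then continue decoding after first_token.
    return [resp.first_token + i + 1 for i in range(max_new_tokens)]  # dummy tokens

def disaggregated_server(prompt_tokens: List[int], max_new_tokens: int) -> List[int]:
    # Orchestrator: dispatch prefill to a context instance, forward ctx_params to
    # a generation instance, and return the combined output to the client.
    ctx_resp = run_context_phase(prompt_tokens)
    return [ctx_resp.first_token] + run_generation_phase(ctx_resp, max_new_tokens - 1)

if __name__ == "__main__":
    print(disaggregated_server(prompt_tokens=[1, 2, 3], max_new_tokens=4))
```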
@@ -126,19 +132,19 @@ For example, you could launch two context servers and one generation servers as ``` -# Generate context_extra-llm-api-config.yml +# Generate context_config.yml # Overlap scheduler for context servers are disabled because it's not supported for disaggregated context servers yet -echo -e "disable_overlap_scheduler: True\ncache_transceiver_config:\n backend: UCX\n max_tokens_in_buffer: 2048" > context_extra-llm-api-config.yml +echo -e "disable_overlap_scheduler: True\ncache_transceiver_config:\n backend: UCX\n max_tokens_in_buffer: 2048" > context_config.yml # Start Context servers -CUDA_VISIBLE_DEVICES=0 trtllm-serve TinyLlama/TinyLlama-1.1B-Chat-v1.0 --host localhost --port 8001 --backend pytorch --extra_llm_api_options ./context_extra-llm-api-config.yml &> log_ctx_0 & -CUDA_VISIBLE_DEVICES=1 trtllm-serve TinyLlama/TinyLlama-1.1B-Chat-v1.0 --host localhost --port 8002 --backend pytorch --extra_llm_api_options ./context_extra-llm-api-config.yml &> log_ctx_1 & +CUDA_VISIBLE_DEVICES=0 trtllm-serve TinyLlama/TinyLlama-1.1B-Chat-v1.0 --host localhost --port 8001 --backend pytorch --config ./context_config.yml &> log_ctx_0 & +CUDA_VISIBLE_DEVICES=1 trtllm-serve TinyLlama/TinyLlama-1.1B-Chat-v1.0 --host localhost --port 8002 --backend pytorch --config ./context_config.yml &> log_ctx_1 & -# Generate gen_extra-llm-api-config.yml -echo -e "cache_transceiver_config:\n backend: UCX\n max_tokens_in_buffer: 2048" > gen_extra-llm-api-config.yml +# Generate gen_config.yml +echo -e "cache_transceiver_config:\n backend: UCX\n max_tokens_in_buffer: 2048" > gen_config.yml # Start Generation servers -CUDA_VISIBLE_DEVICES=2 trtllm-serve TinyLlama/TinyLlama-1.1B-Chat-v1.0 --host localhost --port 8003 --backend pytorch --extra_llm_api_options ./gen_extra-llm-api-config.yml &> log_gen_0 & +CUDA_VISIBLE_DEVICES=2 trtllm-serve TinyLlama/TinyLlama-1.1B-Chat-v1.0 --host localhost --port 8003 --backend pytorch --config ./gen_config.yml &> log_gen_0 & ``` Once the context and generation servers are launched, you can launch the disaggregated server, which will accept requests from clients and do the orchestration between context diff --git a/docs/source/features/guided-decoding.md b/docs/source/features/guided-decoding.md index 110efc8e51..3591d1808f 100644 --- a/docs/source/features/guided-decoding.md +++ b/docs/source/features/guided-decoding.md @@ -9,14 +9,20 @@ TensorRT LLM supports two grammar backends: ## Online API: `trtllm-serve` -If you are using `trtllm-serve`, enable guided decoding by specifying `guided_decoding_backend` with `xgrammar` or `llguidance` in the YAML configuration file, and pass it to `--extra_llm_api_options`. For example, +If you are using `trtllm-serve`, enable guided decoding by specifying `guided_decoding_backend` with `xgrammar` or `llguidance` in the YAML configuration file, and pass it to `--config`. For example, + +```{eval-rst} +.. include:: ../_includes/note_sections.rst + :start-after: .. start-note-config-flag-alias + :end-before: .. end-note-config-flag-alias +``` ```bash -cat > extra_llm_api_options.yaml < config.yaml < @@ -158,21 +164,21 @@ For the op outside of attention and MLP, the developer should obey the torch.com

Figure 2. TensorRT LLM Custom torch.compile Backend Overview

-Above is the overview of the TensorRT LLM custom backend for `torch.compile`. +Above is the overview of the TensorRT LLM custom backend for `torch.compile`. #### Torch IR Optimization Torch IR is the Fx graph that is directly traced by Torch Dynamo. It has several important features for us to do some graph rewriting and get information: 1. Preserve the operations as is: We can easily find a specific operation and then transform it to arbitrary operations. No need to deal with `auto_functionalize`, etc. -2. Preserve original variable tensor name in the Fx graph: For Piecewise CUDA Graph, it needs to find the correct `SymInt` which represents the token number. Hence, we rely on the `input_ids`'s shape to make it find the `SymInt` correctly. +2. Preserve original variable tensor name in the Fx graph: For Piecewise CUDA Graph, it needs to find the correct `SymInt` which represents the token number. Hence, we rely on the `input_ids`'s shape to make it find the `SymInt` correctly. #### ATen IR Optimization We get ATen IR after explicitly calling `aot_module_simplified` on the Fx graph. ATen IR is 1. In SSA format (no input mutations) -2. Strict subset of aten op (<250): In Torch IR, Python native add op, `torch.Tensor().add()`, `torch.aten.add.Tensor` could be three different ops. After the transform, they will be the same op. +2. Strict subset of aten op (<250): In Torch IR, Python native add op, `torch.Tensor().add()`, `torch.aten.add.Tensor` could be three different ops. After the transform, they will be the same op. 3. Guaranteed metadata information, e.g., dtype and shape propagation On this IR level, TensorRT LLM will do the following optimization @@ -183,16 +189,16 @@ All fusions are located in `tensorrt_llm/_torch/compilation/patterns` and implem 1. Inadequate handling of scalars and lists: - Scalars get specialized into the traced pattern, forcing one pattern per value—impractical and non-general. - - Lists are flattened, turning elements into separate input arguments, making it impossible to match the original operation. + - Lists are flattened, turning elements into separate input arguments, making it impossible to match the original operation. 2. Trace-driven pitfalls: Because it’s trace-based, the generated source patterns may not meet our needs and can introduce additional issues as we expand pattern coverage. We mainly do the operation fusion for AllReduce & RMSNorm. 1. AllReduce related fusion: Fuse the following operations into one AllReduce op. + AllReduce + Residual + RMSNorm - + AllReduce + Residual + RMSNorm + FP8 Quantization + + AllReduce + Residual + RMSNorm + FP8 Quantization + AllReduce + Residual + RMSNorm + FP4 Quantization -2. AllReduce with User Buffer: Converts AllReduce operations to use userbuffers to avoid extra copy overhead. +2. AllReduce with User Buffer: Converts AllReduce operations to use userbuffers to avoid extra copy overhead. We enable these fusions in torch.compile because they’re difficult to express in eager mode. For the AllReduce + RMSNorm fusion, which is cross-module, implementing it in eager mode would require moving code between modules, leading to redundant, complex, and hard-to-maintain logic. @@ -204,7 +210,7 @@ Because ATen IR is SSA, in-place operations are rewritten as out-of-place via a ##### Auto Multi-stream -Currently torch.compile won't create a subgraph for user user-defined CUDA stream. Instead, it will convert it to `set_stream`. 
The set_stream op doesn't have any consumers, so it will be removed in the Torch IR to ATen IR transformation, thus losing all the multi-stream scheduling. +Currently torch.compile won't create a subgraph for a user-defined CUDA stream. Instead, it will convert it to `set_stream`. The set_stream op doesn't have any consumers, so it will be removed in the Torch IR to ATen IR transformation, thus losing all the multi-stream scheduling. To address this, we implemented an auto multi-stream scheduler: @@ -214,7 +220,7 @@ To address this, we implemented an auto multi-stream scheduler: 3. Schedules nodes onto up to `max_num_streams` specified by user config -4. Insert multi-stream related custom op: since the Fx graph executes operators in list order, so we insert streaming-control operators directly into the graph. Moreover, as these operators have no users, we cannot perform dead-code elimination after multi-stream scheduling. Below is an example of multi-stream, which `trtllm.dsv3_router_gemm_op.default` and `trtllm.silu_and_mul.default` + `trtllm.fp4_quantize.default` execute in parallel. +4. Insert multi-stream related custom op: since the Fx graph executes operators in list order, we insert streaming-control operators directly into the graph. Moreover, as these operators have no users, we cannot perform dead-code elimination after multi-stream scheduling. Below is an example of multi-stream scheduling, in which `trtllm.dsv3_router_gemm_op.default` and `trtllm.silu_and_mul.default` + `trtllm.fp4_quantize.default` execute in parallel. ``` call_function record_event trtllm.record_event (1,) {} @@ -238,7 +244,7 @@ To address this, we implemented an auto multi-stream scheduler: call_function record_stream_1 trtllm.record_stream (mm_1, 1) {} call_function record_event_4 trtllm.record_event (2,) {} call_function set_stream_1 trtllm.set_stream (0,) {} - call_function wait_event_2 trtllm.wait_event (2,) + call_function wait_event_2 trtllm.wait_event (2,) ``` #### Piecewise CUDA Graph @@ -254,14 +260,14 @@ In the current design, we assume the attention block is the only non-capturable Notes: -1. Attention **MUST NOT** have any output. The output tensor should be allocated by CUDA Graph. -2. Each sub-cudagraph **MUST** have at least one input tensor that contains the number of tokens in the shape. -3. Only allow dynamic shape for `num_of_tokens` dim. +1. Attention **MUST NOT** have any output. The output tensor should be allocated by CUDA Graph. +2. Each sub-cudagraph **MUST** have at least one input tensor that contains the number of tokens in the shape. +3. Only allow dynamic shape for `num_of_tokens` dim. ### Common Trace Failure 1. Custom op fake kernel: For every custom op, developers must implement a correct fake kernel. **Make sure to update the corresponding fake kernel when the custom op is changed** -2. Dynamic Iteration Number Loop: This is technically not a trace failure, but it will introduce long-time tracing that is generally not acceptable. When torch.compile tries to convert PyTorch modeling code to Fx graph, it will try to unroll the loop. For a loop that has a large and dynamic loop number with a large loop body, the tracing process will take a long time to do the unrolling. +2. Dynamic Iteration Number Loop: This is technically not a trace failure, but it will introduce long-time tracing that is generally not acceptable. When torch.compile tries to convert PyTorch modeling code to Fx graph, it will try to unroll the loop.
For a loop that has a large and dynamic loop number with a large loop body, the tracing process will take a long time to do the unrolling. 1. If the IO of the loop can be easily written into a custom op format, try to replace it with a custom op 2. If the loop num is unchanged during the whole inference service lifetime, then it is ok to leave the loop as is. (e.g., Model decoder layer loop) @@ -276,30 +282,30 @@ Notes: + `torch.nonzeros()`: Produce data-dependent dynamic shape tensor + `torch.sym_min`: `SymInt` aware min + `torch.Tensor.tolist()`, `torch.Tensor.item()` - + **Solution:** Use them inside a custom op if these operators don't get involved in producing the custom op's output tensor. + + **Solution:** Use them inside a custom op if these operators don't get involved in producing the custom op's output tensor. -2. Use a custom object’s method: For a class like mapping config, we cannot directly use its method like has_pp() in the model forward. +2. Use a custom object’s method: For a class like mapping config, we cannot directly use its method like has_pp() in the model forward. - + **Solution**: We should convert it to a bool in the model init and use the bool. + + **Solution**: We should convert it to a bool in the model init and use the bool. ```python class Mapping(object): def __init__(self, ...): ... - + def has_pp(self): # Cannot use this method in torch.compile return self.pp_size > 1 ``` 3. Data Dependent Control(DDC) flow involved in code - + **Solution**: Try to avoid DDC in the code. Try to pre-compute the result outside of torch.compile's scope. For the following example, try to pre-compute the `torch.sum(data)` at the data preparation stage, and pass the result to the `forward`. + + **Solution**: Try to avoid DDC in the code. Try to pre-compute the result outside of torch.compile's scope. For the following example, try to pre-compute the `torch.sum(data)` at the data preparation stage, and pass the result to the `forward`. ```python class TestCase(torch.nn.Module): def __init__(self): super().__init__() - + def forward(self, x, data): y = x ** 2 if torch.sum(data) >= 4: # Data Dependent Control Here! @@ -308,7 +314,7 @@ Notes: t = y / 2 t = t + 10 return t - + test_case = TestCase() test_case = torch.compile(test_case, backend=Backend()) x = torch.randn(5).cuda() @@ -320,15 +326,15 @@ Notes: ### Recompilation -1. Try not to use data-dependent dynamic shapes in the model forward. (e.g., slice the tensor based on input value). This will introduce 0/1 specialization to the model and will possibly introduce recompile. +1. Try not to use data-dependent dynamic shapes in the model forward. (e.g., slice the tensor based on input value). This will introduce 0/1 specialization to the model and will possibly introduce recompile. 1. **0/1 specialization**: torch.compile will recompile the model if a dynamic tensor’s dim equals 0 or 1. In the worst case, it will recompile 3 times for 1 dimension: 0,1, >2 -2. For an int argument that would change during runtime, use `SymInt` rather than int in the C++ custom op definition. Otherwise, it will trigger a recompile when the value changes. +2. For an int argument that would change during runtime, use `SymInt` rather than int in the C++ custom op definition. Otherwise, it will trigger a recompile when the value changes. ```c++ TORCH_LIBRARY_FRAGMENT(trtllm, m) - { + { m.def("allgather(Tensor input, SymInt[]? sizes, int[] group) -> Tensor"); m.def("allgather_list(Tensor[] input_list, SymInt[]? 
sizes, int[] group) -> Tensor[]"); } @@ -340,13 +346,13 @@ Notes: 2. Control Flow based on dynamic shape - 3. Next power of two: Previously, we used `bit_length()` to implement the next power of 2 function. However, it will cause a recompile for every int value. Now rewrite the code to be torch.compile-friendly. + 3. Next power of two: Previously, we used `bit_length()` to implement the next power of 2 function. However, it will cause a recompile for every int value. Now rewrite the code to be torch.compile-friendly. ```python def next_positive_power_of_2(x: int) -> int: if x < 1: return 1 - + # Following code is equivalent to 1 << (x - 1).bit_length() # But this impl does not contain bit_length(), so it can be used by torch compile. # It can correctly handle 64-bit numbers, which should be enough for now. @@ -359,5 +365,3 @@ Notes: n |= n >> 32 return n + 1 ``` - - diff --git a/docs/source/helper.py b/docs/source/helper.py index 675bd697e9..9f6530e166 100644 --- a/docs/source/helper.py +++ b/docs/source/helper.py @@ -358,15 +358,20 @@ def update_version(): docs_source_dir = Path(__file__).parent.resolve() md_files = list(docs_source_dir.rglob("*.md")) + # Default is to replace `release:x.y.z` placeholders; set to 0 to disable. + if os.environ.get("TRTLLM_DOCS_REPLACE_CONTAINER_TAG", "1") != "1": + return + for file_path in md_files: with open(file_path, "r") as f: content = f.read() - content = content.replace( + updated = content.replace( "nvcr.io/nvidia/tensorrt-llm/release:x.y.z", f"nvcr.io/nvidia/tensorrt-llm/release:{version}", ) - with open(file_path, "w") as f: - f.write(content) + if updated != content: + with open(file_path, "w") as f: + f.write(updated) if __name__ == "__main__": diff --git a/docs/source/legacy/performance/perf-benchmarking.md b/docs/source/legacy/performance/perf-benchmarking.md index 9530b6da1b..caca11a7a4 100644 --- a/docs/source/legacy/performance/perf-benchmarking.md +++ b/docs/source/legacy/performance/perf-benchmarking.md @@ -415,11 +415,17 @@ Total Latency (ms): 13525.6862 ### Running with the PyTorch Workflow +```{eval-rst} +.. include:: ../../_includes/note_sections.rst + :start-after: .. start-note-config-flag-alias + :end-before: .. end-note-config-flag-alias +``` + To benchmark the PyTorch backend (`tensorrt_llm._torch`), use the following command with [dataset](#preparing-a-dataset) generated from previous steps. With the PyTorch flow, you will not need to run `trtllm-bench build`; the `throughput` benchmark initializes the backend by tuning against the dataset provided via `--dataset` (or the other build mode settings described [above](#other-build-modes)). Note that CUDA graph is enabled by default. You can add additional pytorch config with -`--extra_llm_api_options` followed by the path to a YAML file. For more details, please refer to the +`--config` followed by the path to a YAML file. For more details, please refer to the help text by running the command with `--help`. ```{tip} @@ -511,7 +517,7 @@ The generated dataset will include LoRA request metadata. 
Below is an example of **LoRA Configuration** -Create an `extra-llm-api-options.yaml` file with LoRA configuration: +Create a `config.yaml` file with LoRA configuration: ```yaml lora_config: @@ -535,7 +541,7 @@ lora_config: trtllm-bench --model /path/to/base/model \ throughput \ --dataset synthetic_lora_data.json \ - --extra_llm_api_options extra-llm-api-options.yaml + --config config.yaml ``` ```{note} diff --git a/docs/source/torch/auto_deploy/advanced/benchmarking_with_trtllm_bench.md b/docs/source/torch/auto_deploy/advanced/benchmarking_with_trtllm_bench.md index 43e2a1a46e..2f37c716cf 100644 --- a/docs/source/torch/auto_deploy/advanced/benchmarking_with_trtllm_bench.md +++ b/docs/source/torch/auto_deploy/advanced/benchmarking_with_trtllm_bench.md @@ -24,7 +24,7 @@ As in the PyTorch workflow, AutoDeploy does not require a separate `trtllm-bench ## Advanced Configuration -For more granular control over AutoDeploy's behavior during benchmarking, use the `--extra_llm_api_options` flag with a YAML configuration file: +For more granular control over AutoDeploy's behavior during benchmarking, use the `--config` flag with a YAML configuration file: ```bash trtllm-bench \ @@ -32,7 +32,7 @@ trtllm-bench \ throughput \ --dataset /tmp/synthetic_128_128.txt \ --backend _autodeploy \ - --extra_llm_api_options autodeploy_config.yaml + --config autodeploy_config.yaml ``` ### Configuration Examples diff --git a/docs/source/torch/auto_deploy/advanced/serving_with_trtllm_serve.md b/docs/source/torch/auto_deploy/advanced/serving_with_trtllm_serve.md index 6e52fe4ea4..20693f6170 100644 --- a/docs/source/torch/auto_deploy/advanced/serving_with_trtllm_serve.md +++ b/docs/source/torch/auto_deploy/advanced/serving_with_trtllm_serve.md @@ -30,13 +30,13 @@ curl -s http://localhost:8000/v1/chat/completions \ ## Configuration via YAML -Use `--extra_llm_api_options` to supply a YAML file that augments or overrides server/runtime settings. +Use `--config` to supply a YAML file that augments or overrides server/runtime settings. ```bash trtllm-serve \ meta-llama/Llama-3.1-8B \ --backend _autodeploy \ - --extra_llm_api_options autodeploy_config.yaml + --config autodeploy_config.yaml ``` Example `autodeploy_config.yaml`: diff --git a/docs/source/torch/features/lora.md b/docs/source/torch/features/lora.md index d00a27d49a..ccf7561efb 100644 --- a/docs/source/torch/features/lora.md +++ b/docs/source/torch/features/lora.md @@ -157,7 +157,7 @@ llm = LLM( ### YAML Configuration -Create an `extra_llm_api_options.yaml` file: +Create a `config.yaml` file: ```yaml lora_config: @@ -170,7 +170,7 @@ lora_config: ```bash python -m tensorrt_llm.commands.serve /path/to/model \ - --extra_llm_api_options extra_llm_api_options.yaml + --config config.yaml ``` ### Client Usage @@ -198,7 +198,7 @@ response = client.completions.create( ### YAML Configuration -Create an `extra_llm_api_options.yaml` file: +Create a `config.yaml` file: ```yaml lora_config: @@ -220,5 +220,5 @@ lora_config: ### Run trtllm-bench ```bash -trtllm-bench --model $model_path throughput --dataset $dataset_path --extra_llm_api_options extra-llm-api-options.yaml --num_requests 64 --concurrency 16 +trtllm-bench --model $model_path throughput --dataset $dataset_path --config config.yaml --num_requests 64 --concurrency 16 ``` diff --git a/examples/__init__.py b/examples/__init__.py new file mode 100644 index 0000000000..3159bfe656 --- /dev/null +++ b/examples/__init__.py @@ -0,0 +1,14 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. 
All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/examples/auto_deploy/.gitignore b/examples/auto_deploy/.gitignore index f15c233aee..0999a4ed76 100644 --- a/examples/auto_deploy/.gitignore +++ b/examples/auto_deploy/.gitignore @@ -6,3 +6,4 @@ benchmark_results.json *.yaml !nano_v3.yaml !nemotron_flash.yaml +!model_registry/configs/*.yaml diff --git a/examples/auto_deploy/model_registry/README.md b/examples/auto_deploy/model_registry/README.md new file mode 100644 index 0000000000..0c5756fca9 --- /dev/null +++ b/examples/auto_deploy/model_registry/README.md @@ -0,0 +1,160 @@ +# AutoDeploy Model Registry + +The AutoDeploy model registry provides a comprehensive, maintainable list of supported models for testing and coverage tracking. + +## Format + +**Version: 2.0** (Flat format with composable configurations) + +### Structure + +```yaml +version: '2.0' +description: AutoDeploy Model Registry - Flat format with composable configs +models: +- name: meta-llama/Llama-3.1-8B-Instruct + yaml_extra: [dashboard_default.yaml, world_size_2.yaml] + +- name: meta-llama/Llama-3.3-70B-Instruct + yaml_extra: [dashboard_default.yaml, world_size_4.yaml, llama-3.3-70b.yaml] + +- name: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B + yaml_extra: [dashboard_default.yaml, world_size_2.yaml, demollm_triton.yaml] +``` + +### Key Concepts + +- **Flat list**: Models are in a single flat list (not grouped) +- **Composable configs**: Each model references YAML config files via `yaml_extra` +- **Deep merging**: Config files are merged in order (later files override earlier ones) +- **No inline args**: All configuration is in YAML files for reusability + +## Configuration Files + +Config files are stored in `configs/` subdirectory and define runtime parameters: + +### Core Configs + +| File | Purpose | Example Use | +|------|---------|-------------| +| `dashboard_default.yaml` | Baseline settings for all models | Always first in yaml_extra | +| `world_size_N.yaml` | GPU count (1, 2, 4, 8) | Defines tensor_parallel_size | + +### Runtime Configs + +| File | Purpose | +|------|---------| +| `multimodal.yaml` | Vision + text models | +| `demollm_triton.yaml` | DemoLLM runtime with Triton backend | +| `simple_shard_only.yaml` | Large models requiring simple sharding + +### Model-Specific Configs + +| File | Purpose | +|------|---------| +| `llama-3.3-70b.yaml` | Optimized settings for Llama 3.3 70B | +| `nano_v3.yaml` | Settings for Nemotron Nano V3 | +| `llama-4-scout.yaml` | Settings for Llama 4 Scout | +| `openelm.yaml` | Apple OpenELM (custom tokenizer) | +| `gemma3_1b.yaml` | Gemma 3 1B (sequence length) | +| `deepseek_v3_lite.yaml` | DeepSeek V3/R1 (reduced layers) | +| `llama4_maverick_lite.yaml` | Llama 4 Maverick (reduced layers) | + +## Adding a New Model + +### Simple Model (Standard Config) + +```yaml +- name: organization/my-new-model-7b + yaml_extra: [dashboard_default.yaml, world_size_2.yaml] +``` + +### Model with Special Requirements + +```yaml 
+- name: organization/my-multimodal-model + yaml_extra: [dashboard_default.yaml, world_size_4.yaml, multimodal.yaml] +``` + +### Model with Custom Config + +1. Create `configs/my_model.yaml`: + +```yaml +# Custom settings for my model +max_batch_size: 2048 +kv_cache_free_gpu_memory_fraction: 0.95 +cuda_graph_config: + enable_padding: true +``` + +2. Reference it in `models.yaml`: + +```yaml +- name: organization/my-custom-model + yaml_extra: [dashboard_default.yaml, world_size_8.yaml, my_model.yaml] +``` + +## Config Merging + +Configs are merged in order. Example: + +```yaml +yaml_extra: + - dashboard_default.yaml # baseline: runtime=trtllm, benchmark_enabled=true + - world_size_2.yaml # adds: tensor_parallel_size=2 + - openelm.yaml # overrides: tokenizer=llama-2, benchmark_enabled=false +``` + +**Result**: `runtime=trtllm, tensor_parallel_size=2, tokenizer=llama-2, benchmark_enabled=false` + +## World Size Guidelines + +| World Size | Model Size Range | Example Models | +|------------|------------------|----------------| +| 1 | \< 2B params | TinyLlama, Qwen 0.5B, Phi-4-mini | +| 2 | 2-15B params | Llama 3.1 8B, Qwen 7B, Mistral 7B | +| 4 | 20-80B params | Llama 3.3 70B, QwQ 32B, Gemma 27B | +| 8 | 80B+ params | DeepSeek V3, Llama 405B, Nemotron Ultra | + +## Model Coverage + +The registry contains models distributed across different GPU configurations (world sizes 1, 2, 4, and 8), including both text-only and multimodal models. + +**To verify current model counts and coverage:** + +```bash +cd /path/to/autodeploy-dashboard +python3 scripts/prepare_model_coverage_v2.py \ + --source local \ + --local-path /path/to/TensorRT-LLM \ + --output /tmp/model_coverage.yaml + +# View summary +grep -E "^- name:|yaml_extra:" /path/to/TensorRT-LLM/examples/auto_deploy/model_registry/models.yaml | wc -l +``` + +When adding or removing models, use `prepare_model_coverage_v2.py` to validate the registry structure and coverage. + +## Best Practices + +1. **Always include `dashboard_default.yaml` first** - it provides baseline settings +1. **Always include a `world_size_N.yaml`** - defines GPU count +1. **Add special configs after world_size** - they override defaults +1. **Create reusable configs** - if 3+ models need same settings, make a config file +1. **Use model-specific configs sparingly** - only for unique requirements +1. 
**Test before committing** - verify with `prepare_model_coverage_v2.py` + +## Testing Changes + +```bash +# Generate workload from local changes +cd /path/to/autodeploy-dashboard +python3 scripts/prepare_model_coverage_v2.py \ + --source local \ + --local-path /path/to/TensorRT-LLM \ + --output /tmp/test_workload.yaml + +# Verify output +cat /tmp/test_workload.yaml +``` diff --git a/examples/auto_deploy/model_registry/configs/dashboard_default.yaml b/examples/auto_deploy/model_registry/configs/dashboard_default.yaml new file mode 100644 index 0000000000..6d22bc2a43 --- /dev/null +++ b/examples/auto_deploy/model_registry/configs/dashboard_default.yaml @@ -0,0 +1,9 @@ +# Default configuration for all AutoDeploy dashboard tests +# These are baseline settings that apply to all models unless overridden + +runtime: trtllm +attn_backend: flashinfer +compile_backend: torch-compile +model_factory: AutoModelForCausalLM +skip_loading_weights: false +max_seq_len: 512 diff --git a/examples/auto_deploy/model_registry/configs/deepseek_v3_lite.yaml b/examples/auto_deploy/model_registry/configs/deepseek_v3_lite.yaml new file mode 100644 index 0000000000..8475097ba2 --- /dev/null +++ b/examples/auto_deploy/model_registry/configs/deepseek_v3_lite.yaml @@ -0,0 +1,4 @@ +# Configuration for DeepSeek V3 and R1 with reduced layers +# Full models are too large, so we test with limited layers +model_kwargs: + num_hidden_layers: 10 diff --git a/examples/auto_deploy/model_registry/configs/demollm_triton.yaml b/examples/auto_deploy/model_registry/configs/demollm_triton.yaml new file mode 100644 index 0000000000..6f0d9a7326 --- /dev/null +++ b/examples/auto_deploy/model_registry/configs/demollm_triton.yaml @@ -0,0 +1,4 @@ +# Configuration for DemoLLM runtime with Triton backend +# Used for experimental or specific model requirements +runtime: demollm +attn_backend: triton diff --git a/examples/auto_deploy/model_registry/configs/gemma3_1b.yaml b/examples/auto_deploy/model_registry/configs/gemma3_1b.yaml new file mode 100644 index 0000000000..d076697e8a --- /dev/null +++ b/examples/auto_deploy/model_registry/configs/gemma3_1b.yaml @@ -0,0 +1,3 @@ +# Configuration for Gemma 3 1B model +# Specific sequence length requirement due to small attention window +max_seq_len: 511 diff --git a/examples/auto_deploy/model_registry/configs/llama3_3_70b.yaml b/examples/auto_deploy/model_registry/configs/llama3_3_70b.yaml new file mode 100644 index 0000000000..828800c93b --- /dev/null +++ b/examples/auto_deploy/model_registry/configs/llama3_3_70b.yaml @@ -0,0 +1,10 @@ +# Configuration for Llama 3.3 70B +# AutoDeploy-specific settings for large Llama models + +max_batch_size: 1024 +max_num_tokens: 2048 +free_mem_ratio: 0.9 +trust_remote_code: true +cuda_graph_batch_sizes: [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 768, 1024] +kv_cache_config: + dtype: fp8 diff --git a/examples/auto_deploy/model_registry/configs/llama4_maverick_lite.yaml b/examples/auto_deploy/model_registry/configs/llama4_maverick_lite.yaml new file mode 100644 index 0000000000..24372fa5cd --- /dev/null +++ b/examples/auto_deploy/model_registry/configs/llama4_maverick_lite.yaml @@ -0,0 +1,5 @@ +# Configuration for Llama 4 Maverick with reduced layers +# Full model is too large for testing +model_kwargs: + text_config: + num_hidden_layers: 5 diff --git a/examples/auto_deploy/model_registry/configs/llama4_scout.yaml b/examples/auto_deploy/model_registry/configs/llama4_scout.yaml new file mode 100644 index 0000000000..25b5c98971 --- /dev/null +++ 
b/examples/auto_deploy/model_registry/configs/llama4_scout.yaml @@ -0,0 +1,10 @@ +# Configuration for Llama 4 Scout (VLM) +# AutoDeploy-specific settings for Llama 4 Scout MoE vision model + +max_batch_size: 1024 +max_num_tokens: 2048 +free_mem_ratio: 0.9 +trust_remote_code: true +cuda_graph_batch_sizes: [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 768, 1024] +kv_cache_config: + dtype: fp8 diff --git a/examples/auto_deploy/model_registry/configs/multimodal.yaml b/examples/auto_deploy/model_registry/configs/multimodal.yaml new file mode 100644 index 0000000000..0220389c92 --- /dev/null +++ b/examples/auto_deploy/model_registry/configs/multimodal.yaml @@ -0,0 +1,2 @@ +# Configuration for multimodal (vision + text) models +model_factory: AutoModelForImageTextToText diff --git a/examples/auto_deploy/model_registry/configs/openelm.yaml b/examples/auto_deploy/model_registry/configs/openelm.yaml new file mode 100644 index 0000000000..848b125465 --- /dev/null +++ b/examples/auto_deploy/model_registry/configs/openelm.yaml @@ -0,0 +1,3 @@ +# Configuration for Apple OpenELM models +# These models require Llama-2 tokenizer +tokenizer: meta-llama/Llama-2-7b-hf diff --git a/examples/auto_deploy/model_registry/configs/simple_shard_only.yaml b/examples/auto_deploy/model_registry/configs/simple_shard_only.yaml new file mode 100644 index 0000000000..518cfcb219 --- /dev/null +++ b/examples/auto_deploy/model_registry/configs/simple_shard_only.yaml @@ -0,0 +1,5 @@ +# Configuration for models that require simple sharding only +# Used for very large models with specific sharding requirements +transforms: + detect_sharding: + simple_shard_only: true diff --git a/examples/auto_deploy/model_registry/configs/world_size_1.yaml b/examples/auto_deploy/model_registry/configs/world_size_1.yaml new file mode 100644 index 0000000000..266ced60fc --- /dev/null +++ b/examples/auto_deploy/model_registry/configs/world_size_1.yaml @@ -0,0 +1,2 @@ +# Configuration for single GPU models +world_size: 1 diff --git a/examples/auto_deploy/model_registry/configs/world_size_2.yaml b/examples/auto_deploy/model_registry/configs/world_size_2.yaml new file mode 100644 index 0000000000..ba7a36dda3 --- /dev/null +++ b/examples/auto_deploy/model_registry/configs/world_size_2.yaml @@ -0,0 +1,2 @@ +# Configuration for 2 GPU models +world_size: 2 diff --git a/examples/auto_deploy/model_registry/configs/world_size_4.yaml b/examples/auto_deploy/model_registry/configs/world_size_4.yaml new file mode 100644 index 0000000000..1a6da8c44e --- /dev/null +++ b/examples/auto_deploy/model_registry/configs/world_size_4.yaml @@ -0,0 +1,2 @@ +# Configuration for 4 GPU models +world_size: 4 diff --git a/examples/auto_deploy/model_registry/configs/world_size_8.yaml b/examples/auto_deploy/model_registry/configs/world_size_8.yaml new file mode 100644 index 0000000000..d978b0bcd4 --- /dev/null +++ b/examples/auto_deploy/model_registry/configs/world_size_8.yaml @@ -0,0 +1,2 @@ +# Configuration for 8 GPU models +world_size: 8 diff --git a/examples/auto_deploy/model_registry/models.yaml b/examples/auto_deploy/model_registry/models.yaml new file mode 100644 index 0000000000..1ec27706db --- /dev/null +++ b/examples/auto_deploy/model_registry/models.yaml @@ -0,0 +1,248 @@ +version: '2.0' +description: AutoDeploy Model Registry - Flat format with composable configs +models: +- name: TinyLlama/TinyLlama-1.1B-Chat-v1.0 + yaml_extra: ['dashboard_default.yaml', 'world_size_1.yaml'] +- name: Qwen/Qwen2.5-0.5B-Instruct + yaml_extra: ['dashboard_default.yaml', 'world_size_1.yaml'] +- 
name: Qwen/Qwen3-0.6B + yaml_extra: ['dashboard_default.yaml', 'world_size_1.yaml'] +# DISABLED: TorchDynamo compilation error - fake tensor dispatch failure +# - name: apple/OpenELM-270M-Instruct +# yaml_extra: ['dashboard_default.yaml', 'world_size_1.yaml', 'openelm.yaml'] +# DISABLED: TorchDynamo compilation error - fake tensor dispatch failure +# - name: apple/OpenELM-1_1B-Instruct +# yaml_extra: ['dashboard_default.yaml', 'world_size_1.yaml', 'openelm.yaml'] +# DISABLED: TorchDynamo compilation error - fake tensor dispatch failure +# - name: apple/OpenELM-3B-Instruct +# yaml_extra: ['dashboard_default.yaml', 'world_size_1.yaml', 'openelm.yaml'] +- name: microsoft/Phi-4-mini-instruct + yaml_extra: ['dashboard_default.yaml', 'world_size_1.yaml'] +- name: microsoft/Phi-4-mini-reasoning + yaml_extra: ['dashboard_default.yaml', 'world_size_1.yaml'] +- name: google/gemma-3-1b-it + yaml_extra: ['dashboard_default.yaml', 'world_size_1.yaml', 'gemma3_1b.yaml'] +# DISABLED: SLURM cluster cancellation - infrastructure issue +# - name: meta-llama/Llama-3.1-8B-Instruct +# yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml'] +# DISABLED: SLURM cluster cancellation - infrastructure issue +# - name: casperhansen/llama-3-8b-instruct-awq +# yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml'] +# DISABLED: SLURM cluster cancellation - infrastructure issue +# - name: meta-llama/Llama-3.2-1B-Instruct +# yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml'] +# DISABLED: SLURM cluster cancellation - infrastructure issue +# - name: meta-llama/Llama-3.2-3B-Instruct +# yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml'] +- name: Qwen/Qwen2.5-1.5B-Instruct + yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml'] +- name: Qwen/Qwen2.5-3B-Instruct + yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml'] +# DISABLED: SLURM cluster cancellation - infrastructure issue +# - name: Qwen/Qwen2.5-7B-Instruct +# yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml'] +- name: Qwen/Qwen2.5-7B-Instruct-AWQ + yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml'] +- name: Qwen/Qwen3-4B + yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml'] +- name: Qwen/Qwen3-8B + yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml'] +- name: microsoft/phi-4 + yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml'] +- name: microsoft/Phi-4-reasoning + yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml'] +- name: microsoft/Phi-4-reasoning-plus + yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml'] +- name: google/gemma-1.1-7b-it + yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml'] +# DISABLED: SLURM cluster cancellation - infrastructure issue +# - name: google/gemma-2-2b-it +# yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml'] +# DISABLED: SLURM cluster cancellation - infrastructure issue +# - name: google/gemma-2-9b-it +# yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml'] +- name: google/codegemma-7b-it + yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml'] +# DISABLED: SLURM cluster cancellation - infrastructure issue +# - name: mistralai/Mistral-7B-Instruct-v0.2 +# yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml'] +# DISABLED: SLURM cluster cancellation - infrastructure issue +# - name: mistralai/Mistral-7B-Instruct-v0.3 +# yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml'] +- name: TheBloke/Mistral-7B-Instruct-v0.2-GPTQ + yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml'] +- name: 
bigcode/starcoder2-7b + yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml'] +- name: bigcode/starcoder2-15b-instruct-v0.1 + yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml'] +# DISABLED: SLURM cluster cancellation - infrastructure issue +# - name: deepseek-ai/DeepSeek-Prover-V1.5-SFT +# yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml'] +# DISABLED: SLURM cluster cancellation - infrastructure issue +# - name: deepseek-ai/DeepSeek-Prover-V2-7B +# yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml'] +# DISABLED: SLURM cluster cancellation - infrastructure issue +# - name: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +# yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml'] +- name: ibm-granite/granite-3.1-2b-instruct + yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml'] +# DISABLED: SLURM cluster cancellation - infrastructure issue +# - name: ibm-granite/granite-3.1-8b-instruct +# yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml'] +- name: ibm-granite/granite-3.3-2b-instruct + yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml'] +- name: ibm-granite/granite-3.3-8b-instruct + yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml'] +- name: ibm-granite/granite-guardian-3.1-2b + yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml'] +- name: ibm-granite/granite-guardian-3.2-5b + yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml'] +- name: meta-llama/CodeLlama-7b-Instruct-hf + yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml'] +- name: meta-llama/CodeLlama-7b-Python-hf + yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml'] +# DISABLED: SLURM cluster cancellation - infrastructure issue +# - name: meta-llama/Llama-2-7b-chat-hf +# yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml'] +# DISABLED: FakeTensorMode error in unified_attn export +# - name: nvidia/Llama-3.1-8B-Instruct-FP8 +# yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml'] +# DISABLED: SLURM cluster cancellation - infrastructure issue +# - name: nvidia/Llama-3.1-Minitron-4B-Depth-Base +# yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml'] +# DISABLED: SLURM cluster cancellation - infrastructure issue +# - name: nvidia/Llama-3.1-Minitron-4B-Width-Base +# yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml'] +- name: nvidia/Llama-3.1-Nemotron-Nano-8B-v1 + yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml'] +- name: nvidia/Mistral-NeMo-Minitron-8B-Base + yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml'] +- name: openai/gpt-oss-20b + yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml'] +# DISABLED: Custom op error - append_paged_kv_cache missing Float kernel +# - name: bigcode/starcoder2-15b +# yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml'] +# DISABLED: SLURM cluster cancellation - infrastructure issue +# - name: ibm-granite/granite-3.0-8b-instruct +# yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml'] +# DISABLED: SLURM cluster cancellation - infrastructure issue +# - name: mistralai/Ministral-8B-Instruct-2410 +# yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml'] +- name: nvidia/NVIDIA-Nemotron-Nano-9B-v2-FP8 + yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml'] +- name: nvidia/NVIDIA-Nemotron-Nano-9B-v2-NVFP4 + yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml'] +- name: nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-FP8 + yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml', 'multimodal.yaml'] +- name: google/gemma-3-27b-it + yaml_extra: 
['dashboard_default.yaml', 'world_size_2.yaml', 'multimodal.yaml'] +- name: google/gemma-3-2b-it + yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml'] +- name: deepseek-ai/DeepSeek-V2.5 + yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml'] +# DISABLED: Network timeout downloading from Hugging Face +# - name: ai21labs/AI21-Jamba-1.5-Mini +# yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml'] +# DISABLED: SLURM cluster cancellation - infrastructure issue +# - name: THUDM/glm-4v-9b +# yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml', 'multimodal.yaml'] +# DISABLED: SLURM cluster cancellation - infrastructure issue +# - name: meta-llama/Llama-3.2-11B-Vision-Instruct +# yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml', 'multimodal.yaml'] +# DISABLED: Auto-deploy compilation error +# - name: meta-llama/Llama-3.3-70B-Instruct +# yaml_extra: ['dashboard_default.yaml', 'world_size_4.yaml', 'llama3_3_70b.yaml'] +- name: meta-llama/CodeLlama-34b-Instruct-hf + yaml_extra: ['dashboard_default.yaml', 'world_size_4.yaml'] +- name: meta-llama/Llama-2-13b-chat-hf + yaml_extra: ['dashboard_default.yaml', 'world_size_4.yaml'] +- name: microsoft/Phi-3-medium-128k-instruct + yaml_extra: ['dashboard_default.yaml', 'world_size_4.yaml'] +- name: microsoft/Phi-3-medium-4k-instruct + yaml_extra: ['dashboard_default.yaml', 'world_size_4.yaml'] +# DISABLED: SLURM cluster cancellation - infrastructure issue +# - name: mistralai/Codestral-22B-v0.1 +# yaml_extra: ['dashboard_default.yaml', 'world_size_4.yaml'] +# DISABLED: Graph transformation error in auto-deploy +# - name: neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8 +# yaml_extra: ['dashboard_default.yaml', 'world_size_4.yaml'] +- name: TheBloke/falcon-40b-instruct-GPTQ + yaml_extra: ['dashboard_default.yaml', 'world_size_4.yaml'] +- name: Qwen/QwQ-32B + yaml_extra: ['dashboard_default.yaml', 'world_size_4.yaml'] +# DISABLED: SLURM cluster cancellation - infrastructure issue +# - name: google/gemma-2-27b-it +# yaml_extra: ['dashboard_default.yaml', 'world_size_4.yaml'] +- name: perplexity-ai/r1-1776-distill-llama-70b + yaml_extra: ['dashboard_default.yaml', 'world_size_4.yaml'] +- name: nvidia/NVIDIA-Nemotron-Nano-31B-A3-v3 + yaml_extra: ['dashboard_default.yaml', 'world_size_4.yaml', 'nano_v3.yaml'] +- name: nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-FP8 + yaml_extra: ['dashboard_default.yaml', 'world_size_4.yaml'] +- name: Qwen/QwQ-32B-Preview + yaml_extra: ['dashboard_default.yaml', 'world_size_4.yaml'] +- name: Qwen/Qwen3-Coder-32B-Instruct + yaml_extra: ['dashboard_default.yaml', 'world_size_4.yaml'] +- name: Qwen/Qwen3-235B-A22B-Instruct-2507 + yaml_extra: ['dashboard_default.yaml', 'world_size_4.yaml'] +# DISABLED: Network timeout downloading from Hugging Face +# - name: ai21labs/AI21-Jamba-1.5-Large +# yaml_extra: ['dashboard_default.yaml', 'world_size_4.yaml'] +- name: nvidia/OpenReasoning-Nemotron-32B + yaml_extra: ['dashboard_default.yaml', 'world_size_4.yaml'] +# DISABLED: Auto-deploy compilation error +# - name: mistralai/Mistral-Large-Instruct-v2.1 +# yaml_extra: ['dashboard_default.yaml', 'world_size_4.yaml'] +# DISABLED: SLURM cluster cancellation - infrastructure issue +# - name: deepseek-ai/DeepSeek-R1-Distill-Llama-70B +# yaml_extra: ['dashboard_default.yaml', 'world_size_8.yaml'] +# DISABLED: Auto-deploy compilation error +# - name: deepseek-ai/DeepSeek-R1-Distill-Qwen-32B +# yaml_extra: ['dashboard_default.yaml', 'world_size_8.yaml'] +# DISABLED: Graph transformation error in auto-deploy +# - name: 
mistralai/Mixtral-8x22B-Instruct-v0.1 +# yaml_extra: ['dashboard_default.yaml', 'world_size_8.yaml'] +# DISABLED: FakeTensorMode error in unified_attn export +# - name: nvidia/Llama-3.1-70B-Instruct-FP8 +# yaml_extra: ['dashboard_default.yaml', 'world_size_8.yaml'] +# DISABLED: FakeTensorMode error in unified_attn export +# - name: nvidia/Llama-3.1-405B-Instruct-FP8 +# yaml_extra: ['dashboard_default.yaml', 'world_size_8.yaml'] +- name: nvidia/Llama-3.1-Nemotron-70B-Instruct-HF + yaml_extra: ['dashboard_default.yaml', 'world_size_8.yaml'] +# DISABLED: Model loading failure - dynamic module registry issue +# - name: nvidia/Llama-3_1-Nemotron-51B-Instruct +# yaml_extra: ['dashboard_default.yaml', 'world_size_8.yaml', 'simple_shard_only.yaml'] +- name: nvidia/Llama-3_1-Nemotron-Ultra-253B-v1 + yaml_extra: ['dashboard_default.yaml', 'world_size_8.yaml', 'simple_shard_only.yaml'] +- name: nvidia/Llama-3_1-Nemotron-Ultra-253B-v1-FP8 + yaml_extra: ['dashboard_default.yaml', 'world_size_8.yaml', 'simple_shard_only.yaml'] +- name: nvidia/Llama-3_3-Nemotron-Super-49B-v1 + yaml_extra: ['dashboard_default.yaml', 'world_size_8.yaml', 'simple_shard_only.yaml'] +- name: Qwen/Qwen3-30B-A3B + yaml_extra: ['dashboard_default.yaml', 'world_size_8.yaml', 'simple_shard_only.yaml'] +- name: Qwen/Qwen3-235B-A22B + yaml_extra: ['dashboard_default.yaml', 'world_size_8.yaml', 'simple_shard_only.yaml'] +- name: deepseek-ai/DeepSeek-R1 + yaml_extra: ['dashboard_default.yaml', 'world_size_8.yaml', 'deepseek_v3_lite.yaml'] +# DISABLED: Auto-deploy compilation error +# - name: deepseek-ai/DeepSeek-V3 +# yaml_extra: ['dashboard_default.yaml', 'world_size_8.yaml', 'deepseek_v3_lite.yaml'] +# DISABLED: Assertion failure in auto-deploy transform pipeline +# - name: deepseek-ai/DeepSeek-Coder-V2-Instruct +# yaml_extra: ['dashboard_default.yaml', 'world_size_8.yaml'] +- name: Qwen/Qwen3-VL-8B-Instruct + yaml_extra: ['dashboard_default.yaml', 'world_size_8.yaml'] +- name: Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int4 + yaml_extra: ['dashboard_default.yaml', 'world_size_8.yaml', 'multimodal.yaml'] +# DISABLED: SLURM cluster cancellation - infrastructure issue +# - name: codellama/CodeLlama-70b-Instruct-hf +# yaml_extra: ['dashboard_default.yaml', 'world_size_8.yaml'] +- name: meta-llama/Llama-3.2-90B-Vision-Instruct + yaml_extra: ['dashboard_default.yaml', 'world_size_8.yaml', 'multimodal.yaml'] +- name: openai/gpt-oss-120b + yaml_extra: ['dashboard_default.yaml', 'world_size_8.yaml'] +- name: meta-llama/Llama-4-Scout-17B-16E-Instruct + yaml_extra: ['dashboard_default.yaml', 'world_size_8.yaml', 'multimodal.yaml', 'llama4_scout.yaml'] +- name: meta-llama/Llama-4-Maverick-17B-128E-Instruct + yaml_extra: ['dashboard_default.yaml', 'world_size_8.yaml', 'multimodal.yaml', 'llama4_maverick_lite.yaml'] diff --git a/examples/configs/README.md b/examples/configs/README.md index b9a47281d2..dc633c8b2c 100644 --- a/examples/configs/README.md +++ b/examples/configs/README.md @@ -1,5 +1,5 @@ # Recommended LLM API Configuration Settings -This directory contains recommended [LLM API](https://nvidia.github.io/TensorRT-LLM/llm-api/) performance settings for popular models. They can be used out-of-the-box with `trtllm-serve` via the `--extra_llm_api_options` CLI flag, or you can adjust them to your specific use case. +This directory contains recommended [LLM API](https://nvidia.github.io/TensorRT-LLM/llm-api/) performance settings for popular models. 
They can be used out-of-the-box with `trtllm-serve` via the `--config` CLI flag, or you can adjust them to your specific use case. For model-specific deployment guides, please refer to the [official documentation](https://nvidia.github.io/TensorRT-LLM/deployment-guide/index.html). diff --git a/examples/configs/__init__.py b/examples/configs/__init__.py new file mode 100644 index 0000000000..3159bfe656 --- /dev/null +++ b/examples/configs/__init__.py @@ -0,0 +1,14 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/examples/configs/curated/qwen3-next.yaml b/examples/configs/curated/qwen3-next.yaml index b78921a6c2..b9aa4f1b63 100644 --- a/examples/configs/curated/qwen3-next.yaml +++ b/examples/configs/curated/qwen3-next.yaml @@ -13,4 +13,4 @@ stream_interval: 20 num_postprocess_workers: 4 kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.6 + free_gpu_memory_fraction: 0.9 diff --git a/examples/configs/database/__init__.py b/examples/configs/database/__init__.py new file mode 100644 index 0000000000..3159bfe656 --- /dev/null +++ b/examples/configs/database/__init__.py @@ -0,0 +1,14 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
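The model registry README introduced above states that the files listed under `yaml_extra` are deep-merged in order, with later files overriding earlier ones. The sketch below only illustrates that merge semantics; it is not the dashboard tooling's actual implementation, and the `deep_merge` and `resolve_model_config` helpers are hypothetical names used for this example.

```python
# Minimal sketch of the deep-merge semantics described in the model registry
# README (illustrative only, not the dashboard tooling's implementation):
# configs listed in yaml_extra are merged in order, later files win.
from pathlib import Path

import yaml


def deep_merge(base: dict, override: dict) -> dict:
    merged = dict(base)
    for key, value in override.items():
        if isinstance(value, dict) and isinstance(merged.get(key), dict):
            merged[key] = deep_merge(merged[key], value)  # recurse into nested dicts
        else:
            merged[key] = value  # later file overrides earlier one
    return merged


def resolve_model_config(config_dir: Path, yaml_extra: list[str]) -> dict:
    config: dict = {}
    for name in yaml_extra:
        loaded = yaml.safe_load((config_dir / name).read_text()) or {}
        config = deep_merge(config, loaded)
    return config


# Example (hypothetical paths):
# resolve_model_config(Path("examples/auto_deploy/model_registry/configs"),
#                      ["dashboard_default.yaml", "world_size_2.yaml", "openelm.yaml"])
# -> each key reflects the last file in the list that sets it.
```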
diff --git a/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc16.yaml b/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc16.yaml index f770a6566e..1b4d4b6aaf 100644 --- a/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc16.yaml +++ b/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc16.yaml @@ -6,7 +6,6 @@ print_iter_log: true kv_cache_config: dtype: fp8 free_gpu_memory_fraction: 0.8 - enable_block_reuse: false stream_interval: 10 moe_config: backend: DEEPGEMM diff --git a/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc32.yaml b/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc32.yaml index f770a6566e..1b4d4b6aaf 100644 --- a/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc32.yaml +++ b/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc32.yaml @@ -6,7 +6,6 @@ print_iter_log: true kv_cache_config: dtype: fp8 free_gpu_memory_fraction: 0.8 - enable_block_reuse: false stream_interval: 10 moe_config: backend: DEEPGEMM diff --git a/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc4.yaml b/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc4.yaml index f770a6566e..1b4d4b6aaf 100644 --- a/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc4.yaml +++ b/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc4.yaml @@ -6,7 +6,6 @@ print_iter_log: true kv_cache_config: dtype: fp8 free_gpu_memory_fraction: 0.8 - enable_block_reuse: false stream_interval: 10 moe_config: backend: DEEPGEMM diff --git a/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc64.yaml b/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc64.yaml index f770a6566e..1b4d4b6aaf 100644 --- a/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc64.yaml +++ b/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc64.yaml @@ -6,7 +6,6 @@ print_iter_log: true kv_cache_config: dtype: fp8 free_gpu_memory_fraction: 0.8 - enable_block_reuse: false stream_interval: 10 moe_config: backend: DEEPGEMM diff --git a/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc8.yaml b/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc8.yaml index f770a6566e..1b4d4b6aaf 100644 --- a/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc8.yaml +++ b/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc8.yaml @@ -6,7 +6,6 @@ print_iter_log: true kv_cache_config: dtype: fp8 free_gpu_memory_fraction: 0.8 - enable_block_reuse: false stream_interval: 10 moe_config: backend: DEEPGEMM diff --git a/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc16.yaml b/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc16.yaml index 6660bcea96..09ee0c6020 100644 --- a/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc16.yaml +++ b/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc16.yaml @@ -6,7 +6,6 @@ print_iter_log: true kv_cache_config: dtype: fp8 free_gpu_memory_fraction: 0.8 - enable_block_reuse: false stream_interval: 10 moe_config: backend: DEEPGEMM diff --git a/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc32.yaml b/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc32.yaml index 
6660bcea96..09ee0c6020 100644 --- a/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc32.yaml +++ b/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc32.yaml @@ -6,7 +6,6 @@ print_iter_log: true kv_cache_config: dtype: fp8 free_gpu_memory_fraction: 0.8 - enable_block_reuse: false stream_interval: 10 moe_config: backend: DEEPGEMM diff --git a/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc4.yaml b/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc4.yaml index 6660bcea96..09ee0c6020 100644 --- a/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc4.yaml +++ b/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc4.yaml @@ -6,7 +6,6 @@ print_iter_log: true kv_cache_config: dtype: fp8 free_gpu_memory_fraction: 0.8 - enable_block_reuse: false stream_interval: 10 moe_config: backend: DEEPGEMM diff --git a/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc64.yaml b/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc64.yaml index 919a028409..5fba0289d3 100644 --- a/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc64.yaml +++ b/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc64.yaml @@ -6,7 +6,6 @@ print_iter_log: true kv_cache_config: dtype: fp8 free_gpu_memory_fraction: 0.8 - enable_block_reuse: false stream_interval: 10 moe_config: backend: DEEPGEMM diff --git a/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc8.yaml b/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc8.yaml index 6660bcea96..09ee0c6020 100644 --- a/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc8.yaml +++ b/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc8.yaml @@ -6,7 +6,6 @@ print_iter_log: true kv_cache_config: dtype: fp8 free_gpu_memory_fraction: 0.8 - enable_block_reuse: false stream_interval: 10 moe_config: backend: DEEPGEMM diff --git a/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/1k1k_tp8_conc16.yaml b/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/1k1k_tp8_conc16.yaml index 008da1df54..9f80dfd3bc 100644 --- a/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/1k1k_tp8_conc16.yaml +++ b/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/1k1k_tp8_conc16.yaml @@ -6,7 +6,6 @@ print_iter_log: true kv_cache_config: dtype: fp8 free_gpu_memory_fraction: 0.75 - enable_block_reuse: false stream_interval: 10 moe_config: backend: CUTLASS diff --git a/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/1k1k_tp8_conc32.yaml b/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/1k1k_tp8_conc32.yaml index 008da1df54..9f80dfd3bc 100644 --- a/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/1k1k_tp8_conc32.yaml +++ b/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/1k1k_tp8_conc32.yaml @@ -6,7 +6,6 @@ print_iter_log: true kv_cache_config: dtype: fp8 free_gpu_memory_fraction: 0.75 - enable_block_reuse: false stream_interval: 10 moe_config: backend: CUTLASS diff --git a/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/1k1k_tp8_conc4.yaml b/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/1k1k_tp8_conc4.yaml index 008da1df54..9f80dfd3bc 100644 --- a/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/1k1k_tp8_conc4.yaml +++ b/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/1k1k_tp8_conc4.yaml @@ 
-6,7 +6,6 @@ print_iter_log: true kv_cache_config: dtype: fp8 free_gpu_memory_fraction: 0.75 - enable_block_reuse: false stream_interval: 10 moe_config: backend: CUTLASS diff --git a/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/1k1k_tp8_conc64.yaml b/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/1k1k_tp8_conc64.yaml index 008da1df54..9f80dfd3bc 100644 --- a/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/1k1k_tp8_conc64.yaml +++ b/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/1k1k_tp8_conc64.yaml @@ -6,7 +6,6 @@ print_iter_log: true kv_cache_config: dtype: fp8 free_gpu_memory_fraction: 0.75 - enable_block_reuse: false stream_interval: 10 moe_config: backend: CUTLASS diff --git a/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/1k1k_tp8_conc8.yaml b/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/1k1k_tp8_conc8.yaml index 008da1df54..9f80dfd3bc 100644 --- a/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/1k1k_tp8_conc8.yaml +++ b/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/1k1k_tp8_conc8.yaml @@ -6,7 +6,6 @@ print_iter_log: true kv_cache_config: dtype: fp8 free_gpu_memory_fraction: 0.75 - enable_block_reuse: false stream_interval: 10 moe_config: backend: CUTLASS diff --git a/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc16.yaml b/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc16.yaml index decbb1744a..b2ff0ba8e5 100644 --- a/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc16.yaml +++ b/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc16.yaml @@ -6,7 +6,6 @@ print_iter_log: true kv_cache_config: dtype: fp8 free_gpu_memory_fraction: 0.75 - enable_block_reuse: false stream_interval: 10 moe_config: backend: CUTLASS diff --git a/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc32.yaml b/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc32.yaml index decbb1744a..b2ff0ba8e5 100644 --- a/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc32.yaml +++ b/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc32.yaml @@ -6,7 +6,6 @@ print_iter_log: true kv_cache_config: dtype: fp8 free_gpu_memory_fraction: 0.75 - enable_block_reuse: false stream_interval: 10 moe_config: backend: CUTLASS diff --git a/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc4.yaml b/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc4.yaml index decbb1744a..b2ff0ba8e5 100644 --- a/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc4.yaml +++ b/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc4.yaml @@ -6,7 +6,6 @@ print_iter_log: true kv_cache_config: dtype: fp8 free_gpu_memory_fraction: 0.75 - enable_block_reuse: false stream_interval: 10 moe_config: backend: CUTLASS diff --git a/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc64.yaml b/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc64.yaml index 363eebf521..2b54bf087b 100644 --- a/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc64.yaml +++ b/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc64.yaml @@ -6,7 +6,6 @@ print_iter_log: true kv_cache_config: dtype: fp8 free_gpu_memory_fraction: 0.75 - enable_block_reuse: false stream_interval: 10 moe_config: backend: CUTLASS diff --git 
a/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc8.yaml b/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc8.yaml index decbb1744a..b2ff0ba8e5 100644 --- a/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc8.yaml +++ b/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc8.yaml @@ -6,7 +6,6 @@ print_iter_log: true kv_cache_config: dtype: fp8 free_gpu_memory_fraction: 0.75 - enable_block_reuse: false stream_interval: 10 moe_config: backend: CUTLASS diff --git a/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc128.yaml b/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc128.yaml index c61e3abc15..d86cb71568 100644 --- a/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc128.yaml +++ b/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc128.yaml @@ -6,7 +6,6 @@ print_iter_log: true kv_cache_config: dtype: fp8 free_gpu_memory_fraction: 0.8 - enable_block_reuse: false stream_interval: 10 moe_config: backend: TRTLLM diff --git a/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc16.yaml b/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc16.yaml index fe58a6a32b..903449e0f3 100644 --- a/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc16.yaml +++ b/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc16.yaml @@ -6,7 +6,6 @@ print_iter_log: true kv_cache_config: dtype: fp8 free_gpu_memory_fraction: 0.8 - enable_block_reuse: false stream_interval: 10 moe_config: backend: TRTLLM diff --git a/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc256.yaml b/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc256.yaml index 2a06d3978d..18fbe5eec9 100644 --- a/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc256.yaml +++ b/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc256.yaml @@ -6,7 +6,6 @@ print_iter_log: true kv_cache_config: dtype: fp8 free_gpu_memory_fraction: 0.8 - enable_block_reuse: false stream_interval: 10 moe_config: backend: CUTLASS diff --git a/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc32.yaml b/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc32.yaml index fe58a6a32b..903449e0f3 100644 --- a/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc32.yaml +++ b/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc32.yaml @@ -6,7 +6,6 @@ print_iter_log: true kv_cache_config: dtype: fp8 free_gpu_memory_fraction: 0.8 - enable_block_reuse: false stream_interval: 10 moe_config: backend: TRTLLM diff --git a/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc4.yaml b/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc4.yaml index fe58a6a32b..903449e0f3 100644 --- a/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc4.yaml +++ b/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc4.yaml @@ -6,7 +6,6 @@ print_iter_log: true kv_cache_config: dtype: fp8 free_gpu_memory_fraction: 0.8 - enable_block_reuse: false stream_interval: 10 moe_config: backend: TRTLLM diff --git a/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc64.yaml 
b/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc64.yaml index fe58a6a32b..903449e0f3 100644 --- a/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc64.yaml +++ b/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc64.yaml @@ -6,7 +6,6 @@ print_iter_log: true kv_cache_config: dtype: fp8 free_gpu_memory_fraction: 0.8 - enable_block_reuse: false stream_interval: 10 moe_config: backend: TRTLLM diff --git a/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc8.yaml b/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc8.yaml index fe58a6a32b..903449e0f3 100644 --- a/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc8.yaml +++ b/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc8.yaml @@ -6,7 +6,6 @@ print_iter_log: true kv_cache_config: dtype: fp8 free_gpu_memory_fraction: 0.8 - enable_block_reuse: false stream_interval: 10 moe_config: backend: TRTLLM diff --git a/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc128.yaml b/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc128.yaml index a4a4fe28c7..696ad9bb03 100644 --- a/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc128.yaml +++ b/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc128.yaml @@ -6,7 +6,6 @@ print_iter_log: true kv_cache_config: dtype: fp8 free_gpu_memory_fraction: 0.8 - enable_block_reuse: false stream_interval: 10 moe_config: backend: TRTLLM diff --git a/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc16.yaml b/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc16.yaml index 397565e15b..fc971bf09e 100644 --- a/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc16.yaml +++ b/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc16.yaml @@ -6,7 +6,6 @@ print_iter_log: true kv_cache_config: dtype: fp8 free_gpu_memory_fraction: 0.8 - enable_block_reuse: false stream_interval: 10 moe_config: backend: TRTLLM diff --git a/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc256.yaml b/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc256.yaml index 686db04f1f..86707e68fb 100644 --- a/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc256.yaml +++ b/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc256.yaml @@ -6,7 +6,6 @@ print_iter_log: true kv_cache_config: dtype: fp8 free_gpu_memory_fraction: 0.8 - enable_block_reuse: false stream_interval: 10 moe_config: backend: CUTLASS diff --git a/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc32.yaml b/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc32.yaml index 397565e15b..fc971bf09e 100644 --- a/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc32.yaml +++ b/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc32.yaml @@ -6,7 +6,6 @@ print_iter_log: true kv_cache_config: dtype: fp8 free_gpu_memory_fraction: 0.8 - enable_block_reuse: false stream_interval: 10 moe_config: backend: TRTLLM diff --git a/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc4.yaml b/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc4.yaml index 397565e15b..fc971bf09e 100644 --- 
a/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc4.yaml +++ b/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc4.yaml @@ -6,7 +6,6 @@ print_iter_log: true kv_cache_config: dtype: fp8 free_gpu_memory_fraction: 0.8 - enable_block_reuse: false stream_interval: 10 moe_config: backend: TRTLLM diff --git a/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc64.yaml b/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc64.yaml index 397565e15b..fc971bf09e 100644 --- a/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc64.yaml +++ b/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc64.yaml @@ -6,7 +6,6 @@ print_iter_log: true kv_cache_config: dtype: fp8 free_gpu_memory_fraction: 0.8 - enable_block_reuse: false stream_interval: 10 moe_config: backend: TRTLLM diff --git a/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc8.yaml b/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc8.yaml index 397565e15b..fc971bf09e 100644 --- a/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc8.yaml +++ b/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc8.yaml @@ -6,7 +6,6 @@ print_iter_log: true kv_cache_config: dtype: fp8 free_gpu_memory_fraction: 0.8 - enable_block_reuse: false stream_interval: 10 moe_config: backend: TRTLLM diff --git a/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc128.yaml b/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc128.yaml index ace419c0d8..6d88108502 100644 --- a/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc128.yaml +++ b/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc128.yaml @@ -6,7 +6,6 @@ print_iter_log: true kv_cache_config: dtype: fp8 free_gpu_memory_fraction: 0.8 - enable_block_reuse: false stream_interval: 10 moe_config: backend: CUTLASS diff --git a/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc16.yaml b/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc16.yaml index a0f2de5fec..9b81917cfe 100644 --- a/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc16.yaml +++ b/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc16.yaml @@ -6,7 +6,6 @@ print_iter_log: true kv_cache_config: dtype: fp8 free_gpu_memory_fraction: 0.8 - enable_block_reuse: false stream_interval: 10 moe_config: backend: TRTLLM diff --git a/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc256.yaml b/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc256.yaml index 3c812ea3e9..bff28a8fa5 100644 --- a/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc256.yaml +++ b/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc256.yaml @@ -6,7 +6,6 @@ print_iter_log: true kv_cache_config: dtype: fp8 free_gpu_memory_fraction: 0.8 - enable_block_reuse: false stream_interval: 10 moe_config: backend: CUTLASS diff --git a/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc32.yaml b/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc32.yaml index a0f2de5fec..9b81917cfe 100644 --- a/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc32.yaml +++ 
b/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc32.yaml @@ -6,7 +6,6 @@ print_iter_log: true kv_cache_config: dtype: fp8 free_gpu_memory_fraction: 0.8 - enable_block_reuse: false stream_interval: 10 moe_config: backend: TRTLLM diff --git a/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc4.yaml b/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc4.yaml index a0f2de5fec..9b81917cfe 100644 --- a/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc4.yaml +++ b/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc4.yaml @@ -6,7 +6,6 @@ print_iter_log: true kv_cache_config: dtype: fp8 free_gpu_memory_fraction: 0.8 - enable_block_reuse: false stream_interval: 10 moe_config: backend: TRTLLM diff --git a/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc64.yaml b/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc64.yaml index 06f600c1cd..17ca6555ef 100644 --- a/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc64.yaml +++ b/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc64.yaml @@ -6,7 +6,6 @@ print_iter_log: true kv_cache_config: dtype: fp8 free_gpu_memory_fraction: 0.8 - enable_block_reuse: false stream_interval: 10 moe_config: backend: CUTLASS diff --git a/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc8.yaml b/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc8.yaml index a0f2de5fec..9b81917cfe 100644 --- a/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc8.yaml +++ b/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc8.yaml @@ -6,7 +6,6 @@ print_iter_log: true kv_cache_config: dtype: fp8 free_gpu_memory_fraction: 0.8 - enable_block_reuse: false stream_interval: 10 moe_config: backend: TRTLLM diff --git a/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc128.yaml b/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc128.yaml index 5334ed3cf5..cfff9caf5e 100644 --- a/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc128.yaml +++ b/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc128.yaml @@ -6,7 +6,6 @@ print_iter_log: true kv_cache_config: dtype: fp8 free_gpu_memory_fraction: 0.8 - enable_block_reuse: false stream_interval: 10 moe_config: backend: CUTLASS diff --git a/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc16.yaml b/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc16.yaml index 382a3c9045..c2a9fb067f 100644 --- a/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc16.yaml +++ b/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc16.yaml @@ -6,7 +6,6 @@ print_iter_log: true kv_cache_config: dtype: fp8 free_gpu_memory_fraction: 0.8 - enable_block_reuse: false stream_interval: 10 moe_config: backend: TRTLLM diff --git a/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc256.yaml b/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc256.yaml index 639fdde94a..f4488125d2 100644 --- a/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc256.yaml +++ b/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc256.yaml @@ -6,7 +6,6 @@ print_iter_log: true kv_cache_config: dtype: fp8 
free_gpu_memory_fraction: 0.8 - enable_block_reuse: false stream_interval: 10 moe_config: backend: CUTLASS diff --git a/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc32.yaml b/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc32.yaml index 382a3c9045..c2a9fb067f 100644 --- a/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc32.yaml +++ b/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc32.yaml @@ -6,7 +6,6 @@ print_iter_log: true kv_cache_config: dtype: fp8 free_gpu_memory_fraction: 0.8 - enable_block_reuse: false stream_interval: 10 moe_config: backend: TRTLLM diff --git a/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc4.yaml b/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc4.yaml index 382a3c9045..c2a9fb067f 100644 --- a/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc4.yaml +++ b/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc4.yaml @@ -6,7 +6,6 @@ print_iter_log: true kv_cache_config: dtype: fp8 free_gpu_memory_fraction: 0.8 - enable_block_reuse: false stream_interval: 10 moe_config: backend: TRTLLM diff --git a/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc64.yaml b/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc64.yaml index 930a625308..eb2fb249d1 100644 --- a/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc64.yaml +++ b/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc64.yaml @@ -6,7 +6,6 @@ print_iter_log: true kv_cache_config: dtype: fp8 free_gpu_memory_fraction: 0.8 - enable_block_reuse: false stream_interval: 10 moe_config: backend: CUTLASS diff --git a/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc8.yaml b/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc8.yaml index 382a3c9045..c2a9fb067f 100644 --- a/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc8.yaml +++ b/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc8.yaml @@ -6,7 +6,6 @@ print_iter_log: true kv_cache_config: dtype: fp8 free_gpu_memory_fraction: 0.8 - enable_block_reuse: false stream_interval: 10 moe_config: backend: TRTLLM diff --git a/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc16.yaml b/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc16.yaml index 1d4df97010..323d5260e3 100644 --- a/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc16.yaml +++ b/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc16.yaml @@ -7,7 +7,6 @@ cuda_graph_config: enable_attention_dp: false kv_cache_config: dtype: fp8 - enable_block_reuse: false free_gpu_memory_fraction: 0.85 print_iter_log: true stream_interval: 20 diff --git a/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc32.yaml b/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc32.yaml index 7d65f54710..4e42c74c5d 100644 --- a/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc32.yaml +++ b/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc32.yaml @@ -7,7 +7,6 @@ cuda_graph_config: enable_attention_dp: false kv_cache_config: dtype: fp8 - enable_block_reuse: false free_gpu_memory_fraction: 0.85 print_iter_log: true stream_interval: 20 diff --git a/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc4.yaml 
b/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc4.yaml index ca850a7758..41a3501db8 100644 --- a/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc4.yaml +++ b/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc4.yaml @@ -7,7 +7,6 @@ cuda_graph_config: enable_attention_dp: false kv_cache_config: dtype: fp8 - enable_block_reuse: false free_gpu_memory_fraction: 0.85 print_iter_log: true stream_interval: 20 diff --git a/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc64.yaml b/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc64.yaml index 345b0e5013..ef8559949b 100644 --- a/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc64.yaml +++ b/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc64.yaml @@ -7,7 +7,6 @@ cuda_graph_config: enable_attention_dp: false kv_cache_config: dtype: fp8 - enable_block_reuse: false free_gpu_memory_fraction: 0.85 print_iter_log: true stream_interval: 20 diff --git a/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc8.yaml b/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc8.yaml index 5fa5e373d2..cf3f143581 100644 --- a/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc8.yaml +++ b/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc8.yaml @@ -7,7 +7,6 @@ cuda_graph_config: enable_attention_dp: false kv_cache_config: dtype: fp8 - enable_block_reuse: false free_gpu_memory_fraction: 0.85 print_iter_log: true stream_interval: 20 diff --git a/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc16.yaml b/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc16.yaml index 7b392ada8d..89f16c0834 100644 --- a/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc16.yaml +++ b/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc16.yaml @@ -7,7 +7,6 @@ cuda_graph_config: enable_attention_dp: false kv_cache_config: dtype: fp8 - enable_block_reuse: false free_gpu_memory_fraction: 0.85 print_iter_log: true stream_interval: 20 diff --git a/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc32.yaml b/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc32.yaml index e8212dd139..50a9a01eab 100644 --- a/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc32.yaml +++ b/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc32.yaml @@ -7,7 +7,6 @@ cuda_graph_config: enable_attention_dp: false kv_cache_config: dtype: fp8 - enable_block_reuse: false free_gpu_memory_fraction: 0.85 print_iter_log: true stream_interval: 20 diff --git a/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc4.yaml b/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc4.yaml index ab22a7baf6..9e38bf35d1 100644 --- a/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc4.yaml +++ b/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc4.yaml @@ -7,7 +7,6 @@ cuda_graph_config: enable_attention_dp: false kv_cache_config: dtype: fp8 - enable_block_reuse: false free_gpu_memory_fraction: 0.85 print_iter_log: true stream_interval: 20 diff --git a/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc64.yaml b/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc64.yaml index 3f82650480..828ad26582 100644 --- a/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc64.yaml +++ b/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc64.yaml @@ -7,7 +7,6 @@ cuda_graph_config: 
enable_attention_dp: false kv_cache_config: dtype: fp8 - enable_block_reuse: false free_gpu_memory_fraction: 0.85 print_iter_log: true stream_interval: 20 diff --git a/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc8.yaml b/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc8.yaml index b07960f33d..27a1f1b0a5 100644 --- a/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc8.yaml +++ b/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc8.yaml @@ -7,7 +7,6 @@ cuda_graph_config: enable_attention_dp: false kv_cache_config: dtype: fp8 - enable_block_reuse: false free_gpu_memory_fraction: 0.85 print_iter_log: true stream_interval: 20 diff --git a/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc16.yaml b/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc16.yaml index e078ea3d6d..d85b3d01a4 100644 --- a/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc16.yaml +++ b/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc16.yaml @@ -7,7 +7,6 @@ cuda_graph_config: enable_attention_dp: false kv_cache_config: dtype: fp8 - enable_block_reuse: false free_gpu_memory_fraction: 0.85 print_iter_log: true stream_interval: 20 diff --git a/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc32.yaml b/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc32.yaml index 15f5a3ca50..9ed994fa4b 100644 --- a/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc32.yaml +++ b/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc32.yaml @@ -7,7 +7,6 @@ cuda_graph_config: enable_attention_dp: false kv_cache_config: dtype: fp8 - enable_block_reuse: false free_gpu_memory_fraction: 0.85 print_iter_log: true stream_interval: 20 diff --git a/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc4.yaml b/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc4.yaml index cdbb40a3eb..f7d77d7296 100644 --- a/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc4.yaml +++ b/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc4.yaml @@ -7,7 +7,6 @@ cuda_graph_config: enable_attention_dp: false kv_cache_config: dtype: fp8 - enable_block_reuse: false free_gpu_memory_fraction: 0.85 print_iter_log: true stream_interval: 20 diff --git a/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc64.yaml b/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc64.yaml index c5854b6daf..8d4215121a 100644 --- a/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc64.yaml +++ b/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc64.yaml @@ -7,7 +7,6 @@ cuda_graph_config: enable_attention_dp: false kv_cache_config: dtype: fp8 - enable_block_reuse: false free_gpu_memory_fraction: 0.85 print_iter_log: true stream_interval: 20 diff --git a/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc8.yaml b/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc8.yaml index 0ac4431175..0c0101c832 100644 --- a/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc8.yaml +++ b/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc8.yaml @@ -7,7 +7,6 @@ cuda_graph_config: enable_attention_dp: false kv_cache_config: dtype: fp8 - enable_block_reuse: false free_gpu_memory_fraction: 0.85 print_iter_log: true stream_interval: 20 diff --git a/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc16.yaml b/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc16.yaml index 
a18faa2622..dda82a593e 100644 --- a/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc16.yaml +++ b/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc16.yaml @@ -7,7 +7,6 @@ cuda_graph_config: enable_attention_dp: false kv_cache_config: dtype: fp8 - enable_block_reuse: false free_gpu_memory_fraction: 0.85 print_iter_log: true stream_interval: 20 diff --git a/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc32.yaml b/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc32.yaml index 4ce42b3ce8..c770096914 100644 --- a/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc32.yaml +++ b/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc32.yaml @@ -7,7 +7,6 @@ cuda_graph_config: enable_attention_dp: false kv_cache_config: dtype: fp8 - enable_block_reuse: false free_gpu_memory_fraction: 0.85 print_iter_log: true stream_interval: 20 diff --git a/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc4.yaml b/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc4.yaml index 966138c163..55e10c0d15 100644 --- a/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc4.yaml +++ b/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc4.yaml @@ -7,7 +7,6 @@ cuda_graph_config: enable_attention_dp: false kv_cache_config: dtype: fp8 - enable_block_reuse: false free_gpu_memory_fraction: 0.85 print_iter_log: true stream_interval: 20 diff --git a/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc64.yaml b/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc64.yaml index a322f0681d..f606123c32 100644 --- a/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc64.yaml +++ b/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc64.yaml @@ -7,7 +7,6 @@ cuda_graph_config: enable_attention_dp: false kv_cache_config: dtype: fp8 - enable_block_reuse: false free_gpu_memory_fraction: 0.85 print_iter_log: true stream_interval: 20 diff --git a/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc8.yaml b/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc8.yaml index 644d2dabb4..2d284fcf8c 100644 --- a/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc8.yaml +++ b/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc8.yaml @@ -7,7 +7,6 @@ cuda_graph_config: enable_attention_dp: false kv_cache_config: dtype: fp8 - enable_block_reuse: false free_gpu_memory_fraction: 0.85 print_iter_log: true stream_interval: 20 diff --git a/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc16.yaml b/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc16.yaml index 31544aa9f4..b17f7425cc 100644 --- a/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc16.yaml +++ b/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc16.yaml @@ -7,7 +7,6 @@ cuda_graph_config: enable_attention_dp: false kv_cache_config: dtype: fp8 - enable_block_reuse: false free_gpu_memory_fraction: 0.85 print_iter_log: true stream_interval: 20 diff --git a/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc32.yaml b/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc32.yaml index ec0ea7b2ba..e559dcc0b8 100644 --- a/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc32.yaml +++ b/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc32.yaml @@ -7,7 +7,6 @@ cuda_graph_config: enable_attention_dp: false kv_cache_config: dtype: fp8 - enable_block_reuse: false 
free_gpu_memory_fraction: 0.85 print_iter_log: true stream_interval: 20 diff --git a/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc4.yaml b/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc4.yaml index 249b14723f..af72d4f494 100644 --- a/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc4.yaml +++ b/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc4.yaml @@ -7,7 +7,6 @@ cuda_graph_config: enable_attention_dp: false kv_cache_config: dtype: fp8 - enable_block_reuse: false free_gpu_memory_fraction: 0.85 print_iter_log: true stream_interval: 20 diff --git a/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc64.yaml b/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc64.yaml index 21de3414a8..3e3f969fc7 100644 --- a/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc64.yaml +++ b/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc64.yaml @@ -7,7 +7,6 @@ cuda_graph_config: enable_attention_dp: false kv_cache_config: dtype: fp8 - enable_block_reuse: false free_gpu_memory_fraction: 0.85 print_iter_log: true stream_interval: 20 diff --git a/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc8.yaml b/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc8.yaml index 315b1add42..aef4e1636b 100644 --- a/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc8.yaml +++ b/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc8.yaml @@ -7,7 +7,6 @@ cuda_graph_config: enable_attention_dp: false kv_cache_config: dtype: fp8 - enable_block_reuse: false free_gpu_memory_fraction: 0.85 print_iter_log: true stream_interval: 20 diff --git a/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc16.yaml b/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc16.yaml index 56e1b648bd..879c2fba70 100644 --- a/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc16.yaml +++ b/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc16.yaml @@ -7,7 +7,6 @@ cuda_graph_config: enable_attention_dp: false kv_cache_config: dtype: fp8 - enable_block_reuse: false free_gpu_memory_fraction: 0.85 print_iter_log: true stream_interval: 20 diff --git a/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc32.yaml b/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc32.yaml index 4e02fe671b..322f73709e 100644 --- a/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc32.yaml +++ b/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc32.yaml @@ -7,7 +7,6 @@ cuda_graph_config: enable_attention_dp: false kv_cache_config: dtype: fp8 - enable_block_reuse: false free_gpu_memory_fraction: 0.85 print_iter_log: true stream_interval: 20 diff --git a/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc4.yaml b/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc4.yaml index 4bc360839a..84cac88c86 100644 --- a/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc4.yaml +++ b/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc4.yaml @@ -7,7 +7,6 @@ cuda_graph_config: enable_attention_dp: false kv_cache_config: dtype: fp8 - enable_block_reuse: false free_gpu_memory_fraction: 0.85 print_iter_log: true stream_interval: 20 diff --git a/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc64.yaml b/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc64.yaml index 584fb5ae1a..031a0fd744 100644 --- 
a/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc64.yaml +++ b/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc64.yaml @@ -7,7 +7,6 @@ cuda_graph_config: enable_attention_dp: false kv_cache_config: dtype: fp8 - enable_block_reuse: false free_gpu_memory_fraction: 0.85 print_iter_log: true stream_interval: 20 diff --git a/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc8.yaml b/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc8.yaml index 6ab46126d5..445cec1d82 100644 --- a/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc8.yaml +++ b/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc8.yaml @@ -7,7 +7,6 @@ cuda_graph_config: enable_attention_dp: false kv_cache_config: dtype: fp8 - enable_block_reuse: false free_gpu_memory_fraction: 0.85 print_iter_log: true stream_interval: 20 diff --git a/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc16.yaml b/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc16.yaml index ef539d3bef..4d9ce00dbc 100644 --- a/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc16.yaml +++ b/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc16.yaml @@ -7,7 +7,6 @@ cuda_graph_config: enable_attention_dp: false kv_cache_config: dtype: fp8 - enable_block_reuse: false free_gpu_memory_fraction: 0.85 print_iter_log: true stream_interval: 20 diff --git a/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc32.yaml b/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc32.yaml index 40dc752084..e790e0d980 100644 --- a/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc32.yaml +++ b/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc32.yaml @@ -7,7 +7,6 @@ cuda_graph_config: enable_attention_dp: false kv_cache_config: dtype: fp8 - enable_block_reuse: false free_gpu_memory_fraction: 0.85 print_iter_log: true stream_interval: 20 diff --git a/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc4.yaml b/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc4.yaml index 3e0f48e7e1..f72680460c 100644 --- a/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc4.yaml +++ b/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc4.yaml @@ -7,7 +7,6 @@ cuda_graph_config: enable_attention_dp: false kv_cache_config: dtype: fp8 - enable_block_reuse: false free_gpu_memory_fraction: 0.85 print_iter_log: true stream_interval: 20 diff --git a/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc64.yaml b/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc64.yaml index 2e3721c712..ddceeb61fa 100644 --- a/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc64.yaml +++ b/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc64.yaml @@ -7,7 +7,6 @@ cuda_graph_config: enable_attention_dp: false kv_cache_config: dtype: fp8 - enable_block_reuse: false free_gpu_memory_fraction: 0.85 print_iter_log: true stream_interval: 20 diff --git a/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc8.yaml b/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc8.yaml index 098e7ec388..f08dca8cd1 100644 --- a/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc8.yaml +++ b/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc8.yaml @@ -7,7 +7,6 @@ cuda_graph_config: enable_attention_dp: false kv_cache_config: dtype: fp8 - enable_block_reuse: false free_gpu_memory_fraction: 0.85 print_iter_log: 
true stream_interval: 20 diff --git a/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc16.yaml b/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc16.yaml index 45d77f70bd..95cdf3a61f 100644 --- a/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc16.yaml +++ b/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc16.yaml @@ -7,7 +7,6 @@ cuda_graph_config: enable_attention_dp: false kv_cache_config: dtype: fp8 - enable_block_reuse: false free_gpu_memory_fraction: 0.85 print_iter_log: true stream_interval: 20 diff --git a/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc32.yaml b/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc32.yaml index 9436b07959..74d4e29f67 100644 --- a/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc32.yaml +++ b/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc32.yaml @@ -7,7 +7,6 @@ cuda_graph_config: enable_attention_dp: false kv_cache_config: dtype: fp8 - enable_block_reuse: false free_gpu_memory_fraction: 0.85 print_iter_log: true stream_interval: 20 diff --git a/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc4.yaml b/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc4.yaml index a2917bfd5b..59b9319ee8 100644 --- a/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc4.yaml +++ b/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc4.yaml @@ -7,7 +7,6 @@ cuda_graph_config: enable_attention_dp: false kv_cache_config: dtype: fp8 - enable_block_reuse: false free_gpu_memory_fraction: 0.85 print_iter_log: true stream_interval: 20 diff --git a/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc64.yaml b/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc64.yaml index 702d3bc00c..365f467d6e 100644 --- a/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc64.yaml +++ b/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc64.yaml @@ -7,7 +7,6 @@ cuda_graph_config: enable_attention_dp: false kv_cache_config: dtype: fp8 - enable_block_reuse: false free_gpu_memory_fraction: 0.85 print_iter_log: true stream_interval: 20 diff --git a/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc8.yaml b/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc8.yaml index c0b90314c3..befde425ae 100644 --- a/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc8.yaml +++ b/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc8.yaml @@ -7,7 +7,6 @@ cuda_graph_config: enable_attention_dp: false kv_cache_config: dtype: fp8 - enable_block_reuse: false free_gpu_memory_fraction: 0.85 print_iter_log: true stream_interval: 20 diff --git a/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc16.yaml b/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc16.yaml index 31544aa9f4..b17f7425cc 100644 --- a/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc16.yaml +++ b/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc16.yaml @@ -7,7 +7,6 @@ cuda_graph_config: enable_attention_dp: false kv_cache_config: dtype: fp8 - enable_block_reuse: false free_gpu_memory_fraction: 0.85 print_iter_log: true stream_interval: 20 diff --git a/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc32.yaml b/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc32.yaml index ec0ea7b2ba..e559dcc0b8 100644 --- a/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc32.yaml +++ 
b/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc32.yaml @@ -7,7 +7,6 @@ cuda_graph_config: enable_attention_dp: false kv_cache_config: dtype: fp8 - enable_block_reuse: false free_gpu_memory_fraction: 0.85 print_iter_log: true stream_interval: 20 diff --git a/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc4.yaml b/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc4.yaml index 249b14723f..af72d4f494 100644 --- a/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc4.yaml +++ b/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc4.yaml @@ -7,7 +7,6 @@ cuda_graph_config: enable_attention_dp: false kv_cache_config: dtype: fp8 - enable_block_reuse: false free_gpu_memory_fraction: 0.85 print_iter_log: true stream_interval: 20 diff --git a/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc64.yaml b/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc64.yaml index 21de3414a8..3e3f969fc7 100644 --- a/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc64.yaml +++ b/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc64.yaml @@ -7,7 +7,6 @@ cuda_graph_config: enable_attention_dp: false kv_cache_config: dtype: fp8 - enable_block_reuse: false free_gpu_memory_fraction: 0.85 print_iter_log: true stream_interval: 20 diff --git a/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc8.yaml b/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc8.yaml index 315b1add42..aef4e1636b 100644 --- a/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc8.yaml +++ b/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc8.yaml @@ -7,7 +7,6 @@ cuda_graph_config: enable_attention_dp: false kv_cache_config: dtype: fp8 - enable_block_reuse: false free_gpu_memory_fraction: 0.85 print_iter_log: true stream_interval: 20 diff --git a/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc16.yaml b/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc16.yaml index 56e1b648bd..879c2fba70 100644 --- a/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc16.yaml +++ b/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc16.yaml @@ -7,7 +7,6 @@ cuda_graph_config: enable_attention_dp: false kv_cache_config: dtype: fp8 - enable_block_reuse: false free_gpu_memory_fraction: 0.85 print_iter_log: true stream_interval: 20 diff --git a/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc32.yaml b/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc32.yaml index 4e02fe671b..322f73709e 100644 --- a/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc32.yaml +++ b/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc32.yaml @@ -7,7 +7,6 @@ cuda_graph_config: enable_attention_dp: false kv_cache_config: dtype: fp8 - enable_block_reuse: false free_gpu_memory_fraction: 0.85 print_iter_log: true stream_interval: 20 diff --git a/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc4.yaml b/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc4.yaml index 4bc360839a..84cac88c86 100644 --- a/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc4.yaml +++ b/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc4.yaml @@ -7,7 +7,6 @@ cuda_graph_config: enable_attention_dp: false kv_cache_config: dtype: fp8 - enable_block_reuse: false free_gpu_memory_fraction: 0.85 print_iter_log: true stream_interval: 20 diff --git 
a/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc64.yaml b/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc64.yaml index 584fb5ae1a..031a0fd744 100644 --- a/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc64.yaml +++ b/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc64.yaml @@ -7,7 +7,6 @@ cuda_graph_config: enable_attention_dp: false kv_cache_config: dtype: fp8 - enable_block_reuse: false free_gpu_memory_fraction: 0.85 print_iter_log: true stream_interval: 20 diff --git a/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc8.yaml b/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc8.yaml index 6ab46126d5..445cec1d82 100644 --- a/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc8.yaml +++ b/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc8.yaml @@ -7,7 +7,6 @@ cuda_graph_config: enable_attention_dp: false kv_cache_config: dtype: fp8 - enable_block_reuse: false free_gpu_memory_fraction: 0.85 print_iter_log: true stream_interval: 20 diff --git a/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc16.yaml b/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc16.yaml index ef539d3bef..4d9ce00dbc 100644 --- a/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc16.yaml +++ b/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc16.yaml @@ -7,7 +7,6 @@ cuda_graph_config: enable_attention_dp: false kv_cache_config: dtype: fp8 - enable_block_reuse: false free_gpu_memory_fraction: 0.85 print_iter_log: true stream_interval: 20 diff --git a/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc32.yaml b/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc32.yaml index 40dc752084..e790e0d980 100644 --- a/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc32.yaml +++ b/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc32.yaml @@ -7,7 +7,6 @@ cuda_graph_config: enable_attention_dp: false kv_cache_config: dtype: fp8 - enable_block_reuse: false free_gpu_memory_fraction: 0.85 print_iter_log: true stream_interval: 20 diff --git a/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc4.yaml b/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc4.yaml index 3e0f48e7e1..f72680460c 100644 --- a/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc4.yaml +++ b/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc4.yaml @@ -7,7 +7,6 @@ cuda_graph_config: enable_attention_dp: false kv_cache_config: dtype: fp8 - enable_block_reuse: false free_gpu_memory_fraction: 0.85 print_iter_log: true stream_interval: 20 diff --git a/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc64.yaml b/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc64.yaml index 2e3721c712..ddceeb61fa 100644 --- a/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc64.yaml +++ b/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc64.yaml @@ -7,7 +7,6 @@ cuda_graph_config: enable_attention_dp: false kv_cache_config: dtype: fp8 - enable_block_reuse: false free_gpu_memory_fraction: 0.85 print_iter_log: true stream_interval: 20 diff --git a/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc8.yaml b/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc8.yaml index 098e7ec388..f08dca8cd1 100644 --- a/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc8.yaml +++ 
b/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc8.yaml @@ -7,7 +7,6 @@ cuda_graph_config: enable_attention_dp: false kv_cache_config: dtype: fp8 - enable_block_reuse: false free_gpu_memory_fraction: 0.85 print_iter_log: true stream_interval: 20 diff --git a/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc16.yaml b/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc16.yaml index 45d77f70bd..95cdf3a61f 100644 --- a/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc16.yaml +++ b/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc16.yaml @@ -7,7 +7,6 @@ cuda_graph_config: enable_attention_dp: false kv_cache_config: dtype: fp8 - enable_block_reuse: false free_gpu_memory_fraction: 0.85 print_iter_log: true stream_interval: 20 diff --git a/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc32.yaml b/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc32.yaml index 9436b07959..74d4e29f67 100644 --- a/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc32.yaml +++ b/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc32.yaml @@ -7,7 +7,6 @@ cuda_graph_config: enable_attention_dp: false kv_cache_config: dtype: fp8 - enable_block_reuse: false free_gpu_memory_fraction: 0.85 print_iter_log: true stream_interval: 20 diff --git a/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc4.yaml b/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc4.yaml index a2917bfd5b..59b9319ee8 100644 --- a/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc4.yaml +++ b/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc4.yaml @@ -7,7 +7,6 @@ cuda_graph_config: enable_attention_dp: false kv_cache_config: dtype: fp8 - enable_block_reuse: false free_gpu_memory_fraction: 0.85 print_iter_log: true stream_interval: 20 diff --git a/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc64.yaml b/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc64.yaml index 702d3bc00c..365f467d6e 100644 --- a/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc64.yaml +++ b/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc64.yaml @@ -7,7 +7,6 @@ cuda_graph_config: enable_attention_dp: false kv_cache_config: dtype: fp8 - enable_block_reuse: false free_gpu_memory_fraction: 0.85 print_iter_log: true stream_interval: 20 diff --git a/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc8.yaml b/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc8.yaml index c0b90314c3..befde425ae 100644 --- a/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc8.yaml +++ b/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc8.yaml @@ -7,7 +7,6 @@ cuda_graph_config: enable_attention_dp: false kv_cache_config: dtype: fp8 - enable_block_reuse: false free_gpu_memory_fraction: 0.85 print_iter_log: true stream_interval: 20 diff --git a/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc16.yaml b/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc16.yaml index 2eea897e2f..6736070cc7 100644 --- a/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc16.yaml +++ b/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc16.yaml @@ -6,7 +6,6 @@ cuda_graph_config: enable_attention_dp: false kv_cache_config: dtype: auto - enable_block_reuse: false free_gpu_memory_fraction: 0.85 moe_config: backend: TRITON diff --git 
a/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc32.yaml b/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc32.yaml index 1a0d44fb27..a6fec264e8 100644 --- a/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc32.yaml +++ b/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc32.yaml @@ -6,7 +6,6 @@ cuda_graph_config: enable_attention_dp: false kv_cache_config: dtype: auto - enable_block_reuse: false free_gpu_memory_fraction: 0.85 moe_config: backend: TRITON diff --git a/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc4.yaml b/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc4.yaml index 82662456f0..c98b557f79 100644 --- a/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc4.yaml +++ b/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc4.yaml @@ -6,7 +6,6 @@ cuda_graph_config: enable_attention_dp: false kv_cache_config: dtype: auto - enable_block_reuse: false free_gpu_memory_fraction: 0.85 moe_config: backend: TRITON diff --git a/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc64.yaml b/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc64.yaml index 57d8e2ada2..8144ce5b4d 100644 --- a/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc64.yaml +++ b/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc64.yaml @@ -6,7 +6,6 @@ cuda_graph_config: enable_attention_dp: false kv_cache_config: dtype: auto - enable_block_reuse: false free_gpu_memory_fraction: 0.85 moe_config: backend: TRITON diff --git a/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc8.yaml b/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc8.yaml index 87e34788d7..45864a956e 100644 --- a/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc8.yaml +++ b/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc8.yaml @@ -6,7 +6,6 @@ cuda_graph_config: enable_attention_dp: false kv_cache_config: dtype: auto - enable_block_reuse: false free_gpu_memory_fraction: 0.85 moe_config: backend: TRITON diff --git a/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc16.yaml b/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc16.yaml index 57b4b87fc7..4762e57770 100644 --- a/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc16.yaml +++ b/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc16.yaml @@ -6,7 +6,6 @@ cuda_graph_config: enable_attention_dp: false kv_cache_config: dtype: auto - enable_block_reuse: false free_gpu_memory_fraction: 0.85 moe_config: backend: TRITON diff --git a/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc32.yaml b/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc32.yaml index 0d796e4751..3a16da2d4a 100644 --- a/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc32.yaml +++ b/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc32.yaml @@ -6,7 +6,6 @@ cuda_graph_config: enable_attention_dp: false kv_cache_config: dtype: auto - enable_block_reuse: false free_gpu_memory_fraction: 0.85 moe_config: backend: TRITON diff --git a/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc4.yaml b/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc4.yaml index f6c41d8bbd..2b789c4add 100644 --- a/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc4.yaml +++ b/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc4.yaml @@ -6,7 +6,6 @@ cuda_graph_config: 
enable_attention_dp: false kv_cache_config: dtype: auto - enable_block_reuse: false free_gpu_memory_fraction: 0.85 moe_config: backend: TRITON diff --git a/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc64.yaml b/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc64.yaml index fdec025db8..94c40f2ac2 100644 --- a/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc64.yaml +++ b/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc64.yaml @@ -6,7 +6,6 @@ cuda_graph_config: enable_attention_dp: false kv_cache_config: dtype: auto - enable_block_reuse: false free_gpu_memory_fraction: 0.85 moe_config: backend: TRITON diff --git a/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc8.yaml b/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc8.yaml index 8565e82e36..2b866099d8 100644 --- a/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc8.yaml +++ b/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc8.yaml @@ -6,7 +6,6 @@ cuda_graph_config: enable_attention_dp: false kv_cache_config: dtype: auto - enable_block_reuse: false free_gpu_memory_fraction: 0.85 moe_config: backend: TRITON diff --git a/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc16.yaml b/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc16.yaml index 4773067517..ff543422a0 100644 --- a/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc16.yaml +++ b/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc16.yaml @@ -6,7 +6,6 @@ cuda_graph_config: enable_attention_dp: false kv_cache_config: dtype: auto - enable_block_reuse: false free_gpu_memory_fraction: 0.85 moe_config: backend: TRITON diff --git a/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc32.yaml b/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc32.yaml index 5e0d27c5ea..95477ae879 100644 --- a/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc32.yaml +++ b/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc32.yaml @@ -6,7 +6,6 @@ cuda_graph_config: enable_attention_dp: false kv_cache_config: dtype: auto - enable_block_reuse: false free_gpu_memory_fraction: 0.85 moe_config: backend: TRITON diff --git a/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc4.yaml b/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc4.yaml index 9b135c0a32..dca7e5a63a 100644 --- a/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc4.yaml +++ b/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc4.yaml @@ -6,7 +6,6 @@ cuda_graph_config: enable_attention_dp: false kv_cache_config: dtype: auto - enable_block_reuse: false free_gpu_memory_fraction: 0.85 moe_config: backend: TRITON diff --git a/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc64.yaml b/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc64.yaml index 6874784b9f..1b2ad812ed 100644 --- a/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc64.yaml +++ b/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc64.yaml @@ -6,7 +6,6 @@ cuda_graph_config: enable_attention_dp: false kv_cache_config: dtype: auto - enable_block_reuse: false free_gpu_memory_fraction: 0.85 moe_config: backend: TRITON diff --git a/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc8.yaml b/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc8.yaml index cc1d2d8ac9..0d1ee33fee 100644 --- 
a/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc8.yaml
+++ b/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc8.yaml
@@ -6,7 +6,6 @@ cuda_graph_config:
 enable_attention_dp: false
 kv_cache_config:
   dtype: auto
-  enable_block_reuse: false
   free_gpu_memory_fraction: 0.85
 moe_config:
   backend: TRITON
diff --git a/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc16.yaml b/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc16.yaml
index f7e46b17a3..142a9c07af 100644
--- a/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc16.yaml
+++ b/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc16.yaml
@@ -6,7 +6,6 @@ cuda_graph_config:
 enable_attention_dp: false
 kv_cache_config:
   dtype: auto
-  enable_block_reuse: false
   free_gpu_memory_fraction: 0.85
 moe_config:
   backend: TRITON
diff --git a/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc32.yaml b/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc32.yaml
index 1b1b874c3e..c42c0506e1 100644
--- a/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc32.yaml
+++ b/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc32.yaml
@@ -6,7 +6,6 @@ cuda_graph_config:
 enable_attention_dp: false
 kv_cache_config:
   dtype: auto
-  enable_block_reuse: false
   free_gpu_memory_fraction: 0.85
 moe_config:
   backend: TRITON
diff --git a/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc4.yaml b/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc4.yaml
index 28a7f3d17c..161ebf4cdb 100644
--- a/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc4.yaml
+++ b/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc4.yaml
@@ -6,7 +6,6 @@ cuda_graph_config:
 enable_attention_dp: false
 kv_cache_config:
   dtype: auto
-  enable_block_reuse: false
   free_gpu_memory_fraction: 0.85
 moe_config:
   backend: TRITON
diff --git a/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc64.yaml b/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc64.yaml
index 8036e74399..f7f9a1b76d 100644
--- a/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc64.yaml
+++ b/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc64.yaml
@@ -6,7 +6,6 @@ cuda_graph_config:
 enable_attention_dp: false
 kv_cache_config:
   dtype: auto
-  enable_block_reuse: false
   free_gpu_memory_fraction: 0.85
 moe_config:
   backend: TRITON
diff --git a/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc8.yaml b/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc8.yaml
index 12289904ed..0c9084d25c 100644
--- a/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc8.yaml
+++ b/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc8.yaml
@@ -6,7 +6,6 @@ cuda_graph_config:
 enable_attention_dp: false
 kv_cache_config:
   dtype: auto
-  enable_block_reuse: false
   free_gpu_memory_fraction: 0.85
 moe_config:
   backend: TRITON
diff --git a/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc16.yaml b/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc16.yaml
index 7ccdc4ae11..466565e489 100644
--- a/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc16.yaml
+++ b/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc16.yaml
@@ -6,7 +6,6 @@ cuda_graph_config:
 enable_attention_dp: false
 kv_cache_config:
   dtype: auto
-  enable_block_reuse: false
   free_gpu_memory_fraction: 0.85
 moe_config:
   backend: TRITON
diff --git a/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc32.yaml b/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc32.yaml
index ea6a93ba64..4f0632784c 100644
--- a/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc32.yaml
+++ b/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc32.yaml
@@ -6,7 +6,6 @@ cuda_graph_config:
 enable_attention_dp: false
 kv_cache_config:
   dtype: auto
-  enable_block_reuse: false
   free_gpu_memory_fraction: 0.85
 moe_config:
   backend: TRITON
diff --git a/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc4.yaml b/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc4.yaml
index a0149f2ab5..57f3c1547c 100644
--- a/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc4.yaml
+++ b/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc4.yaml
@@ -6,7 +6,6 @@ cuda_graph_config:
 enable_attention_dp: false
 kv_cache_config:
   dtype: auto
-  enable_block_reuse: false
   free_gpu_memory_fraction: 0.85
 moe_config:
   backend: TRITON
diff --git a/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc64.yaml b/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc64.yaml
index 3ae56a300a..e61ad0b3de 100644
--- a/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc64.yaml
+++ b/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc64.yaml
@@ -6,7 +6,6 @@ cuda_graph_config:
 enable_attention_dp: false
 kv_cache_config:
   dtype: auto
-  enable_block_reuse: false
   free_gpu_memory_fraction: 0.85
 moe_config:
   backend: TRITON
diff --git a/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc8.yaml b/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc8.yaml
index c18bc3c758..53b5461cf4 100644
--- a/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc8.yaml
+++ b/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc8.yaml
@@ -6,7 +6,6 @@ cuda_graph_config:
 enable_attention_dp: false
 kv_cache_config:
   dtype: auto
-  enable_block_reuse: false
   free_gpu_memory_fraction: 0.85
 moe_config:
   backend: TRITON
diff --git a/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc16.yaml b/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc16.yaml
index e88b4e05fe..41d9a88b20 100644
--- a/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc16.yaml
+++ b/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc16.yaml
@@ -6,7 +6,6 @@ cuda_graph_config:
 enable_attention_dp: false
 kv_cache_config:
   dtype: auto
-  enable_block_reuse: false
   free_gpu_memory_fraction: 0.85
 moe_config:
   backend: TRITON
diff --git a/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc32.yaml b/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc32.yaml
index 95b8e20733..5bd3aa86c7 100644
--- a/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc32.yaml
+++ b/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc32.yaml
@@ -6,7 +6,6 @@ cuda_graph_config:
 enable_attention_dp: false
 kv_cache_config:
   dtype: auto
-  enable_block_reuse: false
   free_gpu_memory_fraction: 0.85
 moe_config:
   backend: TRITON
diff --git a/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc4.yaml b/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc4.yaml
index c35b691a81..90854c5fbd 100644
--- a/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc4.yaml
+++ b/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc4.yaml
@@ -6,7 +6,6 @@ cuda_graph_config:
 enable_attention_dp: false
 kv_cache_config:
   dtype: auto
-  enable_block_reuse: false
   free_gpu_memory_fraction: 0.85
 moe_config:
   backend: TRITON
diff --git a/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc64.yaml b/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc64.yaml
index ce0f7c2757..d16f3ecc95 100644
--- a/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc64.yaml
+++ b/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc64.yaml
@@ -6,7 +6,6 @@ cuda_graph_config:
 enable_attention_dp: false
 kv_cache_config:
   dtype: auto
-  enable_block_reuse: false
   free_gpu_memory_fraction: 0.85
 moe_config:
   backend: TRITON
diff --git a/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc8.yaml b/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc8.yaml
index 344166bc32..3617162742 100644
--- a/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc8.yaml
+++ b/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc8.yaml
@@ -6,7 +6,6 @@ cuda_graph_config:
 enable_attention_dp: false
 kv_cache_config:
   dtype: auto
-  enable_block_reuse: false
   free_gpu_memory_fraction: 0.85
 moe_config:
   backend: TRITON
diff --git a/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc16.yaml b/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc16.yaml
index 4f895199b1..bb3a41bc2f 100644
--- a/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc16.yaml
+++ b/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc16.yaml
@@ -6,7 +6,6 @@ cuda_graph_config:
 enable_attention_dp: false
 kv_cache_config:
   dtype: auto
-  enable_block_reuse: false
   free_gpu_memory_fraction: 0.85
 moe_config:
   backend: TRITON
diff --git a/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc32.yaml b/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc32.yaml
index ca549de3d2..1dd6379a4e 100644
--- a/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc32.yaml
+++ b/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc32.yaml
@@ -6,7 +6,6 @@ cuda_graph_config:
 enable_attention_dp: false
 kv_cache_config:
   dtype: auto
-  enable_block_reuse: false
   free_gpu_memory_fraction: 0.85
 moe_config:
   backend: TRITON
diff --git a/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc4.yaml b/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc4.yaml
index b87044bbc0..e63805377e 100644
--- a/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc4.yaml
+++ b/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc4.yaml
@@ -6,7 +6,6 @@ cuda_graph_config:
 enable_attention_dp: false
 kv_cache_config:
   dtype: auto
-  enable_block_reuse: false
   free_gpu_memory_fraction: 0.85
 moe_config:
   backend: TRITON
diff --git a/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc64.yaml b/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc64.yaml
index 9af104970e..52be12ec84 100644
--- a/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc64.yaml
+++ b/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc64.yaml
@@ -6,7 +6,6 @@ cuda_graph_config:
 enable_attention_dp: false
 kv_cache_config:
   dtype: auto
-  enable_block_reuse: false
   free_gpu_memory_fraction: 0.85
 moe_config:
   backend: TRITON
diff --git a/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc8.yaml b/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc8.yaml
index 7440c3fcb7..a80d8f26ab 100644
--- a/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc8.yaml
+++ b/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc8.yaml
@@ -6,7 +6,6 @@ cuda_graph_config:
 enable_attention_dp: false
 kv_cache_config:
   dtype: auto
-  enable_block_reuse: false
   free_gpu_memory_fraction: 0.85
 moe_config:
   backend: TRITON
diff --git a/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc16.yaml b/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc16.yaml
index b1d8a6eead..4c416c78bb 100644
--- a/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc16.yaml
+++ b/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc16.yaml
@@ -6,7 +6,6 @@ cuda_graph_config:
 enable_attention_dp: false
 kv_cache_config:
   dtype: auto
-  enable_block_reuse: false
   free_gpu_memory_fraction: 0.85
 moe_config:
   backend: TRITON
diff --git a/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc32.yaml b/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc32.yaml
index f8c7fec13a..eade986209 100644
--- a/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc32.yaml
+++ b/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc32.yaml
@@ -6,7 +6,6 @@ cuda_graph_config:
 enable_attention_dp: false
 kv_cache_config:
   dtype: auto
-  enable_block_reuse: false
   free_gpu_memory_fraction: 0.85
 moe_config:
   backend: TRITON
diff --git a/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc4.yaml b/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc4.yaml
index f9cb8feb69..24916df78a 100644
--- a/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc4.yaml
+++ b/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc4.yaml
@@ -6,7 +6,6 @@ cuda_graph_config:
 enable_attention_dp: false
 kv_cache_config:
   dtype: auto
-  enable_block_reuse: false
   free_gpu_memory_fraction: 0.85
 moe_config:
   backend: TRITON
diff --git a/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc64.yaml b/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc64.yaml
index a9124d7007..a609e99a88 100644
--- a/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc64.yaml
+++ b/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc64.yaml
@@ -6,7 +6,6 @@ cuda_graph_config:
 enable_attention_dp: false
 kv_cache_config:
   dtype: auto
-  enable_block_reuse: false
   free_gpu_memory_fraction: 0.85
 moe_config:
   backend: TRITON
diff --git a/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc8.yaml b/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc8.yaml
index 7c2507ace7..9c37e6359d 100644
--- a/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc8.yaml
+++ b/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc8.yaml
@@ -6,7 +6,6 @@ cuda_graph_config:
 enable_attention_dp: false
 kv_cache_config:
   dtype: auto
-  enable_block_reuse: false
   free_gpu_memory_fraction: 0.85
 moe_config:
   backend: TRITON
diff --git a/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc16.yaml b/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc16.yaml
index 7ccdc4ae11..466565e489 100644
--- a/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc16.yaml
+++ b/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc16.yaml
@@ -6,7 +6,6 @@ cuda_graph_config:
 enable_attention_dp: false
 kv_cache_config:
   dtype: auto
-  enable_block_reuse: false
   free_gpu_memory_fraction: 0.85
 moe_config:
   backend: TRITON
diff --git a/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc32.yaml b/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc32.yaml
index ea6a93ba64..4f0632784c 100644
--- a/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc32.yaml
+++ b/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc32.yaml
@@ -6,7 +6,6 @@ cuda_graph_config:
 enable_attention_dp: false
 kv_cache_config:
   dtype: auto
-  enable_block_reuse: false
   free_gpu_memory_fraction: 0.85
 moe_config:
   backend: TRITON
diff --git a/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc4.yaml b/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc4.yaml
index a0149f2ab5..57f3c1547c 100644
--- a/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc4.yaml
+++ b/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc4.yaml
@@ -6,7 +6,6 @@ cuda_graph_config:
 enable_attention_dp: false
 kv_cache_config:
   dtype: auto
-  enable_block_reuse: false
   free_gpu_memory_fraction: 0.85
 moe_config:
   backend: TRITON
diff --git a/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc64.yaml b/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc64.yaml
index 3ae56a300a..e61ad0b3de 100644
--- a/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc64.yaml
+++ b/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc64.yaml
@@ -6,7 +6,6 @@ cuda_graph_config:
 enable_attention_dp: false
 kv_cache_config:
   dtype: auto
-  enable_block_reuse: false
   free_gpu_memory_fraction: 0.85
 moe_config:
   backend: TRITON
diff --git a/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc8.yaml b/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc8.yaml
index c18bc3c758..53b5461cf4 100644
--- a/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc8.yaml
+++ b/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc8.yaml
@@ -6,7 +6,6 @@ cuda_graph_config:
 enable_attention_dp: false
 kv_cache_config:
   dtype: auto
-  enable_block_reuse: false
   free_gpu_memory_fraction: 0.85
 moe_config:
   backend: TRITON
diff --git a/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc16.yaml b/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc16.yaml
index e88b4e05fe..41d9a88b20 100644
--- a/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc16.yaml
+++ b/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc16.yaml
@@ -6,7 +6,6 @@ cuda_graph_config:
 enable_attention_dp: false
 kv_cache_config:
   dtype: auto
-  enable_block_reuse: false
   free_gpu_memory_fraction: 0.85
 moe_config:
   backend: TRITON
diff --git a/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc32.yaml b/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc32.yaml
index 95b8e20733..5bd3aa86c7 100644
--- a/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc32.yaml
+++ b/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc32.yaml
@@ -6,7 +6,6 @@ cuda_graph_config:
 enable_attention_dp: false
 kv_cache_config:
   dtype: auto
-  enable_block_reuse: false
   free_gpu_memory_fraction: 0.85
 moe_config:
   backend: TRITON
diff --git a/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc4.yaml b/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc4.yaml
index c35b691a81..90854c5fbd 100644
--- a/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc4.yaml
+++ b/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc4.yaml
@@ -6,7 +6,6 @@ cuda_graph_config:
 enable_attention_dp: false
 kv_cache_config:
   dtype: auto
-  enable_block_reuse: false
   free_gpu_memory_fraction: 0.85
 moe_config:
   backend: TRITON
diff --git a/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc64.yaml b/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc64.yaml
index ce0f7c2757..d16f3ecc95 100644
--- a/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc64.yaml
+++ b/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc64.yaml
@@ -6,7 +6,6 @@ cuda_graph_config:
 enable_attention_dp: false
 kv_cache_config:
   dtype: auto
-  enable_block_reuse: false
   free_gpu_memory_fraction: 0.85
 moe_config:
   backend: TRITON
diff --git a/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc8.yaml b/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc8.yaml
index 344166bc32..3617162742 100644
--- a/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc8.yaml
+++ b/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc8.yaml
@@ -6,7 +6,6 @@ cuda_graph_config:
 enable_attention_dp: false
 kv_cache_config:
   dtype: auto
-  enable_block_reuse: false
   free_gpu_memory_fraction: 0.85
 moe_config:
   backend: TRITON
diff --git a/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc16.yaml b/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc16.yaml
index 4f895199b1..bb3a41bc2f 100644
--- a/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc16.yaml
+++ b/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc16.yaml
@@ -6,7 +6,6 @@ cuda_graph_config:
 enable_attention_dp: false
 kv_cache_config:
   dtype: auto
-  enable_block_reuse: false
   free_gpu_memory_fraction: 0.85
 moe_config:
   backend: TRITON
diff --git a/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc32.yaml b/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc32.yaml
index ca549de3d2..1dd6379a4e 100644
--- a/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc32.yaml
+++ b/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc32.yaml
@@ -6,7 +6,6 @@ cuda_graph_config:
 enable_attention_dp: false
 kv_cache_config:
   dtype: auto
-  enable_block_reuse: false
   free_gpu_memory_fraction: 0.85
 moe_config:
   backend: TRITON
diff --git a/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc4.yaml b/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc4.yaml
index b87044bbc0..e63805377e 100644
--- a/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc4.yaml
+++ b/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc4.yaml
@@ -6,7 +6,6 @@ cuda_graph_config:
 enable_attention_dp: false
 kv_cache_config:
   dtype: auto
-  enable_block_reuse: false
   free_gpu_memory_fraction: 0.85
 moe_config:
   backend: TRITON
diff --git a/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc64.yaml b/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc64.yaml
index 9af104970e..52be12ec84 100644
--- a/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc64.yaml
+++ b/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc64.yaml
@@ -6,7 +6,6 @@ cuda_graph_config:
 enable_attention_dp: false
 kv_cache_config:
   dtype: auto
-  enable_block_reuse: false
   free_gpu_memory_fraction: 0.85
 moe_config:
   backend: TRITON
diff --git a/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc8.yaml b/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc8.yaml
index 7440c3fcb7..a80d8f26ab 100644
--- a/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc8.yaml
+++ b/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc8.yaml
@@ -6,7 +6,6 @@ cuda_graph_config:
 enable_attention_dp: false
 kv_cache_config:
   dtype: auto
-  enable_block_reuse: false
   free_gpu_memory_fraction: 0.85
 moe_config:
   backend: TRITON
diff --git a/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc16.yaml b/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc16.yaml
index b1d8a6eead..4c416c78bb 100644
--- a/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc16.yaml
+++ b/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc16.yaml
@@ -6,7 +6,6 @@ cuda_graph_config:
 enable_attention_dp: false
 kv_cache_config:
   dtype: auto
-  enable_block_reuse: false
   free_gpu_memory_fraction: 0.85
 moe_config:
   backend: TRITON
diff --git a/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc32.yaml b/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc32.yaml
index f8c7fec13a..eade986209 100644
--- a/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc32.yaml
+++ b/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc32.yaml
@@ -6,7 +6,6 @@ cuda_graph_config:
 enable_attention_dp: false
 kv_cache_config:
   dtype: auto
-  enable_block_reuse: false
   free_gpu_memory_fraction: 0.85
 moe_config:
   backend: TRITON
diff --git a/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc4.yaml b/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc4.yaml
index f9cb8feb69..24916df78a 100644
--- a/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc4.yaml
+++ b/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc4.yaml
@@ -6,7 +6,6 @@ cuda_graph_config:
 enable_attention_dp: false
 kv_cache_config:
   dtype: auto
-  enable_block_reuse: false
   free_gpu_memory_fraction: 0.85
 moe_config:
   backend: TRITON
diff --git a/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc64.yaml b/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc64.yaml
index a9124d7007..a609e99a88 100644
--- a/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc64.yaml
+++ b/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc64.yaml
@@ -6,7 +6,6 @@ cuda_graph_config:
 enable_attention_dp: false
 kv_cache_config:
   dtype: auto
-  enable_block_reuse: false
   free_gpu_memory_fraction: 0.85
 moe_config:
   backend: TRITON
diff --git a/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc8.yaml b/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc8.yaml
index 7c2507ace7..9c37e6359d 100644
--- a/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc8.yaml
+++ b/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc8.yaml
@@ -6,7 +6,6 @@ cuda_graph_config:
 enable_attention_dp: false
 kv_cache_config:
   dtype: auto
-  enable_block_reuse: false
   free_gpu_memory_fraction: 0.85
 moe_config:
   backend: TRITON
diff --git a/examples/disaggregated/README.md b/examples/disaggregated/README.md
index 8b99f8845f..64dd80cbdf 100644
--- a/examples/disaggregated/README.md
+++ b/examples/disaggregated/README.md
@@ -23,10 +23,10 @@ cache_transceiver_config:
   kv_transfer_sender_future_timeout_ms:
 ```
 
-The following is an example, consisting of the `ctx_extra-llm-api-config.yaml` and `gen_extra-llm-api-config.yaml` files needed in the sections below.
+The following is an example, consisting of the `ctx_config.yaml` and `gen_config.yaml` files needed in the sections below.
 
 ```yaml
-# ctx_extra-llm-api-config.yaml
+# ctx_config.yaml
 
 # The overlap scheduler for context servers is currently disabled, as it is
 # not yet supported in disaggregated context server architectures.
@@ -37,7 +37,7 @@ cache_transceiver_config:
 ```
 
 ```yaml
-# gen_extra-llm-api-config.yaml
+# gen_config.yaml
 
 cache_transceiver_config:
   backend: UCX
@@ -54,16 +54,16 @@ Suppose we have three CUDA devices on the same machine. The first two devices ar
 # Start context servers
 CUDA_VISIBLE_DEVICES=0 trtllm-serve TinyLlama/TinyLlama-1.1B-Chat-v1.0 \
     --host localhost --port 8001 \
-    --extra_llm_api_options ./ctx_extra-llm-api-config.yaml &> log_ctx_0 &
+    --config ./ctx_config.yaml &> log_ctx_0 &
 
 CUDA_VISIBLE_DEVICES=1 trtllm-serve TinyLlama/TinyLlama-1.1B-Chat-v1.0 \
     --host localhost --port 8002 \
-    --extra_llm_api_options ./ctx_extra-llm-api-config.yaml &> log_ctx_1 &
+    --config ./ctx_config.yaml &> log_ctx_1 &
 
 # Start generation server
 CUDA_VISIBLE_DEVICES=2 trtllm-serve TinyLlama/TinyLlama-1.1B-Chat-v1.0 \
     --host localhost --port 8003 \
-    --extra_llm_api_options ./gen_extra-llm-api-config.yaml &> log_gen_0 &
+    --config ./gen_config.yaml &> log_gen_0 &
 ```
 
 Once the context and generation servers are launched, you can launch the disaggregated
@@ -131,16 +131,16 @@ After starting the node and entering interactive mode, you can run the following
 # Start context servers
 CUDA_VISIBLE_DEVICES=0 trtllm-llmapi-launch trtllm-serve TinyLlama/TinyLlama-1.1B-Chat-v1.0 \
     --host localhost --port 8001 \
-    --extra_llm_api_options ./ctx_extra-llm-api-config.yaml &> log_ctx_0 &
+    --config ./ctx_config.yaml &> log_ctx_0 &
 
 CUDA_VISIBLE_DEVICES=1 trtllm-llmapi-launch trtllm-serve TinyLlama/TinyLlama-1.1B-Chat-v1.0 \
     --host localhost --port 8002 \
-    --extra_llm_api_options ./ctx_extra-llm-api-config.yaml &> log_ctx_1 &
+    --config ./ctx_config.yaml &> log_ctx_1 &
 
 # Start generation server
 CUDA_VISIBLE_DEVICES=2 trtllm-llmapi-launch trtllm-serve TinyLlama/TinyLlama-1.1B-Chat-v1.0 \
     --host localhost --port 8003 \
-    --extra_llm_api_options ./gen_extra-llm-api-config.yaml &> log_gen_0 &
+    --config ./gen_config.yaml &> log_gen_0 &
 
 # Start proxy
 trtllm-llmapi-launch trtllm-serve disaggregated -c disagg_config.yaml
@@ -182,7 +182,7 @@ srun -A -p -t