/*
 * SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "tensorrt_llm/batch_manager/GptManager.h"
#include "tensorrt_llm/batch_manager/inferenceRequest.h"
#include "tensorrt_llm/batch_manager/namedTensor.h"
#include "tensorrt_llm/common/assert.h"
#include "tensorrt_llm/common/logger.h"
#include "tensorrt_llm/common/mpiUtils.h"
#include "tensorrt_llm/common/stringUtils.h"
#include "tensorrt_llm/executor/executor.h"
#include "tensorrt_llm/executor/tensor.h"
#include "tensorrt_llm/executor/types.h"
#include "tensorrt_llm/plugins/api/tllmPlugin.h"
#include "tensorrt_llm/runtime/common.h"
#include "tensorrt_llm/runtime/gptJsonConfig.h"
#include "tensorrt_llm/runtime/tllmLogger.h"
#include "tensorrt_llm/runtime/utils/numpyUtils.h"
#include "tensorrt_llm/runtime/worldConfig.h"
#include "utils/utils.h"

// Standard and third-party headers (the original angle-bracket targets were lost; this list is
// reconstructed from the symbols used below and may differ slightly from the original).
#include <algorithm>
#include <atomic>
#include <chrono>
#include <cstdint>
#include <cxxopts.hpp>
#include <filesystem>
#include <fstream>
#include <iostream>
#include <nlohmann/json.hpp>
#include <numeric>
#include <string>
#include <thread>

using namespace tensorrt_llm::batch_manager;
using namespace tensorrt_llm::runtime;
using namespace tensorrt_llm::benchmark;

namespace tc = tensorrt_llm::common;
namespace texec = tensorrt_llm::executor;
namespace mpi = tensorrt_llm::mpi;
namespace trt = nvinfer1;
namespace fs = std::filesystem;

namespace
{

using TensorPtr = ITensor::SharedPtr;

// Loads per-task LoRA weights and configs (model.lora_weights.npy / model.lora_config.npy) from a
// directory tree in which every task lives in its own subdirectory.
class LoraLib
{
public:
    LoraLib(std::string const& loraDir)
        : mLoraDir(loraDir)
        , mBufferManager(std::make_shared<CudaStream>())
        , mTaskPaths(parseDirPaths(mLoraDir))
        , mLoras(readLoras(mTaskPaths))
    {
    }

    TensorPtr getLoraWeights(uint64_t taskId) const
    {
        return mLoras.at(taskId).first;
    }

    TensorPtr getLoraConfig(uint64_t taskId) const
    {
        return mLoras.at(taskId).second;
    }

    void clear()
    {
        mLoras.clear();
    }

    std::map<uint64_t, std::pair<TensorPtr, TensorPtr>> const& getLoras()
    {
        return mLoras;
    }

private:
    std::string const mLoraDir;
    BufferManager mBufferManager;
    std::map<uint64_t, fs::path> mTaskPaths;
    std::map<uint64_t, std::pair<TensorPtr, TensorPtr>> mLoras;

    std::map<uint64_t, std::pair<TensorPtr, TensorPtr>> readLoras(std::map<uint64_t, fs::path> taskPaths)
    {
        std::map<uint64_t, std::pair<TensorPtr, TensorPtr>> loras;
        for (auto const& [id, p] : taskPaths)
        {
            TensorPtr loraWeights
                = utils::loadNpy(mBufferManager, (p / "model.lora_weights.npy").string(), MemoryType::kCPU);
            TensorPtr loraConfig
                = utils::loadNpy(mBufferManager, (p / "model.lora_config.npy").string(), MemoryType::kCPU);
            loras.insert_or_assign(id, std::make_pair(loraWeights, loraConfig));
        }
        return loras;
    }

    std::map<uint64_t, fs::path> parseDirPaths(std::string const& loraDir)
    {
        std::map<uint64_t, fs::path> taskPaths;
        if (loraDir == "")
        {
            return taskPaths;
        }
        for (auto const& entry : fs::recursive_directory_iterator(loraDir))
        {
            if (entry.is_directory())
            {
                auto taskId = parseId(entry.path());
                taskPaths.insert_or_assign(taskId, entry.path());
            }
        }
        return taskPaths;
    }

    uint64_t parseId(fs::path p)
    {
        auto fn = p.filename().string();
        auto dashPos = fn.find_first_of("-");
        std::string idStr = fn;
        if (dashPos != std::string::npos)
        {
            // Directories may be named "<taskId>" or "<taskId>-<suffix>"; keep only the leading id.
            idStr = fn.substr(0, dashPos);
        }
        uint64_t id = static_cast<uint64_t>(std::stoi(idStr));
        return id;
    }
};

} // namespace

struct BenchInfo
{
    BenchInfo() = default;

    BenchInfo(int inputLength, std::chrono::time_point<std::chrono::steady_clock> start)
        :
inputLength(inputLength) , start(start) { } int inputLength; int outputLength{0}; std::chrono::time_point start; std::chrono::time_point end; std::chrono::time_point firstTokenTs; float latency{}; // millisecond bool hasError{false}; float firstTokenLatency{}; std::optional avgGenT2TLatency{}; bool firstTokenSeen{false}; SizeType32 decodingIter{0}; }; class Recorder { using TensorPtr = ITensor::SharedPtr; public: explicit Recorder(std::string opCsvFile, bool streaming = false, int beamWidth = 1, std::string responsesJsonFile = "", bool excludeInputInOutput = false) : mOpCsvFile(std::move(opCsvFile)) , mStreaming(streaming) , mBeamWidth(beamWidth) , mRespJsonFile(std::move(responsesJsonFile)) , mOutputHasInput(!excludeInputInOutput) { } void initialize() { mStart = std::chrono::steady_clock::now(); mRequestsQueueingLatencies.clear(); } void finalize() { mEnd = std::chrono::steady_clock::now(); } void recordQueueLatency(std::vector const& latencies) { mRequestsQueueingLatencies.insert(mRequestsQueueingLatencies.end(), latencies.begin(), latencies.end()); } void recordStart(std::shared_ptr request, uint64_t requestId) { auto const inputLength = request->getInputIds()->getSize(); auto const maxNewTokens = request->getMaxNewTokensNamed(); auto const& outputLengthTensor = maxNewTokens.tensor; TLLM_CHECK_WITH_INFO(outputLengthTensor != nullptr && outputLengthTensor->getSize() > 0, "Undefined scalar vector for %s", maxNewTokens.name.c_str()); auto const outputLength = *bufferCast(*outputLengthTensor); auto const start = std::chrono::steady_clock::now(); mRequestBenchInfos[requestId] = BenchInfo(inputLength, start); } // number of output tokens not calculated from output sequence here, instead set to max_output_len // - if eos_id == -1 (default behavior), this is correct since output seq will have max permissible length. 
// - However, if eos_id != -1, the token size of output sequence may be less than max_output_len, and token // throughput may be inaccurate void recordStart(SizeType32 inputLength, SizeType32 maxNewTokens, uint64_t requestId, std::chrono::time_point const& start) { mRequestBenchInfos[requestId] = BenchInfo(inputLength, start); } void recordEnd(uint64_t requestId, bool hasError) { mRequestBenchInfos[requestId].end = std::chrono::steady_clock::now(); mRequestBenchInfos[requestId].hasError = hasError; } void recordToken(uint64_t requestId) { TLLM_CHECK(mStreaming); TLLM_CHECK_WITH_INFO(mBeamWidth == 1, "gptManagerBenchmark streaming mode does not support beam > 1"); if (!mRequestBenchInfos[requestId].firstTokenSeen) { mRequestBenchInfos[requestId].firstTokenTs = std::chrono::steady_clock::now(); mRequestBenchInfos[requestId].firstTokenSeen = true; } mRequestBenchInfos[requestId].decodingIter += 1; } void recordToken(uint64_t requestId, std::list const& responseTensors) { int32_t outputLength = 1; for (auto& tensor : responseTensors) { if (tensor.name == inference_request::kSequenceLengthTensorName) { // Tensor of shape nBeams, and we only need the first one outputLength = *(bufferCast(*(tensor.tensor))); break; } } mRequestBenchInfos[requestId].outputLength += outputLength; this->recordToken(requestId); } void recordToken(uint64_t requestId, texec::Response const& response) { auto outputTokenIds = response.getResult().outputTokenIds; int32_t outputLength = 1; for (auto const& beam : outputTokenIds) { outputLength = std::max(static_cast(beam.size()), outputLength); } mRequestBenchInfos[requestId].outputLength += outputLength; this->recordToken(requestId); } void recordEnd(uint64_t requestId, std::list const& responseTensors, bool hasError) { this->recordEnd(requestId, hasError); if (!mStreaming) { for (auto& tensor : responseTensors) { if (tensor.name == inference_request::kOutputIdsTensorName) { mResponseTensors[requestId] = tensor.tensor; } else if (tensor.name == inference_request::kSequenceLengthTensorName) { // Tensor of shape nBeams, and we only need the first one int32_t outputSeqLen = *(bufferCast(*(tensor.tensor))); if (mOutputHasInput) { int inputSeqLen = mRequestBenchInfos[requestId].inputLength; outputSeqLen -= inputSeqLen; } mRequestBenchInfos[requestId].outputLength = outputSeqLen; } } } else { this->recordToken(requestId, responseTensors); } } void recordEnd(uint64_t requestId, texec::Response const& response) { this->recordEnd(requestId, response.hasError()); // Get the actual output length if (!response.hasError()) { if (!mStreaming) { TLLM_LOG_DEBUG("response.getResult().outputTokenIds"); auto outputTokenIds = response.getResult().outputTokenIds; int32_t outSeqLen = 0; for (auto const& beam : outputTokenIds) { outSeqLen = std::max(static_cast(beam.size()), outSeqLen); } if (mOutputHasInput) { int inputSeqLen = mRequestBenchInfos[requestId].inputLength; outSeqLen -= inputSeqLen; } mRequestBenchInfos[requestId].outputLength = outSeqLen; mRequestBenchInfos[requestId].decodingIter = response.getResult().decodingIter; } else { this->recordToken(requestId, response); } } } float calcPercentile(std::vector const& latencies, int percentile) { int const index = static_cast(std::ceil((percentile / 100.0) * latencies.size())) - 1; return latencies[index]; } void calculateLatencies() { for (auto& reqInfo : mRequestBenchInfos) { reqInfo.second.latency = std::chrono::duration(reqInfo.second.end - reqInfo.second.start).count(); if (mStreaming) { reqInfo.second.firstTokenLatency = 
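                // Streaming metrics: firstTokenLatency is measured from request start to the first
                // recorded token, and avgGenT2TLatency (computed just below) averages the remaining
                // generation time over (outputLength - 1) inter-token gaps.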
std::chrono::duration(reqInfo.second.firstTokenTs - reqInfo.second.start) .count(); if (reqInfo.second.outputLength > 1) { reqInfo.second.avgGenT2TLatency = std::chrono::duration(reqInfo.second.end - reqInfo.second.firstTokenTs) .count() / static_cast(reqInfo.second.outputLength - 1); } } } } void calculateMetrics() { calculateLatencies(); std::vector reqLatencies; std::vector ftLatencies; std::vector genT2TLatencies; int totalOutputTokens{0}; int totalDecodingIter{0}; mNumErrorSamples = 0; mNumSamples = 0; for (auto reqInfo : mRequestBenchInfos) { if (!reqInfo.second.hasError) { reqLatencies.push_back(reqInfo.second.latency); totalOutputTokens += reqInfo.second.outputLength; totalDecodingIter += reqInfo.second.decodingIter; if (mStreaming) { ftLatencies.push_back(reqInfo.second.firstTokenLatency); if (reqInfo.second.avgGenT2TLatency) { genT2TLatencies.push_back(reqInfo.second.avgGenT2TLatency.value()); } } ++mNumSamples; } else { ++mNumErrorSamples; } } mTotalLatency = std::chrono::duration(mEnd - mStart).count(); mSeqThroughput = mNumSamples / (mTotalLatency / 1000); mTokenThroughput = totalOutputTokens / (mTotalLatency / 1000); mAcceptanceRate = totalDecodingIter ? (static_cast(totalOutputTokens) / static_cast(totalDecodingIter)) : 0.0f; mAvgSeqLatency = std::accumulate(reqLatencies.begin(), reqLatencies.end(), 0.F) / reqLatencies.size(); std::sort(reqLatencies.begin(), reqLatencies.end()); mP99SeqLatency = calcPercentile(reqLatencies, 99); mP90SeqLatency = calcPercentile(reqLatencies, 90); mP50SeqLatency = calcPercentile(reqLatencies, 50); mMaxSeqLatency = reqLatencies.back(); mMinSeqLatency = reqLatencies.front(); if (mStreaming) { mAvgFtLatency = std::accumulate(ftLatencies.begin(), ftLatencies.end(), 0.F) / ftLatencies.size(); std::sort(ftLatencies.begin(), ftLatencies.end()); mP99FtLatency = calcPercentile(ftLatencies, 99); mP90FtLatency = calcPercentile(ftLatencies, 90); mP50FtLatency = calcPercentile(ftLatencies, 50); mMaxFtLatency = ftLatencies.back(); mMinFtLatency = ftLatencies.front(); if (!genT2TLatencies.empty()) { mAvgGenT2TLatency = std::accumulate(genT2TLatencies.begin(), genT2TLatencies.end(), 0.F) / genT2TLatencies.size(); std::sort(genT2TLatencies.begin(), genT2TLatencies.end()); mP99GenT2TLatency = calcPercentile(genT2TLatencies, 99); mP90GenT2TLatency = calcPercentile(genT2TLatencies, 90); mP50GenT2TLatency = calcPercentile(genT2TLatencies, 50); mMaxGenT2TLatency = genT2TLatencies.back(); mMinGenT2TLatency = genT2TLatencies.front(); } mAvgReqQueueingLatency = std::accumulate(mRequestsQueueingLatencies.begin(), mRequestsQueueingLatencies.end(), 0.F) / mRequestsQueueingLatencies.size(); std::sort(mRequestsQueueingLatencies.begin(), mRequestsQueueingLatencies.end()); mP99ReqQueueingLatency = calcPercentile(mRequestsQueueingLatencies, 99); mP90ReqQueueingLatency = calcPercentile(mRequestsQueueingLatencies, 90); mP50ReqQueueingLatency = calcPercentile(mRequestsQueueingLatencies, 50); mMaxReqQueueingLatency = mRequestsQueueingLatencies.back(); mMinReqQueueingLatency = mRequestsQueueingLatencies.front(); } } void report() { printf("[BENCHMARK] num_samples %d\n", mNumSamples); printf("[BENCHMARK] num_error_samples %d\n", mNumErrorSamples); printf("\n[BENCHMARK] num_samples %d\n", mNumSamples); printf("[BENCHMARK] total_latency(ms) %.2f\n", mTotalLatency); printf("[BENCHMARK] seq_throughput(seq/sec) %.2f\n", mSeqThroughput); printf("[BENCHMARK] token_throughput(token/sec) %.2f\n", mTokenThroughput); printf("[BENCHMARK] avg_acceptance_rate(tokens/decoding steps) %.2f\n\n", 
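        // mAcceptanceRate is totalOutputTokens / totalDecodingIter (see calculateMetrics above); it
        // exceeds 1 only when speculative decoding accepts multiple tokens per decoding step.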
mAcceptanceRate); printf("[BENCHMARK] avg_sequence_latency(ms) %.2f\n", mAvgSeqLatency); printf("[BENCHMARK] max_sequence_latency(ms) %.2f\n", mMaxSeqLatency); printf("[BENCHMARK] min_sequence_latency(ms) %.2f\n", mMinSeqLatency); printf("[BENCHMARK] p99_sequence_latency(ms) %.2f\n", mP99SeqLatency); printf("[BENCHMARK] p90_sequence_latency(ms) %.2f\n", mP90SeqLatency); printf("[BENCHMARK] p50_sequence_latency(ms) %.2f\n\n", mP50SeqLatency); if (mStreaming) { printf("[BENCHMARK] avg_time_to_first_token(ms) %.2f\n", mAvgFtLatency); printf("[BENCHMARK] max_time_to_first_token(ms) %.2f\n", mMaxFtLatency); printf("[BENCHMARK] min_time_to_first_token(ms) %.2f\n", mMinFtLatency); printf("[BENCHMARK] p99_time_to_first_token(ms) %.2f\n", mP99FtLatency); printf("[BENCHMARK] p90_time_to_first_token(ms) %.2f\n", mP90FtLatency); printf("[BENCHMARK] p50_time_to_first_token(ms) %.2f\n\n", mP50FtLatency); printf("[BENCHMARK] avg_inter_token_latency(ms) %.2f\n", mAvgGenT2TLatency); printf("[BENCHMARK] max_inter_token_latency(ms) %.2f\n", mMaxGenT2TLatency); printf("[BENCHMARK] min_inter_token_latency(ms) %.2f\n", mMinGenT2TLatency); printf("[BENCHMARK] p99_inter_token_latency(ms) %.2f\n", mP99GenT2TLatency); printf("[BENCHMARK] p90_inter_token_latency(ms) %.2f\n", mP90GenT2TLatency); printf("[BENCHMARK] p50_inter_token_latency(ms) %.2f\n\n", mP50GenT2TLatency); printf("[BENCHMARK] avg_request_queueing_latency(ms) %.2f\n", mAvgReqQueueingLatency); printf("[BENCHMARK] max_request_queueing_latency(ms) %.2f\n", mMaxReqQueueingLatency); printf("[BENCHMARK] min_request_queueing_latency(ms) %.2f\n", mMinReqQueueingLatency); printf("[BENCHMARK] p99_request_queueing_latency(ms) %.2f\n", mP99ReqQueueingLatency); printf("[BENCHMARK] p90_request_queueing_latency(ms) %.2f\n", mP90ReqQueueingLatency); printf("[BENCHMARK] p50_request_queueing_latency(ms) %.2f\n\n", mP50ReqQueueingLatency); } } void writeOpMetricsToCsv() { if (!mOpCsvFile.empty()) { std::vector headers = {"num_samples", "num_error_samples", "total_latency(ms)", "seq_throughput(seq/sec)", "token_throughput(token/sec)", "avg_sequence_latency(ms)", "max_sequence_latency(ms)", "min_sequence_latency(ms)", "p99_sequence_latency(ms)", "p90_sequence_latency(ms)", "p50_sequence_latency(ms)", "avg_acceptance_rate(tokens/decoding steps)"}; if (mStreaming) { std::vector streamingHeaders = {"avg_time_to_first_token(ms)", "max_time_to_first_token(ms)", "min_time_to_first_token(ms)", "p99_time_to_first_token(ms)", "p90_time_to_first_token(ms)", "p50_time_to_first_token(ms)", "avg_inter_token_latency(ms)", "max_inter_token_latency(ms)", "min_inter_token_latency(ms)", "p99_inter_token_latency(ms)", "p90_inter_token_latency(ms)", "p50_inter_token_latency(ms)"}; headers.insert(headers.end(), streamingHeaders.begin(), streamingHeaders.end()); } std::ofstream outputFile(mOpCsvFile); if (outputFile.is_open()) { for (auto const& header : headers) { outputFile << header << ","; } outputFile << "\n"; outputFile << mNumSamples << "," << mNumErrorSamples << "," << mTotalLatency << "," << mSeqThroughput << "," << mTokenThroughput << "," << mAvgSeqLatency << "," << mMaxSeqLatency << "," << mMinSeqLatency << "," << mP99SeqLatency << "," << mP90SeqLatency << "," << mP50SeqLatency << "," << mAcceptanceRate; if (mStreaming) { outputFile << "," << mAvgFtLatency << "," << mMaxFtLatency << "," << mMinFtLatency << "," << mP99FtLatency << "," << mP90FtLatency << "," << mP50FtLatency << "," << mAvgGenT2TLatency << "," << mMaxGenT2TLatency << "," << mMinGenT2TLatency << "," << 
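                    // Streaming columns are emitted in the same order as the streamingHeaders vector
                    // above; a new metric must be added to both places to keep the CSV aligned.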
mP99GenT2TLatency << "," << mP90GenT2TLatency << "," << mP50GenT2TLatency; } outputFile << "\n"; } else { std::cerr << "Error opening file '" << mOpCsvFile << "' for writing.\n"; } } } void dumpResponseSeqs() { if (mRespJsonFile.empty()) return; nlohmann::json jsonResponses = nlohmann::json::array(); for (auto const& [respId, respTokensTensor] : mResponseTensors) { int inputLength = mRequestBenchInfos[respId].inputLength; int outputLength = mRequestBenchInfos[respId].outputLength; std::vector outputTokens(outputLength); int32_t* outputToksBufferPtr = bufferCast(*respTokensTensor); if (mOutputHasInput) outputToksBufferPtr += inputLength; std::copy(outputToksBufferPtr, outputToksBufferPtr + outputLength, outputTokens.begin()); nlohmann::json currResp; currResp["response_id"] = respId; currResp["response_tokens"] = outputTokens; jsonResponses.push_back(currResp); } std::ofstream outFile(mRespJsonFile); outFile << jsonResponses; outFile.close(); } private: std::unordered_map mRequestBenchInfos; std::chrono::time_point mStart; std::chrono::time_point mEnd; int mNumSamples{}; int mNumErrorSamples{}; float mTotalLatency{}; float mSeqThroughput{}; float mAvgSeqLatency{}; float mAvgGenT2TLatency{}; float mAvgFtLatency{}; float mTokenThroughput{}; float mAcceptanceRate{}; float mP99SeqLatency{}; float mP90SeqLatency{}; float mP50SeqLatency{}; float mMaxSeqLatency{}; float mMinSeqLatency{}; float mP99FtLatency{}; float mP90FtLatency{}; float mP50FtLatency{}; float mMaxFtLatency{}; float mMinFtLatency{}; float mP99GenT2TLatency{}; float mP90GenT2TLatency{}; float mP50GenT2TLatency{}; float mMaxGenT2TLatency{}; float mMinGenT2TLatency{}; float mAvgReqQueueingLatency{}; float mP99ReqQueueingLatency{}; float mP90ReqQueueingLatency{}; float mP50ReqQueueingLatency{}; float mMaxReqQueueingLatency{}; float mMinReqQueueingLatency{}; std::vector mRequestsQueueingLatencies{}; std::string mOpCsvFile; bool mStreaming; int mBeamWidth; std::string mRespJsonFile; std::unordered_map mResponseTensors; bool mOutputHasInput; }; // class Recorder class ExecutorServer { public: ExecutorServer(std::optional const& decoderTrtEnginePath, std::optional const& encoderTrtEnginePath, texec::BatchingType batchingType, int32_t maxBeamWidth, texec::CapacitySchedulerPolicy capacitySchedulerPolicy, BenchmarkParams const& benchmarkParams, std::shared_ptr recorder, std::chrono::milliseconds waitSleep, bool logIterationData, texec::ModelType executorModelType) : mRecorder(std::move(recorder)) , mWaitSleep(waitSleep) , mConcurrency(benchmarkParams.concurrency) , mActiveCount(0) , mNumFinished(0) , mShutdown(false) , mLogIterationData(logIterationData) { texec::DynamicBatchConfig dynamicBatchConfig( benchmarkParams.enableBatchSizeTuning, benchmarkParams.enableMaxNumTokensTuning); texec::SchedulerConfig schedulerConfig(capacitySchedulerPolicy, std::nullopt, dynamicBatchConfig); texec::KvCacheConfig kvCacheConfig(benchmarkParams.enableBlockReuse, benchmarkParams.maxTokensInPagedKvCache, benchmarkParams.maxAttentionWindowVec, benchmarkParams.sinkTokenLength, benchmarkParams.freeGpuMemoryFraction, benchmarkParams.kvHostCacheSize, benchmarkParams.kvOnboardBlocks, benchmarkParams.crossKvCacheFraction); texec::PeftCacheConfig peftCacheConfig(0, benchmarkParams.loraDeviceNumModLayers, 8, 64, 4, 4, 4, 24, 8, std::nullopt, benchmarkParams.loraHostCacheSize); texec::ExtendedRuntimePerfKnobConfig extendedRuntimePerfKnobConfig(benchmarkParams.multiBlockMode, benchmarkParams.enableContextFMHAFP32Acc, benchmarkParams.cudaGraphMode, 
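            // The config objects assembled above (scheduler, KV cache, PEFT cache, runtime perf knobs)
            // are folded into a single texec::ExecutorConfig below before the Executor is created.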
benchmarkParams.cudaGraphCacheSize); texec::ExecutorConfig executorConfig( maxBeamWidth, schedulerConfig, kvCacheConfig, benchmarkParams.enableChunkedContext, true); executorConfig.setGpuWeightsPercent(benchmarkParams.gpuWeightsPercent); executorConfig.setPeftCacheConfig(peftCacheConfig); executorConfig.setBatchingType(batchingType); if (benchmarkParams.maxBatchSize) { executorConfig.setMaxBatchSize(benchmarkParams.maxBatchSize.value()); } if (benchmarkParams.maxNumTokens) { executorConfig.setMaxNumTokens(benchmarkParams.maxNumTokens.value()); } executorConfig.setDecodingConfig( texec::DecodingConfig(benchmarkParams.medusaChoices.has_value() ? texec::DecodingMode::Medusa() : benchmarkParams.executorLookaheadConfig.has_value() ? texec::DecodingMode::Lookahead() : texec::DecodingMode::Auto(), benchmarkParams.executorLookaheadConfig, benchmarkParams.medusaChoices)); executorConfig.setExtendedRuntimePerfKnobConfig(extendedRuntimePerfKnobConfig); if (executorModelType == texec::ModelType::kDECODER_ONLY) { mExecutor = std::make_unique(decoderTrtEnginePath.value(), executorModelType, executorConfig); } else if (executorModelType == texec::ModelType::kENCODER_DECODER) { mExecutor = std::make_unique( encoderTrtEnginePath.value(), decoderTrtEnginePath.value(), executorModelType, executorConfig); } else if (executorModelType == texec::ModelType::kENCODER_ONLY) { mExecutor = std::make_unique(encoderTrtEnginePath.value(), executorModelType, executorConfig); } else { TLLM_LOG_ERROR("not a supported executor model type in executor server."); } auto const& world = tensorrt_llm::mpi::MpiComm::world(); auto worldRank = world.getRank(); if (worldRank == 0) { mCollectStatsThread = std::thread(&ExecutorServer::collectStats, this); } } ~ExecutorServer() { mShutdown = true; if (mCollectStatsThread.joinable()) { mCollectStatsThread.join(); } } void enqueue(std::vector requests, bool warmup = false) { try { std::vector inputLengths; std::vector maxNewTokens; for (auto const& request : requests) { inputLengths.push_back(request.getInputTokenIds().size()); maxNewTokens.push_back(request.getMaxTokens()); } auto const start = std::chrono::steady_clock::now(); auto reqIds = mExecutor->enqueueRequests(std::move(requests)); for (int req = 0; req < reqIds.size(); ++req) { if (!warmup) { mRecorder->recordStart(inputLengths.at(req), maxNewTokens.at(req), reqIds.at(req), start); } mActiveCount++; } } catch (std::exception const& e) { TLLM_THROW("%s", e.what()); } } void resetNumFinished() { mNumFinished = 0; } bool canEnqueue(int numSentRequests) const { return !mConcurrency || (numSentRequests - mNumFinished < mConcurrency); } void waitForResponses(SizeType32 numRequests, bool warmup = false) { while (mActiveCount || (mNumFinished < numRequests)) { auto responses = mExecutor->awaitResponses(mWaitSleep); for (auto const& response : responses) { auto const reqId = response.getRequestId(); TLLM_LOG_DEBUG("response.getResult().isFinal"); if (response.getResult().isFinal) { mActiveCount--; mNumFinished++; if (!warmup) { mRecorder->recordEnd(reqId, response); } } else { if (!warmup && !response.hasError()) { mRecorder->recordToken(reqId, response); } } } } } void collectStats() const { while (!mShutdown) { auto iterStats = mExecutor->getLatestIterationStats(); for (auto const& iterStat : iterStats) { SizeType32 numNewActiveRequests = iterStat.numNewActiveRequests; if (numNewActiveRequests > 0) { float avgQueueingTime = static_cast(iterStat.newActiveRequestsQueueLatencyMS / numNewActiveRequests); std::vector 
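                    // Only the aggregate newActiveRequestsQueueLatencyMS is available from the
                    // iteration stats, so the per-iteration average is replicated once per newly
                    // activated request before being handed to the recorder.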
requestsQueueLatencyMS(numNewActiveRequests, avgQueueingTime); mRecorder->recordQueueLatency(requestsQueueLatencyMS); } if (mLogIterationData) { TLLM_LOG_INFO(texec::JsonSerialization::toJsonStr(iterStat)); } } auto const waitSleep = std::chrono::milliseconds(50); std::this_thread::sleep_for(waitSleep); } } private: std::unique_ptr mExecutor; std::thread mCollectStatsThread; std::shared_ptr mRecorder; std::chrono::milliseconds mWaitSleep; std::optional mConcurrency; std::atomic mActiveCount; std::atomic mNumFinished; std::atomic mShutdown; bool mLogIterationData; }; // class ExecutorServer namespace { texec::Request makeExecutorRequest(Sample const& sample, SizeType32 const& beamWidth, std::optional const& eosId, std::optional const& padId, bool streaming = false, bool const& returnContextLogits = false, bool const& returnGenerationLogits = false, std::optional const& loraConfig = std::nullopt, std::optional const& lookaheadConfig = std::nullopt, std::optional encoderInputTokenIds = std::nullopt) { auto samplingConfig = texec::SamplingConfig{beamWidth}; auto outputConfig = texec::OutputConfig{false, returnContextLogits, returnGenerationLogits, false}; return texec::Request(sample.inputIds, sample.outputLen, streaming, samplingConfig, outputConfig, eosId, padId, std::nullopt, // positionIds std::nullopt, // badWords std::nullopt, // stopWords std::nullopt, // embeddingBias std::nullopt, // speculativeDecoding std::nullopt, // pTuning std::nullopt, // mRopeConfig loraConfig, // loraConfig lookaheadConfig, // lookaheadConfig std::nullopt, // kvCacheRetentionConfig std::nullopt, // logitsPostProcessorName encoderInputTokenIds.has_value() ? encoderInputTokenIds : std::nullopt); } void benchmarkExecutor(std::optional const& decoderEngineDir, std::optional const& encoderEngineDir, texec::BatchingType batchingType, std::string const& datasetPath, std::string const& opCsvFile, int maxNumSamples, int beamWidth, int warmUp, std::optional const& eosId, std::optional const& padId, BenchmarkParams const& benchmarkParams, texec::CapacitySchedulerPolicy capacitySchedulerPolicy, std::chrono::milliseconds waitSleep, bool returnContextLogits, bool returnGenerationLogits, std::optional const staticEmulatedBatchSize, bool logIterationData, std::optional const maxPromptLen, texec::ModelType executorModelType) { auto const& world = tensorrt_llm::mpi::MpiComm::world(); auto worldRank = world.getRank(); // Load dataset auto const samples = parseWorkloadJson(datasetPath, maxNumSamples, maxPromptLen); auto const numSamples = samples.size(); auto recorder = std::make_shared(opCsvFile, benchmarkParams.streaming, beamWidth); int32_t decoderStartTokenId = 0; std::shared_ptr executorServer; if (executorModelType == texec::ModelType::kDECODER_ONLY) { TLLM_CHECK_WITH_INFO( decoderEngineDir.has_value(), "decoder models require a path to decoder engine in executor benchmark."); executorServer = std::make_shared(decoderEngineDir.value(), std::nullopt, batchingType, beamWidth, capacitySchedulerPolicy, benchmarkParams, recorder, waitSleep, logIterationData, executorModelType); } else if (executorModelType == texec::ModelType::kENCODER_DECODER) { TLLM_CHECK_WITH_INFO(encoderEngineDir.has_value(), "encoder-decoder models require a path to encoder engine in executor benchmark."); executorServer = std::make_shared(decoderEngineDir.value(), encoderEngineDir.value(), batchingType, beamWidth, capacitySchedulerPolicy, benchmarkParams, recorder, waitSleep, logIterationData, executorModelType); try { std::ifstream 
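        // For encoder-decoder engines, decoder_start_token_id is read from the decoder's config.json
        // ("pretrained_config" section) and later used as the single-token decoder input of each request.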
decoderJsonConfigPath(decoderEngineDir.value() / "config.json"); auto const decoderPretrainedConfig = nlohmann::json::parse(decoderJsonConfigPath, nullptr, true, true).at("pretrained_config"); decoderStartTokenId = decoderPretrainedConfig.at("decoder_start_token_id").template get(); } catch (nlohmann::json::out_of_range& e) { TLLM_LOG_ERROR( "Parameter %s cannot be read from decoder config.json in pretrained_config. Using default id %d.", std::string("decoder_start_token_id").c_str(), decoderStartTokenId); } catch (nlohmann::json::type_error const& e) { TLLM_LOG_ERROR( "Parameter %s has error type in decoder config.json in pretrained_config. Using default id %d.", std::string("decoder_start_token_id").c_str(), decoderStartTokenId); } } else if (executorModelType == texec::ModelType::kENCODER_ONLY) { TLLM_CHECK_WITH_INFO( encoderEngineDir.has_value(), "encoder models require a path to encoder engine in executor benchmark."); executorServer = std::make_shared(std::nullopt, encoderEngineDir.value(), batchingType, beamWidth, capacitySchedulerPolicy, benchmarkParams, recorder, waitSleep, logIterationData, executorModelType); } else { TLLM_LOG_ERROR("not a supported executor model type in executor benchmark."); return; } if (worldRank == 0) { if (benchmarkParams.loraDir) { auto startLoraLoad = std::chrono::steady_clock::now(); LoraLib loras(benchmarkParams.loraDir.value()); std::vector requests; for (auto& [taskId, p] : loras.getLoras()) { // squeeze lora configs and weights since LoraConfig requires them to be 2D tensors p.first->squeeze(0); p.second->squeeze(0); texec::LoraConfig loraConfig( taskId, texec::detail::ofITensor(p.first), texec::detail::ofITensor(p.second)); if (executorModelType == texec::ModelType::kENCODER_DECODER) { Sample s{std::vector{decoderStartTokenId}, 1, static_cast(taskId)}; requests.emplace_back(makeExecutorRequest(s, beamWidth, eosId, padId, false, false, false, loraConfig, std::nullopt, std::vector{1, 2, 3, 4, 5})); } else { Sample s{std::vector{1, 2, 3, 4, 5}, 1, static_cast(taskId)}; requests.emplace_back( makeExecutorRequest(s, beamWidth, eosId, padId, false, false, false, loraConfig, std::nullopt)); } } executorServer->enqueue(std::move(requests), true); executorServer->waitForResponses(loras.getLoras().size(), true); auto endLoraLoad = std::chrono::steady_clock::now(); printf("[BENCHMARK] time to preload LoRAs(ms) %.2f\n", std::chrono::duration(endLoraLoad - startLoraLoad).count()); } // Warm up { std::vector requests; for (auto i = 0; i < warmUp; ++i) { if (executorModelType == texec::ModelType::kENCODER_DECODER) { Sample s{std::vector{decoderStartTokenId}, samples[0].outputLen, samples[0].taskId}; requests.emplace_back(makeExecutorRequest(s, beamWidth, eosId, padId, benchmarkParams.streaming, returnContextLogits, returnGenerationLogits, std::nullopt, benchmarkParams.requestLookaheadConfig, samples[0].inputIds)); } else { requests.emplace_back(makeExecutorRequest(samples[0], beamWidth, eosId, padId, benchmarkParams.streaming, returnContextLogits, returnGenerationLogits, std::nullopt, benchmarkParams.requestLookaheadConfig)); } } executorServer->enqueue(std::move(requests), true); executorServer->waitForResponses(warmUp, true); } // Benchmark { auto timeDelays = computeTimeDelays(benchmarkParams, numSamples - 1); // Create requests recorder->initialize(); std::vector requests; for (std::size_t i = 0; i < numSamples; ++i) { std::optional loraConfig; if (samples[i].taskId >= 0) { loraConfig = texec::LoraConfig(samples[i].taskId); } if (executorModelType == 
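                // Encoder-decoder requests: the dataset prompt becomes the encoder input while the
                // decoder input is just the start token; decoder-only requests use the prompt directly.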
texec::ModelType::kENCODER_DECODER) { Sample s{std::vector{decoderStartTokenId}, samples[i].outputLen, samples[i].taskId}; requests.emplace_back(makeExecutorRequest(s, beamWidth, eosId, padId, benchmarkParams.streaming, returnContextLogits, returnGenerationLogits, loraConfig, benchmarkParams.requestLookaheadConfig, samples[i].inputIds)); } else { requests.emplace_back(makeExecutorRequest(samples[i], beamWidth, eosId, padId, benchmarkParams.streaming, returnContextLogits, returnGenerationLogits, loraConfig, benchmarkParams.requestLookaheadConfig)); } } bool const hasDelay = std::any_of(timeDelays.begin(), timeDelays.end(), [](auto const& delay) { return delay > 0.0; }); executorServer->resetNumFinished(); if (!staticEmulatedBatchSize) { // Launch a thread that will wait for responses std::thread waitThread( [numSamples, executorServer]() { executorServer->waitForResponses(numSamples); }); // Enqueue requests one by one int numSentRequests = 0; while (numSentRequests < numSamples) { if (executorServer->canEnqueue(numSentRequests)) { executorServer->enqueue({requests.at(numSentRequests)}); if (hasDelay && numSentRequests < numSamples - 1) { std::this_thread::sleep_for( std::chrono::milliseconds(static_cast(timeDelays.at(numSentRequests) * 1000))); } numSentRequests += 1; } } waitThread.join(); } else { TLLM_CHECK_WITH_INFO( !hasDelay, "Executor benchmark doesn't support delays with emulated static batch sizes"); SizeType32 numRequests = requests.size(); SizeType32 maxBatchSize = staticEmulatedBatchSize.value(); for (SizeType32 req = 0; req < numRequests; req += maxBatchSize) { auto batchSize = std::min(maxBatchSize, numRequests - req); std::vector requestsBatch(std::make_move_iterator(requests.begin() + req), std::make_move_iterator(requests.begin() + req + batchSize)); // Enqueue in batches executorServer->enqueue(std::move(requestsBatch)); // Wait for current batch to be done executorServer->waitForResponses(batchSize); } } } recorder->finalize(); recorder->calculateMetrics(); recorder->report(); recorder->writeOpMetricsToCsv(); // Send terminateReqId to terminate servers on all ranks // Sever on rank 0 will broadcast the terminate signal to other servers on multi-GPU cases // gptServer->enqueue(std::make_shared(terminateReqId)); } } } // namespace int main(int argc, char* argv[]) { cxxopts::Options options( "TensorRT-LLM BatchManager Benchmark", "TensorRT-LLM BatchManager Benchmark for GPT and GPT-like models."); options.add_options()("h,help", "Print usage"); options.add_options()("engine_dir, decoder_engine_dir", "Directory that store the engines of decoder models.", cxxopts::value()); options.add_options()( "encoder_engine_dir", "Directory that store the engines of the encoder models.", cxxopts::value()); options.add_options()( "api", "API type: gptManager or executor.", cxxopts::value()->default_value("executor")); options.add_options()("type", "Batching type: choose between inflight/static. 
(IFB/V1 options are going to be deprecated)", cxxopts::value()->default_value("inflight")); options.add_options()("dataset", "Dataset that is used for benchmarking BatchManager.", cxxopts::value()->default_value("")); options.add_options()( "output_csv", "Write output metrics to CSV", cxxopts::value()->default_value("")); options.add_options()("max_num_samples", "maximum number of samples to use from dataset/generate", cxxopts::value()->default_value("100000")); options.add_options()( "beam_width", "Specify beam width you want to benchmark.", cxxopts::value()->default_value("1")); options.add_options()( "warm_up", "Specify warm up iterations before benchmark starts.", cxxopts::value()->default_value("2")); options.add_options()( "eos_id", "Specify the end-of-sequence token id.", cxxopts::value()->default_value("-1")); options.add_options()("pad_id", "Specify the padding token id.", cxxopts::value()); options.add_options()("max_tokens_in_paged_kvcache", "Max tokens in paged K-V Cache.", cxxopts::value()); options.add_options()( "max_attention_window", "Max KV cache length per sequence", cxxopts::value>()); options.add_options()("sink_token_len", "Sink token length in kv cache per sequence.", cxxopts::value()); options.add_options()( "random_seed", "integer random seed for exponential time delays.", cxxopts::value()->default_value("420")); options.add_options()( "kv_cache_free_gpu_mem_fraction", "K-V Cache Free Gpu Mem Fraction.", cxxopts::value()); options.add_options()( "cross_kv_cache_fraction", "Cross K-V Cache Fraction (from 0.0 to 1.0).", cxxopts::value()); options.add_options()("request_rate", "request rate in reqs/sec. Skipping this arg or negative value will trigger offline/0-delay.", cxxopts::value()); options.add_options()("concurrency", "Concurrent number of connections with the server.", cxxopts::value()); options.add_options()("max_batch_size", "The max runtime batch size when benchmarking", cxxopts::value()); options.add_options()( "max_num_tokens", "The max runtime number of tokens per batch when benchmarking", cxxopts::value()); options.add_options()( "enable_batch_size_tuning", "Dynamic tuning of batch size", cxxopts::value()->default_value("false")); options.add_options()("enable_max_num_tokens_tuning", "Dynamic tuning of max num tokens", cxxopts::value()->default_value("false")); options.add_options()("enable_exp_delays", "Enables exponential delay distr to mimic real world request arrival", cxxopts::value()->default_value("false")); options.add_options()("streaming", "Operate in streaming mode", cxxopts::value()->default_value("false")); options.add_options()( "enable_kv_cache_reuse", "Enables the KV cache reuse.", cxxopts::value()->default_value("false")); options.add_options()("enable_chunked_context", "Whether to enable context chunking.", cxxopts::value()->default_value("false")); options.add_options()( "return_context_logits", "Whether to return context logits.", cxxopts::value()->default_value("false")); options.add_options()("return_generation_logits", "Whether to return generation logits.", cxxopts::value()->default_value("false")); options.add_options()("scheduler_policy", "Choose scheduler policy between max_utilization/guaranteed_no_evict/static_batch.", cxxopts::value()->default_value("guaranteed_no_evict")); options.add_options()("static_emulated_batch_size", "Emulate static batching performance with the provided batch size.", cxxopts::value()); options.add_options()("log_level", "Choose log level between verbose/info/warning/error/internal_error.", 
cxxopts::value()->default_value("error")); options.add_options()("log_iteration_data", "On each decoder iteration, print batch state metadata.", cxxopts::value()->default_value("false")); options.add_options()("wait_sleep", "Specify how many milliseconds to sleep each iteration of waitForEmpty loop.", cxxopts::value()->default_value("25")); options.add_options()("lora_dir", "Directory containing LoRAs", cxxopts::value()->default_value("")); options.add_options()("lora_host_cache_bytes", "LoRA host cache memory in bytes", cxxopts::value()); options.add_options()("lora_num_device_mod_layers", "LoRA number 1d cache rows", cxxopts::value()); options.add_options()("kv_host_cache_bytes", "Size of secondary memory pool used for offloading kv cache blocks (in bytes).", cxxopts::value()->default_value("0")); options.add_options()("kv_onboard_blocks", "If offloaded blocks should be onboarded to primary memory before reuse", cxxopts::value()->default_value("true")); options.add_options()( "max_prompt_len", "Truncate all prompts from dataset to the length specified.", cxxopts::value()); options.add_options()("gpu_weights_percent", "Specify the percentage of weights that reside on GPU (from 0.0 to 1.0).", cxxopts::value()->default_value("1.0")); options.add_options()( "medusa_choices", "Medusa choices in the format of [[0], [0, 1], [0, 0, 1]]", cxxopts::value()); options.add_options()("multi_block_mode", "Distribute the work across multiple CUDA thread-blocks on the GPU for masked MHA kernel", cxxopts::value()->default_value("true")); options.add_options()("cuda_graph_mode", "When enabled, inference is executed with cuda graph.", cxxopts::value()->default_value("false")); options.add_options()("cuda_graph_cache_size", "Specify how many cuda graphs are cached in the runtime. 
Larger cache gives better perf, but consumes more GPU " "memory.", cxxopts::value()->default_value("0")); options.add_options()("enable_context_fmha_fp32_acc", "Enable FMHA runner FP32 accumulation", cxxopts::value()->default_value("false")); options.add_options()("executor_lookahead_config", "lookahead config in the format of [max_window_size, max_ngram_size, max_verification_set_size]", cxxopts::value()); options.add_options()("request_lookahead_config", "lookahead config in the format of [max_window_size, max_ngram_size, max_verification_set_size], and each <= " "executor lookahead config", cxxopts::value()); auto result = options.parse(argc, argv); if (result.count("help")) { std::cout << options.help() << std::endl; return 0; } // Argument: Engine directory if (!result.count("engine_dir") && !result.count("encoder_engine_dir")) { std::cout << options.help() << std::endl; TLLM_LOG_ERROR("Please specify engine directory."); return 1; } // Argument: Batching Type auto const type = result["type"].as(); texec::BatchingType batchingType{texec::BatchingType::kINFLIGHT}; if (type == "V1" || type == "static") { if (type == "V1") { TLLM_LOG_WARNING("type option \"V1\" is going to be renamed to \"static\"."); } batchingType = texec::BatchingType::kSTATIC; } else if (type == "IFB" || type == "inflight") { if (type == "IFB") { TLLM_LOG_WARNING("type option \"IFB\" is going to be renamed to \"inflight\"."); } batchingType = texec::BatchingType::kINFLIGHT; } else { TLLM_LOG_ERROR("Unexpected batching type: %s", type.c_str()); return 1; } // Argument: Dataset auto const datasetPath = result["dataset"].as(); auto const maxNumSamples = result["max_num_samples"].as(); // Argument: Output metrics CSV auto const opCsvFile = result["output_csv"].as(); // Argument: beam width auto const beamWidth = result["beam_width"].as(); // Argument: wait_sleep auto const waitSleep = std::chrono::milliseconds(result["wait_sleep"].as()); BenchmarkParams benchmarkParams; // Argument: Max tokens in paged K-V Cache if (result.count("max_tokens_in_paged_kvcache")) { benchmarkParams.maxTokensInPagedKvCache = result["max_tokens_in_paged_kvcache"].as(); } // Argument: Max KV cache length if (result.count("max_attention_window")) { benchmarkParams.maxAttentionWindowVec = result["max_attention_window"].as>(); } // Argument: Sink token length if (result.count("sink_token_len")) { benchmarkParams.sinkTokenLength = result["sink_token_len"].as(); } if (result.count("random_seed")) { benchmarkParams.randomSeed = result["random_seed"].as(); } // Argument: K-V Cache Free Gpu Mem Fraction if (result.count("kv_cache_free_gpu_mem_fraction")) { benchmarkParams.freeGpuMemoryFraction = result["kv_cache_free_gpu_mem_fraction"].as(); } // Argument: K-V Cache Cross Attention Fraction. Only applicable to enc-dec models. if (result.count("encoder_engine_dir") && result.count("decoder_engine_dir")) { if (result.count("cross_kv_cache_fraction")) { benchmarkParams.crossKvCacheFraction = result["cross_kv_cache_fraction"].as(); } else { benchmarkParams.crossKvCacheFraction = 0.5f; // default value if not set. 
but non enc-dec should not even have this param set } } // Argument: Enable dynamic tuning of batch size benchmarkParams.enableBatchSizeTuning = result["enable_batch_size_tuning"].as(); // Argument: Enable dynamic tuning of max num tokens benchmarkParams.enableMaxNumTokensTuning = result["enable_max_num_tokens_tuning"].as(); // Argument: Enable KV cache reuse benchmarkParams.enableBlockReuse = result["enable_kv_cache_reuse"].as(); // Argument: streaming benchmarkParams.streaming = result["streaming"].as(); TLLM_CHECK_WITH_INFO(!(result.count("request_rate") && result.count("concurrency")), "request_rate and concurrency cannot be specified at the same time."); // Argument: request rate if (result.count("request_rate")) { benchmarkParams.requestRate = result["request_rate"].as(); } // Argument: concurrency if (result.count("concurrency")) { benchmarkParams.concurrency = result["concurrency"].as(); } // Argument: request rate if (result.count("max_batch_size")) { benchmarkParams.maxBatchSize = result["max_batch_size"].as(); } // Argument: request rate if (result.count("max_num_tokens")) { benchmarkParams.maxNumTokens = result["max_num_tokens"].as(); } benchmarkParams.enableExpDelays = result["enable_exp_delays"].as(); // Argument: Enable batch stats output bool logIterationData = result["log_iteration_data"].as(); if (logIterationData) { TLLM_LOG_WARNING("Setting log_iteration_data to true adds overheads and may result in lower perf"); } // Argument: Enable chunked context benchmarkParams.enableChunkedContext = result["enable_chunked_context"].as(); // Argument: Enable return context logits bool returnContextLogits = result["return_context_logits"].as(); // Argument: Enable return context logits bool returnGenerationLogits = result["return_generation_logits"].as(); if (result.count("lora_dir")) { benchmarkParams.loraDir = result["lora_dir"].as(); } if (result.count("lora_host_cache_bytes")) { benchmarkParams.loraHostCacheSize = result["lora_host_cache_bytes"].as(); } if (result.count("lora_num_device_mod_layers")) { benchmarkParams.loraDeviceNumModLayers = result["lora_num_device_mod_layers"].as(); } // Argument: How many KV cache blocks (as fraction of number of GPU kv cache blocks). benchmarkParams.kvHostCacheSize = result["kv_host_cache_bytes"].as(); // Argument: If offloaded blocks should be onboarded to primary memory before they are reused. benchmarkParams.kvOnboardBlocks = result["kv_onboard_blocks"].as(); // Argument: Medusa choices for the Medusa speculative decoding. 
if (result.count("medusa_choices")) { benchmarkParams.medusaChoices = parseVectorOfVectors(result["medusa_choices"].as()); } if (result.count("executor_lookahead_config")) { benchmarkParams.executorLookaheadConfig = parseLookaheadConfig(result["executor_lookahead_config"].as()); } if (result.count("request_lookahead_config")) { benchmarkParams.requestLookaheadConfig = parseLookaheadConfig(result["request_lookahead_config"].as()); } // Argument: multi_block_mode benchmarkParams.multiBlockMode = result["multi_block_mode"].as(); // Argument: enable_context_fmha_fp32_acc benchmarkParams.enableContextFMHAFP32Acc = result["enable_context_fmha_fp32_acc"].as(); // Argument: cuda_graph_mode benchmarkParams.cudaGraphMode = result["cuda_graph_mode"].as(); // Argument: cuda_graph_cache_size benchmarkParams.cudaGraphCacheSize = result["cuda_graph_cache_size"].as(); std::optional padId; // Argument: Padding token id if (result.count("pad_id")) { padId = result["pad_id"].as(); } // Argument: End-of-sentence token id std::optional eosId = result["eos_id"].as(); std::optional staticEmulatedBatchSize; // Argument: Static emulated batch size if (result.count("static_emulated_batch_size")) { staticEmulatedBatchSize = result["static_emulated_batch_size"].as(); } // Argument: Scheduler policy texec::CapacitySchedulerPolicy capacitySchedulerPolicy; auto const capacitySchedulerPolicyArg = result["scheduler_policy"].as(); if (capacitySchedulerPolicyArg == "max_utilization") { capacitySchedulerPolicy = texec::CapacitySchedulerPolicy::kMAX_UTILIZATION; } else if (capacitySchedulerPolicyArg == "guaranteed_no_evict") { capacitySchedulerPolicy = texec::CapacitySchedulerPolicy::kGUARANTEED_NO_EVICT; } else if (capacitySchedulerPolicyArg == "static_batch") { capacitySchedulerPolicy = texec::CapacitySchedulerPolicy::kSTATIC_BATCH; } else { TLLM_LOG_ERROR("Unexpected scheduler policy: " + capacitySchedulerPolicyArg); return 1; } // Argument: max_prompt_len std::optional maxPromptLen; if (result.count("max_prompt_len")) { maxPromptLen = result["max_prompt_len"].as(); } // Argument: GPU weights percentage auto gpuWeightsPercent = result["gpu_weights_percent"].as(); if (gpuWeightsPercent < 0 || gpuWeightsPercent > 1) { TLLM_LOG_ERROR("--gpu_weights_percent must be between 0.0 and 1.0 but got: %f", gpuWeightsPercent); return 1; } benchmarkParams.gpuWeightsPercent = gpuWeightsPercent; // Argument: Log level auto logger = std::make_shared(); auto const logLevel = result["log_level"].as(); if (logLevel == "verbose") { logger->setLevel(trt::ILogger::Severity::kVERBOSE); } else if (logLevel == "info") { logger->setLevel(trt::ILogger::Severity::kINFO); } else if (logLevel == "warning") { logger->setLevel(trt::ILogger::Severity::kWARNING); } else if (logLevel == "error") { logger->setLevel(trt::ILogger::Severity::kERROR); } else if (logLevel == "internal_error") { logger->setLevel(trt::ILogger::Severity::kINTERNAL_ERROR); } else { TLLM_LOG_ERROR("Unexpected log level: " + logLevel); return 1; } initTrtLlmPlugins(logger.get()); // Argument: API auto const api = result["api"].as(); if (api == "executor") { texec::ModelType executorModelType; std::optional decoderEngineDir = std::nullopt, encoderEngineDir = std::nullopt; if (result.count("encoder_engine_dir") && result.count("decoder_engine_dir")) { TLLM_CHECK_WITH_INFO(api == "executor", "encoder-decoder only support executor api."); TLLM_CHECK_WITH_INFO( batchingType == texec::BatchingType::kINFLIGHT, "encoder-decoder only support inflight batching."); executorModelType = 
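        // Model type selection: both --encoder_engine_dir and --decoder_engine_dir -> encoder-decoder;
        // --engine_dir alone -> decoder-only; --encoder_engine_dir alone -> encoder-only.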
texec::ModelType::kENCODER_DECODER;
            encoderEngineDir = result["encoder_engine_dir"].as<std::string>();
            decoderEngineDir = result["decoder_engine_dir"].as<std::string>();
        }
        else if (result.count("engine_dir"))
        {
            executorModelType = texec::ModelType::kDECODER_ONLY;
            decoderEngineDir = result["engine_dir"].as<std::string>();
        }
        else
        {
            executorModelType = texec::ModelType::kENCODER_ONLY;
            encoderEngineDir = result["encoder_engine_dir"].as<std::string>();
        }
        try
        {
            benchmarkExecutor(decoderEngineDir, encoderEngineDir, batchingType, datasetPath, opCsvFile, maxNumSamples,
                beamWidth, result["warm_up"].as<int>(), eosId, padId, benchmarkParams, capacitySchedulerPolicy,
                waitSleep, returnContextLogits, returnGenerationLogits, staticEmulatedBatchSize, logIterationData,
                maxPromptLen, executorModelType);
        }
        catch (std::exception const& e)
        {
            TLLM_LOG_ERROR(e.what());
            return 1;
        }
    }
    else if (api == "gptManager")
    {
        TLLM_LOG_ERROR("gptManager is deprecated, please use the executor API.");
        return 1;
    }
    else
    {
        TLLM_LOG_ERROR("api parameter must be gptManager or executor");
        return 1;
    }

    return 0;
}
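// Example invocation (illustrative only; paths are placeholders, all other flags keep their defaults):
//   ./gptManagerBenchmark \
//       --engine_dir /path/to/decoder_engine_dir \
//       --dataset /path/to/prepared_dataset.json \
//       --max_num_samples 500 \
//       --streaming=true \
//       --output_csv metrics.csv
// For encoder-decoder engines, pass --encoder_engine_dir and --decoder_engine_dir instead of --engine_dir.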