/*
 * SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "tensorrt_llm/common/memoryUtils.h"
#include "tensorrt_llm/plugins/api/tllmPlugin.h"
#include "tensorrt_llm/runtime/gptJsonConfig.h"
#include "tensorrt_llm/runtime/gptSession.h"
#include "tensorrt_llm/runtime/iTensor.h"
#include "tensorrt_llm/runtime/memoryCounters.h"
#include "tensorrt_llm/runtime/tllmLogger.h"

#include <chrono>
#include <cxxopts.hpp>
#include <iostream>
#include <sstream>
#include <string>

using namespace tensorrt_llm::runtime;

namespace tc = tensorrt_llm::common;
namespace trt = nvinfer1;

namespace
{
void benchmarkGptSession(std::string const& modelName, std::filesystem::path const& dataPath,
    std::vector<int> const& batchSizes, int beamWidth, std::vector<std::vector<int>> const& inOutLen,
    std::shared_ptr<nvinfer1::ILogger> const& logger, int warmUp, int numRuns, int duration,
    GptSession::Config& sessionConfig, bool cudaGraphMode, bool printAllLogits)
{
    std::string modelNameHyphen = modelName;
    std::filesystem::path jsonFileName = dataPath / "config.json";
    if (tc::strStartsWith(modelName, "chatglm"))
    {
        std::replace(modelNameHyphen.begin(), modelNameHyphen.end(), '_', '-');
        jsonFileName = dataPath / (modelNameHyphen + std::string("-config.json"));
    }
    auto const json = GptJsonConfig::parse(jsonFileName);
    auto const modelConfig = json.getModelConfig();
    auto const inputPacked = modelConfig.usePackedInput();
    SizeType deviceCount{0};
    TLLM_CUDA_CHECK(cudaGetDeviceCount(&deviceCount));
    auto const worldConfig
        = WorldConfig::mpi(*logger, deviceCount, json.getTensorParallelism(), json.getPipelineParallelism());
    auto const enginePath = dataPath / json.engineFilename(worldConfig, modelNameHyphen);
    auto const dtype = modelConfig.getDataType();
    auto const useHalf = (dtype == nvinfer1::DataType::kHALF);

    SamplingConfig samplingConfig{beamWidth};
    samplingConfig.temperature = std::vector<float>{1.0f};
    samplingConfig.randomSeed = std::vector<uint64_t>{42ull};
    samplingConfig.topK = std::vector<SizeType>{1};
    samplingConfig.topP = std::vector<float>{0.0f};

    auto const maxBatchSize = *std::max_element(batchSizes.begin(), batchSizes.end());
    sessionConfig.maxBatchSize = maxBatchSize;
    sessionConfig.maxBeamWidth = beamWidth;
    sessionConfig.decoderPerRequest = false;
    sessionConfig.cudaGraphMode = cudaGraphMode;

    for (auto inOut : inOutLen)
    {
        auto const maxInputLength = inOut[0];
        auto const maxNewTokens = inOut[1];

        sessionConfig.maxSequenceLength = maxInputLength + maxNewTokens;
        samplingConfig.minLength = std::vector<SizeType>{maxNewTokens};

        GptSession session{sessionConfig, modelConfig, worldConfig, enginePath.string(), logger};

        // Use bufferManager for copying data to and from the GPU
        auto& bufferManager = session.getBufferManager();

        auto constexpr endId = 50256;
        auto constexpr padId = 50256;

        auto& memoryCounter = MemoryCounters::getInstance();
        TLLM_LOG_INFO(memoryCounter.toString());

        for (auto const batchSize : batchSizes)
        {
            try
            {
                TLLM_LOG_INFO(memoryCounter.toString());
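                // Build synthetic inputs: every request is maxInputLength copies of padId.
                // Packed engines expect a flattened [1, batchSize * maxInputLength] tensor,
                // padded engines a rectangular [batchSize, maxInputLength] one.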
                std::vector<SizeType> inputLengthsHost(batchSize, maxInputLength);
                auto inputLengths
                    = bufferManager.copyFrom(inputLengthsHost, ITensor::makeShape({batchSize}), MemoryType::kGPU);

                // Copy inputs to the GPU and wrap them into shared_ptr tensors
                GenerationInput::TensorPtr inputIds;
                std::vector<int32_t> inputsHost(batchSize * maxInputLength, padId);
                if (inputPacked)
                {
                    inputIds = bufferManager.copyFrom(
                        inputsHost, ITensor::makeShape({1, batchSize * maxInputLength}), MemoryType::kGPU);
                }
                else
                {
                    inputIds = bufferManager.copyFrom(
                        inputsHost, ITensor::makeShape({batchSize, maxInputLength}), MemoryType::kGPU);
                }

                TLLM_LOG_INFO(memoryCounter.toString());

                GenerationInput generationInput{
                    endId, padId, std::move(inputIds), std::move(inputLengths), inputPacked};

                // The runtime will allocate memory for the output if this tensor is empty
                GenerationOutput generationOutput{
                    bufferManager.emptyTensor(MemoryType::kGPU, nvinfer1::DataType::kINT32),
                    bufferManager.emptyTensor(MemoryType::kGPU, nvinfer1::DataType::kINT32)};
                if (session.getModelConfig().computeContextLogits())
                {
                    generationOutput.contextLogits
                        = bufferManager.emptyTensor(MemoryType::kGPU, nvinfer1::DataType::kFLOAT);
                }
                if (session.getModelConfig().computeGenerationLogits())
                {
                    generationOutput.generationLogits
                        = bufferManager.emptyTensor(MemoryType::kGPU, nvinfer1::DataType::kFLOAT);
                    bufferManager.setZero(*generationOutput.generationLogits);
                }

                TLLM_LOG_INFO(memoryCounter.toString());

                // Warm-up iterations are not timed
                for (auto r = 0; r < warmUp; ++r)
                {
                    SizeType numSteps = 0;
                    generationOutput.onTokenGenerated
                        = [&numSteps, maxNewTokens](GenerationOutput::TensorPtr const& outputIds, SizeType step,
                              bool finished) { ++numSteps; };
                    session.generate(generationOutput, generationInput, samplingConfig);
                    bufferManager.getStream().synchronize();
                }
                cudaDeviceSynchronize();

                TLLM_LOG_INFO(memoryCounter.toString());

                // Run until both the minimum iteration count and the minimum duration are reached
                int iterIdx = 0;
                float curDuration = 0;
                while (iterIdx < numRuns || curDuration / 1000 < duration)
                {
                    auto const start = std::chrono::steady_clock::now();
                    SizeType numSteps = 0;
                    generationOutput.onTokenGenerated
                        = [&numSteps, maxNewTokens](GenerationOutput::TensorPtr const& outputIds, SizeType step,
                              bool finished) { ++numSteps; };
                    session.generate(generationOutput, generationInput, samplingConfig);
                    bufferManager.getStream().synchronize();
                    auto const end = std::chrono::steady_clock::now();

                    iterIdx += 1;
                    curDuration += std::chrono::duration<float, std::milli>(end - start).count();
                }

                TLLM_LOG_INFO(memoryCounter.toString());
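                // Report the timed iterations: rank 0 prints the average per-iteration latency and
                // the resulting throughput in generated tokens per second.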
                printf("Benchmarking done. Iteration: %d, duration: %.2f sec.\n", iterIdx, curDuration / 1000);

                if (worldConfig.getRank() == 0)
                {
                    auto const averageLatency = curDuration / iterIdx;
                    float const tokensPerSec = batchSize * maxNewTokens / (averageLatency / 1000);
                    printf(
                        "[BENCHMARK] batch_size %d input_length %d output_length %d latency(ms) %.2f tokensPerSec "
                        "%.2f\n",
                        batchSize, maxInputLength, maxNewTokens, averageLatency, tokensPerSec);
                }

                // Logits are stored on the last rank
                if (worldConfig.getRank() == worldConfig.getSize() - 1)
                {
                    if (session.getModelConfig().computeContextLogits() && printAllLogits)
                    {
                        std::cout << "generationOutput.contextLogits.shape: "
                                  << generationOutput.contextLogits->getShape()
                                  << std::endl; // (batchsize, prompt_len, vocabsize)
                        std::cout << "generationOutput.contextLogits" << *generationOutput.contextLogits << std::endl;
                    }

                    if (session.getModelConfig().computeGenerationLogits() && printAllLogits)
                    {
                        std::cout << "generationOutput.generationLogits.shape: "
                                  << generationOutput.generationLogits->getShape()
                                  << std::endl; // (batchsize, beamwidth, maxNewTokens-1, vocabsize)
                        generationOutput.generationLogits->reshape(ITensor::makeShape({batchSize * beamWidth,
                            maxNewTokens - 1, modelConfig.getVocabSizePadded(worldConfig.getSize())}));
                        std::cout << "generationOutput.generationLogits: " << *generationOutput.generationLogits
                                  << std::endl;
                    }
                }
            }
            catch (std::runtime_error& e)
            {
                std::size_t found = std::string(e.what()).find("out of memory");

                // Unexpected error; rethrow
                if (found == std::string::npos)
                {
                    throw;
                }

                // We can ignore the OOM exception and continue the rest of the benchmark
                if (worldConfig.getRank() == 0)
                {
                    TLLM_LOG_EXCEPTION(e);
                    printf(
                        "[BENCHMARK] batch_size %d input_length %d output_length %d latency(ms) N/A tokensPerSec N/A\n",
                        batchSize, maxInputLength, maxNewTokens);
                }
                continue;
            }
        }
        TLLM_LOG_INFO(memoryCounter.toString());
    }
}

} // namespace

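// Entry point: parses the command line with cxxopts, builds a GptSession::Config from the optional
// arguments, and benchmarks every requested batch size / input-output length combination.
// Example invocation (binary name and paths are illustrative):
//   ./gptSessionBenchmark --model gpt_350m --engine_dir /path/to/engines \
//       --batch_size "1;8;64" --input_output_len "60,20;128,20"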
int main(int argc, char* argv[])
{
    cxxopts::Options options(
        "TensorRT-LLM C++ Runtime Benchmark", "TensorRT-LLM C++ Runtime Benchmark for GPT and GPT-like models.");
    options.add_options()("h,help", "Print usage");
    options.add_options()(
        "m,model", "Model name specified for engines.", cxxopts::value<std::string>()->default_value("gpt_350m"));
    options.add_options()("engine_dir", "Directory that stores the engines.", cxxopts::value<std::string>());
    options.add_options()("batch_size",
        "Specify batch size(s) you want to benchmark. Multiple batch sizes can be separated by \";\", example: "
        "\"1;8;64\".",
        cxxopts::value<std::string>()->default_value("8"));
    options.add_options()(
        "beam_width", "Specify the beam width you want to benchmark.", cxxopts::value<int>()->default_value("1"));
    options.add_options()("input_output_len",
        "Specify input-output length(s) you want to benchmark. Multiple input-output pairs can be separated by "
        "\";\", example: \"60,20;128,20\".",
        cxxopts::value<std::string>()->default_value("128,20"));
    options.add_options()("log_level", "Choose log level between verbose/info/warning/error/internal_error.",
        cxxopts::value<std::string>()->default_value("error"));
    options.add_options()("warm_up", "Specify warm-up iterations before the benchmark starts.",
        cxxopts::value<int>()->default_value("2"));
    options.add_options()("num_runs", "Minimum number of iterations to run during benchmarking.",
        cxxopts::value<int>()->default_value("10"));
    options.add_options()("duration", "Minimum duration of iterations to measure, in seconds.",
        cxxopts::value<int>()->default_value("60"));

    options.add_options()("ctx_micro_batch_size", "Batch size for the context phase.", cxxopts::value<int>());
    options.add_options()("gen_micro_batch_size", "Batch size for the generation phase.", cxxopts::value<int>());
    options.add_options()("max_tokens_in_paged_kvcache", "Max tokens in the paged K-V cache.", cxxopts::value<int>());
    options.add_options()("kv_cache_free_gpu_mem_fraction", "Fraction of free GPU memory to use for the K-V cache.",
        cxxopts::value<float>());
    options.add_options()("enable_cuda_graph", "Execute GPT session with CUDA graph.");
    options.add_options()("print_all_logits", "Print all context and generation logits.");

    auto result = options.parse(argc, argv);

    if (result.count("help"))
    {
        std::cout << options.help() << std::endl;
        exit(0);
    }

    // Argument: Engine directory
    if (!result.count("engine_dir"))
    {
        std::cout << options.help() << std::endl;
        TLLM_LOG_ERROR("Please specify engine directory.");
        return 1;
    }

    // Argument: Batch sizes
    std::istringstream ssBatchSizesArg;
    ssBatchSizesArg.str(result["batch_size"].as<std::string>());
    std::vector<int> batchSizes;
    for (std::string token; std::getline(ssBatchSizesArg, token, ';');)
    {
        batchSizes.push_back(std::stoi(token));
    }

    // Argument: Beam width
    auto const beamWidth = result["beam_width"].as<int>();

    // Argument: Input-output lengths
    std::istringstream ssInOutLenArg;
    ssInOutLenArg.str(result["input_output_len"].as<std::string>());
    std::vector<std::vector<int>> inOutLen;
    for (std::string token; std::getline(ssInOutLenArg, token, ';');)
    {
        std::istringstream ssTmp(token);
        std::vector<int> inOut;
        for (std::string t; std::getline(ssTmp, t, ',');)
        {
            inOut.push_back(std::stoi(t));
        }
        inOutLen.push_back(inOut);
    }

    // Argument: Log level
    auto logger = std::make_shared<TllmLogger>();
    auto const logLevel = result["log_level"].as<std::string>();
    if (logLevel == "verbose")
    {
        logger->setLevel(trt::ILogger::Severity::kVERBOSE);
    }
    else if (logLevel == "info")
    {
        logger->setLevel(trt::ILogger::Severity::kINFO);
    }
    else if (logLevel == "warning")
    {
        logger->setLevel(trt::ILogger::Severity::kWARNING);
    }
    else if (logLevel == "error")
    {
        logger->setLevel(trt::ILogger::Severity::kERROR);
    }
    else if (logLevel == "internal_error")
    {
        logger->setLevel(trt::ILogger::Severity::kINTERNAL_ERROR);
    }
    else
    {
        TLLM_LOG_ERROR("Unexpected log level: " + logLevel);
        return 1;
    }

    GptSession::Config sessionConfig{0, 0, 0};
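    // The session limits (maxBatchSize, maxBeamWidth, maxSequenceLength) start as placeholders and
    // are filled in by benchmarkGptSession for each run. The optional tuning arguments below are
    // only applied when the corresponding flag was passed on the command line.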
= result["kv_cache_free_gpu_mem_fraction"].as(); } // Argument: Enable CUDA graph auto enableCudaGraph = result.count("enable_cuda_graph") > 0; auto printAllLogits = result.count("print_all_logits") > 0; initTrtLlmPlugins(logger.get()); try { benchmarkGptSession(result["model"].as(), result["engine_dir"].as(), batchSizes, beamWidth, inOutLen, logger, result["warm_up"].as(), result["num_runs"].as(), result["duration"].as(), sessionConfig, enableCudaGraph, printAllLogits); } catch (const std::exception& e) { TLLM_LOG_ERROR(e.what()); return 1; } return 0; }