/*
 * SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "tensorrt_llm/common/memoryUtils.h"
#include "tensorrt_llm/plugins/api/tllmPlugin.h"
#include "tensorrt_llm/runtime/iTensor.h"
#include "tensorrt_llm/runtime/rawEngine.h"
#include "tensorrt_llm/runtime/tllmLogger.h"
#include "tensorrt_llm/runtime/tllmRuntime.h"
#include "tensorrt_llm/runtime/worldConfig.h"

#include <NvInfer.h>
#include <chrono>
#include <cuda_runtime_api.h>
#include <cxxopts.hpp>
#include <filesystem>
#include <fstream>
#include <iostream>
#include <memory>
#include <nlohmann/json.hpp>
#include <sstream>
#include <string>
#include <vector>

using namespace tensorrt_llm::runtime;

namespace tc = tensorrt_llm::common;
namespace trt = nvinfer1;

namespace
{

// follows https://github.com/NVIDIA/TensorRT/blob/release/8.6/samples/common/sampleEngines.cpp
std::vector<uint8_t> loadEngine(std::string const& enginePath)
{
    std::ifstream engineFile(enginePath, std::ios::binary);
    TLLM_CHECK_WITH_INFO(engineFile.good(), std::string("Error opening engine file: " + enginePath));
    engineFile.seekg(0, std::ifstream::end);
    auto const size = engineFile.tellg();
    engineFile.seekg(0, std::ifstream::beg);

    std::vector<uint8_t> engineBlob(size);
    engineFile.read(reinterpret_cast<char*>(engineBlob.data()), size);
    TLLM_CHECK_WITH_INFO(engineFile.good(), std::string("Error loading engine file: " + enginePath));
    return engineBlob;
}

// Builds the engine file name "<model>_<precision>_tp<N>_rank<R>.engine" from config.json metadata.
std::string engineFilename(
    std::filesystem::path const& dataPath, WorldConfig const& worldConfig, std::string const& model)
{
    auto constexpr allowExceptions = true;
    auto constexpr ignoreComments = true;
    auto const jsonFilePath = dataPath / "config.json";
    TLLM_CHECK_WITH_INFO(
        std::filesystem::exists(jsonFilePath), std::string("File does not exist: ") + jsonFilePath.string());
    std::ifstream jsonStream(jsonFilePath);

    auto const json = nlohmann::json::parse(jsonStream, nullptr, allowExceptions, ignoreComments);
    auto const& builderConfig = json.at("builder_config");
    auto const precision = builderConfig.at("precision").template get<std::string>();
    auto const worldSize = builderConfig.at("tensor_parallel").template get<SizeType32>();
    TLLM_CHECK_WITH_INFO(worldSize == worldConfig.getSize(), "world size mismatch");
    return model + "_" + precision + "_tp" + std::to_string(worldConfig.getSize()) + "_rank"
        + std::to_string(worldConfig.getRank()) + ".engine";
}

void benchmarkBert(std::string const& modelName, std::filesystem::path const& dataPath,
    std::vector<int> const& batchSizes, std::vector<int> const& inLens, bool useGpuDirectStorage,
    std::vector<float> const& gpuWeightsPercents, std::shared_ptr<nvinfer1::ILogger> const& logger, int warmUp,
    int numRuns, int duration)
{
    auto const worldConfig = WorldConfig::mpi();
    auto const enginePath = dataPath / engineFilename(dataPath, worldConfig, modelName);

    for (float gpuWeightsPercent : gpuWeightsPercents)
    {
        auto rt = std::make_shared<TllmRuntime>(
            RawEngine(enginePath), logger.get(), useGpuDirectStorage, gpuWeightsPercent);
        rt->addContext(0);
        for (auto inLen : inLens)
        {
            for (auto const batchSize : batchSizes)
            {
                auto& allocator = rt->getBufferManager();
                TllmRuntime::TensorMap tensorMap{};
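                // Both input tensors are zero-filled below via setZero(), so the engine runs on
                // dummy data: the benchmark measures forward-pass latency for the given
                // (batch_size, input_len) shape, not model output quality.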
                // input_ids
                std::vector<int32_t> inputIdsHost(batchSize * inLen, inLen);
                auto inputIdsBuffer = std::shared_ptr<ITensor>{
                    allocator.copyFrom(inputIdsHost, ITensor::makeShape({batchSize, inLen}), MemoryType::kGPU)};
                allocator.setZero(*inputIdsBuffer);
                tensorMap.insert(std::make_pair("input_ids", inputIdsBuffer));
                // input_lengths
                std::vector<int32_t> inputLengthsHost(batchSize);
                auto inLensBuffer = std::shared_ptr<ITensor>{
                    allocator.copyFrom(inputLengthsHost, ITensor::makeShape({batchSize}), MemoryType::kGPU)};
                allocator.setZero(*inLensBuffer);
                tensorMap.insert(std::make_pair("input_lengths", inLensBuffer));

                rt->setInputTensors(0, tensorMap);
                rt->setOutputTensors(0, tensorMap);

                cudaDeviceSynchronize();

                for (auto r = 0; r < warmUp; ++r)
                {
                    rt->executeContext(0);
                    rt->getStream().synchronize();
                }
                cudaDeviceSynchronize();
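                // Timed loop: keep iterating until BOTH minimums are satisfied, i.e. at least
                // `numRuns` iterations AND at least `duration` seconds of accumulated run time.
                // curDuration accumulates per-iteration latency in milliseconds, each measured
                // with std::chrono::steady_clock around a stream-synchronized execution.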
                int iterIdx = 0;
                float curDuration = 0;
                while (iterIdx < numRuns || curDuration / 1000 < duration)
                {
                    auto const start = std::chrono::steady_clock::now();
                    rt->executeContext(0);
                    rt->getStream().synchronize();
                    auto const end = std::chrono::steady_clock::now();

                    iterIdx += 1;
                    curDuration += (static_cast<float>(
                                        std::chrono::duration_cast<std::chrono::microseconds>(end - start).count())
                        / 1000);
                }
                printf("Benchmarking done. Iteration: %d, duration: %.2f sec.\n", iterIdx, curDuration / 1000);

                auto averageLatency = curDuration / iterIdx;
                if (worldConfig.getRank() == 0)
                {
                    printf("[BENCHMARK] batch_size %d input_length %d latency(ms) %.2f\n", batchSize, inLen,
                        averageLatency);
                }
            }
        }
    }
}

} // namespace

int main(int argc, char* argv[])
{
    cxxopts::Options options("TensorRT-LLM C++ Runtime Benchmark", "TensorRT-LLM C++ Runtime Benchmark for BERT.");
    options.add_options()("h,help", "Print usage");
    options.add_options()(
        "m,model", "Model name specified for engines.", cxxopts::value<std::string>()->default_value("bert_base"));
    options.add_options()("engine_dir", "Directory that stores the engines.", cxxopts::value<std::string>());
    options.add_options()("batch_size",
        "Specify batch size(s) you want to benchmark. Multiple batch sizes can be separated by \";\", example: "
        "\"1;8;64\".",
        cxxopts::value<std::string>()->default_value("8"));
    options.add_options()("input_len",
        "Specify input length(s) you want to benchmark. Multiple input lengths can be "
        "separated by \";\", example: \"60;128\".",
        cxxopts::value<std::string>()->default_value("128"));
    options.add_options()("log_level", "Choose log level between verbose/info/warning/error/internal_error.",
        cxxopts::value<std::string>()->default_value("error"));
    options.add_options()(
        "warm_up", "Specify warm-up iterations before the benchmark starts.", cxxopts::value<int>()->default_value("2"));
    options.add_options()("num_runs", "Minimum number of iterations to run during benchmarking.",
        cxxopts::value<int>()->default_value("10"));
    options.add_options()("duration", "Minimum duration of iterations to measure, in seconds.",
        cxxopts::value<int>()->default_value("60"));
    options.add_options()("gpu_weights_percent",
        "Specify the percentage of weights that reside on GPU (from 0.0 to 1.0). Multiple percentages can be separated "
        "by \";\", "
        "example: \"0.0;0.5;1.0\".",
        cxxopts::value<std::string>()->default_value("1.0"));
    options.add_options()("use_gpu_direct_storage", "Enable GPUDirect Storage (GDS) for loading the engine.",
        cxxopts::value<bool>()->default_value("false"));

    auto result = options.parse(argc, argv);

    if (result.count("help"))
    {
        std::cout << options.help() << std::endl;
        return 0;
    }

    // Argument: Engine directory
    if (!result.count("engine_dir"))
    {
        std::cout << options.help() << std::endl;
        TLLM_LOG_ERROR("Please specify engine directory.");
        return 1;
    }

    // Argument: Batch sizes
    std::istringstream ssBatchSizesArg;
    ssBatchSizesArg.str(result["batch_size"].as<std::string>());
    std::vector<int> batchSizes;
    for (std::string token; std::getline(ssBatchSizesArg, token, ';');)
    {
        batchSizes.push_back(std::stoi(token));
    }

    // Argument: Input lengths
    std::istringstream ssInLenArg;
    ssInLenArg.str(result["input_len"].as<std::string>());
    std::vector<int> inLens;
    for (std::string token; std::getline(ssInLenArg, token, ';');)
    {
        inLens.push_back(std::stoi(token));
    }

    // Argument: GPU weights percentage
    std::istringstream ssGpuPercentArg;
    ssGpuPercentArg.str(result["gpu_weights_percent"].as<std::string>());
    std::vector<float> gpuWeightsPercents;
    for (std::string token; std::getline(ssGpuPercentArg, token, ';');)
    {
        auto gpuWeightsPercent = std::stof(token);
        if (gpuWeightsPercent < 0 || gpuWeightsPercent > 1)
        {
            TLLM_LOG_ERROR(
                "--gpu_weights_percent must have percents between 0.0 and 1.0 but got: %f", gpuWeightsPercent);
            return 1;
        }
        gpuWeightsPercents.push_back(gpuWeightsPercent);
    }

    // Argument: Log level
    auto logger = std::make_shared<TllmLogger>();
    auto const logLevel = result["log_level"].as<std::string>();
    if (logLevel == "verbose")
    {
        logger->setLevel(trt::ILogger::Severity::kVERBOSE);
    }
    else if (logLevel == "info")
    {
        logger->setLevel(trt::ILogger::Severity::kINFO);
    }
    else if (logLevel == "warning")
    {
        logger->setLevel(trt::ILogger::Severity::kWARNING);
    }
    else if (logLevel == "error")
    {
        logger->setLevel(trt::ILogger::Severity::kERROR);
    }
    else if (logLevel == "internal_error")
    {
        logger->setLevel(trt::ILogger::Severity::kINTERNAL_ERROR);
    }
    else
    {
        TLLM_LOG_ERROR("Unexpected log level: " + logLevel);
        return 1;
    }

    initTrtLlmPlugins(logger.get());

    try
    {
        benchmarkBert(result["model"].as<std::string>(), result["engine_dir"].as<std::string>(), batchSizes, inLens,
            result["use_gpu_direct_storage"].as<bool>(), gpuWeightsPercents, logger, result["warm_up"].as<int>(),
            result["num_runs"].as<int>(), result["duration"].as<int>());
    }
    catch (std::exception const& e)
    {
        TLLM_LOG_ERROR(e.what());
        return 1;
    }
    return 0;
}
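
// Example invocation (illustrative; the binary name and engine path depend on how the benchmark
// was built and where the engines were generated):
//   ./bertBenchmark --model bert_base --engine_dir /path/to/bert/engines \
//       --batch_size "1;8;64" --input_len "60;128" --log_level info
// For multi-GPU (tensor-parallel) engines, launch one rank per GPU, e.g. via mpirun.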