/*
 * SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#pragma once

#include "tensorrt_llm/common/assert.h"
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/executor/executor.h"

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <filesystem>
#include <functional>
#include <numeric>
#include <optional>
#include <ostream>
#include <string>
#include <vector>

TRTLLM_NAMESPACE_BEGIN

namespace benchmark
{

// using namespace tensorrt_llm::batch_manager;
using namespace tensorrt_llm::runtime;

namespace texec = tensorrt_llm::executor;

std::vector<std::vector<SizeType32>> parseVectorOfVectors(std::string const& input);

texec::LookaheadDecodingConfig parseLookaheadConfig(std::string const& input);

struct BenchmarkParams
{
    std::optional<SizeType32> maxTokensInPagedKvCache{std::nullopt};
    std::optional<float> freeGpuMemoryFraction{std::nullopt};
    std::vector<std::optional<float>> freeGpuMemoryFractions{std::nullopt};
    std::optional<float> crossKvCacheFraction{std::nullopt};
    bool enableTrtOverlap{false};
    bool enableBatchSizeTuning{false};
    bool enableMaxNumTokensTuning{false};
    bool enableBlockReuse{false};
    bool enableChunkedContext{true};
    bool streaming{false};
    bool enableExpDelays{false};
    std::vector<std::optional<bool>> enableChunkedContextVec{std::nullopt};
    std::optional<float> requestRate{std::nullopt};
    std::optional<SizeType32> concurrency{std::nullopt};
    std::optional<SizeType32> maxBatchSize{std::nullopt};
    std::vector<std::optional<SizeType32>> maxBatchSizes{std::nullopt};
    std::optional<SizeType32> maxNumTokens{std::nullopt};
    std::vector<std::optional<SizeType32>> maxNumTokensVec{std::nullopt};
    int randomSeed = 430;
    std::optional<std::vector<SizeType32>> maxAttentionWindowVec{std::nullopt};
    std::optional<SizeType32> sinkTokenLength{std::nullopt};
    bool multiBlockMode{true};
    bool enableContextFMHAFP32Acc{false};
    bool cudaGraphMode{false};
    SizeType32 cudaGraphCacheSize{0};

    // lora / peft params
    std::optional<std::string> loraDir{std::nullopt};
    SizeType32 loraDeviceNumModLayers{0};
    size_t loraHostCacheSize{1024 * 1024 * 1024}; // 1 GiB

    // KV cache block offloading
    size_t kvHostCacheSize{0};
    bool kvOnboardBlocks{true};

    // Weights offloading
    float gpuWeightsPercent{1.0};

    // Decoding params
    std::optional<std::vector<std::vector<SizeType32>>> medusaChoices;
    std::optional<texec::EagleConfig> eagleConfig;
    std::optional<float> temperature;
    std::optional<texec::LookaheadDecodingConfig> executorLookaheadConfig;
    std::optional<texec::LookaheadDecodingConfig> requestLookaheadConfig;

    bool enableCollectkvCacheTransferTime = false;
    bool enableCollectIterStats = false;
};

struct RecordTimeMetric
{
    RecordTimeMetric(std::string tag)
        : mTag(std::move(tag))
    {
    }

    std::string mTag;
    // Per-request latencies in milliseconds; sorted ascending by calculate().
    std::vector<float> mDataTimes;

    float mAvg;
    float mP99;
    float mP95;
    float mP90;
    float mP50;
    float mMax;
    float mMin;

    // Nearest-rank percentile on an ascending-sorted vector.
    static float calcPercentile(std::vector<float> const& latencies, int percentile)
    {
        int const index = static_cast<int>(std::ceil((percentile / 100.0) * latencies.size())) - 1;
        return latencies[index];
    }

    void calculate()
    {
        TLLM_CHECK_WITH_INFO(!mDataTimes.empty(), "No data to calculate for tag: %s", mTag.c_str());
        mAvg = std::accumulate(mDataTimes.begin(), mDataTimes.end(), 0.F) / mDataTimes.size();
        std::sort(mDataTimes.begin(), mDataTimes.end());
        mP99 = calcPercentile(mDataTimes, 99);
        mP95 = calcPercentile(mDataTimes, 95);
        mP90 = calcPercentile(mDataTimes, 90);
        mP50 = calcPercentile(mDataTimes, 50);
        mMax = mDataTimes.back();
        mMin = mDataTimes.front();
    }
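
    // Print the computed statistics to stdout, one "[BENCHMARK] <stat>_<tag>(ms) <value>" line per statistic.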
    void report() const
    {
        printf("[BENCHMARK] avg_%s(ms) %.2f\n", mTag.c_str(), mAvg);
        printf("[BENCHMARK] max_%s(ms) %.2f\n", mTag.c_str(), mMax);
        printf("[BENCHMARK] min_%s(ms) %.2f\n", mTag.c_str(), mMin);
        printf("[BENCHMARK] p99_%s(ms) %.2f\n", mTag.c_str(), mP99);
        printf("[BENCHMARK] p90_%s(ms) %.2f\n", mTag.c_str(), mP90);
        printf("[BENCHMARK] p50_%s(ms) %.2f\n\n", mTag.c_str(), mP50);
    }

    std::vector<std::string> genHeaders() const
    {
        std::string timeTag = mTag + "(ms)";
        return {"avg_" + timeTag, "max_" + timeTag, "min_" + timeTag, "p99_" + timeTag, "p90_" + timeTag,
            "p50_" + timeTag};
    }
};

struct RecordBwMetric
{
    RecordBwMetric(std::string tag)
        : mTag(std::move(tag))
    {
    }

    std::string mTag;
    // Per-iteration throughputs in Gb/sec; sorted descending by calculate().
    std::vector<float> mDataTps;

    float mAvg;
    float mP99;
    float mP95;
    float mP90;
    float mP50;
    float mMax;
    float mMin;

    // Nearest-rank percentile on a descending-sorted vector, so pXX is the
    // throughput that XX percent of iterations meet or exceed.
    static float calcPercentile(std::vector<float> const& throughputs, int percentile)
    {
        int const index = static_cast<int>(std::ceil((percentile / 100.0) * throughputs.size())) - 1;
        return throughputs[index];
    }

    void calculate()
    {
        TLLM_CHECK_WITH_INFO(!mDataTps.empty(), "No data to calculate for tag: %s", mTag.c_str());
        mAvg = std::accumulate(mDataTps.begin(), mDataTps.end(), 0.F) / mDataTps.size();
        std::sort(mDataTps.begin(), mDataTps.end(), std::greater<float>());
        mP99 = calcPercentile(mDataTps, 99);
        mP95 = calcPercentile(mDataTps, 95);
        mP90 = calcPercentile(mDataTps, 90);
        mP50 = calcPercentile(mDataTps, 50);
        mMax = mDataTps.front();
        mMin = mDataTps.back();
    }

    void report() const
    {
        printf("[BENCHMARK] avg_%s(Gb/sec) %.8f\n", mTag.c_str(), mAvg);
        printf("[BENCHMARK] max_%s(Gb/sec) %.8f\n", mTag.c_str(), mMax);
        printf("[BENCHMARK] min_%s(Gb/sec) %.8f\n", mTag.c_str(), mMin);
        printf("[BENCHMARK] p99_%s(Gb/sec) %.8f\n", mTag.c_str(), mP99);
        printf("[BENCHMARK] p90_%s(Gb/sec) %.8f\n", mTag.c_str(), mP90);
        printf("[BENCHMARK] p50_%s(Gb/sec) %.8f\n\n", mTag.c_str(), mP50);
    }

    std::vector<std::string> genHeaders() const
    {
        std::string tpTag = mTag + "(Gb/sec)";
        return {"avg_" + tpTag, "max_" + tpTag, "min_" + tpTag, "p99_" + tpTag, "p90_" + tpTag, "p50_" + tpTag};
    }
};

std::ostream& operator<<(std::ostream& os, RecordTimeMetric const& metric);
std::ostream& operator<<(std::ostream& os, RecordBwMetric const& metric);

struct Sample
{
    std::vector<int32_t> inputIds;
    int32_t outputLen;
    int32_t taskId;
};

using Samples = std::vector<Sample>;

Samples parseWorkloadJson(
    std::filesystem::path const& datasetPath, int maxNumSamples, std::optional<SizeType32> const maxPromptLen);

std::vector<float> generateRandomExponentialValues(int count, float lambda, int seed);

std::vector<float> computeTimeDelays(BenchmarkParams const& benchmarkParams, int numDelays);

} // namespace benchmark

TRTLLM_NAMESPACE_END
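
// ---------------------------------------------------------------------------
// Illustrative usage sketch (not part of the original header): how a caller
// might feed latency samples into RecordTimeMetric and emit the summary. The
// tag and timing values are hypothetical, and the qualified name assumes
// TRTLLM_NAMESPACE_BEGIN opens the tensorrt_llm namespace.
//
//     tensorrt_llm::benchmark::RecordTimeMetric latency("seq_latency");
//     latency.mDataTimes = {12.4F, 9.7F, 15.1F, 11.0F, 10.3F}; // ms per request
//     latency.calculate(); // sorts samples, fills avg/min/max/percentiles
//     latency.report();    // prints "[BENCHMARK] avg_seq_latency(ms) ..." etc.
// ---------------------------------------------------------------------------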