TensorRT-LLMs/cpp/tests/utils/common.h
Robin Kobus d31fefde2c
[TRTLLM-5171] chore: Remove GptSession/V1 from TRT workflow (#4092)
* chore: Remove GptSession/V1 from TRT workflow
* chore: Remove stateful decoders
* chore: Remove GptSession buffers
* chore: Remove GptSession utils
* chore: Remove GptSession kernels
* chore: Remove V1 GPT models from tests
* chore: Remove gptSessionBenchmark from scripts and docs
* chore: Remove gptSession IO classes
* chore: Remove GptSession from test lists
* chore: Remove GptSession from docs
* chore: Remove useless encoder test
* chore: Remove mActualBatchSize from DecoderState
* chore: Remove static batching from ExecutorTest

- Updated `validateContextLogits` and `validateGenerationLogits` functions to remove the `batchingType` parameter.
- Adjusted related test functions to reflect the changes in parameter lists.
- Cleaned up the instantiation of test cases to eliminate unnecessary `batchingType` references.

---------

Signed-off-by: Robin Kobus <19427718+Funatiq@users.noreply.github.com>
2025-05-14 23:10:04 +02:00

/*
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: NVIDIA TensorRT Source Code License Agreement
*
* NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
* property and proprietary rights in and to this material, related
* documentation and any modifications thereto. Any use, reproduction,
* disclosure or distribution of this material and related documentation
* without an express license agreement from NVIDIA CORPORATION or
* its affiliates is strictly prohibited.
*/
#pragma once
#ifndef TOP_LEVEL_DIR
#error "Define TOP_LEVEL_DIR"
#endif
#include "tensorrt_llm/executor/executor.h"
#include "tensorrt_llm/executor/types.h"
#include "tensorrt_llm/runtime/bufferManager.h"
#include "tensorrt_llm/runtime/common.h"
#include "tensorrt_llm/runtime/iBuffer.h"
#include "tensorrt_llm/runtime/iTensor.h"
#include <cmath>
#include <filesystem>
#include <random>
#include <set>
#include <string>
#include <utility>
#include <vector>
namespace tensorrt_llm::testing
{
namespace fs = std::filesystem;
namespace tr = tensorrt_llm::runtime;

using tr::SizeType32;
using tr::TokenIdType;
using tr::ITensor;
using tr::MemoryType;

auto const TEST_RESOURCE_PATH = fs::path{TOP_LEVEL_DIR} / "cpp/tests/resources";

// Engine paths
auto const ENGINE_PATH = TEST_RESOURCE_PATH / "models/rt_engine";
auto const GPT_MODEL_PATH = ENGINE_PATH / "gpt2";
auto const LLAMA_MODEL_PATH = ENGINE_PATH / "Llama-3.2-1B";
auto const MEDUSA_MODEL_PATH = ENGINE_PATH / "vicuna-7b-medusa";
auto const CHATGLM_MODEL_PATH = ENGINE_PATH / "chatglm-6b";
auto const CHATGLM2_MODEL_PATH = ENGINE_PATH / "chatglm2-6b";
auto const CHATGLM3_MODEL_PATH = ENGINE_PATH / "chatglm3-6b";
auto const GLM_MODEL_PATH = ENGINE_PATH / "glm-10b";
auto const ENC_DEC_ENGINE_BASE = TEST_RESOURCE_PATH / "models/enc_dec/trt_engines";

// Data paths
auto const DATA_PATH = TEST_RESOURCE_PATH / "data";
auto const GPT_DATA_PATH = DATA_PATH / "gpt2";
auto const GPT_XGRAMMAR_TOKENIZER_INFO_PATH = GPT_DATA_PATH / "xgrammar_tokenizer_info.json";
auto const LLAMA_DATA_PATH = DATA_PATH / "Llama-3.2-1B";
auto const LLAMA_XGRAMMAR_TOKENIZER_INFO_PATH = LLAMA_DATA_PATH / "xgrammar_tokenizer_info.json";
auto const MEDUSA_DATA_PATH = DATA_PATH / "vicuna-7b-medusa";
auto const CHATGLM_DATA_PATH = DATA_PATH / "chatglm-6b";
auto const CHATGLM2_DATA_PATH = DATA_PATH / "chatglm2-6b";
auto const CHATGLM3_DATA_PATH = DATA_PATH / "chatglm3-6b";
auto const GLM_DATA_PATH = DATA_PATH / "glm-10b";
auto const ENC_DEC_DATA_BASE = DATA_PATH / "enc_dec";

// Encoder-decoder model names
auto constexpr T5_NAME = "t5-small";
auto constexpr BART_NAME = "bart-large-cnn";
auto constexpr LANGUAGE_ADAPTER_NAME = "language_adapter-enc_dec_language_adapter";
class PathUtil
{
public:
    static std::string EXECUTOR_WORKER_PATH()
    {
        return (std::filesystem::path{TOP_LEVEL_DIR} / "cpp/build/tensorrt_llm/executor_worker/executorWorker")
            .string();
    }

    // model paths
    static std::string FP16_GPT_ATTENTION_PACKED_DIR();
    static std::string FP16_GPT_ATTENTION_PACKED_PAGED_DIR();
    static std::string FP16_GPT_LORA_DIR();
    static std::string FP16_GPT_ATTENTION_PACKED_PAGED_DRAFT_TOKENS_DIR();
    static std::string FP16_GPT_ATTENTION_PACKED_PAGED_GATHER_DIR();
    static std::string FP16_PLUGIN_PACKED_PAGED_RESULT_FILE();
    static std::string FP16_PLUGIN_PACKED_PAGED_LONG_RESULT_FILE();
    static std::string FP16_PLUGIN_PACKED_PAGED_GATHER_RESULT_FILE();

    // logits
    static std::string FP16_PLUGIN_PACKED_PAGED_GENERATION_LOGITS_FILE();
    static std::string FP16_PLUGIN_PACKED_PAGED_CONTEXT_LOGITS_FILE();
    static std::string FP16_PLUGIN_PACKED_PAGED_CUM_LOG_PROBS_FILE();
    static std::string FP16_PLUGIN_PACKED_PAGED_GATHER_CUM_LOG_PROBS_FILE();
    static std::string FP16_PLUGIN_PACKED_PAGED_LOG_PROBS_FILE();
    static std::string FP16_PLUGIN_PACKED_PAGED_GATHER_LOG_PROBS_FILE();

    // results
    static std::string FP16_PLUGIN_PACKED_PAGED_RESULT_TP1_PP1_FILE();
    static std::string FP16_PLUGIN_PACKED_PAGED_RESULT_TP4_PP1_FILE();
    static std::string FP16_PLUGIN_PACKED_PAGED_RESULT_TP2_PP2_FILE();
    static std::string FP16_PLUGIN_PACKED_PAGED_RESULT_TP1_PP4_FILE();
    static std::string FP16_PLUGIN_PACKED_PAGED_RESULT_TP1_PP2_FILE();
    static std::string FP16_PLUGIN_PACKED_PAGED_RESULT_TP2_PP1_FILE();

    // GptExecutorTest.GenerationLogitsEarlyStop requires the context_fmha_fp32_acc flag at runtime for better
    // accuracy
    static std::string FP16_PLUGIN_PACKED_PAGED_GATHER_CONTEXTFMHAFP32ACC_RESULT_FILE();
    static std::string FP16_PLUGIN_PACKED_PAGED_CONTEXTFMHAFP32ACC_GENERATION_LOGITS_FILE();
    static std::string FP16_PLUGIN_PACKED_PAGED_CONTEXTFMHAFP32ACC_CONTEXT_LOGITS_FILE();
};
class ModelIds
{
public:
    ModelIds() = default;

    constexpr ModelIds(TokenIdType endId, TokenIdType padId)
        : endId{endId}
        , padId{padId}
    {
    }

    TokenIdType endId{};
    TokenIdType padId{};
};
class BeamResult
{
public:
    explicit BeamResult(SizeType32 beamWidth)
        : beamWidth{beamWidth}
    {
    }

    BeamResult(SizeType32 beamWidth, fs::path resultsFile, fs::path contextLogitsFile, fs::path genLogitsFile,
        fs::path cumLogProbsFile, fs::path logProbsFile)
        : beamWidth{beamWidth}
        , resultsFile{std::move(resultsFile)}
        , contextLogitsFile{std::move(contextLogitsFile)}
        , genLogitsFile{std::move(genLogitsFile)}
        , cumLogProbsFile{std::move(cumLogProbsFile)}
        , logProbsFile{std::move(logProbsFile)}
    {
    }

    SizeType32 beamWidth;
    fs::path resultsFile;
    fs::path contextLogitsFile;
    fs::path genLogitsFile;
    fs::path cumLogProbsFile;
    fs::path logProbsFile;
};

using BeamResults = std::vector<BeamResult>;
struct FlakyTestInfo
{
    // Pairs of (batch ID, beam) which are flaky
    std::set<std::pair<SizeType32, SizeType32>> batchIdBeams;
};
class TestData
{
public:
    explicit TestData(SizeType32 nbGivenInputs, SizeType32 beamWidth)
        : nbGivenInputs{nbGivenInputs}
        , beamWidth{beamWidth}
    {
        expectedOutputLengths.resize(nbGivenInputs * beamWidth);
        draftTokens.resize(nbGivenInputs);
        draftLogits.resize(nbGivenInputs);
        acceptedDraftTokensLengths.resize(nbGivenInputs);
        expectedGenerationLogits.resize(nbGivenInputs);
        expectedContextLogits.resize(nbGivenInputs);
        expectedCumLogProbs.resize(nbGivenInputs);
        expectedLogProbs.resize(nbGivenInputs);
    }

    void loadLogProbs(fs::path const& cumLogProbsFile, fs::path const& logProbsFile, tr::BufferManager const& manager);

    void loadContextLogits(fs::path const& contextLogitsFile, std::vector<SizeType32> const& givenInputLengths,
        tr::BufferManager const& manager);

    void loadGenerationLogits(fs::path const& genLogitsFile, tr::BufferManager const& manager);

    void makeDraft(SizeType32 maxDraftTokens, bool acceptDraftByLogits, fs::path const& genLogitsFile,
        std::vector<SizeType32> const& givenInputLengths, tr::BufferManager const& manager);

    static TestData loadTestData(BeamResult const& beamResults, ITensor const& givenInput, SizeType32 maxBeamWidth,
        tr::BufferManager& manager, executor::OutputConfig const& outConfig, ModelIds const& modelIds);

    void verifyOutput(std::unordered_map<SizeType32, std::vector<executor::BeamTokens>> const& resultTokens,
        std::vector<SizeType32> const& givenInputLengths, SizeType32 nbGivenInputs, bool streaming,
        bool excludeInputFromOutput, FlakyTestInfo flakyTestInfo, bool isSpeculativeDecoding,
        bool returnAllGeneratedTokens, SizeType32 reqBeamWidth, SizeType32 numReturnSequences,
        bool isNonGreedySampling);

    void verifyLogProbs(bool computeLogProbs, bool streaming, bool excludeInputFromOutput, SizeType32 inputLength,
        SizeType32 beamWidth, executor::BeamTokens const& beamTokens,
        std::optional<executor::VecLogProbs> const& cumLogProbs,
        std::optional<std::vector<executor::VecLogProbs>> const& logProbs, SizeType32 batchId,
        FlakyTestInfo flakyTestInfo);

    void validateContextLogits(bool getContextLogits, SizeType32 inputLength, SizeType32 beamWidth,
        std::optional<executor::Tensor> const& contextLogits, SizeType32 vocabSizePadded, SizeType32 batchId);

    void validateGenerationLogits(bool getGenLogits, bool isFinal, bool streaming, bool excludeInputFromOutput,
        SizeType32 inputLength, SizeType32 maxOutputLen, SizeType32 beamWidth, executor::BeamTokens const& beamTokens,
        std::optional<executor::Tensor> const& genLogits, SizeType32 vocabSizePadded, SizeType32 batchId,
        bool returnAllGeneratedTokens);

    SizeType32 nbGivenInputs{};
    SizeType32 beamWidth{};
    SizeType32 maxSeqLen{};
    ITensor::SharedPtr expectedOutputIds;
    std::vector<SizeType32> expectedOutputLengths;
    std::vector<TokenIdType> endIds;
    std::vector<tensorrt_llm::executor::VecTokens> draftTokens;
    std::vector<ITensor::SharedPtr> draftLogits;
    std::vector<SizeType32> acceptedDraftTokensLengths;
    std::vector<ITensor::SharedPtr> expectedGenerationLogits;
    std::vector<ITensor::SharedPtr> expectedContextLogits;
    std::vector<ITensor::SharedPtr> expectedCumLogProbs;
    std::vector<ITensor::SharedPtr> expectedLogProbs;
};
inline bool almostEqual(float a, float b, float atol = 1e-2, float rtol = 1e-3)
{
    // Params: a = value to compare, b = reference value.
    // This function follows the implementation of numpy.isclose(), which checks
    //   abs(a - b) <= (atol + rtol * abs(b)).
    // Note that the inequality is asymmetric: b is treated as the reference
    // value. Using an absolute and a relative tolerance at the same time
    // accounts for both absolute and relative errors. The default values of
    // atol and rtol are borrowed from numpy.isclose(). If both a and b are
    // NaN, the result is true.
    if (std::isnan(a) && std::isnan(b))
    {
        return true;
    }
    return std::fabs(a - b) <= (atol + rtol * std::fabs(b));
}
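
// Worked example of the tolerance check above (illustrative, not part of the
// original header). With the default atol = 1e-2 and rtol = 1e-3:
//
//   almostEqual(1.001f, 1.0f); // true:  |a - b| = 0.001 <= 0.01 + 0.001 * 1.0
//   almostEqual(0.0f, 10.0f);  // false: |a - b| = 10.0  >  0.01 + 0.001 * 10.0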
bool compareLogits(ITensor const& groundTruthLogits, ITensor const& outputLogits, float atol = 1e-2, float rtol = 1e-3);

std::tuple<SizeType32, SizeType32> getRequestGivenInputIdxLength(
    std::uint64_t requestId, SizeType32 nbGivenInputs, std::vector<SizeType32> const& givenInputLengths);

std::tuple<std::vector<SizeType32>, SizeType32, SizeType32> getGivenInputLengths(
    ITensor const& givenInput, SizeType32 padId);
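
// Usage sketch (illustrative; givenInput and modelIds are hypothetical, and
// the exact ordering of the two scalar tuple elements is an assumption based
// on the return type, not confirmed by this header):
//
//   auto const [givenInputLengths, nbGivenInputs, maxInputLength]
//       = getGivenInputLengths(*givenInput, modelIds.padId);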
/// @brief Generates a vector of floating point values summing to 1, which can be used as logits.
///
/// @tparam TEngine The type of the random engine.
/// @tparam TLogits The type of the floating point values.
/// @param vocabSize The vocabulary size, i.e. the size of the vector.
/// @param engine A random engine.
/// @return std::vector<TLogits> A vector of floating point values, summing to 1.
template <typename TEngine, typename TLogits>
std::vector<TLogits> randomLogits(runtime::SizeType32 vocabSize, TEngine* engine)
{
    if constexpr (std::disjunction_v<std::is_floating_point<TLogits>, std::is_same<TLogits, half>>)
    {
        // This algorithm ensures the resulting values sum to 1 by:
        // 1. Sampling vocabSize - 1 values in the interval [0, 1]
        // 2. Sorting the sampled values and keeping a last value equal to 1
        // 3. Taking the adjacent differences of the sorted values
        // Since the values are sorted and the last value is 1, all the differences are positive and must sum to 1:
        // the differences telescope, so the sum of the first n differences equals the n-th sorted value, and the
        // total equals the final value, 1. For example, sorted samples {0.2, 0.5, 1.0} yield differences
        // {0.2, 0.3, 0.5}, which sum to 1. It is also helpful to convince yourself of it with a quick drawing.
        auto distribution = std::uniform_real_distribution<float>(0, 1);
        std::vector<float> samples(vocabSize);
        samples.back() = 1.0;
        std::transform(samples.begin(), samples.end() - 1, samples.begin(),
            [&](auto const /*i*/) { return distribution(*engine); });
        std::sort(samples.begin(), samples.end() - 1);
        std::vector<float> result(vocabSize);
        std::adjacent_difference(samples.begin(), samples.end(), result.begin());
        if constexpr (std::is_same_v<TLogits, float>)
        {
            return result;
        }
        if constexpr (std::is_same_v<TLogits, half>)
        {
            std::vector<half> halfResults(vocabSize);
            std::transform(
                result.begin(), result.end(), halfResults.begin(), [&](auto const f) { return __float2half(f); });
            return halfResults;
        }
    }
    TLLM_THROW("Unsupported logits type.");
}
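
// Usage sketch (illustrative, not part of the original header):
//
//   std::mt19937 engine{42};
//   auto probs = randomLogits<std::mt19937, float>(/*vocabSize=*/8, &engine);
//   // probs.size() == 8, and std::accumulate(probs.begin(), probs.end(), 0.0f)
//   // is approximately 1.0f (up to floating point rounding).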
std::vector<tensorrt_llm::executor::TokenIdType> createConsecutiveTokenSequence(
    tr::SizeType32 length, tr::SizeType32 vocabSize, tr::TokenIdType firstTokenId);
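
// Example (an assumption based on the signature alone; the actual behavior,
// e.g. how ids near vocabSize are handled, is not confirmed by this header):
//
//   createConsecutiveTokenSequence(4, /*vocabSize=*/50257, /*firstTokenId=*/10)
//   would be expected to yield {10, 11, 12, 13}.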
/**
 * GPU timer for recording the elapsed time across kernel(s) launched in a GPU stream
 */
struct GpuTimer
{
    cudaStream_t _stream_id;
    cudaEvent_t _start;
    cudaEvent_t _stop;

    /// Constructor
    GpuTimer()
        : _stream_id(0)
    {
        TLLM_CUDA_CHECK(cudaEventCreate(&_start));
        TLLM_CUDA_CHECK(cudaEventCreate(&_stop));
    }

    /// Destructor
    ~GpuTimer()
    {
        TLLM_CUDA_CHECK(cudaEventDestroy(_start));
        TLLM_CUDA_CHECK(cudaEventDestroy(_stop));
    }

    /// Start the timer for a given stream (defaults to the default stream)
    void start(cudaStream_t stream_id = 0)
    {
        _stream_id = stream_id;
        TLLM_CUDA_CHECK(cudaEventRecord(_start, _stream_id));
    }

    /// Stop the timer
    void stop()
    {
        TLLM_CUDA_CHECK(cudaEventRecord(_stop, _stream_id));
    }

    /// Return the elapsed time (in milliseconds)
    float elapsed_millis()
    {
        float elapsed = 0.0;
        TLLM_CUDA_CHECK(cudaEventSynchronize(_stop));
        TLLM_CUDA_CHECK(cudaEventElapsedTime(&elapsed, _start, _stop));
        return elapsed;
    }
};
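
// Usage sketch (illustrative, not part of the original header; myKernel, grid,
// block and stream are hypothetical):
//
//   GpuTimer timer;
//   timer.start(stream);                     // record the start event on the stream
//   myKernel<<<grid, block, 0, stream>>>();  // launch the work to be timed
//   timer.stop();                            // record the stop event
//   float ms = timer.elapsed_millis();       // sync on the stop event and read the time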
} // namespace tensorrt_llm::testing