/* * SPDX-FileCopyrightText: Copyright (c) 2023-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: NVIDIA TensorRT Source Code License Agreement * * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual * property and proprietary rights in and to this material, related * documentation and any modifications thereto. Any use, reproduction, * disclosure or distribution of this material and related documentation * without an express license agreement from NVIDIA CORPORATION or * its affiliates is strictly prohibited. */ #include "executorTest.h" #include "modelSpec.h" #include "tensorrt_llm/common/logger.h" #include "tensorrt_llm/executor/executor.h" #include "tensorrt_llm/executor/types.h" #include "tensorrt_llm/runtime/iBuffer.h" #include "tensorrt_llm/runtime/iTensor.h" #include "tensorrt_llm/runtime/utils/mpiUtils.h" #include "tensorrt_llm/runtime/utils/numpyUtils.h" #include "tests/utils/common.h" #include #include #include #include #include #include #include #include namespace tr = tensorrt_llm::runtime; using namespace tensorrt_llm::testing; using namespace tensorrt_llm::executor; using namespace std::chrono_literals; using tensorrt_llm::testing::KVCacheType; namespace { std::string getEncDecEnginePath(std::string const& modelName, SizeType32 tp, SizeType32 pp, SizeType32 cp) { return modelName + '/' + std::to_string(tp * pp * cp) + "-gpu/float16"; } TokenIdType getDecTokenFromJsonConfig(std::filesystem::path decEnginePath, std::string const& token_name) { TokenIdType tokenId = 0; try { std::ifstream decoderJsonConfigPath(decEnginePath / "config.json"); auto const decoderPretrainedConfig = nlohmann::json::parse(decoderJsonConfigPath, nullptr, true, true).at("pretrained_config"); tokenId = decoderPretrainedConfig.at(token_name).template get(); } catch (nlohmann::json::out_of_range& e) { TLLM_LOG_ERROR( "Parameter %s cannot be found from decoder config.json in pretrained_config. Using default id 0.", token_name.c_str()); } catch (nlohmann::json::type_error const& e) { TLLM_LOG_ERROR( "Parameter %s has a different type from decoder config.json in pretrained_config. Using default id 0.", token_name.c_str()); } return tokenId; } } // namespace using EncDecParamsType = std::tuple>; std::string generateTestNameEncDec(testing::TestParamInfo const& info) { auto modelName = std::get<0>(info.param); auto const beamWidth = std::get<1>(info.param); auto const maxNewTokens = std::get<2>(info.param); auto const tp = std::get<3>(info.param); auto const pp = std::get<4>(info.param); // GTEST does not allow '-' in its test name for (auto& c : modelName) { if (c == '-') { c = '_'; } } std::string name = "EncDecTest"; name.append("_" + modelName); name.append("_BeamWidth" + std::to_string(beamWidth)); name.append("_MaxNewTokens" + std::to_string(maxNewTokens)); name.append("_TP" + std::to_string(tp)); name.append("_PP" + std::to_string(pp)); return name; } bool isLanguageAdapterName(std::string const& modelName) { return modelName == LANGUAGE_ADAPTER_NAME; } class EncDecParamsTest : public GptExecutorTest, public ::testing::WithParamInterface { }; TEST_P(EncDecParamsTest, validEncDecCtor) { auto const modelName = std::get<0>(GetParam()); SizeType32 const beamWidth = std::get<1>(GetParam()); SizeType32 const maxNewTokens = std::get<2>(GetParam()); SizeType32 const tp = std::get<3>(GetParam()); SizeType32 const pp = std::get<4>(GetParam()); SizeType32 const cp = std::get<5>(GetParam()); auto const enginePathName = getEncDecEnginePath(modelName, tp, pp, cp); std::filesystem::path encEnginePath = ENC_DEC_ENGINE_BASE / enginePathName / "encoder"; std::filesystem::path decEnginePath = ENC_DEC_ENGINE_BASE / enginePathName / "decoder"; ExecutorConfig executorConfig{}; FloatType freeGpuMemoryFraction = 0.5f; FloatType crossKvCacheFraction = 0.5f; KvCacheConfig kvCacheConfig{false, std::nullopt, std::nullopt, std::nullopt, freeGpuMemoryFraction}; kvCacheConfig.setCrossKvCacheFraction(crossKvCacheFraction); executorConfig.setKvCacheConfig(kvCacheConfig); auto executor = Executor(encEnginePath, decEnginePath, ModelType::kENCODER_DECODER, executorConfig); } TEST_P(EncDecParamsTest, Forward) { bool constexpr VERBOSE = false; auto const modelName = std::get<0>(GetParam()); SizeType32 const beamWidth = std::get<1>(GetParam()); SizeType32 const maxNewTokens = std::get<2>(GetParam()); SizeType32 const tp = std::get<3>(GetParam()); SizeType32 const pp = std::get<4>(GetParam()); SizeType32 const cp = std::get<5>(GetParam()); // Parameters for language adapter test SizeType32 const numLanguages = std::get<6>(GetParam()); std::vector languageAdapterUids = std::get<7>(GetParam()); bool const streaming = false; auto const enginePathName = getEncDecEnginePath(modelName, tp, pp, cp); std::filesystem::path encEnginePath = ENC_DEC_ENGINE_BASE / enginePathName / "encoder"; std::filesystem::path decEnginePath = ENC_DEC_ENGINE_BASE / enginePathName / "decoder"; // load ground truth input & output data auto manager = tr::BufferManager(std::make_shared()); auto inputsIdsHost = tr::utils::loadNpy(manager, (ENC_DEC_DATA_BASE / "input_ids.npy").string(), tr::MemoryType::kCPU); auto inputsIdsPtr = tr::bufferCast(*inputsIdsHost); auto inputLengthsHost = tr::utils::loadNpy(manager, (ENC_DEC_DATA_BASE / "input_lengths.npy").string(), tr::MemoryType::kCPU); auto inputLengthsPtr = tr::bufferCast(*inputLengthsHost); auto encoderOutputHost = tr::utils::loadNpy(manager, (ENC_DEC_DATA_BASE / "encoder_output.npy").string(), tr::MemoryType::kCPU); auto encoderOutputPtr = tr::bufferCast(*encoderOutputHost); auto decoderOutputHost = tr::utils::loadNpy(manager, (ENC_DEC_DATA_BASE / "output_ids_beam").string() + std::to_string(beamWidth) + ".npy", tr::MemoryType::kCPU); auto decoderOutputPtr = tr::bufferCast(*decoderOutputHost); // Rank and size info auto& comm = tensorrt_llm::mpi::MpiComm::world(); auto const worldRank = comm.getRank(); auto const worldSize = comm.getSize(); // create executor BatchingType const batchingType = BatchingType::kINFLIGHT; FloatType freeGpuMemoryFraction = 0.5f; FloatType crossKvCacheFraction = 0.5f; KvCacheConfig kvCacheConfig{false, std::nullopt, std::nullopt, std::nullopt, freeGpuMemoryFraction}; kvCacheConfig.setCrossKvCacheFraction(crossKvCacheFraction); ExecutorConfig executorConfig{beamWidth}; executorConfig.setBatchingType(batchingType); executorConfig.setKvCacheConfig(kvCacheConfig); executorConfig.setNormalizeLogProbs(false); // TODO: OrchestratorMode test does not pass bool const useOrchestratorMode = (tp * pp) > worldSize; std::optional orchestratorConfig = std::nullopt; if (useOrchestratorMode) { orchestratorConfig = OrchestratorConfig(true, PathUtil::EXECUTOR_WORKER_PATH()); } auto parallelConfig = ParallelConfig(CommunicationType::kMPI, useOrchestratorMode ? CommunicationMode::kORCHESTRATOR : CommunicationMode::kLEADER, std::nullopt, std::nullopt, orchestratorConfig); executorConfig.setParallelConfig(parallelConfig); auto executor = Executor(encEnginePath, decEnginePath, ModelType::kENCODER_DECODER, executorConfig); OutputConfig outConfig; outConfig.excludeInputFromOutput = false; outConfig.returnLogProbs = false; outConfig.returnGenerationLogits = false; outConfig.returnContextLogits = false; outConfig.returnEncoderOutput = false; TokenIdType bosId = getDecTokenFromJsonConfig(decEnginePath, "bos_token_id"); TokenIdType padId = getDecTokenFromJsonConfig(decEnginePath, "pad_token_id"); TokenIdType eosId = getDecTokenFromJsonConfig(decEnginePath, "eos_token_id"); TokenIdType decoderStartTokenId = getDecTokenFromJsonConfig(decEnginePath, "decoder_start_token_id"); bool const isLanguageAdapterTest = isLanguageAdapterName(modelName); // create requests SizeType32 const nbRequests = inputLengthsHost->getShape().d[0]; std::vector requests; for (int i = 0, cumInputLen = 0; i < nbRequests; i++) { auto encoderInput = VecTokens(&inputsIdsPtr[cumInputLen], &inputsIdsPtr[cumInputLen] + inputLengthsPtr[i]); // assume inputIds is flattened / no-padding cumInputLen += inputLengthsPtr[i]; auto decoderInput = VecTokens{decoderStartTokenId}; Request req(decoderInput, maxNewTokens, streaming, tensorrt_llm::executor::SamplingConfig(beamWidth), outConfig, eosId, padId); req.setEncoderInputTokenIds(encoderInput); if (isLanguageAdapterTest) { req.setLanguageAdapterUid(languageAdapterUids[i]); } requests.emplace_back(req); } using namespace std::chrono; // enqueue requests if (worldRank == 0) { auto tik = high_resolution_clock::now(); std::vector reqIds = executor.enqueueRequests(std::move(requests)); // get responses milliseconds waitTime(5000); auto responsesAll = executor.awaitResponses(reqIds, waitTime); auto tok = high_resolution_clock::now(); TLLM_LOG_DEBUG("TRT-LLM C++ E2E time %d ms", duration_cast(tok - tik).count()); TLLM_LOG_DEBUG("Number of responses: %d", responsesAll.size()); int32_t numFinished = 0; int iter = 0; SizeType32 numResponses = 0; std::unordered_map> outputTokens; for_each(reqIds.begin(), reqIds.end(), [&outputTokens, &beamWidth](auto const& id) { TLLM_LOG_DEBUG("Request IDs: %d", id); outputTokens[id] = {}; for (int i = 0; i < beamWidth; i++) { outputTokens[id].emplace_back(VecTokens{}); } }); for (int i = 0; i < reqIds.size(); i++) { auto& responses = responsesAll[i]; for (auto& response : responses) { numResponses++; if (!response.hasError()) { auto result = response.getResult(); numFinished += result.isFinal; for (int beam = 0; beam < beamWidth; beam++) { auto& resTokens = result.outputTokenIds.at(beam); auto& outTokens = outputTokens.at(response.getRequestId()).at(beam); outTokens.insert(outTokens.end(), std::make_move_iterator(resTokens.begin()), std::make_move_iterator(resTokens.end())); } } else { // Allow response with error only if awaitResponse processed a terminated request id std::string err = "ReqId " + std::to_string(response.getRequestId()) + " has already been processed and was terminated."; EXPECT_EQ(response.getErrorMsg(), err); } } } // print output & check correctness with ground truth for (auto const& [reqId, tokens] : outputTokens) { SizeType32 gtMaxLength = decoderOutputHost->getShape().d[1]; auto gtOutput = decoderOutputPtr + (reqId - 1) * gtMaxLength; if constexpr (VERBOSE) { std::cout << ">>> Request ID: " << reqId << std::endl; for (int beam = 0; beam < beamWidth; beam++) { std::cout << "output tokens, beam " << beam << ", output length " << tokens[beam].size() << ": " << std::endl; for_each(tokens[beam].begin(), tokens[beam].end(), [](auto const& token) { std::cout << token << ", "; }); std::cout << std::endl; } std::cout << "ground truth tokens: " << std::endl; SizeType32 gtLength = 0; for (int i = 0; i < gtMaxLength; i++) { if (gtOutput[i] != eosId) { std::cout << gtOutput[i] << ", "; gtLength++; } } std::cout << std::endl; std::cout << "ground truth length: " << gtLength << std::endl; } // check token-by-token match between beam 0 & ground truth ASSERT_TRUE(tokens.size() <= gtMaxLength) << "Request ID " << reqId << "'s generated length is longer than ground truth length " << gtMaxLength; for (int i = 0; i < gtMaxLength; i++) { if (outConfig.excludeInputFromOutput) { // if results exclude decoder start token, skip it in ground truth too continue; } if (i < tokens[0].size()) { ASSERT_EQ(tokens[0][i], gtOutput[i]) << "Generated token id: " << tokens[0][i] << " v.s. ground truth: " << gtOutput[i]; } else { ASSERT_EQ(gtOutput[i], eosId) << "Request ID " << reqId << "'s generated length " << tokens.size() << " is shorter than ground truth length " << gtMaxLength; } } } } } TEST_P(EncDecParamsTest, ExecutorKVCacheManager) { auto const modelName = std::get<0>(GetParam()); SizeType32 const beamWidth = std::get<1>(GetParam()); SizeType32 const maxNewTokens = std::get<2>(GetParam()); SizeType32 const tp = std::get<3>(GetParam()); SizeType32 const pp = std::get<4>(GetParam()); SizeType32 const cp = std::get<5>(GetParam()); auto const enginePathName = getEncDecEnginePath(modelName, tp, pp, cp); std::filesystem::path encEnginePath = ENC_DEC_ENGINE_BASE / enginePathName / "encoder"; ExecutorConfig executorConfig{}; executorConfig.setBatchingType(BatchingType::kSTATIC); auto executor = Executor(encEnginePath, ModelType::kENCODER_ONLY, executorConfig); EXPECT_EQ(executor.getKVCacheEventManager(), std::nullopt); } INSTANTIATE_TEST_SUITE_P(T5BasicTest, EncDecParamsTest, testing::Combine(testing::Values(T5_NAME), testing::Values(1), testing::Values(64), testing::Values(1), testing::Values(1), testing::Values(1), testing::Values(0), testing::Values(std::vector{})), generateTestNameEncDec); INSTANTIATE_TEST_SUITE_P(T5Beam2Test, EncDecParamsTest, testing::Combine(testing::Values(T5_NAME), testing::Values(2), testing::Values(64), testing::Values(1), testing::Values(1), testing::Values(1), testing::Values(0), testing::Values(std::vector{})), generateTestNameEncDec); INSTANTIATE_TEST_SUITE_P(T5MultiGPUTest, EncDecParamsTest, testing::Combine(testing::Values(T5_NAME), testing::Values(1), testing::Values(64), testing::Values(4), testing::Values(1), testing::Values(1), testing::Values(0), testing::Values(std::vector{})), generateTestNameEncDec); INSTANTIATE_TEST_SUITE_P(BartBasicTest, EncDecParamsTest, testing::Combine(testing::Values(BART_NAME), testing::Values(1), testing::Values(64), testing::Values(1), testing::Values(1), testing::Values(1), testing::Values(0), testing::Values(std::vector{})), generateTestNameEncDec); INSTANTIATE_TEST_SUITE_P(BartBeam2Test, EncDecParamsTest, testing::Combine(testing::Values(BART_NAME), testing::Values(2), testing::Values(64), testing::Values(1), testing::Values(1), testing::Values(1), testing::Values(0), testing::Values(std::vector{})), generateTestNameEncDec); INSTANTIATE_TEST_SUITE_P(BartMultiGPUTest, EncDecParamsTest, testing::Combine(testing::Values(BART_NAME), testing::Values(1), testing::Values(64), testing::Values(4), testing::Values(1), testing::Values(1), testing::Values(0), testing::Values(std::vector{})), generateTestNameEncDec); INSTANTIATE_TEST_SUITE_P(LanguageAdapterBasicTest, EncDecParamsTest, testing::Combine(testing::Values(LANGUAGE_ADAPTER_NAME), testing::Values(1), testing::Values(64), testing::Values(1), testing::Values(1), testing::Values(1), testing::Values(4), testing::Values(std::vector{2, 3})), generateTestNameEncDec);