Update TensorRT-LLM (#2253)

* Update TensorRT-LLM

---------

Co-authored-by: Ivan Sorokin <isorokin@nvidia.com>
Co-authored-by: lkm2835 <lkm2835@gmail.com>
Kaiyu Xie 2024-09-24 23:27:31 +08:00 committed by GitHub
parent a65dba7aaf
commit e153372759
98 changed files with 1719 additions and 512 deletions

View File

@ -17,6 +17,15 @@ TensorRT-LLM
<div align="left">
## Latest News
* [2024/09/17] ✨ NVIDIA TensorRT-LLM Meetup
[➡️ link](https://drive.google.com/file/d/1RR8GqC-QbuaKuHj82rZcXb3MS20SWo6F/view?usp=share_link)
* [2024/09/17] ✨ Accelerating LLM Inference at Databricks with TensorRT-LLM
[➡️ link](https://drive.google.com/file/d/1NeSmrLaWRJAY1rxD9lJmzpB9rzr38j8j/view?usp=sharing)
* [2024/09/17] ✨ TensorRT-LLM @ Baseten
[➡️ link](https://drive.google.com/file/d/1Y7L2jqW-aRmt31mCdqhwvGMmCSOzBUjG/view?usp=share_link)
* [2024/09/04] 🏎️🏎️🏎️ Best Practices for Tuning TensorRT-LLM for Optimal Serving with BentoML
[➡️ link](https://www.bentoml.com/blog/tuning-tensor-rt-llm-for-optimal-serving-with-bentoml)
@ -46,6 +55,9 @@ TensorRT-LLM
* [2024/07/02] Let the @MistralAI MoE tokens fly 📈 🚀 #Mixtral 8x7B with NVIDIA #TensorRT #LLM on #H100.
[➡️ Tech blog](https://developer.nvidia.com/blog/achieving-high-mixtral-8x7b-performance-with-nvidia-h100-tensor-core-gpus-and-tensorrt-llm?ncid=so-twit-928467)
<details close>
<summary>Previous News</summary>
* [2024/06/24] Enhanced with NVIDIA #TensorRT #LLM, @upstage.ai's solar-10.7B-instruct is ready to power your developer projects through our API catalog 🏎️. ✨[➡️ link](https://build.nvidia.com/upstage/solar-10_7b-instruct?snippet_tab=Try )
* [2024/06/18] CYMI: 🤩 Stable Diffusion 3 dropped last week 🎊 🏎️ Speed up your SD3 with #TensorRT INT8 Quantization[➡️ link](https://build.nvidia.com/upstage/solar-10_7b-instruct?snippet_tab=Try )
@ -58,10 +70,6 @@ Technical Deep Dive for serious coders ✅+99% compression ✅1 set of weights
* [2024/06/04] ✨ #TensorRT and GeForce #RTX unlock ComfyUI SD superhero powers 🦸⚡ 🎥 Demo: [➡️ link](https://youtu.be/64QEVfbPHyg)
📗 DIY notebook: [➡️ link](https://console.brev.dev/launchable/deploy?userID=2x2sil999&orgID=ktj33l4xj&name=ComfyUI_TensorRT&instance=L4%40g2-standard-4%3Anvidia-l4%3A1&diskStorage=500&cloudID=GCP&baseImage=docker.io%2Fpytorch%2Fpytorch%3A2.2.0-cuda12.1-cudnn8-runtime&ports=ComfUI%3A8188&file=https%3A%2F%2Fgithub.com%2Fbrevdev%2Fnotebooks%2Fblob%2Fmain%2Ftensorrt-comfyui.ipynb&launchableID=env-2hQX3n7ae5mq3NjNZ32DfAG0tJf)
<details close>
<summary>Previous News</summary>
* [2024/05/28] ✨#TensorRT weight stripping for ResNet-50 ✨ ✅+99% compression
✅1 set of weights → ** GPUs\ ✅0 performance loss ✅** models…LLM, CNN, etc
👀 📚 DIY [➡️ link](https://console.brev.dev/launchable/deploy?userID=2x2sil999&orgID=ktj33l4xj&launchableID=env-2h6bym7h5GFNho3vpWQQeUYMwTM&instance=L4%40g6.xlarge&diskStorage=500&cloudID=devplane-brev-1&baseImage=nvcr.io%2Fnvidia%2Ftensorrt%3A24.05-py3&file=https%3A%2F%2Fgithub.com%2FNVIDIA%2FTensorRT%2Fblob%2Frelease%2F10.0%2Fsamples%2Fpython%2Fsample_weight_stripping%2Fnotebooks%2Fweight_stripping.ipynb&name=tensorrt_weight_stripping_resnet50)
@ -71,10 +79,8 @@ Serverless TensorRT-LLM (LLaMA 3 8B) | Modal Docs [➡️ link](https://modal.co
* [2024/05/08] NVIDIA TensorRT Model Optimizer -- the newest member of the #TensorRT ecosystem is a library of post-training and training-in-the-loop model optimization techniques ✅quantization ✅sparsity ✅QAT [➡️ blog](https://developer.nvidia.com/blog/accelerate-generative-ai-inference-performance-with-nvidia-tensorrt-model-optimizer-now-publicly-available/)
* [2024/05/07] 🦙🦙🦙 24,000 tokens per second 🛫Meta Llama 3 takes off with #TensorRT #LLM 📚[➡️ link](https://blogs.nvidia.com/blog/meta-llama3-inference-acceleration/)
* [2024/02/06] [🚀 Speed up inference with SOTA quantization techniques in TRT-LLM](./docs/source/blogs/quantization-in-TRT-LLM.md)
* [2024/01/30] [ New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget](./docs/source/blogs/XQA-kernel.md)
* [2023/12/04] [Falcon-180B on a single H200 GPU with INT4 AWQ, and 6.7x faster Llama-70B over A100](./docs/source/blogs/Falcon180B-H200.md)

View File

@ -40,24 +40,21 @@ namespace tensorrt_llm::batch_manager
* @brief The state of the request.
*
* Enum order must follow chronological order for state dependency check, @see hasReachedState().
*
* @todo(rkobus): refactor
*/
enum LlmRequestState_t
enum class LlmRequestState : int32_t
{
REQUEST_STATE_UNKNOWN = 0, ///< Unknown state
REQUEST_STATE_ENCODER_INIT = 1, ///< Encoder phase starts (for encoder-decoder models)
REQUEST_STATE_CONTEXT_INIT = 2, ///< Context phase starts
REQUEST_STATE_GENERATION_IN_PROGRESS = 3, ///< Generation phase is in progress
REQUEST_STATE_GENERATION_TO_COMPLETE = 4, ///< Generation phase is to be completed
REQUEST_STATE_GENERATION_COMPLETE = 5, ///< Generation phase completed
REQUEST_STATE_DISAGG_GENERATION_INIT = 6, ///< For disaggregated serving only:
/// new Generation request arrived at generation model
REQUEST_STATE_DISAGG_CONTEXT_TRANS_IN_PROGRESS = 7, ///< For disaggregated serving only:
/// Waiting context-only request transmitting the kv cache
REQUEST_STATE_DISAGG_CONTEXT_COMPLETE = 8, ///< Context-only request finished kv cache transmission.
REQUEST_STATE_DISAGG_GENERATION_TRANS_IN_PROGRESS
= 9, ///< For disaggregated serving only: transmitting the kv cache
kUNKNOWN = 0, ///< Unknown state
kENCODER_INIT = 1, ///< Encoder phase starts (for encoder-decoder models)
kCONTEXT_INIT = 2, ///< Context phase starts
kGENERATION_IN_PROGRESS = 3, ///< Generation phase is in progress
kGENERATION_TO_COMPLETE = 4, ///< Generation phase is to be completed
kGENERATION_COMPLETE = 5, ///< Generation phase completed
kDISAGG_GENERATION_INIT = 6, ///< For disaggregated serving only:
/// new Generation request arrived at generation model
kDISAGG_CONTEXT_TRANS_IN_PROGRESS = 7, ///< For disaggregated serving only:
/// Waiting context-only request transmitting the kv cache
kDISAGG_CONTEXT_COMPLETE = 8, ///< Context-only request finished kv cache transmission.
kDISAGG_GENERATION_TRANS_IN_PROGRESS = 9, ///< For disaggregated serving only: transmitting the kv cache
};
enum LlmRequestType
@ -115,7 +112,7 @@ public:
, mPromptLen(inputTokens->size())
, mMaxNewTokens(maxNewTokens)
, mSamplingConfig(samplingConfig)
, mState(REQUEST_STATE_CONTEXT_INIT)
, mState(LlmRequestState::kCONTEXT_INIT)
, mEndId(endId)
, mPadId(padId)
, mLogitsPostProcessor(logitsPostProcessor)
@ -160,7 +157,7 @@ public:
{
if (mEncoderTokens.has_value() || encoderInputFeatures.has_value())
{
mState = REQUEST_STATE_ENCODER_INIT;
mState = LlmRequestState::kENCODER_INIT;
}
initialize(*inputTokens, returnLogProbs);
@ -171,7 +168,7 @@ public:
, mPromptLen(req.getInputTokenIds().size())
, mMaxNewTokens(req.getMaxTokens())
, mSamplingConfig(req.getSamplingConfig(), req.getExternalDraftTokensConfig())
, mState(REQUEST_STATE_CONTEXT_INIT)
, mState(LlmRequestState::kCONTEXT_INIT)
, mEndId(req.getEndId())
, mPadId(req.getPadId())
, mClientId(req.getClientId())
@ -213,7 +210,7 @@ public:
{
if (req.getRequestType() == executor::RequestType::REQUEST_TYPE_GENERATION_ONLY)
{
mState = REQUEST_STATE_DISAGG_GENERATION_INIT;
mState = LlmRequestState::kDISAGG_GENERATION_INIT;
}
if (mIsStreaming && mSamplingConfig.beamWidth > 1 && !mReturnAllGeneratedTokens)
{
@ -237,7 +234,7 @@ public:
if (req.getEncoderInputTokenIds().has_value() || req.getEncoderInputFeatures().has_value())
{
mState = REQUEST_STATE_ENCODER_INIT;
mState = LlmRequestState::kENCODER_INIT;
if (req.getEncoderInputTokenIds().has_value())
{
mEncoderTokens = std::make_shared<VecTokens>(req.getEncoderInputTokenIds().value());
@ -716,8 +713,8 @@ public:
}
// for enc-dec models, pause means saving generated tokens to prompt but need to re-do encoder phase
mState = mEncoderTokens.has_value() || mEncoderInputFeatures ? REQUEST_STATE_ENCODER_INIT
: REQUEST_STATE_CONTEXT_INIT;
mState = mEncoderTokens.has_value() || mEncoderInputFeatures ? LlmRequestState::kENCODER_INIT
: LlmRequestState::kCONTEXT_INIT;
mContextCurrentPosition = 0;
mContextChunkSize = std::nullopt;
mSeqSlot.reset();
@ -1101,44 +1098,44 @@ public:
mGenerationLogitsFragments.clear();
}
[[nodiscard]] bool hasReachedState(LlmRequestState_t state) const noexcept
[[nodiscard]] bool hasReachedState(LlmRequestState state) const noexcept
{
return mState >= state;
}
[[nodiscard]] bool isEncoderInitState() const noexcept
{
return mState == REQUEST_STATE_ENCODER_INIT;
return mState == LlmRequestState::kENCODER_INIT;
}
[[nodiscard]] bool isContextInitState() const noexcept
{
return mState == REQUEST_STATE_CONTEXT_INIT;
return mState == LlmRequestState::kCONTEXT_INIT;
}
[[nodiscard]] bool isGenerationInProgressState() const noexcept
{
return mState == REQUEST_STATE_GENERATION_IN_PROGRESS || mState == REQUEST_STATE_GENERATION_TO_COMPLETE;
return mState == LlmRequestState::kGENERATION_IN_PROGRESS || mState == LlmRequestState::kGENERATION_TO_COMPLETE;
}
[[nodiscard]] bool isGenerationCompleteState() const noexcept
{
return mState == REQUEST_STATE_GENERATION_COMPLETE;
return mState == LlmRequestState::kGENERATION_COMPLETE;
}
[[nodiscard]] bool isDisaggGenerationInitState() const noexcept
{
return mState == REQUEST_STATE_DISAGG_GENERATION_INIT;
return mState == LlmRequestState::kDISAGG_GENERATION_INIT;
}
[[nodiscard]] bool isDisaggContextTransmissionState() const noexcept
{
return mState == REQUEST_STATE_DISAGG_CONTEXT_TRANS_IN_PROGRESS;
return mState == LlmRequestState::kDISAGG_CONTEXT_TRANS_IN_PROGRESS;
}
[[nodiscard]] bool isDisaggContextCompleteState() const noexcept
{
return mState == REQUEST_STATE_DISAGG_CONTEXT_COMPLETE;
return mState == LlmRequestState::kDISAGG_CONTEXT_COMPLETE;
}
/// To determine whether the context is unchunked. When a context is chunked into only a part, it
@ -1252,7 +1249,7 @@ public:
std::optional<executor::Response> createResponse()
{
TLLM_CHECK(!isDisaggContextCompleteState());
if (isGenerationCompleteState() || (mIsStreaming && isGenerationInProgressState())
if (isGenerationCompleteState() || (mIsStreaming && mState == LlmRequestState::kGENERATION_IN_PROGRESS)
|| isDisaggContextTransmissionState())
{
TLLM_LOG_DEBUG("Creating response for request %lu", mRequestId);
@ -1400,7 +1397,7 @@ public:
SizeType32 mMaxNewTokens;
// Tokens [beam_size, mPromptLen + getMaxNumGeneratedTokens()]
runtime::SamplingConfig mSamplingConfig;
LlmRequestState_t mState;
LlmRequestState mState;
std::optional<TokenIdType> mEndId;
std::optional<TokenIdType> mPadId;
std::optional<SizeType32> mSeqSlot;
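
As an aside, here is a self-contained sketch (not part of this commit) of why the doc comment above insists on chronological enumerator order: the `>=` comparison used by `hasReachedState` only answers "has the request at least reached this phase?" when later phases map to larger values. The enum below mirrors a subset of the enumerators above purely for illustration.

```cpp
// Standalone illustration; mirrors a subset of LlmRequestState shown above.
#include <cstdint>

enum class LlmRequestState : std::int32_t
{
    kUNKNOWN = 0,
    kCONTEXT_INIT = 2,
    kGENERATION_IN_PROGRESS = 3,
    kGENERATION_COMPLETE = 5,
};

constexpr bool hasReachedState(LlmRequestState current, LlmRequestState target)
{
    return current >= target; // same check as the hasReachedState() member above
}

static_assert(hasReachedState(LlmRequestState::kGENERATION_IN_PROGRESS, LlmRequestState::kCONTEXT_INIT),
    "a generating request has already passed through the context phase");
static_assert(!hasReachedState(LlmRequestState::kCONTEXT_INIT, LlmRequestState::kGENERATION_COMPLETE),
    "a request still in context init has not completed generation");
```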

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:679f0879be2232dc93113c5d96b128628fa2f518cc7aebffd12cbd6b06d68573
size 4667768
oid sha256:e08b60b89bb4934490ee61383c55c22d831fa1cfcccedea5735400e3574aadbc
size 4671466

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:6dd4a4c8600f6da076c9c60047043c23a2d020528d833572f3c2a0fcfce8cf12
size 4772870
oid sha256:2b6b3bf449c4b4d67f0bb9879af6b8eda6f46f272eaa5b7305582a2cc8c73e17
size 4775694

View File

@ -1,3 +1,3 @@
8a04a7d0057b71b63a9c6e4f33cc30e7 libtensorrt_llm_batch_manager_static.a
915451635c4e57cd8fd49a6dedb22ab2 libtensorrt_llm_batch_manager_static.pre_cxx11.a
bb95842cfaecdd3437b38fdb5c45b7d523bc5d36 commit
f229593e4699180b52e38f99c8ac31dc libtensorrt_llm_batch_manager_static.a
440b3ae47982d88fc8517c5f01f67b3c libtensorrt_llm_batch_manager_static.pre_cxx11.a
0144f02d9054532ac5b39f5accd89e607f5966a2 commit

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:0d63dbd6150c592cea517b6490cc69a5d60aede23362b885300de4c0b248ba50
size 4519402
oid sha256:1a71c70d555673ce9a5086c27cbd27642f940d2439effb72a75f1302725a2513
size 4522988

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:4be8b156080d2bb8b3f80f1ecd02a01b9dfad6f8044ba1f87a416c4d8e7dd1f3
size 4483702
oid sha256:93c436037b41d06d735acfbf065ccef4ea50085052920cb6a54fb3f84c59fb12
size 4486958

View File

@ -1,3 +1,3 @@
a46c69375658ab41016ef6e7c4744135 libtensorrt_llm_batch_manager_static.a
eb86de29ef2413010975fed7106356b7 libtensorrt_llm_batch_manager_static.pre_cxx11.a
bb95842cfaecdd3437b38fdb5c45b7d523bc5d36 commit
f402983564a358853384650313301b32 libtensorrt_llm_batch_manager_static.a
44558f89a86de69191f18a2704cff505 libtensorrt_llm_batch_manager_static.pre_cxx11.a
0144f02d9054532ac5b39f5accd89e607f5966a2 commit

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:4a356ece970fe47f6e41dbe1c98b551b5623341eb209294e27fd932cf12b0ee0
size 28212158
oid sha256:1f8f3d6e22edead45c5bde864b541311a4b9a28f1916cd7b5bbf1292746c06c5
size 28211626

View File

@ -1,2 +1,2 @@
51e88c8d94071a4dc24f8eea43bf8c97 tensorrt_llm_batch_manager_static.lib
bb95842cfaecdd3437b38fdb5c45b7d523bc5d36 commit
7b8f52e42d11765115c185d7d13e40a3 tensorrt_llm_batch_manager_static.lib
0144f02d9054532ac5b39f5accd89e607f5966a2 commit

View File

@ -16,6 +16,7 @@
#include <algorithm>
#include <numeric>
#include <unordered_set>
#include "tensorrt_llm/common/mpiUtils.h"
@ -127,7 +128,6 @@ std::vector<int> getWorldRanks(MpiComm const& comm)
MPICHECK(MPI_Group_translate_ranks(group, groupSize, ranks.data(), worldGroup, worldRanks.data()));
MPICHECK(MPI_Group_free(&group));
MPICHECK(MPI_Group_free(&worldGroup));
std::sort(worldRanks.begin(), worldRanks.end());
#else
std::vector<int> worldRanks{0};
#endif
@ -391,31 +391,30 @@ MpiComm& MpiComm::mutableLocalSession()
void MpiComm::refreshLocalSession()
{
#if ENABLE_MULTI_DEVICE
static std::vector<int> initSessionRanks;
static std::mutex mutex;
std::unique_lock lock(mutex);
if (initSessionRanks.empty())
{
auto initSessionRanks = getWorldRanks(MpiComm::session());
auto localSessionRanks = getWorldRanks(MpiComm::localSession());
std::vector<int> intersectionRanks;
std::set_intersection(initSessionRanks.begin(), initSessionRanks.end(), localSessionRanks.begin(),
localSessionRanks.end(), std::back_inserter(intersectionRanks));
auto initSessionRanks = getWorldRanks(MpiComm::session());
auto localSessionRanks = getWorldRanks(MpiComm::localSession());
MPI_Group worldGroup;
MPICHECK(MPI_Comm_group(MPI_COMM_WORLD, &worldGroup));
MPI_Group localGroup;
MPICHECK(MPI_Group_incl(worldGroup, intersectionRanks.size(), intersectionRanks.data(), &localGroup));
MPI_Comm localComm;
MPICHECK(MPI_Comm_create_group(MPI_COMM_WORLD, localGroup, intersectionRanks.front(), &localComm));
MpiComm::mutableLocalSession().mFreeComm = true;
MpiComm::mutableLocalSession() = MpiComm{localComm, false};
}
else
// Add to intersectionRanks in order of initSessionRanks
std::vector<int> intersectionRanks;
std::unordered_set<int> localSessionRanksSet(localSessionRanks.begin(), localSessionRanks.end());
for (auto rank : initSessionRanks)
{
TLLM_CHECK_WITH_INFO(getWorldRanks(MpiComm::session()) == initSessionRanks,
"Executors in the same process must use the same participant IDs.");
if (localSessionRanksSet.find(rank) != localSessionRanksSet.end())
{
intersectionRanks.push_back(rank);
}
}
MPI_Group worldGroup;
MPICHECK(MPI_Comm_group(MPI_COMM_WORLD, &worldGroup));
MPI_Group localGroup;
MPICHECK(MPI_Group_incl(worldGroup, intersectionRanks.size(), intersectionRanks.data(), &localGroup));
MPI_Comm localComm;
MPICHECK(MPI_Comm_create_group(MPI_COMM_WORLD, localGroup, intersectionRanks.front(), &localComm));
MpiComm::mutableLocalSession().mFreeComm = true;
MpiComm::mutableLocalSession() = MpiComm{localComm, false};
TLLM_LOG_INFO("Refreshed the MPI local session");
#endif // ENABLE_MULTI_DEVICE
}
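
As an aside, here is a standalone sketch (not TensorRT-LLM code) of the intersection strategy introduced above: unlike `std::set_intersection`, which expects sorted ranges and yields a sorted result, the new loop keeps ranks in the order of the initial session, so `intersectionRanks.front()` stays the first session rank rather than the numerically smallest one.

```cpp
// Standalone illustration of the order-preserving intersection used in refreshLocalSession():
// keep every rank of sessionRanks that also appears in localRanks, preserving sessionRanks order.
#include <unordered_set>
#include <vector>

std::vector<int> orderedIntersection(std::vector<int> const& sessionRanks, std::vector<int> const& localRanks)
{
    std::unordered_set<int> localSet(localRanks.begin(), localRanks.end());
    std::vector<int> intersection;
    for (int rank : sessionRanks)
    {
        if (localSet.count(rank) != 0)
        {
            intersection.push_back(rank); // the first element becomes the root of the new communicator
        }
    }
    return intersection;
}
```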

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:674d432118c54977329079a06882c84d0d25b764759dfbbaaff0c7bc666eef57
size 1741228
oid sha256:954f77299e1d61a038c90bc578936ec06da842909ace2e8ba978fd0c0da0cc1f
size 1782460

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:9c5fed43853462c21cddc12ff3439c410f413c24fd5c970f9a315265df1bb932
size 1768982
oid sha256:4ac77ca2662830a5990dbd06ee4d664f8ac97dc342206f5c51ca9f9ca6cb6ce1
size 1808956

View File

@ -1,3 +1,3 @@
0171ced884334f5d26ff73a701cf5343 libtensorrt_llm_executor_static.a
afc62322817744d37b422e870f202c23 libtensorrt_llm_executor_static.pre_cxx11.a
bb95842cfaecdd3437b38fdb5c45b7d523bc5d36 commit
425c9af7b0ae82e622771fc3ef7e3f01 libtensorrt_llm_executor_static.a
efa5708f62775822591ebd50974ccfd8 libtensorrt_llm_executor_static.pre_cxx11.a
0144f02d9054532ac5b39f5accd89e607f5966a2 commit

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:2faea1a5e1ef32e99fe8570650bc1b300cea4c5c5b1a21652005685b45cf26d7
size 1807914
oid sha256:d74551056bd413556a9485bdb7b084e14264e78f12ae6037472878371f2b3b62
size 1846866

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:192efad1021271bbdb03d1bb82699f761f0f463a3d85cb4d4a8421685fb9a3e4
size 1717616
oid sha256:393724263d6f08b831b8d8b56a3e0677f51df80183ba0d4b1fa7f40c2f8611ca
size 1757514

View File

@ -1,3 +1,3 @@
789c268816a73f2723a9d3d85e02987e libtensorrt_llm_executor_static.a
ca0412898bdc79b3508c68394dd5e3ea libtensorrt_llm_executor_static.pre_cxx11.a
bb95842cfaecdd3437b38fdb5c45b7d523bc5d36 commit
12fd54cc3b5ec8462a9db96d646ea578 libtensorrt_llm_executor_static.a
75fcebc1eae90d05bd3e2d3321a50041 libtensorrt_llm_executor_static.pre_cxx11.a
0144f02d9054532ac5b39f5accd89e607f5966a2 commit

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:f7e9b154c122fedc45327e35b7f46aacb4cd43784f7b6273ee4207c8e2639169
size 19234764
oid sha256:98dcfc32cb6a6fcbd84625f0232f9f6d8305b38a7e8380b7e228be5f820c0dd4
size 19615228

View File

@ -1,2 +1,2 @@
a5b2ac5b4fcc5dfb46b97d0e74fd88b6 tensorrt_llm_executor_static.lib
bb95842cfaecdd3437b38fdb5c45b7d523bc5d36 commit
422bcd24325bf7ec0b26a9e4a23cce63 tensorrt_llm_executor_static.lib
0144f02d9054532ac5b39f5accd89e607f5966a2 commit

View File

@ -1,2 +1,2 @@
88c30973b9b3452baa3f063d34d08169 libtensorrt_llm_nvrtc_wrapper.so
bb95842cfaecdd3437b38fdb5c45b7d523bc5d36 commit
0144f02d9054532ac5b39f5accd89e607f5966a2 commit

View File

@ -1,2 +1,2 @@
95e9f87610383348e444d2d0b8396f2d libtensorrt_llm_nvrtc_wrapper.so
bb95842cfaecdd3437b38fdb5c45b7d523bc5d36 commit
0144f02d9054532ac5b39f5accd89e607f5966a2 commit

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a16c9f1c7c85ab6302be0401c370ddcfb2b5a493cf23348ee8c665ca0af50593
oid sha256:a9a2ccc0462e815aae0e7fd458c0423f76b3c0bb63ecc7a8902b94194803c4bc
size 1128448

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e207a8f57b944529163c7ed2ab30639a5f2779c5118602c6ebd50a623d16f845
oid sha256:e74ab8e65851dfc44e015714fe166f521649b781c85bd0215d42b488218e9ca5
size 3488

View File

@ -1,3 +1,3 @@
b7e624ba775e9f5090ef4b67bcdbd7a2 tensorrt_llm_nvrtc_wrapper.lib
3e1e3245888be6fd9b7d5934fb2a7718 tensorrt_llm_nvrtc_wrapper.dll
bb95842cfaecdd3437b38fdb5c45b7d523bc5d36 commit
700fc148d9a0f939e0088bf69e899360 tensorrt_llm_nvrtc_wrapper.lib
de95527e8d24da458cf9e3c30b21abea tensorrt_llm_nvrtc_wrapper.dll
0144f02d9054532ac5b39f5accd89e607f5966a2 commit

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:82d12e1445abc726a172d3383a2d7759f8a4309656b84d767a98aba7e2876e2c
oid sha256:e29b1c9c454f90b68feffb65f270bba757a649c4cfa26134d21a5c71ecee9d17
size 25364090

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:dbb231f365dc916557d7c2407008072f4bcd8a54c1196c2c4b08b0cbb91fe1d4
oid sha256:616fd0c5db05f9ba95fec4180907a890e470a42fe0d361a212b161c915d61a7b
size 25768990

View File

@ -1,3 +1,3 @@
7379ff8f89a0af16364f4440cbcf53bd libtensorrt_llm_internal_cutlass_kernels_static.a
014c7b9c179959a57faaa2398b63082b libtensorrt_llm_internal_cutlass_kernels_static.pre_cxx11.a
bb95842cfaecdd3437b38fdb5c45b7d523bc5d36 commit
dc54f44211570918194805b138a0c5eb libtensorrt_llm_internal_cutlass_kernels_static.a
7b74b643b98590d9540d62e22764a45d libtensorrt_llm_internal_cutlass_kernels_static.pre_cxx11.a
0144f02d9054532ac5b39f5accd89e607f5966a2 commit

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:03fff62938a4aea11bbd9a3a01d06e1be1eb07333b56084a2523fe3aa771e653
oid sha256:04908e6011b2cda19555da0ca257e0f562719a32da76ba288cd35ae17b19c762
size 44173632

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:b70d2553c65a07f5b5ad0bcecd91c9970e84bf8adcfd70ee13316a8a25787e3e
oid sha256:de44f2b89ef65a7016bd074d6a69610f81752e1cebd6e86b2314c4348cc48cc9
size 43561142

View File

@ -1,3 +1,3 @@
6c7dbbe475d18ef707233096bca3ffcd libtensorrt_llm_internal_cutlass_kernels_static.a
02ea1e93d2dbd74f7c89289256ee7d95 libtensorrt_llm_internal_cutlass_kernels_static.pre_cxx11.a
bb95842cfaecdd3437b38fdb5c45b7d523bc5d36 commit
52fd02865207f1e48afb7e2090c750c3 libtensorrt_llm_internal_cutlass_kernels_static.a
6d1e31066fce1e3c70ec0c56c4b0abb5 libtensorrt_llm_internal_cutlass_kernels_static.pre_cxx11.a
0144f02d9054532ac5b39f5accd89e607f5966a2 commit

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:b59abe44ebe9a79100bed9318941895cd09cdcbebea00b06d6ee29cc177368f9
size 88130432
oid sha256:04f935b72c3af6c2081f19b94216f0bbcecae37511d9958ff6c62304aa022cab
size 88141376

View File

@ -1,2 +1,2 @@
bfe49c735d179edaa07fd0833eab824d tensorrt_llm_internal_cutlass_kernels_static.lib
bb95842cfaecdd3437b38fdb5c45b7d523bc5d36 commit
1ce107485d9c6133e135f4dbb0a5c4dc tensorrt_llm_internal_cutlass_kernels_static.lib
0144f02d9054532ac5b39f5accd89e607f5966a2 commit

View File

@ -121,14 +121,20 @@ private:
class DebugTensor
{
public:
DebugTensor(runtime::ITensor const& tensor, char const* name)
DebugTensor(runtime::ITensor const& tensor, char const* name,
std::shared_ptr<runtime::BufferManager> bufferManager = nullptr,
std::shared_ptr<runtime::CudaStream> stream = nullptr)
: mTensor(tensor)
, mName(name)
, mBufferManager(bufferManager)
, mStream(stream)
{
}
DebugTensor(runtime::ITensor::SharedConstPtr tensor, char const* name)
: DebugTensor(*tensor, name)
DebugTensor(runtime::ITensor::SharedConstPtr tensor, char const* name,
std::shared_ptr<runtime::BufferManager> bufferManager = nullptr,
std::shared_ptr<runtime::CudaStream> stream = nullptr)
: DebugTensor(*tensor, name, bufferManager, stream)
{
}
@ -187,9 +193,11 @@ public:
runtime::BufferManager::ITensorPtr hostPtr{nullptr};
if (mTensor.getMemoryType() == runtime::MemoryType::kGPU)
{
runtime::BufferManager manager{std::make_shared<runtime::CudaStream>()};
hostPtr = manager.copyFrom(mTensor, runtime::MemoryType::kCPU);
manager.getStream().synchronize();
auto theManager = mBufferManager
? mBufferManager
: std::make_shared<runtime::BufferManager>(mStream ? mStream : std::make_shared<runtime::CudaStream>());
hostPtr = theManager->copyFrom(mTensor, runtime::MemoryType::kCPU);
theManager->getStream().synchronize();
}
return hostPtr;
}
@ -343,12 +351,80 @@ public:
TLLM_LOG_DEBUG(shape());
}
template <typename T>
void randomize(runtime::SizeType32 vtype)
{
runtime::BufferRange<T> tensorRange(const_cast<runtime::ITensor&>(mTensor));
for (auto& item : tensorRange)
{
item = vtype == 0 ? 0 : vtype == 1 ? 1 : rand();
}
}
void randomize(void)
{
if (mTensor.getMemoryType() == runtime::MemoryType::kGPU)
{
runtime::ITensor& nonConstTensor = const_cast<runtime::ITensor&>(mTensor);
runtime::BufferManager manager{std::make_shared<runtime::CudaStream>()};
runtime::ITensor::SharedConstPtr cpuBuffer = manager.cpu(mTensor.getShape(), mTensor.getDataType());
DebugTensor(cpuBuffer, "cpuBuffer").randomize();
manager.copy(*cpuBuffer, nonConstTensor);
manager.getStream().synchronize();
}
else
{
switch (mTensor.getDataType())
{
case nvinfer1::DataType::kBOOL: return randomize<bool>(3);
case nvinfer1::DataType::kFLOAT: return randomize<float>(3);
case nvinfer1::DataType::kINT8: return randomize<std::int8_t>(3);
case nvinfer1::DataType::kINT32: return randomize<std::int32_t>(3);
case nvinfer1::DataType::kINT64: return randomize<std::int64_t>(3);
case nvinfer1::DataType::kUINT8: return randomize<std::uint8_t>(3);
default: return;
}
}
}
void setZeros(void)
{
switch (mTensor.getDataType())
{
case nvinfer1::DataType::kBOOL: return randomize<bool>(0);
case nvinfer1::DataType::kFLOAT: return randomize<float>(0);
case nvinfer1::DataType::kINT8: return randomize<std::int8_t>(0);
case nvinfer1::DataType::kINT32: return randomize<std::int32_t>(0);
case nvinfer1::DataType::kINT64: return randomize<std::int64_t>(0);
case nvinfer1::DataType::kUINT8: return randomize<std::uint8_t>(0);
default: return;
}
}
void setOnes(void)
{
switch (mTensor.getDataType())
{
case nvinfer1::DataType::kBOOL: return randomize<bool>(1);
case nvinfer1::DataType::kFLOAT: return randomize<float>(1);
case nvinfer1::DataType::kINT8: return randomize<std::int8_t>(1);
case nvinfer1::DataType::kINT32: return randomize<std::int32_t>(1);
case nvinfer1::DataType::kINT64: return randomize<std::int64_t>(1);
case nvinfer1::DataType::kUINT8: return randomize<std::uint8_t>(1);
default: return;
}
}
private:
runtime::ITensor const& mTensor;
std::string mName;
std::shared_ptr<runtime::BufferManager> mBufferManager;
std::shared_ptr<runtime::CudaStream> mStream;
};
#define D(x) tensorrt_llm::layers::DebugTensor(x, #x)
#define Db(x, bufferManager) tensorrt_llm::layers::DebugTensor(x, #x, bufferManager, nullptr)
#define Ds(x, stream) tensorrt_llm::layers::DebugTensor(x, #x, nullptr, stream)
#define PRINT_TOKENS(x) D(x).print_tokens()
#define PRINT_VALUES(x) D(x).print_values()
#define PRINT_SHAPE(x) D(x).print_shape()
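
For reference, a hedged usage sketch (not part of the commit) of the extended debug helpers; it assumes the TensorRT-LLM header defining `DebugTensor` and these macros is already included, and `logits` is just an illustrative name for a GPU-resident tensor.

```cpp
// Illustrative only (not part of this commit). Assumes the TensorRT-LLM header that
// defines tensorrt_llm::layers::DebugTensor and the D/Db/Ds macros is already included.
void debugLogits(tensorrt_llm::runtime::ITensor::SharedConstPtr const& logits,
    std::shared_ptr<tensorrt_llm::runtime::BufferManager> const& bufferManager)
{
    PRINT_SHAPE(logits);                      // expands to D(logits).print_shape()
    Db(logits, bufferManager).print_values(); // reuse an existing BufferManager for the device-to-host copy
    D(logits).setZeros();                     // overwrite the tensor in place (setOnes()/randomize() work the same way)
}
```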

View File

@ -317,13 +317,13 @@ PYBIND11_MODULE(TRTLLM_PYBIND_MODULE, m)
py::overload_cast<tr::WorldConfig const&>(&tr::GptJsonConfig::engineFilename, py::const_),
py::arg("world_config"));
py::enum_<tb::LlmRequestState_t>(m, "LlmRequestState")
.value("REQUEST_STATE_UNKNOWN", tb::LlmRequestState_t::REQUEST_STATE_UNKNOWN)
.value("REQUEST_STATE_ENCODER_INIT", tb::LlmRequestState_t::REQUEST_STATE_ENCODER_INIT)
.value("REQUEST_STATE_CONTEXT_INIT", tb::LlmRequestState_t::REQUEST_STATE_CONTEXT_INIT)
.value("REQUEST_STATE_GENERATION_IN_PROGRESS", tb::LlmRequestState_t::REQUEST_STATE_GENERATION_IN_PROGRESS)
.value("REQUEST_STATE_GENERATION_TO_COMPLETE", tb::LlmRequestState_t::REQUEST_STATE_GENERATION_TO_COMPLETE)
.value("REQUEST_STATE_GENERATION_COMPLETE", tb::LlmRequestState_t::REQUEST_STATE_GENERATION_COMPLETE);
py::enum_<tb::LlmRequestState>(m, "LlmRequestState")
.value("UNKNOWN", tb::LlmRequestState::kUNKNOWN)
.value("ENCODER_INIT", tb::LlmRequestState::kENCODER_INIT)
.value("CONTEXT_INIT", tb::LlmRequestState::kCONTEXT_INIT)
.value("GENERATION_IN_PROGRESS", tb::LlmRequestState::kGENERATION_IN_PROGRESS)
.value("GENERATION_TO_COMPLETE", tb::LlmRequestState::kGENERATION_TO_COMPLETE)
.value("GENERATION_COMPLETE", tb::LlmRequestState::kGENERATION_COMPLETE);
tpb::NamedTensor::initBindings(m);
tpb::LlmRequest::initBindings(m);

View File

@ -54,7 +54,7 @@ LookaheadRuntimeBuffers::LookaheadRuntimeBuffers(SizeType32 maxBatchSize, SizeTy
packedMaskHost = manager.cpu(packedMasksDevice->getShape(), nvinfer1::DataType::kINT32);
positionOffsetsHost = manager.cpu(positionOffsetsDevice->getShape(), nvinfer1::DataType::kINT32);
generationLengthsHost = manager.cpu(generationLengthsDevice->getShape(), nvinfer1::DataType::kINT32);
positionIdsHost = manager.gpu(positionOffsetsDevice->getShape(), nvinfer1::DataType::kINT32);
positionIdsHost = manager.cpu(positionIdsDevice->getShape(), nvinfer1::DataType::kINT32);
packedMaskHostCopy = manager.cpu(packedMasksDevice->getShape(), nvinfer1::DataType::kINT32);
positionOffsetsHostCopy = manager.cpu(positionOffsetsDevice->getShape(), nvinfer1::DataType::kINT32);

View File

@ -96,7 +96,7 @@ void MedusaModule::initMedusaTensorsFromChoices(MedusaChoices const& choices, st
if (curDepth != depth)
{
TLLM_CHECK(depth + 1 == curDepth);
TLLM_CHECK_WITH_INFO(depth <= getMaxDraftPathLen(),
TLLM_CHECK_WITH_INFO(curDepth <= getMaxDraftPathLen(),
"Medusa choices require more Medusa heads than the engine was built with.");
// Save TopK
topKs[depth - 1] = maxTopK;

View File

@ -45,12 +45,20 @@ set(TOP_LEVEL_DIR "${PROJECT_SOURCE_DIR}/..")
add_custom_target(google-tests)
set(CASE_REPORT_WRAPPER
${CMAKE_CURRENT_SOURCE_DIR}/resources/scripts/case_report_wrapper.py)
function(add_gtest test_name test_src)
set(options NO_GTEST_MAIN NO_TLLM_LINKAGE)
cmake_parse_arguments(ARGS "${options}" "${oneValueArgs}" "${multiValueArgs}"
${ARGN})
add_executable(${test_name} ${test_src})
if($ENV{LLM_MEMORY_PROFILING})
set_property(TARGET ${test_name} PROPERTY TEST_LAUNCHER
${CASE_REPORT_WRAPPER})
endif()
target_link_libraries(${test_name} PUBLIC gmock_main nvonnxparser)
if(NOT ARGS_NO_GTEST_MAIN)
target_link_libraries(${test_name} PUBLIC gtest_main)

View File

@ -1087,11 +1087,37 @@ protected:
void BasicPermuteTest(int k = 1, int64_t hidden_size = DEFAULT_HIDDEN_SIZE, int64_t num_experts = 4);
std::vector<int> calcPermuteMapExpertParallel(std::vector<int> const& expected_experts);
void ExpertParallelTest(int k = 1);
void TensorParallelTest(int k = 1);
void ExpertParallelTest(int k = 1, int64_t hidden_size = DEFAULT_HIDDEN_SIZE, int64_t num_experts = 4)
{
// 2 experts per rank
ParallelelismTest(k, 1, num_experts / 2, hidden_size, num_experts);
// 1 expert per rank
ParallelelismTest(k, 1, num_experts, hidden_size, num_experts);
}
void MixedParallelTest(int k = 1);
void TensorParallelTest(int k = 1, int64_t hidden_size = DEFAULT_HIDDEN_SIZE, int64_t num_experts = 4)
{
ParallelelismTest(k, 2, 1, hidden_size, num_experts);
ParallelelismTest(k, 4, 1, hidden_size, num_experts);
ParallelelismTest(k, 8, 1, hidden_size, num_experts);
}
void MixedParallelTest(int k = 1, int64_t hidden_size = DEFAULT_HIDDEN_SIZE, int64_t num_experts = 4)
{
// 2 experts per rank
ParallelelismTest(k, 2, num_experts / 2, hidden_size, num_experts);
ParallelelismTest(k, 4, num_experts / 2, hidden_size, num_experts);
ParallelelismTest(k, 8, num_experts / 2, hidden_size, num_experts);
// 1 expert per rank
ParallelelismTest(k, 2, num_experts, hidden_size, num_experts);
ParallelelismTest(k, 4, num_experts, hidden_size, num_experts);
ParallelelismTest(k, 8, num_experts, hidden_size, num_experts);
}
void ParallelelismTest(int k = 1, int tp_size = 4, int ep_size = 2, int64_t hidden_size = DEFAULT_HIDDEN_SIZE,
int64_t num_experts = 4);
};
template <class WeightParams>
@ -1276,6 +1302,7 @@ TYPED_TEST(MixtureOfExpertsTest, PermuteMixtral8x7b)
{
this->mUseBias = false;
this->mActType = tensorrt_llm::ActivationType::Swiglu;
this->mNormMode = tensorrt_llm::kernels::MOEExpertScaleNormalizationMode::RENORMALIZE;
this->BasicPermuteTest(2, 4096, 8);
}
@ -1299,7 +1326,8 @@ std::vector<int> MixtureOfExpertsTest<TypeParam_>::calcPermuteMapExpertParallel(
}
template <class TypeParam_>
void MixtureOfExpertsTest<TypeParam_>::ExpertParallelTest(int k)
void MixtureOfExpertsTest<TypeParam_>::ParallelelismTest(
int k, int tp_size, int ep_size, int64_t hidden_size, int64_t num_experts)
{
if (FP8)
{
@ -1307,15 +1335,20 @@ void MixtureOfExpertsTest<TypeParam_>::ExpertParallelTest(int k)
mUseBias = false;
}
ASSERT_LE(ep_size, num_experts);
if (tp_size == 1)
{
// Only the first 4 experts are ever used. They should be split across at least 2 ranks
ASSERT_LT(num_experts / ep_size, 4)
<< "Expert parallelism must have less than 4 experts per rank or the test is ineffective";
}
auto test_archs = getAllTileConfigsToTest();
for (auto [gemm1, gemm2] : test_archs)
{
mInternalSelectedConfig1 = gemm1;
mInternalSelectedConfig2 = gemm2;
int64_t hidden_size = DEFAULT_HIDDEN_SIZE;
int parallelism = 2;
int64_t num_experts = 4;
int64_t num_tokens = 3;
std::vector<DataType> hidden_states(hidden_size * num_tokens);
@ -1327,122 +1360,9 @@ void MixtureOfExpertsTest<TypeParam_>::ExpertParallelTest(int k)
0.25, 0.21, 0.35, 0.19, //
};
std::vector<int> expected_experts{0, 3, 2};
if (k == 2)
expected_experts = {0, 2, 3, 1, 2, 0};
else if (k == 3)
expected_experts = {0, 2, 3, 3, 1, 2, 2, 0, 1};
std::vector<OutputType> results(hidden_states.size(), 0);
for (int i = 0; i < parallelism; i++)
{
if (i == 0)
{
// Only need to init the inputs on the first iteration
runMoEPermute({hidden_states}, {probs}, hidden_size, num_experts, k, {},
MOEParallelismConfig{1, 0, parallelism, i});
}
else
{
runMoEPermute(MOEParallelismConfig{1, 0, parallelism, i});
}
auto selected_expert = getDataFromDevice(mSelectedExpert, num_tokens * k);
// Experts should only be selected when we are on the right node
// Note the index is [0,num_experts_per_node), so we offset the experts by the start for this node
int const start_expert = i * (mNumExperts / parallelism);
std::transform(selected_expert.begin(), selected_expert.end(), selected_expert.begin(),
[&](int val) { return val >= mNumExperts ? val : val + start_expert; });
auto masked_expected_experts = maskSelectedExpertsForTP(expected_experts, parallelism, i);
ASSERT_EQ(selected_expert, masked_expected_experts);
auto proj_map = getDataFromDevice(mSourceToExpandedMap, num_tokens * k);
auto permute_map = calcPermuteMapExpertParallel(masked_expected_experts);
ASSERT_EQ(permute_map, proj_map) << "Iteration " << i;
compareSoftmax(expected_experts, probs);
// Do the final reduce
auto iter_results = getDataFromDevice(mFinalOutput, num_tokens * hidden_size);
std::transform(
iter_results.cbegin(), iter_results.cend(), results.cbegin(), results.begin(), std::plus<>{});
}
compareFinal(expected_experts, probs, raw_unquant_input, results);
}
}
TYPED_TEST(MixtureOfExpertsTest, ExpertParallel)
{
this->ExpertParallelTest();
}
TYPED_TEST(MixtureOfExpertsTest, ExpertParallelK2)
{
this->ExpertParallelTest(2);
}
TYPED_TEST(MixtureOfExpertsTest, ExpertParallelNoBias)
{
this->mUseBias = false;
this->ExpertParallelTest();
this->ExpertParallelTest(2);
}
TYPED_TEST(MixtureOfExpertsTest, ExpertParallelRenorm)
{
this->mNormMode = MOEExpertScaleNormalizationMode::RENORMALIZE;
this->ExpertParallelTest();
this->ExpertParallelTest(2);
}
TYPED_TEST(MixtureOfExpertsTest, ExpertParallelSparseMixer)
{
this->mNormMode = MOEExpertScaleNormalizationMode::SPARSE_MIXER;
this->ExpertParallelTest();
this->ExpertParallelTest(2);
}
TYPED_TEST(MixtureOfExpertsTest, ExpertParallelGeglu)
{
this->mActType = tensorrt_llm::ActivationType::Geglu;
this->ExpertParallelTest();
this->ExpertParallelTest(2);
}
TYPED_TEST(MixtureOfExpertsTest, ExpertParallelSwiglu)
{
this->mActType = tensorrt_llm::ActivationType::Swiglu;
this->ExpertParallelTest();
this->ExpertParallelTest(2);
}
template <class TypeParam_>
void MixtureOfExpertsTest<TypeParam_>::TensorParallelTest(int k)
{
if (FP8)
{
// TODO Remove this when bias + FP8 is supported
mUseBias = false;
}
auto test_archs = getAllTileConfigsToTest();
for (auto [gemm1, gemm2] : test_archs)
{
mInternalSelectedConfig1 = gemm1;
mInternalSelectedConfig2 = gemm2;
int64_t hidden_size = DEFAULT_HIDDEN_SIZE;
int parallelism = 8;
int64_t num_experts = 4;
int64_t num_tokens = 3;
std::vector<DataType> hidden_states(hidden_size * num_tokens);
auto raw_unquant_input = populateTokens(hidden_states);
std::vector<float> probs = {
0.5, 0.1, 0.25, 0.15, //
0.03, 0.2, 0.07, 0.7, //
0.25, 0.21, 0.35, 0.19, //
};
std::vector<std::vector<DataType>> hidden_input = {hidden_states};
std::vector<std::vector<float>> router_input = {probs};
resizeRouterInputs(router_input, num_experts, num_tokens);
std::vector<int> expected_experts{0, 3, 2};
if (k == 2)
@ -1450,159 +1370,34 @@ void MixtureOfExpertsTest<TypeParam_>::TensorParallelTest(int k)
else if (k == 3)
expected_experts = {0, 2, 3, 3, 1, 2, 2, 0, 1};
std::vector<OutputType> results(hidden_states.size(), 0);
for (int i = 0; i < parallelism; i++)
for (int i = 0; i < tp_size; i++)
{
if (i == 0)
{
// Only need to init the inputs on the first iteration
runMoEPermute({hidden_states}, {probs}, hidden_size, num_experts, k, {},
MOEParallelismConfig{parallelism, i, 1, 0});
}
else
{
runMoEPermute(MOEParallelismConfig{parallelism, i, 1, 0});
}
auto selected_expert = getDataFromDevice(mSelectedExpert, num_tokens * k);
EXPECT_EQ(selected_expert, expected_experts);
auto proj_map = getDataFromDevice(mSourceToExpandedMap, num_tokens * k);
std::vector<int> permute_map{0, 2, 1};
if (k == 2)
permute_map = {0, 5, 4, 3, 2, 1};
if (k == 3)
permute_map = {0, 8, 6, 4, 2, 1, 7, 5, 3};
ASSERT_EQ(permute_map, proj_map) << "Iteration " << i;
// Do the final reduce
auto iter_results = getDataFromDevice(mFinalOutput, num_tokens * hidden_size);
std::transform(
iter_results.cbegin(), iter_results.cend(), results.cbegin(), results.begin(), std::plus<>{});
}
compareFinal(expected_experts, probs, raw_unquant_input, results);
}
}
TYPED_TEST(MixtureOfExpertsTest, TensorParallel)
{
this->TensorParallelTest();
}
TYPED_TEST(MixtureOfExpertsTest, TensorParallelK2)
{
this->TensorParallelTest(2);
}
TYPED_TEST(MixtureOfExpertsTest, TensorParallelK3)
{
this->TensorParallelTest(3);
}
TYPED_TEST(MixtureOfExpertsTest, TensorParallelNoBias)
{
this->mUseBias = false;
this->TensorParallelTest();
this->TensorParallelTest(2);
this->TensorParallelTest(3);
}
TYPED_TEST(MixtureOfExpertsTest, TensorParallelRenorm)
{
this->mNormMode = MOEExpertScaleNormalizationMode::RENORMALIZE;
this->TensorParallelTest();
this->TensorParallelTest(2);
this->TensorParallelTest(3);
}
TYPED_TEST(MixtureOfExpertsTest, TensorParallelSparseMixer)
{
this->mNormMode = MOEExpertScaleNormalizationMode::SPARSE_MIXER;
this->TensorParallelTest();
this->TensorParallelTest(2);
}
TYPED_TEST(MixtureOfExpertsTest, TensorParallelGeglu)
{
this->mActType = tensorrt_llm::ActivationType::Geglu;
this->TensorParallelTest();
this->TensorParallelTest(2);
this->TensorParallelTest(3);
}
TYPED_TEST(MixtureOfExpertsTest, TensorParallelSwiglu)
{
this->mActType = tensorrt_llm::ActivationType::Swiglu;
this->TensorParallelTest();
this->TensorParallelTest(2);
this->TensorParallelTest(3);
}
template <class TypeParam_>
void MixtureOfExpertsTest<TypeParam_>::MixedParallelTest(int k)
{
if (FP8)
{
// TODO Remove this when bias + FP8 is supported
mUseBias = false;
}
auto test_archs = getAllTileConfigsToTest();
for (auto [gemm1, gemm2] : test_archs)
{
mInternalSelectedConfig1 = gemm1;
mInternalSelectedConfig2 = gemm2;
int64_t hidden_size = DEFAULT_HIDDEN_SIZE;
int tp_parallelism = 2;
int ep_parallelism = 2;
int64_t num_experts = 4;
int64_t num_tokens = 3;
std::vector<DataType> hidden_states(hidden_size * num_tokens);
auto raw_unquant_input = populateTokens(hidden_states);
std::vector<float> probs = {
0.5, 0.1, 0.25, 0.15, //
0.03, 0.2, 0.07, 0.7, //
0.25, 0.21, 0.35, 0.19, //
};
std::vector<int> expected_experts{0, 3, 2};
if (k == 2)
expected_experts = {0, 2, 3, 1, 2, 0};
else if (k == 3)
expected_experts = {0, 2, 3, 3, 1, 2, 2, 0, 1};
std::vector<OutputType> results(hidden_states.size(), 0);
for (int i = 0; i < tp_parallelism; i++)
{
for (int j = 0; j < ep_parallelism; j++)
for (int j = 0; j < ep_size; j++)
{
if (i == 0 && j == 0)
{
// Only need to init the inputs on the first iteration
runMoEPermute({hidden_states}, {probs}, hidden_size, num_experts, k, {},
MOEParallelismConfig{tp_parallelism, i, ep_parallelism, j});
runMoEPermute(hidden_input, router_input, hidden_size, num_experts, k, {},
MOEParallelismConfig{tp_size, i, ep_size, j});
}
else
{
runMoEPermute(MOEParallelismConfig{tp_parallelism, i, ep_parallelism, j});
runMoEPermute(MOEParallelismConfig{tp_size, i, ep_size, j});
}
auto selected_expert = getDataFromDevice(mSelectedExpert, num_tokens * k);
// Experts should only be selected when we are on the right node
// Note the index is [0,num_experts_per_node), so we offset the experts by the start for this node
int const start_expert = j * (mNumExperts / ep_parallelism);
int const start_expert = j * (mNumExperts / ep_size);
std::transform(selected_expert.begin(), selected_expert.end(), selected_expert.begin(),
[&](int val) { return val >= mNumExperts ? val : val + start_expert; });
auto masked_expected_experts = maskSelectedExpertsForTP(expected_experts, ep_parallelism, j);
auto masked_expected_experts = maskSelectedExpertsForTP(expected_experts, ep_size, j);
ASSERT_EQ(selected_expert, masked_expected_experts);
auto proj_map = getDataFromDevice(mSourceToExpandedMap, num_tokens * k);
auto permute_map = calcPermuteMapExpertParallel(masked_expected_experts);
ASSERT_EQ(permute_map, proj_map) << "Iteration " << i << " " << j;
compareSoftmax(expected_experts, probs);
compareSoftmax(expected_experts, router_input[0]);
// Do the final reduce
auto iter_results = getDataFromDevice(mFinalOutput, num_tokens * hidden_size);
@ -1611,54 +1406,76 @@ void MixtureOfExpertsTest<TypeParam_>::MixedParallelTest(int k)
}
}
compareFinal(expected_experts, probs, raw_unquant_input, results);
compareFinal(expected_experts, router_input[0], raw_unquant_input, results);
}
}
TYPED_TEST(MixtureOfExpertsTest, MixedParallel)
{
this->MixedParallelTest();
}
#define PARALLEL_TEST_SUITE(ParallelismType) \
TYPED_TEST(MixtureOfExpertsTest, ParallelismType) \
{ \
this->ParallelismType##Test(); \
} \
\
TYPED_TEST(MixtureOfExpertsTest, ParallelismType##K2) \
{ \
this->ParallelismType##Test(2); \
} \
\
TYPED_TEST(MixtureOfExpertsTest, ParallelismType##K3) \
{ \
this->ParallelismType##Test(3); \
} \
\
TYPED_TEST(MixtureOfExpertsTest, ParallelismType##NoBias) \
{ \
this->mUseBias = false; \
this->ParallelismType##Test(); \
this->ParallelismType##Test(2); \
this->ParallelismType##Test(3); \
} \
\
TYPED_TEST(MixtureOfExpertsTest, ParallelismType##Renorm) \
{ \
this->mNormMode = MOEExpertScaleNormalizationMode::RENORMALIZE; \
this->ParallelismType##Test(); \
this->ParallelismType##Test(2); \
this->ParallelismType##Test(3); \
} \
TYPED_TEST(MixtureOfExpertsTest, ParallelismType##SparseMixer) \
{ \
this->mNormMode = MOEExpertScaleNormalizationMode::SPARSE_MIXER; \
this->ParallelismType##Test(); \
this->ParallelismType##Test(2); \
/* k=3 is not supported for sparse mixer tests */ \
} \
\
TYPED_TEST(MixtureOfExpertsTest, ParallelismType##Geglu) \
{ \
this->mActType = tensorrt_llm::ActivationType::Geglu; \
this->ParallelismType##Test(); \
this->ParallelismType##Test(2); \
this->ParallelismType##Test(3); \
} \
\
TYPED_TEST(MixtureOfExpertsTest, ParallelismType##Swiglu) \
{ \
this->mActType = tensorrt_llm::ActivationType::Swiglu; \
this->ParallelismType##Test(); \
this->ParallelismType##Test(2); \
this->ParallelismType##Test(3); \
} \
\
TYPED_TEST(MixtureOfExpertsTest, ParallelismType##Mixtral8x7b) \
{ \
this->mUseBias = false; \
this->mActType = tensorrt_llm::ActivationType::Swiglu; \
this->mNormMode = tensorrt_llm::kernels::MOEExpertScaleNormalizationMode::RENORMALIZE; \
this->ParallelismType##Test(2, 4096, 8); \
}
TYPED_TEST(MixtureOfExpertsTest, MixedParallelK2)
{
this->MixedParallelTest(2);
}
TYPED_TEST(MixtureOfExpertsTest, MixedParallelNoBias)
{
this->mUseBias = false;
this->MixedParallelTest();
this->MixedParallelTest(2);
}
TYPED_TEST(MixtureOfExpertsTest, MixedParallelRenorm)
{
this->mNormMode = MOEExpertScaleNormalizationMode::RENORMALIZE;
this->MixedParallelTest();
this->MixedParallelTest(2);
}
TYPED_TEST(MixtureOfExpertsTest, MixedParallelSparseMixer)
{
this->mNormMode = MOEExpertScaleNormalizationMode::SPARSE_MIXER;
this->MixedParallelTest();
this->MixedParallelTest(2);
}
TYPED_TEST(MixtureOfExpertsTest, MixedParallelGeglu)
{
this->mActType = tensorrt_llm::ActivationType::Geglu;
this->MixedParallelTest();
this->MixedParallelTest(2);
}
TYPED_TEST(MixtureOfExpertsTest, MixedParallelSwiglu)
{
this->mActType = tensorrt_llm::ActivationType::Swiglu;
this->MixedParallelTest();
this->MixedParallelTest(2);
}
PARALLEL_TEST_SUITE(ExpertParallel)
PARALLEL_TEST_SUITE(TensorParallel)
PARALLEL_TEST_SUITE(MixedParallel)
TYPED_TEST(MixtureOfExpertsTest, ConfigSweep)
{

View File

@ -0,0 +1,42 @@
#!/usr/bin/env python3
# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import subprocess
import sys
import time
if __name__ == '__main__':
case = ''
for arg in sys.argv[1:]:
if '--gtest_filter=' in arg:
case = arg.removeprefix('--gtest_filter=')
gtest = subprocess.Popen(sys.argv[1:])
if case:
import multiprocessing.connection
with multiprocessing.connection.Client("/tmp/profiling_scribe.unix",
"AF_UNIX") as client:
client.send({
"type": "gtest_case",
"timestamp": time.time(),
"case": case,
"pid": gtest.pid
})
gtest.wait()
exit(gtest.returncode)

View File

@ -16,13 +16,16 @@
import argparse as _arg
import copy
import functools
import glob
import logging as _log
import os as _os
import pathlib as _pl
import platform
import signal
import subprocess as _sp
import sys as _sys
import time as _time
import typing as _tp
build_script_dir = _pl.Path(
@ -556,6 +559,31 @@ def build_tests(build_dir: _pl.Path):
run_command(make_google_tests, cwd=build_dir, timeout=300)
def with_memory_monitor(func):
if not _os.environ.get('LLM_MEMORY_PROFILING', False):
return func
@functools.wraps(func)
def wrapper(*args, **kwargs):
memory_collector = _sp.Popen([
"/usr/bin/python3",
find_root_dir() /
"tests/llm-test-defs/turtle/defs/memory_collector.py",
"-p",
str(_os.getpid()),
"-i",
"0.2",
])
try:
func(*args, **kwargs)
finally:
memory_collector.send_signal(signal.SIGINT)
memory_collector.wait()
return wrapper
@with_memory_monitor
def run_unit_tests(build_dir: _pl.Path, timeout=1800):
build_tests(build_dir=build_dir)
@ -579,6 +607,7 @@ def run_unit_tests(build_dir: _pl.Path, timeout=1800):
parallel_run_ctest(ctest, cwd=build_dir, env=cpp_env, timeout=timeout)
@with_memory_monitor
def run_single_gpu_tests(build_dir: _pl.Path,
run_gpt,
run_gptj,
@ -646,6 +675,7 @@ def produce_mpirun_command(*, global_commands, nranks, local_commands,
return l[:-1]
@with_memory_monitor
def run_multi_gpu_tests(build_dir: _pl.Path, timeout=1500):
build_tests(build_dir=build_dir)
@ -1068,4 +1098,24 @@ if __name__ == "__main__":
del test_args.run_all_models
run_tests(**vars(test_args))
do_memory_profiling = _os.environ.get('LLM_MEMORY_PROFILING', False)
if do_memory_profiling:
unix_socket = "/tmp/profiling_scribe.unix"
scribe = _sp.Popen([
"/usr/bin/python3",
find_root_dir() /
"tests/llm-test-defs/turtle/defs/profiling_scribe.py", "-l",
unix_socket
])
while not _os.path.exists(unix_socket):
_time.sleep(0.1)
try:
run_tests(**vars(test_args))
finally:
if do_memory_profiling:
scribe.send_signal(signal.SIGINT)
scribe.wait(timeout=10)
scribe.kill()

View File

@ -1,3 +1,5 @@
(kv-cache-reuse)=
# KV cache reuse
This document describes how kv cache pages can be shared and reused by requests that start with the same prompt. This can greatly lower first token latency, the time it takes before the first output token is generated. Many use cases can benefit from this, including multi-turn requests and system prompts.
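
As a quick illustration (not from the original document), block reuse is switched on through the runtime's KV cache configuration. The sketch below uses a stand-in `KvCacheConfig` struct because the exact TensorRT-LLM type is not shown here; only the `enableBlockReuse` flag is taken from these docs.

```cpp
// Hedged sketch: KvCacheConfig below is a stand-in for the real TensorRT-LLM configuration type;
// only the enableBlockReuse flag comes from this documentation.
struct KvCacheConfig
{
    bool enableBlockReuse{false};
};

KvCacheConfig makeReuseFriendlyConfig()
{
    KvCacheConfig config;
    // Allow requests that start with the same prompt to share previously computed kv cache pages,
    // which lowers time-to-first-token for multi-turn requests and shared system prompts.
    config.enableBlockReuse = true;
    return config;
}
```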

View File

@ -1,3 +1,5 @@
(speculative-decoding)=
# Speculative Sampling
Speculative Sampling (also referred to as Speculative Decoding) is a set of techniques designed to allow generation of more than one token per forward pass iteration. This can lead to a reduction in the average per-token latency **in situations where the GPU
@ -30,7 +32,7 @@ may prove simpler than generating a summary for an article.
Furthermore, when integrating Medusa with a standard PyTorch model implementation which may not be as finely
tuned as TensorRT-LLM, the potential time savings are more pronounced.
# Draft Model Approach
## Draft Model Approach
The Draft model approach involves the use of two distinct models trained independently
but sharing the same vocabulary: a smaller Draft model and a larger Target model.
@ -58,7 +60,7 @@ it is advisable to enable KV cache reuse for both models.
This can be achieved by adding the `--use_paged_context_fmha=enable` flag to the `trtllm-build` command
and setting `enableBlockReuse=true` in the `KVCacheConfig`.
## Using Draft model approach with Triton Inference Server
### Using Draft model approach with Triton Inference Server
+ The Draft model approach has been supported since TensorRT-LLM-0.7.0 (using two separate Triton servers to host the draft and target models respectively) and was significantly optimized in TensorRT-LLM-0.10.0 (using one Triton server with [Business Logic Scripting](https://github.com/triton-inference-server/python_backend?tab=readme-ov-file#business-logic-scripting), BLS).
+ The source file of Draft model with BLS can be found [here](https://github.com/triton-inference-server/tensorrtllm_backend/blob/main/all_models/inflight_batcher_llm/tensorrt_llm_bls/1/lib/decode.py).
@ -218,7 +220,7 @@ and setting `enableBlockReuse=true` in the `KVCacheConfig`.
pkill -9 -f tritonserver
```
# Medusa
## Medusa
This approach leverages a single model to both generate and verify draft tokens.
It enhances the existing model by adding multiple extra language model heads, known as Medusa heads.
@ -249,7 +251,7 @@ In the TensorRT-LLM implementation of Medusa, the configuration of the tree is a
This flexibility allows you to experiment and identify the optimal tree structure for your use case,
which can then be utilized in a production environment.
## Medusa Tree
### Medusa Tree
Consider the following diagram, which illustrates how the hidden states from the last layer of the base model
are passed to the base model's language model (LM) head and to four Medusa heads (MHs).
@ -294,11 +296,11 @@ So, only `9` candidates are specified.
**Specifying paths-only instead of all choices is currently supported only in the Python runtime.**
## Using Medusa with TensorRT-LLM
### Using Medusa with TensorRT-LLM
For guidance on constructing and executing Medusa with the Python runtime, consult the [Medusa README](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/medusa/README.md). When utilizing Inflight Fused Batching (IFB) with the C++ API, it is necessary to define `medusa_choices` explicitly within the model configuration. For detailed instructions, refer to the [model configuration in TensorRT-LLM backend](https://github.com/triton-inference-server/tensorrtllm_backend?tab=readme-ov-file#modify-the-model-configuration).
### Limitations
#### Limitations
- TensorRT-LLM supports Medusa only for Vicuna (fine-tuned LLaMA).
However, similar to any new model, you can follow the same approach to define your own Medusa model and deploy it with TensorRT-LLM.
@ -306,7 +308,7 @@ However, similar to any new model, you can follow the same approach to define yo
- Beam search is **not** compatible with Medusa.
# ReDrafter
## ReDrafter
This approach enhances the single-model Medusa method by predicting and verifying tokens using the same model. However, unlike Medusa, it predicts draft tokens using a recurrent predictor, where each draft token depends on the previous one. This method also allows the use of beam search to identify more prominent draft tokens. For more details, please read [the ReDrafter paper](https://arxiv.org/html/2403.09919v1).

View File

@ -205,7 +205,7 @@ void invokeQuantization(...) {
```
For more details on how TensorRT-LLM implements the GPT Attention operator, see
the [Multi-head, Multi-query and Group-query Attention](gpt_attention.md) document.
the [Multi-head, Multi-query and Group-query Attention](../advanced/gpt-attention.md) document.
# Runtime
@ -214,7 +214,7 @@ the runtime components is to load the TensorRT engines and drive their
execution. Typically, for an auto-regressive model like GPT, the runtime is in
charge of loading the engine that implements both the processing of the input
sequence as well as the body of the generation loop. See the [GPT C++
Runtime](gpt_runtime.md) document for details on the C++ Runtime.
Runtime](../advanced/gpt-runtime.md) document for details on the C++ Runtime.
(multi-gpu-multi-node)=

View File

@ -96,11 +96,14 @@ Welcome to TensorRT-LLM's Documentation!
advanced/gpt-attention.md
advanced/gpt-runtime.md
advanced/executor.md
advanced/graph-rewriting.md
advanced/batch-manager.md
advanced/inference-request.md
advanced/lora.md
advanced/expert-parallelism.md
advanced/kv-cache-reuse.md
advanced/speculative-decoding.md
.. toctree::
:maxdepth: 2

View File

@ -377,11 +377,11 @@ All published functionality in the Release Notes has been fully tested and verif
### Key Features and Enhancements
- Chunked context support (see docs/source/gpt_attention.md#chunked-context)
- Chunked context support (see docs/source/advanced/gpt-attention.md#chunked-context)
- LoRA support for C++ runtime (see docs/source/lora.md)
- Medusa decoding support (see examples/medusa/README.md)
- The support is limited to Python runtime for Ampere or newer GPUs with fp16 and bf16 accuracy, and the `temperature` parameter of sampling configuration should be 0
- StreamingLLM support for LLaMA (see docs/source/gpt_attention.md#streamingllm)
- StreamingLLM support for LLaMA (see docs/source/advanced/gpt-attention.md#streamingllm)
- Support for batch manager to return logits from context and/or generation phases
- Include support in the Triton backend
- Support AWQ and GPTQ for QWEN

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.14.0.dev2024091700
tensorrt_llm==0.14.0.dev2024092400
datasets~=2.15.0
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.14.0.dev2024091700
tensorrt_llm==0.14.0.dev2024092400
datasets~=2.14.5
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.14.0.dev2024091700
tensorrt_llm==0.14.0.dev2024092400
datasets~=2.14.5
evaluate~=0.4.1
protobuf

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.14.0.dev2024091700
tensorrt_llm==0.14.0.dev2024092400
datasets~=2.14.5
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -0,0 +1,77 @@
# Deepseek-v1
This document shows how to build and run the [deepseek-v1](https://arxiv.org/pdf/2401.06066) model in TensorRT-LLM.
- [Deepseek-v1](#deepseek-v1)
- [Prerequisite](#prerequisite)
- [Hardware](#hardware)
- [Overview](#overview)
- [Support Matrix](#support-matrix)
- [Usage](#usage)
- [Build TensorRT engine(s)](#build-tensorrt-engines)
## Prerequisite
First, please download Deepseek-v1 weights from HF https://huggingface.co/deepseek-ai/deepseek-moe-16b-base.
```bash
git lfs install
git clone https://huggingface.co/deepseek-ai/deepseek-moe-16b-base
```
## Hardware
The Deepseek-v1 model requires a single GPU with 80 GB of memory.
## Overview
The TensorRT-LLM Deepseek-v1 implementation can be found in [tensorrt_llm/models/deepseek_v1/model.py](../../tensorrt_llm/models/deepseek_v1/model.py). The TensorRT-LLM Deepseek-v1 example code is located in [`examples/deepseek_v1`](./). There is one main file:
* [`convert_checkpoint.py`](./convert_checkpoint.py) to convert the Deepseek-v1 model into tensorrt-llm checkpoint format.
In addition, there are three shared files in the parent folder [`examples`](../) that can be used for inference and evaluation:
* [`../run.py`](../run.py) to run model inference and generate output given an input text.
* [`../summarize.py`](../summarize.py) to summarize articles from the [cnn_dailymail](https://huggingface.co/datasets/cnn_dailymail) dataset; it can run summarization for both the HF model and the TensorRT-LLM model.
* [`../mmlu.py`](../mmlu.py) to run the scoring script from https://github.com/declare-lab/instruct-eval and compare the HF model and the TensorRT-LLM model on the MMLU dataset.
## Support Matrix
- [x] FP16
- [x] TENSOR PARALLEL
- [ ] FP8
## Usage
The TensorRT-LLM Deepseek-v1 example code is located at [examples/deepseek_v1](./). It takes PyTorch weights as input and builds the corresponding TensorRT engines. The number of TensorRT engines depends on the number of GPUs used to run inference.
### Build TensorRT engine(s)
The following steps show how to run Deepseek-v1 with TensorRT-LLM.
First, the HF checkpoint is converted to the TensorRT-LLM checkpoint format by applying [`convert_checkpoint.py`](./convert_checkpoint.py). After that, the TensorRT engine(s) can be built from the TensorRT-LLM checkpoint.
```bash
# Build the bfloat16 engine from Deepseek-v1 HF weights.
python convert_checkpoint.py --model_dir ./deepseek_moe_16b/ \
--output_dir ./trtllm_checkpoint_deepseek_v1_1gpu_bf16 \
--dtype bfloat16 \
--tp_size 1
trtllm-build --checkpoint_dir ./trtllm_checkpoint_deepseek_v1_1gpu_bf16 \
--output_dir ./trtllm_engines/deepseek_v1/bf16/tp1 \
--gpt_attention_plugin bfloat16 \
--gemm_plugin bfloat16 \
--moe_plugin bfloat16 \
```
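The commands above build a single-rank (single-GPU) engine. For multi-GPU inference, the sketch below shows how a 2-rank engine could be built and launched, assuming a checkpoint converted with `--tp_size 2` (see [`convert_checkpoint.py`](./convert_checkpoint.py)). The directory names are illustrative, and the `mpirun` launch mirrors other TensorRT-LLM multi-GPU examples rather than being specific to this one:
```bash
# Build one engine per rank from a 2-rank TensorRT-LLM checkpoint.
trtllm-build --checkpoint_dir ./trtllm_checkpoint_deepseek_v1_2gpu_bf16 \
             --output_dir ./trtllm_engines/deepseek_v1/bf16/tp2 \
             --gpt_attention_plugin bfloat16 \
             --gemm_plugin bfloat16 \
             --moe_plugin bfloat16

# Launch one process per GPU with MPI.
mpirun -n 2 --allow-run-as-root \
    python ../run.py --engine_dir ./trtllm_engines/deepseek_v1/bf16/tp2 \
                     --tokenizer_dir ./deepseek_moe_16b/ \
                     --max_output_len 32 \
                     --input_text "The president of the United States is person who"
```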
Then, test the engine with the [run.py](../run.py) script:
```bash
python ../run.py --engine_dir ./trtllm_engines/deepseek_v1/bf16/tp1 \
--tokenizer_dir ./deepseek_moe_16b/ \
--max_output_len 32 \
--top_p 0 \
--input_text "The president of the United States is person who"
```
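The shared [`../summarize.py`](../summarize.py) and [`../mmlu.py`](../mmlu.py) scripts mentioned in the overview can evaluate the same engine. A possible invocation is sketched below; the flags follow the pattern of other TensorRT-LLM examples and may need adjustment (for MMLU, the dataset must be downloaded first as described in the header of [`../mmlu.py`](../mmlu.py)):
```bash
# Summarization on cnn_dailymail, comparing the HF model and the TensorRT-LLM engine.
python ../summarize.py --engine_dir ./trtllm_engines/deepseek_v1/bf16/tp1 \
                       --hf_model_dir ./deepseek_moe_16b/ \
                       --data_type bf16 \
                       --test_hf \
                       --test_trt_llm

# MMLU scoring of the TensorRT-LLM engine.
python ../mmlu.py --engine_dir ./trtllm_engines/deepseek_v1/bf16/tp1 \
                  --hf_model_dir ./deepseek_moe_16b/ \
                  --test_trt_llm
```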
## Credits
This Deepseek-v1 model example exists thanks to the community contribution of [@akhoroshev](https://github.com/akhoroshev)!

View File

@ -0,0 +1,14 @@
# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

View File

@ -0,0 +1,215 @@
# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
import time
import traceback
from concurrent.futures import ThreadPoolExecutor, as_completed
import tensorrt_llm
from tensorrt_llm._utils import release_gc
from tensorrt_llm.layers import MoeConfig
from tensorrt_llm.mapping import Mapping
from tensorrt_llm.models import DeepseekForCausalLM
from tensorrt_llm.models.deepseek_v1.convert import load_hf_deepseek
def parse_arguments():
parser = argparse.ArgumentParser()
parser.add_argument('--model_dir', type=str, default=None, required=True)
parser.add_argument('--tp_size',
type=int,
default=1,
help='N-way tensor parallelism size')
parser.add_argument('--pp_size',
type=int,
default=1,
help='N-way pipeline parallelism size')
parser.add_argument(
'--moe_tp_size',
type=int,
default=-1,
help=
'N-way tensor parallelism size for MoE, default is tp_size, which will do tp-only for MoE'
)
parser.add_argument(
'--moe_ep_size',
type=int,
default=-1,
help=
'N-way expert parallelism size for MoE, default is 1, which will do tp-only for MoE'
)
parser.add_argument('--dtype',
type=str,
default='float16',
choices=['float32', 'bfloat16', 'float16'])
parser.add_argument(
'--use_parallel_embedding',
action="store_true",
default=False,
help=
'By default embedding parallelism is disabled. By setting this flag, embedding parallelism is enabled'
)
parser.add_argument(
'--embedding_sharding_dim',
type=int,
default=0,
choices=[0, 1],
help=
'By default the embedding lookup table is sharded along vocab dimension (embedding_sharding_dim=0)'
'To shard it along hidden dimension, set embedding_sharding_dim=1'
'Note: embedding sharing is only enabled when embedding_sharding_dim=0')
parser.add_argument(
'--use_embedding_sharing',
action="store_true",
default=False,
help=
'Try to reduce the engine size by sharing the embedding lookup table between two layers'
'Note: the flag might not take effect when the criteria are not met')
parser.add_argument('--output_dir',
type=str,
default='trtllm_checkpoint',
required=True,
help='The path to save the TensorRT-LLM checkpoint')
parser.add_argument(
'--workers',
type=int,
default=1,
help='The number of workers for converting checkpoint in parallel')
parser.add_argument(
'--moe_num_experts',
type=int,
default=0,
help='Specify the number of experts to use for MOE layers')
parser.add_argument(
'--moe_top_k',
type=int,
default=0,
help=
'Specify the top_k value to use for MOE layers. Default to 1 if --moe_num_experts is set'
)
parser.add_argument(
'--moe_renorm_mode',
type=int,
default=MoeConfig.ExpertScaleNormalizationMode.RENORMALIZE,
help=
'Controls renormalization after gate logits. Check layers/moe.py for accepted values'
)
parser.add_argument(
'--save_config_only',
action="store_true",
default=False,
help=
'Only save the model config w/o read and converting weights, be careful, this is for debug only'
)
parser.add_argument(
'--disable_weight_only_quant_plugin',
default=False,
action="store_true",
help=
'By default, using plugin implementation for weight quantization. Enabling disable_weight_only_quant_plugin flag will use ootb implementation instead of plugin.'
'You must also use --use_weight_only for that argument to have an impact'
)
# Add quantization related feature later
args = parser.parse_args()
return args
def args_to_build_options(args):
return {
'use_parallel_embedding': args.use_parallel_embedding,
'embedding_sharding_dim': args.embedding_sharding_dim,
'share_embedding_table': args.use_embedding_sharing,
'disable_weight_only_quant_plugin':
args.disable_weight_only_quant_plugin
}
def execute(workers, func, args):
if workers == 1:
for rank, f in enumerate(func):
f(args, rank)
else:
with ThreadPoolExecutor(max_workers=workers) as p:
futures = [p.submit(f, args, rank) for rank, f in enumerate(func)]
exceptions = []
for future in as_completed(futures):
try:
future.result()
except Exception as e:
traceback.print_exc()
exceptions.append(e)
assert len(
exceptions
) == 0, "Checkpoint conversion failed, please check error log."
def convert_and_save_hf(args):
model_dir = args.model_dir
world_size = args.tp_size * args.pp_size
# Need to convert the cli args to key-value pairs and override them in the generated config dict.
# Ideally these fields will be moved out of the config and pass them into build API, keep them here for compatibility purpose for now,
# before the refactor is done.
override_fields = {}
override_fields.update(args_to_build_options(args))
hf_model = load_hf_deepseek(model_dir)
def convert_and_save_rank(args, rank):
mapping = Mapping(world_size=world_size,
rank=rank,
tp_size=args.tp_size,
pp_size=args.pp_size,
moe_tp_size=args.moe_tp_size,
moe_ep_size=args.moe_ep_size)
deepseekv1 = DeepseekForCausalLM.from_hugging_face(
hf_model, args.model_dir, args.dtype, mapping, **override_fields)
deepseekv1.save_checkpoint(args.output_dir, save_config=(rank == 0))
del deepseekv1
execute(args.workers, [convert_and_save_rank] * world_size, args)
release_gc()
def main():
print(tensorrt_llm.__version__)
args = parse_arguments()
if (args.moe_tp_size == -1 and args.moe_ep_size == -1):
# moe default to tp-only
args.moe_tp_size = args.tp_size
args.moe_ep_size = 1
elif (args.moe_tp_size == -1):
args.moe_tp_size = args.tp_size // args.moe_ep_size
elif (args.moe_ep_size == -1):
args.moe_ep_size = args.tp_size // args.moe_tp_size
assert (args.moe_tp_size * args.moe_ep_size == args.tp_size
), "moe_tp_size * moe_ep_size must equal to tp_size"
tik = time.time()
if not os.path.exists(args.output_dir):
os.makedirs(args.output_dir)
assert args.model_dir is not None
convert_and_save_hf(args)
tok = time.time()
t = time.strftime('%H:%M:%S', time.gmtime(tok - tik))
print(f'Total time of converting checkpoints: {t}')
if __name__ == '__main__':
main()
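For context on the MoE parallelism flags resolved in `main()` above: when `--moe_tp_size` and `--moe_ep_size` are both omitted, the MoE layers default to tensor parallelism only; when only one is given, the other is derived so that `moe_tp_size * moe_ep_size == tp_size`. The sketch below illustrates both cases with hypothetical output directories:
```bash
# Tensor-parallel-only MoE: moe_tp_size defaults to tp_size (here 2) and moe_ep_size to 1.
python convert_checkpoint.py --model_dir ./deepseek_moe_16b/ \
                             --output_dir ./trtllm_checkpoint_deepseek_v1_2gpu_bf16 \
                             --dtype bfloat16 \
                             --tp_size 2

# Expert parallelism: with --moe_ep_size 2, moe_tp_size is derived as tp_size / moe_ep_size = 1,
# satisfying the assertion moe_tp_size * moe_ep_size == tp_size.
python convert_checkpoint.py --model_dir ./deepseek_moe_16b/ \
                             --output_dir ./trtllm_checkpoint_deepseek_v1_2gpu_bf16_ep2 \
                             --dtype bfloat16 \
                             --tp_size 2 \
                             --moe_ep_size 2
```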

View File

@ -0,0 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.11.0
datasets~=2.14.6
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.14.0.dev2024091700
tensorrt_llm==0.14.0.dev2024092400
transformers>=4.31.0
datasets~=2.14.5
evaluate~=0.4.1

View File

@ -3,7 +3,7 @@
# WAR the new posting of "nvidia-cudnn-cu12~=9.0".
# "jax[cuda12_pip]~=0.4.19" specifies "nvidia-cudnn-cu12>=8.9" but actually requires "nvidia-cudnn-cu12~=8.9".
nvidia-cudnn-cu12~=8.9; platform_machine == "x86_64"
tensorrt_llm==0.14.0.dev2024091700
tensorrt_llm==0.14.0.dev2024092400
flax~=0.8.0
# jax[cuda12_pip]~=0.4.19; platform_system != "Windows"
jax~=0.4.19; platform_system == "Windows"

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.14.0.dev2024091700
tensorrt_llm==0.14.0.dev2024092400
datasets~=2.14.5
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.14.0.dev2024091700
tensorrt_llm==0.14.0.dev2024092400
datasets~=2.14.5
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.14.0.dev2024091700
tensorrt_llm==0.14.0.dev2024092400
datasets~=2.14.5
rouge_score~=0.1.2
evaluate~=0.4.1

View File

@ -1,6 +1,6 @@
-f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.14.0.dev2024091700
tensorrt_llm==0.14.0.dev2024092400
datasets==2.14.6
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.14.0.dev2024091700
tensorrt_llm==0.14.0.dev2024092400
datasets==2.14.5
rouge_score~=0.1.2
sentencepiece~=0.1.99

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.14.0.dev2024091700
tensorrt_llm==0.14.0.dev2024092400
datasets~=2.14.5
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.14.0.dev2024091700
tensorrt_llm==0.14.0.dev2024092400
datasets==2.14.6
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -1,2 +1,2 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.14.0.dev2024091700
tensorrt_llm==0.14.0.dev2024092400

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.14.0.dev2024091700
tensorrt_llm==0.14.0.dev2024092400
transformers>=4.39.0
datasets~=2.14.5
evaluate

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.14.0.dev2024091700
tensorrt_llm==0.14.0.dev2024092400
datasets~=2.14.5
rouge_score~=0.1.2
sentencepiece~=0.1.99

View File

@ -1,4 +1,4 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.14.0.dev2024091700
tensorrt_llm==0.14.0.dev2024092400
transformers==4.38.2
accelerate==0.25.0

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.14.0.dev2024091700
tensorrt_llm==0.14.0.dev2024092400
datasets~=2.14.5
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.14.0.dev2024091700
tensorrt_llm==0.14.0.dev2024092400
nemo-toolkit[all]==2.0.0rc1
megatron-core==0.8.0
datasets~=2.14.5

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.14.0.dev2024091700
tensorrt_llm==0.14.0.dev2024092400
datasets~=2.14.5
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.14.0.dev2024091700
tensorrt_llm==0.14.0.dev2024092400
datasets~=2.14.5
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.14.0.dev2024091700
tensorrt_llm==0.14.0.dev2024092400
datasets>=2.14.4
nemo-toolkit[all]<=1.20.0,>=1.18.0
rouge_score~=0.1.2

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.14.0.dev2024091700
tensorrt_llm==0.14.0.dev2024092400
datasets~=2.16.0
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.14.0.dev2024091700
tensorrt_llm==0.14.0.dev2024092400
datasets~=2.16.0
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.14.0.dev2024091700
tensorrt_llm==0.14.0.dev2024092400
git+https://github.com/google-deepmind/recurrentgemma.git
flax>=0.8.2
jax~=0.4.23

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.14.0.dev2024091700
tensorrt_llm==0.14.0.dev2024092400
datasets~=2.14.5
rouge_score~=0.1.2
sentencepiece~=0.1.99

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.14.0.dev2024091700
tensorrt_llm==0.14.0.dev2024092400
datasets~=2.16.1
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.14.0.dev2024091700
tensorrt_llm==0.14.0.dev2024092400
datasets==2.14.6
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.14.0.dev2024091700
tensorrt_llm==0.14.0.dev2024092400
tiktoken
datasets
kaldialign

View File

@ -18,3 +18,4 @@ bandit==1.7.7
jsonlines==4.0.0
jieba==0.42.1
rouge==1.0.1
pytest-rerunfailures

View File

@ -74,7 +74,7 @@ def parse_arguments():
parser.add_argument(
'--max_batch_size',
type=int,
default=256,
default=2048,
help="Maximum number of requests that the engine can schedule.")
parser.add_argument('--max_input_len',
type=int,

View File

@ -4063,7 +4063,7 @@ def bert_attention(tensor: Tensor,
The maximum distance of relative position in attention, for implicit mode.
Default value is 0, meaning to use the regular mode of relative attention bias.
Implicit mode is only enabled when passing in non-zero positive max_distance value.
See relative attention bias in docs/gpt_attention.md
See relative attention bias in docs/source/advanced/gpt-attention.md
max_input_length: Tensor = None
The maximum input sequence length represented by Tensor shape. Requires for remove_input_padding to pre-define plugin workspace size.
@ -4619,19 +4619,19 @@ def gpt_attention(
arguments that are likely to be removed or merged with others in the future
release.
See docs/gpt_attention.md for the documentation of that function.
See docs/source/advanced/gpt-attention.md for the documentation of that function.
Parameters:
qkv: Tensor (On GPU)
The input QKV tensor. Its shape is [batch_beam_size, max_seqlen, qkv_dim] in padded mode and [1, num_tokens, qkv_dim] in
packed mode. Where qkv_dim depends on using MQA, GQA, or MHA. See QKV Input in docs/gpt_attention.md,
packed mode. Where qkv_dim depends on using MQA, GQA, or MHA. See QKV Input in docs/source/advanced/gpt-attention.md,
past_key_value: Tensor (On GPU)
The tensor that stores KV cache data. Its shape is
[max_batch_size * max_beam_width, 2, num_kv_heads, max_seqlen, hidden_dim_per_head]
in contiguous mode and
[max_blocks, 2, num_kv_heads, num_tokens_per_block, hidden_dim_per_head]
in paged mode. See KV Cache in docs/gpt_attention.md,
in paged mode. See KV Cache in docs/source/advanced/gpt-attention.md,
context_fmha_custom_mask: Tensor (On GPU)
The tensor that stores the packed custom mask for fmha.
@ -4639,7 +4639,7 @@ def gpt_attention(
sequence_lengths: Tensor (On GPU)
The tensor that stores the length of each sequence. Its shape is
[batch_size]. See QKV Input in docs/gpt_attention.md,
[batch_size]. See QKV Input in docs/source/advanced/gpt-attention.md,
host_past_key_value_lengths: Tensor (On CPU)
An INT32 tensor of shape [batch_size],
@ -4657,12 +4657,12 @@ def gpt_attention(
cache_indirection: Tensor (On GPU)
The tensor to reconstruct the paths when using beam-search. Its
shape is [batch_size, beam_width, max_seqlen]. See Beam-Search in
docs/gpt_attention.md,
docs/source/advanced/gpt-attention.md,
host_request_types: Tensor = None (On CPU)
The tensor on the host that indicates if a request is in context or
generation phase. Its shape is [batch_size]. See Inflight Batching
in docs/gpt_attention.md,
in docs/source/advanced/gpt-attention.md,
layer_idx: int
The index of this attention layer, used to access kv_cache_block_offsets,
@ -4678,7 +4678,7 @@ def gpt_attention(
q_scaling: float
The value used to compute the scaling factor applied to the output
of the Q*K^T product. See Scaling Factors in docs/gpt_attention.md,
of the Q*K^T product. See Scaling Factors in docs/source/advanced/gpt-attention.md,
qk_tanh_scale: float
The scale * tanh(value / scale) used to compute the scaling factor applied to the output
@ -4726,12 +4726,12 @@ def gpt_attention(
kv_orig_quant_scale: Tensor
The tensor to store the scaling factor for quantization to INT8/FP8
in the KV cache. Its shape is [1]. See INT8/FP8 KV Cache in
docs/gpt_attention.md,
docs/source/advanced/gpt-attention.md,
kv_quant_orig_scale: Tensor
The tensor to store the scaling factor for dequantization from
INT8/FP8 in the KV cache. Its shape is [1]. See INT8/FP8 KV Cache
in docs/gpt_attention.md,
in docs/source/advanced/gpt-attention.md,
attention_output_orig_quant_scale: Tensor
The tensor to store the scaling factor for quantization to FP8
@ -4742,7 +4742,7 @@ def gpt_attention(
max_context_length: int32_t
The length of the longest input sequence. See QKV Input in
docs/gpt_attention.md,
docs/source/advanced/gpt-attention.md,
mask_type: int = 1
The type of mask:
@ -4779,14 +4779,14 @@ def gpt_attention(
kv_cache_block_offsets:
The tensor of block offsets for the KV cache. Its shape is
[num_layers, max_batch_size, max_beam_width, 2, max_blocks_per_sequence * 2],
See KV cache section in docs/gpt_attention.md, on gpu,
See KV cache section in docs/source/advanced/gpt-attention.md, on gpu,
host_kv_cache_block_offsets:
The same as kv_cache_block_offsets, but on cpu,
host_kv_cache_pool_pointers:
The tensor of pool pointers for the KV cache. Its shape is [2],
See KV cache section in docs/gpt_attention.md, on gpu,
See KV cache section in docs/source/advanced/gpt-attention.md, on gpu,
do_cross_attention: bool = False
Do we use this as cross attention instead of self attention,
@ -4809,7 +4809,7 @@ def gpt_attention(
The maximum distance of relative position in attention, for implicit mode.
Default value is 0, meaning to use the regular mode of relative attention bias.
Implicit mode is only enabled when passing in non-zero positive max_distance value.
See relative attention bias in docs/gpt_attention.md
See relative attention bias in docs/source/advanced/gpt-attention.md
host_context_lengths: Tensor = None (On CPU)
A host tensor that contains the lengths of the different inputs,
@ -5609,7 +5609,7 @@ def lora_plugin(
host_request_types : Tensor = None
The tensor on the host that indicates if a request is in context or
generation phase. Its shape is [batch_size]. See Inflight Batching
in docs/gpt_attention.md,
in docs/source/advanced/gpt-attention.md,
transa : bool
Is the first input transposed? Set to 'True' if you want the first
@ -5736,7 +5736,7 @@ def mamba_conv1d(input: Tensor,
host_request_types : Tensor (On CPU)
The tensor on the host that indicates if a request is in context or
generation phase. Its shape is [batch_size]. See Inflight Batching
in docs/gpt_attention.md,
in docs/source/advanced/gpt-attention.md,
last_token_ids : Tensor (On GPU)
The inclusive prefix-sum of the lengths or the lengths of the
@ -5883,7 +5883,7 @@ def selective_scan(input: Tensor,
host_request_types : Tensor (On CPU)
The tensor on the host that indicates if a request is in context or
generation phase. Its shape is [batch_size]. See Inflight Batching
in docs/gpt_attention.md
in docs/source/advanced/gpt-attention.md
last_token_ids : Tensor (On GPU)
The inclusive prefix-sum of the lengths or the lengths of the
@ -6029,7 +6029,7 @@ def rg_lru(input: Tensor,
host_request_types : Tensor (On CPU)
The tensor on the host that indicates if a request is in context or
generation phase. Its shape is [batch_size]. See Inflight Batching
in docs/gpt_attention.md,
in docs/source/advanced/gpt-attention.md,
last_token_ids : Tensor (On GPU)
The inclusive prefix-sum of the lengths or the lengths of the

View File

@ -23,7 +23,7 @@ from .embedding import Embedding, PromptTuningEmbedding
from .linear import ColumnLinear, Linear, RowLinear
from .lora import Lora, LoraParams, LoraRuntimeParams
from .mlp import MLP, FusedGatedMLP, GatedMLP
from .moe import MOE, MoeConfig
from .moe import MOE, MoeConfig, SharedMoE
from .normalization import GroupNorm, LayerNorm, RmsNorm
from .pooling import AvgPool2d
from .recurrent import FusedRgLru, GroupedLinear, Recurrent, RgLru
@ -61,6 +61,7 @@ __all__ = [
'LoraRuntimeParams',
'MOE',
'MoeConfig',
'SharedMoE',
'Mamba',
'Mamba2',
'Recurrent',

View File

@ -90,15 +90,10 @@ class Embedding(Module):
param.value = loaded_weight
def postprocess(self, tllm_key, weights, **kwargs):
config = kwargs.get("config", None)
if weights is None:
return {}
weights = weights.to(str_dtype_to_torch(self.dtype))
if config.share_embedding_table:
return {}
else:
weights = weights.clone()
return {tllm_key: weights}
return {tllm_key: weights}
class PromptTuningEmbedding(Embedding):

View File

@ -61,6 +61,9 @@ class MoeConfig:
SPARSE_MIXER = 2
num_experts: int = 0
moe_intermediate_size: int = 0 # Add moe inter size (shanshan)
num_shared_experts: int = 0 # Add number of shared experts (shanshan)
top_k: int = 0
normalization_mode: ExpertScaleNormalizationMode = ExpertScaleNormalizationMode.RENORMALIZE
sparse_mixer_epsilon: float = 0.01
@ -832,3 +835,51 @@ class MoeOOTB(MOE):
if is_gated_act:
expert.gate.bias.value = experts_bias_1_raw[
i, :self.expert_inter_size]
# Add SharedMoE class (shanshan)
class SharedMoE(Module):
def __init__(self,
moe_config: MoeConfig,
hidden_size: int,
ffn_hidden_size: int,
hidden_act: str,
mapping: Mapping = Mapping(),
bias: bool = True,
dtype=None,
**kwargs):
super().__init__()
self.moe_config = moe_config
self.hidden_size = hidden_size
self.ffn_hidden_size = ffn_hidden_size
self.hidden_act = hidden_act
self.mapping = mapping
self.bias = bias
self.dtype = dtype
self.moe = MOE(hidden_size=self.hidden_size,
moe_config=self.moe_config,
mapping=self.mapping,
ffn_hidden_size=self.moe_config.moe_intermediate_size,
hidden_act=self.hidden_act,
dtype=self.dtype,
bias=False,
tp_group=self.mapping.tp_group,
tp_size=self.mapping.tp_size)
ClsMLP = GatedMLP if is_gated_activation(self.hidden_act) else MLP
self.shared_experts = ClsMLP(
hidden_size=self.hidden_size,
ffn_hidden_size=self.ffn_hidden_size,
hidden_act=non_gated_version(self.hidden_act), # deepseek use SiLU
bias=False,
dtype=self.dtype,
tp_group=self.mapping.tp_group,
tp_size=self.mapping.tp_size)
def forward(self, hidden_states):
if self.moe_config.num_shared_experts > 0:
return self.moe(hidden_states) + self.shared_experts(hidden_states)
else:
return self.moe(hidden_states)

View File

@ -23,6 +23,7 @@ from .cogvlm.model import CogVLMForCausalLM
from .dbrx.config import DbrxConfig
from .dbrx.model import DbrxForCausalLM
from .deci.model import DeciLMForCausalLM
from .deepseek_v1.model import DeepseekForCausalLM
from .dit.model import DiT
from .enc_dec.model import DecoderModel, EncoderModel, WhisperEncoder
from .falcon.config import FalconConfig
@ -57,6 +58,7 @@ __all__ = [
'BloomModel',
'BloomForCausalLM',
'DiT',
'DeepseekForCausalLM',
'FalconConfig',
'FalconForCausalLM',
'FalconModel',
@ -158,5 +160,6 @@ MODEL_MAP = {
'RecurrentGemmaForCausalLM': RecurrentGemmaForCausalLM,
'CogVLMForCausalLM': CogVLMForCausalLM,
'DiT': DiT,
'DeepseekForCausalLM': DeepseekForCausalLM,
'DeciLMForCausalLM': DeciLMForCausalLM,
}

View File

@ -0,0 +1,14 @@
# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

View File

@ -0,0 +1,361 @@
# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import time
import torch
from transformers import AutoConfig, AutoModelForCausalLM
from tensorrt_llm.layers import MoeConfig
from ..._utils import pad_vocab_size, release_gc
from ...mapping import Mapping
## Convert config parameters to dict
def create_trt_config_from_hf(model_dir,
dtype,
mapping: Mapping,
override_fields: dict = {}):
config = {}
assert isinstance(model_dir, str)
hf_config = AutoConfig.from_pretrained(model_dir, trust_remote_code=True)
dtype = dtype
n_layer = hf_config.num_hidden_layers
n_head = hf_config.num_attention_heads
n_embd = hf_config.hidden_size
inter_size = hf_config.intermediate_size
n_kv_head = hf_config.num_key_value_heads
vocab_size = hf_config.vocab_size
n_positions = hf_config.max_position_embeddings
hidden_act = 'swiglu' # TRT-LLM requires the gated activation to be explicit for the MoE implementation
rotary_base = hf_config.rope_theta
rms_norm_eps = hf_config.rms_norm_eps
moe_num_experts = hf_config.n_routed_experts
moe_top_k = hf_config.num_experts_per_tok
## shanshan fix
moe_renorm_mode = MoeConfig.ExpertScaleNormalizationMode.NONE
moe_num_shared_experts = hf_config.n_shared_experts
moe_inter_size = hf_config.moe_intermediate_size
rotary_scaling = hf_config.rope_scaling
config = {
'architecture': "DeepseekForCausalLM",
'dtype': dtype,
'logits_type': 'float32',
'num_hidden_layers': n_layer,
'num_attention_heads': n_head,
'hidden_size': n_embd,
'intermediate_size': inter_size,
'num_key_value_heads': n_kv_head,
'vocab_size': vocab_size,
'position_embedding_type': 'rope_gpt_neox',
'max_position_embeddings': n_positions,
'hidden_act': hidden_act,
'rotary_base': rotary_base,
'norm_epsilon': rms_norm_eps,
'rotary_scaling': rotary_scaling,
'moe_num_experts': moe_num_experts,
'moe_top_k': moe_top_k,
'moe_renorm_mode': moe_renorm_mode,
'moe_num_shared_experts': moe_num_shared_experts,
'moe_inter_size': moe_inter_size,
'mapping': {
'world_size': mapping.tp_size * mapping.pp_size,
'tp_size': mapping.tp_size,
'pp_size': mapping.pp_size,
'moe_tp_size': mapping.moe_tp_size,
'moe_ep_size': mapping.moe_ep_size,
},
}
config.update(override_fields)
moe_config = MoeConfig(num_experts=config['moe_num_experts'],
moe_intermediate_size=config['moe_inter_size'],
num_shared_experts=config['moe_num_shared_experts'],
top_k=config['moe_top_k'],
normalization_mode=config['moe_renorm_mode'])
moe_config.validate()
return config
## Get HF model
def load_hf_deepseek(model_dir):
model = AutoModelForCausalLM.from_pretrained(model_dir,
device_map='auto',
torch_dtype='auto',
trust_remote_code=True)
return model
## Prepare weights for TP
def split(v, tp_size, idx, dim=0):
if tp_size == 1:
return v
if len(v.shape) == 1:
return torch.chunk(v, tp_size)[idx].contiguous()
else:
return torch.chunk(v, tp_size, dim=dim)[idx].contiguous()
def split_qkv_tp(v, n_head, n_hidden, tensor_parallel, rank):
"""
Splits the QKV matrix according to tensor parallelism
"""
v = v.reshape(3, n_hidden, n_hidden)
split_v = split(v, tensor_parallel, rank, dim=1)
split_v = split_v.reshape(3 * (n_hidden // tensor_parallel), n_hidden)
return split_v.contiguous()
def split_matrix_tp(v, tensor_parallel, rank, dim):
return split(v, tensor_parallel, rank, dim=dim)
def get_weight(config, prefix, dtype, postfix='.weight'):
if config[prefix + postfix].dtype != dtype:
config[prefix + postfix].data = config[prefix + postfix].to(dtype)
return config[prefix + postfix].detach().cpu()
def get_trtllm_linear_weight(weight, prefix, postfix='weight'):
results = {}
results[prefix + postfix] = weight
return results
def convert_deepseek(hf_model,
config,
mapping,
dtype='float32',
use_parallel_embedding=False,
sharding_dim=0,
share_embedding_table=False):
weights = {}
tik = time.time()
model_params = dict(hf_model.named_parameters())
dtype = getattr(torch, dtype)
moe_config = MoeConfig(num_experts=config['moe_num_experts'],
moe_intermediate_size=config['moe_inter_size'],
num_shared_experts=config['moe_num_shared_experts'],
top_k=config['moe_top_k'],
normalization_mode=config['moe_renorm_mode'])
layers_range = mapping.pp_layers(config['num_hidden_layers'])
def convert_layer(l):
prefix = f'model.layers.{l}.'
print(prefix)
trtllm_prex = f'transformer.layers.{l - layers_range[0]}.'
q_weight = get_weight(model_params, prefix + 'self_attn.q_proj', dtype)
k_weight = get_weight(model_params, prefix + 'self_attn.k_proj', dtype)
v_weight = get_weight(model_params, prefix + 'self_attn.v_proj', dtype)
qkv_weight = torch.cat([q_weight, k_weight, v_weight], dim=0)
split_v = split_qkv_tp(qkv_weight, config['num_attention_heads'],
config['hidden_size'], mapping.tp_size,
mapping.tp_rank)
weights.update(
get_trtllm_linear_weight(split_v, trtllm_prex + 'attention.qkv.'))
attn_dense_weight = get_weight(model_params,
prefix + 'self_attn.o_proj', dtype)
split_v = split_matrix_tp(attn_dense_weight,
mapping.tp_size,
mapping.tp_rank,
dim=1)
weights.update(
get_trtllm_linear_weight(split_v, trtllm_prex + 'attention.dense.'))
if moe_config.has_moe() and l > 0:
rank_experts = list(range(moe_config.num_experts))
if mapping.has_moe_ep():
rank_experts = mapping.ep_experts(moe_config.num_experts)
for suffix in ["gate_proj", "down_proj", "up_proj"]:
model_params[f'model.layers.{l}.mlp.experts.{suffix}.weight'] = \
torch.stack([model_params[f'model.layers.{l}.mlp.experts.{expert}.{suffix}.weight'].detach().cpu()
for expert in rank_experts])
gate_proj = model_params[
f'model.layers.{l}.mlp.experts.gate_proj.weight']
down_proj = model_params[
f'model.layers.{l}.mlp.experts.down_proj.weight']
up_proj = model_params[
f'model.layers.{l}.mlp.experts.up_proj.weight']
if mapping.has_moe_tp():
gate_proj = split(gate_proj,
mapping.tp_size,
mapping.tp_rank,
dim=1)
down_proj = split(down_proj,
mapping.tp_size,
mapping.tp_rank,
dim=2)
up_proj = split(up_proj,
mapping.tp_size,
mapping.tp_rank,
dim=1)
model_params[
f'model.layers.{l}.mlp.experts.up_gate_proj.weight'] = torch.concat(
[up_proj, gate_proj], dim=-2)
model_params[
f'model.layers.{l}.mlp.experts.down_proj.weight'] = down_proj
## mlp.experts.down_proj.weight
moe_experts_down_proj_weights = get_weight(
model_params, prefix + 'mlp.experts.down_proj', dtype)
weights.update(
get_trtllm_linear_weight(moe_experts_down_proj_weights,
trtllm_prex + 'mlp.moe.proj.'))
##mlp.experts.up_gate.weight
moe_experts_up_gate_proj_weights = get_weight(
model_params, prefix + 'mlp.experts.up_gate_proj', dtype)
weights.update(
get_trtllm_linear_weight(moe_experts_up_gate_proj_weights,
trtllm_prex + 'mlp.moe.fc.'))
## MOE hardcodes routing_input to trt.float32; please refer to moe.py line 397
moe_experts_gate_weights = get_weight(model_params,
prefix + 'mlp.gate',
torch.float32)
weights.update(
get_trtllm_linear_weight(moe_experts_gate_weights,
trtllm_prex + 'mlp.moe.router.'))
if moe_config.num_shared_experts > 0:
## mlp.shared_experts.gate_proj.weight
shared_moe_gate_proj_weights = get_weight(
model_params, prefix + 'mlp.shared_experts.gate_proj',
dtype)
split_v = split_matrix_tp(shared_moe_gate_proj_weights,
mapping.tp_size,
mapping.tp_rank,
dim=0)
weights.update(
get_trtllm_linear_weight(
split_v, trtllm_prex + 'mlp.shared_experts.fc.'))
# mlp.shared_experts.down_proj.weight
shared_moe_down_proj_weights = get_weight(
model_params, prefix + 'mlp.shared_experts.down_proj',
dtype)
split_v = split_matrix_tp(shared_moe_down_proj_weights,
mapping.tp_size,
mapping.tp_rank,
dim=1)
weights.update(
get_trtllm_linear_weight(
split_v, trtllm_prex + 'mlp.shared_experts.proj.'))
## mlp.shared_experts.up_proj.weight
shared_moe_up_proj_weights = get_weight(
model_params, prefix + 'mlp.shared_experts.up_proj', dtype)
split_v = split_matrix_tp(shared_moe_up_proj_weights,
mapping.tp_size,
mapping.tp_rank,
dim=0)
weights.update(
get_trtllm_linear_weight(
split_v, trtllm_prex + 'mlp.shared_experts.gate.'))
else:
## The current Deepseek model has only one dense MLP layer; if the model grows larger, consider fusing it
mlp_gate_weight = get_weight(model_params, prefix + 'mlp.up_proj',
dtype)
split_gate = split_matrix_tp(mlp_gate_weight,
mapping.tp_size,
mapping.tp_rank,
dim=0)
weights.update(
get_trtllm_linear_weight(split_gate, trtllm_prex + 'mlp.gate.'))
mlp_fc_weight = get_weight(model_params, prefix + 'mlp.gate_proj',
dtype)
split_fc = split_matrix_tp(mlp_fc_weight,
mapping.tp_size,
mapping.tp_rank,
dim=0)
weights.update(
get_trtllm_linear_weight(split_fc, trtllm_prex + 'mlp.fc.'))
mlp_proj_weight = get_weight(model_params, prefix + 'mlp.down_proj',
dtype)
split_proj = split_matrix_tp(mlp_proj_weight,
mapping.tp_size,
mapping.tp_rank,
dim=1)
weights.update(
get_trtllm_linear_weight(split_proj, trtllm_prex + 'mlp.proj.'))
# Layer norms do not use tensor parallelism
input_ln_weight = get_weight(model_params, prefix + 'input_layernorm',
dtype)
weights[trtllm_prex + 'input_layernorm.weight'] = input_ln_weight
post_ln_weight = get_weight(model_params,
prefix + 'post_attention_layernorm', dtype)
weights[trtllm_prex + 'post_layernorm.weight'] = post_ln_weight
for l in layers_range:
convert_layer(l)
release_gc()
v = get_weight(model_params, 'model.embed_tokens', dtype)
if hf_model.config.tie_word_embeddings:
# lm_head.weight has the same weights as embedding
if mapping.is_last_pp_rank():
if config['vocab_size'] % mapping.tp_size != 0:
# padding
vocab_size_padded = pad_vocab_size(config['vocab_size'],
mapping.tp_size)
pad_width = vocab_size_padded - config['vocab_size']
v = torch.nn.functional.pad(v, (0, 0, 0, pad_width), 'constant',
0)
weights['lm_head.weight'] = split(v, mapping.tp_size,
mapping.tp_rank)
if use_parallel_embedding:
v = split_matrix_tp(v,
mapping.tp_size,
mapping.tp_rank,
                            dim=sharding_dim)
if mapping.is_first_pp_rank():
weights['transformer.vocab_embedding.weight'] = v
lm_head_weights = get_weight(model_params, 'lm_head', dtype)
if mapping.is_last_pp_rank():
if config['vocab_size'] % mapping.tp_size != 0:
# padding
vocab_size_padded = pad_vocab_size(config['vocab_size'],
mapping.tp_size)
pad_width = vocab_size_padded - config['vocab_size']
lm_head_weights = torch.nn.functional.pad(lm_head_weights,
(0, 0, 0, pad_width),
'constant',
value=0)
weights['lm_head.weight'] = split_matrix_tp(lm_head_weights,
mapping.tp_size,
mapping.tp_rank,
dim=0)
ln_f_w = get_weight(model_params, 'model.norm', dtype)
weights['transformer.ln_f.weight'] = ln_f_w
tok = time.time()
t = time.strftime('%H:%M:%S', time.gmtime(tok - tik))
print(f'Weights loaded. Total time: {t}')
#print(set(weights.keys()))
return weights

View File

@ -0,0 +1,257 @@
# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Optional
import torch
from ..._utils import pad_vocab_size, torch_dtype_to_str
from ...functional import Tensor, non_gated_version, recv, send
from ...layers import (Attention, AttentionMaskType, ColumnLinear, Embedding,
GatedMLP, MoeConfig, PositionEmbeddingType, RmsNorm,
SharedMoE)
from ...logger import logger
from ...mapping import Mapping
from ...module import Module
from ...plugin import init_all_reduce_helper
from ..modeling_utils import (DecoderLayerList, DecoderModelForCausalLM,
PretrainedConfig)
from .convert import convert_deepseek, create_trt_config_from_hf
class DeepseekDecoderLayer(Module):
def __init__(self, config: PretrainedConfig, layer_idx: int):
super().__init__()
self.layer_idx = layer_idx
self.config = config
### Input layernorm in Deepseek v1 is the same as in Llama
self.input_layernorm = RmsNorm(normalized_shape=config.hidden_size,
eps=config.norm_epsilon,
dtype=config.dtype)
layers_range = config.mapping.pp_layers(config.num_hidden_layers)
local_layer_idx = layer_idx - layers_range[0]
### Deepseek v1 model with standard attention
self.attention = Attention(
local_layer_idx=local_layer_idx,
hidden_size=config.hidden_size,
attention_head_size=config.head_size,
num_attention_heads=config.num_attention_heads,
num_kv_heads=config.num_key_value_heads,
max_position_embeddings=config.max_position_embeddings,
dtype=config.dtype,
attention_mask_type=AttentionMaskType.causal,
bias=False,
position_embedding_type=PositionEmbeddingType.rope_gpt_neox,
rotary_embedding_base=config.rotary_base,
rotary_embedding_scaling=config.rotary_scaling,
tp_group=config.mapping.tp_group,
tp_size=config.mapping.tp_size,
tp_rank=config.mapping.tp_rank)
ClsMLP = GatedMLP
moe_config = MoeConfig(num_experts=config.moe_num_experts,
moe_intermediate_size=config.moe_inter_size,
num_shared_experts=config.moe_num_shared_experts,
top_k=config.moe_top_k,
normalization_mode=config.moe_renorm_mode)
mlp_kwargs = {}
if config.moe_num_experts > 0 and layer_idx > 0:
mlp_hidden_size = moe_config.num_shared_experts * moe_config.moe_intermediate_size
hidden_act = config.hidden_act
ClsMLP = SharedMoE
mlp_kwargs = {"moe_config": moe_config, "mapping": config.mapping}
else:
ClsMLP = GatedMLP
mlp_hidden_size = config.intermediate_size
hidden_act = non_gated_version(
config.hidden_act) # back to non gated for dense layers
self.mlp = ClsMLP(hidden_size=config.hidden_size,
ffn_hidden_size=mlp_hidden_size,
hidden_act=hidden_act,
dtype=config.dtype,
bias=False,
tp_group=config.mapping.tp_group,
tp_size=config.mapping.tp_size,
**mlp_kwargs)
### Post layernorm in Deepseek v1 is the same as in Llama
self.post_layernorm = RmsNorm(normalized_shape=config.hidden_size,
eps=config.norm_epsilon,
dtype=config.dtype)
def forward(self,
hidden_states,
attention_mask=None,
use_cache=False,
spec_decoding_params=None,
kv_cache_params=None,
attention_params=None):
residual = hidden_states
hidden_states = self.input_layernorm(hidden_states)
attention_output = self.attention(
hidden_states,
attention_mask=attention_mask,
use_cache=use_cache,
spec_decoding_params=spec_decoding_params,
kv_cache_params=kv_cache_params,
attention_params=attention_params)
if use_cache:
attention_output, presents = attention_output
hidden_states = residual + attention_output
residual = hidden_states
hidden_states = self.post_layernorm(hidden_states)
hidden_states = self.mlp(hidden_states)
hidden_states = residual + hidden_states
if use_cache:
return (hidden_states, presents)
return hidden_states
class DeepseekModel(Module):
def __init__(self, config: PretrainedConfig) -> None:
super().__init__()
init_all_reduce_helper() # enable use_custom_all_reduce
self.mapping = config.mapping
if self.mapping.is_first_pp_rank():
self.vocab_embedding = Embedding(config.vocab_size,
config.hidden_size,
dtype=config.dtype)
self.layers = DecoderLayerList(DeepseekDecoderLayer, config)
if self.mapping.is_last_pp_rank():
self.ln_f = RmsNorm(normalized_shape=config.hidden_size,
eps=config.norm_epsilon,
dtype=config.dtype)
def forward(self,
input_ids,
position_ids=None,
use_cache=False,
attention_mask=None,
spec_decoding_params=None,
kv_cache_params=None,
attention_params=None,
hidden_states=None,
prompt_embedding_table: Optional[Tensor] = None,
prompt_tasks: Optional[Tensor] = None,
prompt_vocab_size: Optional[Tensor] = None):
ptuning_args = [
prompt_embedding_table, prompt_tasks, prompt_vocab_size
] if prompt_embedding_table is not None else []
if self.mapping.is_first_pp_rank():
hidden_states = self.vocab_embedding(input_ids, *ptuning_args)
else:
hidden_states = recv(hidden_states, self.mapping.prev_pp_rank())
hidden_states = self.layers.forward(
hidden_states,
use_cache=use_cache,
attention_mask=attention_mask,
kv_cache_params=kv_cache_params,
attention_params=attention_params,
spec_decoding_params=spec_decoding_params)
if use_cache:
hidden_states, presents = hidden_states
if self.mapping.is_last_pp_rank():
hidden_states = self.ln_f(hidden_states)
else:
hidden_states = send(hidden_states, self.mapping.next_pp_rank())
if use_cache:
return (hidden_states, tuple(presents))
return hidden_states
class DeepseekForCausalLM(DecoderModelForCausalLM):
def __init__(self, config: PretrainedConfig):
transformer = DeepseekModel(config)
vocab_size_padded = pad_vocab_size(config.vocab_size,
config.mapping.tp_size)
if config.mapping.is_last_pp_rank():
lm_head = ColumnLinear(config.hidden_size,
vocab_size_padded,
bias=False,
dtype=config.dtype,
tp_group=config.mapping.tp_group,
tp_size=config.mapping.tp_size,
gather_output=True)
else:
lm_head = None
self.mapping = config.mapping
super().__init__(config, transformer, lm_head)
@classmethod
def from_hugging_face(cls,
hf_model,
model_dir,
dtype: str = 'auto',
mapping: Optional[Mapping] = None,
override_fields={},
**kwargs):
assert hf_model is not None
if mapping is None:
mapping = Mapping()
config = create_trt_config_from_hf(model_dir,
dtype,
mapping=mapping,
override_fields=override_fields)
print(config)
pretrained_config = PretrainedConfig.from_dict(config)
pretrained_config.set_rank(mapping.rank) # TODO:remove this hack
if dtype == 'auto':
dtype = getattr(config, 'torch_dtype', None)
if dtype is None:
dtype = 'float16'
if isinstance(dtype, torch.dtype):
dtype = torch_dtype_to_str(dtype)
if dtype == 'float32': # should remove "float32"
dtype = 'float16'
if dtype == 'bfloat16' and torch.cuda.get_device_properties(
0).major < 8:
logger.warning(
"Pre SM 80 GPUs do not support bfloat16, fallback to float16")
dtype = 'float16'
deepseek = cls.from_config(pretrained_config)
weights = convert_deepseek(
hf_model,
config,
mapping,
dtype=dtype,
use_parallel_embedding=config.get('use_parallel_embedding', False),
sharding_dim=config.get('embedding_sharding_dim', 0),
share_embedding_table=config.get('share_embedding_table', False))
#check_share_embedding(weights, config)
deepseek.load(weights)
return deepseek

View File

@ -352,8 +352,7 @@ class LLaMAForCausalLM(DecoderModelForCausalLM):
hf_model_dir = quant_ckpt_path
loader = ModelWeightsLoader(hf_model_dir, custom_dict)
if config.share_embedding_table:
config.share_embedding_table = loader.check_share_embedding()
loader.check_share_embedding(config)
model = cls(config)
loader.generate_tllm_weights(model)
else:

View File

@ -283,23 +283,49 @@ class ModelWeightsLoader:
return weight_dict
def check_share_embedding(self):
def check_share_embedding(self, config):
# TODO: Remove after --use_share_embedding is removed
if not config.share_embedding_table:
return
from ..logger import logger
lm_head_weights = self.load_tensor(
self.translate_to_external_key("lm_head.weight",
self.tllm_to_externel_key_dict))
vocab_embed_weights = self.load_tensor(
self.translate_to_external_key("transformer.vocab_embedding.weight",
self.tllm_to_externel_key_dict))
share_embedding_table = False
if lm_head_weights is not None and vocab_embed_weights is not None:
if lm_head_weights.shape == vocab_embed_weights.shape:
if not (lm_head_weights - vocab_embed_weights).any():
return True
from ..logger import logger
logger.warning(
"lm_head.weight and transformer.vocab_embedding.weight are not identical, "
"share_embedding_table cannot be enabled; setting share_embedding_table=False."
)
return False
share_embedding_table = True
elif lm_head_weights is None and vocab_embed_weights is not None:
self.tllm_to_externel_key_dict[
'lm_head'] = self.tllm_to_externel_key_dict[
'transformer'] + '.' + self.tllm_to_externel_key_dict[
'vocab_embedding']
share_embedding_table = True
elif lm_head_weights is not None and vocab_embed_weights is None:
self.tllm_to_externel_key_dict[
'vocab_embedding'] = self.tllm_to_externel_key_dict['lm_head']
share_embedding_table = True
# Validation
mapping = config.mapping
if mapping.tp_size > 1:
if (not config.use_parallel_embedding) or (
config.use_parallel_embedding
and config.embedding_sharding_dim == 1):
share_embedding_table = False
if mapping.pp_size > 1:
share_embedding_table = False
if mapping.cp_size > 1:
share_embedding_table = False
config.share_embedding_table = share_embedding_table
if config.share_embedding_table:
logger.info("share_embedding_table enabled.")
def update_key_mapping(self, model):
self.model = weakref.ref(model)()
@ -313,6 +339,13 @@ class ModelWeightsLoader:
pp_layers)
})
# Share embedding
if self.tllm_to_externel_key_dict[
'vocab_embedding'] == self.tllm_to_externel_key_dict['lm_head']:
self.model.transformer.vocab_embedding.tllm_to_externel_key_dict = {
self.tllm_to_externel_key_dict['transformer']: '',
}
def fill(self, weights):
for tllm_key, param in self.model.named_parameters():
if param.is_buffer:

View File

@ -1286,6 +1286,9 @@ def preprocess_weights(weights: Dict[str, torch.Tensor],
def check_share_embedding(weights: Dict[str, torch.Tensor],
model_config: PretrainedConfig):
if model_config.share_embedding_table:
if "lm_head.weight" in weights:
if weights["lm_head.weight"] is None:
weights.pop("lm_head.weight")
if "lm_head.weight" in weights and "transformer.vocab_embedding.weight" in weights:
if (weights["lm_head.weight"] -
weights["transformer.vocab_embedding.weight"]).any():

View File

@ -25,6 +25,7 @@ from ...functional import Tensor, allreduce, recv, send, sigmoid
from ...layers import (MLP, MOE, Attention, AttentionMaskType, ColumnLinear,
Embedding, GatedMLP, RmsNorm, RowLinear)
from ...layers.moe import MOEWeightWrapper
from ...logger import logger
from ...lora_manager import (LoraConfig,
get_default_trtllm_modules_to_hf_modules, use_lora)
from ...mapping import Mapping
@ -427,12 +428,18 @@ class QWenForCausalLM(DecoderModelForCausalLM):
else:
if not use_preloading:
hf_model = load_hf_qwen(hf_model_dir, load_model_on_cpu)
logger.debug(f"HuggingFace model: {hf_model}")
model = QWenForCausalLM(config)
logger.debug(f"TensorRT-LLM model: {model}")
if use_hf_gptq_checkpoint:
weights = load_weights_from_hf_gptq_model(hf_model, config)
else:
weights = load_weights_from_hf_model(hf_model, config)
check_share_embedding(weights, config)
model = QWenForCausalLM(config)
model.load(weights)
return model

View File

@ -18,6 +18,18 @@ from ._common import default_net
from .logger import logger
def _addindent(s_, numSpaces):
s = s_.split('\n')
# don't do anything for single-line stuff
if len(s) == 1:
return s_
first = s.pop(0)
s = [(numSpaces * ' ') + line for line in s]
s = '\n'.join(s)
s = first + '\n' + s
return s
class Module(object):
def __init__(self) -> None:
@ -191,6 +203,23 @@ class Module(object):
for k, v in self.named_parameters():
v.value = tm[k].detach().cpu().numpy()
def _get_name(self):
return self.__class__.__name__
def __repr__(self):
# We treat the extra repr like the sub-module, one item per line
child_lines = []
for key, module in self._modules.items():
mod_str = repr(module)
mod_str = _addindent(mod_str, 2)
child_lines.append('(' + key + '): ' + mod_str)
main_str = self._get_name() + '('
if child_lines:
# simple one-liner info, which most builtin Modules will use
main_str += '\n ' + '\n '.join(child_lines) + '\n'
main_str += ')'
return main_str
class ModuleList(Module):
@ -221,3 +250,35 @@ class ModuleList(Module):
def __len__(self):
return len(self._modules)
def __repr__(self):
"""Return a custom repr for ModuleList that compresses repeated module representations."""
list_of_reprs = [repr(item) for item in self]
if len(list_of_reprs) == 0:
return self._get_name() + "()"
start_end_indices = [[0, 0]]
repeated_blocks = [list_of_reprs[0]]
for i, r in enumerate(list_of_reprs[1:], 1):
if r == repeated_blocks[-1]:
start_end_indices[-1][1] += 1
continue
start_end_indices.append([i, i])
repeated_blocks.append(r)
lines = []
main_str = self._get_name() + "("
for (start_id, end_id), b in zip(start_end_indices, repeated_blocks):
local_repr = f"({start_id}): {b}" # default repr
if start_id != end_id:
n = end_id - start_id + 1
local_repr = f"({start_id}-{end_id}): {n} x {b}"
local_repr = _addindent(local_repr, 2)
lines.append(local_repr)
main_str += "\n " + "\n ".join(lines) + "\n"
main_str += ")"
return main_str

View File

@ -12,4 +12,4 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
__version__ = "0.14.0.dev2024091700"
__version__ = "0.14.0.dev2024092400"

View File

@ -314,7 +314,7 @@ def test_llm_request():
assert llm_request.max_num_generated_tokens == 2
llm_request.pause(0)
assert llm_request.state == _tb.LlmRequestState.REQUEST_STATE_CONTEXT_INIT
assert llm_request.state == _tb.LlmRequestState.CONTEXT_INIT
llm_request.max_sent_token_len = 1
assert llm_request.max_sent_token_len == 1

tests/conftest.py Normal file
View File

@ -0,0 +1,106 @@
# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# # Force resource release after test
import gc
import multiprocessing.connection
import os
import sys
import time
import pytest
memory_profiling_enabled = os.environ.get("LLM_MEMORY_PROFILING", False)
if memory_profiling_enabled:
@pytest.hookimpl(trylast=True)
def pytest_sessionstart(session):
import xdist
session.stash["reporter"] = multiprocessing.connection.Client(
"/tmp/profiling_scribe.unix", "AF_UNIX")
session.stash["worker_id"] = xdist.get_xdist_worker_id(session)
session.stash["reporter"].send({
"type": "identity",
"identifier": "unittest",
"pid": os.getpid(),
"worker_id": session.stash["worker_id"]
})
@pytest.hookimpl(trylast=True)
def pytest_collection_modifyitems(session, config, items):
for item in items:
item.stash["reporter"] = session.stash["reporter"]
item.stash["worker_id"] = session.stash["worker_id"]
@pytest.hookimpl(trylast=True)
def pytest_sessionfinish(session):
session.stash["reporter"].close()
@pytest.hookimpl(tryfirst=True, wrapper=True)
def pytest_runtest_protocol(item, nextitem):
if memory_profiling_enabled:
path, line, name = item.reportinfo()
item.stash["reporter"].send({
"type": "unit_case",
"timestamp": time.time(),
"case": {
"path": str(path),
"line": line,
"name": name
},
"worker_id": item.stash["worker_id"],
"pid": os.getpid()
})
result = yield
if not any(module == 'torch' or module.startswith('torch.')
for module in sys.modules):
return result
import torch
if memory_profiling_enabled:
item.stash["reporter"].send({
"type": "torch_report",
"timestamp": time.time(),
"case": {
"path": str(path),
"line": line,
"name": name
},
"context": "unit",
"worker_id": item.stash["worker_id"],
"pid": os.getpid(),
"report": {
"allocated": torch.cuda.memory_allocated(),
"max_allocated": torch.cuda.max_memory_allocated(),
"reserved": torch.cuda.memory_reserved(),
"max_reserved": torch.cuda.max_memory_reserved(),
}
})
torch.cuda.reset_peak_memory_stats()
worker_count = int(os.environ.get('PYTEST_XDIST_WORKER_COUNT', 1))
if (torch.cuda.memory_reserved(0) + torch.cuda.memory_allocated(0)
) >= (torch.cuda.get_device_properties(0).total_memory //
worker_count) * 0.9:
gc.collect()
torch.cuda.empty_cache()
return result

View File

@ -72,6 +72,7 @@ class TestModule(unittest.TestCase):
def test_module(self):
m = Module3()
print(m)
m.forward()
self.assertEqual(4, len(list(m.named_modules())))
@ -88,6 +89,7 @@ class TestModule(unittest.TestCase):
def test_module_list(self):
m = Module4()
print(m)
m.forward()
self.assertEqual(8, len(list(m.named_modules())))