Update TensorRT-LLM (#1835)

* Update TensorRT-LLM

---------

Co-authored-by: Morgan Funtowicz <funtowiczmo@gmail.com>
Kaiyu Xie 2024-06-25 21:10:30 +08:00 committed by GitHub
parent 2a115dae84
commit 9691e12bce
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
94 changed files with 1129 additions and 988 deletions

View File

@ -165,6 +165,9 @@ struct BenchmarkParams
// Weights offloading
float gpuWeightsPercent{1.0};
// Decoding params
std::optional<std::vector<std::vector<SizeType32>>> medusaChoices;
};
class InferenceRequestsSyncSend
@ -791,6 +794,10 @@ public:
executorConfig.setMaxBatchSize(benchmarkParams.maxBatchSize.value());
}
executorConfig.setDecodingConfig(texec::DecodingConfig(
benchmarkParams.medusaChoices.has_value() ? texec::DecodingMode::Medusa() : texec::DecodingMode::Auto(),
std::nullopt, benchmarkParams.medusaChoices));
mExecutor = std::make_unique<texec::Executor>(trtEnginePath, texec::ModelType::kDECODER_ONLY, executorConfig);
if (logIterationData)
@ -1346,6 +1353,9 @@ void benchmarkGptManager(std::filesystem::path const& engineDir, TrtGptModelType
optionalParams.maxBeamWidth = beamWidth;
optionalParams.maxBatchSize = benchmarkParams.maxBatchSize;
optionalParams.schedulerConfig = texec::SchedulerConfig{capacitySchedulerPolicy};
optionalParams.decodingConfig = texec::DecodingConfig(
benchmarkParams.medusaChoices.has_value() ? texec::DecodingMode::Medusa() : texec::DecodingMode::Auto(),
std::nullopt, benchmarkParams.medusaChoices);
auto const jsonConfig = GptJsonConfig::parse(engineDir / "config.json");
SizeType32 deviceCount{0};
@ -1600,6 +1610,32 @@ void benchmarkExecutor(std::filesystem::path const& engineDir, TrtGptModelType m
}
}
std::vector<std::vector<SizeType32>> parseVectorOfVectors(std::string const& input)
{
std::vector<std::vector<SizeType32>> result;
std::regex outer_regex(R"(\[(.*?)\])");
std::regex inner_regex(R"(\d+)");
auto outer_begin = std::sregex_iterator(input.begin(), input.end(), outer_regex);
auto outer_end = std::sregex_iterator();
for (std::sregex_iterator i = outer_begin; i != outer_end; ++i)
{
std::smatch match = *i;
std::string inner_str = match.str(1);
std::vector<int> inner_vec;
auto inner_begin = std::sregex_iterator(inner_str.begin(), inner_str.end(), inner_regex);
auto inner_end = std::sregex_iterator();
for (std::sregex_iterator j = inner_begin; j != inner_end; ++j)
{
std::smatch inner_match = *j;
inner_vec.push_back(std::stoi(inner_match.str()));
}
result.push_back(inner_vec);
}
return result;
}
} // namespace
int main(int argc, char* argv[])
@ -1692,6 +1728,8 @@ int main(int argc, char* argv[])
options.add_options()("gpu_weights_percent",
"Specify the percentage of weights that reside on GPU (from 0.0 to 1.0).",
cxxopts::value<float>()->default_value("1.0"));
options.add_options()(
"medusa_choices", "Medusa choices in the format of [[0], [0, 1], [0, 0, 1]]", cxxopts::value<std::string>());
auto result = options.parse(argc, argv);
@ -1823,6 +1861,12 @@ int main(int argc, char* argv[])
// Argument: If offloaded blocks should be onboarded to primary memory before they are reused.
benchmarkParams.kvOnboardBlocks = !result["kv_dont_onboard_blocks"].as<bool>();
// Argument: Medusa choices for the Medusa speculative decoding.
if (result.count("medusa_choices"))
{
benchmarkParams.medusaChoices = parseVectorOfVectors(result["medusa_choices"].as<std::string>());
}
std::optional<TokenIdType> padId;
// Argument: Padding token id
if (result.count("pad_id"))
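For reference, the new `--medusa_choices` string is converted into a vector of vectors by `parseVectorOfVectors` above. Below is a minimal standalone sketch of that parsing (the `SizeType32` alias and the `main` driver are illustrative assumptions, not part of the change): the outer regex captures each bracketed group and the inner regex pulls out the integers, so `[[0], [0, 1], [0, 0, 1]]` becomes `{{0}, {0, 1}, {0, 0, 1}}`.

```cpp
// Standalone sketch of the nested-list parsing used for --medusa_choices.
// Assumes SizeType32 is a 32-bit integer; not the benchmark's actual build setup.
#include <cstdint>
#include <iostream>
#include <regex>
#include <string>
#include <vector>

using SizeType32 = std::int32_t;

std::vector<std::vector<SizeType32>> parseVectorOfVectors(std::string const& input)
{
    std::vector<std::vector<SizeType32>> result;
    std::regex outerRegex(R"(\[(.*?)\])"); // non-greedy match of each inner "[...]" group
    std::regex innerRegex(R"(\d+)");       // each integer inside a group
    for (auto it = std::sregex_iterator(input.begin(), input.end(), outerRegex); it != std::sregex_iterator(); ++it)
    {
        std::string inner = (*it).str(1);
        std::vector<SizeType32> innerVec;
        for (auto jt = std::sregex_iterator(inner.begin(), inner.end(), innerRegex); jt != std::sregex_iterator(); ++jt)
        {
            innerVec.push_back(std::stoi((*jt).str()));
        }
        result.push_back(innerVec);
    }
    return result;
}

int main()
{
    // "[[0], [0, 1], [0, 0, 1]]" -> {{0}, {0, 1}, {0, 0, 1}}
    auto choices = parseVectorOfVectors("[[0], [0, 1], [0, 0, 1]]");
    for (auto const& path : choices)
    {
        for (auto v : path)
        {
            std::cout << v << ' ';
        }
        std::cout << '\n';
    }
    return 0;
}
```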

View File

@ -944,6 +944,7 @@ def build_gpt(args):
network = builder.create_network()
network.trt_network.name = engine_name
network.plugin_config.to_legacy_setting()
network.plugin_config.dtype = args.dtype
# Plugins
if args.mode in ['plugin', 'plugin-ifb']:

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:1fec0fdc00c076761ec48eb5e2ea93473a329e844a8091e26c6e3e02fd14a8b1
size 3931604
oid sha256:8b6ad33047e2684c7d22471f87febbb96ae26f4eac6529e2f3b7c1469ec2ec6d
size 3931504

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:1fec0fdc00c076761ec48eb5e2ea93473a329e844a8091e26c6e3e02fd14a8b1
size 3931604
oid sha256:560f736af15a4dfba849ab29efc3520d6ec8c87bf2aa16589299b232dc171cca
size 3989220

View File

@ -1,3 +1,3 @@
93adf3003d7c422586a9bf892367371d libtensorrt_llm_batch_manager_static.a
93adf3003d7c422586a9bf892367371d libtensorrt_llm_batch_manager_static.pre_cxx11.a
c0bd2b69c932257678a2aad9bd8baba4b291795e commit
f8538ac35803837e5d457ea8c1a58053 libtensorrt_llm_batch_manager_static.a
dc6fc82dc4ba319899e1d6777bd8c3a4 libtensorrt_llm_batch_manager_static.pre_cxx11.a
265b039443334094026fbd8f396d52fe29c2d9d1 commit

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:bd757c26886a3ffd6947615d9f2829434e94839b693007a64b47c6b5c26416e4
size 3812158
oid sha256:74948e00ff7341914b1831ccfdce9ae242dd149603b1ba7e24ee993f08b63542
size 3812960

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:87321383075adf2d87cfbdc8a12a3d3815ef058d5da9b6aaa8d7d3f3263af439
size 3773896
oid sha256:0421ceacd5d07bc172bb4d0979edaf466aa8950290b4d6d1a7d355dbcefc2c84
size 3772832

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:58cdc0a330f8bfb7b50e3202aeac47bde0835b1dc600b4bfdcd2b30801e66e03
size 22381766
oid sha256:46eb1d351e3e8da3945a3f451166f12536aae3e440d57337d8891492424aff78
size 22387798

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:18a967eaa1e9a7164e0b104a84b13ea95404f7c7c278375feb2513d5f063bafe
size 1396404
oid sha256:19585b7709736197d9c1762d1bb8e3099e298d6dcc1c521d51c83637cc624c20
size 1397814

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:18a967eaa1e9a7164e0b104a84b13ea95404f7c7c278375feb2513d5f063bafe
size 1396404
oid sha256:f5d5475663640c068af2e9b5772b9b602656641dd17ca473ce7125ef7f2ec855
size 1423172

View File

@ -1,3 +1,3 @@
7d12b9c04cb6738bb5f7747a88b00c1c libtensorrt_llm_executor_static.a
7d12b9c04cb6738bb5f7747a88b00c1c libtensorrt_llm_executor_static.pre_cxx11.a
c0bd2b69c932257678a2aad9bd8baba4b291795e commit
e18e84fb356995b11c04b79e55c4c3f5 libtensorrt_llm_executor_static.a
f0555b76f21d43e676e5808bf197cc58 libtensorrt_llm_executor_static.pre_cxx11.a
265b039443334094026fbd8f396d52fe29c2d9d1 commit

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e503b4cfb1c842850287a359ffed23a1773a67a96475d365b66d757a283ac218
size 1448772
oid sha256:8496c9e4a20efd3d2072520cf843dac70cbb0fe23621cfba2a1e0ef3e5fa22ed
size 1450288

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:f8c80cf7aca2b135a656a060456fb30a820e459b4b36560162b02fa65121ef50
size 1375430
oid sha256:1b76267834252836e26ddecc2e1b9449e33a67fb1981e5d42f721bc439be1c02
size 1377018

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:cc65971d6d74260cb49b354aa4b0b82f92863cc722fbf206bf8a4919a4897532
size 14031364
oid sha256:9bd0faf48175745d7aeff58f539ae021db365b73933dab9c51329de9e92f2d86
size 14039826

View File

@ -424,7 +424,8 @@ std::vector<cutlass_extensions::CutlassGemmConfig> MoeGemmRunner<T, WeightType>:
template <typename T, typename WeightType>
bool MoeGemmRunner<T, WeightType>::isHopperSpecialised() const
{
bool config_is_sm90 = best_config_ && best_config_->is_sm90;
TLLM_CHECK_WITH_INFO(best_config_, "Cannot determine if hopper is specialised without a selected config");
bool config_is_sm90 = best_config_->is_sm90;
return supportsHopperSpecialisation() && config_is_sm90;
}
@ -440,7 +441,7 @@ int MoeGemmRunner<T, WeightType>::getSM() const
return this->sm_;
}
// currently support sm80 bf16/fp16 gate ativation, only set predication tensor for m direction
// currently support sm80 bf16/fp16 gate activation, only set predication tensor for m direction
template <typename T, typename WeightType>
bool MoeGemmRunner<T, WeightType>::isFusedGatedActivation(bool is_gated_activation, int gemm_n, int gemm_k) const
{

View File

@ -1,2 +1,2 @@
5b6c74ce66f62d2a58aa9cac16f11ad6 libtensorrt_llm_nvrtc_wrapper.so
c0bd2b69c932257678a2aad9bd8baba4b291795e commit
265b039443334094026fbd8f396d52fe29c2d9d1 commit

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:84319476e8ecf9666f40f69355f19ec3b585fc0987f940be14af9e11e3f524c3
oid sha256:9f2f97eb5b4181917a47b6028a857d7a597ca93faa5846af42c4cb24797d7fa7
size 1080832

View File

@ -1072,10 +1072,38 @@ std::vector<size_t> CutlassMoeFCRunner<T, WeightType, OutputType, Enable>::getWo
size_t const hopper_size = using_hopper ? HopperGroupedGemmInput::workspaceSize(num_experts_per_node) : 0;
size_t const gemm_workspace_size = moe_gemm_runner_.getMaxWorkspaceSize(num_experts_per_node);
std::vector<size_t> workspace{source_rows_size, permuted_rows_size, permuted_experts_size, permuted_data_size,
total_rows_before_expert_size, softmax_out_size, glu_inter_size,
// We do some overlapping of the large workspace buffers. Although we could overlap some of the other buffers, they
// are small enough (i.e no factor of hidden size) they will only be a couple MiB at most, so we don't bother
// in the case of fused activation we overlap permuted_data and fc2_result
// in the case of unfused activation we overlap permuted_data and fc1_result
// we need to calculate the max possible size, so use the max of all three
size_t overlapped_gemm1_gemm2_inputs = std::max(permuted_data_size, fc2_result_size);
// When glu_inter_elems is 0 we are always fused, otherwise we may need the un-fused case
if (glu_inter_elems > 0)
{
overlapped_gemm1_gemm2_inputs = std::max(overlapped_gemm1_gemm2_inputs, fc1_result_size);
}
// if we have glu_inter we overlap it with fc2_result, otherwise we use fc1_result by itself
size_t overlapped_gemm1_gemm2_outputs = fc1_result_size;
if (glu_inter_elems > 0)
{
overlapped_gemm1_gemm2_outputs
= std::max(std::max(glu_inter_size, fc2_result_size), overlapped_gemm1_gemm2_outputs);
}
std::vector<size_t> workspace{ //
source_rows_size, //
permuted_rows_size, //
permuted_experts_size, //
total_rows_before_expert_size, //
softmax_out_size, //
sorter_size, //
// These pointers reuse the same memory
std::max(fc1_result_size, sorter_size), fc2_result_size, hopper_size, gemm_workspace_size};
overlapped_gemm1_gemm2_inputs, //
overlapped_gemm1_gemm2_outputs, //
hopper_size, //
gemm_workspace_size};
return workspace;
}
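As a concrete illustration of the overlap rule described in the comments above, here is a small sketch with made-up byte sizes (the variable names mirror the ones above; the numbers and the standalone `main` are assumptions for illustration only):

```cpp
// Sketch of the workspace overlap rule: two shared slots sized as the max of the
// buffers that alias them. Sizes below are hypothetical.
#include <algorithm>
#include <cstddef>
#include <iostream>

int main()
{
    std::size_t const permuted_data_size = 8u << 20; // hypothetical
    std::size_t const fc1_result_size = 16u << 20;   // hypothetical
    std::size_t const fc2_result_size = 8u << 20;    // hypothetical
    std::size_t const glu_inter_size = 16u << 20;    // hypothetical
    std::size_t const glu_inter_elems = 1;           // > 0 means the un-fused case may be needed

    // Slot shared by the GEMM1/GEMM2 inputs: permuted_data aliases fc2_result
    // (fused activation) or fc1_result (un-fused), so size it with the max of all.
    std::size_t overlapped_inputs = std::max(permuted_data_size, fc2_result_size);
    if (glu_inter_elems > 0)
    {
        overlapped_inputs = std::max(overlapped_inputs, fc1_result_size);
    }

    // Slot shared by the outputs: glu_inter (if present) aliases fc2_result,
    // otherwise fc1_result occupies it alone.
    std::size_t overlapped_outputs = fc1_result_size;
    if (glu_inter_elems > 0)
    {
        overlapped_outputs = std::max({glu_inter_size, fc2_result_size, overlapped_outputs});
    }

    std::cout << "inputs slot: " << overlapped_inputs << " B, outputs slot: " << overlapped_outputs << " B\n";
    return 0;
}
```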
@ -1088,7 +1116,9 @@ size_t CutlassMoeFCRunner<T, WeightType, OutputType, Enable>::getWorkspaceSize(i
TLLM_CHECK_WITH_INFO(num_experts % ep_size == 0, "Number of experts must be a multiple of ep size");
auto workspace = getWorkspaceBufferSizes(
num_rows, hidden_size, inter_size, num_experts, num_experts / ep_size, k, activation_type);
return tensorrt_llm::common::calculateTotalWorkspaceSize(workspace.data(), workspace.size());
auto ws_size = tensorrt_llm::common::calculateTotalWorkspaceSize(workspace.data(), workspace.size());
TLLM_LOG_DEBUG("Mixture Of Experts Plugin requires workspace of %2f MiB", ws_size / 1024.f / 1024.f);
return ws_size;
}
template <class T, class WeightType, class OutputType, class Enable>
@ -1109,29 +1139,38 @@ void CutlassMoeFCRunner<T, WeightType, OutputType, Enable>::configureWsPtrs(char
source_rows_ = (int*) ws_sliced[0];
permuted_rows_ = (int*) ws_sliced[1];
permuted_experts_ = (int*) ws_sliced[2];
permuted_data_ = (T*) ws_sliced[3];
total_rows_before_expert_ = (int64_t*) ws_sliced[4];
total_rows_before_expert_ = (int64_t*) ws_sliced[3];
softmax_out_ = nullptr;
bool const is_pow_2 = (num_experts != 0) && ((num_experts & (num_experts - 1)) == 0);
if (!is_pow_2 || num_experts > 256)
{
softmax_out_ = (float*) ws_sliced[5];
softmax_out_ = (float*) ws_sliced[4];
}
glu_inter_result_ = (T*) ws_sliced[6];
sorter_ws_ = (char*) ws_sliced[5];
// These pointers are aliased. Since the sort ws can be overwritten after it is finished
sorter_ws_ = (char*) ws_sliced[7];
fc1_result_ = (T*) ws_sliced[7];
// Always 6, but overlapped with either fc1_result_ or fc2_result_
permuted_data_ = (T*) ws_sliced[6];
fc2_result_ = (T*) ws_sliced[8];
bool const is_gated_activation = isGatedActivation(activation_type);
bool const use_fused_moe = moe_gemm_runner_.isFusedGatedActivation(is_gated_activation, inter_size, hidden_size);
bool const using_hopper = moe_gemm_runner_.isHopperSpecialised();
bool const hopper_has_glu = using_hopper && (mayHaveDifferentGEMMOutputType() || is_gated_activation);
bool const non_hopper_has_glu = !using_hopper && !use_fused_moe && is_gated_activation;
bool const has_glu_inter_result = hopper_has_glu || non_hopper_has_glu;
// Always 7, ignored if not needed
glu_inter_result_ = has_glu_inter_result ? (T*) ws_sliced[7] : nullptr;
// fc1 and fc2 alias one of the above pointers, but it depends on if actfn is fused/unfused which is overlapped
fc1_result_ = has_glu_inter_result ? (T*) ws_sliced[6] : (T*) ws_sliced[7];
fc2_result_ = has_glu_inter_result ? (T*) ws_sliced[7] : (T*) ws_sliced[6];
hopper_grouped_gemm_input_ = {};
if (moe_gemm_runner_.isHopperSpecialised())
{
hopper_grouped_gemm_input_.configureWorkspace(ws_sliced[9], num_experts_per_node, ws_sliced[10], ws_sizes[10]);
hopper_grouped_gemm_input_.configureWorkspace(ws_sliced[8], num_experts_per_node, ws_sliced[9], ws_sizes[9]);
}
}
@ -1293,6 +1332,7 @@ void CutlassMoeFCRunner<T, WeightType, OutputType, Enable>::runMoe(void const* i
}
else
{
// Run the GEMM with activation function overridden with `Identity`, we do the activation separately
ActivationType activation_type = (use_fused_moe) ? fc1_activation_type : ActivationType::Identity;
T* gemm_result = (use_fused_moe) ? fc1_result_ : static_cast<T*>(glu_inter_result_);

View File

@ -431,7 +431,8 @@ void InitBindings(pybind11::module_& m)
&tle::DecodingConfig::setLookaheadDecoding)
.def_property("medusa_choices", &tle::DecodingConfig::getMedusaChoices, &tle::DecodingConfig::setMedusaChoices);
auto executorConfigGetState = [&](tle::ExecutorConfig const& self)
auto executorConfigGetState = [&peftCacheConfigGetstate, &kvCacheConfigGetstate, &schedulerConfigGetstate,
&parallelConfigGetstate](tle::ExecutorConfig const& self)
{
py::object peftCacheConfigState = py::none();
@ -453,7 +454,8 @@ void InitBindings(pybind11::module_& m)
peftCacheConfigState, self.getLogitsPostProcessorMap(), self.getLogitsPostProcessorBatched(),
self.getDecodingConfig(), self.getGpuWeightsPercent());
};
auto executorConfigSetState = [&](py::tuple state)
auto executorConfigSetState = [&kvCacheConfigSetstate, &peftCacheConfigSetstate, &schedulerConfigSetstate,
&parallelConfigSetstate](py::tuple state)
{
if (state.size() != 15)
{

View File

@ -96,8 +96,11 @@ void MedusaModule::initMedusaTensorsFromChoices(MedusaChoices const& choices, st
if (curDepth != depth)
{
TLLM_CHECK(depth + 1 == curDepth);
TLLM_CHECK_WITH_INFO(depth <= getMaxDraftPathLen(),
"Medusa choices require more Medusa heads than the engine was built with.");
// Save TopK
topKs[depth - 1] = maxTopK;
// Accumulate TopK for global indexing in tree
globalNodeInTreeIdx += maxTopK;

View File

@ -257,7 +257,7 @@ protected:
template <class T>
T* allocBuffer(size_t size)
{
managed_buffers.emplace_back(mBufferManager->managed(size * sizeof(T)));
managed_buffers.emplace_back(mBufferManager->gpu(size * sizeof(T)));
EXPECT_EQ(cudaGetLastError(), cudaSuccess) << "Error allocating buffer of size: " << size;
T* ptr = static_cast<T*>(managed_buffers.back()->data());
return ptr;
@ -268,15 +268,27 @@ protected:
this->managed_buffers.clear(); // Make sure all the previous buffers are freed
check_cuda_error(cudaDeviceSynchronize()); // Sync to make sure all previous operations are resolved
size_t weight_size = hidden_size * hidden_size * 4 * num_experts * sizeof(WeightType);
// Skip the test if the GPU does not have enough memory
size_t workspace_size = this->mMoERunner.getWorkspaceSize(
// Calculate the size contributions for all the large buffers to check if the GPU has enough space
bool const is_gated = tensorrt_llm::isGatedActivation(mActType);
size_t const num_gemms = 2 + is_gated;
// Expert weights
size_t const weight_size = hidden_size * (hidden_size * 4) * num_experts * sizeof(WeightStorage) * num_gemms;
// Workspace size
size_t const workspace_size = this->mMoERunner.getWorkspaceSize(
num_tokens, hidden_size, hidden_size * 4, num_experts, k, this->mActType, {});
// The input/output buffers
size_t const in_out_size = 2 * num_tokens * hidden_size * sizeof(DataType);
size_t total_size = workspace_size + weight_size * 2;
// This should be correct to within 100MiB (on tests with 30GiB total)
size_t const total_size = workspace_size + weight_size + in_out_size;
size_t const memory_pool_free_mem_size = mBufferManager->memoryPoolFree();
auto const [freeMem, totalMem] = tensorrt_llm::common::getDeviceMemoryInfo(false);
return freeMem >= total_size;
float const freeMemBuffer = 0.9f; // Add some buffer so we aren't completely pushing the limits
std::cout << "Free memory is: " << freeMem << ", memory pool size is: " << memory_pool_free_mem_size
<< ", required memory is: " << total_size << ", device total memory capacity: " << totalMem
<< std::endl;
return (freeMem + memory_pool_free_mem_size) * freeMemBuffer >= total_size;
}
void initBuffersPermute(std::vector<std::vector<DataType>> h_hidden_states,
@ -362,7 +374,10 @@ protected:
initFP8Scales(mMaxInput);
}
mTpExpertScratch = allocBuffer<DataType>(mTpExpertScratchSize);
if (parallelism_config.tp_size > 1 || parallelism_config.ep_size > 1)
{
mTpExpertScratch = allocBuffer<DataType>(mTpExpertScratchSize);
}
mActiveRows = mTotalTokens;
mFinished = nullptr;
@ -475,10 +490,18 @@ protected:
ASSERT_NE(mExpertFP8Scale1, nullptr);
ASSERT_NE(mExpertFP8Scale2, nullptr);
ASSERT_NE(mExpertFP8Scale3, nullptr);
// Dequant values for each expert are 1/(w_i*a_i) calculated above
std::fill_n(mExpertFP8Scale1, mNumExperts, 1.f / (scaleW1 * scaleAct1));
std::fill_n(mExpertFP8Scale3, mNumExperts, 1.f / (scaleW2 * scaleAct2));
*mExpertFP8Scale2 = scaleAct2;
std::vector<float> scales_1(mNumExperts, 1.f / (scaleW1 * scaleAct1));
std::vector<float> scales_2(1, scaleAct2);
std::vector<float> scales_3(mNumExperts, 1.f / (scaleW2 * scaleAct2));
check_cuda_error(cudaMemcpyAsync(mExpertFP8Scale1, scales_1.data(), scales_1.size() * sizeof(float),
cudaMemcpyHostToDevice, mStream->get()));
check_cuda_error(cudaMemcpyAsync(mExpertFP8Scale2, scales_2.data(), scales_2.size() * sizeof(float),
cudaMemcpyHostToDevice, mStream->get()));
check_cuda_error(cudaMemcpyAsync(mExpertFP8Scale3, scales_3.data(), scales_3.size() * sizeof(float),
cudaMemcpyHostToDevice, mStream->get()));
check_cuda_error(cudaStreamSynchronize(mStream->get()));
}
@ -561,6 +584,13 @@ protected:
void* ep_scale_2 = FP8 ? (void*) mExpertFP8Scale2 : (void*) mExpertIntScale2;
void* ep_scale_3 = FP8 ? mExpertFP8Scale3 : nullptr;
// Handle the case with no parallelism to not require the extra alloc
if (parallelism_config.tp_size == 1 && parallelism_config.ep_size == 1)
{
return std::tuple{
mExpertWeight1, mExpertWeight2, mExpertBias1, mExpertBias2, ep_scale_1, ep_scale_2, ep_scale_3};
}
// Slice weights for EP
size_t const gated_inter = mInterSize * mGatedMultiplier;
size_t const experts_per_node = mNumExperts / parallelism_config.ep_size;

View File

@ -28,7 +28,7 @@ def build_engine(weight_dir: _pl.Path, medusa_dir: _pl.Path,
covert_cmd = [_sys.executable, "examples/medusa/convert_checkpoint.py"] + (
['--model_dir', str(weight_dir)] if weight_dir else []) + [
'--medusa_model_dir', str(medusa_dir), \
'--output_dir', str(engine_dir), '--dtype=float16', '--fixed_num_medusa_heads=4'
'--output_dir', str(engine_dir), '--dtype=float16', '--num_medusa_heads=4'
] + list(args)
run_command(covert_cmd)

View File

@ -1,6 +1,6 @@
# Multi-stage Dockerfile
ARG BASE_IMAGE=nvcr.io/nvidia/pytorch
ARG BASE_TAG=24.04-py3
ARG BASE_TAG=24.05-py3
ARG DEVEL_IMAGE=devel
FROM ${BASE_IMAGE}:${BASE_TAG} as base

View File

@ -4,8 +4,8 @@ set -ex
# Use latest stable version from https://pypi.org/project/torch/#history
# and closest to the version specified in
# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-04.html#rel-24-04
TORCH_VERSION="2.3.0"
# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-05.html#rel-24-05
TORCH_VERSION="2.3.1"
SYSTEM_ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"')
prepare_environment() {

View File

@ -2,12 +2,12 @@
set -ex
TRT_VER="10.0.1.6"
TRT_VER="10.1.0.27"
# Align with the pre-installed cuDNN / cuBLAS / NCCL versions from
# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-04.html#rel-24-04
# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-05.html#rel-24-05
CUDA_VER="12.4" # 12.4.1
# Keep the installation for cuDNN if users want to install PyTorch with source codes.
# PyTorch 2.3.0 can compile with cuDNN v9.
# PyTorch 2.3.x can compile with cuDNN v9.
CUDNN_VER="9.1.0.70-1"
NCCL_VER="2.21.5-1+cuda12.4"
CUBLAS_VER="12.4.5.8-1"
@ -86,8 +86,7 @@ install_tensorrt() {
if [ "$ARCH" = "amd64" ];then ARCH="x86_64";fi
if [ "$ARCH" = "x86_64" ];then DIR_NAME="x64-agnostic"; else DIR_NAME=${ARCH};fi
if [ "$ARCH" = "aarch64" ];then OS1="Ubuntu22_04" && OS2="Ubuntu-22.04" && OS="ubuntu-22.04"; else OS1="Linux" && OS2="Linux" && OS="linux";fi
RELEASE_URL_TRT=https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.0.1/tars/TensorRT-${TRT_VER}.${OS2}.${ARCH}-gnu.cuda-${TRT_CUDA_VERSION}.tar.gz
RELEASE_URL_TRT=https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.1.0/tars/TensorRT-${TRT_VER}.${OS2}.${ARCH}-gnu.cuda-${TRT_CUDA_VERSION}.tar.gz
fi
wget --no-verbose ${RELEASE_URL_TRT} -O /tmp/TensorRT.tar
tar -xf /tmp/TensorRT.tar -C /usr/local/

View File

@ -44,9 +44,9 @@ The following table shows the supported software for TensorRT-LLM.
* -
- Software Compatibility
* - Container
- [24.04](https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html)
- [24.05](https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html)
* - TensorRT
- [10.0](https://docs.nvidia.com/deeplearning/tensorrt/release-notes/index.html)
- [10.1](https://docs.nvidia.com/deeplearning/tensorrt/release-notes/index.html)
* - Precision
-
- Hopper (SM90) - FP32, FP16, BF16, FP8, INT8, INT4

View File

@ -23,10 +23,11 @@ All published functionality in the Release Notes has been fully tested and verif
- TBD
### Infrastructure changes
- Base Docker image for TensorRT-LLM is updated to `nvcr.io/nvidia/pytorch:24.04-py3`.
- Base Docker image for TensorRT-LLM backend is updated to `nvcr.io/nvidia/tritonserver:24.04-py3`.
- Base Docker image for TensorRT-LLM is updated to `nvcr.io/nvidia/pytorch:24.05-py3`.
- Base Docker image for TensorRT-LLM backend is updated to `nvcr.io/nvidia/tritonserver:24.05-py3`.
- The dependent TensorRT version is updated to 10.1.0.
- The dependent CUDA version is updated to 12.4.1.
- The dependent PyTorch version is updated to 2.3.0.
- The dependent PyTorch version is updated to 2.3.1.
## TensorRT-LLM Release 0.10.0

View File

@ -15,10 +15,10 @@ If the first assumption holds true, the latency of speculative decoding will no
The combination of both these allows speculative decoding to result in reduced latency.
TensorRT-LLM supports several approaches for generating draft tokens, including:
1. Utilizing a smaller, auxiliary model, known as the draft model approach. For more information, refer to the [Fast Inference from Transformers via Speculative Decoding paper](https://arxiv.org/pdf/2211.17192.pdf).
2. Implementing additional language model heads that predict tokens for future positions, as detailed in the [Medusa: Simple LLM Inference Acceleration Framework with Multiple Decoding Heads paper](https://arxiv.org/abs/2401.10774).
## Performance Improvements
It's important to note that the effectiveness of speculative decoding techniques is highly dependent
@ -56,6 +56,166 @@ it is advisable to enable KV cache reuse for both models.
This can be achieved by adding the `--use_paged_context_fmha=enable` flag to the `trtllm-build` command
and setting `enableBlockReuse=true` in the `KVCacheConfig`.
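A minimal sketch of the executor-side setting, assuming the C++ executor API used elsewhere in this change (`texec::ExecutorConfig`, `texec::KvCacheConfig`); the constructor argument shown is an assumption, so check `executor.h` for the exact signature:

```cpp
// Sketch: enable KV cache block reuse when creating the executor, to pair with
// engines built with --use_paged_context_fmha=enable.
#include "tensorrt_llm/executor/executor.h"

namespace texec = tensorrt_llm::executor;

texec::ExecutorConfig makeConfigWithBlockReuse()
{
    // Assumption: enableBlockReuse is the first KvCacheConfig constructor argument.
    texec::KvCacheConfig kvCacheConfig(/*enableBlockReuse=*/true);
    texec::ExecutorConfig executorConfig;
    executorConfig.setKvCacheConfig(kvCacheConfig);
    return executorConfig;
}
```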
## Using Draft model approach with Triton Inference Server
+ The draft model approach has been supported since TensorRT-LLM-0.7.0 (using two separate Triton servers to host the draft and target models respectively), and was significantly optimized in TensorRT-LLM-0.10.0 (using one Triton server with [Business Logic Scripting](https://github.com/triton-inference-server/python_backend?tab=readme-ov-file#business-logic-scripting), BLS).
+ The source file of the draft model approach with BLS can be found [here](https://github.com/triton-inference-server/tensorrtllm_backend/blob/main/all_models/inflight_batcher_llm/tensorrt_llm_bls/1/lib/decode.py).
+ This example is based on TensorRT-LLM-0.10.0 and TRTLLM-backend-0.10.0, using the Docker image `nvcr.io/nvidia/tritonserver:24.05-trtllm-py3`.
+ Llama-7B-hf and Llama-30B-hf are used as the draft and target models respectively in this example, assuming the paths to the model repositories are `DRAFT_MODEL_PATH` and `TARGET_MODEL_PATH`.
+ The maximum number of draft tokens is set to 10 in this example.
1. Prepare TensorRT engine for inference
+ Here are the commands to build the draft / target engines in FP16 or FP8. All combinations of data types (Draft-FP16/FP8 + Target-FP16/FP8) are supported.
+ `--remove_input_padding=enable --paged_kv_cache=enable` are necessary for inflight batching.
+ `--context_fmha=enable --use_paged_context_fmha=enable` are optional, but recommended for performance.
+ `--gather_generation_logits` is necessary if generation logits are used for selecting tokens in the target model.
+ `--tp_size` can be adjusted if tensor parallelism is used for the draft / target model.
+ `--max_batch_size` greater than 1 is acceptable in general usage, but we use 1 in this example.
```bash
export MAX_DRAFT_LENGTH=10
export COMMON_COMMAND="--max_batch_size=1 --max_input_len=2048 --max_output_len=1024 --gpt_attention_plugin=float16 --gemm_plugin=float16 --remove_input_padding=enable --paged_kv_cache=enable --context_fmha=enable --use_paged_context_fmha=enable --gather_generation_logits"
export DRAFT_COMMAND_FP16="$COMMON_COMMAND"
export TARGET_COMMAND_FP16="$DRAFT_COMMAND_FP16 --max_draft_len=$MAX_DRAFT_LENGTH --speculative_decoding_mode draft_tokens_external"
export DRAFT_COMMAND_FP8="$COMMON_COMMAND --strongly_typed --use_fp8_context_fmha=enable"
export TARGET_COMMAND_FP8="$DRAFT_COMMAND_FP8 --max_draft_len=$MAX_DRAFT_LENGTH --speculative_decoding_mode draft_tokens_external"
# Build checkpoints and engines in tensorrt_llm/examples/llama/
# FP16 mode
export DRAFT_NAME=llama-7b-fp16-tp1
export TARGET_NAME=llama-30b-fp16-tp1
python3 convert_checkpoint.py --model_dir=$DRAFT_MODEL_PATH --output_dir=ckpt/$DRAFT_NAME --tp_size=1
python3 convert_checkpoint.py --model_dir=$TARGET_MODEL_PATH --output_dir=ckpt/$TARGET_NAME --tp_size=1
trtllm-build --checkpoint_dir=ckpt/$DRAFT_NAME --output_dir=engine/draft/$DRAFT_NAME $DRAFT_COMMAND_FP16
trtllm-build --checkpoint_dir=ckpt/$TARGET_NAME --output_dir=engine/target/$TARGET_NAME $TARGET_COMMAND_FP16
export DRAFT_ENGINE_PATH=$(pwd)/engine/draft/$DRAFT_NAME
export TARGET_ENGINE_PATH=$(pwd)/engine/target/$TARGET_NAME
# FP8 mode
export DRAFT_NAME=llama-7b-fp8-tp1
export TARGET_NAME=llama-30b-fp8-tp1
python3 convert_checkpoint.py --model_dir=$DRAFT_MODEL_PATH --output_dir=ckpt/$DRAFT_NAME --tp_size=1
python3 convert_checkpoint.py --model_dir=$TARGET_MODEL_PATH --output_dir=ckpt/$TARGET_NAME --tp_size=1
trtllm-build --checkpoint_dir=ckpt/$DRAFT_NAME --output_dir=engine/draft/$DRAFT_NAME $DRAFT_COMMAND_FP8
trtllm-build --checkpoint_dir=ckpt/$TARGET_NAME --output_dir=engine/target/$TARGET_NAME $TARGET_COMMAND_FP8
export DRAFT_ENGINE_PATH=$(pwd)/engine/draft/$DRAFT_NAME
export TARGET_ENGINE_PATH=$(pwd)/engine/target/$TARGET_NAME
```
2. Edit Triton configuration
+ If both the draft and target models fit on one GPU (for example, llama-7B-FP8 + llama-30B-FP8, roughly 40GiB in total on one H100-80GiB GPU), `DRAFT_GPU_DEVICE_IDS` and `TARGET_GPU_DEVICE_IDS` can be the same, for example `0`. This tends to perform better than placing the models on two separate GPUs.
+ Otherwise, the draft and target models can be placed on different GPUs, for example `DRAFT_GPU_DEVICE_IDS="0"` and `TARGET_GPU_DEVICE_IDS="1"`.
+ Furthermore, if TP mode is used, the value of `GPU_DEVICE_IDS` can be a list, for example `DRAFT_GPU_DEVICE_IDS="0"` and `TARGET_GPU_DEVICE_IDS="1,2,3,4"`.
+ For more configuration options when launching models with the Triton server, please visit the [TensorRT-LLM Backend repo](https://github.com/triton-inference-server/tensorrtllm_backend/blob/main/README.md).
```bash
ACCUMULATE_TOKEN="false"
BACKEND="tensorrtllm"
BATCH_SCHEDULER_POLICY="guaranteed_no_evict"
BATCHING_STRATEGY="inflight_fused_batching"
BLS_INSTANCE_COUNT="1"
DECODING_MODE="top_k_top_p"
DECOUPLED_MODE="False"
DRAFT_GPU_DEVICE_IDS="0"
E2E_MODEL_NAME="ensemble"
ENABLE_KV_CACHE_REUSE="true"
ENGINE_PATH=$TARGET_ENGINE_PATH
EXCLUDE_INPUT_IN_OUTPUT="false"
KV_CACHE_FREE_GPU_MEM_FRACTION="0.8"
MAX_ATTENTION_WINDOW_SIZE=""
MAX_BEAM_WIDTH="1"
MAX_QUEUE_DELAY_MICROSECONDS="0"
MAX_TOKENS_IN_KV_CACHE=""
NORMALIZE_LOG_PROBS="true"
POSTPROCESSING_INSTANCE_COUNT="1"
PREPROCESSING_INSTANCE_COUNT="1"
TARGET_GPU_DEVICE_IDS="1"
TENSORRT_LLM_DRAFT_MODEL_NAME="tensorrt_llm_draft"
TENSORRT_LLM_MODEL_NAME="tensorrt_llm"
TOKENIZER_PATH=$DRAFT_MODEL_PATH
TOKENIZER_TYPE=llama
TRITON_GRPC_PORT="8001"
TRITON_HTTP_PORT="8000"
TRITON_MAX_BATCH_SIZE="4"
TRITON_METRICS_PORT="8002"
TRITON_REPO="triton_repo"
USE_DRAFT_LOGITS="false"
# Make a copy of triton repo and replace the fields in the configuration files
cd /tensorrtllm_backend/
apt-get update && apt-get install -y build-essential cmake git-lfs
pip3 install git-lfs tritonclient grpcio
rm -rf ${TRITON_REPO}
cp -R all_models/inflight_batcher_llm ${TRITON_REPO}
python3 tools/fill_template.py -i ${TRITON_REPO}/ensemble/config.pbtxt triton_max_batch_size:${TRITON_MAX_BATCH_SIZE}
python3 tools/fill_template.py -i ${TRITON_REPO}/preprocessing/config.pbtxt tokenizer_dir:${TOKENIZER_PATH},triton_max_batch_size:${TRITON_MAX_BATCH_SIZE},preprocessing_instance_count:${PREPROCESSING_INSTANCE_COUNT}
python3 tools/fill_template.py -i ${TRITON_REPO}/postprocessing/config.pbtxt tokenizer_dir:${TOKENIZER_PATH},triton_max_batch_size:${TRITON_MAX_BATCH_SIZE},postprocessing_instance_count:${POSTPROCESSING_INSTANCE_COUNT}
python3 tools/fill_template.py -i ${TRITON_REPO}/tensorrt_llm_bls/config.pbtxt triton_max_batch_size:${TRITON_MAX_BATCH_SIZE},decoupled_mode:${DECOUPLED_MODE},accumulate_tokens:${ACCUMULATE_TOKEN},bls_instance_count:${BLS_INSTANCE_COUNT},tensorrt_llm_model_name:${TENSORRT_LLM_MODEL_NAME},tensorrt_llm_draft_model_name:${TENSORRT_LLM_DRAFT_MODEL_NAME}
# Make a copy of tensorrt_llm as configurations of draft / target models.
cp -R ${TRITON_REPO}/tensorrt_llm ${TRITON_REPO}/tensorrt_llm_draft
sed -i 's/name: "tensorrt_llm"/name: "tensorrt_llm_draft"/g' ${TRITON_REPO}/tensorrt_llm_draft/config.pbtxt
python3 tools/fill_template.py -i ${TRITON_REPO}/tensorrt_llm/config.pbtxt triton_backend:${BACKEND},engine_dir:${ENGINE_PATH},decoupled_mode:${DECOUPLED_MODE},max_tokens_in_paged_kv_cache:${MAX_TOKENS_IN_KV_CACHE},max_attention_window_size:${MAX_ATTENTION_WINDOW_SIZE},batch_scheduler_policy:${BATCH_SCHEDULER_POLICY},batching_strategy:${BATCHING_STRATEGY},kv_cache_free_gpu_mem_fraction:${KV_CACHE_FREE_GPU_MEM_FRACTION},exclude_input_in_output:${EXCLUDE_INPUT_IN_OUTPUT},triton_max_batch_size:${TRITON_MAX_BATCH_SIZE},max_queue_delay_microseconds:${MAX_QUEUE_DELAY_MICROSECONDS},max_beam_width:${MAX_BEAM_WIDTH},enable_kv_cache_reuse:${ENABLE_KV_CACHE_REUSE},normalize_log_probs:${NORMALIZE_LOG_PROBS},enable_chunked_context:${ENABLE_CHUNKED_CONTEXT},gpu_device_ids:${TARGET_GPU_DEVICE_IDS},decoding_mode:${DECODING_MODE}
python3 tools/fill_template.py -i ${TRITON_REPO}/tensorrt_llm_draft/config.pbtxt triton_backend:${BACKEND},engine_dir:${DRAFT_ENGINE_PATH},decoupled_mode:${DECOUPLED_MODE},max_tokens_in_paged_kv_cache:${MAX_TOKENS_IN_KV_CACHE},max_attention_window_size:${MAX_ATTENTION_WINDOW_SIZE},batch_scheduler_policy:${BATCH_SCHEDULER_POLICY},batching_strategy:${BATCHING_STRATEGY},kv_cache_free_gpu_mem_fraction:${KV_CACHE_FREE_GPU_MEM_FRACTION},exclude_input_in_output:${EXCLUDE_INPUT_IN_OUTPUT},triton_max_batch_size:${TRITON_MAX_BATCH_SIZE},max_queue_delay_microseconds:${MAX_QUEUE_DELAY_MICROSECONDS},max_beam_width:${MAX_BEAM_WIDTH},enable_kv_cache_reuse:${ENABLE_KV_CACHE_REUSE},normalize_log_probs:${NORMALIZE_LOG_PROBS},enable_chunked_context:${ENABLE_CHUNKED_CONTEXT},gpu_device_ids:${DRAFT_GPU_DEVICE_IDS},decoding_mode:${DECODING_MODE}
```
3. Launch Triton server
+ `--multi-model` is necessary if TP mode is used for the target model.
```bash
python3 scripts/launch_triton_server.py \
--model_repo=${TRITON_REPO} \
--tensorrt_llm_model_name "tensorrt_llm,tensorrt_llm_draft" \
--multi-model \
--log &
```
+ The verbose log is written to the file `triton_log.txt`. The Triton server has launched successfully if you see the output below in that file:
```txt
Started HTTPService at 0.0.0.0:8000
Started GRPCInferenceService at 0.0.0.0:8001
Started Metrics Service at 0.0.0.0:8002
```
4. Send Requests
+ Prepare a JSON file `input_data.json` containing the input data as below (more requests are acceptable).
```json
[
{
"input": "James Best, best known for his ",
"instruction": "Continue writing the following story:",
"output": " "
}
]
```
+ Use the command below to send requests for inference.
+ `--num-draft-tokens` can be set to the desired runtime draft length; 4 is used in this example.
```bash
python3 tools/inflight_batcher_llm/speculative_decoding_test.py \
--max-input-len 2048 \
--dataset=input_data.json \
--url-target=localhost:8001 \
--url-draft=localhost:8001 \
--draft-tensorrt-llm-model-name="${TENSORRT_LLM_DRAFT_MODEL_NAME}" \
--target-tensorrt-llm-model-name="${TENSORRT_LLM_MODEL_NAME}" \
--bls-speculative-tensorrt-llm-model-name="tensorrt_llm_bls" \
--execute-bls-speculative-decoding \
--disable-output-comparison \
--num-draft-tokens=4 \
--verbose
```
5. Kill the Triton server after inference is finished
```bash
pkill -9 -f trtllmExecutorWorker
pkill -9 -f tritonserver
```
# Medusa
This approach leverages a single model to both generate and verify draft tokens.

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.11.0.dev2024061800
tensorrt_llm==0.11.0.dev2024062500
datasets~=2.15.0
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.11.0.dev2024061800
tensorrt_llm==0.11.0.dev2024062500
datasets~=2.14.5
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.11.0.dev2024061800
tensorrt_llm==0.11.0.dev2024062500
datasets~=2.14.5
evaluate~=0.4.1
protobuf

View File

@ -119,7 +119,7 @@ def parse_arguments():
'By default, we use dtype for KV cache. int8_kv_cache chooses int8 quantization for KV'
)
parser.add_argument(
'--modelopt_quant_ckpt_path',
'--quant_ckpt_path',
type=str,
default=None,
help='Path of a quantized model checkpoint in .npz format')
@ -438,7 +438,7 @@ def main():
if args.use_weight_only and args.weight_only_precision == 'int4_gptq':
weights = load_weights_from_gptq(
args.modelopt_quant_ckpt_path,
args.quant_ckpt_path,
PretrainedConfig.from_dict(copy.deepcopy(config)),
)

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.11.0.dev2024061800
tensorrt_llm==0.11.0.dev2024062500
datasets~=2.14.5
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.11.0.dev2024061800
tensorrt_llm==0.11.0.dev2024062500
transformers>=4.31.0
datasets~=2.14.5
evaluate~=0.4.1

View File

@ -71,7 +71,7 @@ def parse_arguments():
"By default, we use dtype for KV cache. fp8_kv_cache chooses fp8 quantization for KV",
)
parser.add_argument(
"--modelopt_quant_ckpt_path",
"--quant_ckpt_path",
default=None,
help=
"Path of a directory to quantized model checkpoints in .safetensors format or \
@ -944,7 +944,7 @@ def convert(worker_rank, args, convert_kwargs):
weight_scales = quantize_fp8_weights(
weights, trt_llm_config.num_hidden_layers,
trt_llm_config.mapping)
scales = load_from_fp8_gemma(args.modelopt_quant_ckpt_path,
scales = load_from_fp8_gemma(args.quant_ckpt_path,
trt_llm_config.num_hidden_layers,
trt_llm_config.mapping,
args.fp8_kv_cache, weight_scales)

View File

@ -3,7 +3,7 @@
# WAR the new posting of "nvidia-cudnn-cu12~=9.0".
# "jax[cuda12_pip]~=0.4.19" specifies "nvidia-cudnn-cu12>=8.9" but actually requires "nvidia-cudnn-cu12~=8.9".
nvidia-cudnn-cu12~=8.9; platform_machine == "x86_64"
tensorrt_llm==0.11.0.dev2024061800
tensorrt_llm==0.11.0.dev2024062500
flax~=0.8.0
# jax[cuda12_pip]~=0.4.19; platform_system != "Windows"
jax~=0.4.19; platform_system == "Windows"

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.11.0.dev2024061800
tensorrt_llm==0.11.0.dev2024062500
datasets~=2.14.5
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.11.0.dev2024061800
tensorrt_llm==0.11.0.dev2024062500
datasets~=2.14.5
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -167,7 +167,7 @@ sh gptq_convert.sh
### 3. Convert weights from HF Transformers to TensorRT-LLM format
To apply groupwise GPTQ quantization, additional command-line flags need to be passed to `convert_checkpoint.py`:
Here `--modelopt_quant_ckpt_path` flag specifies the output safetensors of `gptq_convert.sh` script.
Here `--quant_ckpt_path` flag specifies the output safetensors of `gptq_convert.sh` script.
```bash
# Single GPU
@ -175,7 +175,7 @@ python3 convert_checkpoint.py --model_dir ./gptneox_model \
--dtype float16 \
--use_weight_only \
--weight_only_precision int4_gptq \
--modelopt_quant_ckpt_path ./gptneox_model/gptneox-20b-4bit-gs128.safetensors \
--quant_ckpt_path ./gptneox_model/gptneox-20b-4bit-gs128.safetensors \
--output_dir ./gptneox/20B/trt_ckpt/int4_gptq/1-gpu/
# With 2-way Tensor Parallel
python3 convert_checkpoint.py --model_dir ./gptneox_model \
@ -184,7 +184,7 @@ python3 convert_checkpoint.py --model_dir ./gptneox_model \
--weight_only_precision int4_gptq \
--tp_size 2 \
--workers 2 \
--modelopt_quant_ckpt_path ./gptneox_model/gptneox-20b-4bit-gs128.safetensors \
--quant_ckpt_path ./gptneox_model/gptneox-20b-4bit-gs128.safetensors \
--output_dir ./gptneox/20B/trt_ckpt/int4_gptq/2-gpu/
```

View File

@ -50,7 +50,7 @@ def parse_arguments():
'Define the precision for the weights when using weight-only quantization.'
'You must also use --use_weight_only for that argument to have an impact.'
)
parser.add_argument('--modelopt_quant_ckpt_path',
parser.add_argument('--quant_ckpt_path',
type=str,
default=None,
help='Path of a quantized model checkpoint')
@ -708,8 +708,7 @@ if __name__ == '__main__':
'has_zero_point':
True,
'group_size':
get_gptq_gptneox_group_size(args.modelopt_quant_ckpt_path,
hf_config)
get_gptq_gptneox_group_size(args.quant_ckpt_path, hf_config)
})
with open(os.path.join(args.output_dir, 'config.json'), 'w') as f:
@ -723,7 +722,7 @@ if __name__ == '__main__':
if args.use_weight_only and args.weight_only_precision == 'int4_gptq':
weights = load_from_gptq_gptneox(
args.modelopt_quant_ckpt_path,
args.quant_ckpt_path,
hf_config,
use_parallel_embedding=args.use_parallel_embedding,
sharding_dim=args.embedding_sharding_dim,

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.11.0.dev2024061800
tensorrt_llm==0.11.0.dev2024062500
datasets~=2.14.5
rouge_score~=0.1.2
evaluate~=0.4.1

View File

@ -1,6 +1,6 @@
-f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.11.0.dev2024061800
tensorrt_llm==0.11.0.dev2024062500
datasets==2.14.6
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -1,2 +1,2 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.11.0.dev2024061800
tensorrt_llm==0.11.0.dev2024062500

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.11.0.dev2024061800
tensorrt_llm==0.11.0.dev2024062500
datasets==2.14.5
rouge_score~=0.1.2
sentencepiece~=0.1.99

View File

@ -644,7 +644,7 @@ One can enable AWQ/GPTQ INT4 weight only quantization with these options when bu
- `--use_weight_only` enables weight only GEMMs in the network.
- `--per_group` enables groupwise weight only quantization; for the GPT-J example, we support AWQ with a default group size of 128.
- `--weight_only_precision` should specify the weight only quantization format. Supported formats are `int4_awq` or `int4_gptq`.
- `--modelopt_quant_ckpt_path` passes the quantized checkpoint to build the engine.
- `--quant_ckpt_path` passes the quantized checkpoint to build the engine.
The AWQ/GPTQ examples below involve 2 steps:
1. Weight quantization
@ -700,7 +700,7 @@ To run the GPTQ LLaMa example, the following steps are required:
python convert_checkpoint.py --model_dir /tmp/llama-7b-hf \
--output_dir ./tllm_checkpoint_2gpu_gptq \
--dtype float16 \
--modelopt_quant_ckpt_path ./llama-7b-4bit-gs128.safetensors \
--quant_ckpt_path ./llama-7b-4bit-gs128.safetensors \
--use_weight_only \
--weight_only_precision int4_gptq \
--per_group \

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.11.0.dev2024061800
tensorrt_llm==0.11.0.dev2024062500
datasets==2.14.6
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -29,9 +29,6 @@ Please install required packages first and setup `git-lfs`:
```bash
pip install -r requirements.txt
pip install "transformers>=4.39.0"
# Setup git-lfs
git lfs install
```
@ -98,8 +95,7 @@ The `trtllm-build` command builds TensorRT-LLM engines from TensorRT-LLM checkpo
# mamba-2.8b
trtllm-build --checkpoint_dir ./mamba_model/mamba-2.8b/trt_ckpt/bf16/1-gpu/ \
--paged_kv_cache disable \
--gemm_plugin bfloat16 \
--mamba_conv1d_plugin bfloat16 \
--gemm_plugin auto \
--max_batch_size 8 \
--max_input_len 924 \
--max_seq_len 1024 \
@ -108,8 +104,7 @@ trtllm-build --checkpoint_dir ./mamba_model/mamba-2.8b/trt_ckpt/bf16/1-gpu/ \
# mamba-1.4b
trtllm-build --checkpoint_dir ./mamba_model/mamba-1.4b/trt_ckpt/fp16/1-gpu/ \
--paged_kv_cache disable \
--gemm_plugin float16 \
--mamba_conv1d_plugin float16 \
--gemm_plugin auto \
--max_batch_size 8 \
--max_input_len 924 \
--max_seq_len 1024 \
@ -118,8 +113,7 @@ trtllm-build --checkpoint_dir ./mamba_model/mamba-1.4b/trt_ckpt/fp16/1-gpu/ \
# mamba-790m
trtllm-build --checkpoint_dir ./mamba_model/mamba-790m/trt_ckpt/fp16/1-gpu/ \
--paged_kv_cache disable \
--gemm_plugin float16 \
--mamba_conv1d_plugin float16 \
--gemm_plugin auto \
--max_batch_size 8 \
--max_input_len 924 \
--max_seq_len 1024 \
@ -128,8 +122,7 @@ trtllm-build --checkpoint_dir ./mamba_model/mamba-790m/trt_ckpt/fp16/1-gpu/ \
# mamba-370m
trtllm-build --checkpoint_dir ./mamba_model/mamba-370m/trt_ckpt/fp16/1-gpu/ \
--paged_kv_cache disable \
--gemm_plugin float16 \
--mamba_conv1d_plugin float16 \
--gemm_plugin auto \
--max_batch_size 8 \
--max_input_len 924 \
--max_seq_len 1024 \
@ -138,8 +131,7 @@ trtllm-build --checkpoint_dir ./mamba_model/mamba-370m/trt_ckpt/fp16/1-gpu/ \
# mamba-130m
trtllm-build --checkpoint_dir ./mamba_model/mamba-130m/trt_ckpt/fp16/1-gpu/ \
--paged_kv_cache disable \
--gemm_plugin float16 \
--mamba_conv1d_plugin float16 \
--gemm_plugin auto \
--max_batch_size 8 \
--max_input_len 924 \
--max_seq_len 1024 \

View File

@ -1,5 +1,6 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.11.0.dev2024061800
tensorrt_llm==0.11.0.dev2024062500
transformers>=4.39.0
datasets~=2.14.5
evaluate
rouge_score

View File

@ -13,6 +13,7 @@ For more info about Medusa visit [speculative decoding documentation](../../docs
* GPU Compute Capability >= 8.0 (Ampere or newer)
* FP16
* BF16
* FP8 (base model)
* PAGED_KV_CACHE
* Tensor Parallel
@ -32,7 +33,7 @@ https://huggingface.co/FasterDecoding/medusa-vicuna-7b-v1.3
```
We use `convert_checkpoint.py` script to convert the model for Medusa decoding into TensorRT-LLM checkpoint format.
Here we also add `--fixed_num_medusa_heads 4` as `medusa_num_heads` is 2 in `config.json` of `medusa-vicuna-7b-v1.3` but it actually has 4.
You can use `--num_medusa_heads` to set the number of Medusa heads to use. If it is not specified, `num_medusa_heads` will be set according to `medusa_num_heads` from the Medusa weights' `config.json`.
Here is the example:
```bash
@ -41,20 +42,21 @@ python convert_checkpoint.py --model_dir ./vicuna-7b-v1.3 \
--medusa_model_dir medusa-vicuna-7b-v1.3 \
--output_dir ./tllm_checkpoint_1gpu_medusa \
--dtype float16 \
--fixed_num_medusa_heads 4
--num_medusa_heads 4
# Note: Increasing the batch size may have a negative impact on performance
trtllm-build --checkpoint_dir ./tllm_checkpoint_1gpu_medusa \
--output_dir ./tmp/medusa/7B/trt_engines/fp16/1-gpu/ \
--gemm_plugin float16 \
--speculative_decoding_mode medusa \
--max_batch_size 8
--max_batch_size 4
# Convert and Build Medusa decoding support for vicuna-13b-v1.3 with 4-way tensor parallelism.
python convert_checkpoint.py --model_dir ./vicuna-7b-v1.3 \
--medusa_model_dir medusa-vicuna-7b-v1.3 \
--output_dir ./tllm_checkpoint_1gpu_medusa \
--dtype float16 \
--fixed_num_medusa_heads 4 \
--num_medusa_heads 4 \
--tp_size 4 \
--workers 4
@ -62,7 +64,30 @@ trtllm-build --checkpoint_dir ./tllm_checkpoint_1gpu_medusa \
--output_dir ./tmp/medusa/7B/trt_engines/fp16/1-gpu/ \
--gemm_plugin float16 \
--speculative_decoding_mode medusa \
--max_batch_size 8
--max_batch_size 4
```
### FP8 Post-Training Quantization for Base Model
The example below quantizes the base model to FP8, while keeping the weights of the Medusa heads unquantized.
```bash
# Quantize base model into FP8 and export trtllm checkpoint
python ../quantization/quantize.py --model_dir /path/to/base-model-hf/ \
--dtype float16 \
--qformat fp8 \
--kv_cache_dtype fp8 \
--output_dir ./tllm_checkpoint_1gpu_base_model_fp8_medusa_fp16 \
--calib_size 512 \
--tp_size 1 \
--medusa_model_dir /path/to/medusa_head/ \
--num_medusa_heads 4
# Build trtllm engines from the trtllm checkpoint
trtllm-build --checkpoint_dir ./tllm_checkpoint_1gpu_base_model_fp8_medusa_fp16 \
--output_dir ./trt_engine_1gpu_base_model_fp8_medusa_fp16 \
--gemm_plugin float16 \
--gpt_attention_plugin float16 \
--speculative_decoding_mode medusa \
--max_batch_size 4
```
### Run

View File

@ -19,12 +19,13 @@ from transformers.models.llama.modeling_llama import LlamaDecoderLayer
from transformers.pytorch_utils import Conv1D
import tensorrt_llm
from tensorrt_llm._utils import str_dtype_to_torch
from tensorrt_llm.logger import logger
from tensorrt_llm.mapping import Mapping
from tensorrt_llm.models import PretrainedConfig
from tensorrt_llm.models.convert_utils import load_calib_dataset
from tensorrt_llm.models.llama.convert import load_weights_from_hf_by_shard
from tensorrt_llm.models.medusa.weight import (get_tllm_linear_weight,
load_medusa_hf)
from tensorrt_llm.quantization import QuantAlgo
try:
@ -108,11 +109,6 @@ def parse_arguments():
help=
'By default, we use dtype for KV cache. int8_kv_cache chooses int8 quantization for KV'
)
parser.add_argument(
'--modelopt_quant_ckpt_path',
type=str,
default=None,
help='Path of a quantized model checkpoint in .npz format')
parser.add_argument(
'--per_group',
@ -182,13 +178,6 @@ def parse_arguments():
help='The number of workers for converting checkpoint in parallel')
parser.add_argument('--num_medusa_heads', type=int, default=4)
parser.add_argument(
'--fixed_num_medusa_heads',
type=int,
default=None,
help="If exist, fix medusa_num_heads from config.json."
"num_medusa_heads < medusa_num_heads in config.json < fixed_num_medusa_heads"
)
parser.add_argument('--num_medusa_layers', type=int, default=1)
parser.add_argument('--max_medusa_token_len', type=int, default=63)
parser.add_argument('--medusa_hidden_act', type=str, default="silu")
@ -570,29 +559,6 @@ def get_weight_and_bias(config, prefix, dtype):
return get_weight(config, prefix, dtype), get_bias(config, prefix, dtype)
def get_tllm_linear_weight(weight,
prefix,
bias=None,
use_weight_only=False,
plugin_weight_only_quant_type=torch.int8,
postfix='weight'):
results = {}
if use_weight_only:
v = weight.t().contiguous().cpu()
processed_torch_weights, torch_weight_scales = \
torch.ops.trtllm.symmetric_quantize_last_axis_of_batched_matrix(
v, plugin_weight_only_quant_type)
results[prefix + postfix] = processed_torch_weights
results[prefix + 'per_channel_scale'] = torch_weight_scales
else:
results[prefix + postfix] = weight.contiguous()
if bias is not None:
results[prefix + 'bias'] = bias
return results
def dup_kv_weight(v, num_head, tp_size):
assert tp_size % num_head == 0
reps = tp_size // num_head
@ -1189,77 +1155,28 @@ if __name__ == '__main__':
qkv_para=convert_args['llama_qkv_para'],
smoother=convert_args['llama_smoother'])
def load_medusa_hf(medusa_path: str,
mapping=Mapping(),
dtype='float32'):
logger.info("Loading Medusa heads' weights ...")
is_ckpt_safetensors = False
ckpt_file = Path(medusa_path) / "medusa_lm_head.pt"
if not ckpt_file.exists():
ckpt_file = Path(
medusa_path) / "medusa_lm_head.safetensors"
is_ckpt_safetensors = True
if is_ckpt_safetensors:
logger.info("Safetensors Found ...")
from safetensors.torch import load_file
state_dict = load_file(ckpt_file)
else:
state_dict = torch.load(ckpt_file, map_location="cpu")
torch_dtype = str_dtype_to_torch(dtype)
weights = {}
for h in range(args.num_medusa_heads):
for l in range(args.num_medusa_layers):
w = state_dict[f"{h}.{l}.linear.weight"].clone().to(
torch_dtype)
split_v = split(w, mapping.tp_size, mapping.tp_rank)
weights.update(
get_tllm_linear_weight(
split_v,
f'medusa_heads.{h}.medusa_layers.{l}.linear.',
None, args.use_weight_only,
plugin_weight_only_quant_type))
b = state_dict[f"{h}.{l}.linear.bias"].clone().to(
torch_dtype)
weights[
'medusa_heads.{}.medusa_layers.{}.linear.bias'.
format(h, l)] = split(b, mapping.tp_size,
mapping.tp_rank)
lm = state_dict[
f"{h}.{args.num_medusa_layers}.weight"].clone().to(
torch_dtype) # LM Head
weights['medusa_heads.{}.lm_head.weight'.format(
h)] = split(lm, mapping.tp_size, mapping.tp_rank)
return weights
if args.medusa_model_dir is not None:
config_file = Path(args.medusa_model_dir) / "config.json"
with open(config_file) as fp:
config = json.load(fp)
args.num_medusa_heads = config.get('medusa_num_heads',
args.num_medusa_heads)
num_medusa_heads_from_config = config.get(
'medusa_num_heads', args.num_medusa_heads)
args.num_medusa_layers = config.get('medusa_num_layers',
args.num_medusa_layers)
if args.fixed_num_medusa_heads is not None and args.fixed_num_medusa_heads != args.num_medusa_heads:
logger.info(
f"fixing num_medusa_heads from {args.num_medusa_heads} to {args.fixed_num_medusa_heads}"
)
args.num_medusa_heads = args.fixed_num_medusa_heads
if args.num_medusa_heads is None:
args.num_medusa_heads = num_medusa_heads_from_config
assert args.max_medusa_token_len > 0, "should have max_medusa_token_len > 0"
medusa_weights = load_medusa_hf(args.medusa_model_dir,
mapping,
dtype=args.dtype)
medusa_weights = load_medusa_hf(
medusa_path=args.medusa_model_dir,
num_medusa_heads=args.num_medusa_heads,
num_medusa_layers=args.num_medusa_layers,
mapping=mapping,
dtype=args.dtype,
use_weight_only=args.use_weight_only,
plugin_weight_only_quant_type=
plugin_weight_only_quant_type)
weights.update(medusa_weights)
safetensors.torch.save_file(

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.11.0.dev2024061800
tensorrt_llm==0.11.0.dev2024062500
datasets~=2.14.5
rouge_score~=0.1.2
sentencepiece~=0.1.99

View File

@ -1,4 +1,4 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.11.0.dev2024061800
tensorrt_llm==0.11.0.dev2024062500
transformers==4.38.2
accelerate==0.25.0

View File

@ -363,11 +363,13 @@ def main():
if args.test_trt_llm:
assert not args.test_hf, "Cannot test both TRT-LLM and HF"
runner_cls = ModelRunner if (args.debug_mode
or not PYTHON_BINDINGS) else ModelRunnerCpp
runner_cls = ModelRunner if not PYTHON_BINDINGS else ModelRunnerCpp
runner_kwargs = {}
if PYTHON_BINDINGS:
runner_kwargs.update(max_beam_width=1)
model = runner_cls.from_dir(args.engine_dir,
rank=runtime_rank,
debug_mode=args.debug_mode)
**runner_kwargs)
else:
assert args.test_hf, "Must test either TRT-LLM or HF"
if model_name == 'ChatGLMForCausalLM' and model_version == 'glm':

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.11.0.dev2024061800
tensorrt_llm==0.11.0.dev2024062500
datasets~=2.14.5
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.11.0.dev2024061800
tensorrt_llm==0.11.0.dev2024062500
transformers==4.40.2
datasets~=2.14.5
evaluate~=0.4.1

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.11.0.dev2024061800
tensorrt_llm==0.11.0.dev2024062500
datasets~=2.14.5
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -15,10 +15,9 @@ models using TensorRT-LLM and run on a single GPU.
## Overview
The TensorRT-LLM Phi implementation can be found in [`tensorrt_llm/models/phi/model.py`](../../tensorrt_llm/models/phi/model.py) and [`tensorrt_llm/models/phi3/model.py`](../../tensorrt_llm/models/phi3/model.py). The TensorRT-LLM Phi example code is located in [`examples/phi`](./). There are two files:
The TensorRT-LLM Phi implementation can be found in [`tensorrt_llm/models/phi/model.py`](../../tensorrt_llm/models/phi/model.py) and [`tensorrt_llm/models/phi3/model.py`](../../tensorrt_llm/models/phi3/model.py). The TensorRT-LLM Phi example code is located in [`examples/phi`](./) with a single file:
* [`convert_checkpoint.py`](./convert_checkpoint.py) to convert a checkpoint from the [HuggingFace (HF) Transformers](https://github.com/huggingface/transformers) format to the TensorRT-LLM format
* [`postprocess_quant_checkpoint.py`](./postprocess_quant_checkpoint.py) to post-process FP8 or INT8 SmoothQuant quantized checkpoints for Phi-3-small variants.
In addition, there are two shared files in the parent folder [`examples`](../) for inference and evaluation:
@ -29,18 +28,19 @@ In addition, there are two shared files in the parent folder [`examples`](../) f
* FP16
* BF16
* FP8
* INT8
* Tensor Parallel
## Support Matrix
| Model Name | FP16 | BF16 | FP8 | TP |
| :--------------: | :---: | :---: | :---: | :---: |
| phi-2 | Y | Y | | Y |
| Phi-3-mini-4k-instruct | Y | Y | | |
| Phi-3-mini-128k-instruct | Y | Y | | |
| Phi-3-small-8k-instruct | Y | Y | Y | Y |
| Phi-3-small-128k-instruct | Y | Y | Y | Y |
| Phi-3-medium-8k-instruct | Y | Y | | | Y |
| Phi-3-medium-128k-instruct | Y | Y | | | Y |
| Model Name | FP16 | BF16 | FP8 | INT8 | TP |
| :--------------: | :---: | :---: | :---: | :---: | :---: |
| phi-2 | Y | Y | | | Y |
| Phi-3-mini-4k-instruct | Y | Y | Y | Y |
| Phi-3-mini-128k-instruct | Y | Y | Y | Y |
| Phi-3-small-8k-instruct | Y | Y | Y | Y | Y |
| Phi-3-small-128k-instruct | Y | Y | Y | Y | Y |
| Phi-3-medium-8k-instruct | Y | Y | Y | Y |
| Phi-3-medium-128k-instruct | Y | Y | Y | Y |
* Model Name: the name of the model, the same as the name on HuggingFace
* TP: Tensor Parallel
@ -128,9 +128,9 @@ python3 ../summarize.py --engine_dir ./phi-engine-tp2 \
```
### 5. Quantization options for Phi-3-small
### 5. Quantization
Phi-3-small variants support post-training quantization to FP8 and INT8 SmoothQuant formats.
All Phi-3 variants support post-training quantization to FP8 and INT8 SmoothQuant formats.
FP8 checkpoints can be built as follows:
@ -141,8 +141,6 @@ python3 ../quantization/quantize.py \
--output_dir ./phi3-checkpoint \
--dtype $DTYPE \
--qformat fp8 --kv_cache_dtype fp8
python3 postprocess_quant_checkpoint.py --checkpoint_dir ./phi3-checkpoint
```
INT8 checkpoints can be built as follows:
@ -154,8 +152,6 @@ python3 ../quantization/quantize.py \
--output_dir ./phi3-checkpoint \
--dtype $DTYPE \
--qformat int8_sq --kv_cache_dtype int8
python3 postprocess_quant_checkpoint.py --checkpoint_dir ./phi3-checkpoint
```
The commands to [build TensorRT engines](#2-build-tensorrt-engines) from quantized checkpoints

View File

@ -19,8 +19,7 @@ import time
from transformers import AutoConfig
import tensorrt_llm
from tensorrt_llm.models import (Phi3ForCausalLM, Phi3SmallForCausalLM,
PhiForCausalLM)
from tensorrt_llm.models import Phi3ForCausalLM, PhiForCausalLM
def parse_arguments():
@ -81,16 +80,14 @@ if __name__ == '__main__':
model_config = AutoConfig.from_pretrained(args.model_dir,
trust_remote_code=True)
model_type = model_config.architectures[0]
supported_model = {
'PhiForCausalLM': PhiForCausalLM,
'Phi3ForCausalLM': Phi3ForCausalLM,
'Phi3VForCausalLM': Phi3ForCausalLM,
'Phi3SmallForCausalLM': Phi3SmallForCausalLM
}
supported_models = [
'PhiForCausalLM', 'Phi3ForCausalLM', 'Phi3VForCausalLM',
'Phi3SmallForCausalLM'
]
modelForCausalLM = None
if model_type not in supported_model:
if model_type not in supported_models:
assert False, "Invalid model type"
modelForCausalLM = supported_model[model_type]
modelForCausalLM = PhiForCausalLM if model_type == 'PhiForCausalLM' else Phi3ForCausalLM
modelForCausalLM.convert_hf_checkpoint(args.model_dir,
dtype=args.dtype,
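With the dedicated Phi3Small class gone, every supported architecture now resolves to one of two converter classes. A minimal, self-contained sketch of that dispatch (the placeholder classes stand in for the real tensorrt_llm imports):
```python
# Minimal sketch of the model-type dispatch above; placeholder classes stand in
# for tensorrt_llm.models.PhiForCausalLM and Phi3ForCausalLM.
class PhiForCausalLM: ...
class Phi3ForCausalLM: ...

SUPPORTED_MODELS = [
    'PhiForCausalLM', 'Phi3ForCausalLM', 'Phi3VForCausalLM',
    'Phi3SmallForCausalLM'
]

def select_model_class(model_type: str):
    """Map a HuggingFace architecture name to the TRT-LLM class that converts it."""
    if model_type not in SUPPORTED_MODELS:
        raise ValueError(f"Invalid model type: {model_type}")
    # Phi-2 keeps its own class; every Phi-3 variant (mini/small/vision) is
    # handled by the unified Phi3ForCausalLM after this change.
    return PhiForCausalLM if model_type == 'PhiForCausalLM' else Phi3ForCausalLM

assert select_model_class('Phi3SmallForCausalLM') is Phi3ForCausalLM
```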

View File

@ -1,63 +0,0 @@
# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import json
import time
import safetensors
from safetensors.torch import save_file
import tensorrt_llm
from tensorrt_llm.models.phi3.phi3small.convert import shuffle_qkv_weights
def parse_arguments():
parser = argparse.ArgumentParser()
parser.add_argument('--checkpoint_dir', type=str, default=None)
args = parser.parse_args()
return args
if __name__ == '__main__':
print(tensorrt_llm.__version__)
args = parse_arguments()
tensorrt_llm.logger.set_level('info')
tik = time.time()
with open(f"{args.checkpoint_dir}/config.json", "r") as f:
config = json.load(f)
weights = {}
with safetensors.safe_open(f"{args.checkpoint_dir}/rank0.safetensors",
framework="pt") as f:
for k in f.keys():
weights[k] = f.get_tensor(k)
# Transform QKV weights from custom Phi3Small format to TRT-LLM format
num_total_heads = config[
'num_attention_heads'] + 2 * config['num_key_value_heads']
for key, value in weights.items():
if "qkv." in key:
if 'scaling_factor' in key and value.shape[0] % num_total_heads != 0:
continue
weights[key] = shuffle_qkv_weights(value, config)
save_file(weights, f'{args.checkpoint_dir}/rank0.safetensors')
tok = time.time()
t = time.strftime('%H:%M:%S', time.gmtime(tok - tik))
print(f'Total time of converting checkpoints: {t}')

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.11.0.dev2024061800
tensorrt_llm==0.11.0.dev2024062500
datasets~=2.14.5
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -90,6 +90,17 @@ if __name__ == "__main__":
help="KV Cache dtype.",
default=None,
choices=["int8", "fp8", None])
# Medusa
parser.add_argument('--num_medusa_heads', type=int, default=4)
parser.add_argument('--num_medusa_layers', type=int, default=1)
parser.add_argument('--max_draft_len', type=int, default=63)
parser.add_argument('--medusa_hidden_act', type=str, default="silu")
parser.add_argument('--medusa_model_dir', type=str, default=None)
parser.add_argument('--quant_medusa_head',
default=False,
action='store_true',
help="whether to quantize the weights of medusa heads")
args = parser.parse_args()
if args.model_dir is not None:
@ -108,7 +119,13 @@ if __name__ == "__main__":
tp_size=args.tp_size,
pp_size=args.pp_size,
seed=args.seed,
tokenizer_max_seq_length=args.tokenizer_max_seq_length)
tokenizer_max_seq_length=args.tokenizer_max_seq_length,
num_medusa_heads=args.num_medusa_heads,
num_medusa_layers=args.num_medusa_layers,
max_draft_len=args.max_draft_len,
medusa_hidden_act=args.medusa_hidden_act,
medusa_model_dir=args.medusa_model_dir,
quant_medusa_head=args.quant_medusa_head)
elif args.nemo_ckpt_path is not None:
quantize_nemo_and_export(nemo_ckpt_path=args.nemo_ckpt_path,
decoder_type=args.decoder_type,

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.11.0.dev2024061800
tensorrt_llm==0.11.0.dev2024062500
datasets>=2.14.4
nemo-toolkit[all]<=1.20.0,>=1.18.0
rouge_score~=0.1.2

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.11.0.dev2024061800
tensorrt_llm==0.11.0.dev2024062500
datasets~=2.16.0
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.11.0.dev2024061800
tensorrt_llm==0.11.0.dev2024062500
datasets~=2.16.0
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.11.0.dev2024061800
tensorrt_llm==0.11.0.dev2024062500
git+https://github.com/google-deepmind/recurrentgemma.git
flax>=0.8.2
jax~=0.4.23

View File

@ -247,7 +247,7 @@ def main(args):
model_name, model_version = read_model_name(
args.engine_dir) if not is_enc_dec else ("", "")
if args.tokenizer_dir is None:
if args.tokenizer_dir is None and model_name in DEFAULT_HF_MODEL_DIRS:
logger.warning(
"tokenizer_dir is not specified. Try to infer from model_name, but this may be incorrect."
)
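The added membership test means the script only guesses a tokenizer location when the engine's model name is actually registered. A hedged sketch of that guard; the lookup table and fallback below are illustrative, not the real `DEFAULT_HF_MODEL_DIRS` contents:
```python
# Illustrative stand-in for run.py's lookup table; real entries differ.
DEFAULT_HF_MODEL_DIRS = {"GPTForCausalLM": "gpt2"}

def resolve_tokenizer_dir(tokenizer_dir, model_name):
    """Fall back to a per-model default only when one is actually registered."""
    if tokenizer_dir is None and model_name in DEFAULT_HF_MODEL_DIRS:
        # Guessing from the model name may still be wrong, hence the warning
        # emitted by the real script.
        return DEFAULT_HF_MODEL_DIRS[model_name]
    return tokenizer_dir

print(resolve_tokenizer_dir(None, "UnknownModel"))  # None: no guess is made
```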

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.11.0.dev2024061800
tensorrt_llm==0.11.0.dev2024062500
datasets~=2.16.1
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.11.0.dev2024061800
tensorrt_llm==0.11.0.dev2024062500
datasets==2.14.6
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.11.0.dev2024061800
tensorrt_llm==0.11.0.dev2024062500
tiktoken
datasets
kaldialign

View File

@ -16,12 +16,13 @@ pandas
h5py==3.10.0
StrEnum
sentencepiece>=0.1.99
tensorrt==10.0.1
# https://github.com/pytorch/pytorch/blob/v2.3.0/version.txt uses 2.3.0a0.
# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-04.html#rel-24-04 uses 2.3.0a0.
torch>=2.3.0a,<=2.3.0
tensorrt==10.1.0
# https://github.com/pytorch/pytorch/blob/v2.3.1/version.txt uses 2.3.0a0.
# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-05.html#rel-24-05 uses 2.4.0a0.
torch>=2.3.0a0,<=2.4.0a0
nvidia-modelopt~=0.11,<0.12
transformers>=4.38.2
pillow==10.2.0
wheel
optimum
evaluate

View File

@ -1146,8 +1146,8 @@ class GraphGroup(ABC):
num_kv_heads = plugin_info.pfc_as_ndarray["num_kv_heads"].copy()
tp_size = plugin_info.pfc_as_ndarray["tp_size"].copy()
tp_rank = plugin_info.pfc_as_ndarray["tp_rank"].copy()
num_kv_heads = num_kv_heads // kv_partition
num_heads = num_heads // partition
num_kv_heads = np.maximum(num_kv_heads // kv_partition, 1)
num_heads = np.maximum(num_heads // partition, 1)
tp_size[0] = partition
tp_rank[0] = index
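The switch to `np.maximum(..., 1)` guards grouped-query / multi-query models whose KV-head count is smaller than the candidate partition: plain floor division would drive the per-rank KV-head count to zero. A small NumPy illustration with made-up head counts:
```python
import numpy as np

# Hypothetical head counts: 32 query heads, 4 KV heads (grouped-query attention).
num_heads = np.array([32])
num_kv_heads = np.array([4])

for partition in (2, 4, 8, 16):
    # KV heads are only split while there are enough of them; once the
    # partition exceeds the KV-head count, each rank keeps (replicates) one.
    kv_per_rank = np.maximum(num_kv_heads // partition, 1)
    q_per_rank = np.maximum(num_heads // partition, 1)
    print(partition, int(q_per_rank[0]), int(kv_per_rank[0]))
# partition=8 or 16 would have produced 0 KV heads with a plain floor division.
```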

View File

@ -15,6 +15,11 @@ class PluginNode(Node):
layer.name)
layer.to_base_class()
def _collect_strategies(self, device_mesh):
raise NotImplementedError(
f"Auto parallel does not support {self.plugin_type} plugin right now."
)
def _default_strategy(self, device_mesh):
strategies_vector = StrategiesVector(self)
dim_partition_dict_mapping = {}

View File

@ -1,5 +1,6 @@
from enum import Enum, auto
import numpy as np
import torch
from tensorrt_llm.functional import PositionEmbeddingType
@ -369,8 +370,8 @@ class GPTAttentionPlugin(PluginNode):
num_kv_heads = self.plugin_info.pfc_as_ndarray["num_kv_heads"].copy()
tp_size = self.plugin_info.pfc_as_ndarray["tp_size"].copy()
tp_rank = self.plugin_info.pfc_as_ndarray["tp_rank"].copy()
num_kv_heads = num_kv_heads // kv_partition
num_heads = num_heads // partition
num_kv_heads = np.maximum(num_kv_heads // kv_partition, 1)
num_heads = np.maximum(num_heads // partition, 1)
tp_size[0] = partition
tp_rank[0] = 0

View File

@ -30,10 +30,9 @@ from ..builder import BuildConfig, Engine, build
from ..logger import logger
from ..lora_manager import LoraConfig, LoraManager
from ..models import MODEL_MAP, PretrainedConfig
from ..models.modeling_utils import (WEIGHT_LOADER_MODELS, QuantConfig,
from ..models.modeling_utils import (WEIGHT_LOADER_MODELS,
SpeculativeDecodingMode)
from ..plugin import PluginConfig, add_plugin_argument
from ..quantization import QuantAlgo
def parse_arguments():
@ -133,10 +132,6 @@ def parse_arguments():
type=str,
default=None,
choices=['float16', 'float32'])
parser.add_argument('--weight_only_precision',
type=str,
default=None,
choices=['int8', 'int4'])
parser.add_argument('--weight_sparsity', default=False, action='store_true')
parser.add_argument(
'--max_draft_len',
@ -270,14 +265,6 @@ def build_model(build_config: BuildConfig,
if logits_dtype is not None:
model_config.logits_dtype = logits_dtype
weight_only_precision = kwargs.get('weight_only_precision', None)
if not model_config.quant_mode.has_any_quant(
) and weight_only_precision is not None:
if weight_only_precision == 'int4':
model_config.quantization = QuantConfig(QuantAlgo.W4A16)
else:
model_config.quantization = QuantConfig(QuantAlgo.W8A16)
architecture = model_config.architecture
assert not build_config.plugin_config.streamingllm or architecture == "LlamaForCausalLM", \
"StreamingLLM is only supported in the llama model."
@ -420,7 +407,6 @@ def main():
kwargs = {
'logits_dtype': args.logits_dtype,
'use_fused_mlp': args.use_fused_mlp,
'weight_only_precision': args.weight_only_precision,
'tp_size': args.tp_size,
'pp_size': args.pp_size,
'lora_dir': args.lora_dir,

View File

@ -40,7 +40,6 @@ from .modeling_utils import (PretrainedConfig, PretrainedModel,
from .mpt.model import MPTForCausalLM, MPTModel
from .opt.model import OPTForCausalLM, OPTModel
from .phi3.model import Phi3ForCausalLM, Phi3Model
from .phi3.phi3small.model import Phi3SmallForCausalLM, Phi3SmallModel
from .phi.model import PhiForCausalLM, PhiModel
from .qwen.model import QWenForCausalLM
from .recurrentgemma.model import RecurrentGemmaForCausalLM
@ -70,10 +69,8 @@ __all__ = [
'GPTNeoXForCausalLM',
'PhiModel',
'Phi3Model',
'Phi3SmallModel',
'PhiForCausalLM',
'Phi3ForCausalLM',
'Phi3SmallForCausalLM',
'ChatGLMForCausalLM',
'ChatGLMModel',
'BaichuanForCausalLM',
@ -103,7 +100,7 @@ MODEL_MAP = {
'FalconForCausalLM': FalconForCausalLM,
'PhiForCausalLM': PhiForCausalLM,
'Phi3ForCausalLM': Phi3ForCausalLM,
'Phi3SmallForCausalLM': Phi3SmallForCausalLM,
'Phi3SmallForCausalLM': Phi3ForCausalLM,
'MambaForCausalLM': MambaForCausalLM,
'GPTNeoXForCausalLM': GPTNeoXForCausalLM,
'GPTJForCausalLM': GPTJForCausalLM,

View File

@ -292,6 +292,7 @@ class GemmaForCausalLM(DecoderModelForCausalLM):
return tllm_llama
def check_config(self, config):
config.set_if_not_exist("share_embedding_table", True)
config.set_if_not_exist('use_parallel_embedding', False)
config.set_if_not_exist('embedding_sharding_dim', 0)
config.set_if_not_exist('mlp_bias', False)

View File

@ -62,6 +62,80 @@ class GenerationMixin:
[split_point[-1], max_num_tokens, max_num_tokens])
return num_tokens_ranges
@staticmethod
def get_profiles_ranges(
*,
max_batch_size,
max_beam_width,
max_input_len,
max_num_tokens,
max_draft_len,
opt_batch_size,
opt_num_tokens,
enable_ctx_gen_opt_profiles,
multiple_profiles,
):
default_range = GenerationMixin.default_range
if opt_batch_size:
bb_range_cxt = [1, opt_batch_size, max_batch_size]
bb_range_gen = [
1, opt_batch_size * max_beam_width,
max_batch_size * max_beam_width
]
else:
bb_range_cxt = default_range(max_batch_size)
bb_range_gen = default_range(max_batch_size * max_beam_width)
tokens_per_engine_step = max_draft_len + 1
tokens_per_engine_step_range = [
1, tokens_per_engine_step, tokens_per_engine_step
]
bbd_range_ctx = [
bb_range_cxt[i] * (tokens_per_engine_step if i != 0 else 1)
for i in range(len(bb_range_cxt))
]
bbd_range_gen = [
bb_range_gen[i] * (tokens_per_engine_step if i != 0 else 1)
for i in range(len(bb_range_gen))
]
inlen_range_cxt = default_range(max_input_len)
inlen_range_gen = [1, 1, tokens_per_engine_step]
if enable_ctx_gen_opt_profiles:
num_profiles = 2
bb_range = [bb_range_cxt, bb_range_gen]
bbd_range = [bbd_range_ctx, bbd_range_gen]
inlen_range = [inlen_range_cxt, inlen_range_gen]
position_ids_inlen_range = [inlen_range_cxt, [1, 1, 1]]
num_tokens_range_ctx = default_range(max_batch_size * max_input_len)
# Draft tokens cannot be combined with beam search
num_tokens_range_gen = default_range(
max_batch_size * max(tokens_per_engine_step, max_beam_width))
num_tokens_range = [num_tokens_range_ctx, num_tokens_range_gen]
else:
if multiple_profiles:
num_tokens_range = GenerationMixin.split_num_tokens_range(
max_num_tokens)
else:
if opt_num_tokens is None:
opt_num_tokens = min(max_num_tokens,
max_batch_size * max_beam_width)
num_tokens_range = [[1, opt_num_tokens, max_num_tokens]]
num_profiles = len(num_tokens_range)
bb_range = [bb_range_gen] * num_profiles
bbd_range = [bbd_range_gen] * num_profiles
inlen_range = [[1, 1, max_input_len]] * num_profiles
position_ids_inlen_range = [[1, 1, max_input_len]] * num_profiles
tokens_per_engine_step_range = [tokens_per_engine_step_range
] * num_profiles
ranges = {
'bb_range': bb_range,
'bbd_range': bbd_range,
'inlen_range': inlen_range,
'position_ids_inlen_range': position_ids_inlen_range,
'num_tokens_range': num_tokens_range,
'tokens_per_engine_step_range': tokens_per_engine_step_range,
}
return num_profiles, ranges
def prepare_attention_inputs(self,
*,
max_batch_size,
@ -343,61 +417,26 @@ class GenerationMixin:
streamingllm: bool = False,
opt_batch_size=None):
default_range = GenerationMixin.default_range
tokens_per_engine_step = max_draft_len + 1
tokens_per_engine_step_range = [
1, tokens_per_engine_step, tokens_per_engine_step
]
if opt_batch_size:
bb_range_cxt = [1, opt_batch_size, max_batch_size]
bb_range_gen = [
1, opt_batch_size * max_beam_width,
max_batch_size * max_beam_width
]
else:
bb_range_cxt = default_range(max_batch_size)
bb_range_gen = default_range(max_batch_size * max_beam_width)
bbd_range_ctx = [
bb_range_cxt[i] * (tokens_per_engine_step if i != 0 else 1)
for i in range(len(bb_range_cxt))
]
bbd_range_gen = [
bb_range_gen[i] * (tokens_per_engine_step if i != 0 else 1)
for i in range(len(bb_range_gen))
]
inlen_range_cxt = default_range(max_input_len)
inlen_range_gen = [1, 1, tokens_per_engine_step]
enable_ctx_gen_opt_profiles = GenerationMixin.has_ctx_gen_opt_profiles(
use_gpt_attention_plugin, use_gemm_plugin, remove_input_padding,
paged_kv_cache)
if enable_ctx_gen_opt_profiles:
num_profiles = 2
bb_range = [bb_range_cxt, bb_range_gen]
bbd_range = [bbd_range_ctx, bbd_range_gen]
inlen_range = [inlen_range_cxt, inlen_range_gen]
position_ids_inlen_range = [inlen_range_cxt, [1, 1, 1]]
num_tokens_range_ctx = default_range(max_batch_size * max_input_len)
# Draft tokens cannot be combined with beam search
num_tokens_range_gen = default_range(
max_batch_size * max(tokens_per_engine_step, max_beam_width))
num_tokens_range = [num_tokens_range_ctx, num_tokens_range_gen]
else:
if multiple_profiles:
num_tokens_range = GenerationMixin.split_num_tokens_range(
max_num_tokens)
else:
if opt_num_tokens is None:
opt_num_tokens = min(max_num_tokens,
max_batch_size * max_beam_width)
num_tokens_range = [[1, opt_num_tokens, max_num_tokens]]
num_profiles = len(num_tokens_range)
bb_range = [bb_range_gen] * num_profiles
bbd_range = [bbd_range_gen] * num_profiles
inlen_range = [[1, 1, max_input_len]] * num_profiles
position_ids_inlen_range = [[1, 1, max_input_len]] * num_profiles
tokens_per_engine_step_range = [tokens_per_engine_step_range
] * num_profiles
num_profiles, ranges = GenerationMixin.get_profiles_ranges(
max_batch_size=max_batch_size,
max_beam_width=max_beam_width,
max_input_len=max_input_len,
max_num_tokens=max_num_tokens,
max_draft_len=max_draft_len,
opt_batch_size=opt_batch_size,
opt_num_tokens=opt_num_tokens,
enable_ctx_gen_opt_profiles=enable_ctx_gen_opt_profiles,
multiple_profiles=multiple_profiles)
bb_range = ranges['bb_range']
bbd_range = ranges['bbd_range']
inlen_range = ranges['inlen_range']
num_tokens_range = ranges['num_tokens_range']
position_ids_inlen_range = ranges['position_ids_inlen_range']
tokens_per_engine_step_range = ranges['tokens_per_engine_step_range']
position_ids_num_tokens_range = num_tokens_range
input_ids = None
@ -597,12 +636,13 @@ class GenerationMixin:
spec_decoding_params = None
# Use positional offsets and packed mask only when not in SpS spec decoding
if speculative_decoding_draft_tokens_external == False and max_draft_len > 0:
tokens_per_engine_step = max_draft_len + 1
# 32 bits packed mask aligned.
num_packed_masks = (tokens_per_engine_step + 32 - 1) // 32
packed_mask_len_range = [[0, 1, num_packed_masks]] * num_profiles
# total number of spec decoding tokens for all sequences (sequence length can be variable).
num_gen_tokens_range = [
default_range(
GenerationMixin.default_range(
max_batch_size * max_beam_width * tokens_per_engine_step,
min_range=0)
] * num_profiles
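The new `get_profiles_ranges` helper centralizes the optimization-profile bookkeeping that `prepare_basic_inputs`, Mamba, and RecurrentGemma previously duplicated. A hedged usage sketch with made-up build limits (it assumes `tensorrt_llm` is installed and that the helper lives in `tensorrt_llm/models/generation_mixin.py` as shown above):
```python
# Hedged sketch: exercises the new static helper with made-up limits.
from tensorrt_llm.models.generation_mixin import GenerationMixin  # assumed module path

num_profiles, ranges = GenerationMixin.get_profiles_ranges(
    max_batch_size=8,
    max_beam_width=1,
    max_input_len=1024,
    max_num_tokens=4096,
    max_draft_len=0,          # no speculative decoding
    opt_batch_size=None,
    opt_num_tokens=None,
    enable_ctx_gen_opt_profiles=False,
    multiple_profiles=False,
)

# Every entry holds one [min, opt, max] triple per profile, so any model can
# feed them straight into its OrderedDict-based dim_range definitions.
for name in ('bb_range', 'bbd_range', 'inlen_range', 'num_tokens_range'):
    assert len(ranges[name]) == num_profiles
print(num_profiles, ranges['num_tokens_range'])
```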

View File

@ -1535,6 +1535,12 @@ def load_weights_from_hf_safetensors(model_dir: str, config: LLaMAConfig):
moe_config = config.moe
kv_tp_size = None
kv_tp_rank = None
if config.num_key_value_heads < mapping.tp_size:
kv_tp_size = config.num_key_value_heads
kv_tp_rank = mapping.tp_rank * kv_tp_size // mapping.tp_size
model_prefix = "model."
key_list = [
"embed_tokens.weight", # vocab_embedding
@ -1552,7 +1558,12 @@ def load_weights_from_hf_safetensors(model_dir: str, config: LLaMAConfig):
torch_dtype = str_dtype_to_torch(dtype)
def load(key, tp_dim=-1, no_prefix=0, is_expert_weights=False):
def load(key,
tp_dim=-1,
no_prefix=0,
is_expert_weights=False,
tp_size=None,
tp_rank=None):
if not no_prefix:
key = model_prefix + key
ptr_idx = safetensors_map[key] if key in safetensors_map else 0
@ -1560,38 +1571,28 @@ def load_weights_from_hf_safetensors(model_dir: str, config: LLaMAConfig):
if key not in safetensors_ptrs[ptr_idx].keys():
return None
tensor_slice = safetensors_ptrs[ptr_idx].get_slice(key)
tensor_shape = tensor_slice.get_shape()
if tp_dim == -1:
res = safetensors_ptrs[ptr_idx].get_tensor(key)
else:
res = tensor_slice[:]
elif tp_dim >= 0 and tp_dim < len(tensor_shape):
if is_expert_weights:
tp_size = mapping.moe_tp_size
tp_rank = mapping.moe_tp_rank
else:
tp_size = mapping.tp_size
tp_rank = mapping.tp_rank
tensor_slice = safetensors_ptrs[ptr_idx].get_slice(key)
tensor_shape = tensor_slice.get_shape()
if len(tensor_shape) == 1:
if tp_dim == 0:
slice_width = tensor_shape[0] // tp_size
res = tensor_slice[slice_width * tp_rank:slice_width *
(tp_rank + 1)]
else:
res = tensor_slice[:]
else:
if tensor_shape[tp_dim] % tp_size != 0:
logger.error(
"Current weight shape is invalid for tp_size=" +
str(tp_size))
slice_width = tensor_shape[tp_dim] // tp_size
if tp_dim == 0:
res = tensor_slice[slice_width * tp_rank:slice_width *
(tp_rank + 1), :]
elif tp_dim == 1:
res = tensor_slice[:, slice_width * tp_rank:slice_width *
(tp_rank + 1)]
else:
assert False, "Invalid TP dim"
tp_size = tp_size or mapping.tp_size
tp_rank = tp_rank or mapping.tp_rank
dim_size = tensor_shape[tp_dim]
if dim_size % tp_size != 0:
logger.error(
f"Current weight shape {tensor_shape} is invalid at dimension {tp_dim} for TP size {tp_size}"
)
indices = [slice(None)] * len(tensor_shape)
indices[tp_dim] = slice(dim_size * tp_rank // tp_size,
dim_size * (tp_rank + 1) // tp_size)
res = tensor_slice[indices]
else:
raise ValueError(f"Invalid TP dim: {tp_dim}")
return res.to(torch_dtype).contiguous(
) if "block_sparse_moe.gate" not in key else res.to(torch.float32)
@ -1632,11 +1633,19 @@ def load_weights_from_hf_safetensors(model_dir: str, config: LLaMAConfig):
# Attention
qkv_list = []
for comp in ["q", "k", "v"]:
weight_part = load(prefix + key_list[3] + comp + key_list[4], 0)
tp_size = kv_tp_size if comp != "q" else None
tp_rank = kv_tp_rank if comp != "q" else None
weight_part = load(prefix + key_list[3] + comp + key_list[4],
0,
tp_size=tp_size,
tp_rank=tp_rank)
qkv_list.append(weight_part)
bias_part = load(
(prefix + key_list[3] + comp + key_list[4]).replace(
"weight", "bias"), 0)
"weight", "bias"),
0,
tp_size=tp_size,
tp_rank=tp_rank)
if bias_part is not None:
qkv_list.append(bias_part)
if len(qkv_list) == 3:
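The rewritten loader builds a full `slice(None)` per dimension and then narrows only `tp_dim`, which covers 1-D and 2-D tensors (and the separate KV tensor-parallel size) with one code path. A standalone sketch of the same slicing trick on a plain tensor; the shapes are made up:
```python
import torch

def shard(tensor: torch.Tensor, tp_dim: int, tp_size: int, tp_rank: int) -> torch.Tensor:
    """Return this rank's contiguous slice of `tensor` along `tp_dim`."""
    dim_size = tensor.shape[tp_dim]
    assert dim_size % tp_size == 0, (
        f"shape {tuple(tensor.shape)} is invalid at dim {tp_dim} for TP size {tp_size}")
    # Start from a full slice in every dimension, then narrow only tp_dim --
    # the same trick the safetensors loader above applies to lazy tensor slices.
    indices = [slice(None)] * tensor.ndim
    indices[tp_dim] = slice(dim_size * tp_rank // tp_size,
                            dim_size * (tp_rank + 1) // tp_size)
    return tensor[tuple(indices)]

w = torch.arange(24.0).reshape(4, 6)
print(shard(w, tp_dim=1, tp_size=2, tp_rank=1).shape)   # torch.Size([4, 3])
print(shard(w, tp_dim=0, tp_size=4, tp_rank=0).shape)   # torch.Size([1, 6])
```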

View File

@ -23,6 +23,7 @@ from ...functional import (Tensor, arange, cast, concat, expand,
gather_last_token_logits, shape, unsqueeze)
from ...layers import Embedding, LayerNorm, Linear, Mamba, RmsNorm
from ...module import Module, ModuleList
from ...plugin import current_all_reduce_helper
from ..generation_mixin import GenerationMixin
from ..modeling_utils import PretrainedConfig, PretrainedModel
@ -192,6 +193,7 @@ class MambaForCausalLM(PretrainedModel):
ssm_states,
host_request_types,
last_token_ids,
last_token_ids_for_logits,
host_context_lengths,
slot_mapping: Optional[Tensor] = None):
hidden_states, present_convs, present_ssms = self.backbone(
@ -200,7 +202,7 @@ class MambaForCausalLM(PretrainedModel):
if not self.gather_context_logits:
hidden_states = gather_last_token_logits(
hidden_states, last_token_ids,
hidden_states, last_token_ids_for_logits,
default_net().plugin_config.remove_input_padding)
lm_logits = self.lm_head(hidden_states)
@ -218,9 +220,9 @@ class MambaForCausalLM(PretrainedModel):
max_batch_size,
max_input_len,
max_seq_len,
max_num_tokens,
use_cache,
max_beam_width: int = 1,
max_num_tokens: int = None,
opt_num_tokens: int = None,
opt_batch_size: int = 0,
prompt_embedding_table_size: int = 0,
@ -235,56 +237,79 @@ class MambaForCausalLM(PretrainedModel):
@return: a list contains values which can be fed into the self.forward()
'''
assert speculative_decoding_draft_tokens_external == False, "Speculative decoding is not supported in Mamba"
assert max_beam_width == 1, "We don't support beam search for the Mamba model."
remove_input_padding = default_net().plugin_config.remove_input_padding
use_gemm_plugin = default_net().plugin_config.gemm_plugin
paged_state = default_net().plugin_config.paged_state
multiple_profiles = default_net().plugin_config.multiple_profiles
use_mamba_conv1d_plugin = default_net(
).plugin_config.mamba_conv1d_plugin
batch_range = [GenerationMixin.default_range(max_batch_size)]
use_custom_all_reduce = default_net(
).plugin_config.use_custom_all_reduce
self.gather_context_logits = gather_context_logits
mapping = self.config.mapping
# basic inputs
enable_ctx_gen_opt_profiles = GenerationMixin.has_ctx_gen_opt_profiles(
True, use_gemm_plugin, remove_input_padding, paged_state)
num_profiles, ranges = GenerationMixin.get_profiles_ranges(
max_batch_size=max_batch_size,
max_beam_width=max_beam_width,
max_input_len=max_input_len,
max_num_tokens=max_num_tokens,
max_draft_len=max_draft_len,
opt_batch_size=opt_batch_size,
opt_num_tokens=opt_num_tokens,
enable_ctx_gen_opt_profiles=enable_ctx_gen_opt_profiles,
multiple_profiles=multiple_profiles)
if remove_input_padding:
assert use_mamba_conv1d_plugin, "mamba_conv1d_plugin is needed to support remove_input_padding"
max_num_tokens = max(
max_input_len * max_batch_size,
max_beam_width * (max_draft_len + 1) * max_batch_size)
if opt_num_tokens is None:
opt_num_tokens = max_beam_width * (max_draft_len +
1) * max_batch_size
num_tokens_range = [[1, opt_num_tokens, max_num_tokens]]
input_ids = Tensor(name='input_ids',
dtype=trt.int32,
shape=[-1],
dim_range=OrderedDict([
('num_tokens', num_tokens_range),
('num_tokens', ranges['num_tokens_range']),
]))
else:
input_ids = Tensor(name='input_ids',
dtype=trt.int32,
shape=[-1, -1],
dim_range=OrderedDict([
('batch_size', batch_range),
('input_len', [[1, 1, max_input_len]]),
('batch_size_beam_width',
ranges['bb_range']),
('input_len', ranges['inlen_range']),
]))
if use_custom_all_reduce and mapping.tp_size > 1:
current_all_reduce_helper().set_workspace_tensor(
mapping, num_profiles)
# recurrent inputs
conv_states = []
ssm_states = []
if use_mamba_conv1d_plugin:
conv_state_dim_range = OrderedDict([
('batch_size', batch_range),
('kernel_size', [self.d_conv - 1]),
('dim_size', [self.d_inner]),
('batch_size', ranges['bb_range']),
('kernel_size', [self.d_conv - 1] * num_profiles),
('dim_size', [self.d_inner] * num_profiles),
])
else:
conv_state_dim_range = OrderedDict([
('batch_size', batch_range),
('dim_size', [self.d_inner]),
('kernel_size', [self.d_conv - 1]),
('batch_size', ranges['bb_range']),
('dim_size', [self.d_inner] * num_profiles),
('kernel_size', [self.d_conv - 1] * num_profiles),
])
ssm_state_dim_range = OrderedDict([
('batch_size', batch_range),
('state_size', [self.d_state]),
('dim_size', [self.d_inner]),
('batch_size', ranges['bb_range']),
('state_size', [self.d_state] * num_profiles),
('dim_size', [self.d_inner] * num_profiles),
])
one_dim_range = OrderedDict([
('buffer_count', [1]),
('buffer_count', [1] * num_profiles),
])
for i in range(self.config.num_hidden_layers):
@ -324,7 +349,7 @@ class MambaForCausalLM(PretrainedModel):
name='host_request_types',
dtype=trt.int32,
shape=[-1],
dim_range=OrderedDict([('batch_size', batch_range)]),
dim_range=OrderedDict([('batch_size', ranges['bb_range'])]),
)
if use_mamba_conv1d_plugin and remove_input_padding:
@ -332,21 +357,22 @@ class MambaForCausalLM(PretrainedModel):
name='host_context_lengths',
dtype=trt.int32,
shape=[-1],
dim_range=OrderedDict([('batch_size', batch_range)]),
dim_range=OrderedDict([('batch_size', ranges['bb_range'])]),
)
else:
host_context_lengths = None
last_token_ids = None
last_token_ids = Tensor(
name='last_token_ids',
dtype=trt.int32,
shape=[-1],
dim_range=OrderedDict([
('batch_size', ranges['bbd_range']),
]),
)
last_token_ids_for_logits = None
if not gather_context_logits:
last_token_ids = Tensor(
name='last_token_ids',
dtype=trt.int32,
shape=[-1],
dim_range=OrderedDict([
('batch_size', batch_range),
]),
)
last_token_ids_for_logits = last_token_ids
return_dict = {
'input_ids': input_ids,
@ -354,6 +380,7 @@ class MambaForCausalLM(PretrainedModel):
'ssm_states': ssm_states,
'host_request_types': host_request_types,
'last_token_ids': last_token_ids,
'last_token_ids_for_logits': last_token_ids_for_logits,
'host_context_lengths': host_context_lengths,
}
@ -362,7 +389,7 @@ class MambaForCausalLM(PretrainedModel):
name='slot_mapping',
dtype=trt.int32,
shape=[-1],
dim_range=OrderedDict([('batch_size', batch_range)]),
dim_range=OrderedDict([('batch_size', ranges['bb_range'])]),
)
return_dict['slot_mapping'] = slot_mapping
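Because the Mamba inputs are now built against `num_profiles` optimization profiles instead of a single batch range, every dimension range, including constant ones such as the conv kernel size, has to be repeated once per profile. A small self-contained illustration with made-up sizes:
```python
from collections import OrderedDict

# Illustrative numbers only: a Mamba-style conv state with d_conv=4, d_inner=2048.
num_profiles = 3                        # e.g. produced by get_profiles_ranges
d_conv, d_inner = 4, 2048
bb_range = [[1, 4, 8]] * num_profiles   # one [min, opt, max] triple per profile

# TensorRT expects one range entry per optimization profile for *every*
# dimension, including the ones that never change, hence the `* num_profiles`.
conv_state_dim_range = OrderedDict([
    ('batch_size', bb_range),
    ('kernel_size', [d_conv - 1] * num_profiles),
    ('dim_size', [d_inner] * num_profiles),
])
assert all(len(v) == num_profiles for v in conv_state_dim_range.values())
```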

View File

@ -1,43 +1,80 @@
from pathlib import Path
import numpy as np
import torch
from tensorrt_llm import logger
from tensorrt_llm._utils import str_dtype_to_torch, torch_to_numpy
from tensorrt_llm._utils import str_dtype_to_torch
from tensorrt_llm.mapping import Mapping
from tensorrt_llm.models import MedusaLM
from tensorrt_llm.models.convert_utils import split
def get_tllm_linear_weight(weight,
prefix,
bias=None,
use_weight_only=False,
plugin_weight_only_quant_type=torch.int8,
postfix='weight'):
results = {}
if use_weight_only:
v = weight.t().contiguous().cpu()
processed_torch_weights, torch_weight_scales = \
torch.ops.trtllm.symmetric_quantize_last_axis_of_batched_matrix(
v, plugin_weight_only_quant_type)
results[prefix + postfix] = processed_torch_weights
results[prefix + 'per_channel_scale'] = torch_weight_scales
else:
results[prefix + postfix] = weight.contiguous()
if bias is not None:
results[prefix + 'bias'] = bias
return results
def load_medusa_hf(medusa_path: str,
trt_llm_medusa: MedusaLM,
num_medusa_heads: int,
num_medusa_layers: int,
mapping=Mapping(),
dtype='float32'):
dtype='float32',
use_weight_only=False,
plugin_weight_only_quant_type=None):
logger.info("Loading Medusa heads' weights ...")
is_ckpt_safetensors = False
ckpt_file = Path(medusa_path) / "medusa_lm_head.pt"
state_dict = torch.load(ckpt_file, map_location="cpu")
if not ckpt_file.exists():
ckpt_file = Path(medusa_path) / "medusa_lm_head.safetensors"
is_ckpt_safetensors = True
if is_ckpt_safetensors:
logger.info("Safetensors Found ...")
from safetensors.torch import load_file
state_dict = load_file(ckpt_file)
else:
state_dict = torch.load(ckpt_file, map_location="cpu")
torch_dtype = str_dtype_to_torch(dtype)
for h in range(trt_llm_medusa.num_medusa_heads):
for l in range(trt_llm_medusa.num_medusa_layers):
w = state_dict[f"{h}.{l}.linear.weight"].clone()
w = torch_to_numpy(w.to(torch_dtype).detach().cpu())
trt_llm_medusa.medusa_heads[h].medusa_layers[
l].linear.weight.value = np.ascontiguousarray(
split(w, mapping.tp_size, mapping.tp_rank))
if trt_llm_medusa.medusa_heads[h].medusa_layers[
l].linear.bias is not None:
# print(f"Setting bias for {h} {l}")
b = state_dict[f"{h}.{l}.linear.bias"].clone()
b = torch_to_numpy(b.to(torch_dtype).detach().cpu())
trt_llm_medusa.medusa_heads[h].medusa_layers[
l].linear.bias.value = np.ascontiguousarray(
np.split(b, mapping.tp_size,
axis=0)[mapping.tp_rank].copy())
lm = state_dict[f"{h}.{trt_llm_medusa.num_medusa_layers}.weight"].clone(
) # LM Head
lm = torch_to_numpy(lm.to(torch_dtype).detach().cpu())
trt_llm_medusa.medusa_heads[
h].lm_head.weight.value = np.ascontiguousarray(
split(lm, mapping.tp_size, mapping.tp_rank))
return
weights = {}
for h in range(num_medusa_heads):
for l in range(num_medusa_layers):
w = state_dict[f"{h}.{l}.linear.weight"].clone().to(torch_dtype)
split_v = split(w, mapping.tp_size, mapping.tp_rank)
weights.update(
get_tllm_linear_weight(
split_v, f'medusa_heads.{h}.medusa_layers.{l}.linear.',
None, use_weight_only, plugin_weight_only_quant_type))
b = state_dict[f"{h}.{l}.linear.bias"].clone().to(torch_dtype)
weights['medusa_heads.{}.medusa_layers.{}.linear.bias'.format(
h, l)] = split(b, mapping.tp_size, mapping.tp_rank)
lm = state_dict[f"{h}.{num_medusa_layers}.weight"].clone().to(
torch_dtype) # LM Head
weights['medusa_heads.{}.lm_head.weight'.format(h)] = split(
lm, mapping.tp_size, mapping.tp_rank)
return weights
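After the refactor `load_medusa_hf` returns a plain name-to-tensor dictionary instead of writing into a `MedusaLM` instance, leaving serialization to the caller. A hedged sketch of what that per-rank dictionary looks like and how it could be saved; the tensor shapes below are placeholders, only the key naming follows the code above:
```python
import torch
from safetensors.torch import save_file

# Stand-in for the dict returned by load_medusa_hf(...): keys follow the
# TRT-LLM checkpoint convention shown above (head 0, layer 0, TP rank 0).
# Hidden size 4096 and vocab size 32000 are placeholder values.
weights = {
    "medusa_heads.0.medusa_layers.0.linear.weight": torch.zeros(4096, 4096, dtype=torch.float16),
    "medusa_heads.0.medusa_layers.0.linear.bias": torch.zeros(4096, dtype=torch.float16),
    "medusa_heads.0.lm_head.weight": torch.zeros(32000, 4096, dtype=torch.float16),
}
save_file(weights, "rank0.safetensors")
```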

View File

@ -27,6 +27,10 @@ from ..mapping import Mapping
from ..module import Module, ModuleList
from ..parameter import Parameter
from ..quantization import QuantMode
from ..quantization.layers import (WeightOnlyGroupwiseQuantLinear,
WeightOnlyGroupwiseQuantRowLinear,
WeightOnlyQuantLinear,
WeightOnlyQuantRowLinear)
from ..quantization.mode import W8A8_SQ_PLUGIN_LIST, QuantAlgo
from ..top_model_mixin import TopModelMixin
from .convert_utils import weight_only_quantize_dict
@ -841,7 +845,8 @@ def unfuse_qkv_gemm(model: PretrainedModel) -> PretrainedModel:
continue
qkv_params = get_init_params(layer.qkv, ColumnLinear)
qkv_params["bias"] = qkv_params["bias"] is not None
qkv_params["strict_dtype"] = qkv_params["strict_dtype"] is not None
qkv_params["strict_dtype"] = qkv_params.get(
"strict_dtype") is not None
q = ColumnLinear(
**{
**qkv_params,
@ -866,20 +871,34 @@ def unfuse_qkv_gemm(model: PretrainedModel) -> PretrainedModel:
q = quantize(q, model.config.quantization)
k = quantize(k, model.config.quantization)
v = quantize(v, model.config.quantization)
out_features = q.out_features + k.out_features + v.out_features
if isinstance(layer.qkv, (
WeightOnlyQuantLinear,
WeightOnlyQuantRowLinear,
WeightOnlyGroupwiseQuantLinear,
WeightOnlyGroupwiseQuantRowLinear,
)):
out_dim = 1
else:
out_dim = 0
if layer.qkv.weight.is_inited():
qkv_weight = layer.qkv.weight.raw_value
weights = np.split(qkv_weight, [
q.out_features,
q.out_features + k.out_features,
])
qkv_weight.shape[out_dim] * q.out_features // out_features,
qkv_weight.shape[out_dim] *
(q.out_features + k.out_features) // out_features,
],
axis=out_dim)
for gemm, weight in zip([q, k, v], weights):
gemm.weight.value = weight
if layer.qkv.bias is not None and layer.qkv.bias.is_inited():
qkv_bias = layer.qkv.bias.raw_value
biases = np.split(qkv_bias, [
q.out_features,
q.out_features + k.out_features,
])
qkv_bias.shape[out_dim] * q.out_features // out_features,
qkv_bias.shape[out_dim] *
(q.out_features + k.out_features) // out_features,
],
axis=out_dim)
for gemm, bias in zip([q, k, v], biases):
gemm.bias.value = bias
for name, parameter in layer.qkv._parameters.items():
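Weight-only quantized linear layers keep their weights transposed (output features on axis 1), so the fused QKV tensor must be split along the right axis and proportionally to each projection's width rather than at fixed offsets. A standalone NumPy illustration with made-up GQA sizes:
```python
import numpy as np

# Made-up sizes: hidden 1024, query width 1024, two KV projections of width 128.
q_out, k_out, v_out = 1024, 128, 128
out_features = q_out + k_out + v_out
hidden = 1024

for out_dim, fused_shape in ((0, (out_features, hidden)),   # plain ColumnLinear layout
                             (1, (hidden, out_features))):  # weight-only layout (transposed)
    fused = np.zeros(fused_shape, dtype=np.float16)
    # Split points are scaled by the fused tensor's size along out_dim so the
    # same formula works for both layouts (and for packed quantized storage).
    q_w, k_w, v_w = np.split(fused, [
        fused.shape[out_dim] * q_out // out_features,
        fused.shape[out_dim] * (q_out + k_out) // out_features,
    ], axis=out_dim)
    print(out_dim, q_w.shape, k_w.shape, v_w.shape)
```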

View File

@ -1,9 +1,12 @@
import torch
from tensorrt_llm.quantization import QuantAlgo
from ..._utils import str_dtype_to_torch
from .split_weights import shuffle_qkv_weights, split_weights_tp
def convert_hf_weights(hf_model, dtype, **kwargs):
def convert_hf_weights(hf_model, dtype, config, small_variant, args, rank):
torch_dtype = str_dtype_to_torch(dtype)
hf_state_dict = hf_model.state_dict()
weights = {}
@ -15,13 +18,16 @@ def convert_hf_weights(hf_model, dtype, **kwargs):
key = key.replace("model.layers.", "transformer.layers.")
#Attention
key = key.replace("self_attn.", "attention.")
key = key.replace("query_key_value.", "qkv.") # small
key = key.replace("Wqkv.weight", "qkv.weight")
key = key.replace("qkv_proj.", "qkv.") #128k
#MLP
key = key.replace("mlp.fc1.", "mlp.fc.")
key = key.replace("mlp.fc2.", "mlp.proj.")
key = key.replace("mlp.gate_up_proj.", "mlp.fc.")
key = key.replace("mlp.up_proj.", "mlp.gate.") #128k
key = key.replace(
"mlp.up_proj.",
"mlp.fc." if small_variant else "mlp.gate.") #128k
key = key.replace("mlp.down_proj.", "mlp.proj.") #128k
key = key.replace("mlp.gate_proj.", "mlp.fc.") #128k
key = key.replace("o_proj.", "dense.") #128k
@ -53,27 +59,67 @@ def convert_hf_weights(hf_model, dtype, **kwargs):
key = key.replace("q_proj.weight", "qkv.weight")
elif "k_proj" in key or "v_proj" in key:
continue
weights[key] = value.to(torch_dtype).cpu()
if small_variant:
weights['lm_head.weight'] = weights[
'transformer.vocab_embedding.weight'].clone()
# Transform QKV weights from custom Phi3Small format to TRT-LLM format
for key, value in weights.items():
if "qkv." in key:
weights[key] = shuffle_qkv_weights(weights[key], config)
weights = split_weights_tp(config, weights, args, rank, torch_dtype)
return weights
def convert_hf_config(hf_config, dtype, **kwargs):
def convert_small_hf_config(hf_config):
return {
'architecture': "Phi3SmallForCausalLM",
'rotary_base': hf_config.rope_embedding_base,
'gegelu_limit': hf_config.gegelu_limit,
'mup_attn_multiplier': hf_config.mup_attn_multiplier,
'mup_embedding_multiplier': hf_config.mup_embedding_multiplier,
'mup_use_scaling': hf_config.mup_use_scaling,
'mup_width_multiplier': hf_config.mup_width_multiplier,
'blocksparse_block_size': hf_config.blocksparse_block_size,
'blocksparse_homo_head_pattern':
hf_config.blocksparse_homo_head_pattern,
'blocksparse_num_local_blocks': hf_config.blocksparse_num_local_blocks,
'blocksparse_vertical_stride': hf_config.blocksparse_vert_stride,
'dense_attention_every_n_layers':
hf_config.dense_attention_every_n_layers,
}
def convert_hf_config(hf_config, dtype, args):
config = {
'architecture': "Phi3ForCausalLM",
'dtype': dtype,
'num_hidden_layers': hf_config.num_hidden_layers,
'num_attention_heads': hf_config.num_attention_heads,
'num_key_value_heads': hf_config.num_key_value_heads,
'rope_theta': hf_config.rope_theta,
'hidden_size': hf_config.hidden_size,
'intermediate_size': hf_config.intermediate_size,
'vocab_size': hf_config.vocab_size,
'max_position_embeddings': hf_config.max_position_embeddings,
'hidden_act': hf_config.hidden_act,
'share_embedding_table': False,
'norm_epsilon': hf_config.rms_norm_eps,
}
small_variant = hf_config.architectures[0] == "Phi3SmallForCausalLM"
if small_variant:
config.update(convert_small_hf_config(hf_config))
else:
config.update({
'rotary_base': hf_config.rope_theta,
'norm_epsilon': hf_config.rms_norm_eps,
})
# Long-context variants
if hf_config.max_position_embeddings >= 128000:
config.update({
'original_max_position_embeddings':
@ -83,6 +129,31 @@ def convert_hf_config(hf_config, dtype, **kwargs):
'longrope_scaling_long_factors':
hf_config.rope_scaling["long_factor"]
})
if small_variant:
config.update({
'longrope_long_mscale':
hf_config.rope_scaling["long_mscale"],
'longrope_short_mscale':
hf_config.rope_scaling["short_mscale"]
})
if config["hidden_act"] == "silu":
config["hidden_act"] = "swiglu"
# Tensor parallelism and weight-only quantization
if args is not None:
config.update({
'mapping': {
'world_size': args.tp_size * args.pp_size,
'tp_size': args.tp_size,
'pp_size': args.pp_size,
}
})
if args.use_weight_only and args.weight_only_precision == 'int8':
config.update({'quantization': {'quant_algo': QuantAlgo.W8A16}})
elif args.use_weight_only and args.weight_only_precision == 'int4':
config.update({'quantization': {'quant_algo': QuantAlgo.W4A16}})
return config
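Most of the conversion above is a string-rename pass from HuggingFace parameter names to TRT-LLM ones, with one rule that differs between the small and mini/medium variants. A condensed, runnable excerpt covering only a subset of the replacements shown above:
```python
# Condensed sketch of the HF -> TRT-LLM key renaming above, applied to a couple
# of representative Phi-3 parameter names (the full mapping handles more cases).
def remap_key(key: str, small_variant: bool) -> str:
    key = key.replace("model.layers.", "transformer.layers.")
    key = key.replace("self_attn.", "attention.")
    key = key.replace("query_key_value.", "qkv.")        # Phi-3-small
    key = key.replace("qkv_proj.", "qkv.")               # Phi-3-mini/medium
    key = key.replace("mlp.gate_up_proj.", "mlp.fc.")
    key = key.replace("mlp.up_proj.", "mlp.fc." if small_variant else "mlp.gate.")
    key = key.replace("mlp.down_proj.", "mlp.proj.")
    key = key.replace("o_proj.", "dense.")
    return key

print(remap_key("model.layers.0.self_attn.qkv_proj.weight", small_variant=False))
# -> transformer.layers.0.attention.qkv.weight
print(remap_key("model.layers.3.mlp.down_proj.weight", small_variant=False))
# -> transformer.layers.3.mlp.proj.weight
```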

View File

@ -1,15 +1,20 @@
import json
import os
import traceback
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Optional
import numpy as np
import safetensors
from transformers import AutoModelForCausalLM
from ..._utils import pad_vocab_size
from ...functional import PositionEmbeddingType, Tensor
from ...layers import (MLP, Attention, AttentionMaskType, Embedding,
ParallelLMHead, RmsNorm)
from ...layers import (MLP, Attention, AttentionMaskType, BlockSparseAttnParams,
Embedding, LayerNorm, ParallelLMHead, RmsNorm)
from ...module import Module
from ..modeling_utils import (DecoderLayerList, DecoderModelForCausalLM,
PretrainedConfig, save_checkpoint)
PretrainedConfig)
from .convert import convert_hf_config, convert_hf_weights
@ -22,28 +27,65 @@ class Phi3DecoderLayer(Module):
tp_group = config.mapping.tp_group
tp_size = config.mapping.tp_size
self.input_layernorm = RmsNorm(normalized_shape=config.hidden_size,
eps=config.norm_epsilon,
dtype=config.dtype)
self.post_layernorm = RmsNorm(normalized_shape=config.hidden_size,
eps=config.norm_epsilon,
dtype=config.dtype)
attention_mask_type = AttentionMaskType.causal
block_sparse_attn_params = BlockSparseAttnParams()
q_scaling = 1.0
self.gegelu_limit = None
self.small_variant = config.architecture == "Phi3SmallForCausalLM"
if self.small_variant:
self.gegelu_limit = config.gegelu_limit
# MuP uses norm_factor=attention_head_size (rather than sqrt(attention_head_size))
# We achieve this using q_scaling = sqrt(attention_head_size)
hidden_size = config.hidden_size
num_attention_heads = config.num_attention_heads
attention_head_size = hidden_size / num_attention_heads
q_scaling = attention_head_size**.5
block_sparse = (
(layer_idx + 1) % config.dense_attention_every_n_layers) != 0
attention_mask_type = AttentionMaskType.blocksparse if block_sparse else AttentionMaskType.causal
block_sparse_attn_params = BlockSparseAttnParams(
config.blocksparse_block_size,
config.blocksparse_homo_head_pattern,
config.blocksparse_num_local_blocks,
config.blocksparse_vertical_stride)
self.input_layernorm = LayerNorm(
normalized_shape=config.hidden_size, dtype=config.dtype)
self.post_layernorm = LayerNorm(normalized_shape=config.hidden_size,
dtype=config.dtype)
else:
self.input_layernorm = RmsNorm(normalized_shape=config.hidden_size,
eps=config.norm_epsilon,
dtype=config.dtype)
self.post_layernorm = RmsNorm(normalized_shape=config.hidden_size,
eps=config.norm_epsilon,
dtype=config.dtype)
layers_range = config.mapping.pp_layers(config.num_hidden_layers)
local_layer_idx = layer_idx - layers_range[0]
position_embedding_type = PositionEmbeddingType.rope_gpt_neox
rope_scaling_short_factors = 1.0
rope_scaling_long_factors = 1.0
rope_scaling_short_factors, rope_scaling_long_factors = 1.0, 1.0
rope_scaling_short_mscale, rope_scaling_long_mscale = 1.0, 1.0
original_max_position_embeddings = config.max_position_embeddings
if hasattr(config, "longrope_scaling_short_factors"):
rope_scaling_short_factors = np.asarray(
config.longrope_scaling_short_factors).astype(np.float32)
rope_scaling_long_factors = np.asarray(
config.longrope_scaling_long_factors).astype(np.float32)
original_max_position_embeddings = config.original_max_position_embeddings
position_embedding_type = PositionEmbeddingType.long_rope
if self.small_variant:
rope_scaling_short_mscale = config.longrope_short_mscale
rope_scaling_long_mscale = config.longrope_long_mscale
self.attention = Attention(
local_layer_idx=local_layer_idx,
hidden_size=config.hidden_size,
@ -53,15 +95,18 @@ class Phi3DecoderLayer(Module):
rotary_embedding_base=config.rotary_base,
max_position_embeddings=config.max_position_embeddings,
dtype=config.dtype,
attention_mask_type=AttentionMaskType.causal,
bias=False,
attention_mask_type=attention_mask_type,
bias=self.small_variant,
q_scaling=q_scaling,
tp_group=tp_group,
tp_size=tp_size,
quant_mode=config.quant_mode,
rope_scaling_short_factors=rope_scaling_short_factors,
rope_scaling_long_factors=rope_scaling_long_factors,
rope_scaling_short_mscale=rope_scaling_short_mscale,
rope_scaling_long_mscale=rope_scaling_long_mscale,
original_max_position_embeddings=original_max_position_embeddings,
)
block_sparse_params=block_sparse_attn_params)
self.mlp = MLP(hidden_size=config.hidden_size,
ffn_hidden_size=config.intermediate_size,
@ -70,7 +115,7 @@ class Phi3DecoderLayer(Module):
tp_group=tp_group,
tp_size=tp_size,
quant_mode=config.quant_mode,
bias=False)
bias=self.small_variant)
def forward(
self,
@ -88,7 +133,7 @@ class Phi3DecoderLayer(Module):
use_cache=use_cache,
kv_cache_params=kv_cache_params,
attention_params=attention_params,
norm_before_bmm1=True,
norm_before_bmm1=not self.small_variant,
)
if use_cache:
@ -96,7 +141,8 @@ class Phi3DecoderLayer(Module):
post_attention_input = hidden_states + attention_output
post_attention_output = self.post_layernorm(post_attention_input)
feed_forward_hidden_states = self.mlp(post_attention_output, )
feed_forward_hidden_states = self.mlp(post_attention_output,
gegelu_limit=self.gegelu_limit)
hidden_states = post_attention_input + feed_forward_hidden_states
if use_cache:
return (hidden_states, presents)
@ -112,9 +158,15 @@ class Phi3Model(Module):
dtype=config.dtype)
self.layers = DecoderLayerList(Phi3DecoderLayer, config)
self.ln_f = RmsNorm(normalized_shape=config.hidden_size,
eps=config.norm_epsilon,
dtype=config.dtype)
self.small_variant = config.architecture == "Phi3SmallForCausalLM"
if self.small_variant:
self.ln_f = LayerNorm(normalized_shape=config.hidden_size,
dtype=config.dtype)
self.mup_embedding_multiplier = config.mup_embedding_multiplier
else:
self.ln_f = RmsNorm(normalized_shape=config.hidden_size,
eps=config.norm_epsilon,
dtype=config.dtype)
def forward(
self,
@ -132,6 +184,9 @@ class Phi3Model(Module):
] if prompt_embedding_table is not None else []
hidden_states = self.vocab_embedding(input_ids, *args)
if self.small_variant and self.mup_embedding_multiplier > 0.0:
hidden_states = hidden_states * self.mup_embedding_multiplier
hidden_states = self.layers(
hidden_states,
use_cache=use_cache,
@ -152,7 +207,6 @@ class Phi3Model(Module):
class Phi3ForCausalLM(DecoderModelForCausalLM):
def __init__(self, config: PretrainedConfig):
self.check_config(config)
transformer = Phi3Model(config)
vocab_size_padded = pad_vocab_size(config.vocab_size,
config.mapping.tp_size)
@ -167,25 +221,48 @@ class Phi3ForCausalLM(DecoderModelForCausalLM):
super().__init__(config, transformer, lm_head)
def check_config(self, config):
config.set_if_not_exist('rotary_base', 10000.0)
@classmethod
def convert_hf_checkpoint(cls,
hf_model_dir: str,
dtype: Optional[str] = "float16",
output_dir: Optional[str] = None,
**kwargs):
args=None):
'''
Convert Huggingface checkpoint to TRT-LLM checkpoint
'''
hf_model = AutoModelForCausalLM.from_pretrained(hf_model_dir,
torch_dtype="auto",
trust_remote_code=True)
config = convert_hf_config(hf_model.config, dtype=dtype, **kwargs)
weights = convert_hf_weights(hf_model, dtype=dtype, **kwargs)
config = convert_hf_config(hf_model.config, dtype, args)
with open(os.path.join(output_dir, 'config.json'), 'w') as f:
json.dump(config, f, indent=4)
if output_dir:
save_checkpoint(output_dir, config=config, weights=weights)
small_variant = config['architecture'] == "Phi3SmallForCausalLM"
return {"weights": weights, "config": config}
def covert_and_save(rank):
weights = convert_hf_weights(hf_model, dtype, config, small_variant,
args, rank)
safetensors.torch.save_file(
weights, os.path.join(output_dir, f'rank{rank}.safetensors'))
world_size = args.tp_size * args.pp_size
if args.workers == 1:
for rank in range(world_size):
covert_and_save(rank)
else:
with ThreadPoolExecutor(max_workers=args.workers) as p:
futures = [
p.submit(covert_and_save, rank)
for rank in range(world_size)
]
exceptions = []
for future in as_completed(futures):
try:
future.result()
except Exception as e:
traceback.print_exc()
exceptions.append(e)
assert len(
exceptions
) == 0, "Checkpoint conversion failed, please check error log."
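For the small variant the layer sets `q_scaling = sqrt(attention_head_size)` so that the effective attention norm factor becomes `attention_head_size` rather than the usual `sqrt(attention_head_size)`, which is what MuP expects. A two-line check of that arithmetic with an illustrative head size:
```python
import math

# Illustrative head size; attention divides QK^T by its norm factor.
attention_head_size = 96.0
q_scaling = attention_head_size ** 0.5

# The usual scaled-dot-product factor is sqrt(head_size); multiplying it by
# q_scaling = sqrt(head_size) yields head_size, the MuP norm factor.
effective_norm_factor = math.sqrt(attention_head_size) * q_scaling
assert math.isclose(effective_norm_factor, attention_head_size)
print(effective_norm_factor)  # 96.0
```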

View File

@ -1,14 +0,0 @@
# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

View File

@ -1,257 +0,0 @@
# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import os
import traceback
from concurrent.futures import ThreadPoolExecutor, as_completed
import numpy as np
import safetensors
from transformers import AutoModelForCausalLM
from ...._utils import pad_vocab_size
from ....functional import PositionEmbeddingType, Tensor
from ....layers import (MLP, Attention, AttentionMaskType,
BlockSparseAttnParams, Embedding, LayerNorm,
ParallelLMHead)
from ....module import Module
from ...modeling_utils import (DecoderLayerList, DecoderModelForCausalLM,
PretrainedConfig)
from .convert import convert_hf_config, convert_hf_weights
class Phi3SmallDecoderLayer(Module):
def __init__(self, config: PretrainedConfig, layer_idx: int):
super().__init__()
self.config = config
self.layer_idx = layer_idx
tp_group = config.mapping.tp_group
tp_size = config.mapping.tp_size
self.gegelu_limit = config.gegelu_limit
self.input_layernorm = LayerNorm(normalized_shape=config.hidden_size,
dtype=config.dtype)
# MuP uses norm_factor=attention_head_size (rather than sqrt(attention_head_size))
# We achieve this using q_scaling = sqrt(attention_head_size)
hidden_size = config.hidden_size
num_attention_heads = config.num_attention_heads
attention_head_size = hidden_size / num_attention_heads
q_scaling = attention_head_size**.5
block_sparse = (
(layer_idx + 1) % config.dense_attention_every_n_layers) != 0
attention_mask_type = AttentionMaskType.blocksparse if block_sparse else AttentionMaskType.causal
block_sparse_attn_params = BlockSparseAttnParams(
config.blocksparse_block_size, config.blocksparse_homo_head_pattern,
config.blocksparse_num_local_blocks,
config.blocksparse_vertical_stride)
layers_range = config.mapping.pp_layers(config.num_hidden_layers)
local_layer_idx = layer_idx - layers_range[0]
position_embedding_type = PositionEmbeddingType.rope_gpt_neox
original_max_position_embeddings = config.max_position_embeddings
rope_scaling_short_factors, rope_scaling_long_factors = 1.0, 1.0
rope_scaling_short_mscale, rope_scaling_long_mscale = 1.0, 1.0
if hasattr(config, "longrope_scaling_short_factors"):
rope_scaling_short_factors = np.asarray(
config.longrope_scaling_short_factors).astype(np.float32)
rope_scaling_long_factors = np.asarray(
config.longrope_scaling_long_factors).astype(np.float32)
rope_scaling_short_mscale = config.longrope_short_mscale
rope_scaling_long_mscale = config.longrope_long_mscale
position_embedding_type = PositionEmbeddingType.long_rope
original_max_position_embeddings = config.original_max_position_embeddings
self.attention = Attention(
local_layer_idx=local_layer_idx,
hidden_size=config.hidden_size,
num_attention_heads=config.num_attention_heads,
num_kv_heads=config.num_kv_heads,
position_embedding_type=position_embedding_type,
rotary_embedding_base=config.rotary_embedding_base,
max_position_embeddings=config.max_position_embeddings,
original_max_position_embeddings=original_max_position_embeddings,
dtype=config.dtype,
attention_mask_type=attention_mask_type,
bias=True,
q_scaling=q_scaling,
tp_group=tp_group,
tp_size=tp_size,
quant_mode=config.quant_mode,
rope_scaling_short_factors=rope_scaling_short_factors,
rope_scaling_long_factors=rope_scaling_long_factors,
rope_scaling_short_mscale=rope_scaling_short_mscale,
rope_scaling_long_mscale=rope_scaling_long_mscale,
block_sparse_params=block_sparse_attn_params)
self.post_layernorm = LayerNorm(normalized_shape=config.hidden_size,
dtype=config.dtype)
self.mlp = MLP(hidden_size=config.hidden_size,
ffn_hidden_size=config.intermediate_size,
hidden_act=config.hidden_act,
dtype=config.dtype,
tp_group=tp_group,
tp_size=tp_size,
quant_mode=config.quant_mode)
def forward(
self,
hidden_states: Tensor,
attention_mask=None,
use_cache=False,
kv_cache_params=None,
attention_params=None,
):
residual = hidden_states
input_layernorm_output = self.input_layernorm(hidden_states)
# Self attention
attention_output = self.attention(
input_layernorm_output,
attention_mask=attention_mask,
use_cache=use_cache,
kv_cache_params=kv_cache_params,
attention_params=attention_params,
)
if use_cache:
attention_output, presents = attention_output
hidden_states = residual + attention_output
# Fully connected
residual = hidden_states
hidden_states = self.post_layernorm(hidden_states)
hidden_states = self.mlp(hidden_states, gegelu_limit=self.gegelu_limit)
hidden_states = residual + hidden_states
if use_cache:
return (hidden_states, presents)
return hidden_states
class Phi3SmallModel(Module):
def __init__(self, config: PretrainedConfig):
super().__init__()
self.vocab_embedding = Embedding(num_embeddings=config.vocab_size,
embedding_dim=config.hidden_size,
dtype=config.dtype)
self.layers = DecoderLayerList(Phi3SmallDecoderLayer, config)
self.ln_f = LayerNorm(normalized_shape=config.hidden_size,
dtype=config.dtype)
self.mup_embedding_multiplier = config.mup_embedding_multiplier
def forward(
self,
input_ids: Tensor,
position_ids=None,
use_cache=False,
attention_mask=None,
kv_cache_params=None,
attention_params=None,
prompt_embedding_table=None,
prompt_tasks=None,
prompt_vocab_size=None,
):
args = [prompt_embedding_table, prompt_tasks, prompt_vocab_size
] if prompt_embedding_table is not None else []
hidden_states = self.vocab_embedding(input_ids, *args)
if self.mup_embedding_multiplier is not None and self.mup_embedding_multiplier > 0.0:
hidden_states = hidden_states * self.mup_embedding_multiplier
hidden_states = self.layers(
hidden_states,
use_cache=use_cache,
attention_mask=attention_mask,
kv_cache_params=kv_cache_params,
attention_params=attention_params,
)
if use_cache:
hidden_states, presents = hidden_states
hidden_states = self.ln_f(hidden_states)
if use_cache:
return (hidden_states, tuple(presents))
return hidden_states
class Phi3SmallForCausalLM(DecoderModelForCausalLM):
def __init__(self, config: PretrainedConfig):
transformer = Phi3SmallModel(config)
vocab_size_padded = pad_vocab_size(config.vocab_size,
config.mapping.tp_size)
lm_head = ParallelLMHead(config.hidden_size,
vocab_size_padded,
bias=False,
dtype=config.dtype,
tp_group=config.mapping.tp_group,
tp_size=config.mapping.tp_size,
gather_output=True)
super().__init__(config, transformer, lm_head)
@classmethod
def convert_hf_checkpoint(cls, model_dir, dtype, output_dir, args=None):
'''
Convert Huggingface checkpoint to TRT-LLM checkpoint
'''
hf_model = AutoModelForCausalLM.from_pretrained(model_dir,
torch_dtype="auto",
trust_remote_code=True)
config = convert_hf_config(hf_model.config, dtype, args)
with open(os.path.join(output_dir, 'config.json'), 'w') as f:
json.dump(config, f, indent=4)
def covert_and_save(rank):
weights = convert_hf_weights(hf_model, config, args, rank)
safetensors.torch.save_file(
weights, os.path.join(output_dir, f'rank{rank}.safetensors'))
world_size = args.tp_size * args.pp_size
if args.workers == 1:
for rank in range(world_size):
covert_and_save(rank)
else:
with ThreadPoolExecutor(max_workers=args.workers) as p:
futures = [
p.submit(covert_and_save, rank)
for rank in range(world_size)
]
exceptions = []
for future in as_completed(futures):
try:
future.result()
except Exception as e:
traceback.print_exc()
exceptions.append(e)
assert len(
exceptions
) == 0, "Checkpoint conversion failed, please check error log."

View File

@ -15,10 +15,6 @@
import torch
from tensorrt_llm.quantization import QuantAlgo
from ...._utils import str_dtype_to_torch
def shuffle_qkv_weights(weights, config):
# Input weights are organized as
@ -29,8 +25,7 @@ def shuffle_qkv_weights(weights, config):
# (q00, q01, ..., qnm), (k0, k1, .., kn), (v0, v1, .., vn)
num_heads = config['num_attention_heads']
num_kv_heads = config['num_kv_heads'] if 'num_kv_heads' in config.keys(
) else config['num_key_value_heads']
num_kv_heads = config['num_key_value_heads']
num_q_per_kv = num_heads // num_kv_heads
hidden_size = config['hidden_size']
@ -152,7 +147,7 @@ def get_tllm_linear_weight(weight,
def split_weights_tp(config, weights, args, rank, dtype):
num_heads = config['num_attention_heads']
num_kv_heads = config['num_kv_heads']
num_kv_heads = config['num_key_value_heads']
hidden_size = config['hidden_size']
mha_mode = num_heads == num_kv_heads
@ -228,96 +223,3 @@ def split_weights_tp(config, weights, args, rank, dtype):
dim=0)
return weights
def convert_hf_weights(hf_model, config, args, rank):
torch_dtype = str_dtype_to_torch(args.dtype)
hf_state_dict = hf_model.state_dict()
weights = {}
# replace key name
for key, value in hf_state_dict.items():
# Decoder Layers
if "model.layers." in key:
key = key.replace("model.layers.", "transformer.layers.")
key = key.replace("self_attn.", "attention.")
key = key.replace("query_key_value.", "qkv.")
key = key.replace("mlp.up_proj.", "mlp.fc.")
key = key.replace("mlp.down_proj.", "mlp.proj.")
key = key.replace("post_attention_layernorm.", "post_layernorm.")
# Embedding
key = key.replace("model.embed_tokens.weight",
"transformer.vocab_embedding.weight")
# Final Layer norm
key = key.replace("model.final_layernorm.", "transformer.ln_f.")
weights[key] = value.to(torch_dtype).cpu()
weights['lm_head.weight'] = weights[
'transformer.vocab_embedding.weight'].clone()
# Transform QKV weights from custom Phi3Small format to TRT-LLM format
for key, value in weights.items():
if "qkv." in key:
weights[key] = shuffle_qkv_weights(weights[key], config)
weights = split_weights_tp(config, weights, args, rank, torch_dtype)
return weights
def convert_hf_config(hf_config, dtype, args):
config = {
'architecture': 'Phi3SmallForCausalLM',
'dtype': dtype,
'num_hidden_layers': hf_config.num_hidden_layers,
'num_attention_heads': hf_config.num_attention_heads,
'num_kv_heads': hf_config.num_key_value_heads,
'rotary_embedding_base': hf_config.rope_embedding_base,
'hidden_size': hf_config.hidden_size,
'intermediate_size': hf_config.intermediate_size,
'vocab_size': hf_config.vocab_size,
'max_position_embeddings': hf_config.max_position_embeddings,
'hidden_act': hf_config.hidden_act,
'share_embedding_table': False,
'gegelu_limit': hf_config.gegelu_limit,
'mup_attn_multiplier': hf_config.mup_attn_multiplier,
'mup_embedding_multiplier': hf_config.mup_embedding_multiplier,
'mup_use_scaling': hf_config.mup_use_scaling,
'mup_width_multiplier': hf_config.mup_width_multiplier,
'blocksparse_block_size': hf_config.blocksparse_block_size,
'blocksparse_homo_head_pattern':
hf_config.blocksparse_homo_head_pattern,
'blocksparse_num_local_blocks': hf_config.blocksparse_num_local_blocks,
'blocksparse_vertical_stride': hf_config.blocksparse_vert_stride,
'dense_attention_every_n_layers':
hf_config.dense_attention_every_n_layers,
}
if args is not None:
config.update({
'mapping': {
'world_size': args.tp_size * args.pp_size,
'tp_size': args.tp_size,
'pp_size': args.pp_size,
}
})
if args.use_weight_only and args.weight_only_precision == 'int8':
config.update({'quantization': {'quant_algo': QuantAlgo.W8A16}})
elif args.use_weight_only and args.weight_only_precision == 'int4':
config.update({'quantization': {'quant_algo': QuantAlgo.W4A16}})
if hf_config.max_position_embeddings >= 128000:
config.update({
'original_max_position_embeddings':
hf_config.original_max_position_embeddings,
'longrope_scaling_short_factors':
hf_config.rope_scaling["short_factor"],
'longrope_scaling_long_factors':
hf_config.rope_scaling["long_factor"],
'longrope_long_mscale':
hf_config.rope_scaling["long_mscale"],
'longrope_short_mscale':
hf_config.rope_scaling["short_mscale"]
})
return config

View File

@ -398,9 +398,9 @@ class RecurrentGemmaForCausalLM(PretrainedModel):
max_batch_size,
max_input_len,
max_seq_len,
max_num_tokens,
use_cache,
max_beam_width: int = 1,
max_num_tokens: int = None,
opt_num_tokens: int = None,
opt_batch_size: int = 0,
prompt_embedding_table_size: int = 0,
@ -434,44 +434,20 @@ class RecurrentGemmaForCausalLM(PretrainedModel):
self.gather_context_logits = gather_context_logits
mapping = self.config.mapping
default_range = GenerationMixin.default_range
batch_range = default_range(max_batch_size)
bbd_range = [
batch_range[i] * ((max_draft_len + 1) if i != 0 else 1)
for i in range(len(batch_range))
]
inlen_range_cxt = default_range(max_input_len)
inlen_range_gen = [1, 1, max_draft_len + 1]
# basic inputs
enable_ctx_gen_opt_profiles = GenerationMixin.has_ctx_gen_opt_profiles(
use_gpt_attention_plugin, use_gemm_plugin, remove_input_padding,
paged_kv_cache)
if max_num_tokens is None:
max_num_tokens = max(
max_input_len * max_batch_size,
max_beam_width * (max_draft_len + 1) * max_batch_size)
if enable_ctx_gen_opt_profiles:
num_profiles = 2
inlen_range = [inlen_range_cxt, inlen_range_gen]
num_tokens_range_ctx = default_range(max_num_tokens)
num_tokens_range_gen = default_range(
max_batch_size * (max_draft_len + 1) * max_beam_width)
num_tokens_range = [num_tokens_range_ctx, num_tokens_range_gen]
position_ids_inlen_range = [inlen_range_cxt, [1, 1, 1]]
else:
max_bs_x_max_bw = max_batch_size * max_beam_width
if opt_num_tokens is None:
opt_num_tokens = max_bs_x_max_bw
if multiple_profiles:
num_tokens_range = GenerationMixin.split_num_tokens_range(
max_num_tokens)
else:
num_tokens_range = [[1, opt_num_tokens, max_num_tokens]]
num_profiles = len(num_tokens_range)
inlen_range = [[1, 1, max_input_len]] * num_profiles
position_ids_inlen_range = [[1, 1, max_input_len]] * num_profiles
bb_range = [batch_range] * num_profiles
num_profiles, ranges = GenerationMixin.get_profiles_ranges(
max_batch_size=max_batch_size,
max_beam_width=max_beam_width,
max_input_len=max_input_len,
max_num_tokens=max_num_tokens,
max_draft_len=max_draft_len,
opt_batch_size=opt_batch_size,
opt_num_tokens=opt_num_tokens,
enable_ctx_gen_opt_profiles=enable_ctx_gen_opt_profiles,
multiple_profiles=multiple_profiles)
if remove_input_padding:
assert use_mamba_conv1d_plugin, "mamba_conv1d_plugin is needed to support remove_input_padding"
@ -479,14 +455,14 @@ class RecurrentGemmaForCausalLM(PretrainedModel):
dtype=trt.int32,
shape=[-1],
dim_range=OrderedDict([
('num_tokens', num_tokens_range),
('num_tokens', ranges['num_tokens_range']),
]))
position_ids = Tensor(name='position_ids',
dtype=trt.int32,
shape=[-1],
dim_range=OrderedDict([
('position_ids_num_tokens_range',
num_tokens_range),
ranges['num_tokens_range']),
]))
else:
input_ids = Tensor(name='input_ids',
@ -494,16 +470,17 @@ class RecurrentGemmaForCausalLM(PretrainedModel):
shape=[-1, -1],
dim_range=OrderedDict([
('batch_size_beam_width',
[batch_range] * num_profiles),
('input_len', inlen_range),
ranges['bb_range']),
('input_len', ranges['inlen_range']),
]))
position_ids = Tensor(name='position_ids',
dtype=trt.int32,
shape=[-1, -1],
dim_range=OrderedDict([
('batch_size_beam_width', bb_range),
('batch_size_beam_width',
ranges['bb_range']),
('position_ids_inlen_range',
position_ids_inlen_range),
ranges['position_ids_inlen_range']),
]))
if use_custom_all_reduce and mapping.tp_size > 1:
current_all_reduce_helper().set_workspace_tensor(
@ -559,7 +536,8 @@ class RecurrentGemmaForCausalLM(PretrainedModel):
name='host_request_types',
dtype=trt.int32,
shape=[-1],
dim_range=OrderedDict([('batch_size_beam_width', bb_range)]),
dim_range=OrderedDict([('batch_size_beam_width',
ranges['bb_range'])]),
)
last_token_ids = Tensor(
@ -567,7 +545,7 @@ class RecurrentGemmaForCausalLM(PretrainedModel):
dtype=trt.int32,
shape=[-1],
dim_range=OrderedDict([
('batch_size_last_token_ids', [bbd_range] * num_profiles),
('batch_size_last_token_ids', ranges['bbd_range']),
]),
)
last_token_ids_for_logits = None
@ -581,7 +559,8 @@ class RecurrentGemmaForCausalLM(PretrainedModel):
name='host_context_lengths',
dtype=trt.int32,
shape=[-1],
dim_range=OrderedDict([('batch_size_beam_width', bb_range)]),
dim_range=OrderedDict([('batch_size_beam_width',
ranges['bb_range'])]),
)
else:
host_context_lengths = None
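Note: for reference, a standalone sketch of the range bookkeeping that GenerationMixin.get_profiles_ranges is assumed to consolidate, reconstructed from the inline code removed above. default_range is approximated here as [1, (max + 1) // 2, max] and the multiple-profiles branch is omitted, so the real helper may differ in detail:
def default_range(max_val):
    # Assumed stand-in for GenerationMixin.default_range; the real helper may
    # pick a different "opt" point.
    return [1, (max_val + 1) // 2, max_val]

def build_profile_ranges(max_batch_size, max_beam_width, max_input_len,
                         max_num_tokens=None, max_draft_len=0,
                         opt_num_tokens=None,
                         enable_ctx_gen_opt_profiles=False):
    batch_range = default_range(max_batch_size)
    inlen_range_cxt = default_range(max_input_len)
    inlen_range_gen = [1, 1, max_draft_len + 1]
    if max_num_tokens is None:
        max_num_tokens = max(max_input_len * max_batch_size,
                             max_beam_width * (max_draft_len + 1) * max_batch_size)
    if enable_ctx_gen_opt_profiles:
        # One profile for the context phase, one for the generation phase.
        num_profiles = 2
        inlen_range = [inlen_range_cxt, inlen_range_gen]
        num_tokens_range = [
            default_range(max_num_tokens),
            default_range(max_batch_size * (max_draft_len + 1) * max_beam_width),
        ]
        position_ids_inlen_range = [inlen_range_cxt, [1, 1, 1]]
    else:
        # Single packed profile (the multiple-profiles branch is omitted here).
        if opt_num_tokens is None:
            opt_num_tokens = max_batch_size * max_beam_width
        num_profiles = 1
        num_tokens_range = [[1, opt_num_tokens, max_num_tokens]]
        inlen_range = [[1, 1, max_input_len]] * num_profiles
        position_ids_inlen_range = [[1, 1, max_input_len]] * num_profiles
    bb_range = [batch_range] * num_profiles
    bbd_range = [[v * ((max_draft_len + 1) if i != 0 else 1)
                  for i, v in enumerate(batch_range)]] * num_profiles
    return num_profiles, {
        'num_tokens_range': num_tokens_range,
        'inlen_range': inlen_range,
        'position_ids_inlen_range': position_ids_inlen_range,
        'bb_range': bb_range,
        'bbd_range': bbd_range,
    }

print(build_profile_ranges(max_batch_size=8, max_beam_width=1, max_input_len=512))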

View File

@ -883,7 +883,6 @@ class FP8Linear(Linear):
assert lora_runtime_params is None or default_net(
).plugin_config.lora_plugin == self.dtype
lora_hidden_state = x if lora_runtime_params is not None else None
if default_net().strongly_typed:
assert is_same_dtype(
x.dtype,
@ -894,8 +893,13 @@ class FP8Linear(Linear):
activation_scaling_factor = cast(activation_scaling_factor, self.dtype)
if x.dtype != trt.fp8:
quantized_out = quantize(x, activation_scaling_factor, 'fp8')
lora_hidden_state = x if lora_runtime_params is not None else None
else:
quantized_out = x
# TODO: add fp8 LoRA support
lora_hidden_state = dequantize(
x, activation_scaling_factor, -1,
self.dtype) if lora_runtime_params is not None else None
weights_scaling_factor = constant(
self.weights_scaling_factor.raw_value.copy())
@ -956,14 +960,18 @@ class FP8RowLinear(RowLinear):
assert lora_runtime_params is None or default_net(
).plugin_config.lora_plugin == self.dtype
lora_hidden_state = x if lora_runtime_params is not None else None
activation_scaling_factor = constant(
self.activation_scaling_factor.raw_value.copy())
activation_scaling_factor = cast(activation_scaling_factor, self.dtype)
if x.dtype != trt.fp8:
quantized_out = quantize(x, activation_scaling_factor, 'fp8')
lora_hidden_state = x if lora_runtime_params is not None else None
else:
quantized_out = x
# TODO: add fp8 LoRA support
lora_hidden_state = dequantize(
x, activation_scaling_factor, -1,
self.dtype) if lora_runtime_params is not None else None
weights_scaling_factor = constant(
self.weights_scaling_factor.raw_value.copy())
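Note: the FP8Linear/FP8RowLinear change above makes the LoRA path see a non-FP8 hidden state even when the layer input is already quantized. In the layer itself, quantize/dequantize are TRT-LLM graph operations; the torch snippet below only mirrors the decision logic (assumes torch >= 2.1 for the float8_e4m3fn dtype, and that dequantization is value = fp8 * scaling factor):
import torch

def lora_input_for_fp8_linear(x, scale, lora_enabled):
    # Torch stand-in for the branch above: LoRA needs a high-precision hidden
    # state, so an already-FP8 input is dequantized before the LoRA plugin.
    if not lora_enabled:
        return None
    if x.dtype != torch.float8_e4m3fn:
        return x                           # input is still high precision: use as-is
    return x.to(torch.float16) * scale     # dequantize: value = fp8 * scaling factor

x_fp16 = torch.randn(2, 4, dtype=torch.float16)
scale = torch.tensor(0.5, dtype=torch.float16)
x_fp8 = (x_fp16 / scale).to(torch.float8_e4m3fn)
print(lora_input_for_fp8_linear(x_fp16, scale, True).dtype)  # torch.float16
print(lora_input_for_fp8_linear(x_fp8, scale, True).dtype)   # torch.float16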

View File

@ -57,7 +57,10 @@ def quantize_layers(
if preprocess_init_params is not None:
preprocess_init_params(init_params, name, module)
quant_layer = quant_cls(**init_params)
setattr(parent, module_name, quant_layer)
if parent is not None:
setattr(parent, module_name, quant_layer)
else:
model = quant_layer
setattr(model, 'quant_mode', quant_config.quant_mode)
return model
@ -78,7 +81,7 @@ def weight_only_quantize(model, quant_config: QuantConfig):
module_name = name.rsplit('.', 1)[-1]
init_params["transb"] = module_name == "lm_head"
quantize_layers(
model = quantize_layers(
model,
quant_config,
quant_map,
@ -102,7 +105,7 @@ def weight_only_groupwise_quantize(model, quant_config: QuantConfig):
init_params[
"use_w4a8_awq"] = quant_config.quant_algo == QuantAlgo.W4A8_AWQ
quantize_layers(
model = quantize_layers(
model,
quant_config,
quant_map,
@ -120,7 +123,7 @@ def smooth_quantize_ootb(
RowLinear: Int8SmoothQuantRowLinear,
}
quantize_layers(
model = quantize_layers(
model,
quant_config,
quant_map,
@ -138,7 +141,7 @@ def smooth_quantize_plugin(model, quant_mode):
}
for name, layer, parent in model.named_modules_with_parent():
layer_name = name.rsplit('.', 1)[-1]
if layer_name in ['ln_f']:
if layer_name in ['ln_f', 'ln_embed']:
continue
quant_cls = None
@ -156,7 +159,10 @@ def smooth_quantize_plugin(model, quant_mode):
init_params[
"num_attention_heads"] = layer.num_attention_heads * layer.tp_size
quant_layer = quant_cls(**init_params)
setattr(parent, layer_name, quant_layer)
if parent is not None:
setattr(parent, layer_name, quant_layer)
else:
model = quant_layer
setattr(model, 'quant_mode', quant_mode)
return model
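Note: returning the model from quantize_layers/smooth_quantize_plugin matters when the module being swapped has no parent, i.e. the root module itself is replaced; setattr on a parent cannot express that, so the callers below now rebind with model = quantize_layers(...). A toy illustration of the same pattern (helper and class names here are stand-ins, not the repo's API):
import torch.nn as nn

class QuantLinearStub(nn.Linear):
    """Toy stand-in for a quantized Linear replacement."""

def named_modules_with_parent(model):
    # Minimal analogue of the named_modules_with_parent() iterator used above:
    # yields (qualified name, module, parent), with parent=None for the root.
    yield "", model, None
    for parent_name, parent in model.named_modules():
        for child_name, child in parent.named_children():
            full_name = f"{parent_name}.{child_name}" if parent_name else child_name
            yield full_name, child, parent

def quantize_linears(model):
    # Snapshot the triples first so replacements don't disturb the traversal.
    for name, module, parent in list(named_modules_with_parent(model)):
        if type(module) is nn.Linear:
            quant = QuantLinearStub(module.in_features, module.out_features)
            if parent is not None:
                setattr(parent, name.rsplit(".", 1)[-1], quant)
            else:
                model = quant  # the root module itself was replaced
    return model  # hence callers must rebind: model = quantize_linears(model)

print(type(quantize_linears(nn.Linear(4, 4))).__name__)                    # QuantLinearStub
print(type(quantize_linears(nn.Sequential(nn.Linear(4, 4)))[0]).__name__)  # QuantLinearStub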
@ -178,7 +184,7 @@ def fp8_quantize(model, quant_config: QuantConfig):
RowLinear: FP8RowLinear,
}
quantize_layers(
model = quantize_layers(
model,
quant_config,
quant_map,

View File

@ -31,10 +31,12 @@ import numpy as np
import safetensors
import torch
from datasets import load_dataset
from safetensors.torch import load_file, save_file
from torch.utils.data import DataLoader
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
from ..logger import logger
from ..mapping import Mapping
from .mode import QuantAlgo
EMPTY_CFG = {
@ -122,7 +124,8 @@ MODEL_NAME_PATTERN_MAP = {
"Gemma": "gemma",
"MixtralForCausalLM": "llama",
"ArcticForCausalLM": "llama",
"Phi3SmallForCausalLM": "phi",
"Phi3SmallForCausalLM": "phi3small",
"Phi3ForCausalLM": "phi3",
}
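Note: splitting "phi" into distinct "phi3small" and "phi3" entries lets the quantizer pick the right ModelOpt model type per HF architecture. A simplified sketch of how such a pattern map is typically consulted (the lookup helper below is illustrative; the actual function in quantize_by_modelopt.py may differ):
MODEL_NAME_PATTERN_MAP = {
    "MixtralForCausalLM": "llama",
    "ArcticForCausalLM": "llama",
    "Phi3SmallForCausalLM": "phi3small",
    "Phi3ForCausalLM": "phi3",
}

def get_model_type(model):
    # Match the model's class name against the pattern keys.
    for pattern, model_type in MODEL_NAME_PATTERN_MAP.items():
        if pattern in type(model).__name__:
            return model_type
    return None

class Phi3SmallForCausalLM:  # stand-in for the HF model class
    pass

print(get_model_type(Phi3SmallForCausalLM()))  # phi3small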
@ -263,10 +266,95 @@ def quantize_model(model, quant_cfg, calib_dataloader=None):
return model
def quantize_and_export(*, model_dir, device, calib_dataset, dtype, qformat,
kv_cache_dtype, calib_size, batch_size,
calib_max_seq_length, awq_block_size, output_dir,
tp_size, pp_size, seed, tokenizer_max_seq_length):
def combine_medusa_weight(tp_size, pp_size, base_model_output_dir,
num_medusa_heads, num_medusa_layers, max_draft_len,
medusa_hidden_act, medusa_model_dir,
quant_medusa_head):
with open(f"{medusa_model_dir}/config.json", "r") as fp:
medusa_config = json.load(fp)
num_medusa_heads_from_config = medusa_config.get('medusa_num_heads',
num_medusa_heads)
num_medusa_layers = medusa_config.get('medusa_num_layers',
num_medusa_layers)
if num_medusa_heads is None:
num_medusa_heads = num_medusa_heads_from_config
assert max_draft_len > 0, "should have max_draft_len > 0"
world_size = tp_size * pp_size
# Process each rank in turn
for rank in range(world_size):
mapping = Mapping(world_size=world_size,
rank=rank,
tp_size=tp_size,
pp_size=pp_size)
# 1. Load medusa weight for each rank
from tensorrt_llm.models.medusa.weight import load_medusa_hf
medusa_weights = load_medusa_hf(medusa_path=medusa_model_dir,
num_medusa_heads=num_medusa_heads,
num_medusa_layers=num_medusa_layers,
mapping=mapping,
dtype="float16")
# 2. Load base model safetensors (after quant)
base_model_weights = load_file(
f"{base_model_output_dir}/rank{rank}.safetensors")
# 3. Combine and save weight
base_model_weights.update(medusa_weights)
save_file(base_model_weights,
f"{base_model_output_dir}/rank{rank}.safetensors")
# 4. Add medusa config into config.json
with open(f"{base_model_output_dir}/config.json", 'r') as f:
base_model_config = json.load(f)
f.close()
with open(f"{base_model_output_dir}/config.json", 'w') as f:
base_model_config['architecture'] = "MedusaForCausalLM"
base_model_config['quantization']['exclude_modules'] = [
'lm_head',
'*router',
'*vocab_embedding',
'*position_embedding',
'*block_embedding',
]
if not quant_medusa_head:
base_model_config['quantization']['exclude_modules'].append(
'*medusa_heads*')
base_model_config['max_draft_len'] = max_draft_len
base_model_config['num_medusa_heads'] = num_medusa_heads
base_model_config['num_medusa_layers'] = num_medusa_layers
json.dump(base_model_config, f, indent=4)
torch.cuda.empty_cache()
print("Combine medusa heads' weight, done.")
def quantize_and_export(*,
model_dir,
device,
calib_dataset,
dtype,
qformat,
kv_cache_dtype,
calib_size,
batch_size,
calib_max_seq_length,
awq_block_size,
output_dir,
tp_size,
pp_size,
seed,
tokenizer_max_seq_length,
num_medusa_heads=None,
num_medusa_layers=None,
max_draft_len=None,
medusa_hidden_act=None,
medusa_model_dir=None,
quant_medusa_head=None):
'''
Load the model from model_dir, call Modelopt to quantize it, and then export
the quantized model as a TRT-LLM checkpoint.
@ -419,24 +507,16 @@ def quantize_and_export(*, model_dir, device, calib_dataset, dtype, qformat,
with open(f"{export_path}/config.json", "w") as f:
json.dump(tensorrt_llm_config, f, indent=4)
if model_type == 'phi':
with open(f"{export_path}/config.json", "r") as f:
tensorrt_llm_config = json.load(f)
phi_config = AutoConfig.from_pretrained(model_dir,
trust_remote_code=True)
from ..models.phi3.phi3small.convert import \
convert_hf_config as phi_config_converter
phi_config = phi_config_converter(phi_config, dtype, None)
for key, value in phi_config.items():
tensorrt_llm_config[key] = value
with open(f"{export_path}/config.json", "w") as f:
json.dump(tensorrt_llm_config, f, indent=4)
torch.cuda.empty_cache()  # otherwise torch keeps holding GPU memory, leaving less free memory for later steps such as engine build
# Workaround for combining the medusa head weights
# TODO: move this integration into modelopt to avoid redundant reading and writing
if medusa_model_dir is not None:
combine_medusa_weight(tp_size, pp_size, export_path,
num_medusa_heads, num_medusa_layers,
max_draft_len, medusa_hidden_act,
medusa_model_dir, quant_medusa_head)
end_time = time.time()
print(
"Quantized model exported to {} \nTotal time used {:.2f} s.".format(

View File

@ -12,4 +12,4 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
__version__ = "0.11.0.dev2024061800"
__version__ = "0.11.0.dev2024062500"

View File

@ -84,10 +84,12 @@ class TestMamba(unittest.TestCase):
hf_config, hf_path, hf_mamba, load_mode, dtype)
with net_guard(network):
network.set_named_parameters(tensorrt_llm_mamba.named_parameters())
inputs = tensorrt_llm_mamba.prepare_inputs(batch_size,
input_len,
input_len + output_len,
use_cache=False)
inputs = tensorrt_llm_mamba.prepare_inputs(
batch_size,
input_len,
input_len + output_len,
max_num_tokens=batch_size * input_len,
use_cache=False)
# Prepare
tensorrt_llm_mamba(**inputs)
return network

View File

@ -80,7 +80,7 @@ test_gptq() {
python convert_checkpoint.py --model_dir ${MODEL} \
--output_dir ./tllm_checkpoint/2gpu_gptq \
--dtype float16 \
--modelopt_quant_ckpt_path /home/scratch.trt_llm_data/llm-models/int4-quantized-gptq-awq/llama-7b-4bit-gs128.safetensors \
--quant_ckpt_path /home/scratch.trt_llm_data/llm-models/int4-quantized-gptq-awq/llama-7b-4bit-gs128.safetensors \
--use_weight_only \
--weight_only_precision int4_gptq \
--per_group \