Mirror of https://github.com/NVIDIA/TensorRT-LLM.git (synced 2026-01-13 22:18:36 +08:00)

Commit 9691e12bce: Update TensorRT-LLM (#1835)
Parent commit: 2a115dae84
Co-authored-by: Morgan Funtowicz <funtowiczmo@gmail.com>
@ -165,6 +165,9 @@ struct BenchmarkParams
|
||||
|
||||
// Weights offloading
|
||||
float gpuWeightsPercent{1.0};
|
||||
|
||||
// Decoding params
|
||||
std::optional<std::vector<std::vector<SizeType32>>> medusaChoices;
|
||||
};
|
||||
|
||||
class InferenceRequestsSyncSend
|
||||
@ -791,6 +794,10 @@ public:
|
||||
executorConfig.setMaxBatchSize(benchmarkParams.maxBatchSize.value());
|
||||
}
|
||||
|
||||
executorConfig.setDecodingConfig(texec::DecodingConfig(
|
||||
benchmarkParams.medusaChoices.has_value() ? texec::DecodingMode::Medusa() : texec::DecodingMode::Auto(),
|
||||
std::nullopt, benchmarkParams.medusaChoices));
|
||||
|
||||
mExecutor = std::make_unique<texec::Executor>(trtEnginePath, texec::ModelType::kDECODER_ONLY, executorConfig);
|
||||
|
||||
if (logIterationData)
|
||||
@ -1346,6 +1353,9 @@ void benchmarkGptManager(std::filesystem::path const& engineDir, TrtGptModelType
|
||||
optionalParams.maxBeamWidth = beamWidth;
|
||||
optionalParams.maxBatchSize = benchmarkParams.maxBatchSize;
|
||||
optionalParams.schedulerConfig = texec::SchedulerConfig{capacitySchedulerPolicy};
|
||||
optionalParams.decodingConfig = texec::DecodingConfig(
|
||||
benchmarkParams.medusaChoices.has_value() ? texec::DecodingMode::Medusa() : texec::DecodingMode::Auto(),
|
||||
std::nullopt, benchmarkParams.medusaChoices);
|
||||
|
||||
auto const jsonConfig = GptJsonConfig::parse(engineDir / "config.json");
|
||||
SizeType32 deviceCount{0};
|
||||
@ -1600,6 +1610,32 @@ void benchmarkExecutor(std::filesystem::path const& engineDir, TrtGptModelType m
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<std::vector<SizeType32>> parseVectorOfVectors(std::string const& input)
|
||||
{
|
||||
std::vector<std::vector<SizeType32>> result;
|
||||
std::regex outer_regex(R"(\[(.*?)\])");
|
||||
std::regex inner_regex(R"(\d+)");
|
||||
auto outer_begin = std::sregex_iterator(input.begin(), input.end(), outer_regex);
|
||||
auto outer_end = std::sregex_iterator();
|
||||
|
||||
for (std::sregex_iterator i = outer_begin; i != outer_end; ++i)
|
||||
{
|
||||
std::smatch match = *i;
|
||||
std::string inner_str = match.str(1);
|
||||
std::vector<int> inner_vec;
|
||||
auto inner_begin = std::sregex_iterator(inner_str.begin(), inner_str.end(), inner_regex);
|
||||
auto inner_end = std::sregex_iterator();
|
||||
|
||||
for (std::sregex_iterator j = inner_begin; j != inner_end; ++j)
|
||||
{
|
||||
std::smatch inner_match = *j;
|
||||
inner_vec.push_back(std::stoi(inner_match.str()));
|
||||
}
|
||||
result.push_back(inner_vec);
|
||||
}
|
||||
return result;
|
||||
}
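// Illustrative usage sketch (not part of the original diff; assumes <cassert> is included):
// for the --medusa_choices string "[[0], [0, 1], [0, 0, 1]]", parseVectorOfVectors returns
// {{0}, {0, 1}, {0, 0, 1}}, which is the shape expected by texec::DecodingConfig.
[[maybe_unused]] static void parseVectorOfVectorsExample()
{
    auto const choices = parseVectorOfVectors("[[0], [0, 1], [0, 0, 1]]");
    assert(choices.size() == 3);
    assert((choices[1] == std::vector<SizeType32>{0, 1}));
}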
|
||||
|
||||
} // namespace
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
@ -1692,6 +1728,8 @@ int main(int argc, char* argv[])
|
||||
options.add_options()("gpu_weights_percent",
|
||||
"Specify the percentage of weights that reside on GPU (from 0.0 to 1.0).",
|
||||
cxxopts::value<float>()->default_value("1.0"));
|
||||
options.add_options()(
|
||||
"medusa_choices", "Medusa choices in the format of [[0], [0, 1], [0, 0, 1]]", cxxopts::value<std::string>());
|
||||
|
||||
auto result = options.parse(argc, argv);
|
||||
|
||||
@ -1823,6 +1861,12 @@ int main(int argc, char* argv[])
|
||||
// Argument: If offloaded blocks should be onboarded to primary memory before they are reused.
|
||||
benchmarkParams.kvOnboardBlocks = !result["kv_dont_onboard_blocks"].as<bool>();
|
||||
|
||||
// Argument: Medusa choices for the Medusa speculative decoding.
|
||||
if (result.count("medusa_choices"))
|
||||
{
|
||||
benchmarkParams.medusaChoices = parseVectorOfVectors(result["medusa_choices"].as<std::string>());
|
||||
}
|
||||
|
||||
std::optional<TokenIdType> padId;
|
||||
// Argument: Padding token id
|
||||
if (result.count("pad_id"))
|
||||
|
||||
@ -944,6 +944,7 @@ def build_gpt(args):
|
||||
network = builder.create_network()
|
||||
network.trt_network.name = engine_name
|
||||
network.plugin_config.to_legacy_setting()
|
||||
network.plugin_config.dtype = args.dtype
|
||||
|
||||
# Plugins
|
||||
if args.mode in ['plugin', 'plugin-ifb']:
|
||||
|
||||
@ -1,3 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:1fec0fdc00c076761ec48eb5e2ea93473a329e844a8091e26c6e3e02fd14a8b1
|
||||
size 3931604
|
||||
oid sha256:8b6ad33047e2684c7d22471f87febbb96ae26f4eac6529e2f3b7c1469ec2ec6d
|
||||
size 3931504
|
||||
|
||||
@ -1,3 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:1fec0fdc00c076761ec48eb5e2ea93473a329e844a8091e26c6e3e02fd14a8b1
|
||||
size 3931604
|
||||
oid sha256:560f736af15a4dfba849ab29efc3520d6ec8c87bf2aa16589299b232dc171cca
|
||||
size 3989220
|
||||
|
||||
@ -1,3 +1,3 @@
|
||||
93adf3003d7c422586a9bf892367371d libtensorrt_llm_batch_manager_static.a
|
||||
93adf3003d7c422586a9bf892367371d libtensorrt_llm_batch_manager_static.pre_cxx11.a
|
||||
c0bd2b69c932257678a2aad9bd8baba4b291795e commit
|
||||
f8538ac35803837e5d457ea8c1a58053 libtensorrt_llm_batch_manager_static.a
|
||||
dc6fc82dc4ba319899e1d6777bd8c3a4 libtensorrt_llm_batch_manager_static.pre_cxx11.a
|
||||
265b039443334094026fbd8f396d52fe29c2d9d1 commit
|
||||
@ -1,3 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:bd757c26886a3ffd6947615d9f2829434e94839b693007a64b47c6b5c26416e4
|
||||
size 3812158
|
||||
oid sha256:74948e00ff7341914b1831ccfdce9ae242dd149603b1ba7e24ee993f08b63542
|
||||
size 3812960
|
||||
|
||||
@ -1,3 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:87321383075adf2d87cfbdc8a12a3d3815ef058d5da9b6aaa8d7d3f3263af439
|
||||
size 3773896
|
||||
oid sha256:0421ceacd5d07bc172bb4d0979edaf466aa8950290b4d6d1a7d355dbcefc2c84
|
||||
size 3772832
|
||||
|
||||
@ -1,3 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:58cdc0a330f8bfb7b50e3202aeac47bde0835b1dc600b4bfdcd2b30801e66e03
|
||||
size 22381766
|
||||
oid sha256:46eb1d351e3e8da3945a3f451166f12536aae3e440d57337d8891492424aff78
|
||||
size 22387798
|
||||
|
||||
@ -1,3 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:18a967eaa1e9a7164e0b104a84b13ea95404f7c7c278375feb2513d5f063bafe
|
||||
size 1396404
|
||||
oid sha256:19585b7709736197d9c1762d1bb8e3099e298d6dcc1c521d51c83637cc624c20
|
||||
size 1397814
|
||||
|
||||
@ -1,3 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:18a967eaa1e9a7164e0b104a84b13ea95404f7c7c278375feb2513d5f063bafe
|
||||
size 1396404
|
||||
oid sha256:f5d5475663640c068af2e9b5772b9b602656641dd17ca473ce7125ef7f2ec855
|
||||
size 1423172
|
||||
|
||||
@ -1,3 +1,3 @@
|
||||
7d12b9c04cb6738bb5f7747a88b00c1c libtensorrt_llm_executor_static.a
|
||||
7d12b9c04cb6738bb5f7747a88b00c1c libtensorrt_llm_executor_static.pre_cxx11.a
|
||||
c0bd2b69c932257678a2aad9bd8baba4b291795e commit
|
||||
e18e84fb356995b11c04b79e55c4c3f5 libtensorrt_llm_executor_static.a
|
||||
f0555b76f21d43e676e5808bf197cc58 libtensorrt_llm_executor_static.pre_cxx11.a
|
||||
265b039443334094026fbd8f396d52fe29c2d9d1 commit
|
||||
@ -1,3 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:e503b4cfb1c842850287a359ffed23a1773a67a96475d365b66d757a283ac218
|
||||
size 1448772
|
||||
oid sha256:8496c9e4a20efd3d2072520cf843dac70cbb0fe23621cfba2a1e0ef3e5fa22ed
|
||||
size 1450288
|
||||
|
||||
@ -1,3 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:f8c80cf7aca2b135a656a060456fb30a820e459b4b36560162b02fa65121ef50
|
||||
size 1375430
|
||||
oid sha256:1b76267834252836e26ddecc2e1b9449e33a67fb1981e5d42f721bc439be1c02
|
||||
size 1377018
|
||||
|
||||
@ -1,3 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:cc65971d6d74260cb49b354aa4b0b82f92863cc722fbf206bf8a4919a4897532
|
||||
size 14031364
|
||||
oid sha256:9bd0faf48175745d7aeff58f539ae021db365b73933dab9c51329de9e92f2d86
|
||||
size 14039826
|
||||
|
||||
@ -424,7 +424,8 @@ std::vector<cutlass_extensions::CutlassGemmConfig> MoeGemmRunner<T, WeightType>:
|
||||
template <typename T, typename WeightType>
|
||||
bool MoeGemmRunner<T, WeightType>::isHopperSpecialised() const
|
||||
{
|
||||
bool config_is_sm90 = best_config_ && best_config_->is_sm90;
|
||||
TLLM_CHECK_WITH_INFO(best_config_, "Cannot determine if hopper is specialised without a selected config");
|
||||
bool config_is_sm90 = best_config_->is_sm90;
|
||||
return supportsHopperSpecialisation() && config_is_sm90;
|
||||
}
|
||||
|
||||
@ -440,7 +441,7 @@ int MoeGemmRunner<T, WeightType>::getSM() const
|
||||
return this->sm_;
|
||||
}
|
||||
|
||||
// currently support sm80 bf16/fp16 gate ativation, only set predication tensor for m direction
|
||||
// currently support sm80 bf16/fp16 gate activation, only set predication tensor for m direction
|
||||
template <typename T, typename WeightType>
|
||||
bool MoeGemmRunner<T, WeightType>::isFusedGatedActivation(bool is_gated_activation, int gemm_n, int gemm_k) const
|
||||
{
|
||||
|
||||
@ -1,2 +1,2 @@
|
||||
5b6c74ce66f62d2a58aa9cac16f11ad6 libtensorrt_llm_nvrtc_wrapper.so
|
||||
c0bd2b69c932257678a2aad9bd8baba4b291795e commit
|
||||
265b039443334094026fbd8f396d52fe29c2d9d1 commit
|
||||
@ -1,3 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:84319476e8ecf9666f40f69355f19ec3b585fc0987f940be14af9e11e3f524c3
|
||||
oid sha256:9f2f97eb5b4181917a47b6028a857d7a597ca93faa5846af42c4cb24797d7fa7
|
||||
size 1080832
|
||||
|
||||
@ -1072,10 +1072,38 @@ std::vector<size_t> CutlassMoeFCRunner<T, WeightType, OutputType, Enable>::getWo
|
||||
size_t const hopper_size = using_hopper ? HopperGroupedGemmInput::workspaceSize(num_experts_per_node) : 0;
|
||||
size_t const gemm_workspace_size = moe_gemm_runner_.getMaxWorkspaceSize(num_experts_per_node);
|
||||
|
||||
std::vector<size_t> workspace{source_rows_size, permuted_rows_size, permuted_experts_size, permuted_data_size,
|
||||
total_rows_before_expert_size, softmax_out_size, glu_inter_size,
|
||||
// We do some overlapping of the large workspace buffers. Although we could overlap some of the other buffers, they
|
||||
// are small enough (i.e no factor of hidden size) they will only be a couple MiB at most, so we don't bother
|
||||
// in the case of fused activation we overlap permuted_data and fc2_result
|
||||
// in the case of unfused activation we overlap permuted_data and fc1_result
|
||||
// we need to calculate the max possible size, so use the max of all three
|
||||
size_t overlapped_gemm1_gemm2_inputs = std::max(permuted_data_size, fc2_result_size);
|
||||
// When glu_inter_elems is 0 we are always fused, otherwise we may need the un-fused case
|
||||
if (glu_inter_elems > 0)
|
||||
{
|
||||
overlapped_gemm1_gemm2_inputs = std::max(overlapped_gemm1_gemm2_inputs, fc1_result_size);
|
||||
}
|
||||
|
||||
// if we have glu_inter we overlap it with fc2_result, otherwise we use fc1_result by itself
|
||||
size_t overlapped_gemm1_gemm2_outputs = fc1_result_size;
|
||||
if (glu_inter_elems > 0)
|
||||
{
|
||||
overlapped_gemm1_gemm2_outputs
|
||||
= std::max(std::max(glu_inter_size, fc2_result_size), overlapped_gemm1_gemm2_outputs);
|
||||
}
|
||||
|
||||
std::vector<size_t> workspace{ //
|
||||
source_rows_size, //
|
||||
permuted_rows_size, //
|
||||
permuted_experts_size, //
|
||||
total_rows_before_expert_size, //
|
||||
softmax_out_size, //
|
||||
sorter_size, //
|
||||
// These pointers reuse the same memory
|
||||
std::max(fc1_result_size, sorter_size), fc2_result_size, hopper_size, gemm_workspace_size};
|
||||
overlapped_gemm1_gemm2_inputs, //
|
||||
overlapped_gemm1_gemm2_outputs, //
|
||||
hopper_size, //
|
||||
gemm_workspace_size};
|
||||
return workspace;
|
||||
}
|
||||
|
||||
@ -1088,7 +1116,9 @@ size_t CutlassMoeFCRunner<T, WeightType, OutputType, Enable>::getWorkspaceSize(i
|
||||
TLLM_CHECK_WITH_INFO(num_experts % ep_size == 0, "Number of experts must be a multiple of ep size");
|
||||
auto workspace = getWorkspaceBufferSizes(
|
||||
num_rows, hidden_size, inter_size, num_experts, num_experts / ep_size, k, activation_type);
|
||||
return tensorrt_llm::common::calculateTotalWorkspaceSize(workspace.data(), workspace.size());
|
||||
auto ws_size = tensorrt_llm::common::calculateTotalWorkspaceSize(workspace.data(), workspace.size());
|
||||
TLLM_LOG_DEBUG("Mixture Of Experts Plugin requires workspace of %2f MiB", ws_size / 1024.f / 1024.f);
|
||||
return ws_size;
|
||||
}
|
||||
|
||||
template <class T, class WeightType, class OutputType, class Enable>
|
||||
@ -1109,29 +1139,38 @@ void CutlassMoeFCRunner<T, WeightType, OutputType, Enable>::configureWsPtrs(char
|
||||
source_rows_ = (int*) ws_sliced[0];
|
||||
permuted_rows_ = (int*) ws_sliced[1];
|
||||
permuted_experts_ = (int*) ws_sliced[2];
|
||||
permuted_data_ = (T*) ws_sliced[3];
|
||||
|
||||
total_rows_before_expert_ = (int64_t*) ws_sliced[4];
|
||||
total_rows_before_expert_ = (int64_t*) ws_sliced[3];
|
||||
|
||||
softmax_out_ = nullptr;
|
||||
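// Note: softmax_out_ is only materialized below when num_experts is not a power of two or
// exceeds 256; (n != 0) && ((n & (n - 1)) == 0) is the standard bit test for a power of two.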
bool const is_pow_2 = (num_experts != 0) && ((num_experts & (num_experts - 1)) == 0);
|
||||
if (!is_pow_2 || num_experts > 256)
|
||||
{
|
||||
softmax_out_ = (float*) ws_sliced[5];
|
||||
softmax_out_ = (float*) ws_sliced[4];
|
||||
}
|
||||
|
||||
glu_inter_result_ = (T*) ws_sliced[6];
|
||||
sorter_ws_ = (char*) ws_sliced[5];
|
||||
|
||||
// These pointers are aliased. Since the sort ws can be overwritten after it is finished
|
||||
sorter_ws_ = (char*) ws_sliced[7];
|
||||
fc1_result_ = (T*) ws_sliced[7];
|
||||
// Always 6, but overlapped with either fc1_result_ or fc2_result_
|
||||
permuted_data_ = (T*) ws_sliced[6];
|
||||
|
||||
fc2_result_ = (T*) ws_sliced[8];
|
||||
bool const is_gated_activation = isGatedActivation(activation_type);
|
||||
bool const use_fused_moe = moe_gemm_runner_.isFusedGatedActivation(is_gated_activation, inter_size, hidden_size);
|
||||
bool const using_hopper = moe_gemm_runner_.isHopperSpecialised();
|
||||
bool const hopper_has_glu = using_hopper && (mayHaveDifferentGEMMOutputType() || is_gated_activation);
|
||||
bool const non_hopper_has_glu = !using_hopper && !use_fused_moe && is_gated_activation;
|
||||
bool const has_glu_inter_result = hopper_has_glu || non_hopper_has_glu;
|
||||
// Always 7, ignored if not needed
|
||||
glu_inter_result_ = has_glu_inter_result ? (T*) ws_sliced[7] : nullptr;
|
||||
|
||||
// fc1 and fc2 alias one of the above pointers, but it depends on if actfn is fused/unfused which is overlapped
|
||||
fc1_result_ = has_glu_inter_result ? (T*) ws_sliced[6] : (T*) ws_sliced[7];
|
||||
fc2_result_ = has_glu_inter_result ? (T*) ws_sliced[7] : (T*) ws_sliced[6];
|
||||
|
||||
hopper_grouped_gemm_input_ = {};
|
||||
if (moe_gemm_runner_.isHopperSpecialised())
|
||||
{
|
||||
hopper_grouped_gemm_input_.configureWorkspace(ws_sliced[9], num_experts_per_node, ws_sliced[10], ws_sizes[10]);
|
||||
hopper_grouped_gemm_input_.configureWorkspace(ws_sliced[8], num_experts_per_node, ws_sliced[9], ws_sizes[9]);
|
||||
}
|
||||
}
|
||||
|
||||
@ -1293,6 +1332,7 @@ void CutlassMoeFCRunner<T, WeightType, OutputType, Enable>::runMoe(void const* i
|
||||
}
|
||||
else
|
||||
{
|
||||
|
||||
// Run the GEMM with activation function overridden with `Identity`, we do the activation separately
|
||||
ActivationType activation_type = (use_fused_moe) ? fc1_activation_type : ActivationType::Identity;
|
||||
T* gemm_result = (use_fused_moe) ? fc1_result_ : static_cast<T*>(glu_inter_result_);
|
||||
|
||||
@ -431,7 +431,8 @@ void InitBindings(pybind11::module_& m)
|
||||
&tle::DecodingConfig::setLookaheadDecoding)
|
||||
.def_property("medusa_choices", &tle::DecodingConfig::getMedusaChoices, &tle::DecodingConfig::setMedusaChoices);
|
||||
|
||||
auto executorConfigGetState = [&](tle::ExecutorConfig const& self)
|
||||
auto executorConfigGetState = [&peftCacheConfigGetstate, &kvCacheConfigGetstate, &schedulerConfigGetstate,
|
||||
¶llelConfigGetstate](tle::ExecutorConfig const& self)
|
||||
{
|
||||
py::object peftCacheConfigState = py::none();
|
||||
|
||||
@ -453,7 +454,8 @@ void InitBindings(pybind11::module_& m)
|
||||
peftCacheConfigState, self.getLogitsPostProcessorMap(), self.getLogitsPostProcessorBatched(),
|
||||
self.getDecodingConfig(), self.getGpuWeightsPercent());
|
||||
};
|
||||
auto executorConfigSetState = [&](py::tuple state)
|
||||
auto executorConfigSetState = [&kvCacheConfigSetstate, &peftCacheConfigSetstate, &schedulerConfigSetstate,
|
||||
¶llelConfigSetstate](py::tuple state)
|
||||
{
|
||||
if (state.size() != 15)
|
||||
{
|
||||
|
||||
@ -96,8 +96,11 @@ void MedusaModule::initMedusaTensorsFromChoices(MedusaChoices const& choices, st
|
||||
if (curDepth != depth)
|
||||
{
|
||||
TLLM_CHECK(depth + 1 == curDepth);
|
||||
TLLM_CHECK_WITH_INFO(depth <= getMaxDraftPathLen(),
|
||||
"Medusa choices require more Medusa heads than the engine was built with.");
|
||||
// Save TopK
|
||||
topKs[depth - 1] = maxTopK;
|
||||
|
||||
// Accumulate TopK for global indexing in tree
|
||||
globalNodeInTreeIdx += maxTopK;
|
||||
|
||||
|
||||
@ -257,7 +257,7 @@ protected:
|
||||
template <class T>
|
||||
T* allocBuffer(size_t size)
|
||||
{
|
||||
managed_buffers.emplace_back(mBufferManager->managed(size * sizeof(T)));
|
||||
managed_buffers.emplace_back(mBufferManager->gpu(size * sizeof(T)));
|
||||
EXPECT_EQ(cudaGetLastError(), cudaSuccess) << "Error allocating buffer of size: " << size;
|
||||
T* ptr = static_cast<T*>(managed_buffers.back()->data());
|
||||
return ptr;
|
||||
@ -268,15 +268,27 @@ protected:
|
||||
this->managed_buffers.clear(); // Make sure all the previous buffers are freed
|
||||
check_cuda_error(cudaDeviceSynchronize()); // Sync to make sure all previous operations are resolved
|
||||
|
||||
size_t weight_size = hidden_size * hidden_size * 4 * num_experts * sizeof(WeightType);
|
||||
// Skip the test if the GPU does not have enough memory
|
||||
size_t workspace_size = this->mMoERunner.getWorkspaceSize(
|
||||
// Calculate the size contributions for all the large buffers to check if the GPU has enough space
|
||||
bool const is_gated = tensorrt_llm::isGatedActivation(mActType);
|
||||
size_t const num_gemms = 2 + is_gated;
|
||||
// Expert weights
|
||||
size_t const weight_size = hidden_size * (hidden_size * 4) * num_experts * sizeof(WeightStorage) * num_gemms;
|
||||
// Workspace size
|
||||
size_t const workspace_size = this->mMoERunner.getWorkspaceSize(
|
||||
num_tokens, hidden_size, hidden_size * 4, num_experts, k, this->mActType, {});
|
||||
// The input/output buffers
|
||||
size_t const in_out_size = 2 * num_tokens * hidden_size * sizeof(DataType);
|
||||
|
||||
size_t total_size = workspace_size + weight_size * 2;
|
||||
// This should be correct to within 100MiB (on tests with 30GiB total)
|
||||
size_t const total_size = workspace_size + weight_size + in_out_size;
|
||||
|
||||
size_t const memory_pool_free_mem_size = mBufferManager->memoryPoolFree();
|
||||
auto const [freeMem, totalMem] = tensorrt_llm::common::getDeviceMemoryInfo(false);
|
||||
return freeMem >= total_size;
|
||||
float const freeMemBuffer = 0.9f; // Add some buffer so we aren't completely pushing the limits
|
||||
std::cout << "Free memory is: " << freeMem << ", memory pool size is: " << memory_pool_free_mem_size
|
||||
<< ", required memory is: " << total_size << ", device total memory capacity: " << totalMem
|
||||
<< std::endl;
|
||||
return (freeMem + memory_pool_free_mem_size) * freeMemBuffer >= total_size;
|
||||
}
|
||||
|
||||
void initBuffersPermute(std::vector<std::vector<DataType>> h_hidden_states,
|
||||
@ -362,7 +374,10 @@ protected:
|
||||
initFP8Scales(mMaxInput);
|
||||
}
|
||||
|
||||
mTpExpertScratch = allocBuffer<DataType>(mTpExpertScratchSize);
|
||||
if (parallelism_config.tp_size > 1 || parallelism_config.ep_size > 1)
|
||||
{
|
||||
mTpExpertScratch = allocBuffer<DataType>(mTpExpertScratchSize);
|
||||
}
|
||||
|
||||
mActiveRows = mTotalTokens;
|
||||
mFinished = nullptr;
|
||||
@ -475,10 +490,18 @@ protected:
|
||||
ASSERT_NE(mExpertFP8Scale1, nullptr);
|
||||
ASSERT_NE(mExpertFP8Scale2, nullptr);
|
||||
ASSERT_NE(mExpertFP8Scale3, nullptr);
|
||||
|
||||
// Dequant values for each expert are 1/(w_i*a_i) calculated above
|
||||
std::fill_n(mExpertFP8Scale1, mNumExperts, 1.f / (scaleW1 * scaleAct1));
|
||||
std::fill_n(mExpertFP8Scale3, mNumExperts, 1.f / (scaleW2 * scaleAct2));
|
||||
*mExpertFP8Scale2 = scaleAct2;
|
||||
std::vector<float> scales_1(mNumExperts, 1.f / (scaleW1 * scaleAct1));
|
||||
std::vector<float> scales_2(1, scaleAct2);
|
||||
std::vector<float> scales_3(mNumExperts, 1.f / (scaleW2 * scaleAct2));
|
||||
|
||||
check_cuda_error(cudaMemcpyAsync(mExpertFP8Scale1, scales_1.data(), scales_1.size() * sizeof(float),
|
||||
cudaMemcpyHostToDevice, mStream->get()));
|
||||
check_cuda_error(cudaMemcpyAsync(mExpertFP8Scale2, scales_2.data(), scales_2.size() * sizeof(float),
|
||||
cudaMemcpyHostToDevice, mStream->get()));
|
||||
check_cuda_error(cudaMemcpyAsync(mExpertFP8Scale3, scales_3.data(), scales_3.size() * sizeof(float),
|
||||
cudaMemcpyHostToDevice, mStream->get()));
|
||||
|
||||
check_cuda_error(cudaStreamSynchronize(mStream->get()));
|
||||
}
|
||||
@ -561,6 +584,13 @@ protected:
|
||||
void* ep_scale_2 = FP8 ? (void*) mExpertFP8Scale2 : (void*) mExpertIntScale2;
|
||||
void* ep_scale_3 = FP8 ? mExpertFP8Scale3 : nullptr;
|
||||
|
||||
// Handle the case with no parallelism to not require the extra alloc
|
||||
if (parallelism_config.tp_size == 1 && parallelism_config.ep_size == 1)
|
||||
{
|
||||
return std::tuple{
|
||||
mExpertWeight1, mExpertWeight2, mExpertBias1, mExpertBias2, ep_scale_1, ep_scale_2, ep_scale_3};
|
||||
}
|
||||
|
||||
// Slice weights for EP
|
||||
size_t const gated_inter = mInterSize * mGatedMultiplier;
|
||||
size_t const experts_per_node = mNumExperts / parallelism_config.ep_size;
|
||||
|
||||
@ -28,7 +28,7 @@ def build_engine(weight_dir: _pl.Path, medusa_dir: _pl.Path,
|
||||
covert_cmd = [_sys.executable, "examples/medusa/convert_checkpoint.py"] + (
|
||||
['--model_dir', str(weight_dir)] if weight_dir else []) + [
|
||||
'--medusa_model_dir', str(medusa_dir), \
|
||||
'--output_dir', str(engine_dir), '--dtype=float16', '--fixed_num_medusa_heads=4'
|
||||
'--output_dir', str(engine_dir), '--dtype=float16', '--num_medusa_heads=4'
|
||||
] + list(args)
|
||||
|
||||
run_command(covert_cmd)
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
# Multi-stage Dockerfile
|
||||
ARG BASE_IMAGE=nvcr.io/nvidia/pytorch
|
||||
ARG BASE_TAG=24.04-py3
|
||||
ARG BASE_TAG=24.05-py3
|
||||
ARG DEVEL_IMAGE=devel
|
||||
|
||||
FROM ${BASE_IMAGE}:${BASE_TAG} as base
|
||||
|
||||
@ -4,8 +4,8 @@ set -ex
|
||||
|
||||
# Use latest stable version from https://pypi.org/project/torch/#history
|
||||
# and closest to the version specified in
|
||||
# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-04.html#rel-24-04
|
||||
TORCH_VERSION="2.3.0"
|
||||
# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-05.html#rel-24-05
|
||||
TORCH_VERSION="2.3.1"
|
||||
SYSTEM_ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"')
|
||||
|
||||
prepare_environment() {
|
||||
|
||||
@ -2,12 +2,12 @@
|
||||
|
||||
set -ex
|
||||
|
||||
TRT_VER="10.0.1.6"
|
||||
TRT_VER="10.1.0.27"
|
||||
# Align with the pre-installed cuDNN / cuBLAS / NCCL versions from
|
||||
# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-04.html#rel-24-04
|
||||
# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-05.html#rel-24-05
|
||||
CUDA_VER="12.4" # 12.4.1
|
||||
# Keep the installation for cuDNN if users want to install PyTorch with source codes.
|
||||
# PyTorch 2.3.0 can compile with cuDNN v9.
|
||||
# PyTorch 2.3.x can compile with cuDNN v9.
|
||||
CUDNN_VER="9.1.0.70-1"
|
||||
NCCL_VER="2.21.5-1+cuda12.4"
|
||||
CUBLAS_VER="12.4.5.8-1"
|
||||
@ -86,8 +86,7 @@ install_tensorrt() {
|
||||
if [ "$ARCH" = "amd64" ];then ARCH="x86_64";fi
|
||||
if [ "$ARCH" = "x86_64" ];then DIR_NAME="x64-agnostic"; else DIR_NAME=${ARCH};fi
|
||||
if [ "$ARCH" = "aarch64" ];then OS1="Ubuntu22_04" && OS2="Ubuntu-22.04" && OS="ubuntu-22.04"; else OS1="Linux" && OS2="Linux" && OS="linux";fi
|
||||
RELEASE_URL_TRT=https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.0.1/tars/TensorRT-${TRT_VER}.${OS2}.${ARCH}-gnu.cuda-${TRT_CUDA_VERSION}.tar.gz
|
||||
|
||||
RELEASE_URL_TRT=https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.1.0/tars/TensorRT-${TRT_VER}.${OS2}.${ARCH}-gnu.cuda-${TRT_CUDA_VERSION}.tar.gz
|
||||
fi
|
||||
wget --no-verbose ${RELEASE_URL_TRT} -O /tmp/TensorRT.tar
|
||||
tar -xf /tmp/TensorRT.tar -C /usr/local/
|
||||
|
||||
@ -44,9 +44,9 @@ The following table shows the supported software for TensorRT-LLM.
|
||||
* -
|
||||
- Software Compatibility
|
||||
* - Container
|
||||
- [24.04](https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html)
|
||||
- [24.05](https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html)
|
||||
* - TensorRT
|
||||
- [10.0](https://docs.nvidia.com/deeplearning/tensorrt/release-notes/index.html)
|
||||
- [10.1](https://docs.nvidia.com/deeplearning/tensorrt/release-notes/index.html)
|
||||
* - Precision
|
||||
-
|
||||
- Hopper (SM90) - FP32, FP16, BF16, FP8, INT8, INT4
|
||||
|
||||
@ -23,10 +23,11 @@ All published functionality in the Release Notes has been fully tested and verif
|
||||
- TBD
|
||||
|
||||
### Infrastructure changes
|
||||
- Base Docker image for TensorRT-LLM is updated to `nvcr.io/nvidia/pytorch:24.04-py3`.
|
||||
- Base Docker image for TensorRT-LLM backend is updated to `nvcr.io/nvidia/tritonserver:24.04-py3`.
|
||||
- Base Docker image for TensorRT-LLM is updated to `nvcr.io/nvidia/pytorch:24.05-py3`.
|
||||
- Base Docker image for TensorRT-LLM backend is updated to `nvcr.io/nvidia/tritonserver:24.05-py3`.
|
||||
- The dependent TensorRT version is updated to 10.1.0.
|
||||
- The dependent CUDA version is updated to 12.4.1.
|
||||
- The dependent PyTorch version is updated to 2.3.0.
|
||||
- The dependent PyTorch version is updated to 2.3.1.
|
||||
|
||||
|
||||
## TensorRT-LLM Release 0.10.0
|
||||
|
||||
@ -15,10 +15,10 @@ If the first assumption holds true, the latency of speculative decoding will no
|
||||
The combination of both these allows speculative decoding to result in reduced latency.
|
||||
|
||||
TensorRT-LLM supports several approaches for generating draft tokens, including:
|
||||
|
||||
1. Utilizing a smaller, auxiliary model, known as the draft model approach. For more information, refer to the [Fast Inference from Transformers via Speculative Decoding paper](https://arxiv.org/pdf/2211.17192.pdf).
|
||||
2. Implementing additional language model heads that predict tokens for future positions, as detailed in the [Medusa: Simple LLM Inference Acceleration Framework with Multiple Decoding Heads paper](https://arxiv.org/abs/2401.10774).
|
||||
|
||||
|
||||
## Performance Improvements
|
||||
|
||||
It's important to note that the effectiveness of speculative decoding techniques is highly dependent
|
||||
@ -56,6 +56,166 @@ it is advisable to enable KV cache reuse for both models.
|
||||
This can be achieved by adding the `--use_paged_context_fmha=enable` flag to the `trtllm-build` command
|
||||
and setting `enableBlockReuse=true` in the `KVCacheConfig`.
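
As a rough illustration, here is a minimal sketch of the corresponding C++ executor API usage (the `KvCacheConfig` constructor argument and the `setKvCacheConfig` setter are assumptions based on the `tensorrt_llm::executor` API used elsewhere in this update; check the headers of your version):

```cpp
#include "tensorrt_llm/executor/executor.h" // assumed header location

namespace texec = tensorrt_llm::executor;

// Enable KV cache block reuse so blocks filled for a shared prefix can be reused
// across requests (and across the draft and target engines).
texec::KvCacheConfig kvCacheConfig(/*enableBlockReuse=*/true);

texec::ExecutorConfig executorConfig;
executorConfig.setKvCacheConfig(kvCacheConfig); // assumed setter, mirroring the other set*() calls in this update
```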
|
||||
|
||||
## Using Draft model approach with Triton Inference Server
|
||||
|
||||
+ The draft model approach has been supported since TensorRT-LLM 0.7.0 (using two separate Triton servers to host the draft and target models respectively), and was significantly optimized in TensorRT-LLM 0.10.0 (using a single Triton server with [Business Logic Scripting](https://github.com/triton-inference-server/python_backend?tab=readme-ov-file#business-logic-scripting), BLS).
+ The source of the draft model BLS logic can be found [here](https://github.com/triton-inference-server/tensorrtllm_backend/blob/main/all_models/inflight_batcher_llm/tensorrt_llm_bls/1/lib/decode.py).
+ This example is based on TensorRT-LLM 0.10.0 and TensorRT-LLM backend 0.10.0, using the Docker image `nvcr.io/nvidia/tritonserver:24.05-trtllm-py3`.
+ Llama-7B-hf and Llama-30B-hf are used as the draft and target models respectively in this example, assuming the paths to the model repositories are `DRAFT_MODEL_PATH` and `TARGET_MODEL_PATH`.
+ The maximum number of draft tokens is set to 10 in this example.

1. Prepare TensorRT engines for inference
+ Here are the commands to build the draft / target engines in FP16 or FP8. All combinations of data types (Draft-FP16/FP8 + Target-FP16/FP8) are supported.
+ `--remove_input_padding=enable --paged_kv_cache=enable` are necessary for inflight batching.
+ `--context_fmha=enable --use_paged_context_fmha=enable` are optional but recommended for performance.
+ `--gather_generation_logits` is necessary if generation logits are used to select tokens in the target model.
+ `--tp_size` can be set if tensor parallelism is used for the draft / target model.
+ A `--max_batch_size` greater than 1 is acceptable in general usage, but we use 1 in this example.
|
||||
|
||||
```bash
|
||||
export MAX_DRAFT_LENGTH=10
|
||||
export COMMON_COMMAND="--max_batch_size=1 --max_input_len=2048 --max_output_len=1024 --gpt_attention_plugin=float16 --gemm_plugin=float16 --remove_input_padding=enable --paged_kv_cache=enable --context_fmha=enable --use_paged_context_fmha=enable --gather_generation_logits"
|
||||
export DRAFT_COMMAND_FP16="$COMMON_COMMAND"
|
||||
export TARGET_COMMAND_FP16="$DRAFT_COMMAND_FP16 --max_draft_len=$MAX_DRAFT_LENGTH --speculative_decoding_mode draft_tokens_external"
|
||||
export DRAFT_COMMAND_FP8="$COMMON_COMMAND --strongly_typed --use_fp8_context_fmha=enable"
|
||||
export TARGET_COMMAND_FP8="$DRAFT_COMMAND_FP8 --max_draft_len=$MAX_DRAFT_LENGTH --speculative_decoding_mode draft_tokens_external"
|
||||
|
||||
# Build checkpoints and engines in tensorrt_llm/examples/llama/
|
||||
# FP16 mode
|
||||
export DRAFT_NAME=llama-7b-fp16-tp1
|
||||
export TARGET_NAME=llama-30b-fp16-tp1
|
||||
python3 convert_checkpoint.py --model_dir=$DRAFT_MODEL_PATH --output_dir=ckpt/$DRAFT_NAME --tp_size=1
|
||||
python3 convert_checkpoint.py --model_dir=$TARGET_MODEL_PATH --output_dir=ckpt/$TARGET_NAME --tp_size=1
|
||||
trtllm-build --checkpoint_dir=ckpt/$DRAFT_NAME --output_dir=engine/draft/$DRAFT_NAME $DRAFT_COMMAND_FP16
|
||||
trtllm-build --checkpoint_dir=ckpt/$TARGET_NAME --output_dir=engine/target/$TARGET_NAME $TARGET_COMMAND_FP16
|
||||
export DRAFT_ENGINE_PATH=$(pwd)/engine/draft/$DRAFT_NAME
|
||||
export TARGET_ENGINE_PATH=$(pwd)/engine/target/$TARGET_NAME
|
||||
|
||||
# FP8 mode
|
||||
export DRAFT_NAME=llama-7b-fp8-tp1
|
||||
export TARGET_NAME=llama-30b-fp8-tp1
|
||||
python3 convert_checkpoint.py --model_dir=$DRAFT_MODEL_PATH --output_dir=ckpt/$DRAFT_NAME --tp_size=1
|
||||
python3 convert_checkpoint.py --model_dir=$TARGET_MODEL_PATH --output_dir=ckpt/$TARGET_NAME --tp_size=1
|
||||
trtllm-build --checkpoint_dir=ckpt/$DRAFT_NAME --output_dir=engine/draft/$DRAFT_NAME $DRAFT_COMMAND_FP8
|
||||
trtllm-build --checkpoint_dir=ckpt/$TARGET_NAME --output_dir=engine/target/$TARGET_NAME $TARGET_COMMAND_FP8
|
||||
export DRAFT_ENGINE_PATH=$(pwd)/engine/draft/$DRAFT_NAME
|
||||
export TARGET_ENGINE_PATH=$(pwd)/engine/target/$TARGET_NAME
|
||||
```
|
||||
|
||||
2. Edit the Triton configuration
+ If both the draft and target models fit on one GPU (for example, Llama-7B-FP8 + Llama-30B-FP8, roughly 40 GiB in total on one H100-80GiB GPU), `DRAFT_GPU_DEVICE_IDS` and `TARGET_GPU_DEVICE_IDS` can be the same, for example `0`. This generally performs better than placing the models on two separate GPUs.
+ Otherwise, the draft and target models can be placed on different GPUs, for example `DRAFT_GPU_DEVICE_IDS="0"` and `TARGET_GPU_DEVICE_IDS="1"`.
+ Furthermore, if tensor parallelism is used, the value of `GPU_DEVICE_IDS` can be a list, for example `DRAFT_GPU_DEVICE_IDS="0"` and `TARGET_GPU_DEVICE_IDS="1,2,3,4"`.
+ For more details on launching models with the Triton server, see the [TensorRT-LLM backend repository](https://github.com/triton-inference-server/tensorrtllm_backend/blob/main/README.md).
|
||||
|
||||
```bash
|
||||
ACCUMULATE_TOKEN="false"
|
||||
BACKEND="tensorrtllm"
|
||||
BATCH_SCHEDULER_POLICY="guaranteed_no_evict"
|
||||
BATCHING_STRATEGY="inflight_fused_batching"
|
||||
BLS_INSTANCE_COUNT="1"
|
||||
DECODING_MODE="top_k_top_p"
|
||||
DECOUPLED_MODE="False"
|
||||
DRAFT_GPU_DEVICE_IDS="0"
|
||||
E2E_MODEL_NAME="ensemble"
|
||||
ENABLE_KV_CACHE_REUSE="true"
|
||||
ENGINE_PATH=$TARGET_ENGINE_PATH
|
||||
EXCLUDE_INPUT_IN_OUTPUT="false"
|
||||
KV_CACHE_FREE_GPU_MEM_FRACTION="0.8"
|
||||
MAX_ATTENTION_WINDOW_SIZE=""
|
||||
MAX_BEAM_WIDTH="1"
|
||||
MAX_QUEUE_DELAY_MICROSECONDS="0"
|
||||
MAX_TOKENS_IN_KV_CACHE=""
|
||||
NORMALIZE_LOG_PROBS="true"
|
||||
POSTPROCESSING_INSTANCE_COUNT="1"
|
||||
PREPROCESSING_INSTANCE_COUNT="1"
|
||||
TARGET_GPU_DEVICE_IDS="1"
|
||||
TENSORRT_LLM_DRAFT_MODEL_NAME="tensorrt_llm_draft"
|
||||
TENSORRT_LLM_MODEL_NAME="tensorrt_llm"
|
||||
TOKENIZER_PATH=$DRAFT_MODEL_PATH
|
||||
TOKENIZER_TYPE=llama
|
||||
TRITON_GRPC_PORT="8001"
|
||||
TRITON_HTTP_PORT="8000"
|
||||
TRITON_MAX_BATCH_SIZE="4"
|
||||
TRITON_METRICS_PORT="8002"
|
||||
TRITON_REPO="triton_repo"
|
||||
USE_DRAFT_LOGITS="false"
|
||||
|
||||
# Make a copy of triton repo and replace the fields in the configuration files
|
||||
cd /tensorrtllm_backend/
|
||||
apt-get update && apt-get install -y build-essential cmake git-lfs
|
||||
pip3 install git-lfs tritonclient grpcio
|
||||
rm -rf ${TRITON_REPO}
|
||||
cp -R all_models/inflight_batcher_llm ${TRITON_REPO}
|
||||
python3 tools/fill_template.py -i ${TRITON_REPO}/ensemble/config.pbtxt triton_max_batch_size:${TRITON_MAX_BATCH_SIZE}
|
||||
python3 tools/fill_template.py -i ${TRITON_REPO}/preprocessing/config.pbtxt tokenizer_dir:${TOKENIZER_PATH},triton_max_batch_size:${TRITON_MAX_BATCH_SIZE},preprocessing_instance_count:${PREPROCESSING_INSTANCE_COUNT}
|
||||
python3 tools/fill_template.py -i ${TRITON_REPO}/postprocessing/config.pbtxt tokenizer_dir:${TOKENIZER_PATH},triton_max_batch_size:${TRITON_MAX_BATCH_SIZE},postprocessing_instance_count:${POSTPROCESSING_INSTANCE_COUNT}
|
||||
python3 tools/fill_template.py -i ${TRITON_REPO}/tensorrt_llm_bls/config.pbtxt triton_max_batch_size:${TRITON_MAX_BATCH_SIZE},decoupled_mode:${DECOUPLED_MODE},accumulate_tokens:${ACCUMULATE_TOKEN},bls_instance_count:${BLS_INSTANCE_COUNT},tensorrt_llm_model_name:${TENSORRT_LLM_MODEL_NAME},tensorrt_llm_draft_model_name:${TENSORRT_LLM_DRAFT_MODEL_NAME}
|
||||
|
||||
# Make a copy of tensorrt_llm as configurations of draft / target models.
|
||||
cp -R ${TRITON_REPO}/tensorrt_llm ${TRITON_REPO}/tensorrt_llm_draft
|
||||
sed -i 's/name: "tensorrt_llm"/name: "tensorrt_llm_draft"/g' ${TRITON_REPO}/tensorrt_llm_draft/config.pbtxt
|
||||
python3 tools/fill_template.py -i ${TRITON_REPO}/tensorrt_llm/config.pbtxt triton_backend:${BACKEND},engine_dir:${ENGINE_PATH},decoupled_mode:${DECOUPLED_MODE},max_tokens_in_paged_kv_cache:${MAX_TOKENS_IN_KV_CACHE},max_attention_window_size:${MAX_ATTENTION_WINDOW_SIZE},batch_scheduler_policy:${BATCH_SCHEDULER_POLICY},batching_strategy:${BATCHING_STRATEGY},kv_cache_free_gpu_mem_fraction:${KV_CACHE_FREE_GPU_MEM_FRACTION},exclude_input_in_output:${EXCLUDE_INPUT_IN_OUTPUT},triton_max_batch_size:${TRITON_MAX_BATCH_SIZE},max_queue_delay_microseconds:${MAX_QUEUE_DELAY_MICROSECONDS},max_beam_width:${MAX_BEAM_WIDTH},enable_kv_cache_reuse:${ENABLE_KV_CACHE_REUSE},normalize_log_probs:${NORMALIZE_LOG_PROBS},enable_chunked_context:${ENABLE_CHUNKED_CONTEXT},gpu_device_ids:${TARGET_GPU_DEVICE_IDS},decoding_mode:${DECODING_MODE}
|
||||
python3 tools/fill_template.py -i ${TRITON_REPO}/tensorrt_llm_draft/config.pbtxt triton_backend:${BACKEND},engine_dir:${DRAFT_ENGINE_PATH},decoupled_mode:${DECOUPLED_MODE},max_tokens_in_paged_kv_cache:${MAX_TOKENS_IN_KV_CACHE},max_attention_window_size:${MAX_ATTENTION_WINDOW_SIZE},batch_scheduler_policy:${BATCH_SCHEDULER_POLICY},batching_strategy:${BATCHING_STRATEGY},kv_cache_free_gpu_mem_fraction:${KV_CACHE_FREE_GPU_MEM_FRACTION},exclude_input_in_output:${EXCLUDE_INPUT_IN_OUTPUT},triton_max_batch_size:${TRITON_MAX_BATCH_SIZE},max_queue_delay_microseconds:${MAX_QUEUE_DELAY_MICROSECONDS},max_beam_width:${MAX_BEAM_WIDTH},enable_kv_cache_reuse:${ENABLE_KV_CACHE_REUSE},normalize_log_probs:${NORMALIZE_LOG_PROBS},enable_chunked_context:${ENABLE_CHUNKED_CONTEXT},gpu_device_ids:${DRAFT_GPU_DEVICE_IDS},decoding_mode:${DECODING_MODE}
|
||||
```
|
||||
|
||||
3. Launch the Triton server
+ `--multi-model` is necessary if tensor parallelism is used for the target model.
|
||||
|
||||
```bash
|
||||
python3 scripts/launch_triton_server.py \
|
||||
--model_repo=${TRITON_REPO} \
|
||||
--tensorrt_llm_model_name "tensorrt_llm,tensorrt_llm_draft" \
|
||||
--multi-model \
|
||||
--log &
|
||||
```
|
||||
|
||||
+ The verbose log is written to the file `triton_log.txt`. The Triton server has launched successfully if you see the output below in that file:
|
||||
|
||||
```txt
|
||||
Started HTTPService at 0.0.0.0:8000
|
||||
Started GRPCInferenceService at 0.0.0.0:8001
|
||||
Started Metrics Service at 0.0.0.0:8002
|
||||
```
|
||||
|
||||
4. Send requests
+ Prepare a JSON file `input_data.json` containing the input data as below (more than one request is acceptable).
|
||||
|
||||
```json
|
||||
[
|
||||
{
|
||||
"input": "James Best, best known for his ",
|
||||
"instruction": "Continue writing the following story:",
|
||||
"output": " "
|
||||
}
|
||||
]
|
||||
```
|
||||
|
||||
+ Use the command below to send requests for inference.
+ `--num-draft-tokens` sets the runtime draft length; 4 is used in this example.
|
||||
|
||||
```bash
|
||||
python3 tools/inflight_batcher_llm/speculative_decoding_test.py \
|
||||
--max-input-len 2048 \
|
||||
--dataset=input_data.json \
|
||||
--url-target=localhost:8001 \
|
||||
--url-draft=localhost:8001 \
|
||||
--draft-tensorrt-llm-model-name="${TENSORRT_LLM_DRAFT_MODEL_NAME}" \
|
||||
--target-tensorrt-llm-model-name="${TENSORRT_LLM_MODEL_NAME}" \
|
||||
--bls-speculative-tensorrt-llm-model-name="tensorrt_llm_bls" \
|
||||
--execute-bls-speculative-decoding \
|
||||
--disable-output-comparison \
|
||||
--num-draft-tokens=4 \
|
||||
--verbose
|
||||
```
|
||||
|
||||
5. Kill the Triton server processes after inference is finished
|
||||
|
||||
```bash
|
||||
pkill -9 -f trtllmExecutorWorker
|
||||
pkill -9 -f tritonserver
|
||||
```
|
||||
|
||||
# Medusa
|
||||
|
||||
This approach leverages a single model to both generate and verify draft tokens.
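
For reference, the benchmark changes earlier in this update enable Medusa through the executor's `DecodingConfig`; a minimal sketch of that call is shown below, where `medusaChoices` stands for the parsed `std::optional<std::vector<std::vector<SizeType32>>>` form of the `[[0], [0, 1], ...]` string:

```cpp
namespace texec = tensorrt_llm::executor;

// Use Medusa decoding when choices are provided, otherwise let the executor decide automatically.
executorConfig.setDecodingConfig(texec::DecodingConfig(
    medusaChoices.has_value() ? texec::DecodingMode::Medusa() : texec::DecodingMode::Auto(),
    std::nullopt, medusaChoices));
```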
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
--extra-index-url https://pypi.nvidia.com
|
||||
tensorrt_llm==0.11.0.dev2024061800
|
||||
tensorrt_llm==0.11.0.dev2024062500
|
||||
datasets~=2.15.0
|
||||
evaluate~=0.4.1
|
||||
rouge_score~=0.1.2
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
--extra-index-url https://pypi.nvidia.com
|
||||
tensorrt_llm==0.11.0.dev2024061800
|
||||
tensorrt_llm==0.11.0.dev2024062500
|
||||
datasets~=2.14.5
|
||||
evaluate~=0.4.1
|
||||
rouge_score~=0.1.2
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
--extra-index-url https://pypi.nvidia.com
|
||||
tensorrt_llm==0.11.0.dev2024061800
|
||||
tensorrt_llm==0.11.0.dev2024062500
|
||||
datasets~=2.14.5
|
||||
evaluate~=0.4.1
|
||||
protobuf
|
||||
|
||||
@ -119,7 +119,7 @@ def parse_arguments():
|
||||
'By default, we use dtype for KV cache. int8_kv_cache chooses int8 quantization for KV'
|
||||
)
|
||||
parser.add_argument(
|
||||
'--modelopt_quant_ckpt_path',
|
||||
'--quant_ckpt_path',
|
||||
type=str,
|
||||
default=None,
|
||||
help='Path of a quantized model checkpoint in .npz format')
|
||||
@ -438,7 +438,7 @@ def main():
|
||||
|
||||
if args.use_weight_only and args.weight_only_precision == 'int4_gptq':
|
||||
weights = load_weights_from_gptq(
|
||||
args.modelopt_quant_ckpt_path,
|
||||
args.quant_ckpt_path,
|
||||
PretrainedConfig.from_dict(copy.deepcopy(config)),
|
||||
)
|
||||
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
--extra-index-url https://pypi.nvidia.com
|
||||
tensorrt_llm==0.11.0.dev2024061800
|
||||
tensorrt_llm==0.11.0.dev2024062500
|
||||
datasets~=2.14.5
|
||||
evaluate~=0.4.1
|
||||
rouge_score~=0.1.2
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
--extra-index-url https://pypi.nvidia.com
|
||||
tensorrt_llm==0.11.0.dev2024061800
|
||||
tensorrt_llm==0.11.0.dev2024062500
|
||||
transformers>=4.31.0
|
||||
datasets~=2.14.5
|
||||
evaluate~=0.4.1
|
||||
|
||||
@ -71,7 +71,7 @@ def parse_arguments():
|
||||
"By default, we use dtype for KV cache. fp8_kv_cache chooses fp8 quantization for KV",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--modelopt_quant_ckpt_path",
|
||||
"--quant_ckpt_path",
|
||||
default=None,
|
||||
help=
|
||||
"Path of a directory to quantized model checkpoints in .safetensors format or \
|
||||
@ -944,7 +944,7 @@ def convert(worker_rank, args, convert_kwargs):
|
||||
weight_scales = quantize_fp8_weights(
|
||||
weights, trt_llm_config.num_hidden_layers,
|
||||
trt_llm_config.mapping)
|
||||
scales = load_from_fp8_gemma(args.modelopt_quant_ckpt_path,
|
||||
scales = load_from_fp8_gemma(args.quant_ckpt_path,
|
||||
trt_llm_config.num_hidden_layers,
|
||||
trt_llm_config.mapping,
|
||||
args.fp8_kv_cache, weight_scales)
|
||||
|
||||
@ -3,7 +3,7 @@
|
||||
# WAR the new posting of "nvidia-cudnn-cu12~=9.0".
|
||||
# "jax[cuda12_pip]~=0.4.19" specifies "nvidia-cudnn-cu12>=8.9" but actually requires "nvidia-cudnn-cu12~=8.9".
|
||||
nvidia-cudnn-cu12~=8.9; platform_machine == "x86_64"
|
||||
tensorrt_llm==0.11.0.dev2024061800
|
||||
tensorrt_llm==0.11.0.dev2024062500
|
||||
flax~=0.8.0
|
||||
# jax[cuda12_pip]~=0.4.19; platform_system != "Windows"
|
||||
jax~=0.4.19; platform_system == "Windows"
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
--extra-index-url https://pypi.nvidia.com
|
||||
tensorrt_llm==0.11.0.dev2024061800
|
||||
tensorrt_llm==0.11.0.dev2024062500
|
||||
datasets~=2.14.5
|
||||
evaluate~=0.4.1
|
||||
rouge_score~=0.1.2
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
--extra-index-url https://pypi.nvidia.com
|
||||
tensorrt_llm==0.11.0.dev2024061800
|
||||
tensorrt_llm==0.11.0.dev2024062500
|
||||
datasets~=2.14.5
|
||||
evaluate~=0.4.1
|
||||
rouge_score~=0.1.2
|
||||
|
||||
@ -167,7 +167,7 @@ sh gptq_convert.sh
|
||||
### 3. Convert weights from HF Transformers to TensorRT-LLM format
|
||||
|
||||
To apply GPTQ groupwise quantization, additional command-line flags need to be passed to `convert_checkpoint.py`:
|
||||
Here `--modelopt_quant_ckpt_path` flag specifies the output safetensors of `gptq_convert.sh` script.
|
||||
Here `--quant_ckpt_path` flag specifies the output safetensors of `gptq_convert.sh` script.
|
||||
|
||||
```bash
|
||||
# Single GPU
|
||||
@ -175,7 +175,7 @@ python3 convert_checkpoint.py --model_dir ./gptneox_model \
|
||||
--dtype float16 \
|
||||
--use_weight_only \
|
||||
--weight_only_precision int4_gptq \
|
||||
--modelopt_quant_ckpt_path ./gptneox_model/gptneox-20b-4bit-gs128.safetensors \
|
||||
--quant_ckpt_path ./gptneox_model/gptneox-20b-4bit-gs128.safetensors \
|
||||
--output_dir ./gptneox/20B/trt_ckpt/int4_gptq/1-gpu/
|
||||
# With 2-way Tensor Parallel
|
||||
python3 convert_checkpoint.py --model_dir ./gptneox_model \
|
||||
@ -184,7 +184,7 @@ python3 convert_checkpoint.py --model_dir ./gptneox_model \
|
||||
--weight_only_precision int4_gptq \
|
||||
--tp_size 2 \
|
||||
--workers 2 \
|
||||
--modelopt_quant_ckpt_path ./gptneox_model/gptneox-20b-4bit-gs128.safetensors \
|
||||
--quant_ckpt_path ./gptneox_model/gptneox-20b-4bit-gs128.safetensors \
|
||||
--output_dir ./gptneox/20B/trt_ckpt/int4_gptq/2-gpu/
|
||||
```
|
||||
|
||||
|
||||
@ -50,7 +50,7 @@ def parse_arguments():
|
||||
'Define the precision for the weights when using weight-only quantization.'
|
||||
'You must also use --use_weight_only for that argument to have an impact.'
|
||||
)
|
||||
parser.add_argument('--modelopt_quant_ckpt_path',
|
||||
parser.add_argument('--quant_ckpt_path',
|
||||
type=str,
|
||||
default=None,
|
||||
help='Path of a quantized model checkpoint')
|
||||
@ -708,8 +708,7 @@ if __name__ == '__main__':
|
||||
'has_zero_point':
|
||||
True,
|
||||
'group_size':
|
||||
get_gptq_gptneox_group_size(args.modelopt_quant_ckpt_path,
|
||||
hf_config)
|
||||
get_gptq_gptneox_group_size(args.quant_ckpt_path, hf_config)
|
||||
})
|
||||
|
||||
with open(os.path.join(args.output_dir, 'config.json'), 'w') as f:
|
||||
@ -723,7 +722,7 @@ if __name__ == '__main__':
|
||||
|
||||
if args.use_weight_only and args.weight_only_precision == 'int4_gptq':
|
||||
weights = load_from_gptq_gptneox(
|
||||
args.modelopt_quant_ckpt_path,
|
||||
args.quant_ckpt_path,
|
||||
hf_config,
|
||||
use_parallel_embedding=args.use_parallel_embedding,
|
||||
sharding_dim=args.embedding_sharding_dim,
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
--extra-index-url https://pypi.nvidia.com
|
||||
tensorrt_llm==0.11.0.dev2024061800
|
||||
tensorrt_llm==0.11.0.dev2024062500
|
||||
datasets~=2.14.5
|
||||
rouge_score~=0.1.2
|
||||
evaluate~=0.4.1
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
-f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html
|
||||
--extra-index-url https://pypi.nvidia.com
|
||||
tensorrt_llm==0.11.0.dev2024061800
|
||||
tensorrt_llm==0.11.0.dev2024062500
|
||||
datasets==2.14.6
|
||||
evaluate~=0.4.1
|
||||
rouge_score~=0.1.2
|
||||
|
||||
@ -1,2 +1,2 @@
|
||||
--extra-index-url https://pypi.nvidia.com
|
||||
tensorrt_llm==0.11.0.dev2024061800
|
||||
tensorrt_llm==0.11.0.dev2024062500
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
--extra-index-url https://pypi.nvidia.com
|
||||
tensorrt_llm==0.11.0.dev2024061800
|
||||
tensorrt_llm==0.11.0.dev2024062500
|
||||
datasets==2.14.5
|
||||
rouge_score~=0.1.2
|
||||
sentencepiece~=0.1.99
|
||||
|
||||
@ -644,7 +644,7 @@ One can enable AWQ/GPTQ INT4 weight only quantization with these options when bu
|
||||
- `--use_weight_only` enables weight only GEMMs in the network.
|
||||
- `--per_group` enables groupwise weight-only quantization; for the GPT-J example, we support AWQ with a default group size of 128.
|
||||
- `--weight_only_precision` should specify the weight only quantization format. Supported formats are `int4_awq` or `int4_gptq`.
|
||||
- `--modelopt_quant_ckpt_path` passes the quantized checkpoint to build the engine.
|
||||
- `--quant_ckpt_path` passes the quantized checkpoint to build the engine.
|
||||
|
||||
The AWQ/GPTQ examples below involve 2 steps:
|
||||
1. Weight quantization
|
||||
@ -700,7 +700,7 @@ To run the GPTQ LLaMa example, the following steps are required:
|
||||
python convert_checkpoint.py --model_dir /tmp/llama-7b-hf \
|
||||
--output_dir ./tllm_checkpoint_2gpu_gptq \
|
||||
--dtype float16 \
|
||||
--modelopt_quant_ckpt_path ./llama-7b-4bit-gs128.safetensors \
|
||||
--quant_ckpt_path ./llama-7b-4bit-gs128.safetensors \
|
||||
--use_weight_only \
|
||||
--weight_only_precision int4_gptq \
|
||||
--per_group \
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
--extra-index-url https://pypi.nvidia.com
|
||||
tensorrt_llm==0.11.0.dev2024061800
|
||||
tensorrt_llm==0.11.0.dev2024062500
|
||||
datasets==2.14.6
|
||||
evaluate~=0.4.1
|
||||
rouge_score~=0.1.2
|
||||
|
||||
@ -29,9 +29,6 @@ Please install required packages first and setup `git-lfs`:
|
||||
|
||||
```bash
|
||||
pip install -r requirements.txt
|
||||
pip install "transformers>=4.39.0"
|
||||
|
||||
# Setup git-lfs
|
||||
git lfs install
|
||||
```
|
||||
|
||||
@ -98,8 +95,7 @@ The `trtllm-build` command builds TensorRT-LLM engines from TensorRT-LLM checkpo
|
||||
# mamba-2.8b
|
||||
trtllm-build --checkpoint_dir ./mamba_model/mamba-2.8b/trt_ckpt/bf16/1-gpu/ \
|
||||
--paged_kv_cache disable \
|
||||
--gemm_plugin bfloat16 \
|
||||
--mamba_conv1d_plugin bfloat16 \
|
||||
--gemm_plugin auto \
|
||||
--max_batch_size 8 \
|
||||
--max_input_len 924 \
|
||||
--max_seq_len 1024 \
|
||||
@ -108,8 +104,7 @@ trtllm-build --checkpoint_dir ./mamba_model/mamba-2.8b/trt_ckpt/bf16/1-gpu/ \
|
||||
# mamba-1.4b
|
||||
trtllm-build --checkpoint_dir ./mamba_model/mamba-1.4b/trt_ckpt/fp16/1-gpu/ \
|
||||
--paged_kv_cache disable \
|
||||
--gemm_plugin float16 \
|
||||
--mamba_conv1d_plugin float16 \
|
||||
--gemm_plugin auto \
|
||||
--max_batch_size 8 \
|
||||
--max_input_len 924 \
|
||||
--max_seq_len 1024 \
|
||||
@ -118,8 +113,7 @@ trtllm-build --checkpoint_dir ./mamba_model/mamba-1.4b/trt_ckpt/fp16/1-gpu/ \
|
||||
# mamba-790m
|
||||
trtllm-build --checkpoint_dir ./mamba_model/mamba-790m/trt_ckpt/fp16/1-gpu/ \
|
||||
--paged_kv_cache disable \
|
||||
--gemm_plugin float16 \
|
||||
--mamba_conv1d_plugin float16 \
|
||||
--gemm_plugin auto \
|
||||
--max_batch_size 8 \
|
||||
--max_input_len 924 \
|
||||
--max_seq_len 1024 \
|
||||
@ -128,8 +122,7 @@ trtllm-build --checkpoint_dir ./mamba_model/mamba-790m/trt_ckpt/fp16/1-gpu/ \
|
||||
# mamba-370m
|
||||
trtllm-build --checkpoint_dir ./mamba_model/mamba-370m/trt_ckpt/fp16/1-gpu/ \
|
||||
--paged_kv_cache disable \
|
||||
--gemm_plugin float16 \
|
||||
--mamba_conv1d_plugin float16 \
|
||||
--gemm_plugin auto \
|
||||
--max_batch_size 8 \
|
||||
--max_input_len 924 \
|
||||
--max_seq_len 1024 \
|
||||
@ -138,8 +131,7 @@ trtllm-build --checkpoint_dir ./mamba_model/mamba-370m/trt_ckpt/fp16/1-gpu/ \
|
||||
# mamba-130m
|
||||
trtllm-build --checkpoint_dir ./mamba_model/mamba-130m/trt_ckpt/fp16/1-gpu/ \
|
||||
--paged_kv_cache disable \
|
||||
--gemm_plugin float16 \
|
||||
--mamba_conv1d_plugin float16 \
|
||||
--gemm_plugin auto \
|
||||
--max_batch_size 8 \
|
||||
--max_input_len 924 \
|
||||
--max_seq_len 1024 \
|
||||
|
||||
@ -1,5 +1,6 @@
|
||||
--extra-index-url https://pypi.nvidia.com
|
||||
tensorrt_llm==0.11.0.dev2024061800
|
||||
tensorrt_llm==0.11.0.dev2024062500
|
||||
transformers>=4.39.0
|
||||
datasets~=2.14.5
|
||||
evaluate
|
||||
rouge_score
|
||||
|
||||
@ -13,6 +13,7 @@ For more info about Medusa visit [speculative decoding documentation](../../docs
|
||||
* GPU Compute Capability >= 8.0 (Ampere or newer)
|
||||
* FP16
|
||||
* BF16
|
||||
* FP8 (base model)
|
||||
* PAGED_KV_CACHE
|
||||
* Tensor Parallel
|
||||
|
||||
@ -32,7 +33,7 @@ https://huggingface.co/FasterDecoding/medusa-vicuna-7b-v1.3
|
||||
```
|
||||
|
||||
We use the `convert_checkpoint.py` script to convert the model for Medusa decoding into the TensorRT-LLM checkpoint format.
Here we also add `--fixed_num_medusa_heads 4` as `medusa_num_heads` is 2 in the `config.json` of `medusa-vicuna-7b-v1.3` but it actually has 4.
We can use `--num_medusa_heads` to set the number of Medusa heads that we want to use. If it is not set, `num_medusa_heads` is set according to `medusa_num_heads` from the Medusa weights' `config.json`.
|
||||
|
||||
Here is the example:
|
||||
```bash
|
||||
@ -41,20 +42,21 @@ python convert_checkpoint.py --model_dir ./vicuna-7b-v1.3 \
|
||||
--medusa_model_dir medusa-vicuna-7b-v1.3 \
|
||||
--output_dir ./tllm_checkpoint_1gpu_medusa \
|
||||
--dtype float16 \
|
||||
--fixed_num_medusa_heads 4
|
||||
--num_medusa_heads 4
|
||||
|
||||
# Note: Increasing the batch size may have a negative impact on performance
|
||||
trtllm-build --checkpoint_dir ./tllm_checkpoint_1gpu_medusa \
|
||||
--output_dir ./tmp/medusa/7B/trt_engines/fp16/1-gpu/ \
|
||||
--gemm_plugin float16 \
|
||||
--speculative_decoding_mode medusa \
|
||||
--max_batch_size 8
|
||||
--max_batch_size 4
|
||||
|
||||
# Convert and Build Medusa decoding support for vicuna-13b-v1.3 with 4-way tensor parallelism.
|
||||
python convert_checkpoint.py --model_dir ./vicuna-7b-v1.3 \
|
||||
--medusa_model_dir medusa-vicuna-7b-v1.3 \
|
||||
--output_dir ./tllm_checkpoint_1gpu_medusa \
|
||||
--dtype float16 \
|
||||
--fixed_num_medusa_heads 4 \
|
||||
--num_medusa_heads 4 \
|
||||
--tp_size 4 \
|
||||
--workers 4
|
||||
|
||||
@ -62,7 +64,30 @@ trtllm-build --checkpoint_dir ./tllm_checkpoint_1gpu_medusa \
|
||||
--output_dir ./tmp/medusa/7B/trt_engines/fp16/1-gpu/ \
|
||||
--gemm_plugin float16 \
|
||||
--speculative_decoding_mode medusa \
|
||||
--max_batch_size 8
|
||||
--max_batch_size 4
|
||||
```
|
||||
|
||||
### FP8 Post-Training Quantization for Base Model
|
||||
The example below quantizes the base model to FP8, while keeping the weights of the Medusa heads unquantized.
|
||||
```bash
|
||||
# Quantize base model into FP8 and export trtllm checkpoint
|
||||
python ../quantization/quantize.py --model_dir /path/to/base-model-hf/ \
|
||||
--dtype float16 \
|
||||
--qformat fp8 \
|
||||
--kv_cache_dtype fp8 \
|
||||
--output_dir ./tllm_checkpoint_1gpu_base_model_fp8_medusa_fp16 \
|
||||
--calib_size 512 \
|
||||
--tp_size 1 \
|
||||
--medusa_model_dir /path/to/medusa_head/ \
|
||||
--num_medusa_heads 4
|
||||
|
||||
# Build trtllm engines from the trtllm checkpoint
|
||||
trtllm-build --checkpoint_dir ./tllm_checkpoint_1gpu_base_model_fp8_medusa_fp16 \
|
||||
--output_dir ./trt_engine_1gpu_base_model_fp8_medusa_fp16 \
|
||||
--gemm_plugin float16 \
|
||||
--gpt_attention_plugin float16 \
|
||||
--speculative_decoding_mode medusa \
|
||||
--max_batch_size 4
|
||||
```
|
||||
|
||||
### Run
|
||||
|
||||
@ -19,12 +19,13 @@ from transformers.models.llama.modeling_llama import LlamaDecoderLayer
|
||||
from transformers.pytorch_utils import Conv1D
|
||||
|
||||
import tensorrt_llm
|
||||
from tensorrt_llm._utils import str_dtype_to_torch
|
||||
from tensorrt_llm.logger import logger
|
||||
from tensorrt_llm.mapping import Mapping
|
||||
from tensorrt_llm.models import PretrainedConfig
|
||||
from tensorrt_llm.models.convert_utils import load_calib_dataset
|
||||
from tensorrt_llm.models.llama.convert import load_weights_from_hf_by_shard
|
||||
from tensorrt_llm.models.medusa.weight import (get_tllm_linear_weight,
|
||||
load_medusa_hf)
|
||||
from tensorrt_llm.quantization import QuantAlgo
|
||||
|
||||
try:
|
||||
@ -108,11 +109,6 @@ def parse_arguments():
|
||||
help=
|
||||
'By default, we use dtype for KV cache. int8_kv_cache chooses int8 quantization for KV'
|
||||
)
|
||||
parser.add_argument(
|
||||
'--modelopt_quant_ckpt_path',
|
||||
type=str,
|
||||
default=None,
|
||||
help='Path of a quantized model checkpoint in .npz format')
|
||||
|
||||
parser.add_argument(
|
||||
'--per_group',
|
||||
@ -182,13 +178,6 @@ def parse_arguments():
|
||||
help='The number of workers for converting checkpoint in parallel')
|
||||
|
||||
parser.add_argument('--num_medusa_heads', type=int, default=4)
|
||||
parser.add_argument(
|
||||
'--fixed_num_medusa_heads',
|
||||
type=int,
|
||||
default=None,
|
||||
help="If exist, fix medusa_num_heads from config.json."
|
||||
"num_medusa_heads < medusa_num_heads in config.json < fixed_num_medusa_heads"
|
||||
)
|
||||
parser.add_argument('--num_medusa_layers', type=int, default=1)
|
||||
parser.add_argument('--max_medusa_token_len', type=int, default=63)
|
||||
parser.add_argument('--medusa_hidden_act', type=str, default="silu")
|
||||
@ -570,29 +559,6 @@ def get_weight_and_bias(config, prefix, dtype):
|
||||
return get_weight(config, prefix, dtype), get_bias(config, prefix, dtype)
|
||||
|
||||
|
||||
def get_tllm_linear_weight(weight,
|
||||
prefix,
|
||||
bias=None,
|
||||
use_weight_only=False,
|
||||
plugin_weight_only_quant_type=torch.int8,
|
||||
postfix='weight'):
|
||||
results = {}
|
||||
if use_weight_only:
|
||||
v = weight.t().contiguous().cpu()
|
||||
processed_torch_weights, torch_weight_scales = \
|
||||
torch.ops.trtllm.symmetric_quantize_last_axis_of_batched_matrix(
|
||||
v, plugin_weight_only_quant_type)
|
||||
results[prefix + postfix] = processed_torch_weights
|
||||
results[prefix + 'per_channel_scale'] = torch_weight_scales
|
||||
else:
|
||||
results[prefix + postfix] = weight.contiguous()
|
||||
|
||||
if bias is not None:
|
||||
results[prefix + 'bias'] = bias
|
||||
|
||||
return results
|
||||
|
||||
|
||||
def dup_kv_weight(v, num_head, tp_size):
|
||||
assert tp_size % num_head == 0
|
||||
reps = tp_size // num_head
|
||||
@ -1189,77 +1155,28 @@ if __name__ == '__main__':
|
||||
qkv_para=convert_args['llama_qkv_para'],
|
||||
smoother=convert_args['llama_smoother'])
|
||||
|
||||
def load_medusa_hf(medusa_path: str,
|
||||
mapping=Mapping(),
|
||||
dtype='float32'):
|
||||
logger.info("Loading Medusa heads' weights ...")
|
||||
is_ckpt_safetensors = False
|
||||
|
||||
ckpt_file = Path(medusa_path) / "medusa_lm_head.pt"
|
||||
if not ckpt_file.exists():
|
||||
ckpt_file = Path(
|
||||
medusa_path) / "medusa_lm_head.safetensors"
|
||||
is_ckpt_safetensors = True
|
||||
|
||||
if is_ckpt_safetensors:
|
||||
logger.info("Safetensors Found ...")
|
||||
from safetensors.torch import load_file
|
||||
state_dict = load_file(ckpt_file)
|
||||
else:
|
||||
state_dict = torch.load(ckpt_file, map_location="cpu")
|
||||
|
||||
torch_dtype = str_dtype_to_torch(dtype)
|
||||
weights = {}
|
||||
|
||||
for h in range(args.num_medusa_heads):
|
||||
for l in range(args.num_medusa_layers):
|
||||
w = state_dict[f"{h}.{l}.linear.weight"].clone().to(
|
||||
torch_dtype)
|
||||
|
||||
split_v = split(w, mapping.tp_size, mapping.tp_rank)
|
||||
weights.update(
|
||||
get_tllm_linear_weight(
|
||||
split_v,
|
||||
f'medusa_heads.{h}.medusa_layers.{l}.linear.',
|
||||
None, args.use_weight_only,
|
||||
plugin_weight_only_quant_type))
|
||||
|
||||
b = state_dict[f"{h}.{l}.linear.bias"].clone().to(
|
||||
torch_dtype)
|
||||
|
||||
weights[
|
||||
'medusa_heads.{}.medusa_layers.{}.linear.bias'.
|
||||
format(h, l)] = split(b, mapping.tp_size,
|
||||
mapping.tp_rank)
|
||||
|
||||
lm = state_dict[
|
||||
f"{h}.{args.num_medusa_layers}.weight"].clone().to(
|
||||
torch_dtype) # LM Head
|
||||
|
||||
weights['medusa_heads.{}.lm_head.weight'.format(
|
||||
h)] = split(lm, mapping.tp_size, mapping.tp_rank)
|
||||
|
||||
return weights
|
||||
|
||||
if args.medusa_model_dir is not None:
|
||||
config_file = Path(args.medusa_model_dir) / "config.json"
|
||||
with open(config_file) as fp:
|
||||
config = json.load(fp)
|
||||
args.num_medusa_heads = config.get('medusa_num_heads',
|
||||
args.num_medusa_heads)
|
||||
num_medusa_heads_from_config = config.get(
|
||||
'medusa_num_heads', args.num_medusa_heads)
|
||||
args.num_medusa_layers = config.get('medusa_num_layers',
|
||||
args.num_medusa_layers)
|
||||
if args.fixed_num_medusa_heads is not None and args.fixed_num_medusa_heads != args.num_medusa_heads:
|
||||
logger.info(
|
||||
f"fixing num_medusa_heads from {args.num_medusa_heads} to {args.fixed_num_medusa_heads}"
|
||||
)
|
||||
args.num_medusa_heads = args.fixed_num_medusa_heads
|
||||
if args.num_medusa_heads is None:
|
||||
args.num_medusa_heads = num_medusa_heads_from_config
|
||||
|
||||
assert args.max_medusa_token_len > 0, "should have max_medusa_token_len > 0"
|
||||
|
||||
medusa_weights = load_medusa_hf(args.medusa_model_dir,
|
||||
mapping,
|
||||
dtype=args.dtype)
|
||||
medusa_weights = load_medusa_hf(
|
||||
medusa_path=args.medusa_model_dir,
|
||||
num_medusa_heads=args.num_medusa_heads,
|
||||
num_medusa_layers=args.num_medusa_layers,
|
||||
mapping=mapping,
|
||||
dtype=args.dtype,
|
||||
use_weight_only=args.use_weight_only,
|
||||
plugin_weight_only_quant_type=
|
||||
plugin_weight_only_quant_type)
|
||||
weights.update(medusa_weights)
|
||||
|
||||
safetensors.torch.save_file(
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
--extra-index-url https://pypi.nvidia.com
|
||||
tensorrt_llm==0.11.0.dev2024061800
|
||||
tensorrt_llm==0.11.0.dev2024062500
|
||||
datasets~=2.14.5
|
||||
rouge_score~=0.1.2
|
||||
sentencepiece~=0.1.99
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
--extra-index-url https://pypi.nvidia.com
|
||||
tensorrt_llm==0.11.0.dev2024061800
|
||||
tensorrt_llm==0.11.0.dev2024062500
|
||||
transformers==4.38.2
|
||||
accelerate==0.25.0
|
||||
|
||||
@ -363,11 +363,13 @@ def main():
|
||||
|
||||
if args.test_trt_llm:
|
||||
assert not args.test_hf, "Cannot test both TRT-LLM and HF"
|
||||
runner_cls = ModelRunner if (args.debug_mode
|
||||
or not PYTHON_BINDINGS) else ModelRunnerCpp
|
||||
runner_cls = ModelRunner if not PYTHON_BINDINGS else ModelRunnerCpp
|
||||
runner_kwargs = {}
|
||||
if PYTHON_BINDINGS:
|
||||
runner_kwargs.update(max_beam_width=1)
|
||||
model = runner_cls.from_dir(args.engine_dir,
|
||||
rank=runtime_rank,
|
||||
debug_mode=args.debug_mode)
|
||||
**runner_kwargs)
|
||||
else:
|
||||
assert args.test_hf, "Must test either TRT-LLM or HF"
|
||||
if model_name == 'ChatGLMForCausalLM' and model_version == 'glm':
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
--extra-index-url https://pypi.nvidia.com
|
||||
tensorrt_llm==0.11.0.dev2024061800
|
||||
tensorrt_llm==0.11.0.dev2024062500
|
||||
datasets~=2.14.5
|
||||
evaluate~=0.4.1
|
||||
rouge_score~=0.1.2
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
--extra-index-url https://pypi.nvidia.com
|
||||
tensorrt_llm==0.11.0.dev2024061800
|
||||
tensorrt_llm==0.11.0.dev2024062500
|
||||
transformers==4.40.2
|
||||
datasets~=2.14.5
|
||||
evaluate~=0.4.1
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
--extra-index-url https://pypi.nvidia.com
|
||||
tensorrt_llm==0.11.0.dev2024061800
|
||||
tensorrt_llm==0.11.0.dev2024062500
|
||||
datasets~=2.14.5
|
||||
evaluate~=0.4.1
|
||||
rouge_score~=0.1.2
|
||||
|
||||
@ -15,10 +15,9 @@ models using TensorRT-LLM and run on a single GPU.
|
||||
|
||||
## Overview
|
||||
|
||||
The TensorRT-LLM Phi implementation can be found in [`tensorrt_llm/models/phi/model.py`](../../tensorrt_llm/models/phi/model.py) and [`tensorrt_llm/models/phi3/model.py`](../../tensorrt_llm/models/phi3/model.py). The TensorRT-LLM Phi example code is located in [`examples/phi`](./). There are two files:
|
||||
The TensorRT-LLM Phi implementation can be found in [`tensorrt_llm/models/phi/model.py`](../../tensorrt_llm/models/phi/model.py) and [`tensorrt_llm/models/phi3/model.py`](../../tensorrt_llm/models/phi3/model.py). The TensorRT-LLM Phi example code is located in [`examples/phi`](./) with a single file:
|
||||
|
||||
* [`convert_checkpoint.py`](./convert_checkpoint.py) to convert a checkpoint from the [HuggingFace (HF) Transformers](https://github.com/huggingface/transformers) format to the TensorRT-LLM format
|
||||
* [`postprocess_quant_checkpoint.py`](./postprocess_quant_checkpoint.py) to post-process FP8 or INT8 SmoothQuant quantized checkpoints for Phi-3-small variants.
|
||||
|
||||
In addition, there are two shared files in the parent folder [`examples`](../) for inference and evaluation:
|
||||
|
||||
@ -29,18 +28,19 @@ In addition, there are two shared files in the parent folder [`examples`](../) f
|
||||
* FP16
|
||||
* BF16
|
||||
* FP8
|
||||
* INT8
|
||||
* Tensor Parallel
|
||||
## Support Matrix
|
||||
|
||||
| Model Name | FP16 | BF16 | FP8 | TP |
|
||||
| :--------------: | :---: | :---: | :---: | :---: |
|
||||
| phi-2 | Y | Y | | Y |
|
||||
| Phi-3-mini-4k-instruct | Y | Y | | |
|
||||
| Phi-3-mini-128k-instruct | Y | Y | | |
|
||||
| Phi-3-small-8k-instruct | Y | Y | Y | Y |
|
||||
| Phi-3-small-128k-instruct | Y | Y | Y | Y |
|
||||
| Phi-3-medium-8k-instruct | Y | Y | | | Y |
|
||||
| Phi-3-medium-128k-instruct | Y | Y | | | Y |
|
||||
| Model Name | FP16 | BF16 | FP8 | INT8 | TP |
|
||||
| :--------------: | :---: | :---: | :---: | :---: | :---: |
|
||||
| phi-2 | Y | Y | | | Y |
|
||||
| Phi-3-mini-4k-instruct | Y | Y | Y | Y |
|
||||
| Phi-3-mini-128k-instruct | Y | Y | Y | Y |
|
||||
| Phi-3-small-8k-instruct | Y | Y | Y | Y | Y |
|
||||
| Phi-3-small-128k-instruct | Y | Y | Y | Y | Y |
|
||||
| Phi-3-medium-8k-instruct | Y | Y | Y | Y |
|
||||
| Phi-3-medium-128k-instruct | Y | Y | Y | Y |
|
||||
|
||||
* Model Name: the name of the model, the same as the name on HuggingFace
|
||||
* TP: Tensor Parallel
|
||||
@ -128,9 +128,9 @@ python3 ../summarize.py --engine_dir ./phi-engine-tp2 \
|
||||
```
|
||||
|
||||
|
||||
### 5. Quantization options for Phi-3-small
|
||||
### 5. Quantization
|
||||
|
||||
Phi-3-small variants support post-training quantization to FP8 and INT8 SmoothQuant formats.
|
||||
All Phi-3 variants support post-training quantization to FP8 and INT8 SmoothQuant formats.
|
||||
|
||||
FP8 checkpoints can be built as follows:
|
||||
|
||||
@ -141,8 +141,6 @@ python3 ../quantization/quantize.py \
|
||||
--output_dir ./phi3-checkpoint \
|
||||
--dtype $DTYPE \
|
||||
--qformat fp8 --kv_cache_dtype fp8
|
||||
|
||||
python3 postprocess_quant_checkpoint.py --checkpoint_dir ./phi3-checkpoint
|
||||
```
|
||||
|
||||
INT8 checkpoints can be built as follows:
|
||||
@ -154,8 +152,6 @@ python3 ../quantization/quantize.py \
|
||||
--output_dir ./phi3-checkpoint \
|
||||
--dtype $DTYPE \
|
||||
--qformat int8_sq --kv_cache_dtype int8
|
||||
|
||||
python3 postprocess_quant_checkpoint.py --checkpoint_dir ./phi3-checkpoint
|
||||
```
|
||||
|
||||
The commands to [build TensorRT engines](#2-build-tensorrt-engines) from quantized checkpoints
|
||||
|
||||
@ -19,8 +19,7 @@ import time
|
||||
from transformers import AutoConfig
|
||||
|
||||
import tensorrt_llm
|
||||
from tensorrt_llm.models import (Phi3ForCausalLM, Phi3SmallForCausalLM,
|
||||
PhiForCausalLM)
|
||||
from tensorrt_llm.models import Phi3ForCausalLM, PhiForCausalLM
|
||||
|
||||
|
||||
def parse_arguments():
|
||||
@ -81,16 +80,14 @@ if __name__ == '__main__':
|
||||
model_config = AutoConfig.from_pretrained(args.model_dir,
|
||||
trust_remote_code=True)
|
||||
model_type = model_config.architectures[0]
|
||||
supported_model = {
|
||||
'PhiForCausalLM': PhiForCausalLM,
|
||||
'Phi3ForCausalLM': Phi3ForCausalLM,
|
||||
'Phi3VForCausalLM': Phi3ForCausalLM,
|
||||
'Phi3SmallForCausalLM': Phi3SmallForCausalLM
|
||||
}
|
||||
supported_models = [
|
||||
'PhiForCausalLM', 'Phi3ForCausalLM', 'Phi3VForCausalLM',
|
||||
'Phi3SmallForCausalLM'
|
||||
]
|
||||
modelForCausalLM = None
|
||||
if model_type not in supported_model:
|
||||
if model_type not in supported_models:
|
||||
assert False, "Invalid model type"
|
||||
modelForCausalLM = supported_model[model_type]
|
||||
modelForCausalLM = PhiForCausalLM if model_type == 'PhiForCausalLM' else Phi3ForCausalLM
|
||||
|
||||
modelForCausalLM.convert_hf_checkpoint(args.model_dir,
|
||||
dtype=args.dtype,
|
||||
|
||||
@ -1,63 +0,0 @@
|
||||
# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import time
|
||||
|
||||
import safetensors
|
||||
from safetensors.torch import save_file
|
||||
|
||||
import tensorrt_llm
|
||||
from tensorrt_llm.models.phi3.phi3small.convert import shuffle_qkv_weights
|
||||
|
||||
|
||||
def parse_arguments():
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument('--checkpoint_dir', type=str, default=None)
|
||||
args = parser.parse_args()
|
||||
|
||||
return args
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
print(tensorrt_llm.__version__)
|
||||
args = parse_arguments()
|
||||
tensorrt_llm.logger.set_level('info')
|
||||
|
||||
tik = time.time()
|
||||
with open(f"{args.checkpoint_dir}/config.json", "r") as f:
|
||||
config = json.load(f)
|
||||
|
||||
weights = {}
|
||||
with safetensors.safe_open(f"{args.checkpoint_dir}/rank0.safetensors",
|
||||
framework="pt") as f:
|
||||
for k in f.keys():
|
||||
weights[k] = f.get_tensor(k)
|
||||
|
||||
# Transform QKV weights from custom Phi3Small format to TRT-LLM format
|
||||
num_total_heads = config[
|
||||
'num_attention_heads'] + 2 * config['num_key_value_heads']
|
||||
for key, value in weights.items():
|
||||
if "qkv." in key:
|
||||
if 'scaling_factor' in key and value.shape[0] % num_total_heads != 0:
|
||||
continue
|
||||
weights[key] = shuffle_qkv_weights(value, config)
|
||||
|
||||
save_file(weights, f'{args.checkpoint_dir}/rank0.safetensors')
|
||||
|
||||
tok = time.time()
|
||||
t = time.strftime('%H:%M:%S', time.gmtime(tok - tik))
|
||||
print(f'Total time of converting checkpoints: {t}')
|
||||
@ -1,5 +1,5 @@
|
||||
--extra-index-url https://pypi.nvidia.com
|
||||
tensorrt_llm==0.11.0.dev2024061800
|
||||
tensorrt_llm==0.11.0.dev2024062500
|
||||
datasets~=2.14.5
|
||||
evaluate~=0.4.1
|
||||
rouge_score~=0.1.2
|
||||
|
||||
@ -90,6 +90,17 @@ if __name__ == "__main__":
|
||||
help="KV Cache dtype.",
|
||||
default=None,
|
||||
choices=["int8", "fp8", None])
|
||||
# Medusa
|
||||
parser.add_argument('--num_medusa_heads', type=int, default=4)
|
||||
parser.add_argument('--num_medusa_layers', type=int, default=1)
|
||||
parser.add_argument('--max_draft_len', type=int, default=63)
|
||||
parser.add_argument('--medusa_hidden_act', type=str, default="silu")
|
||||
parser.add_argument('--medusa_model_dir', type=str, default=None)
|
||||
parser.add_argument('--quant_medusa_head',
|
||||
default=False,
|
||||
action='store_true',
|
||||
help="whether to quantize the weights of medusa heads")
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
if args.model_dir is not None:
|
||||
@ -108,7 +119,13 @@ if __name__ == "__main__":
|
||||
tp_size=args.tp_size,
|
||||
pp_size=args.pp_size,
|
||||
seed=args.seed,
|
||||
tokenizer_max_seq_length=args.tokenizer_max_seq_length)
|
||||
tokenizer_max_seq_length=args.tokenizer_max_seq_length,
|
||||
num_medusa_heads=args.num_medusa_heads,
|
||||
num_medusa_layers=args.num_medusa_layers,
|
||||
max_draft_len=args.max_draft_len,
|
||||
medusa_hidden_act=args.medusa_hidden_act,
|
||||
medusa_model_dir=args.medusa_model_dir,
|
||||
quant_medusa_head=args.quant_medusa_head)
|
||||
elif args.nemo_ckpt_path is not None:
|
||||
quantize_nemo_and_export(nemo_ckpt_path=args.nemo_ckpt_path,
|
||||
decoder_type=args.decoder_type,
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
--extra-index-url https://pypi.nvidia.com
|
||||
tensorrt_llm==0.11.0.dev2024061800
|
||||
tensorrt_llm==0.11.0.dev2024062500
|
||||
datasets>=2.14.4
|
||||
nemo-toolkit[all]<=1.20.0,>=1.18.0
|
||||
rouge_score~=0.1.2
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
--extra-index-url https://pypi.nvidia.com
|
||||
tensorrt_llm==0.11.0.dev2024061800
|
||||
tensorrt_llm==0.11.0.dev2024062500
|
||||
datasets~=2.16.0
|
||||
evaluate~=0.4.1
|
||||
rouge_score~=0.1.2
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
--extra-index-url https://pypi.nvidia.com
|
||||
tensorrt_llm==0.11.0.dev2024061800
|
||||
tensorrt_llm==0.11.0.dev2024062500
|
||||
datasets~=2.16.0
|
||||
evaluate~=0.4.1
|
||||
rouge_score~=0.1.2
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
--extra-index-url https://pypi.nvidia.com
|
||||
tensorrt_llm==0.11.0.dev2024061800
|
||||
tensorrt_llm==0.11.0.dev2024062500
|
||||
git+https://github.com/google-deepmind/recurrentgemma.git
|
||||
flax>=0.8.2
|
||||
jax~=0.4.23
|
||||
|
||||
@ -247,7 +247,7 @@ def main(args):
|
||||
|
||||
model_name, model_version = read_model_name(
|
||||
args.engine_dir) if not is_enc_dec else ("", "")
|
||||
if args.tokenizer_dir is None:
|
||||
if args.tokenizer_dir is None and model_name in DEFAULT_HF_MODEL_DIRS:
|
||||
logger.warning(
|
||||
"tokenizer_dir is not specified. Try to infer from model_name, but this may be incorrect."
|
||||
)
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
--extra-index-url https://pypi.nvidia.com
|
||||
tensorrt_llm==0.11.0.dev2024061800
|
||||
tensorrt_llm==0.11.0.dev2024062500
|
||||
datasets~=2.16.1
|
||||
evaluate~=0.4.1
|
||||
rouge_score~=0.1.2
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
--extra-index-url https://pypi.nvidia.com
|
||||
tensorrt_llm==0.11.0.dev2024061800
|
||||
tensorrt_llm==0.11.0.dev2024062500
|
||||
datasets==2.14.6
|
||||
evaluate~=0.4.1
|
||||
rouge_score~=0.1.2
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
--extra-index-url https://pypi.nvidia.com
|
||||
tensorrt_llm==0.11.0.dev2024061800
|
||||
tensorrt_llm==0.11.0.dev2024062500
|
||||
tiktoken
|
||||
datasets
|
||||
kaldialign
|
||||
|
||||
@ -16,12 +16,13 @@ pandas
|
||||
h5py==3.10.0
|
||||
StrEnum
|
||||
sentencepiece>=0.1.99
|
||||
tensorrt==10.0.1
|
||||
# https://github.com/pytorch/pytorch/blob/v2.3.0/version.txt uses 2.3.0a0.
|
||||
# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-04.html#rel-24-04 uses 2.3.0a0.
|
||||
torch>=2.3.0a,<=2.3.0
|
||||
tensorrt==10.1.0
|
||||
# https://github.com/pytorch/pytorch/blob/v2.3.1/version.txt uses 2.3.0a0.
|
||||
# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-05.html#rel-24-05 uses 2.4.0a0.
|
||||
torch>=2.3.0a0,<=2.4.0a0
|
||||
nvidia-modelopt~=0.11,<0.12
|
||||
transformers>=4.38.2
|
||||
pillow==10.2.0
|
||||
wheel
|
||||
optimum
|
||||
evaluate
|
||||
|
||||
@ -1146,8 +1146,8 @@ class GraphGroup(ABC):
|
||||
num_kv_heads = plugin_info.pfc_as_ndarray["num_kv_heads"].copy()
|
||||
tp_size = plugin_info.pfc_as_ndarray["tp_size"].copy()
|
||||
tp_rank = plugin_info.pfc_as_ndarray["tp_rank"].copy()
|
||||
num_kv_heads = num_kv_heads // kv_partition
|
||||
num_heads = num_heads // partition
|
||||
num_kv_heads = np.maximum(num_kv_heads // kv_partition, 1)
|
||||
num_heads = np.maximum(num_heads // partition, 1)
|
||||
tp_size[0] = partition
|
||||
tp_rank[0] = index
|
||||
|
||||
|
||||
@ -15,6 +15,11 @@ class PluginNode(Node):
|
||||
layer.name)
|
||||
layer.to_base_class()
|
||||
|
||||
def _collect_strategies(self, device_mesh):
|
||||
raise NotImplementedError(
|
||||
f"Auto parallel does not support {self.plugin_type} plugin right now."
|
||||
)
|
||||
|
||||
def _default_strategy(self, device_mesh):
|
||||
strategies_vector = StrategiesVector(self)
|
||||
dim_partition_dict_mapping = {}
|
||||
|
||||
@ -1,5 +1,6 @@
|
||||
from enum import Enum, auto
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
|
||||
from tensorrt_llm.functional import PositionEmbeddingType
|
||||
@ -369,8 +370,8 @@ class GPTAttentionPlugin(PluginNode):
|
||||
num_kv_heads = self.plugin_info.pfc_as_ndarray["num_kv_heads"].copy()
|
||||
tp_size = self.plugin_info.pfc_as_ndarray["tp_size"].copy()
|
||||
tp_rank = self.plugin_info.pfc_as_ndarray["tp_rank"].copy()
|
||||
num_kv_heads = num_kv_heads // kv_partition
|
||||
num_heads = num_heads // partition
|
||||
num_kv_heads = np.maximum(num_kv_heads // kv_partition, 1)
|
||||
num_heads = np.maximum(num_heads // partition, 1)
|
||||
tp_size[0] = partition
|
||||
tp_rank[0] = 0
|
||||
|
||||
|
||||
@ -30,10 +30,9 @@ from ..builder import BuildConfig, Engine, build
|
||||
from ..logger import logger
|
||||
from ..lora_manager import LoraConfig, LoraManager
|
||||
from ..models import MODEL_MAP, PretrainedConfig
|
||||
from ..models.modeling_utils import (WEIGHT_LOADER_MODELS, QuantConfig,
|
||||
from ..models.modeling_utils import (WEIGHT_LOADER_MODELS,
|
||||
SpeculativeDecodingMode)
|
||||
from ..plugin import PluginConfig, add_plugin_argument
|
||||
from ..quantization import QuantAlgo
|
||||
|
||||
|
||||
def parse_arguments():
|
||||
@ -133,10 +132,6 @@ def parse_arguments():
|
||||
type=str,
|
||||
default=None,
|
||||
choices=['float16', 'float32'])
|
||||
parser.add_argument('--weight_only_precision',
|
||||
type=str,
|
||||
default=None,
|
||||
choices=['int8', 'int4'])
|
||||
parser.add_argument('--weight_sparsity', default=False, action='store_true')
|
||||
parser.add_argument(
|
||||
'--max_draft_len',
|
||||
@ -270,14 +265,6 @@ def build_model(build_config: BuildConfig,
|
||||
if logits_dtype is not None:
|
||||
model_config.logits_dtype = logits_dtype
|
||||
|
||||
weight_only_precision = kwargs.get('weight_only_precision', None)
|
||||
if not model_config.quant_mode.has_any_quant(
|
||||
) and weight_only_precision is not None:
|
||||
if weight_only_precision == 'int4':
|
||||
model_config.quantization = QuantConfig(QuantAlgo.W4A16)
|
||||
else:
|
||||
model_config.quantization = QuantConfig(QuantAlgo.W8A16)
|
||||
|
||||
architecture = model_config.architecture
|
||||
assert not build_config.plugin_config.streamingllm or architecture == "LlamaForCausalLM", \
|
||||
"StreamingLLM is only supported in the llama model."
|
||||
@ -420,7 +407,6 @@ def main():
|
||||
kwargs = {
|
||||
'logits_dtype': args.logits_dtype,
|
||||
'use_fused_mlp': args.use_fused_mlp,
|
||||
'weight_only_precision': args.weight_only_precision,
|
||||
'tp_size': args.tp_size,
|
||||
'pp_size': args.pp_size,
|
||||
'lora_dir': args.lora_dir,
|
||||
|
||||
@ -40,7 +40,6 @@ from .modeling_utils import (PretrainedConfig, PretrainedModel,
|
||||
from .mpt.model import MPTForCausalLM, MPTModel
|
||||
from .opt.model import OPTForCausalLM, OPTModel
|
||||
from .phi3.model import Phi3ForCausalLM, Phi3Model
|
||||
from .phi3.phi3small.model import Phi3SmallForCausalLM, Phi3SmallModel
|
||||
from .phi.model import PhiForCausalLM, PhiModel
|
||||
from .qwen.model import QWenForCausalLM
|
||||
from .recurrentgemma.model import RecurrentGemmaForCausalLM
|
||||
@ -70,10 +69,8 @@ __all__ = [
|
||||
'GPTNeoXForCausalLM',
|
||||
'PhiModel',
|
||||
'Phi3Model',
|
||||
'Phi3SmallModel',
|
||||
'PhiForCausalLM',
|
||||
'Phi3ForCausalLM',
|
||||
'Phi3SmallForCausalLM',
|
||||
'ChatGLMForCausalLM',
|
||||
'ChatGLMModel',
|
||||
'BaichuanForCausalLM',
|
||||
@ -103,7 +100,7 @@ MODEL_MAP = {
|
||||
'FalconForCausalLM': FalconForCausalLM,
|
||||
'PhiForCausalLM': PhiForCausalLM,
|
||||
'Phi3ForCausalLM': Phi3ForCausalLM,
|
||||
'Phi3SmallForCausalLM': Phi3SmallForCausalLM,
|
||||
'Phi3SmallForCausalLM': Phi3ForCausalLM,
|
||||
'MambaForCausalLM': MambaForCausalLM,
|
||||
'GPTNeoXForCausalLM': GPTNeoXForCausalLM,
|
||||
'GPTJForCausalLM': GPTJForCausalLM,
|
||||
|
||||
@ -292,6 +292,7 @@ class GemmaForCausalLM(DecoderModelForCausalLM):
|
||||
return tllm_llama
|
||||
|
||||
def check_config(self, config):
|
||||
config.set_if_not_exist("share_embedding_table", True)
|
||||
config.set_if_not_exist('use_parallel_embedding', False)
|
||||
config.set_if_not_exist('embedding_sharding_dim', 0)
|
||||
config.set_if_not_exist('mlp_bias', False)
|
||||
|
||||
@ -62,6 +62,80 @@ class GenerationMixin:
|
||||
[split_point[-1], max_num_tokens, max_num_tokens])
|
||||
return num_tokens_ranges
|
||||
|
||||
@staticmethod
|
||||
def get_profiles_ranges(
|
||||
*,
|
||||
max_batch_size,
|
||||
max_beam_width,
|
||||
max_input_len,
|
||||
max_num_tokens,
|
||||
max_draft_len,
|
||||
opt_batch_size,
|
||||
opt_num_tokens,
|
||||
enable_ctx_gen_opt_profiles,
|
||||
multiple_profiles,
|
||||
):
|
||||
default_range = GenerationMixin.default_range
|
||||
if opt_batch_size:
|
||||
bb_range_cxt = [1, opt_batch_size, max_batch_size]
|
||||
bb_range_gen = [
|
||||
1, opt_batch_size * max_beam_width,
|
||||
max_batch_size * max_beam_width
|
||||
]
|
||||
else:
|
||||
bb_range_cxt = default_range(max_batch_size)
|
||||
bb_range_gen = default_range(max_batch_size * max_beam_width)
|
||||
tokens_per_engine_step = max_draft_len + 1
|
||||
tokens_per_engine_step_range = [
|
||||
1, tokens_per_engine_step, tokens_per_engine_step
|
||||
]
|
||||
bbd_range_ctx = [
|
||||
bb_range_cxt[i] * (tokens_per_engine_step if i != 0 else 1)
|
||||
for i in range(len(bb_range_cxt))
|
||||
]
|
||||
bbd_range_gen = [
|
||||
bb_range_gen[i] * (tokens_per_engine_step if i != 0 else 1)
|
||||
for i in range(len(bb_range_gen))
|
||||
]
|
||||
inlen_range_cxt = default_range(max_input_len)
|
||||
inlen_range_gen = [1, 1, tokens_per_engine_step]
|
||||
if enable_ctx_gen_opt_profiles:
|
||||
num_profiles = 2
|
||||
bb_range = [bb_range_cxt, bb_range_gen]
|
||||
bbd_range = [bbd_range_ctx, bbd_range_gen]
|
||||
inlen_range = [inlen_range_cxt, inlen_range_gen]
|
||||
position_ids_inlen_range = [inlen_range_cxt, [1, 1, 1]]
|
||||
num_tokens_range_ctx = default_range(max_batch_size * max_input_len)
|
||||
# Draft tokens cannot be combined with beam search
|
||||
num_tokens_range_gen = default_range(
|
||||
max_batch_size * max(tokens_per_engine_step, max_beam_width))
|
||||
num_tokens_range = [num_tokens_range_ctx, num_tokens_range_gen]
|
||||
else:
|
||||
if multiple_profiles:
|
||||
num_tokens_range = GenerationMixin.split_num_tokens_range(
|
||||
max_num_tokens)
|
||||
else:
|
||||
if opt_num_tokens is None:
|
||||
opt_num_tokens = min(max_num_tokens,
|
||||
max_batch_size * max_beam_width)
|
||||
num_tokens_range = [[1, opt_num_tokens, max_num_tokens]]
|
||||
num_profiles = len(num_tokens_range)
|
||||
bb_range = [bb_range_gen] * num_profiles
|
||||
bbd_range = [bbd_range_gen] * num_profiles
|
||||
inlen_range = [[1, 1, max_input_len]] * num_profiles
|
||||
position_ids_inlen_range = [[1, 1, max_input_len]] * num_profiles
|
||||
tokens_per_engine_step_range = [tokens_per_engine_step_range
|
||||
] * num_profiles
|
||||
ranges = {
|
||||
'bb_range': bb_range,
|
||||
'bbd_range': bbd_range,
|
||||
'inlen_range': inlen_range,
|
||||
'position_ids_inlen_range': position_ids_inlen_range,
|
||||
'num_tokens_range': num_tokens_range,
|
||||
'tokens_per_engine_step_range': tokens_per_engine_step_range,
|
||||
}
|
||||
return num_profiles, ranges
|
||||
|
||||
def prepare_attention_inputs(self,
|
||||
*,
|
||||
max_batch_size,
|
||||
@ -343,61 +417,26 @@ class GenerationMixin:
|
||||
streamingllm: bool = False,
|
||||
opt_batch_size=None):
|
||||
|
||||
default_range = GenerationMixin.default_range
|
||||
tokens_per_engine_step = max_draft_len + 1
|
||||
tokens_per_engine_step_range = [
|
||||
1, tokens_per_engine_step, tokens_per_engine_step
|
||||
]
|
||||
if opt_batch_size:
|
||||
bb_range_cxt = [1, opt_batch_size, max_batch_size]
|
||||
bb_range_gen = [
|
||||
1, opt_batch_size * max_beam_width,
|
||||
max_batch_size * max_beam_width
|
||||
]
|
||||
else:
|
||||
bb_range_cxt = default_range(max_batch_size)
|
||||
bb_range_gen = default_range(max_batch_size * max_beam_width)
|
||||
bbd_range_ctx = [
|
||||
bb_range_cxt[i] * (tokens_per_engine_step if i != 0 else 1)
|
||||
for i in range(len(bb_range_cxt))
|
||||
]
|
||||
bbd_range_gen = [
|
||||
bb_range_gen[i] * (tokens_per_engine_step if i != 0 else 1)
|
||||
for i in range(len(bb_range_gen))
|
||||
]
|
||||
inlen_range_cxt = default_range(max_input_len)
|
||||
inlen_range_gen = [1, 1, tokens_per_engine_step]
|
||||
|
||||
enable_ctx_gen_opt_profiles = GenerationMixin.has_ctx_gen_opt_profiles(
|
||||
use_gpt_attention_plugin, use_gemm_plugin, remove_input_padding,
|
||||
paged_kv_cache)
|
||||
if enable_ctx_gen_opt_profiles:
|
||||
num_profiles = 2
|
||||
bb_range = [bb_range_cxt, bb_range_gen]
|
||||
bbd_range = [bbd_range_ctx, bbd_range_gen]
|
||||
inlen_range = [inlen_range_cxt, inlen_range_gen]
|
||||
position_ids_inlen_range = [inlen_range_cxt, [1, 1, 1]]
|
||||
num_tokens_range_ctx = default_range(max_batch_size * max_input_len)
|
||||
# Draft tokens cannot be combined with beam search
|
||||
num_tokens_range_gen = default_range(
|
||||
max_batch_size * max(tokens_per_engine_step, max_beam_width))
|
||||
num_tokens_range = [num_tokens_range_ctx, num_tokens_range_gen]
|
||||
else:
|
||||
if multiple_profiles:
|
||||
num_tokens_range = GenerationMixin.split_num_tokens_range(
|
||||
max_num_tokens)
|
||||
else:
|
||||
if opt_num_tokens is None:
|
||||
opt_num_tokens = min(max_num_tokens,
|
||||
max_batch_size * max_beam_width)
|
||||
num_tokens_range = [[1, opt_num_tokens, max_num_tokens]]
|
||||
num_profiles = len(num_tokens_range)
|
||||
bb_range = [bb_range_gen] * num_profiles
|
||||
bbd_range = [bbd_range_gen] * num_profiles
|
||||
inlen_range = [[1, 1, max_input_len]] * num_profiles
|
||||
position_ids_inlen_range = [[1, 1, max_input_len]] * num_profiles
|
||||
tokens_per_engine_step_range = [tokens_per_engine_step_range
|
||||
] * num_profiles
|
||||
|
||||
num_profiles, ranges = GenerationMixin.get_profiles_ranges(
|
||||
max_batch_size=max_batch_size,
|
||||
max_beam_width=max_beam_width,
|
||||
max_input_len=max_input_len,
|
||||
max_num_tokens=max_num_tokens,
|
||||
max_draft_len=max_draft_len,
|
||||
opt_batch_size=opt_batch_size,
|
||||
opt_num_tokens=opt_num_tokens,
|
||||
enable_ctx_gen_opt_profiles=enable_ctx_gen_opt_profiles,
|
||||
multiple_profiles=multiple_profiles)
|
||||
bb_range = ranges['bb_range']
|
||||
bbd_range = ranges['bbd_range']
|
||||
inlen_range = ranges['inlen_range']
|
||||
num_tokens_range = ranges['num_tokens_range']
|
||||
position_ids_inlen_range = ranges['position_ids_inlen_range']
|
||||
tokens_per_engine_step_range = ranges['tokens_per_engine_step_range']
|
||||
position_ids_num_tokens_range = num_tokens_range
|
||||
|
||||
input_ids = None
|
||||
@ -597,12 +636,13 @@ class GenerationMixin:
|
||||
spec_decoding_params = None
|
||||
# Use positional offsets and packed mask only when not in SpS spec decoding
|
||||
if speculative_decoding_draft_tokens_external == False and max_draft_len > 0:
|
||||
tokens_per_engine_step = max_draft_len + 1
|
||||
# 32 bits packed mask aligned.
|
||||
num_packed_masks = (tokens_per_engine_step + 32 - 1) // 32
|
||||
packed_mask_len_range = [[0, 1, num_packed_masks]] * num_profiles
|
||||
# total number of spec decoding tokens for all sequences (sequence length can be variable).
|
||||
num_gen_tokens_range = [
|
||||
default_range(
|
||||
GenerationMixin.default_range(
|
||||
max_batch_size * max_beam_width * tokens_per_engine_step,
|
||||
min_range=0)
|
||||
] * num_profiles
|
||||
|
||||
@ -1535,6 +1535,12 @@ def load_weights_from_hf_safetensors(model_dir: str, config: LLaMAConfig):
|
||||
|
||||
moe_config = config.moe
|
||||
|
||||
kv_tp_size = None
|
||||
kv_tp_rank = None
|
||||
if config.num_key_value_heads < mapping.tp_size:
|
||||
kv_tp_size = config.num_key_value_heads
|
||||
kv_tp_rank = mapping.tp_rank * kv_tp_size // mapping.tp_size
|
||||
|
||||
model_prefix = "model."
|
||||
key_list = [
|
||||
"embed_tokens.weight", # vocab_embedding
|
||||
@ -1552,7 +1558,12 @@ def load_weights_from_hf_safetensors(model_dir: str, config: LLaMAConfig):
|
||||
|
||||
torch_dtype = str_dtype_to_torch(dtype)
|
||||
|
||||
def load(key, tp_dim=-1, no_prefix=0, is_expert_weights=False):
|
||||
def load(key,
|
||||
tp_dim=-1,
|
||||
no_prefix=0,
|
||||
is_expert_weights=False,
|
||||
tp_size=None,
|
||||
tp_rank=None):
|
||||
if not no_prefix:
|
||||
key = model_prefix + key
|
||||
ptr_idx = safetensors_map[key] if key in safetensors_map else 0
|
||||
@ -1560,38 +1571,28 @@ def load_weights_from_hf_safetensors(model_dir: str, config: LLaMAConfig):
|
||||
if key not in safetensors_ptrs[ptr_idx].keys():
|
||||
return None
|
||||
|
||||
tensor_slice = safetensors_ptrs[ptr_idx].get_slice(key)
|
||||
tensor_shape = tensor_slice.get_shape()
|
||||
if tp_dim == -1:
|
||||
res = safetensors_ptrs[ptr_idx].get_tensor(key)
|
||||
else:
|
||||
res = tensor_slice[:]
|
||||
elif tp_dim >= 0 and tp_dim < len(tensor_shape):
|
||||
if is_expert_weights:
|
||||
tp_size = mapping.moe_tp_size
|
||||
tp_rank = mapping.moe_tp_rank
|
||||
else:
|
||||
tp_size = mapping.tp_size
|
||||
tp_rank = mapping.tp_rank
|
||||
tensor_slice = safetensors_ptrs[ptr_idx].get_slice(key)
|
||||
tensor_shape = tensor_slice.get_shape()
|
||||
if len(tensor_shape) == 1:
|
||||
if tp_dim == 0:
|
||||
slice_width = tensor_shape[0] // tp_size
|
||||
res = tensor_slice[slice_width * tp_rank:slice_width *
|
||||
(tp_rank + 1)]
|
||||
else:
|
||||
res = tensor_slice[:]
|
||||
else:
|
||||
if tensor_shape[tp_dim] % tp_size != 0:
|
||||
logger.error(
|
||||
"Current weight shape is invalid for tp_size=" +
|
||||
str(tp_size))
|
||||
slice_width = tensor_shape[tp_dim] // tp_size
|
||||
if tp_dim == 0:
|
||||
res = tensor_slice[slice_width * tp_rank:slice_width *
|
||||
(tp_rank + 1), :]
|
||||
elif tp_dim == 1:
|
||||
res = tensor_slice[:, slice_width * tp_rank:slice_width *
|
||||
(tp_rank + 1)]
|
||||
else:
|
||||
assert False, "Invalid TP dim"
|
||||
tp_size = tp_size or mapping.tp_size
|
||||
tp_rank = tp_rank or mapping.tp_rank
|
||||
dim_size = tensor_shape[tp_dim]
|
||||
if dim_size % tp_size != 0:
|
||||
logger.error(
|
||||
f"Current weight shape {tensor_shape} is invalid at dimension {tp_dim} for TP size {tp_size}"
|
||||
)
|
||||
indices = [slice(None)] * len(tensor_shape)
|
||||
indices[tp_dim] = slice(dim_size * tp_rank // tp_size,
|
||||
dim_size * (tp_rank + 1) // tp_size)
|
||||
res = tensor_slice[indices]
|
||||
else:
|
||||
raise ValueError(f"Invalid TP dim: {tp_dim}")
|
||||
return res.to(torch_dtype).contiguous(
|
||||
) if "block_sparse_moe.gate" not in key else res.to(torch.float32)
|
||||
|
||||
@ -1632,11 +1633,19 @@ def load_weights_from_hf_safetensors(model_dir: str, config: LLaMAConfig):
|
||||
# Attention
|
||||
qkv_list = []
|
||||
for comp in ["q", "k", "v"]:
|
||||
weight_part = load(prefix + key_list[3] + comp + key_list[4], 0)
|
||||
tp_size = kv_tp_size if comp != "q" else None
|
||||
tp_rank = kv_tp_rank if comp != "q" else None
|
||||
weight_part = load(prefix + key_list[3] + comp + key_list[4],
|
||||
0,
|
||||
tp_size=tp_size,
|
||||
tp_rank=tp_rank)
|
||||
qkv_list.append(weight_part)
|
||||
bias_part = load(
|
||||
(prefix + key_list[3] + comp + key_list[4]).replace(
|
||||
"weight", "bias"), 0)
|
||||
"weight", "bias"),
|
||||
0,
|
||||
tp_size=tp_size,
|
||||
tp_rank=tp_rank)
|
||||
if bias_part is not None:
|
||||
qkv_list.append(bias_part)
|
||||
if len(qkv_list) == 3:
|
||||
|
||||
@ -23,6 +23,7 @@ from ...functional import (Tensor, arange, cast, concat, expand,
|
||||
gather_last_token_logits, shape, unsqueeze)
|
||||
from ...layers import Embedding, LayerNorm, Linear, Mamba, RmsNorm
|
||||
from ...module import Module, ModuleList
|
||||
from ...plugin import current_all_reduce_helper
|
||||
from ..generation_mixin import GenerationMixin
|
||||
from ..modeling_utils import PretrainedConfig, PretrainedModel
|
||||
|
||||
@ -192,6 +193,7 @@ class MambaForCausalLM(PretrainedModel):
|
||||
ssm_states,
|
||||
host_request_types,
|
||||
last_token_ids,
|
||||
last_token_ids_for_logits,
|
||||
host_context_lengths,
|
||||
slot_mapping: Optional[Tensor] = None):
|
||||
hidden_states, present_convs, present_ssms = self.backbone(
|
||||
@ -200,7 +202,7 @@ class MambaForCausalLM(PretrainedModel):
|
||||
|
||||
if not self.gather_context_logits:
|
||||
hidden_states = gather_last_token_logits(
|
||||
hidden_states, last_token_ids,
|
||||
hidden_states, last_token_ids_for_logits,
|
||||
default_net().plugin_config.remove_input_padding)
|
||||
|
||||
lm_logits = self.lm_head(hidden_states)
|
||||
@ -218,9 +220,9 @@ class MambaForCausalLM(PretrainedModel):
|
||||
max_batch_size,
|
||||
max_input_len,
|
||||
max_seq_len,
|
||||
max_num_tokens,
|
||||
use_cache,
|
||||
max_beam_width: int = 1,
|
||||
max_num_tokens: int = None,
|
||||
opt_num_tokens: int = None,
|
||||
opt_batch_size: int = 0,
|
||||
prompt_embedding_table_size: int = 0,
|
||||
@ -235,56 +237,79 @@ class MambaForCausalLM(PretrainedModel):
|
||||
@return: a list contains values which can be fed into the self.forward()
|
||||
'''
|
||||
assert speculative_decoding_draft_tokens_external == False, "Speculative decoding is not supported in Mamba"
|
||||
assert max_beam_width == 1, "We don't support beam search for the Mamba model."
|
||||
|
||||
remove_input_padding = default_net().plugin_config.remove_input_padding
|
||||
use_gemm_plugin = default_net().plugin_config.gemm_plugin
|
||||
paged_state = default_net().plugin_config.paged_state
|
||||
multiple_profiles = default_net().plugin_config.multiple_profiles
|
||||
use_mamba_conv1d_plugin = default_net(
|
||||
).plugin_config.mamba_conv1d_plugin
|
||||
batch_range = [GenerationMixin.default_range(max_batch_size)]
|
||||
use_custom_all_reduce = default_net(
|
||||
).plugin_config.use_custom_all_reduce
|
||||
|
||||
self.gather_context_logits = gather_context_logits
|
||||
mapping = self.config.mapping
|
||||
|
||||
# basic inputs
|
||||
enable_ctx_gen_opt_profiles = GenerationMixin.has_ctx_gen_opt_profiles(
|
||||
True, use_gemm_plugin, remove_input_padding, paged_state)
|
||||
|
||||
num_profiles, ranges = GenerationMixin.get_profiles_ranges(
|
||||
max_batch_size=max_batch_size,
|
||||
max_beam_width=max_beam_width,
|
||||
max_input_len=max_input_len,
|
||||
max_num_tokens=max_num_tokens,
|
||||
max_draft_len=max_draft_len,
|
||||
opt_batch_size=opt_batch_size,
|
||||
opt_num_tokens=opt_num_tokens,
|
||||
enable_ctx_gen_opt_profiles=enable_ctx_gen_opt_profiles,
|
||||
multiple_profiles=multiple_profiles)
|
||||
|
||||
if remove_input_padding:
|
||||
assert use_mamba_conv1d_plugin, "mamba_conv1d_plugin is needed to support remove_input_padding"
|
||||
max_num_tokens = max(
|
||||
max_input_len * max_batch_size,
|
||||
max_beam_width * (max_draft_len + 1) * max_batch_size)
|
||||
if opt_num_tokens is None:
|
||||
opt_num_tokens = max_beam_width * (max_draft_len +
|
||||
1) * max_batch_size
|
||||
num_tokens_range = [[1, opt_num_tokens, max_num_tokens]]
|
||||
input_ids = Tensor(name='input_ids',
|
||||
dtype=trt.int32,
|
||||
shape=[-1],
|
||||
dim_range=OrderedDict([
|
||||
('num_tokens', num_tokens_range),
|
||||
('num_tokens', ranges['num_tokens_range']),
|
||||
]))
|
||||
else:
|
||||
input_ids = Tensor(name='input_ids',
|
||||
dtype=trt.int32,
|
||||
shape=[-1, -1],
|
||||
dim_range=OrderedDict([
|
||||
('batch_size', batch_range),
|
||||
('input_len', [[1, 1, max_input_len]]),
|
||||
('batch_size_beam_width',
|
||||
ranges['bb_range']),
|
||||
('input_len', ranges['inlen_range']),
|
||||
]))
|
||||
if use_custom_all_reduce and mapping.tp_size > 1:
|
||||
current_all_reduce_helper().set_workspace_tensor(
|
||||
mapping, num_profiles)
|
||||
|
||||
# recurrent inputs
|
||||
conv_states = []
|
||||
ssm_states = []
|
||||
if use_mamba_conv1d_plugin:
|
||||
conv_state_dim_range = OrderedDict([
|
||||
('batch_size', batch_range),
|
||||
('kernel_size', [self.d_conv - 1]),
|
||||
('dim_size', [self.d_inner]),
|
||||
('batch_size', ranges['bb_range']),
|
||||
('kernel_size', [self.d_conv - 1] * num_profiles),
|
||||
('dim_size', [self.d_inner] * num_profiles),
|
||||
])
|
||||
else:
|
||||
conv_state_dim_range = OrderedDict([
|
||||
('batch_size', batch_range),
|
||||
('dim_size', [self.d_inner]),
|
||||
('kernel_size', [self.d_conv - 1]),
|
||||
('batch_size', ranges['bb_range']),
|
||||
('dim_size', [self.d_inner] * num_profiles),
|
||||
('kernel_size', [self.d_conv - 1] * num_profiles),
|
||||
])
|
||||
|
||||
ssm_state_dim_range = OrderedDict([
|
||||
('batch_size', batch_range),
|
||||
('state_size', [self.d_state]),
|
||||
('dim_size', [self.d_inner]),
|
||||
('batch_size', ranges['bb_range']),
|
||||
('state_size', [self.d_state] * num_profiles),
|
||||
('dim_size', [self.d_inner] * num_profiles),
|
||||
])
|
||||
one_dim_range = OrderedDict([
|
||||
('buffer_count', [1]),
|
||||
('buffer_count', [1] * num_profiles),
|
||||
])
|
||||
|
||||
for i in range(self.config.num_hidden_layers):
|
||||
@ -324,7 +349,7 @@ class MambaForCausalLM(PretrainedModel):
|
||||
name='host_request_types',
|
||||
dtype=trt.int32,
|
||||
shape=[-1],
|
||||
dim_range=OrderedDict([('batch_size', batch_range)]),
|
||||
dim_range=OrderedDict([('batch_size', ranges['bb_range'])]),
|
||||
)
|
||||
|
||||
if use_mamba_conv1d_plugin and remove_input_padding:
|
||||
@ -332,21 +357,22 @@ class MambaForCausalLM(PretrainedModel):
|
||||
name='host_context_lengths',
|
||||
dtype=trt.int32,
|
||||
shape=[-1],
|
||||
dim_range=OrderedDict([('batch_size', batch_range)]),
|
||||
dim_range=OrderedDict([('batch_size', ranges['bb_range'])]),
|
||||
)
|
||||
else:
|
||||
host_context_lengths = None
|
||||
|
||||
last_token_ids = None
|
||||
last_token_ids = Tensor(
|
||||
name='last_token_ids',
|
||||
dtype=trt.int32,
|
||||
shape=[-1],
|
||||
dim_range=OrderedDict([
|
||||
('batch_size', ranges['bbd_range']),
|
||||
]),
|
||||
)
|
||||
last_token_ids_for_logits = None
|
||||
if not gather_context_logits:
|
||||
last_token_ids = Tensor(
|
||||
name='last_token_ids',
|
||||
dtype=trt.int32,
|
||||
shape=[-1],
|
||||
dim_range=OrderedDict([
|
||||
('batch_size', batch_range),
|
||||
]),
|
||||
)
|
||||
last_token_ids_for_logits = last_token_ids
|
||||
|
||||
return_dict = {
|
||||
'input_ids': input_ids,
|
||||
@ -354,6 +380,7 @@ class MambaForCausalLM(PretrainedModel):
|
||||
'ssm_states': ssm_states,
|
||||
'host_request_types': host_request_types,
|
||||
'last_token_ids': last_token_ids,
|
||||
'last_token_ids_for_logits': last_token_ids_for_logits,
|
||||
'host_context_lengths': host_context_lengths,
|
||||
}
|
||||
|
||||
@ -362,7 +389,7 @@ class MambaForCausalLM(PretrainedModel):
|
||||
name='slot_mapping',
|
||||
dtype=trt.int32,
|
||||
shape=[-1],
|
||||
dim_range=OrderedDict([('batch_size', batch_range)]),
|
||||
dim_range=OrderedDict([('batch_size', ranges['bb_range'])]),
|
||||
)
|
||||
return_dict['slot_mapping'] = slot_mapping
|
||||
|
||||
|
||||
@ -1,43 +1,80 @@
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
|
||||
from tensorrt_llm import logger
|
||||
from tensorrt_llm._utils import str_dtype_to_torch, torch_to_numpy
|
||||
from tensorrt_llm._utils import str_dtype_to_torch
|
||||
from tensorrt_llm.mapping import Mapping
|
||||
from tensorrt_llm.models import MedusaLM
|
||||
from tensorrt_llm.models.convert_utils import split
|
||||
|
||||
|
||||
def get_tllm_linear_weight(weight,
|
||||
prefix,
|
||||
bias=None,
|
||||
use_weight_only=False,
|
||||
plugin_weight_only_quant_type=torch.int8,
|
||||
postfix='weight'):
|
||||
results = {}
|
||||
if use_weight_only:
|
||||
v = weight.t().contiguous().cpu()
|
||||
processed_torch_weights, torch_weight_scales = \
|
||||
torch.ops.trtllm.symmetric_quantize_last_axis_of_batched_matrix(
|
||||
v, plugin_weight_only_quant_type)
|
||||
results[prefix + postfix] = processed_torch_weights
|
||||
results[prefix + 'per_channel_scale'] = torch_weight_scales
|
||||
else:
|
||||
results[prefix + postfix] = weight.contiguous()
|
||||
|
||||
if bias is not None:
|
||||
results[prefix + 'bias'] = bias
|
||||
|
||||
return results
|
||||
|
||||
|
||||
def load_medusa_hf(medusa_path: str,
|
||||
trt_llm_medusa: MedusaLM,
|
||||
num_medusa_heads: int,
|
||||
num_medusa_layers: int,
|
||||
mapping=Mapping(),
|
||||
dtype='float32'):
|
||||
dtype='float32',
|
||||
use_weight_only=False,
|
||||
plugin_weight_only_quant_type=None):
|
||||
logger.info("Loading Medusa heads' weights ...")
|
||||
is_ckpt_safetensors = False
|
||||
|
||||
ckpt_file = Path(medusa_path) / "medusa_lm_head.pt"
|
||||
state_dict = torch.load(ckpt_file, map_location="cpu")
|
||||
if not ckpt_file.exists():
|
||||
ckpt_file = Path(medusa_path) / "medusa_lm_head.safetensors"
|
||||
is_ckpt_safetensors = True
|
||||
|
||||
if is_ckpt_safetensors:
|
||||
logger.info("Safetensors Found ...")
|
||||
from safetensors.torch import load_file
|
||||
state_dict = load_file(ckpt_file)
|
||||
else:
|
||||
state_dict = torch.load(ckpt_file, map_location="cpu")
|
||||
|
||||
torch_dtype = str_dtype_to_torch(dtype)
|
||||
for h in range(trt_llm_medusa.num_medusa_heads):
|
||||
for l in range(trt_llm_medusa.num_medusa_layers):
|
||||
w = state_dict[f"{h}.{l}.linear.weight"].clone()
|
||||
w = torch_to_numpy(w.to(torch_dtype).detach().cpu())
|
||||
trt_llm_medusa.medusa_heads[h].medusa_layers[
|
||||
l].linear.weight.value = np.ascontiguousarray(
|
||||
split(w, mapping.tp_size, mapping.tp_rank))
|
||||
if trt_llm_medusa.medusa_heads[h].medusa_layers[
|
||||
l].linear.bias is not None:
|
||||
# print(f"Setting bias for {h} {l}")
|
||||
b = state_dict[f"{h}.{l}.linear.bias"].clone()
|
||||
b = torch_to_numpy(b.to(torch_dtype).detach().cpu())
|
||||
trt_llm_medusa.medusa_heads[h].medusa_layers[
|
||||
l].linear.bias.value = np.ascontiguousarray(
|
||||
np.split(b, mapping.tp_size,
|
||||
axis=0)[mapping.tp_rank].copy())
|
||||
lm = state_dict[f"{h}.{trt_llm_medusa.num_medusa_layers}.weight"].clone(
|
||||
) # LM Head
|
||||
lm = torch_to_numpy(lm.to(torch_dtype).detach().cpu())
|
||||
trt_llm_medusa.medusa_heads[
|
||||
h].lm_head.weight.value = np.ascontiguousarray(
|
||||
split(lm, mapping.tp_size, mapping.tp_rank))
|
||||
return
|
||||
weights = {}
|
||||
|
||||
for h in range(num_medusa_heads):
|
||||
for l in range(num_medusa_layers):
|
||||
w = state_dict[f"{h}.{l}.linear.weight"].clone().to(torch_dtype)
|
||||
|
||||
split_v = split(w, mapping.tp_size, mapping.tp_rank)
|
||||
weights.update(
|
||||
get_tllm_linear_weight(
|
||||
split_v, f'medusa_heads.{h}.medusa_layers.{l}.linear.',
|
||||
None, use_weight_only, plugin_weight_only_quant_type))
|
||||
|
||||
b = state_dict[f"{h}.{l}.linear.bias"].clone().to(torch_dtype)
|
||||
|
||||
weights['medusa_heads.{}.medusa_layers.{}.linear.bias'.format(
|
||||
h, l)] = split(b, mapping.tp_size, mapping.tp_rank)
|
||||
|
||||
lm = state_dict[f"{h}.{num_medusa_layers}.weight"].clone().to(
|
||||
torch_dtype) # LM Head
|
||||
|
||||
weights['medusa_heads.{}.lm_head.weight'.format(h)] = split(
|
||||
lm, mapping.tp_size, mapping.tp_rank)
|
||||
|
||||
return weights
|
||||
|
||||
@ -27,6 +27,10 @@ from ..mapping import Mapping
|
||||
from ..module import Module, ModuleList
|
||||
from ..parameter import Parameter
|
||||
from ..quantization import QuantMode
|
||||
from ..quantization.layers import (WeightOnlyGroupwiseQuantLinear,
|
||||
WeightOnlyGroupwiseQuantRowLinear,
|
||||
WeightOnlyQuantLinear,
|
||||
WeightOnlyQuantRowLinear)
|
||||
from ..quantization.mode import W8A8_SQ_PLUGIN_LIST, QuantAlgo
|
||||
from ..top_model_mixin import TopModelMixin
|
||||
from .convert_utils import weight_only_quantize_dict
|
||||
@ -841,7 +845,8 @@ def unfuse_qkv_gemm(model: PretrainedModel) -> PretrainedModel:
|
||||
continue
|
||||
qkv_params = get_init_params(layer.qkv, ColumnLinear)
|
||||
qkv_params["bias"] = qkv_params["bias"] is not None
|
||||
qkv_params["strict_dtype"] = qkv_params["strict_dtype"] is not None
|
||||
qkv_params["strict_dtype"] = qkv_params.get(
|
||||
"strict_dtype") is not None
|
||||
q = ColumnLinear(
|
||||
**{
|
||||
**qkv_params,
|
||||
@ -866,20 +871,34 @@ def unfuse_qkv_gemm(model: PretrainedModel) -> PretrainedModel:
|
||||
q = quantize(q, model.config.quantization)
|
||||
k = quantize(k, model.config.quantization)
|
||||
v = quantize(v, model.config.quantization)
|
||||
out_features = q.out_features + k.out_features + v.out_features
|
||||
if isinstance(layer.qkv, (
|
||||
WeightOnlyQuantLinear,
|
||||
WeightOnlyQuantRowLinear,
|
||||
WeightOnlyGroupwiseQuantLinear,
|
||||
WeightOnlyGroupwiseQuantRowLinear,
|
||||
)):
|
||||
out_dim = 1
|
||||
else:
|
||||
out_dim = 0
|
||||
if layer.qkv.weight.is_inited():
|
||||
qkv_weight = layer.qkv.weight.raw_value
|
||||
weights = np.split(qkv_weight, [
|
||||
q.out_features,
|
||||
q.out_features + k.out_features,
|
||||
])
|
||||
qkv_weight.shape[out_dim] * q.out_features // out_features,
|
||||
qkv_weight.shape[out_dim] *
|
||||
(q.out_features + k.out_features) // out_features,
|
||||
],
|
||||
axis=out_dim)
|
||||
for gemm, weight in zip([q, k, v], weights):
|
||||
gemm.weight.value = weight
|
||||
if layer.qkv.bias is not None and layer.qkv.bias.is_inited():
|
||||
qkv_bias = layer.qkv.bias.raw_value
|
||||
biases = np.split(qkv_bias, [
|
||||
q.out_features,
|
||||
q.out_features + k.out_features,
|
||||
])
|
||||
qkv_bias.shape[out_dim] * q.out_features // out_features,
|
||||
qkv_bias.shape[out_dim] *
|
||||
(q.out_features + k.out_features) // out_features,
|
||||
],
|
||||
axis=out_dim)
|
||||
for gemm, bias in zip([q, k, v], biases):
|
||||
gemm.bias.value = bias
|
||||
for name, parameter in layer.qkv._parameters.items():
|
||||
|
||||
@ -1,9 +1,12 @@
|
||||
import torch
|
||||
|
||||
from tensorrt_llm.quantization import QuantAlgo
|
||||
|
||||
from ..._utils import str_dtype_to_torch
|
||||
from .split_weights import shuffle_qkv_weights, split_weights_tp
|
||||
|
||||
|
||||
def convert_hf_weights(hf_model, dtype, **kwargs):
|
||||
def convert_hf_weights(hf_model, dtype, config, small_variant, args, rank):
|
||||
torch_dtype = str_dtype_to_torch(dtype)
|
||||
hf_state_dict = hf_model.state_dict()
|
||||
weights = {}
|
||||
@ -15,13 +18,16 @@ def convert_hf_weights(hf_model, dtype, **kwargs):
|
||||
key = key.replace("model.layers.", "transformer.layers.")
|
||||
#Attention
|
||||
key = key.replace("self_attn.", "attention.")
|
||||
key = key.replace("query_key_value.", "qkv.") # small
|
||||
key = key.replace("Wqkv.weight", "qkv.weight")
|
||||
key = key.replace("qkv_proj.", "qkv.") #128k
|
||||
#MLP
|
||||
key = key.replace("mlp.fc1.", "mlp.fc.")
|
||||
key = key.replace("mlp.fc2.", "mlp.proj.")
|
||||
key = key.replace("mlp.gate_up_proj.", "mlp.fc.")
|
||||
key = key.replace("mlp.up_proj.", "mlp.gate.") #128k
|
||||
key = key.replace(
|
||||
"mlp.up_proj.",
|
||||
"mlp.fc." if small_variant else "mlp.gate.") #128k
|
||||
key = key.replace("mlp.down_proj.", "mlp.proj.") #128k
|
||||
key = key.replace("mlp.gate_proj.", "mlp.fc.") #128k
|
||||
key = key.replace("o_proj.", "dense.") #128k
|
||||
@ -53,27 +59,67 @@ def convert_hf_weights(hf_model, dtype, **kwargs):
|
||||
key = key.replace("q_proj.weight", "qkv.weight")
|
||||
elif "k_proj" in key or "v_proj" in key:
|
||||
continue
|
||||
|
||||
weights[key] = value.to(torch_dtype).cpu()
|
||||
|
||||
if small_variant:
|
||||
weights['lm_head.weight'] = weights[
|
||||
'transformer.vocab_embedding.weight'].clone()
|
||||
|
||||
# Transform QKV weights from custom Phi3Small format to TRT-LLM format
|
||||
for key, value in weights.items():
|
||||
if "qkv." in key:
|
||||
weights[key] = shuffle_qkv_weights(weights[key], config)
|
||||
|
||||
weights = split_weights_tp(config, weights, args, rank, torch_dtype)
|
||||
|
||||
return weights
|
||||
|
||||
|
||||
def convert_hf_config(hf_config, dtype, **kwargs):
|
||||
def convert_small_hf_config(hf_config):
|
||||
return {
|
||||
'architecture': "Phi3SmallForCausalLM",
|
||||
'rotary_base': hf_config.rope_embedding_base,
|
||||
'gegelu_limit': hf_config.gegelu_limit,
|
||||
'mup_attn_multiplier': hf_config.mup_attn_multiplier,
|
||||
'mup_embedding_multiplier': hf_config.mup_embedding_multiplier,
|
||||
'mup_use_scaling': hf_config.mup_use_scaling,
|
||||
'mup_width_multiplier': hf_config.mup_width_multiplier,
|
||||
'blocksparse_block_size': hf_config.blocksparse_block_size,
|
||||
'blocksparse_homo_head_pattern':
|
||||
hf_config.blocksparse_homo_head_pattern,
|
||||
'blocksparse_num_local_blocks': hf_config.blocksparse_num_local_blocks,
|
||||
'blocksparse_vertical_stride': hf_config.blocksparse_vert_stride,
|
||||
'dense_attention_every_n_layers':
|
||||
hf_config.dense_attention_every_n_layers,
|
||||
}
|
||||
|
||||
|
||||
def convert_hf_config(hf_config, dtype, args):
|
||||
config = {
|
||||
'architecture': "Phi3ForCausalLM",
|
||||
'dtype': dtype,
|
||||
'num_hidden_layers': hf_config.num_hidden_layers,
|
||||
'num_attention_heads': hf_config.num_attention_heads,
|
||||
'num_key_value_heads': hf_config.num_key_value_heads,
|
||||
'rope_theta': hf_config.rope_theta,
|
||||
'hidden_size': hf_config.hidden_size,
|
||||
'intermediate_size': hf_config.intermediate_size,
|
||||
'vocab_size': hf_config.vocab_size,
|
||||
'max_position_embeddings': hf_config.max_position_embeddings,
|
||||
'hidden_act': hf_config.hidden_act,
|
||||
'share_embedding_table': False,
|
||||
'norm_epsilon': hf_config.rms_norm_eps,
|
||||
}
|
||||
|
||||
small_variant = hf_config.architectures[0] == "Phi3SmallForCausalLM"
|
||||
if small_variant:
|
||||
config.update(convert_small_hf_config(hf_config))
|
||||
else:
|
||||
config.update({
|
||||
'rotary_base': hf_config.rope_theta,
|
||||
'norm_epsilon': hf_config.rms_norm_eps,
|
||||
})
|
||||
|
||||
# Long-context variants
|
||||
if hf_config.max_position_embeddings >= 128000:
|
||||
config.update({
|
||||
'original_max_position_embeddings':
|
||||
@ -83,6 +129,31 @@ def convert_hf_config(hf_config, dtype, **kwargs):
|
||||
'longrope_scaling_long_factors':
|
||||
hf_config.rope_scaling["long_factor"]
|
||||
})
|
||||
|
||||
if small_variant:
|
||||
config.update({
|
||||
'longrope_long_mscale':
|
||||
hf_config.rope_scaling["long_mscale"],
|
||||
'longrope_short_mscale':
|
||||
hf_config.rope_scaling["short_mscale"]
|
||||
})
|
||||
|
||||
if config["hidden_act"] == "silu":
|
||||
config["hidden_act"] = "swiglu"
|
||||
|
||||
# Tensor parallelism and weight-only quantization
|
||||
if args is not None:
|
||||
config.update({
|
||||
'mapping': {
|
||||
'world_size': args.tp_size * args.pp_size,
|
||||
'tp_size': args.tp_size,
|
||||
'pp_size': args.pp_size,
|
||||
}
|
||||
})
|
||||
|
||||
if args.use_weight_only and args.weight_only_precision == 'int8':
|
||||
config.update({'quantization': {'quant_algo': QuantAlgo.W8A16}})
|
||||
elif args.use_weight_only and args.weight_only_precision == 'int4':
|
||||
config.update({'quantization': {'quant_algo': QuantAlgo.W4A16}})
|
||||
|
||||
return config
|
||||
|
||||
@ -1,15 +1,20 @@
|
||||
import json
|
||||
import os
|
||||
import traceback
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
from typing import Optional
|
||||
|
||||
import numpy as np
|
||||
import safetensors
|
||||
from transformers import AutoModelForCausalLM
|
||||
|
||||
from ..._utils import pad_vocab_size
|
||||
from ...functional import PositionEmbeddingType, Tensor
|
||||
from ...layers import (MLP, Attention, AttentionMaskType, Embedding,
|
||||
ParallelLMHead, RmsNorm)
|
||||
from ...layers import (MLP, Attention, AttentionMaskType, BlockSparseAttnParams,
|
||||
Embedding, LayerNorm, ParallelLMHead, RmsNorm)
|
||||
from ...module import Module
|
||||
from ..modeling_utils import (DecoderLayerList, DecoderModelForCausalLM,
|
||||
PretrainedConfig, save_checkpoint)
|
||||
PretrainedConfig)
|
||||
from .convert import convert_hf_config, convert_hf_weights
|
||||
|
||||
|
||||
@ -22,28 +27,65 @@ class Phi3DecoderLayer(Module):
|
||||
tp_group = config.mapping.tp_group
tp_size = config.mapping.tp_size

self.input_layernorm = RmsNorm(normalized_shape=config.hidden_size,
eps=config.norm_epsilon,
dtype=config.dtype)
self.post_layernorm = RmsNorm(normalized_shape=config.hidden_size,
eps=config.norm_epsilon,
dtype=config.dtype)
attention_mask_type = AttentionMaskType.causal
block_sparse_attn_params = BlockSparseAttnParams()
q_scaling = 1.0
self.gegelu_limit = None

self.small_variant = config.architecture == "Phi3SmallForCausalLM"
if self.small_variant:
self.gegelu_limit = config.gegelu_limit

# MuP uses norm_factor=attention_head_size (rather than sqrt(attention_head_size))
# We achieve this using q_scaling = sqrt(attention_head_size)
hidden_size = config.hidden_size
num_attention_heads = config.num_attention_heads
attention_head_size = hidden_size / num_attention_heads
q_scaling = attention_head_size**.5

block_sparse = (
(layer_idx + 1) % config.dense_attention_every_n_layers) != 0
attention_mask_type = AttentionMaskType.blocksparse if block_sparse else AttentionMaskType.causal

block_sparse_attn_params = BlockSparseAttnParams(
config.blocksparse_block_size,
config.blocksparse_homo_head_pattern,
config.blocksparse_num_local_blocks,
config.blocksparse_vertical_stride)

self.input_layernorm = LayerNorm(
normalized_shape=config.hidden_size, dtype=config.dtype)
self.post_layernorm = LayerNorm(normalized_shape=config.hidden_size,
dtype=config.dtype)
else:
self.input_layernorm = RmsNorm(normalized_shape=config.hidden_size,
eps=config.norm_epsilon,
dtype=config.dtype)
self.post_layernorm = RmsNorm(normalized_shape=config.hidden_size,
eps=config.norm_epsilon,
dtype=config.dtype)

layers_range = config.mapping.pp_layers(config.num_hidden_layers)
local_layer_idx = layer_idx - layers_range[0]
position_embedding_type = PositionEmbeddingType.rope_gpt_neox

rope_scaling_short_factors = 1.0
rope_scaling_long_factors = 1.0
rope_scaling_short_factors, rope_scaling_long_factors = 1.0, 1.0
rope_scaling_short_mscale, rope_scaling_long_mscale = 1.0, 1.0
original_max_position_embeddings = config.max_position_embeddings

if hasattr(config, "longrope_scaling_short_factors"):
rope_scaling_short_factors = np.asarray(
config.longrope_scaling_short_factors).astype(np.float32)
rope_scaling_long_factors = np.asarray(
config.longrope_scaling_long_factors).astype(np.float32)

original_max_position_embeddings = config.original_max_position_embeddings
position_embedding_type = PositionEmbeddingType.long_rope

if self.small_variant:
rope_scaling_short_mscale = config.longrope_short_mscale
rope_scaling_long_mscale = config.longrope_long_mscale

self.attention = Attention(
local_layer_idx=local_layer_idx,
hidden_size=config.hidden_size,
@@ -53,15 +95,18 @@ class Phi3DecoderLayer(Module):
rotary_embedding_base=config.rotary_base,
max_position_embeddings=config.max_position_embeddings,
dtype=config.dtype,
attention_mask_type=AttentionMaskType.causal,
bias=False,
attention_mask_type=attention_mask_type,
bias=self.small_variant,
q_scaling=q_scaling,
tp_group=tp_group,
tp_size=tp_size,
quant_mode=config.quant_mode,
rope_scaling_short_factors=rope_scaling_short_factors,
rope_scaling_long_factors=rope_scaling_long_factors,
rope_scaling_short_mscale=rope_scaling_short_mscale,
rope_scaling_long_mscale=rope_scaling_long_mscale,
original_max_position_embeddings=original_max_position_embeddings,
)
block_sparse_params=block_sparse_attn_params)

self.mlp = MLP(hidden_size=config.hidden_size,
ffn_hidden_size=config.intermediate_size,
@@ -70,7 +115,7 @@ class Phi3DecoderLayer(Module):
tp_group=tp_group,
tp_size=tp_size,
quant_mode=config.quant_mode,
bias=False)
bias=self.small_variant)

def forward(
self,
@@ -88,7 +133,7 @@ class Phi3DecoderLayer(Module):
use_cache=use_cache,
kv_cache_params=kv_cache_params,
attention_params=attention_params,
norm_before_bmm1=True,
norm_before_bmm1=not self.small_variant,
)

if use_cache:
@@ -96,7 +141,8 @@ class Phi3DecoderLayer(Module):

post_attention_input = hidden_states + attention_output
post_attention_output = self.post_layernorm(post_attention_input)
feed_forward_hidden_states = self.mlp(post_attention_output, )
feed_forward_hidden_states = self.mlp(post_attention_output,
gegelu_limit=self.gegelu_limit)
hidden_states = post_attention_input + feed_forward_hidden_states
if use_cache:
return (hidden_states, presents)
@@ -112,9 +158,15 @@ class Phi3Model(Module):
dtype=config.dtype)

self.layers = DecoderLayerList(Phi3DecoderLayer, config)
self.ln_f = RmsNorm(normalized_shape=config.hidden_size,
eps=config.norm_epsilon,
dtype=config.dtype)
self.small_variant = config.architecture == "Phi3SmallForCausalLM"
if self.small_variant:
self.ln_f = LayerNorm(normalized_shape=config.hidden_size,
dtype=config.dtype)
self.mup_embedding_multiplier = config.mup_embedding_multiplier
else:
self.ln_f = RmsNorm(normalized_shape=config.hidden_size,
eps=config.norm_epsilon,
dtype=config.dtype)

def forward(
self,
@@ -132,6 +184,9 @@ class Phi3Model(Module):
] if prompt_embedding_table is not None else []
hidden_states = self.vocab_embedding(input_ids, *args)

if self.small_variant and self.mup_embedding_multiplier > 0.0:
hidden_states = hidden_states * self.mup_embedding_multiplier

hidden_states = self.layers(
hidden_states,
use_cache=use_cache,
@@ -152,7 +207,6 @@ class Phi3Model(Module):
class Phi3ForCausalLM(DecoderModelForCausalLM):

def __init__(self, config: PretrainedConfig):
self.check_config(config)
transformer = Phi3Model(config)
vocab_size_padded = pad_vocab_size(config.vocab_size,
config.mapping.tp_size)
@@ -167,25 +221,48 @@ class Phi3ForCausalLM(DecoderModelForCausalLM):

super().__init__(config, transformer, lm_head)

def check_config(self, config):
config.set_if_not_exist('rotary_base', 10000.0)

@classmethod
def convert_hf_checkpoint(cls,
hf_model_dir: str,
dtype: Optional[str] = "float16",
output_dir: Optional[str] = None,
**kwargs):
args=None):
'''
Convert Huggingface checkpoint to TRT-LLM checkpoint
'''

hf_model = AutoModelForCausalLM.from_pretrained(hf_model_dir,
torch_dtype="auto",
trust_remote_code=True)
config = convert_hf_config(hf_model.config, dtype=dtype, **kwargs)
weights = convert_hf_weights(hf_model, dtype=dtype, **kwargs)
config = convert_hf_config(hf_model.config, dtype, args)
with open(os.path.join(output_dir, 'config.json'), 'w') as f:
json.dump(config, f, indent=4)

if output_dir:
save_checkpoint(output_dir, config=config, weights=weights)
small_variant = config['architecture'] == "Phi3SmallForCausalLM"

return {"weights": weights, "config": config}
def covert_and_save(rank):
weights = convert_hf_weights(hf_model, dtype, config, small_variant,
args, rank)
safetensors.torch.save_file(
weights, os.path.join(output_dir, f'rank{rank}.safetensors'))

world_size = args.tp_size * args.pp_size
if args.workers == 1:
for rank in range(world_size):
covert_and_save(rank)
else:
with ThreadPoolExecutor(max_workers=args.workers) as p:
futures = [
p.submit(covert_and_save, rank)
for rank in range(world_size)
]
exceptions = []
for future in as_completed(futures):
try:
future.result()
except Exception as e:
traceback.print_exc()
exceptions.append(e)
assert len(
exceptions
) == 0, "Checkpoint conversion failed, please check error log."
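Note: the MuP comment in the Phi-3-small branch above is easy to misread, so here is a small arithmetic sketch (plain Python, not TRT-LLM code) of what q_scaling = sqrt(attention_head_size) is intended to achieve, under the assumption that q_scaling multiplies the usual sqrt(head_size) denominator of the attention scores. The sizes are hypothetical and only for illustration.

import math

hidden_size = 4096                                        # hypothetical Phi-3-small-like sizes
num_attention_heads = 32
attention_head_size = hidden_size / num_attention_heads   # 128.0

q_scaling = attention_head_size**.5                       # sqrt(head_size), as in the diff
default_norm_factor = math.sqrt(attention_head_size)      # standard softmax(QK^T / sqrt(d))
mup_norm_factor = q_scaling * default_norm_factor         # assumed: q_scaling scales the denominator

# MuP: divide attention scores by d instead of sqrt(d)
assert math.isclose(mup_norm_factor, attention_head_size)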
@@ -1,14 +0,0 @@
# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
@@ -1,257 +0,0 @@
# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import os
import traceback
from concurrent.futures import ThreadPoolExecutor, as_completed

import numpy as np
import safetensors
from transformers import AutoModelForCausalLM

from ...._utils import pad_vocab_size
from ....functional import PositionEmbeddingType, Tensor
from ....layers import (MLP, Attention, AttentionMaskType,
BlockSparseAttnParams, Embedding, LayerNorm,
ParallelLMHead)
from ....module import Module
from ...modeling_utils import (DecoderLayerList, DecoderModelForCausalLM,
PretrainedConfig)
from .convert import convert_hf_config, convert_hf_weights


class Phi3SmallDecoderLayer(Module):

def __init__(self, config: PretrainedConfig, layer_idx: int):
super().__init__()
self.config = config
self.layer_idx = layer_idx
tp_group = config.mapping.tp_group
tp_size = config.mapping.tp_size
self.gegelu_limit = config.gegelu_limit

self.input_layernorm = LayerNorm(normalized_shape=config.hidden_size,
dtype=config.dtype)

# MuP uses norm_factor=attention_head_size (rather than sqrt(attention_head_size))
# We achieve this using q_scaling = sqrt(attention_head_size)
hidden_size = config.hidden_size
num_attention_heads = config.num_attention_heads
attention_head_size = hidden_size / num_attention_heads
q_scaling = attention_head_size**.5

block_sparse = (
(layer_idx + 1) % config.dense_attention_every_n_layers) != 0
attention_mask_type = AttentionMaskType.blocksparse if block_sparse else AttentionMaskType.causal

block_sparse_attn_params = BlockSparseAttnParams(
config.blocksparse_block_size, config.blocksparse_homo_head_pattern,
config.blocksparse_num_local_blocks,
config.blocksparse_vertical_stride)

layers_range = config.mapping.pp_layers(config.num_hidden_layers)
local_layer_idx = layer_idx - layers_range[0]

position_embedding_type = PositionEmbeddingType.rope_gpt_neox
original_max_position_embeddings = config.max_position_embeddings

rope_scaling_short_factors, rope_scaling_long_factors = 1.0, 1.0
rope_scaling_short_mscale, rope_scaling_long_mscale = 1.0, 1.0

if hasattr(config, "longrope_scaling_short_factors"):
rope_scaling_short_factors = np.asarray(
config.longrope_scaling_short_factors).astype(np.float32)
rope_scaling_long_factors = np.asarray(
config.longrope_scaling_long_factors).astype(np.float32)
rope_scaling_short_mscale = config.longrope_short_mscale
rope_scaling_long_mscale = config.longrope_long_mscale

position_embedding_type = PositionEmbeddingType.long_rope
original_max_position_embeddings = config.original_max_position_embeddings

self.attention = Attention(
local_layer_idx=local_layer_idx,
hidden_size=config.hidden_size,
num_attention_heads=config.num_attention_heads,
num_kv_heads=config.num_kv_heads,
position_embedding_type=position_embedding_type,
rotary_embedding_base=config.rotary_embedding_base,
max_position_embeddings=config.max_position_embeddings,
original_max_position_embeddings=original_max_position_embeddings,
dtype=config.dtype,
attention_mask_type=attention_mask_type,
bias=True,
q_scaling=q_scaling,
tp_group=tp_group,
tp_size=tp_size,
quant_mode=config.quant_mode,
rope_scaling_short_factors=rope_scaling_short_factors,
rope_scaling_long_factors=rope_scaling_long_factors,
rope_scaling_short_mscale=rope_scaling_short_mscale,
rope_scaling_long_mscale=rope_scaling_long_mscale,
block_sparse_params=block_sparse_attn_params)

self.post_layernorm = LayerNorm(normalized_shape=config.hidden_size,
dtype=config.dtype)

self.mlp = MLP(hidden_size=config.hidden_size,
ffn_hidden_size=config.intermediate_size,
hidden_act=config.hidden_act,
dtype=config.dtype,
tp_group=tp_group,
tp_size=tp_size,
quant_mode=config.quant_mode)

def forward(
self,
hidden_states: Tensor,
attention_mask=None,
use_cache=False,
kv_cache_params=None,
attention_params=None,
):
residual = hidden_states
input_layernorm_output = self.input_layernorm(hidden_states)

# Self attention
attention_output = self.attention(
input_layernorm_output,
attention_mask=attention_mask,
use_cache=use_cache,
kv_cache_params=kv_cache_params,
attention_params=attention_params,
)

if use_cache:
attention_output, presents = attention_output

hidden_states = residual + attention_output

# Fully connected
residual = hidden_states
hidden_states = self.post_layernorm(hidden_states)
hidden_states = self.mlp(hidden_states, gegelu_limit=self.gegelu_limit)
hidden_states = residual + hidden_states

if use_cache:
return (hidden_states, presents)
return hidden_states


class Phi3SmallModel(Module):

def __init__(self, config: PretrainedConfig):
super().__init__()
self.vocab_embedding = Embedding(num_embeddings=config.vocab_size,
embedding_dim=config.hidden_size,
dtype=config.dtype)

self.layers = DecoderLayerList(Phi3SmallDecoderLayer, config)
self.ln_f = LayerNorm(normalized_shape=config.hidden_size,
dtype=config.dtype)
self.mup_embedding_multiplier = config.mup_embedding_multiplier

def forward(
self,
input_ids: Tensor,
position_ids=None,
use_cache=False,
attention_mask=None,
kv_cache_params=None,
attention_params=None,
prompt_embedding_table=None,
prompt_tasks=None,
prompt_vocab_size=None,
):
args = [prompt_embedding_table, prompt_tasks, prompt_vocab_size
] if prompt_embedding_table is not None else []
hidden_states = self.vocab_embedding(input_ids, *args)

if self.mup_embedding_multiplier is not None and self.mup_embedding_multiplier > 0.0:
hidden_states = hidden_states * self.mup_embedding_multiplier

hidden_states = self.layers(
hidden_states,
use_cache=use_cache,
attention_mask=attention_mask,
kv_cache_params=kv_cache_params,
attention_params=attention_params,
)
if use_cache:
hidden_states, presents = hidden_states

hidden_states = self.ln_f(hidden_states)

if use_cache:
return (hidden_states, tuple(presents))
return hidden_states


class Phi3SmallForCausalLM(DecoderModelForCausalLM):

def __init__(self, config: PretrainedConfig):
transformer = Phi3SmallModel(config)
vocab_size_padded = pad_vocab_size(config.vocab_size,
config.mapping.tp_size)

lm_head = ParallelLMHead(config.hidden_size,
vocab_size_padded,
bias=False,
dtype=config.dtype,
tp_group=config.mapping.tp_group,
tp_size=config.mapping.tp_size,
gather_output=True)

super().__init__(config, transformer, lm_head)

@classmethod
def convert_hf_checkpoint(cls, model_dir, dtype, output_dir, args=None):
'''
Convert Huggingface checkpoint to TRT-LLM checkpoint
'''

hf_model = AutoModelForCausalLM.from_pretrained(model_dir,
torch_dtype="auto",
trust_remote_code=True)

config = convert_hf_config(hf_model.config, dtype, args)
with open(os.path.join(output_dir, 'config.json'), 'w') as f:
json.dump(config, f, indent=4)

def covert_and_save(rank):
weights = convert_hf_weights(hf_model, config, args, rank)
safetensors.torch.save_file(
weights, os.path.join(output_dir, f'rank{rank}.safetensors'))

world_size = args.tp_size * args.pp_size
if args.workers == 1:
for rank in range(world_size):
covert_and_save(rank)
else:
with ThreadPoolExecutor(max_workers=args.workers) as p:
futures = [
p.submit(covert_and_save, rank)
for rank in range(world_size)
]
exceptions = []
for future in as_completed(futures):
try:
future.result()
except Exception as e:
traceback.print_exc()
exceptions.append(e)
assert len(
exceptions
) == 0, "Checkpoint conversion failed, please check error log."
@@ -15,10 +15,6 @@

import torch

from tensorrt_llm.quantization import QuantAlgo

from ...._utils import str_dtype_to_torch


def shuffle_qkv_weights(weights, config):
# Input weights are organized as
@@ -29,8 +25,7 @@ def shuffle_qkv_weights(weights, config):
# (q00, q01, ..., qnm), (k0, k1, .., kn), (v0, v1, .., vn)

num_heads = config['num_attention_heads']
num_kv_heads = config['num_kv_heads'] if 'num_kv_heads' in config.keys(
) else config['num_key_value_heads']
num_kv_heads = config['num_key_value_heads']
num_q_per_kv = num_heads // num_kv_heads

hidden_size = config['hidden_size']
@@ -152,7 +147,7 @@ def get_tllm_linear_weight(weight,

def split_weights_tp(config, weights, args, rank, dtype):
num_heads = config['num_attention_heads']
num_kv_heads = config['num_kv_heads']
num_kv_heads = config['num_key_value_heads']
hidden_size = config['hidden_size']

mha_mode = num_heads == num_kv_heads
@@ -228,96 +223,3 @@ def split_weights_tp(config, weights, args, rank, dtype):
dim=0)

return weights


def convert_hf_weights(hf_model, config, args, rank):
torch_dtype = str_dtype_to_torch(args.dtype)
hf_state_dict = hf_model.state_dict()
weights = {}

# replace key name
for key, value in hf_state_dict.items():
# Decoder Layers
if "model.layers." in key:
key = key.replace("model.layers.", "transformer.layers.")
key = key.replace("self_attn.", "attention.")
key = key.replace("query_key_value.", "qkv.")
key = key.replace("mlp.up_proj.", "mlp.fc.")
key = key.replace("mlp.down_proj.", "mlp.proj.")
key = key.replace("post_attention_layernorm.", "post_layernorm.")
# Embedding
key = key.replace("model.embed_tokens.weight",
"transformer.vocab_embedding.weight")
# Final Layer norm
key = key.replace("model.final_layernorm.", "transformer.ln_f.")
weights[key] = value.to(torch_dtype).cpu()

weights['lm_head.weight'] = weights[
'transformer.vocab_embedding.weight'].clone()

# Transform QKV weights from custom Phi3Small format to TRT-LLM format
for key, value in weights.items():
if "qkv." in key:
weights[key] = shuffle_qkv_weights(weights[key], config)

weights = split_weights_tp(config, weights, args, rank, torch_dtype)

return weights


def convert_hf_config(hf_config, dtype, args):
config = {
'architecture': 'Phi3SmallForCausalLM',
'dtype': dtype,
'num_hidden_layers': hf_config.num_hidden_layers,
'num_attention_heads': hf_config.num_attention_heads,
'num_kv_heads': hf_config.num_key_value_heads,
'rotary_embedding_base': hf_config.rope_embedding_base,
'hidden_size': hf_config.hidden_size,
'intermediate_size': hf_config.intermediate_size,
'vocab_size': hf_config.vocab_size,
'max_position_embeddings': hf_config.max_position_embeddings,
'hidden_act': hf_config.hidden_act,
'share_embedding_table': False,
'gegelu_limit': hf_config.gegelu_limit,
'mup_attn_multiplier': hf_config.mup_attn_multiplier,
'mup_embedding_multiplier': hf_config.mup_embedding_multiplier,
'mup_use_scaling': hf_config.mup_use_scaling,
'mup_width_multiplier': hf_config.mup_width_multiplier,
'blocksparse_block_size': hf_config.blocksparse_block_size,
'blocksparse_homo_head_pattern':
hf_config.blocksparse_homo_head_pattern,
'blocksparse_num_local_blocks': hf_config.blocksparse_num_local_blocks,
'blocksparse_vertical_stride': hf_config.blocksparse_vert_stride,
'dense_attention_every_n_layers':
hf_config.dense_attention_every_n_layers,
}

if args is not None:
config.update({
'mapping': {
'world_size': args.tp_size * args.pp_size,
'tp_size': args.tp_size,
'pp_size': args.pp_size,
}
})

if args.use_weight_only and args.weight_only_precision == 'int8':
config.update({'quantization': {'quant_algo': QuantAlgo.W8A16}})
elif args.use_weight_only and args.weight_only_precision == 'int4':
config.update({'quantization': {'quant_algo': QuantAlgo.W4A16}})

if hf_config.max_position_embeddings >= 128000:
config.update({
'original_max_position_embeddings':
hf_config.original_max_position_embeddings,
'longrope_scaling_short_factors':
hf_config.rope_scaling["short_factor"],
'longrope_scaling_long_factors':
hf_config.rope_scaling["long_factor"],
'longrope_long_mscale':
hf_config.rope_scaling["long_mscale"],
'longrope_short_mscale':
hf_config.rope_scaling["short_mscale"]
})
return config
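Note: a hedged driver sketch for the unified per-rank conversion introduced above. The attribute names on args mirror what the converter reads in this diff (tp_size, pp_size, workers, dtype, use_weight_only, weight_only_precision); the import location, the checkpoint name, and all concrete values are assumptions for illustration, and the real CLI may pass additional attributes.

from argparse import Namespace

from tensorrt_llm.models import Phi3ForCausalLM  # assumed export location

args = Namespace(tp_size=2, pp_size=1, workers=2, dtype="float16",
                 use_weight_only=False, weight_only_precision=None)

# Writes config.json plus one rank{N}.safetensors file per TP*PP rank.
Phi3ForCausalLM.convert_hf_checkpoint(
    hf_model_dir="microsoft/Phi-3-small-8k-instruct",  # any Phi-3 HF checkpoint
    dtype="float16",
    output_dir="./tllm_checkpoint/phi3_2gpu",
    args=args)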
@@ -398,9 +398,9 @@ class RecurrentGemmaForCausalLM(PretrainedModel):
max_batch_size,
max_input_len,
max_seq_len,
max_num_tokens,
use_cache,
max_beam_width: int = 1,
max_num_tokens: int = None,
opt_num_tokens: int = None,
opt_batch_size: int = 0,
prompt_embedding_table_size: int = 0,
@@ -434,44 +434,20 @@ class RecurrentGemmaForCausalLM(PretrainedModel):
self.gather_context_logits = gather_context_logits
mapping = self.config.mapping

default_range = GenerationMixin.default_range
batch_range = default_range(max_batch_size)
bbd_range = [
batch_range[i] * ((max_draft_len + 1) if i != 0 else 1)
for i in range(len(batch_range))
]
inlen_range_cxt = default_range(max_input_len)
inlen_range_gen = [1, 1, max_draft_len + 1]

# basic inputs
enable_ctx_gen_opt_profiles = GenerationMixin.has_ctx_gen_opt_profiles(
use_gpt_attention_plugin, use_gemm_plugin, remove_input_padding,
paged_kv_cache)
if max_num_tokens is None:
max_num_tokens = max(
max_input_len * max_batch_size,
max_beam_width * (max_draft_len + 1) * max_batch_size)
if enable_ctx_gen_opt_profiles:
num_profiles = 2
inlen_range = [inlen_range_cxt, inlen_range_gen]
num_tokens_range_ctx = default_range(max_num_tokens)
num_tokens_range_gen = default_range(
max_batch_size * (max_draft_len + 1) * max_beam_width)
num_tokens_range = [num_tokens_range_ctx, num_tokens_range_gen]
position_ids_inlen_range = [inlen_range_cxt, [1, 1, 1]]
else:
max_bs_x_max_bw = max_batch_size * max_beam_width
if opt_num_tokens is None:
opt_num_tokens = max_bs_x_max_bw
if multiple_profiles:
num_tokens_range = GenerationMixin.split_num_tokens_range(
max_num_tokens)
else:
num_tokens_range = [[1, opt_num_tokens, max_num_tokens]]
num_profiles = len(num_tokens_range)
inlen_range = [[1, 1, max_input_len]] * num_profiles
position_ids_inlen_range = [[1, 1, max_input_len]] * num_profiles
bb_range = [batch_range] * num_profiles
num_profiles, ranges = GenerationMixin.get_profiles_ranges(
max_batch_size=max_batch_size,
max_beam_width=max_beam_width,
max_input_len=max_input_len,
max_num_tokens=max_num_tokens,
max_draft_len=max_draft_len,
opt_batch_size=opt_batch_size,
opt_num_tokens=opt_num_tokens,
enable_ctx_gen_opt_profiles=enable_ctx_gen_opt_profiles,
multiple_profiles=multiple_profiles)

if remove_input_padding:
assert use_mamba_conv1d_plugin, "mamba_conv1d_plugin is needed to support remove_input_padding"
@@ -479,14 +455,14 @@ class RecurrentGemmaForCausalLM(PretrainedModel):
dtype=trt.int32,
shape=[-1],
dim_range=OrderedDict([
('num_tokens', num_tokens_range),
('num_tokens', ranges['num_tokens_range']),
]))
position_ids = Tensor(name='position_ids',
dtype=trt.int32,
shape=[-1],
dim_range=OrderedDict([
('position_ids_num_tokens_range',
num_tokens_range),
ranges['num_tokens_range']),
]))
else:
input_ids = Tensor(name='input_ids',
@@ -494,16 +470,17 @@ class RecurrentGemmaForCausalLM(PretrainedModel):
shape=[-1, -1],
dim_range=OrderedDict([
('batch_size_beam_width',
[batch_range] * num_profiles),
('input_len', inlen_range),
ranges['bb_range']),
('input_len', ranges['inlen_range']),
]))
position_ids = Tensor(name='position_ids',
dtype=trt.int32,
shape=[-1, -1],
dim_range=OrderedDict([
('batch_size_beam_width', bb_range),
('batch_size_beam_width',
ranges['bb_range']),
('position_ids_inlen_range',
position_ids_inlen_range),
ranges['position_ids_inlen_range']),
]))
if use_custom_all_reduce and mapping.tp_size > 1:
current_all_reduce_helper().set_workspace_tensor(
@@ -559,7 +536,8 @@ class RecurrentGemmaForCausalLM(PretrainedModel):
name='host_request_types',
dtype=trt.int32,
shape=[-1],
dim_range=OrderedDict([('batch_size_beam_width', bb_range)]),
dim_range=OrderedDict([('batch_size_beam_width',
ranges['bb_range'])]),
)

last_token_ids = Tensor(
@@ -567,7 +545,7 @@ class RecurrentGemmaForCausalLM(PretrainedModel):
dtype=trt.int32,
shape=[-1],
dim_range=OrderedDict([
('batch_size_last_token_ids', [bbd_range] * num_profiles),
('batch_size_last_token_ids', ranges['bbd_range']),
]),
)
last_token_ids_for_logits = None
@@ -581,7 +559,8 @@ class RecurrentGemmaForCausalLM(PretrainedModel):
name='host_context_lengths',
dtype=trt.int32,
shape=[-1],
dim_range=OrderedDict([('batch_size_beam_width', bb_range)]),
dim_range=OrderedDict([('batch_size_beam_width',
ranges['bb_range'])]),
)
else:
host_context_lengths = None
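Note: the prepare_inputs refactor above replaces the hand-rolled profile bookkeeping with a single GenerationMixin.get_profiles_ranges() call that returns a dict of per-profile [min, opt, max] ranges. A minimal sketch of the shape of that dict is below; the key names come from the diff, while the number of profiles and all values are hypothetical.

# Hypothetical result for two optimization profiles; values are illustrative only.
num_profiles = 2
ranges = {
    'bb_range': [[1, 4, 8], [1, 4, 8]],                      # batch_size * beam_width
    'bbd_range': [[1, 4, 8], [1, 4, 8]],                     # batch size scaled by draft tokens
    'inlen_range': [[1, 1, 1024], [1, 1, 1024]],             # per-sequence input length
    'num_tokens_range': [[1, 4096, 8192], [1, 4096, 8192]],  # packed token count
    'position_ids_inlen_range': [[1, 1, 1024], [1, 1, 1024]],
}

# Each entry holds one [min, opt, max] triple per TensorRT optimization profile,
# which is what the dim_range=OrderedDict([...]) arguments above consume.
assert all(len(v) == num_profiles for v in ranges.values())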
@@ -883,7 +883,6 @@ class FP8Linear(Linear):
assert lora_runtime_params is None or default_net(
).plugin_config.lora_plugin == self.dtype

lora_hidden_state = x if lora_runtime_params is not None else None
if default_net().strongly_typed:
assert is_same_dtype(
x.dtype,
@@ -894,8 +893,13 @@ class FP8Linear(Linear):
activation_scaling_factor = cast(activation_scaling_factor, self.dtype)
if x.dtype != trt.fp8:
quantized_out = quantize(x, activation_scaling_factor, 'fp8')
lora_hidden_state = x if lora_runtime_params is not None else None
else:
quantized_out = x
# TODO: add fp8 LoRA support
lora_hidden_state = dequantize(
x, activation_scaling_factor, -1,
self.dtype) if lora_runtime_params is not None else None

weights_scaling_factor = constant(
self.weights_scaling_factor.raw_value.copy())
@@ -956,14 +960,18 @@ class FP8RowLinear(RowLinear):
assert lora_runtime_params is None or default_net(
).plugin_config.lora_plugin == self.dtype

lora_hidden_state = x if lora_runtime_params is not None else None
activation_scaling_factor = constant(
self.activation_scaling_factor.raw_value.copy())
activation_scaling_factor = cast(activation_scaling_factor, self.dtype)
if x.dtype != trt.fp8:
quantized_out = quantize(x, activation_scaling_factor, 'fp8')
lora_hidden_state = x if lora_runtime_params is not None else None
else:
quantized_out = x
# TODO: add fp8 LoRA support
lora_hidden_state = dequantize(
x, activation_scaling_factor, -1,
self.dtype) if lora_runtime_params is not None else None

weights_scaling_factor = constant(
self.weights_scaling_factor.raw_value.copy())
@@ -57,7 +57,10 @@ def quantize_layers(
if preprocess_init_params is not None:
preprocess_init_params(init_params, name, module)
quant_layer = quant_cls(**init_params)
setattr(parent, module_name, quant_layer)
if parent is not None:
setattr(parent, module_name, quant_layer)
else:
model = quant_layer

setattr(model, 'quant_mode', quant_config.quant_mode)
return model
@@ -78,7 +81,7 @@ def weight_only_quantize(model, quant_config: QuantConfig):
module_name = name.rsplit('.', 1)[-1]
init_params["transb"] = module_name == "lm_head"

quantize_layers(
model = quantize_layers(
model,
quant_config,
quant_map,
@@ -102,7 +105,7 @@ def weight_only_groupwise_quantize(model, quant_config: QuantConfig):
init_params[
"use_w4a8_awq"] = quant_config.quant_algo == QuantAlgo.W4A8_AWQ

quantize_layers(
model = quantize_layers(
model,
quant_config,
quant_map,
@@ -120,7 +123,7 @@ def smooth_quantize_ootb(
RowLinear: Int8SmoothQuantRowLinear,
}

quantize_layers(
model = quantize_layers(
model,
quant_config,
quant_map,
@@ -138,7 +141,7 @@ def smooth_quantize_plugin(model, quant_mode):
}
for name, layer, parent in model.named_modules_with_parent():
layer_name = name.rsplit('.', 1)[-1]
if layer_name in ['ln_f']:
if layer_name in ['ln_f', 'ln_embed']:
continue

quant_cls = None
@@ -156,7 +159,10 @@ def smooth_quantize_plugin(model, quant_mode):
init_params[
"num_attention_heads"] = layer.num_attention_heads * layer.tp_size
quant_layer = quant_cls(**init_params)
setattr(parent, layer_name, quant_layer)
if parent is not None:
setattr(parent, layer_name, quant_layer)
else:
model = quant_layer

setattr(model, 'quant_mode', quant_mode)
return model
@@ -178,7 +184,7 @@ def fp8_quantize(model, quant_config: QuantConfig):
RowLinear: FP8RowLinear,
}

quantize_layers(
model = quantize_layers(
model,
quant_config,
quant_map,
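Note: the callers above now re-assign the return value of quantize_layers because, when the module being swapped has no parent (it is the root module itself), setattr(parent, name, ...) cannot be used and rebinding inside the helper is purely local. A minimal standalone sketch of that pattern (not TRT-LLM code, class names hypothetical):

class Root:
    pass

class QuantRoot:
    pass

def replace(model, parent, name, quant_cls):
    quant_layer = quant_cls()
    if parent is not None:
        setattr(parent, name, quant_layer)   # usual case: patch the attribute on the parent
    else:
        model = quant_layer                  # root case: rebinding is local to this function...
    return model                             # ...so the (possibly new) root must be returned

model = Root()
model = replace(model, parent=None, name="", quant_cls=QuantRoot)
assert isinstance(model, QuantRoot)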
@@ -31,10 +31,12 @@ import numpy as np
import safetensors
import torch
from datasets import load_dataset
from safetensors.torch import load_file, save_file
from torch.utils.data import DataLoader
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer

from ..logger import logger
from ..mapping import Mapping
from .mode import QuantAlgo

EMPTY_CFG = {
@@ -122,7 +124,8 @@ MODEL_NAME_PATTERN_MAP = {
"Gemma": "gemma",
"MixtralForCausalLM": "llama",
"ArcticForCausalLM": "llama",
"Phi3SmallForCausalLM": "phi",
"Phi3SmallForCausalLM": "phi3small",
"Phi3ForCausalLM": "phi3",
}


@@ -263,10 +266,95 @@ def quantize_model(model, quant_cfg, calib_dataloader=None):
return model


def quantize_and_export(*, model_dir, device, calib_dataset, dtype, qformat,
kv_cache_dtype, calib_size, batch_size,
calib_max_seq_length, awq_block_size, output_dir,
tp_size, pp_size, seed, tokenizer_max_seq_length):
def combine_medusa_weight(tp_size, pp_size, base_model_output_dir,
num_medusa_heads, num_medusa_layers, max_draft_len,
medusa_hidden_act, medusa_model_dir,
quant_medusa_head):

with open(f"{medusa_model_dir}/config.json", "r") as fp:
medusa_config = json.load(fp)

num_medusa_heads_from_config = medusa_config.get('medusa_num_heads',
num_medusa_heads)
num_medusa_layers = medusa_config.get('medusa_num_layers',
num_medusa_layers)
if num_medusa_heads is None:
num_medusa_heads = num_medusa_heads_from_config

assert max_draft_len > 0, "should have max_draft_len > 0"

world_size = tp_size * pp_size
# Process for each rank
for rank in range(world_size):
mapping = Mapping(world_size=world_size,
rank=rank,
tp_size=tp_size,
pp_size=pp_size)
# 1. Load medusa weight for each rank
from tensorrt_llm.models.medusa.weight import load_medusa_hf
medusa_weights = load_medusa_hf(medusa_path=medusa_model_dir,
num_medusa_heads=num_medusa_heads,
num_medusa_layers=num_medusa_layers,
mapping=mapping,
dtype="float16")
# 2. Load base model safetensors (after quant)
base_model_weights = load_file(
f"{base_model_output_dir}/rank{rank}.safetensors")

# 3. Combine and save weight
base_model_weights.update(medusa_weights)
save_file(base_model_weights,
f"{base_model_output_dir}/rank{rank}.safetensors")

# 4. Add medusa config into config.json
with open(f"{base_model_output_dir}/config.json", 'r') as f:
base_model_config = json.load(f)
f.close()

with open(f"{base_model_output_dir}/config.json", 'w') as f:
base_model_config['architecture'] = "MedusaForCausalLM"
base_model_config['quantization']['exclude_modules'] = [
'lm_head',
'*router',
'*vocab_embedding',
'*position_embedding',
'*block_embedding',
]
if not quant_medusa_head:
base_model_config['quantization']['exclude_modules'].append(
'*medusa_heads*')

base_model_config['max_draft_len'] = max_draft_len
base_model_config['num_medusa_heads'] = num_medusa_heads
base_model_config['num_medusa_layers'] = num_medusa_layers
json.dump(base_model_config, f, indent=4)

torch.cuda.empty_cache()
print("Combine medusa heads' weight, done.")


def quantize_and_export(*,
model_dir,
device,
calib_dataset,
dtype,
qformat,
kv_cache_dtype,
calib_size,
batch_size,
calib_max_seq_length,
awq_block_size,
output_dir,
tp_size,
pp_size,
seed,
tokenizer_max_seq_length,
num_medusa_heads=None,
num_medusa_layers=None,
max_draft_len=None,
medusa_hidden_act=None,
medusa_model_dir=None,
quant_medusa_head=None):
'''
Load model from the model_dir, call Modelopt to quantize the model, and then export
the quantized model as TRT-LLM checkpoint
@@ -419,24 +507,16 @@ def quantize_and_export(*, model_dir, device, calib_dataset, dtype, qformat,
with open(f"{export_path}/config.json", "w") as f:
json.dump(tensorrt_llm_config, f, indent=4)

if model_type == 'phi':
with open(f"{export_path}/config.json", "r") as f:
tensorrt_llm_config = json.load(f)
phi_config = AutoConfig.from_pretrained(model_dir,
trust_remote_code=True)

from ..models.phi3.phi3small.convert import \
convert_hf_config as phi_config_converter
phi_config = phi_config_converter(phi_config, dtype, None)

for key, value in phi_config.items():
tensorrt_llm_config[key] = value

with open(f"{export_path}/config.json", "w") as f:
json.dump(tensorrt_llm_config, f, indent=4)

torch.cuda.empty_cache(
) # otherwise torch is keeping using GPU, other routine like build engine has less free GPU to use

# Workaround for combining medusa head
# TODO: move these integration into modelopt to avoid redundant reading and writing
if medusa_model_dir is not None:
combine_medusa_weight(tp_size, pp_size, export_path,
num_medusa_heads, num_medusa_layers,
max_draft_len, medusa_hidden_act,
medusa_model_dir, quant_medusa_head)
end_time = time.time()
print(
"Quantized model exported to {} \nTotal time used {:.2f} s.".format(
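Note: a hedged usage sketch for the extended quantize_and_export signature above. The parameter names are taken from the diff; the module path and every value below are assumptions for illustration (an FP8 base model plus a separately trained Medusa head checkpoint, combined via combine_medusa_weight after export).

from tensorrt_llm.quantization.quantize_by_modelopt import \
    quantize_and_export  # assumed module path

quantize_and_export(
    model_dir="./base-model-hf", device="cuda", calib_dataset="cnn_dailymail",
    dtype="float16", qformat="fp8", kv_cache_dtype="fp8",
    calib_size=512, batch_size=1, calib_max_seq_length=2048,
    awq_block_size=128, output_dir="./tllm_checkpoint/medusa_fp8",
    tp_size=1, pp_size=1, seed=0, tokenizer_max_seq_length=2048,
    # new Medusa arguments: passing medusa_model_dir triggers combine_medusa_weight()
    num_medusa_heads=4, num_medusa_layers=1, max_draft_len=63,
    medusa_hidden_act="silu", medusa_model_dir="./medusa-head-hf",
    quant_medusa_head=False)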
@@ -12,4 +12,4 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
__version__ = "0.11.0.dev2024061800"
__version__ = "0.11.0.dev2024062500"
@@ -84,10 +84,12 @@ class TestMamba(unittest.TestCase):
hf_config, hf_path, hf_mamba, load_mode, dtype)
with net_guard(network):
network.set_named_parameters(tensorrt_llm_mamba.named_parameters())
inputs = tensorrt_llm_mamba.prepare_inputs(batch_size,
input_len,
input_len + output_len,
use_cache=False)
inputs = tensorrt_llm_mamba.prepare_inputs(
batch_size,
input_len,
input_len + output_len,
max_num_tokens=batch_size * input_len,
use_cache=False)
# Prepare
tensorrt_llm_mamba(**inputs)
return network
@@ -80,7 +80,7 @@ test_gptq() {
python convert_checkpoint.py --model_dir ${MODEL} \
--output_dir ./tllm_checkpoint/2gpu_gptq \
--dtype float16 \
--modelopt_quant_ckpt_path /home/scratch.trt_llm_data/llm-models/int4-quantized-gptq-awq/llama-7b-4bit-gs128.safetensors \
--quant_ckpt_path /home/scratch.trt_llm_data/llm-models/int4-quantized-gptq-awq/llama-7b-4bit-gs128.safetensors \
--use_weight_only \
--weight_only_precision int4_gptq \
--per_group \