mirror of
https://github.com/NVIDIA/TensorRT-LLM.git
synced 2026-02-11 13:33:40 +08:00
* Update TensorRT-LLM --------- Co-authored-by: RunningLeon <mnsheng@yeah.net> Co-authored-by: Tlntin <TlntinDeng01@Gmail.com> Co-authored-by: ZHENG, Zhen <zhengzhen.z@qq.com> Co-authored-by: Pham Van Ngoan <ngoanpham1196@gmail.com> Co-authored-by: Nathan Price <nathan@abridge.com> Co-authored-by: Tushar Goel <tushar.goel.ml@gmail.com> Co-authored-by: Mati <132419219+matichon-vultureprime@users.noreply.github.com>
64 lines
2.0 KiB
Python
64 lines
2.0 KiB
Python
# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
# SPDX-License-Identifier: Apache-2.0
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
from typing import Union
|
|
|
|
import numpy as np
|
|
import torch
|
|
|
|
|
|
def gen_suffix(rank, use_smooth_quant, quant_per_channel):
|
|
suffix = f"{rank}.bin"
|
|
if use_smooth_quant:
|
|
sq_prefix = "int8."
|
|
if quant_per_channel:
|
|
sq_prefix += "col."
|
|
suffix = sq_prefix + suffix
|
|
return suffix
|
|
|
|
|
|
def extract_layer_idx(name):
|
|
ss = name.split('.')
|
|
for s in ss:
|
|
if s.isdigit():
|
|
return s
|
|
return None
|
|
|
|
|
|
def split(v: Union[np.ndarray, torch.Tensor],
|
|
tp_size: int,
|
|
tp_rank: int,
|
|
dim=0):
|
|
if tp_size == 1:
|
|
return v
|
|
assert len(v.shape) > 1 or dim == 0
|
|
if isinstance(v, np.ndarray):
|
|
return np.ascontiguousarray(
|
|
np.split(v, tp_size, axis=dim)[tp_rank].copy())
|
|
else:
|
|
assert v.shape[dim] % tp_size == 0, \
|
|
'Unable to split: shape={v.shape} (dim={dim}) tp_size={tp_size}.'
|
|
split_size = v.shape[dim] // tp_size
|
|
return v.split(split_size, dim=dim)[tp_rank].clone().detach()
|
|
|
|
|
|
def dup_kv_weight(v, num_head, tp_size):
|
|
assert tp_size % num_head == 0
|
|
reps = tp_size // num_head
|
|
head_size = v.shape[0] // num_head
|
|
v = v.reshape(num_head, head_size,
|
|
-1)[:, None, :, :].expand(num_head, reps, head_size,
|
|
v.shape[1])
|
|
return v.reshape(num_head * reps * head_size, -1).clone().detach()
|