Mirror of https://github.com/NVIDIA/TensorRT-LLM.git
AWQ support Modelopt ckpts. (#3258)
Signed-off-by: Tracin <10434017+Tracin@users.noreply.github.com>
Co-authored-by: QI JUN <22017000+QiJune@users.noreply.github.com>
This commit is contained in:
parent b763051ba4
commit bb6c338730
@@ -1705,14 +1705,16 @@ def preprocess_perlayer_weights(weights,
                 dtype = torch.float16
                 if model_config.dtype == "bfloat16":
                     dtype = torch.bfloat16
-                weights[name] = preprocessor(param.T, torch.quint4x2,
-                                             activation_type).view(dtype)
-            if name.endswith('weights_scaling_factor'
-                             ) and param.shape[0] > param.shape[1]:
-                # TODO: refine on supporting ModelOpt HF-AWQ
-                weights[name] = param.T.contiguous().to(
+                weights[name] = preprocessor(param.transpose(-1, -2),
+                                             torch.quint4x2,
+                                             activation_type).view(dtype)
+            if name.endswith('weights_scaling_factor'):
+                weights[name] = param.transpose(-1, -2).contiguous().to(
                     str_dtype_to_torch(model_config.dtype))
             if name.endswith('prequant_scaling_factor'):
+                if len(weights[name].shape) == 2:
+                    # MoE experts share the same scaling factor.
+                    param = param[0, :]
                 weights[name] = param.reshape(1, -1)
             if model_config.mapping.tp_rank > 0:
                 if name.endswith('attention.dense.bias') or name.endswith(
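Note on the hunk above: torch.Tensor.T is only well defined for 2-D weights, while ModelOpt MoE checkpoints stack all experts into a single 3-D tensor, so the loader now swaps just the last two dimensions with transpose(-1, -2) and keeps a single copy of the per-expert prequant scaling factor. A minimal sketch of the shape handling (tensor names and sizes here are illustrative, not taken from any checkpoint):

import torch

# 2-D dense weight: .T and transpose(-1, -2) agree.
w2d = torch.randint(-8, 8, (128, 64), dtype=torch.int8)
assert torch.equal(w2d.T, w2d.transpose(-1, -2))

# 3-D stacked MoE weight, e.g. (num_experts, in_features, out_features):
# only the last two dims should be swapped; .T on a 3-D tensor is deprecated.
w3d = torch.randint(-8, 8, (8, 128, 64), dtype=torch.int8)
print(w3d.transpose(-1, -2).shape)  # torch.Size([8, 64, 128])

# Prequant scaling factors stored once per expert but identical across experts
# can be collapsed to one row, matching the param[0, :].reshape(1, -1) above.
scale = torch.ones(8, 128, dtype=torch.float16)
shared = scale[0, :].reshape(1, -1)  # shape (1, 128)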
@@ -888,24 +888,39 @@ def test_llm_mixtral_1gpu_fp4_llmapi(
     venv_check_call(llm_venv, mmlu_cmd)


-@pytest.mark.parametrize("model_name", ['mixtral-8x7b-v0.1-AWQ'])
+@pytest.mark.parametrize(
+    "model_name", ['mixtral-8x7b-v0.1-AWQ', 'Mixtral-8x7B-Instruct-v0.1'])
 def test_llm_mixtral_int4_awq_1gpu_summary(llama_example_root,
                                            llm_datasets_root, model_name,
                                            llm_rouge_root, llm_venv, cmodel_dir,
-                                           engine_dir):
+                                           engine_dir,
+                                           qcache_dir_without_install_package):
     models_root = llm_models_root()
     model_dir = os.path.join(models_root, model_name)
     ckpt_dir = os.path.join(cmodel_dir, model_name)

-    print("Convert checkpoint...")
-    convert_cmd = [
-        f"{llama_example_root}/convert_checkpoint.py",
-        "--model_dir",
-        model_dir,
-        "--output_dir",
-        ckpt_dir,
-    ]
-    venv_check_call(llm_venv, convert_cmd)
+    if 'AWQ' in model_name:
+        print("Convert checkpoint...")
+        convert_cmd = [
+            f"{llama_example_root}/convert_checkpoint.py",
+            "--model_dir",
+            model_dir,
+            "--output_dir",
+            ckpt_dir,
+        ]
+        venv_check_call(llm_venv, convert_cmd)
+    else:
+        print("Quantizing model...")
+        ckpt_dir = quantize_data(
+            llm_venv,
+            llama_example_root,
+            model_dir=model_dir,
+            calib_dataset=f"{llm_datasets_root}/cnn_dailymail",
+            dtype="float16",
+            qformat="int4_awq",
+            quantize_dir=qcache_dir_without_install_package,
+            tp_size=1,
+            calib_size=32)

     print("Build engines...")
     build_cmd = [
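For the new non-AWQ parameter ('Mixtral-8x7B-Instruct-v0.1'), the test now quantizes the raw Hugging Face model itself through the quantize_data helper instead of converting a pre-quantized checkpoint. As a rough sketch of what that step amounts to when run by hand, assuming the ModelOpt-based examples/quantization/quantize.py entry point and placeholder paths (the flag names mirror the helper's keyword arguments above and are not verified against the script):

import subprocess

# Hypothetical stand-alone equivalent of the quantize_data(...) call above.
quantize_cmd = [
    "python", "examples/quantization/quantize.py",
    "--model_dir", "/models/Mixtral-8x7B-Instruct-v0.1",   # placeholder path
    "--dtype", "float16",
    "--qformat", "int4_awq",
    "--calib_size", "32",
    "--output_dir", "/tmp/mixtral-int4-awq-ckpt",          # placeholder path
]
subprocess.check_call(quantize_cmd)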
@@ -180,6 +180,7 @@ examples/test_mixtral.py::test_llm_mixtral_wo_2gpus_summary[Mixtral-8x7B-v0.1-in
 examples/test_mixtral.py::test_llm_mixtral_wo_2gpus_summary[Mixtral-8x7B-v0.1-int8-nb:4]
 examples/test_mixtral.py::test_llm_mixtral_1gpu_fp4_llmapi[Mixtral-8x7B-Instruct-v0.1]
 examples/test_mixtral.py::test_llm_mixtral_int4_awq_1gpu_summary[mixtral-8x7b-v0.1-AWQ]
+examples/test_mixtral.py::test_llm_mixtral_int4_awq_1gpu_summary[Mixtral-8x7B-Instruct-v0.1]
 examples/test_multimodal.py::test_llm_multimodal_general[Phi-3-vision-128k-instruct-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1]
 examples/test_multimodal.py::test_llm_multimodal_general[Phi-3-vision-128k-instruct-pp:1-tp:1-float16-bs:8-cpp_e2e:False-nb:1]
 examples/test_multimodal.py::test_llm_multimodal_general[Phi-3.5-vision-instruct-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1]
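The entry added to the test list registers the new Instruct variant with the suite's harness; outside that harness the same case can be selected by its node id, e.g. (a minimal sketch, assuming the test file is reachable from the working directory and the suite's fixtures and model paths are already set up):

import pytest

# Run only the newly registered parametrization of the INT4-AWQ summary test.
pytest.main([
    "examples/test_mixtral.py::"
    "test_llm_mixtral_int4_awq_1gpu_summary[Mixtral-8x7B-Instruct-v0.1]",
])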