AWQ: support ModelOpt checkpoints. (#3258)

Signed-off-by: Tracin <10434017+Tracin@users.noreply.github.com>
Co-authored-by: QI JUN <22017000+QiJune@users.noreply.github.com>
commit bb6c338730 (parent b763051ba4)
Authored by Tracin on 2025-04-04 08:10:35 +08:00; committed by GitHub
3 changed files with 34 additions and 16 deletions

@@ -1705,14 +1705,16 @@ def preprocess_perlayer_weights(weights,
                 dtype = torch.float16
                 if model_config.dtype == "bfloat16":
                     dtype = torch.bfloat16
-                weights[name] = preprocessor(param.T, torch.quint4x2,
+                weights[name] = preprocessor(param.transpose(-1, -2),
+                                             torch.quint4x2,
                                              activation_type).view(dtype)
-            if name.endswith('weights_scaling_factor'
-                             ) and param.shape[0] > param.shape[1]:
-                # TODO: refine on supporting ModelOpt HF-AWQ
-                weights[name] = param.T.contiguous().to(
+            if name.endswith('weights_scaling_factor'):
+                weights[name] = param.transpose(-1, -2).contiguous().to(
                     str_dtype_to_torch(model_config.dtype))
             if name.endswith('prequant_scaling_factor'):
+                if len(weights[name].shape) == 2:
+                    # MoE experts share the same scaling factor.
+                    param = param[0, :]
                 weights[name] = param.reshape(1, -1)
             if model_config.mapping.tp_rank > 0:
                 if name.endswith('attention.dense.bias') or name.endswith(
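The converter change above generalizes the AWQ path from 2-D per-layer weights to the 3-D stacked-expert tensors found in ModelOpt MoE checkpoints: transpose(-1, -2) swaps only the last two dimensions, whereas .T reverses all of them, and the experts' shared prequant_scaling_factor is collapsed to a single row. A minimal sketch, not part of the patch, with illustrative shapes only:

import torch

# Illustrative shapes: a single linear weight vs. stacked MoE expert weights.
w_2d = torch.randn(128, 64)        # (out_features, in_features)
w_3d = torch.randn(8, 128, 64)     # (num_experts, out_features, in_features)

assert w_2d.transpose(-1, -2).shape == (64, 128)      # same result as w_2d.T
assert w_3d.transpose(-1, -2).shape == (8, 64, 128)   # per-expert transpose
# Reversing every dim (what .T would do on a 3-D tensor) gives (64, 128, 8),
# which pushes the expert axis into the GEMM dimensions.

# prequant_scaling_factor: experts share one scaling vector, so a 2-D
# (num_experts, hidden) tensor reduces to its first row.
pre_q = torch.randn(8, 64)
shared = pre_q[0, :].reshape(1, -1)  # shape (1, 64)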

@@ -888,24 +888,39 @@ def test_llm_mixtral_1gpu_fp4_llmapi(
     venv_check_call(llm_venv, mmlu_cmd)
-@pytest.mark.parametrize("model_name", ['mixtral-8x7b-v0.1-AWQ'])
+@pytest.mark.parametrize(
+    "model_name", ['mixtral-8x7b-v0.1-AWQ', 'Mixtral-8x7B-Instruct-v0.1'])
 def test_llm_mixtral_int4_awq_1gpu_summary(llama_example_root,
                                            llm_datasets_root, model_name,
                                            llm_rouge_root, llm_venv, cmodel_dir,
-                                           engine_dir):
+                                           engine_dir,
+                                           qcache_dir_without_install_package):
     models_root = llm_models_root()
     model_dir = os.path.join(models_root, model_name)
     ckpt_dir = os.path.join(cmodel_dir, model_name)
-    print("Convert checkpoint...")
-    convert_cmd = [
-        f"{llama_example_root}/convert_checkpoint.py",
-        "--model_dir",
-        model_dir,
-        "--output_dir",
-        ckpt_dir,
-    ]
-    venv_check_call(llm_venv, convert_cmd)
+    if 'AWQ' in model_name:
+        print("Convert checkpoint...")
+        convert_cmd = [
+            f"{llama_example_root}/convert_checkpoint.py",
+            "--model_dir",
+            model_dir,
+            "--output_dir",
+            ckpt_dir,
+        ]
+        venv_check_call(llm_venv, convert_cmd)
+    else:
+        print("Quantizing model...")
+        ckpt_dir = quantize_data(
+            llm_venv,
+            llama_example_root,
+            model_dir=model_dir,
+            calib_dataset=f"{llm_datasets_root}/cnn_dailymail",
+            dtype="float16",
+            qformat="int4_awq",
+            quantize_dir=qcache_dir_without_install_package,
+            tp_size=1,
+            calib_size=32)
     print("Build engines...")
     build_cmd = [
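The test now covers both ingestion paths: a checkpoint that is already AWQ-quantized only needs convert_checkpoint.py, while the unquantized Instruct model is quantized on the fly into the ModelOpt int4_awq format before the engine build. A rough, self-contained sketch of that branching; the script paths, flags, and the subprocess wrapper below are assumptions for illustration (the test itself goes through the venv_check_call and quantize_data helpers):

import subprocess

def prepare_trtllm_ckpt(model_dir: str, out_dir: str) -> str:
    """Return a TensorRT-LLM checkpoint dir ready for trtllm-build."""
    if 'AWQ' in model_dir:
        # Pre-quantized HF-AWQ weights: format conversion only.
        cmd = [
            "python", "examples/llama/convert_checkpoint.py",
            "--model_dir", model_dir,
            "--output_dir", out_dir,
        ]
    else:
        # Unquantized HF weights: calibrate and quantize with ModelOpt int4_awq.
        # Script path and flags mirror the usual examples/quantization/quantize.py
        # interface; verify them against the checked-out tree.
        cmd = [
            "python", "examples/quantization/quantize.py",
            "--model_dir", model_dir,
            "--dtype", "float16",
            "--qformat", "int4_awq",
            "--calib_size", "32",
            "--output_dir", out_dir,
        ]
    subprocess.check_call(cmd)
    return out_dir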

@@ -180,6 +180,7 @@ examples/test_mixtral.py::test_llm_mixtral_wo_2gpus_summary[Mixtral-8x7B-v0.1-in
 examples/test_mixtral.py::test_llm_mixtral_wo_2gpus_summary[Mixtral-8x7B-v0.1-int8-nb:4]
 examples/test_mixtral.py::test_llm_mixtral_1gpu_fp4_llmapi[Mixtral-8x7B-Instruct-v0.1]
 examples/test_mixtral.py::test_llm_mixtral_int4_awq_1gpu_summary[mixtral-8x7b-v0.1-AWQ]
+examples/test_mixtral.py::test_llm_mixtral_int4_awq_1gpu_summary[Mixtral-8x7B-Instruct-v0.1]
 examples/test_multimodal.py::test_llm_multimodal_general[Phi-3-vision-128k-instruct-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1]
 examples/test_multimodal.py::test_llm_multimodal_general[Phi-3-vision-128k-instruct-pp:1-tp:1-float16-bs:8-cpp_e2e:False-nb:1]
 examples/test_multimodal.py::test_llm_multimodal_general[Phi-3.5-vision-instruct-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1]