Mirror of https://github.com/NVIDIA/TensorRT-LLM.git
AWQ support Modelopt ckpts. (#3258)
Signed-off-by: Tracin <10434017+Tracin@users.noreply.github.com>
Co-authored-by: QI JUN <22017000+QiJune@users.noreply.github.com>
This commit is contained in:
parent b763051ba4
commit bb6c338730
@@ -1705,14 +1705,16 @@ def preprocess_perlayer_weights(weights,
                 dtype = torch.float16
                 if model_config.dtype == "bfloat16":
                     dtype = torch.bfloat16
-                weights[name] = preprocessor(param.T, torch.quint4x2,
-                                             activation_type).view(dtype)
-            if name.endswith('weights_scaling_factor'
-                             ) and param.shape[0] > param.shape[1]:
-                # TODO: refine on supporting ModelOpt HF-AWQ
-                weights[name] = param.T.contiguous().to(
+                weights[name] = preprocessor(param.transpose(-1, -2),
+                                             torch.quint4x2,
+                                             activation_type).view(dtype)
+            if name.endswith('weights_scaling_factor'):
+                weights[name] = param.transpose(-1, -2).contiguous().to(
                     str_dtype_to_torch(model_config.dtype))
             if name.endswith('prequant_scaling_factor'):
+                if len(weights[name].shape) == 2:
+                    # MoE experts share the same scaling factor.
+                    param = param[0, :]
                 weights[name] = param.reshape(1, -1)
             if model_config.mapping.tp_rank > 0:
                 if name.endswith('attention.dense.bias') or name.endswith(
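Note on the hunk above: torch.Tensor.T is only well defined for 2-D weights, while ModelOpt MoE checkpoints stack all experts into a single 3-D tensor, so the loader now swaps just the last two dimensions with transpose(-1, -2) and keeps a single copy of the per-expert prequant scaling factor. A minimal sketch of the shape handling (tensor names and sizes here are illustrative, not taken from any checkpoint):

import torch

# 2-D dense weight: .T and transpose(-1, -2) agree.
w2d = torch.randint(-8, 8, (128, 64), dtype=torch.int8)
assert torch.equal(w2d.T, w2d.transpose(-1, -2))

# 3-D stacked MoE weight, e.g. (num_experts, in_features, out_features):
# only the last two dims should be swapped; .T on a 3-D tensor is deprecated.
w3d = torch.randint(-8, 8, (8, 128, 64), dtype=torch.int8)
print(w3d.transpose(-1, -2).shape)  # torch.Size([8, 64, 128])

# Prequant scaling factors stored once per expert but identical across experts
# can be collapsed to one row, matching the param[0, :].reshape(1, -1) above.
scale = torch.ones(8, 128, dtype=torch.float16)
shared = scale[0, :].reshape(1, -1)  # shape (1, 128)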
@@ -888,24 +888,39 @@ def test_llm_mixtral_1gpu_fp4_llmapi(
     venv_check_call(llm_venv, mmlu_cmd)


-@pytest.mark.parametrize("model_name", ['mixtral-8x7b-v0.1-AWQ'])
+@pytest.mark.parametrize(
+    "model_name", ['mixtral-8x7b-v0.1-AWQ', 'Mixtral-8x7B-Instruct-v0.1'])
 def test_llm_mixtral_int4_awq_1gpu_summary(llama_example_root,
                                            llm_datasets_root, model_name,
                                            llm_rouge_root, llm_venv, cmodel_dir,
-                                           engine_dir):
+                                           engine_dir,
+                                           qcache_dir_without_install_package):
     models_root = llm_models_root()
     model_dir = os.path.join(models_root, model_name)
     ckpt_dir = os.path.join(cmodel_dir, model_name)

-    print("Convert checkpoint...")
-    convert_cmd = [
-        f"{llama_example_root}/convert_checkpoint.py",
-        "--model_dir",
-        model_dir,
-        "--output_dir",
-        ckpt_dir,
-    ]
-    venv_check_call(llm_venv, convert_cmd)
+    if 'AWQ' in model_name:
+        print("Convert checkpoint...")
+        convert_cmd = [
+            f"{llama_example_root}/convert_checkpoint.py",
+            "--model_dir",
+            model_dir,
+            "--output_dir",
+            ckpt_dir,
+        ]
+        venv_check_call(llm_venv, convert_cmd)
+    else:
+        print("Quantizing model...")
+        ckpt_dir = quantize_data(
+            llm_venv,
+            llama_example_root,
+            model_dir=model_dir,
+            calib_dataset=f"{llm_datasets_root}/cnn_dailymail",
+            dtype="float16",
+            qformat="int4_awq",
+            quantize_dir=qcache_dir_without_install_package,
+            tp_size=1,
+            calib_size=32)

     print("Build engines...")
     build_cmd = [
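For the new non-AWQ parameter ('Mixtral-8x7B-Instruct-v0.1'), the test now quantizes the raw Hugging Face model itself through the quantize_data helper instead of converting a pre-quantized checkpoint. As a rough sketch of what that step amounts to when run by hand, assuming the ModelOpt-based examples/quantization/quantize.py entry point and placeholder paths (the flag names mirror the helper's keyword arguments above and are not verified against the script):

import subprocess

# Hypothetical stand-alone equivalent of the quantize_data(...) call above.
quantize_cmd = [
    "python", "examples/quantization/quantize.py",
    "--model_dir", "/models/Mixtral-8x7B-Instruct-v0.1",   # placeholder path
    "--dtype", "float16",
    "--qformat", "int4_awq",
    "--calib_size", "32",
    "--output_dir", "/tmp/mixtral-int4-awq-ckpt",          # placeholder path
]
subprocess.check_call(quantize_cmd)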
@@ -180,6 +180,7 @@ examples/test_mixtral.py::test_llm_mixtral_wo_2gpus_summary[Mixtral-8x7B-v0.1-in
 examples/test_mixtral.py::test_llm_mixtral_wo_2gpus_summary[Mixtral-8x7B-v0.1-int8-nb:4]
 examples/test_mixtral.py::test_llm_mixtral_1gpu_fp4_llmapi[Mixtral-8x7B-Instruct-v0.1]
 examples/test_mixtral.py::test_llm_mixtral_int4_awq_1gpu_summary[mixtral-8x7b-v0.1-AWQ]
+examples/test_mixtral.py::test_llm_mixtral_int4_awq_1gpu_summary[Mixtral-8x7B-Instruct-v0.1]
 examples/test_multimodal.py::test_llm_multimodal_general[Phi-3-vision-128k-instruct-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1]
 examples/test_multimodal.py::test_llm_multimodal_general[Phi-3-vision-128k-instruct-pp:1-tp:1-float16-bs:8-cpp_e2e:False-nb:1]
 examples/test_multimodal.py::test_llm_multimodal_general[Phi-3.5-vision-instruct-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1]
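The entry added to the test list registers the new Instruct variant with the suite's harness; outside that harness the same case can be selected by its node id, e.g. (a minimal sketch, assuming the test file is reachable from the working directory and the suite's fixtures and model paths are already set up):

import pytest

# Run only the newly registered parametrization of the INT4-AWQ summary test.
pytest.main([
    "examples/test_mixtral.py::"
    "test_llm_mixtral_int4_awq_1gpu_summary[Mixtral-8x7B-Instruct-v0.1]",
])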