Merge branch 'main' into fix_spec_gate

This commit is contained in:
Zheyu Fu 2025-12-21 23:52:35 -08:00 committed by GitHub
commit 6ea3cb59fc
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 22 additions and 24 deletions

View File

@ -1144,24 +1144,31 @@ class NVFP4LinearMethod(LinearMethodBase):
assert (
weight_col_size * 2
) % module.scaling_vector_size == 0, f"weight column size after padding {weight_col_size} must be divisible by scaling_vector_size {module.scaling_vector_size}"
# Pad weight_scale to match padded weight dimensions
# Padding should be performed on unswizzled weight_scale tensor
scale_rows = fp4_utils.pad_up(module.out_features, 128)
scale_cols = fp4_utils.pad_up(
module.in_features // module.scaling_vector_size, 4)
weight_scale_unswizzle = unswizzle_sf(module.weight_scale.data,
scale_rows, scale_cols,
module.scaling_vector_size)
weight_scale_unswizzle_pad = F.pad(
weight_scale_unswizzle,
(0, (col_pad_size * 2) // module.scaling_vector_size, 0,
row_pad_size),
mode='constant',
value=0)
module.weight_scale = Parameter(
torch.ops.trtllm.block_scale_interleave(
weight_scale_unswizzle_pad),
requires_grad=False)
scale_pad_row = fp4_utils.pad_up(module.out_features + row_pad_size,
128) - scale_rows
# here one col_size of weight equals two linear in_features
scale_pad_col = fp4_utils.pad_up(
(module.in_features + (col_pad_size * 2)) //
module.scaling_vector_size, 4) - scale_cols
# Pad weight_scale to match padded weight dimensions
# Padding should be performed on unswizzled weight_scale tensor
if scale_pad_row != 0 or scale_pad_col != 0:
weight_scale_unswizzle = unswizzle_sf(
module.weight_scale.data, scale_rows,
scale_cols * module.scaling_vector_size,
module.scaling_vector_size)
weight_scale_unswizzle_pad = F.pad(
weight_scale_unswizzle,
(0, scale_pad_col, 0, scale_pad_row),
mode='constant',
value=0)
module.weight_scale = Parameter(
torch.ops.trtllm.block_scale_interleave(
weight_scale_unswizzle_pad),
requires_grad=False)
class W4A8NVFP4FP8LinearMethod(LinearMethodBase):

View File

@ -226,7 +226,6 @@ triton_server/test_triton.py::test_gpt_ib_streaming[gpt-ib-streaming] SKIP (http
triton_server/test_triton.py::test_gpt_ib_ptuning[gpt-ib-ptuning] SKIP (https://nvbugs/5445624)
triton_server/test_triton.py::test_mistral_ib_mm[mistral-ib-mm] SKIP (https://nvbugs/5371343)
triton_server/test_triton.py::test_t5_ib[t5-ib] SKIP (https://nvbugs/5456482)
accuracy/test_llm_api.py::TestLlama3_1_8BInstruct::test_gather_generation_logits_cuda_graph SKIP (https://nvbugs/5365525)
examples/test_phi.py::test_phi_fp8_with_bf16_lora[Phi-3-mini-128k-instruct] SKIP (https://nvbugs/5465143)
examples/test_phi.py::test_phi_fp8_with_bf16_lora[Phi-3-small-128k-instruct] SKIP (https://nvbugs/5465143)
examples/test_phi.py::test_phi_fp8_with_bf16_lora[Phi-3.5-mini-instruct] SKIP (https://nvbugs/5465143)
@ -253,7 +252,6 @@ accuracy/test_cli_flow.py::TestPhi4MiniInstruct::test_auto_dtype SKIP (https://n
accuracy/test_cli_flow.py::TestPhi4MiniInstruct::test_tp2 SKIP (https://nvbugs/5465143, 5481206 WNF)
accuracy/test_cli_flow.py::TestLongAlpaca7B::test_auto_dtype SKIP (https://nvbugs/5481075)
accuracy/test_llm_api.py::TestPhi4MiniInstruct::test_fp8 SKIP (https://nvbugs/5465143, 5481206 WNF)
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ngram SKIP (https://nvbugs/5488118)
accuracy/test_disaggregated_serving.py::TestDeepSeekV32Exp::test_auto_dtype[False] SKIP (https://nvbugs/5738168)
test_e2e.py::test_trtllm_bench_iteration_log[TRT-streaming-meta-llama/Llama-3.1-8B-llama-3.1-model/Meta-Llama-3.1-8B] SKIP (https://nvbugs/5448523)
accuracy/test_llm_api_pytorch.py::TestLlama3_2_3B::test_auto_dtype SKIP (https://nvbugs/5520319)
@ -313,7 +311,6 @@ accuracy/test_cli_flow.py::TestMinitron4BBase::test_fp8 SKIP (https://nvbugs/560
examples/test_gpt.py::test_llm_minitron_fp8_with_pseudo_loras[4b] SKIP (https://nvbugs/5606233)
test_e2e.py::test_trtllm_bench_pytorch_backend_sanity[meta-llama/Llama-3.1-8B-llama-3.1-8b-hf-nvfp4-False-False] SKIP (https://nvbugs/5629791)
accuracy/test_disaggregated_serving.py::TestLlama4ScoutInstruct::test_auto_dtype[False] SKIP (https://nvbugs/5629792)
accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_fp8[throughput_latency] SKIP (https://nvbugs/5631036)
accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm_attention_dp] SKIP (https://nvbugs/5637220)
llmapi/test_llm_examples.py::test_llmapi_example_multilora SKIP (https://nvbugs/5636857)
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_bfloat16_4gpus[tp4-attn_backend=TRTLLM-torch_compile=False] SKIP (https://nvbugs/5616182)
@ -350,13 +347,10 @@ accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[tep4_latency_moe
examples/test_qwen.py::test_llm_qwen_int4_single_gpu_summary[qwen2.5_14b_instruct_int4-nb:4] SKIP (https://nvbugs/5666826)
examples/test_llama.py::test_llm_llama_1gpu_fp4[llama-3.1-70b-instruct-enable_norm_quant_fusion-enable_fused_quant-fp4_plugin-bfloat16] SKIP (https://nvbugs/5451216)
accuracy/test_llm_api_pytorch_multimodal.py::TestNemotron_Nano_12B_V2_VL::test_auto_dtype SKIP (https://nvbugs/5588376)
accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_eagle3_tp8[eagle3_one_model=True] SKIP (https://nvbugs/5673743)
accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_eagle3_tp8[eagle3_one_model=False] SKIP (https://nvbugs/5673743)
accuracy/test_llm_api_pytorch.py::TestNemotronNas::test_auto_dtype_tp8 SKIP (https://nvbugs/5673527)
accuracy/test_llm_api_autodeploy.py::TestLlama3_1_8B::test_auto_dtype[False-4] SKIP (https://nvbugs/5680312, https://nvbugs/5636912)
accuracy/test_llm_api_autodeploy.py::TestLlama3_1_8B::test_auto_dtype[False-2] SKIP (https://nvbugs/5680312, https://nvbugs/5636912)
unittest/_torch/auto_deploy/unit/multigpu/test_ad_build_small_multi.py::test_build_ad[meta-llama/Meta-Llama-3.1-8B-Instruct-llm_extra_args0-2] SKIP (https://nvbugs/5680755)
examples/test_ray.py::test_ray_disaggregated_serving[tp2] SKIP (https://nvbugs/5683039)
full:H100_PCIe/unittest/llmapi/test_llm_pytorch.py::test_llama_7b_multi_lora_evict_and_reload_lora_gpu_cache SKIP (https://nvbugs/5682551)
unittest/_torch/speculative/test_eagle3.py::test_qwen3_eagle3[True-True-True-True] SKIP (https://nvbugspro.nvidia.com/bug/5749988)
accuracy/test_llm_api_pytorch.py::TestSeedOss_36B::test_auto_dtype SKIP (https://nvbugs/5612438)
@ -366,14 +360,12 @@ accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_4gpus[tp4-fp
triton_server/test_triton_llm.py::test_llmapi_backend[4-0-disableDecoupleMode-tensorrt_llm] SKIP (https://nvbugs/5701480)
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[ep4-mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/5701425)
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp4-mtp_nextn=2-attention_dp=False-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5701425)
accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_tp8] SKIP (https://nvbugs/5698897)
unittest/_torch/modules/tests_lora_modules/test_lora_attention_pytorch_flow_vs_trt.py::TestLoraAttentionPytorchFlowVsTRT::test_lora_attention SKIP (https://nvbugs/5701421)
accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[no_cuda_graph_overlap-cutlass] SKIP (https://nvbugs/5702795)
accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[tp1-cutlass] SKIP (https://nvbugs/5702795)
accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[tp4ep1-cutlass] SKIP (https://nvbugs/5702795)
accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[tp4ep4-cutlass] SKIP (https://nvbugs/5702795)
accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[tp4ep4-trtllm] SKIP (https://nvbugs/5702795)
accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency_trtllmgen] SKIP (https://nvbugs/5698897)
test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-mixture_text_image-True] SKIP (https://nvbugs/5648560)
test_e2e.py::test_ptp_quickstart_multimodal_2gpu[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503] SKIP (https://nvbugs/5648560)
test_e2e.py::test_openai_chat_harmony SKIP (https://nvbugs/5633700)
@ -462,7 +454,6 @@ accuracy/test_cli_flow.py::TestPhi3_5MiniInstruct::test_auto_dtype SKIP (https:/
unittest/_torch/auto_deploy/unit/singlegpu/models/test_llama4_vlm_patch.py::test_build_run_llama4_vlm SKIP (https://nvbugs/5747878)
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus_online_eplb[fp8kv=True-moe_backend=TRTLLM] SKIP (https://nvbugs/5740377)
disaggregated/test_auto_scaling.py::test_minimal_instances[etcd-round_robin] SKIP (https://nvbugs/5748564)
disaggregated/test_auto_scaling.py::test_minimal_instances[http-round_robin] SKIP (https://nvbugs/5755963)
disaggregated/test_auto_scaling.py::test_service_discovery[etcd-load_balancing] SKIP (https://nvbugs/5757415)
disaggregated/test_auto_scaling.py::test_service_discovery[http-kv_cache_aware] SKIP (https://nvbugs/5758225)
unittest/llmapi/apps/test_disagg_serving_perf_metrics.py SKIP (https://nvbugs/5752516)