[https://nvbugs/5608489][fix] Fix output unpack issues for Llama3/4 NVFP4 models. (#8679)

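The NVFP4 unpack branch previously fired whenever the next attention layer existed and the model was NVFP4-quantized, which can disagree with the allreduce fusion op that was actually dispatched and leave the unpack pattern mismatched with the returned tuple. Key the branch on the recorded fusion op instead, in both Llama4DecoderLayer and LlamaDecoderLayer, and exercise the path by switching the Llama-3.3-70B TP2PP2 accuracy test from the FP8 to the FP4 checkpoint.

A minimal sketch of the corrected pattern (identifiers taken from the diff below; `fusion_op` stands in for the layer's post_feed_forward_fusion_op / post_mlp_fusion_op attribute, and the shape of the else branch is an assumption, as it is not shown in the hunks):

    # When the fused kernel quantized to NVFP4, the allreduce returns a
    # (packed fp4 activations, scale factors, residual) triple; otherwise
    # it is assumed to return the plain (hidden_states, residual) pair.
    if fusion_op == AllReduceFusionOp.RESIDUAL_RMS_NORM_QUANT_NVFP4:
        act_fp4, act_sf, residual = allreduce_output
        hidden_states = Fp4QuantizedTensor(act_fp4, act_sf)
    else:
        hidden_states, residual = allreduce_output
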
Signed-off-by: Yukun He <23156053+hyukn@users.noreply.github.com>
Yukun He authored 2025-10-28 14:21:47 +08:00; committed by GitHub
parent 28c9a51c06
commit e04354bc09
5 changed files with 8 additions and 8 deletions


@@ -598,7 +598,7 @@ class Llama4DecoderLayer(DecoderLayer):
             ))
         # Unpack the allreduce output
-        if self.next_attn is not None and self.is_nvfp4:
+        if self.post_feed_forward_fusion_op == AllReduceFusionOp.RESIDUAL_RMS_NORM_QUANT_NVFP4:
             act_fp4, act_sf, residual = allreduce_output
             hidden_states = Fp4QuantizedTensor(act_fp4, act_sf)
         else:
@@ -789,7 +789,7 @@ class LlamaDecoderLayer(DecoderLayer):
                 scale=scale,
                 eps=self.next_layer_layernorm.variance_epsilon,
             ))
-        if self.next_attn is not None and self.is_nvfp4:
+        if self.post_mlp_fusion_op == AllReduceFusionOp.RESIDUAL_RMS_NORM_QUANT_NVFP4:
             act_fp4, act_sf, residual = all_reduce_output
             hidden_states = Fp4QuantizedTensor(act_fp4, act_sf)
         else:


@@ -645,15 +645,15 @@ class TestLlama3_3_70BInstruct(LlmapiAccuracyTestHarness):
     @pytest.mark.skip_less_device(4)
     @skip_pre_blackwell
-    def test_fp8_tp2pp2(self):
-        model_path = f"{llm_models_root()}/llama-3.3-models/Llama-3.3-70B-Instruct-FP8"
+    def test_fp4_tp2pp2(self):
+        model_path = f"{llm_models_root()}/llama-3.3-models/Llama-3.3-70B-Instruct-FP4"
         kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.5)
         with LLM(model_path,
                  tensor_parallel_size=2,
                  pipeline_parallel_size=2,
                  max_batch_size=32,
                  kv_cache_config=kv_cache_config) as llm:
-            assert llm.args.quant_config.quant_algo == QuantAlgo.FP8
+            assert llm.args.quant_config.quant_algo == QuantAlgo.NVFP4
             sampling_params = SamplingParams(
                 max_tokens=256,
                 temperature=0.0,
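
The renamed test can be selected directly by its node ID (a standard pytest invocation; per the skip markers it is assumed to need at least 4 GPUs on a Blackwell-class machine):

    pytest "accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp4_tp2pp2"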


@@ -418,7 +418,7 @@ accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search[
 accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search[enable_cuda_graph=True-enable_padding=True-disable_overlap_scheduler=False]
 accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search[enable_cuda_graph=True-enable_padding=True-disable_overlap_scheduler=True]
 accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_tp4
-accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_tp2pp2
+accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp4_tp2pp2
 accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_nvfp4_tp4
 accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_eagle3_tp8[eagle3_one_model=True]
 accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_eagle3_tp8[eagle3_one_model=False]


@@ -117,7 +117,7 @@ accuracy/test_llm_api_pytorch.py::TestLlama3_2_1B::test_auto_dtype
 accuracy/test_llm_api_pytorch.py::TestLlama3_2_1B::test_fp8_prequantized
 accuracy/test_llm_api_pytorch.py::TestLlama3_2_3B::test_auto_dtype
 accuracy/test_llm_api_pytorch.py::TestLlama3_2_3B::test_fp8_prequantized
-accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_tp2pp2
+accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp4_tp2pp2
 accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_tp4
 accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_nvfp4_tp4
 accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_eagle3_tp8[eagle3_one_model=True]


@@ -51,7 +51,7 @@ l0_dgx_b200:
   - disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[llama-3.1-8b-instruct-hf-fp8]
   - disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_nixl[DeepSeek-V3-Lite-fp8]
   - accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_tp4
-  - accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_tp2pp2
+  - accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp4_tp2pp2
   - accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_nvfp4_tp4
   - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-cutlass-auto]
 - condition: