Mirror of https://github.com/NVIDIA/TensorRT-LLM.git (synced 2026-01-14 06:27:45 +08:00)
[https://nvbugs/5608489][fix] Fix output unpack issues for Llama3/4 NVFP4 models. (#8679)
Signed-off-by: Yukun He <23156053+hyukn@users.noreply.github.com>
Parent: 28c9a51c06
Commit: e04354bc09
@@ -598,7 +598,7 @@ class Llama4DecoderLayer(DecoderLayer):
                 ))
 
         # Unpack the allreduce output
-        if self.next_attn is not None and self.is_nvfp4:
+        if self.post_feed_forward_fusion_op == AllReduceFusionOp.RESIDUAL_RMS_NORM_QUANT_NVFP4:
             act_fp4, act_sf, residual = allreduce_output
             hidden_states = Fp4QuantizedTensor(act_fp4, act_sf)
         else:
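The hunk above changes what the unpack branches on: the allreduce output layout is decided by the fusion op that actually ran, not by whether the layer is NVFP4-capable. A minimal standalone sketch of the corrected pattern, using stand-in types for TensorRT-LLM's AllReduceFusionOp and Fp4QuantizedTensor (the real definitions live in the repository and are not shown in this diff):

    from dataclasses import dataclass
    from enum import Enum, auto

    class AllReduceFusionOp(Enum):          # stand-in for the real enum
        RESIDUAL_RMS_NORM = auto()
        RESIDUAL_RMS_NORM_QUANT_NVFP4 = auto()

    @dataclass
    class Fp4QuantizedTensor:               # stand-in: packed FP4 data plus scale factors
        fp4_tensor: object
        scaling_factor: object

    def unpack_allreduce_output(allreduce_output, fusion_op):
        # Branch on the op that produced the output, never on layer-level flags.
        if fusion_op == AllReduceFusionOp.RESIDUAL_RMS_NORM_QUANT_NVFP4:
            act_fp4, act_sf, residual = allreduce_output    # three outputs
            hidden_states = Fp4QuantizedTensor(act_fp4, act_sf)
        else:
            hidden_states, residual = allreduce_output      # two outputs
        return hidden_states, residual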
@@ -789,7 +789,7 @@ class LlamaDecoderLayer(DecoderLayer):
                     scale=scale,
                     eps=self.next_layer_layernorm.variance_epsilon,
                 ))
-        if self.next_attn is not None and self.is_nvfp4:
+        if self.post_mlp_fusion_op == AllReduceFusionOp.RESIDUAL_RMS_NORM_QUANT_NVFP4:
             act_fp4, act_sf, residual = all_reduce_output
             hidden_states = Fp4QuantizedTensor(act_fp4, act_sf)
         else:
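Illustrative failure mode (plain Python, nothing repository-specific): if the layer flags say NVFP4 but the fusion op that ran returned the two-element layout, the old three-name unpack raises at runtime.

    all_reduce_output = ("hidden_states", "residual")   # non-NVFP4 fusion: two outputs
    try:
        act_fp4, act_sf, residual = all_reduce_output
    except ValueError as e:
        print(e)    # not enough values to unpack (expected 3, got 2)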
@@ -645,15 +645,15 @@ class TestLlama3_3_70BInstruct(LlmapiAccuracyTestHarness):
 
     @pytest.mark.skip_less_device(4)
     @skip_pre_blackwell
-    def test_fp8_tp2pp2(self):
-        model_path = f"{llm_models_root()}/llama-3.3-models/Llama-3.3-70B-Instruct-FP8"
+    def test_fp4_tp2pp2(self):
+        model_path = f"{llm_models_root()}/llama-3.3-models/Llama-3.3-70B-Instruct-FP4"
         kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.5)
         with LLM(model_path,
                  tensor_parallel_size=2,
                  pipeline_parallel_size=2,
                  max_batch_size=32,
                  kv_cache_config=kv_cache_config) as llm:
-            assert llm.args.quant_config.quant_algo == QuantAlgo.FP8
+            assert llm.args.quant_config.quant_algo == QuantAlgo.NVFP4
             sampling_params = SamplingParams(
                 max_tokens=256,
                 temperature=0.0,
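For context, the renamed test's core check as a self-contained sketch: the imports are assumptions based on TensorRT-LLM's public LLM API (the diff fragment above does not show them), and the checkpoint path is shortened.

    from tensorrt_llm import LLM, SamplingParams
    from tensorrt_llm.llmapi import KvCacheConfig
    from tensorrt_llm.quantization import QuantAlgo

    kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.5)
    with LLM("Llama-3.3-70B-Instruct-FP4",   # full path built from llm_models_root() above
             tensor_parallel_size=2,
             pipeline_parallel_size=2,
             max_batch_size=32,
             kv_cache_config=kv_cache_config) as llm:
        # A prequantized FP4 checkpoint must report NVFP4, not FP8.
        assert llm.args.quant_config.quant_algo == QuantAlgo.NVFP4
        sampling_params = SamplingParams(max_tokens=256, temperature=0.0)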
@@ -418,7 +418,7 @@ accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search[
 accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search[enable_cuda_graph=True-enable_padding=True-disable_overlap_scheduler=False]
 accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search[enable_cuda_graph=True-enable_padding=True-disable_overlap_scheduler=True]
 accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_tp4
-accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_tp2pp2
+accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp4_tp2pp2
 accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_nvfp4_tp4
 accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_eagle3_tp8[eagle3_one_model=True]
 accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_eagle3_tp8[eagle3_one_model=False]
@@ -117,7 +117,7 @@ accuracy/test_llm_api_pytorch.py::TestLlama3_2_1B::test_auto_dtype
 accuracy/test_llm_api_pytorch.py::TestLlama3_2_1B::test_fp8_prequantized
 accuracy/test_llm_api_pytorch.py::TestLlama3_2_3B::test_auto_dtype
 accuracy/test_llm_api_pytorch.py::TestLlama3_2_3B::test_fp8_prequantized
-accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_tp2pp2
+accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp4_tp2pp2
 accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_tp4
 accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_nvfp4_tp4
 accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_eagle3_tp8[eagle3_one_model=True]
@@ -51,7 +51,7 @@ l0_dgx_b200:
   - disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[llama-3.1-8b-instruct-hf-fp8]
   - disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_nixl[DeepSeek-V3-Lite-fp8]
   - accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_tp4
-  - accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_tp2pp2
+  - accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp4_tp2pp2
   - accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_nvfp4_tp4
   - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-cutlass-auto]
 - condition: