Mirror of https://github.com/NVIDIA/TensorRT-LLM.git (synced 2026-01-14 06:27:45 +08:00)
[https://nvbugs/5744427][fix] Make Gemma3 multimodal test fp8 (#10368)
Signed-off-by: Balaram Buddharaju <169953907+brb-nv@users.noreply.github.com>
This commit is contained in:
parent edbcff0257
commit 0b75340223
@@ -1,5 +1,8 @@
 google/gemma-3-27b-it:
   - accuracy: 52.0
+  - quant_algo: FP8
+    kv_cache_quant_algo: FP8
+    accuracy: 50.0
 Qwen/Qwen2-VL-7B-Instruct:
   - accuracy: 48.44
 Qwen/Qwen2.5-VL-7B-Instruct:
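As context for the new reference entry, here is a small, hypothetical sketch of how an accuracy harness could look up the expected score for a quantized configuration from YAML shaped like the hunk above. The file name, function name, and matching rules are illustrative assumptions, not the repository's actual accuracy-harness code; only the entry layout (quant_algo, kv_cache_quant_algo, accuracy) mirrors this diff.

import yaml

# Hypothetical helper (assumed, not from this commit): pick the reference
# accuracy for a model given its quantization settings. Entries without
# quant_algo keys (e.g. "- accuracy: 52.0") match the unquantized default.
def lookup_reference(path, model, quant_algo=None, kv_cache_quant_algo=None):
    with open(path) as f:
        refs = yaml.safe_load(f)
    for entry in refs.get(model, []):
        if (entry.get("quant_algo") == quant_algo
                and entry.get("kv_cache_quant_algo") == kv_cache_quant_algo):
            return entry["accuracy"]
    raise KeyError(f"no reference for {model} with quant_algo={quant_algo}")

# Example (file name assumed): the FP8 entry added above would yield 50.0.
# lookup_reference("references/mmmu.yaml", "google/gemma-3-27b-it",
#                  quant_algo="FP8", kv_cache_quant_algo="FP8")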
@@ -220,7 +220,8 @@ class TestPhi4MMFusedVisionLora(LlmapiAccuracyTestHarness):
 @skip_post_blackwell
 class TestGemma3_27BInstruct(LlmapiAccuracyTestHarness):
     MODEL_NAME = "google/gemma-3-27b-it"
-    MODEL_PATH = f"{llm_models_root()}/gemma/gemma-3-27b-it/"
+    # Note: This has only the LLM part quantized. Vision part is in bfloat16.
+    MODEL_PATH = f"{llm_models_root()}/gemma/gemma-3-27b-it-fp8/"
     MAX_NUM_TOKENS = 12800

     sampling_params = SamplingParams(
@@ -232,9 +233,10 @@ class TestGemma3_27BInstruct(LlmapiAccuracyTestHarness):
         enable_block_reuse=False,
         enable_partial_reuse=False,
         free_gpu_memory_fraction=0.4,
+        dtype="fp8",
     )

-    def test_auto_dtype(self):
+    def test_fp8_prequantized(self):
         # Gemma3 VLM needs FlashInfer attention backend for custom mask support.
         with LLM(
                 self.MODEL_PATH,
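To make the switch concrete, below is a minimal, hypothetical sketch of the setup the renamed test_fp8_prequantized exercises: the FP8-prequantized Gemma3 checkpoint, an FP8 KV cache matching the dtype="fp8" added above, and the FlashInfer attention backend mentioned in the comment. The checkpoint path, the attn_backend keyword, and the generate call are assumptions for illustration; only the KvCacheConfig fields are taken from the hunk.

from tensorrt_llm import LLM, SamplingParams
from tensorrt_llm.llmapi import KvCacheConfig

# KV cache settings mirroring the test class above; dtype="fp8" enables the
# FP8 KV cache that pairs with kv_cache_quant_algo: FP8 in the reference YAML.
kv_cache_config = KvCacheConfig(
    enable_block_reuse=False,
    enable_partial_reuse=False,
    free_gpu_memory_fraction=0.4,
    dtype="fp8",
)

def main():
    # Gemma3 VLM needs the FlashInfer attention backend for custom mask support.
    # The local path below is a placeholder, not the CI models root.
    with LLM("/path/to/gemma-3-27b-it-fp8",
             kv_cache_config=kv_cache_config,
             attn_backend="FLASHINFER") as llm:
        outputs = llm.generate(["Describe FP8 quantization in one sentence."],
                               SamplingParams(max_tokens=64))
        print(outputs[0].outputs[0].text)

if __name__ == "__main__":
    main()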
@@ -675,7 +675,7 @@ accuracy/test_llm_api_pytorch_multimodal.py::TestNVILA_8B::test_auto_dtype
 accuracy/test_llm_api_pytorch_multimodal.py::TestVILA1_5_3B::test_auto_dtype
 accuracy/test_llm_api_pytorch_multimodal.py::TestNemotron_Nano_12B_V2_VL::test_auto_dtype
 accuracy/test_llm_api_pytorch_multimodal.py::TestPhi4MMFusedVisionLora::test_auto_dtype
-accuracy/test_llm_api_pytorch_multimodal.py::TestGemma3_27BInstruct::test_auto_dtype
+accuracy/test_llm_api_pytorch_multimodal.py::TestGemma3_27BInstruct::test_fp8_prequantized
 accuracy/test_llm_api_pytorch_multimodal.py::TestQwen3VL_MOE::test_auto_dtype

 test_e2e.py::test_llama_e2e[use_cpp_session-remove_input_padding-]
@@ -236,7 +236,7 @@ accuracy/test_llm_api_pytorch_multimodal.py::TestQwen2_5_VL_7B::test_auto_dtype
 accuracy/test_llm_api_pytorch_multimodal.py::TestLlava_V1_6_Mistral_7B::test_auto_dtype
 accuracy/test_llm_api_pytorch_multimodal.py::TestNVILA_8B::test_auto_dtype
 accuracy/test_llm_api_pytorch_multimodal.py::TestVILA1_5_3B::test_auto_dtype
-accuracy/test_llm_api_pytorch_multimodal.py::TestGemma3_27BInstruct::test_auto_dtype
+accuracy/test_llm_api_pytorch_multimodal.py::TestGemma3_27BInstruct::test_fp8_prequantized

 disaggregated/test_disaggregated.py::test_disaggregated_cache_aware_balance[TinyLlama-1.1B-Chat-v1.0]
 disaggregated/test_disaggregated.py::test_disaggregated_cache_aware_balance[TinyLlama-1.1B-Chat-v1.0]
@@ -241,7 +241,7 @@ accuracy/test_llm_api_pytorch.py::TestStarcoder2_15B::test_auto_dtype
 accuracy/test_llm_api_pytorch_multimodal.py::TestQwen2_VL_7B::test_auto_dtype
 accuracy/test_llm_api_pytorch_multimodal.py::TestQwen2_5_VL_7B::test_auto_dtype
 accuracy/test_llm_api_pytorch_multimodal.py::TestLlava_V1_6_Mistral_7B::test_auto_dtype
-accuracy/test_llm_api_pytorch_multimodal.py::TestGemma3_27BInstruct::test_auto_dtype
+accuracy/test_llm_api_pytorch_multimodal.py::TestGemma3_27BInstruct::test_fp8_prequantized
 accuracy/test_disaggregated_serving.py::TestGPTOSS::test_auto_dtype[True]
 accuracy/test_disaggregated_serving.py::TestGPTOSS::test_auto_dtype[False]

@@ -71,6 +71,7 @@ l0_h100:
 - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_chunked_prefill[quant_dtype=none-kv_cache_reuse=False-fp8kv=False-overlap_scheduler=True]
 - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_chunked_prefill[quant_dtype=fp8-kv_cache_reuse=True-fp8kv=True-overlap_scheduler=True]
 - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_dummy_load_format
+- accuracy/test_llm_api_pytorch_multimodal.py::TestGemma3_27BInstruct::test_fp8_prequantized
 - accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_fp8_block_scales[latency]
 - accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_dummy_load_format
 - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_fp8[latency-torch_compile=False]
@@ -265,7 +266,6 @@ l0_h100:
 - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[mtp=vanilla-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
 - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_no_kv_cache_reuse[quant_dtype=none-mtp_nextn=2-fp8kv=False-attention_dp=True-cuda_graph=True-overlap_scheduler=True]
 - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_guided_decoding[xgrammar-mtp_nextn=0]
-- accuracy/test_llm_api_pytorch_multimodal.py::TestGemma3_27BInstruct::test_auto_dtype
 - accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_fp8_prequantized
 - accuracy/test_llm_api_pytorch.py::TestGemma3_27BInstruct::test_fp8_prequantized
 - accuracy/test_llm_api_pytorch.py::TestGemma3_27BInstruct::test_auto_dtype TIMEOUT (90)