[#10707][fix] AutoDeploy: Super accuracy test fixes (#10717)

Signed-off-by: Gal Hubara Agam <96368689+galagam@users.noreply.github.com>
Signed-off-by: Gal Hubara-Agam <96368689+galagam@users.noreply.github.com>
Gal Hubara-Agam 2026-01-20 18:16:13 +02:00 committed by GitHub
parent ae8f74b620
commit e61c942d1f
5 changed files with 55 additions and 4 deletions


@@ -349,10 +349,10 @@ mistral/Mistral-Large-3-675B:
   - spec_dec_algo: Eagle
     accuracy: 85.30
 nvidia/Nemotron-Super-V3:
-  - accuracy: 81.07
+  - accuracy: 80.00
   - quant_algo: FP8
     kv_cache_quant_algo: FP8
-    accuracy: 78.22
+    accuracy: 77.80
   - quant_algo: NVFP4
     kv_cache_quant_algo: FP8
     accuracy: 77.56
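These reference entries drive the pass/fail thresholds: the accuracy harness picks the entry whose quant fields match the model's quant_config and asserts the measured score against it. A minimal sketch of that lookup, assuming PyYAML and a hypothetical lookup_threshold helper (the real logic lives in accuracy_core and may differ):

import yaml  # PyYAML; assumed available in the test environment

def lookup_threshold(references_path, model_name,
                     quant_algo=None, kv_cache_quant_algo=None):
    # Hypothetical helper, not the actual accuracy_core implementation.
    with open(references_path) as f:
        refs = yaml.safe_load(f)
    for entry in refs[model_name]:
        # An entry with no quant keys matches the unquantized (bf16) run.
        if (entry.get("quant_algo") == quant_algo
                and entry.get("kv_cache_quant_algo") == kv_cache_quant_algo):
            return entry["accuracy"]
    raise KeyError(f"no reference entry for {model_name}")

# e.g. FP8 weights with an FP8 KV cache would resolve to 77.80 after this change:
# lookup_threshold(path, "nvidia/Nemotron-Super-V3", "FP8", "FP8")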


@@ -20,6 +20,7 @@ from tensorrt_llm._torch.auto_deploy import LLM as AutoDeployLLM
+from tensorrt_llm.quantization import QuantAlgo
 from tensorrt_llm.sampling_params import SamplingParams
 
 from ..conftest import get_device_count, llm_models_root
 from .accuracy_core import GSM8K, MMLU, CnnDailymail, LlmapiAccuracyTestHarness
@@ -244,6 +245,9 @@ class TestNemotronSuperV3(LlmapiAccuracyTestHarness):
     MODEL_NAME = "nvidia/Nemotron-Super-V3"
     MODEL_PATH_BF16 = f"{llm_models_root()}/Nemotron-Super-3-120B-A12B-dev"
+    MODEL_PATH_FP8 = f"{llm_models_root()}/Nemotron-SuperV3-phase1-mtp-fp8-fp8kv"
+    MODEL_PATH_FP4 = f"{llm_models_root()}/Nemotron-SuperV3-phase1-mtp-nvfp4-fp8kv"
+
     # Set minimum possible seq len + small buffer, for test speed & memory usage
     MAX_SEQ_LEN = max(MMLU.MAX_INPUT_LEN + MMLU.MAX_OUTPUT_LEN,
                       GSM8K.MAX_INPUT_LEN + GSM8K.MAX_OUTPUT_LEN)
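The MAX_SEQ_LEN cap is what the comment above refers to: the test only needs a KV cache long enough for the largest MMLU or GSM8K sample. A sketch of how it could feed the LLM kwargs, assuming a hypothetical get_default_kwargs body (the real helper is not part of this diff):

    def get_default_kwargs(self):
        # Hypothetical sketch; capping max_seq_len at MAX_SEQ_LEN keeps the
        # KV-cache allocation, and hence test memory usage and startup time,
        # as small as the two benchmarks allow.
        return {"max_seq_len": self.MAX_SEQ_LEN}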
@@ -289,3 +293,45 @@ class TestNemotronSuperV3(LlmapiAccuracyTestHarness):
         task.evaluate(llm, sampling_params=sampling_params)
         task = GSM8K(self.MODEL_NAME)
         task.evaluate(llm)
+
+    @pytest.mark.skip("Skipping FP8 test until it is supported")
+    @pytest.mark.skip_less_device_memory(180000)
+    @pytest.mark.parametrize("world_size", [4, 8])
+    def test_fp8(self, world_size):
+        if get_device_count() < world_size:
+            pytest.skip("Not enough devices for world size, skipping test")
+        kwargs = self.get_default_kwargs()
+        sampling_params = self.get_default_sampling_params()
+        with AutoDeployLLM(model=self.MODEL_PATH_FP8,
+                           tokenizer=self.MODEL_PATH_FP8,
+                           world_size=world_size,
+                           **kwargs) as llm:
+            # Manually set quant_config for FP8 model to get the accuracy threshold
+            llm.args.quant_config.quant_algo = QuantAlgo.FP8
+            llm.args.quant_config.kv_cache_quant_algo = QuantAlgo.FP8
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm, sampling_params=sampling_params)
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm)
@pytest.mark.skip("Skipping FP4 test until it is supported")
@pytest.mark.skip_less_device_memory(180000)
@pytest.mark.parametrize("world_size", [1, 4, 8])
def test_fp4(self, world_size):
if get_device_count() < world_size:
pytest.skip("Not enough devices for world size, skipping test")
kwargs = self.get_default_kwargs()
sampling_params = self.get_default_sampling_params()
with AutoDeployLLM(model=self.MODEL_PATH_FP4,
tokenizer=self.MODEL_PATH_FP4,
world_size=world_size,
**kwargs) as llm:
# Manually set quant_config for FP4 model to get the accuracy threshold
llm.args.quant_config.quant_algo = QuantAlgo.NVFP4
llm.args.quant_config.kv_cache_quant_algo = QuantAlgo.NVFP4
task = MMLU(self.MODEL_NAME)
task.evaluate(llm, sampling_params=sampling_params)
task = GSM8K(self.MODEL_NAME)
task.evaluate(llm)
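Both new tests set quant_config by hand so that the harness, which keys its threshold lookup on it, selects the FP8/NVFP4 reference entries rather than the bf16 one. Once the skip markers are lifted, a single world-size variant can be run by its node ID; a hedged invocation sketch (the file path is inferred from the test-list entries further down):

import pytest

# Runs only the world_size=4 parametrization of the FP8 test.
pytest.main([
    "tests/integration/defs/accuracy/test_llm_api_autodeploy.py"
    "::TestNemotronSuperV3::test_fp8[4]",
])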


@@ -218,4 +218,6 @@ l0_dgx_b200:
   tests:
   - unittest/_torch/auto_deploy/unit/multigpu
   - accuracy/test_llm_api_autodeploy.py::TestLlama3_1_8B::test_auto_dtype[False-4]
-  - accuracy/test_llm_api_autodeploy.py::TestNemotronMOE::test_bf16
+  - accuracy/test_llm_api_autodeploy.py::TestNemotronSuperV3::test_bf16
+  - accuracy/test_llm_api_autodeploy.py::TestNemotronSuperV3::test_fp8[4]
+  - accuracy/test_llm_api_autodeploy.py::TestNemotronSuperV3::test_fp8[8]


@@ -322,4 +322,6 @@ l0_dgx_h100:
   tests:
   - unittest/_torch/auto_deploy/unit/multigpu
   - accuracy/test_llm_api_autodeploy.py::TestLlama3_1_8B::test_auto_dtype[False-4]
-  - accuracy/test_llm_api_autodeploy.py::TestNemotronMOE::test_bf16
+  - accuracy/test_llm_api_autodeploy.py::TestNemotronSuperV3::test_bf16
+  - accuracy/test_llm_api_autodeploy.py::TestNemotronSuperV3::test_fp8[4]
+  - accuracy/test_llm_api_autodeploy.py::TestNemotronSuperV3::test_fp8[8]
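The [4] and [8] suffixes in these entries are not indices; they are the pytest IDs generated from the world_size parametrization, as a standalone illustration shows:

import pytest

@pytest.mark.parametrize("world_size", [4, 8])
def test_fp8(world_size):
    # Collected as test_fp8[4] and test_fp8[8], matching the list entries above.
    assert world_size in (4, 8)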


@@ -192,6 +192,7 @@ triton_server/test_triton.py::test_llava[llava] SKIP (https://nvbugs/5547414)
 full:RTX/accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_auto_dtype SKIP (https://nvbugs/5569696)
 accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-cutlass-auto] SKIP (https://nvbugs/5596343)
 accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-cutlass-auto] SKIP (https://nvbugs/5596343)
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-trtllm-auto] SKIP (https://nvbugs/5596343)
 examples/test_phi.py::test_llm_phi_lora_1gpu[Phi-3-mini-4k-instruct-ru-lora-Phi-3-mini-4k-instruct-lora_fp16-base_fp16] SKIP (https://nvbugs/5612313)
 triton_server/test_triton.py::test_cpp_unit_tests[cpp-unit-tests] SKIP (https://nvbugs/5619359)
 triton_server/test_triton_rcca.py::test_rcca_bug_4934893[Temperature:0.5-TOP_P:0.95-TOP_K:10-False-1---False-True-False-0-2048-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap--max_utilization---1-1-1-False-ensemble] SKIP (https://nvbugs/5619369)