mirror of
https://github.com/NVIDIA/TensorRT-LLM.git
synced 2026-02-16 15:55:08 +08:00
[None][chore] AutoDeploy update SuperV3 checkpoints and accuracy thresholds (#11107)
Signed-off-by: Gal Hubara Agam <96368689+galagam@users.noreply.github.com> Signed-off-by: Gal Hubara-Agam <96368689+galagam@users.noreply.github.com>
This commit is contained in:
parent
b1268e1b37
commit
f9eed3ecc2
@ -358,3 +358,8 @@ MiniMaxAI/MiniMax-M2:
|
||||
- accuracy: 85
|
||||
- quant_algo: FP8_BLOCK_SCALES
|
||||
accuracy: 85
|
||||
nvidia/NVIDIA-Nemotron-3-Super-120B-012726:
|
||||
- accuracy: 82.363
|
||||
- quant_algo: FP8
|
||||
kv_cache_quant_algo: FP8
|
||||
accuracy: 82.121
|
||||
|
||||
@ -392,3 +392,8 @@ nvidia/Nemotron-3-Nano:
|
||||
- quant_algo: FP8
|
||||
kv_cache_quant_algo: FP8
|
||||
accuracy: 74.35
|
||||
nvidia/NVIDIA-Nemotron-3-Super-120B-012726:
|
||||
- accuracy: 86.88
|
||||
- quant_algo: FP8
|
||||
kv_cache_quant_algo: FP8
|
||||
accuracy: 86.12
|
||||
|
||||
@ -319,10 +319,10 @@ class TestNemotronSuperV3(LlmapiAccuracyTestHarness):
|
||||
Runs the model via AutoDeploy and verifies benchmark performance on MMLU and GSM8K
|
||||
"""
|
||||
|
||||
MODEL_NAME = "nvidia/Nemotron-Super-V3"
|
||||
MODEL_PATH_BF16 = f"{llm_models_root()}/Nemotron-Super-3-120B-A12B-dev"
|
||||
MODEL_PATH_FP8 = f"{llm_models_root()}/Nemotron-SuperV3-phase1-mtp-fp8-fp8kv"
|
||||
MODEL_PATH_FP4 = f"{llm_models_root()}/Nemotron-SuperV3-phase1-mtp-nvfp4-fp8kv"
|
||||
MODEL_NAME = "nvidia/NVIDIA-Nemotron-3-Super-120B-012726"
|
||||
MODEL_PATH_BF16 = f"{llm_models_root()}/NVIDIA-Nemotron-3-Super-120B-BF16-BF16KV-012726"
|
||||
MODEL_PATH_FP8 = f"{llm_models_root()}/NVIDIA-Nemotron-3-Super-120B-FP8-FP8KV-012726"
|
||||
MODEL_PATH_FP4 = f"{llm_models_root()}/NVIDIA-Nemotron-3-Super-120B-NVFP4-FP8KV-012726"
|
||||
|
||||
# Set minimum possible seq len + small buffer, for test speed & memory usage
|
||||
MAX_SEQ_LEN = max(MMLU.MAX_INPUT_LEN + MMLU.MAX_OUTPUT_LEN,
|
||||
@ -371,7 +371,6 @@ class TestNemotronSuperV3(LlmapiAccuracyTestHarness):
|
||||
task.evaluate(llm)
|
||||
print_memory_usage("After evaluation")
|
||||
|
||||
@pytest.mark.skip("Skipping FP8 test until it is supported")
|
||||
@pytest.mark.skip_less_device_memory(180000)
|
||||
@pytest.mark.parametrize("world_size", [1, 4, 8])
|
||||
def test_fp8(self, world_size):
|
||||
@ -394,7 +393,7 @@ class TestNemotronSuperV3(LlmapiAccuracyTestHarness):
|
||||
|
||||
@pytest.mark.skip("Skipping FP4 test until it is supported")
|
||||
@pytest.mark.skip_less_device_memory(180000)
|
||||
@pytest.mark.parametrize("world_size", [1, 4, 8])
|
||||
@pytest.mark.parametrize("world_size", [4, 8])
|
||||
def test_fp4(self, world_size):
|
||||
if get_device_count() < world_size:
|
||||
pytest.skip("Not enough devices for world size, skipping test")
|
||||
|
||||
Loading…
Reference in New Issue
Block a user