mirror of https://github.com/NVIDIA/TensorRT-LLM.git
Signed-off-by: Gal Hubara Agam <96368689+galagam@users.noreply.github.com>
Signed-off-by: Gal Hubara-Agam <96368689+galagam@users.noreply.github.com>

parent ae8f74b620
commit e61c942d1f
@@ -349,10 +349,10 @@ mistral/Mistral-Large-3-675B:
   - spec_dec_algo: Eagle
     accuracy: 85.30
 nvidia/Nemotron-Super-V3:
-  - accuracy: 81.07
+  - accuracy: 80.00
   - quant_algo: FP8
     kv_cache_quant_algo: FP8
-    accuracy: 78.22
+    accuracy: 77.80
   - quant_algo: NVFP4
     kv_cache_quant_algo: FP8
     accuracy: 77.56
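For reference, the accuracy harness resolves a model's expected score from entries like these by matching on quant_algo and kv_cache_quant_algo (entries without those keys are the unquantized baseline). A minimal sketch of that lookup, assuming the YAML structure above; select_threshold and the file path are hypothetical names, not the harness's actual API:

# Minimal sketch (not the harness's real API) of looking up an accuracy
# threshold from a reference file shaped like the YAML above.
import yaml

def select_threshold(ref_path, model_name, quant_algo=None, kv_cache_quant_algo=None):
    with open(ref_path) as f:
        refs = yaml.safe_load(f)
    for entry in refs.get(model_name, []):
        # Entries without quant keys match the unquantized (None) case.
        if (entry.get("quant_algo") == quant_algo
                and entry.get("kv_cache_quant_algo") == kv_cache_quant_algo):
            return entry["accuracy"]
    raise KeyError(f"no reference entry for {model_name} / {quant_algo}")

# e.g. the FP8 + FP8-KV entry above would yield 77.80:
# select_threshold("references/mmlu.yaml", "nvidia/Nemotron-Super-V3",
#                  quant_algo="FP8", kv_cache_quant_algo="FP8")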
@@ -20,6 +20,7 @@ from tensorrt_llm._torch.auto_deploy import LLM as AutoDeployLLM
+from tensorrt_llm.quantization import QuantAlgo
 from tensorrt_llm.sampling_params import SamplingParams

 from ..conftest import get_device_count, llm_models_root
 from .accuracy_core import GSM8K, MMLU, CnnDailymail, LlmapiAccuracyTestHarness

@@ -244,6 +245,9 @@ class TestNemotronSuperV3(LlmapiAccuracyTestHarness):

     MODEL_NAME = "nvidia/Nemotron-Super-V3"
+    MODEL_PATH_BF16 = f"{llm_models_root()}/Nemotron-Super-3-120B-A12B-dev"
+    MODEL_PATH_FP8 = f"{llm_models_root()}/Nemotron-SuperV3-phase1-mtp-fp8-fp8kv"
+    MODEL_PATH_FP4 = f"{llm_models_root()}/Nemotron-SuperV3-phase1-mtp-nvfp4-fp8kv"

     # Set minimum possible seq len + small buffer, for test speed & memory usage
     MAX_SEQ_LEN = max(MMLU.MAX_INPUT_LEN + MMLU.MAX_OUTPUT_LEN,
                       GSM8K.MAX_INPUT_LEN + GSM8K.MAX_OUTPUT_LEN)
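The tests below call self.get_default_kwargs(), whose definition lies outside this diff. A plausible sketch of how MAX_SEQ_LEN could feed into it, assuming the kwargs are forwarded to AutoDeployLLM; the class name, placeholder value, and keyword names here are assumptions, not the harness's real code:

# Hypothetical sketch only; the real get_default_kwargs() is not shown
# in this diff, and the keyword names below are assumptions.
class TestNemotronSuperV3Sketch:
    MAX_SEQ_LEN = 4096  # placeholder for the max() expression above

    @classmethod
    def get_default_kwargs(cls):
        return {
            # Cap the engine's sequence length at the minimum the MMLU and
            # GSM8K benchmarks require, for test speed and memory usage.
            "max_seq_len": cls.MAX_SEQ_LEN,
        }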
@@ -289,3 +293,45 @@ class TestNemotronSuperV3(LlmapiAccuracyTestHarness):
             task.evaluate(llm, sampling_params=sampling_params)
             task = GSM8K(self.MODEL_NAME)
             task.evaluate(llm)
+
+    @pytest.mark.skip("Skipping FP8 test until it is supported")
+    @pytest.mark.skip_less_device_memory(180000)
+    @pytest.mark.parametrize("world_size", [4, 8])
+    def test_fp8(self, world_size):
+        if get_device_count() < world_size:
+            pytest.skip("Not enough devices for world size, skipping test")
+        kwargs = self.get_default_kwargs()
+        sampling_params = self.get_default_sampling_params()
+        with AutoDeployLLM(model=self.MODEL_PATH_FP8,
+                           tokenizer=self.MODEL_PATH_FP8,
+                           world_size=world_size,
+                           **kwargs) as llm:
+            # Manually set quant_config for the FP8 model to select the accuracy threshold
+            llm.args.quant_config.quant_algo = QuantAlgo.FP8
+            llm.args.quant_config.kv_cache_quant_algo = QuantAlgo.FP8
+
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm, sampling_params=sampling_params)
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm)
+
+    @pytest.mark.skip("Skipping FP4 test until it is supported")
+    @pytest.mark.skip_less_device_memory(180000)
+    @pytest.mark.parametrize("world_size", [1, 4, 8])
+    def test_fp4(self, world_size):
+        if get_device_count() < world_size:
+            pytest.skip("Not enough devices for world size, skipping test")
+        kwargs = self.get_default_kwargs()
+        sampling_params = self.get_default_sampling_params()
+        with AutoDeployLLM(model=self.MODEL_PATH_FP4,
+                           tokenizer=self.MODEL_PATH_FP4,
+                           world_size=world_size,
+                           **kwargs) as llm:
+            # Manually set quant_config for the NVFP4 model; the KV cache is FP8 per the reference entry
+            llm.args.quant_config.quant_algo = QuantAlgo.NVFP4
+            llm.args.quant_config.kv_cache_quant_algo = QuantAlgo.FP8
+
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm, sampling_params=sampling_params)
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm)
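For context, the evaluation pattern above reduces to the LLM API's context-manager usage. A minimal standalone sketch; the model path and prompt are placeholders, not real test assets:

# Minimal standalone sketch of the AutoDeployLLM pattern used above.
# The model path and prompt are hypothetical placeholders.
from tensorrt_llm._torch.auto_deploy import LLM as AutoDeployLLM
from tensorrt_llm.sampling_params import SamplingParams

def main():
    with AutoDeployLLM(model="/models/example-checkpoint",  # hypothetical path
                       tokenizer="/models/example-checkpoint",
                       world_size=1) as llm:
        params = SamplingParams(max_tokens=32, temperature=0.0)
        # generate() returns one RequestOutput per prompt.
        for out in llm.generate(["The capital of France is"], params):
            print(out.outputs[0].text)

if __name__ == "__main__":
    main()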
@@ -218,4 +218,6 @@ l0_dgx_b200:
   tests:
   - unittest/_torch/auto_deploy/unit/multigpu
   - accuracy/test_llm_api_autodeploy.py::TestLlama3_1_8B::test_auto_dtype[False-4]
-  - accuracy/test_llm_api_autodeploy.py::TestNemotronMOE::test_bf16
+  - accuracy/test_llm_api_autodeploy.py::TestNemotronSuperV3::test_bf16
+  - accuracy/test_llm_api_autodeploy.py::TestNemotronSuperV3::test_fp8[4]
+  - accuracy/test_llm_api_autodeploy.py::TestNemotronSuperV3::test_fp8[8]
@@ -322,4 +322,6 @@ l0_dgx_h100:
   tests:
   - unittest/_torch/auto_deploy/unit/multigpu
   - accuracy/test_llm_api_autodeploy.py::TestLlama3_1_8B::test_auto_dtype[False-4]
-  - accuracy/test_llm_api_autodeploy.py::TestNemotronMOE::test_bf16
+  - accuracy/test_llm_api_autodeploy.py::TestNemotronSuperV3::test_bf16
+  - accuracy/test_llm_api_autodeploy.py::TestNemotronSuperV3::test_fp8[4]
+  - accuracy/test_llm_api_autodeploy.py::TestNemotronSuperV3::test_fp8[8]
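Each entry under tests: is a standard pytest node ID, so an individual case can be launched directly. A quick illustration via pytest's Python entry point; the working directory is assumed to be the one these relative paths resolve from:

# Run one of the node IDs listed above; pytest.main() returns an exit code.
import pytest

ret = pytest.main([
    "accuracy/test_llm_api_autodeploy.py::TestNemotronSuperV3::test_fp8[4]",
    "-q",
])
print("pytest exit code:", ret)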
@@ -192,6 +192,7 @@ triton_server/test_triton.py::test_llava[llava] SKIP (https://nvbugs/5547414)
 full:RTX/accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_auto_dtype SKIP (https://nvbugs/5569696)
 accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-cutlass-auto] SKIP (https://nvbugs/5596343)
 accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-cutlass-auto] SKIP (https://nvbugs/5596343)
 accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-trtllm-auto] SKIP (https://nvbugs/5596343)
 examples/test_phi.py::test_llm_phi_lora_1gpu[Phi-3-mini-4k-instruct-ru-lora-Phi-3-mini-4k-instruct-lora_fp16-base_fp16] SKIP (https://nvbugs/5612313)
 triton_server/test_triton.py::test_cpp_unit_tests[cpp-unit-tests] SKIP (https://nvbugs/5619359)
+triton_server/test_triton_rcca.py::test_rcca_bug_4934893[Temperature:0.5-TOP_P:0.95-TOP_K:10-False-1---False-True-False-0-2048-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap--max_utilization---1-1-1-False-ensemble] SKIP (https://nvbugs/5619369)