tests: add TestNemotronH cuda graph tests (#6390)
Signed-off-by: Xin He (SW-GPU) <200704525+xinhe-nv@users.noreply.github.com>
Co-authored-by: Larry <197874197+LarryXFly@users.noreply.github.com>
parent a5540acfce
commit d9ab3fd35e
@@ -96,6 +96,16 @@ nvidia/Nemotron-H-8B-Base-8K:
   - quant_algo: FP8
     kv_cache_quant_algo: FP8
     accuracy: 85.78
+nvidia/Nemotron-H-47B-Base-8K:
+  - accuracy: 88.82
+  - quant_algo: FP8
+    kv_cache_quant_algo: FP8
+    accuracy: 88.55
+nvidia/Nemotron-H-56B-Base-8K:
+  - accuracy: 89.27
+  - quant_algo: FP8
+    kv_cache_quant_algo: FP8
+    accuracy: 89.27
 nvidia/Llama-3.1-Nemotron-Nano-8B-v1:
   - accuracy: 37.15
   - quant_algo: FP8
@@ -185,6 +185,16 @@ nvidia/Nemotron-H-8B-Base-8K:
   - quant_algo: FP8
     kv_cache_quant_algo: FP8
     accuracy: 69.180
+nvidia/Nemotron-H-47B-Base-8K:
+  - accuracy: 83.26
+  - quant_algo: FP8
+    kv_cache_quant_algo: FP8
+    accuracy: 82.68
+nvidia/Nemotron-H-56B-Base-8K:
+  - accuracy: 83.82
+  - quant_algo: FP8
+    kv_cache_quant_algo: FP8
+    accuracy: 83.82
 microsoft/Phi-4-mini-instruct:
   - accuracy: 68.98
   # Created a dummy accuracy to track tp_size=2 for phi4-mini model.
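The two reference-file hunks above share the same layout: each model key maps to a list of entries, where a bare `accuracy` entry appears to be the auto-dtype baseline and entries carrying `quant_algo`/`kv_cache_quant_algo` give the threshold for that quantization variant. Below is a minimal sketch of reading such a file; the `load_reference_accuracy` helper and the file path are hypothetical illustrations, not part of the accuracy harness.

# Hypothetical helper: look up a reference accuracy for a model/quant combo
# from a YAML file shaped like the hunks above. Illustration only.
import yaml

def load_reference_accuracy(path, model, quant_algo=None, kv_cache_quant_algo=None):
    with open(path) as f:
        refs = yaml.safe_load(f)
    for entry in refs[model]:
        # Entries without quant fields match the default (auto-dtype) lookup.
        if (entry.get("quant_algo") == quant_algo
                and entry.get("kv_cache_quant_algo") == kv_cache_quant_algo):
            return entry["accuracy"]
    raise KeyError(f"no reference for {model} ({quant_algo}/{kv_cache_quant_algo})")

# Example lookup for the new 47B FP8 entry (the path is a placeholder).
score = load_reference_accuracy("references/accuracy.yaml",
                                "nvidia/Nemotron-H-47B-Base-8K",
                                quant_algo="FP8",
                                kv_cache_quant_algo="FP8")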
@@ -1602,24 +1602,30 @@ class TestNemotronH(LlmapiAccuracyTestHarness):
     MODEL_NAME = "nvidia/Nemotron-H-8B-Base-8K"
     MODEL_PATH = f"{llm_models_root()}/Nemotron-H-8B-Base-8K"
 
-    def test_auto_dtype(self):
+    @parametrize_with_ids("cuda_graph", [False, True])
+    def test_auto_dtype(self, cuda_graph):
         # TODO: remove max_batch_size after mamba cache manager is supported
         # ToDo: check 47b and 56b model
         # Once removed max_batch_size, the test will OOM
         kv_cache_config = KvCacheConfig(enable_block_reuse=False)
         with LLM(self.MODEL_PATH,
                  kv_cache_config=kv_cache_config,
-                 max_batch_size=128) as llm:
+                 max_batch_size=128,
+                 cuda_graph_config=CudaGraphConfig()
+                 if cuda_graph else None) as llm:
             task = MMLU(self.MODEL_NAME)
             task.evaluate(llm)
             task = GSM8K(self.MODEL_NAME)
             task.evaluate(llm)
 
     @skip_pre_ada
-    def test_reasoning_fp8_prequantized(self):
+    @parametrize_with_ids("cuda_graph", [False, True])
+    def test_reasoning_fp8_prequantized(self, cuda_graph):
         kv_cache_config = KvCacheConfig(enable_block_reuse=False)
         with LLM(f"{llm_models_root()}/Nemotron-H-8B-Reasoning-128K-FP8",
                  kv_cache_config=kv_cache_config,
-                 max_batch_size=256) as llm:
+                 max_batch_size=256,
+                 cuda_graph_config=CudaGraphConfig()
+                 if cuda_graph else None) as llm:
             assert llm.args.quant_config.quant_algo == QuantAlgo.FP8
             task = MMLU(self.MODEL_NAME)
             task.evaluate(llm)
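The functional change in TestNemotronH is that each test is now parametrized over cuda_graph and passes cuda_graph_config=CudaGraphConfig() if cuda_graph else None to the LLM constructor, so one test body covers both the eager path and the CUDA-graph path. A stripped-down sketch of that toggle outside the accuracy harness follows; it assumes the LLM-API exports this test file already uses (LLM, KvCacheConfig, CudaGraphConfig, typically imported from tensorrt_llm / tensorrt_llm.llmapi), and the model path and prompt are placeholders.

# Minimal sketch of the cuda_graph toggle used in the tests above.
# Assumes the tensorrt_llm LLM API; model path and prompt are illustrative.
from tensorrt_llm import LLM
from tensorrt_llm.llmapi import CudaGraphConfig, KvCacheConfig

def run_once(model_path: str, cuda_graph: bool) -> str:
    kv_cache_config = KvCacheConfig(enable_block_reuse=False)
    with LLM(model_path,
             kv_cache_config=kv_cache_config,
             max_batch_size=128,
             # Same pattern as the diff: build a CudaGraphConfig only when requested.
             cuda_graph_config=CudaGraphConfig() if cuda_graph else None) as llm:
        outputs = llm.generate(["The capital of France is"])
        return outputs[0].outputs[0].text

if __name__ == "__main__":
    print(run_once("/models/Nemotron-H-8B-Base-8K", cuda_graph=True))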
@@ -1627,6 +1633,83 @@ class TestNemotronH(LlmapiAccuracyTestHarness):
             task.evaluate(llm)
 
 
+@pytest.mark.skip_less_device(8)
+@pytest.mark.skip_less_device_memory(80000)
+class TestNemotronH_47B_Base(LlmapiAccuracyTestHarness):
+    MODEL_NAME = "nvidia/Nemotron-H-47B-Base-8K"
+    MODEL_PATH = f"{llm_models_root()}/Nemotron-H-47B-Base-8K"
+
+    @parametrize_with_ids("cuda_graph", [False, True])
+    @pytest.mark.parametrize("tp_size,pp_size,ep_size", [(8, 1, 1), (8, 1, 4),
+                                                          (8, 1, 8)],
+                             ids=["tp8", "tp8ep4", "tp8ep8"])
+    def test_auto_dtype(self, cuda_graph, tp_size, pp_size, ep_size):
+        kv_cache_config = KvCacheConfig(enable_block_reuse=False,
+                                        free_gpu_memory_fraction=0.6)
+        with LLM(self.MODEL_PATH,
+                 tensor_parallel_size=tp_size,
+                 pipeline_parallel_size=pp_size,
+                 moe_expert_parallel_size=ep_size,
+                 kv_cache_config=kv_cache_config,
+                 max_batch_size=256,
+                 cuda_graph_config=CudaGraphConfig()
+                 if cuda_graph else None) as llm:
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm)
+
+    @skip_pre_ada
+    @parametrize_with_ids("cuda_graph", [False, True])
+    @pytest.mark.parametrize("tp_size,pp_size,ep_size", [(8, 1, 1), (8, 1, 4),
+                                                          (8, 1, 8)],
+                             ids=["tp8", "tp8ep4", "tp8ep8"])
+    def test_reasoning_fp8_prequantized(self, cuda_graph, tp_size, pp_size,
+                                        ep_size):
+        kv_cache_config = KvCacheConfig(enable_block_reuse=False,
+                                        free_gpu_memory_fraction=0.6)
+        with LLM(f"{llm_models_root()}/Nemotron-H-47B-Reasoning-128K-FP8",
+                 kv_cache_config=kv_cache_config,
+                 tensor_parallel_size=tp_size,
+                 pipeline_parallel_size=pp_size,
+                 moe_expert_parallel_size=ep_size,
+                 max_batch_size=256,
+                 cuda_graph_config=CudaGraphConfig()
+                 if cuda_graph else None) as llm:
+            assert llm.args.quant_config.quant_algo == QuantAlgo.FP8
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm)
+
+
+@pytest.mark.skip_less_device(8)
+@pytest.mark.skip_less_device_memory(80000)
+class TestNemotronH_56B_Base(LlmapiAccuracyTestHarness):
+    MODEL_NAME = "nvidia/Nemotron-H-56B-Base-8K"
+    MODEL_PATH = f"{llm_models_root()}/Nemotron-H-56B-Base-8K"
+
+    @parametrize_with_ids("cuda_graph", [False, True])
+    @pytest.mark.parametrize("tp_size,pp_size,ep_size", [(8, 1, 1), (8, 1, 4),
+                                                          (8, 1, 8)],
+                             ids=["tp8", "tp8ep4", "tp8ep8"])
+    def test_auto_dtype(self, cuda_graph, tp_size, pp_size, ep_size):
+        kv_cache_config = KvCacheConfig(enable_block_reuse=False,
+                                        free_gpu_memory_fraction=0.6)
+        with LLM(self.MODEL_PATH,
+                 tensor_parallel_size=tp_size,
+                 pipeline_parallel_size=pp_size,
+                 moe_expert_parallel_size=ep_size,
+                 kv_cache_config=kv_cache_config,
+                 max_batch_size=256,
+                 cuda_graph_config=CudaGraphConfig()
+                 if cuda_graph else None) as llm:
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm)
+
+
 class TestQwen2_7BInstruct(LlmapiAccuracyTestHarness):
     MODEL_NAME = "Qwen/Qwen2-7B-Instruct"
     MODEL_PATH = f"{llm_models_root()}/Qwen2-7B-Instruct"
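The new 47B and 56B classes reuse the 8B test body but add tensor/pipeline/MoE-expert parallel sizes from the tp_size,pp_size,ep_size parametrization, alongside the same cuda_graph toggle. The combination of ids=["tp8", "tp8ep4", "tp8ep8"] with parametrize_with_ids("cuda_graph", ...) is what yields node IDs such as test_auto_dtype[tp8ep4-cuda_graph=True] in the test list hunk below. parametrize_with_ids is a project helper; the following is only a rough sketch of what such a helper might look like, included to explain the "name=value" ID format, not the harness's actual implementation.

# Rough sketch of a "parametrize_with_ids"-style helper: a thin wrapper over
# pytest.mark.parametrize that renders ids as "name=value". Illustration only.
import pytest

def parametrize_with_ids(argnames, argvalues):
    names = [n.strip() for n in argnames.split(",")]

    def make_id(values):
        if len(names) == 1:
            values = (values,)
        return "-".join(f"{n}={v}" for n, v in zip(names, values))

    return pytest.mark.parametrize(argnames, argvalues,
                                   ids=[make_id(v) for v in argvalues])

@parametrize_with_ids("cuda_graph", [False, True])
def test_demo(cuda_graph):
    # Collected as test_demo[cuda_graph=False] and test_demo[cuda_graph=True].
    assert cuda_graph in (False, True)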
@@ -12,8 +12,12 @@ accuracy/test_cli_flow.py::TestNemotronUltra::test_fp8_prequantized[tp8-cuda_gra
 accuracy/test_llm_api_pytorch.py::TestLlama3_3NemotronSuper49Bv1::test_fp8_prequantized_tp2
 accuracy/test_llm_api_pytorch.py::TestLlama3_1NemotronNano8Bv1::test_auto_dtype
 accuracy/test_llm_api_pytorch.py::TestLlama3_1NemotronNano8Bv1::test_fp8_prequantized
-accuracy/test_llm_api_pytorch.py::TestNemotronH::test_auto_dtype
-accuracy/test_llm_api_pytorch.py::TestNemotronH::test_reasoning_fp8_prequantized
+accuracy/test_llm_api_pytorch.py::TestNemotronH::test_auto_dtype[cuda_graph=True]
+accuracy/test_llm_api_pytorch.py::TestNemotronH::test_auto_dtype[cuda_graph=False]
+accuracy/test_llm_api_pytorch.py::TestNemotronH::test_reasoning_fp8_prequantized[cuda_graph=True]
+accuracy/test_llm_api_pytorch.py::TestNemotronH_47B_Base::test_auto_dtype[tp8ep4-cuda_graph=True]
+accuracy/test_llm_api_pytorch.py::TestNemotronH_47B_Base::test_reasoning_fp8_prequantized[tp8ep8-cuda_graph=True]
+accuracy/test_llm_api_pytorch.py::TestNemotronH_56B_Base::test_auto_dtype[tp8-cuda_graph=True]
 accuracy/test_llm_api_pytorch.py::TestNemotronUltra::test_auto_dtype[tp8ep4-cuda_graph=True]
 accuracy/test_llm_api_pytorch.py::TestNemotronUltra::test_fp8_prequantized[tp8ep4-cuda_graph=True]
 accuracy/test_llm_api_pytorch.py::TestNemotronUltra::test_fp8_prequantized[tp8-cuda_graph=True]
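Each entry in this list is a full pytest node ID (file::Class::test[params]), so the unparametrized TestNemotronH entries are replaced by their cuda_graph variants and the new 47B/56B IDs are added. Any single entry can be run on its own; a small sketch of doing so through pytest's public API, assuming it is executed from the directory that contains the accuracy/ test package:

# Run one of the newly listed node IDs via pytest's public API.
# Assumes the working directory contains the accuracy/ test package.
import sys
import pytest

node_id = ("accuracy/test_llm_api_pytorch.py::"
           "TestNemotronH::test_auto_dtype[cuda_graph=True]")
sys.exit(pytest.main(["-v", node_id]))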