tests: Add llama4 functional cases (#6392)
Signed-off-by: Ivy Zhang <25222398+crazydemo@users.noreply.github.com>
This commit is contained in: parent eb157accac, commit 94de3c11b0
@@ -14,8 +14,17 @@ meta-llama/Llama-3.3-70B-Instruct:
     accuracy: 83.30
 meta-llama/Llama-4-Maverick-17B-128E-Instruct:
   - accuracy: 92.20
+  - quant_algo: FP8
+    kv_cache_quant_algo: FP8
+    accuracy: 83.30
 meta-llama/Llama-4-Scout-17B-16E-Instruct:
   - accuracy: 89.70
+  - quant_algo: FP8
+    kv_cache_quant_algo: FP8
+    accuracy: 89.61
+  - quant_algo: NVFP4
+    kv_cache_quant_algo: FP8
+    accuracy: 89.00
 deepseek-ai/DeepSeek-V3-Lite:
   - accuracy: 64.74
   - quant_algo: NVFP4
@@ -66,8 +66,17 @@ meta-llama/Llama-3.3-70B-Instruct:
     accuracy: 81.02
 meta-llama/Llama-4-Maverick-17B-128E-Instruct:
   - accuracy: 86.40
+  - quant_algo: FP8
+    kv_cache_quant_algo: FP8
+    accuracy: 86.45
 meta-llama/Llama-4-Scout-17B-16E-Instruct:
   - accuracy: 80.00
+  - quant_algo: FP8
+    kv_cache_quant_algo: FP8
+    accuracy: 80.00
+  - quant_algo: NVFP4
+    kv_cache_quant_algo: FP8
+    accuracy: 80.00
 mistralai/Mistral-7B-v0.1:
   - accuracy: 66
 mistralai/Mistral-7B-Instruct-v0.3:
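Note: the two reference hunks above extend the accuracy tables the harness checks against. Each model maps to a list of entries; an entry with only an accuracy is the unquantized baseline, while entries carrying quant_algo / kv_cache_quant_algo hold the expected score for that quantization. A minimal sketch of how such an entry could be looked up is below; the reference_accuracy helper and the PyYAML usage are illustrative assumptions, not the harness's actual API.

# Hypothetical sketch: resolve a reference accuracy by (quant_algo, kv_cache_quant_algo).
import yaml  # PyYAML assumed available

REFERENCES = yaml.safe_load("""
meta-llama/Llama-4-Scout-17B-16E-Instruct:
  - accuracy: 89.70
  - quant_algo: FP8
    kv_cache_quant_algo: FP8
    accuracy: 89.61
  - quant_algo: NVFP4
    kv_cache_quant_algo: FP8
    accuracy: 89.00
""")


def reference_accuracy(model, quant_algo=None, kv_cache_quant_algo=None):
    """Return the reference accuracy whose quant settings match, else None."""
    for entry in REFERENCES.get(model, []):
        if (entry.get("quant_algo") == quant_algo
                and entry.get("kv_cache_quant_algo") == kv_cache_quant_algo):
            return entry["accuracy"]
    return None


if __name__ == "__main__":
    # FP8 weights + FP8 KV cache -> 89.61 in the snippet above.
    print(reference_accuracy("meta-llama/Llama-4-Scout-17B-16E-Instruct",
                             quant_algo="FP8", kv_cache_quant_algo="FP8"))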
@@ -398,6 +398,68 @@ class TestLlama4MaverickInstruct(LlmapiAccuracyTestHarness):
             task = GSM8K(self.MODEL_NAME)
             task.evaluate(llm)
 
+    @skip_pre_blackwell
+    @pytest.mark.skip_less_device(8)
+    @parametrize_with_ids("attn_backend", ["TRTLLM", "FLASHINFER"])
+    def test_chunked_prefill(self, attn_backend):
+        pytorch_config = dict(attn_backend=attn_backend,
+                              disable_overlap_scheduler=True)
+        with LLM(self.MODEL_PATH,
+                 tensor_parallel_size=8,
+                 pipeline_parallel_size=1,
+                 moe_expert_parallel_size=1,
+                 max_seq_len=8192,
+                 enable_chunked_prefill=True,
+                 max_num_tokens=256,
+                 **pytorch_config) as llm:
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
+
+    @skip_pre_hopper
+    @pytest.mark.skip_less_mpi_world_size(8)
+    @parametrize_with_ids("cuda_graph", [False, True])
+    @pytest.mark.parametrize("tp_size,pp_size,ep_size", [(8, 1, 1), (8, 1, 4),
+                                                         (8, 1, 8)],
+                             ids=["tp8", "tp8ep4", "tp8ep8"])
+    def test_fp8_prequantized(self, cuda_graph, tp_size, pp_size, ep_size):
+        with LLM(
+                f"{llm_models_root()}/llama4-models/nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8",
+                tensor_parallel_size=tp_size,
+                # Keep this low to avoid warmup OOM in CI
+                max_seq_len=8192,
+                pipeline_parallel_size=pp_size,
+                moe_expert_parallel_size=ep_size,
+                use_cuda_graph=cuda_graph) as llm:
+            assert llm.args.quant_config.quant_algo == QuantAlgo.FP8
+            assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm)
+
+    @skip_pre_hopper
+    @pytest.mark.skip_less_mpi_world_size(8)
+    @parametrize_with_ids("cuda_graph", [False, True])
+    @pytest.mark.parametrize("tp_size,pp_size,ep_size", [(8, 1, 8)],
+                             ids=["tp8ep8"])
+    def test_fp8_chunked_prefill(self, cuda_graph, tp_size, pp_size, ep_size):
+        with LLM(
+                f"{llm_models_root()}/llama4-models/nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8",
+                tensor_parallel_size=tp_size,
+                # Keep this low to avoid warmup OOM in CI
+                max_seq_len=8192,
+                pipeline_parallel_size=pp_size,
+                moe_expert_parallel_size=ep_size,
+                enable_chunked_prefill=True,
+                max_num_tokens=256,
+                use_cuda_graph=cuda_graph) as llm:
+            assert llm.args.quant_config.quant_algo == QuantAlgo.FP8
+            assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm)
+
 
 class TestLlama4ScoutInstruct(LlmapiAccuracyTestHarness):
     MODEL_NAME = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
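Note: the hunk above adds chunked-prefill and FP8-prequantized accuracy tests for Llama-4 Maverick, all driven through the LLM API and scored with MMLU and GSM8K. A standalone sketch of the chunked-prefill configuration follows; the keyword arguments are copied from the test, while the import path, placeholder model path, and the generate() call are assumptions about the surrounding LLM API rather than part of this change.

# Minimal standalone sketch of the chunked-prefill setup exercised by
# test_chunked_prefill above (outside pytest). Assumes the same LLM API
# surface as the test; the model path and prompt are placeholders.
from tensorrt_llm import LLM  # exact import path may differ by branch

MODEL_PATH = "meta-llama/Llama-4-Maverick-17B-128E-Instruct"  # placeholder

if __name__ == "__main__":
    with LLM(MODEL_PATH,
             tensor_parallel_size=8,
             pipeline_parallel_size=1,
             moe_expert_parallel_size=1,
             max_seq_len=8192,
             enable_chunked_prefill=True,  # prefill split into max_num_tokens-sized chunks
             max_num_tokens=256,
             attn_backend="TRTLLM",        # or "FLASHINFER", as parametrized above
             disable_overlap_scheduler=True) as llm:
        for out in llm.generate(["Explain chunked prefill in one sentence."]):
            print(out.outputs[0].text)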
@@ -423,6 +485,94 @@ class TestLlama4ScoutInstruct(LlmapiAccuracyTestHarness):
             task = GSM8K(self.MODEL_NAME)
             task.evaluate(llm)
 
+    @skip_pre_hopper
+    @pytest.mark.skip_less_mpi_world_size(4)
+    @parametrize_with_ids("cuda_graph", [True])
+    @pytest.mark.parametrize("tp_size,pp_size,ep_size", [(4, 1, 4), (4, 1, 1)],
+                             ids=["tp4ep4", "tp4"])
+    def test_fp8_prequantized(self, cuda_graph, tp_size, pp_size, ep_size):
+        model_path = f"{llm_models_root()}/llama4-models/Llama-4-Scout-17B-16E-Instruct-FP8"
+        with LLM(
+                model_path,
+                tensor_parallel_size=tp_size,
+                # Keep this low to avoid warmup OOM in CI
+                max_seq_len=8192,
+                pipeline_parallel_size=pp_size,
+                moe_expert_parallel_size=ep_size,
+                use_cuda_graph=cuda_graph) as llm:
+            assert llm.args.quant_config.quant_algo == QuantAlgo.FP8
+            assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm)
+
+    @skip_pre_hopper
+    @pytest.mark.skip_less_mpi_world_size(4)
+    @parametrize_with_ids("cuda_graph", [True])
+    @pytest.mark.parametrize("tp_size,pp_size,ep_size", [(4, 1, 4)],
+                             ids=["tp4ep4"])
+    def test_fp8_chunked_prefill(self, cuda_graph, tp_size, pp_size, ep_size):
+        with LLM(
+                f"{llm_models_root()}/llama4-models/Llama-4-Scout-17B-16E-Instruct-FP8",
+                tensor_parallel_size=tp_size,
+                max_seq_len=22000,
+                pipeline_parallel_size=pp_size,
+                moe_expert_parallel_size=ep_size,
+                enable_chunked_prefill=True,
+                max_num_tokens=256,
+                use_cuda_graph=cuda_graph) as llm:
+            assert llm.args.quant_config.quant_algo == QuantAlgo.FP8
+            assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm)
+
+    @skip_pre_blackwell
+    @pytest.mark.skip_less_mpi_world_size(4)
+    @parametrize_with_ids("cuda_graph", [True])
+    @pytest.mark.parametrize("tp_size,pp_size,ep_size", [(4, 1, 4), (4, 1, 1)],
+                             ids=["tp4ep4", "tp4"])
+    def test_fp4_prequantized(self, cuda_graph, tp_size, pp_size, ep_size):
+        model_path = f"{llm_models_root()}/llama4-models/Llama-4-Scout-17B-16E-Instruct-FP4"
+        with LLM(
+                model_path,
+                tensor_parallel_size=tp_size,
+                # Keep this low to avoid warmup OOM in CI
+                max_seq_len=8192,
+                pipeline_parallel_size=pp_size,
+                moe_expert_parallel_size=ep_size,
+                use_cuda_graph=cuda_graph) as llm:
+            assert llm.args.quant_config.quant_algo == QuantAlgo.NVFP4
+            assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm)
+
+    @skip_pre_blackwell
+    @pytest.mark.skip_less_mpi_world_size(8)
+    @parametrize_with_ids("cuda_graph", [True])
+    @pytest.mark.parametrize("tp_size,pp_size,ep_size", [(4, 1, 4)],
+                             ids=["tp4ep4"])
+    def test_fp4_chunked_prefill(self, cuda_graph, tp_size, pp_size, ep_size):
+        with LLM(
+                f"{llm_models_root()}/llama4-models/Llama-4-Scout-17B-16E-Instruct-FP4",
+                tensor_parallel_size=tp_size,
+                pipeline_parallel_size=pp_size,
+                moe_expert_parallel_size=ep_size,
+                max_seq_len=22000,
+                enable_chunked_prefill=True,
+                max_num_tokens=256,
+                use_cuda_graph=cuda_graph) as llm:
+            assert llm.args.quant_config.quant_algo == QuantAlgo.NVFP4
+            assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm)
+
 
 class TestMistral7B(LlmapiAccuracyTestHarness):
     MODEL_NAME = "mistralai/Mistral-7B-v0.1"
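Note: the Scout chunked-prefill tests above raise max_seq_len to 22000 while capping max_num_tokens at 256, so a full-length prompt is prefilled in many small chunks rather than one pass. A back-of-the-envelope count, assuming prefill chunks are bounded by max_num_tokens (exact scheduler behavior may differ):

# Chunk count for a prompt at the max_seq_len limit with the settings above.
import math

max_seq_len = 22000
max_num_tokens = 256
print(math.ceil(max_seq_len / max_num_tokens))  # -> 86 prefill chunks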
@@ -1860,6 +1860,40 @@ def test_ptp_quickstart_advanced_8gpus(llm_root, llm_venv, model_name,
         _check_mem_usage(running_log, [mapping[model_name], 0, 0, 0], 8)
 
 
+@skip_pre_hopper
+@pytest.mark.skip_less_device(8)
+@pytest.mark.parametrize("cuda_graph", [False, True])
+@pytest.mark.parametrize("model_name,model_path", [
+    ("Llama-4-Maverick-17B-128E-Instruct-FP8",
+     "llama4-models/nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8"),
+    ("Llama-4-Scout-17B-16E-Instruct-FP8",
+     "llama4-models/Llama-4-Scout-17B-16E-Instruct-FP8"),
+    pytest.param('Llama-4-Scout-17B-16E-Instruct-FP4',
+                 'llama4-models/Llama-4-Scout-17B-16E-Instruct-FP4',
+                 marks=skip_pre_blackwell),
+])
+def test_ptp_quickstart_advanced_8gpus_chunked_prefill_sq_22k(
+        llm_root, llm_venv, model_name, model_path, cuda_graph):
+    print(f"Testing {model_name} on 8 GPUs.")
+    example_root = Path(os.path.join(llm_root, "examples", "pytorch"))
+    cmd = [
+        str(example_root / "quickstart_advanced.py"),
+        "--enable_chunked_prefill",
+        "--model_dir",
+        f"{llm_models_root()}/{model_path}",
+        "--tp_size=8",
+        "--moe_ep_size=8",
+        "--max_seq_len=22000",
+        "--kv_cache_fraction=0.1",
+    ]
+    if cuda_graph:
+        cmd.extend([
+            "--use_cuda_graph",
+            "--cuda_graph_padding_enabled",
+        ])
+    llm_venv.run_cmd(cmd)
+
+
 # This test is specifically to be run on 2 GPUs on Blackwell RTX 6000 Pro (SM120) architecture
 # TODO: remove once we have a node with 8 GPUs and reuse test_ptp_quickstart_advanced_8gpus
 @skip_no_sm120
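Note: the new end-to-end case above drives examples/pytorch/quickstart_advanced.py with chunked prefill and a 22000-token sequence limit. For reference, a sketch of an equivalent standalone invocation is below; the repository and model paths are placeholders, and subprocess stands in for the test's llm_venv.run_cmd fixture.

# Sketch of running the same quickstart_advanced.py command outside the test
# harness, with the flags taken from the test above. Paths are placeholders.
import subprocess
import sys
from pathlib import Path

LLM_ROOT = Path("/path/to/TensorRT-LLM")  # placeholder checkout location
MODEL_DIR = "/path/to/llama4-models/Llama-4-Scout-17B-16E-Instruct-FP8"  # placeholder

cmd = [
    sys.executable,
    str(LLM_ROOT / "examples" / "pytorch" / "quickstart_advanced.py"),
    "--enable_chunked_prefill",
    "--model_dir", MODEL_DIR,
    "--tp_size=8",
    "--moe_ep_size=8",
    "--max_seq_len=22000",
    "--kv_cache_fraction=0.1",
    # Optional flags, mirroring cuda_graph=True in the parametrization:
    "--use_cuda_graph",
    "--cuda_graph_padding_enabled",
]

if __name__ == "__main__":
    subprocess.run(cmd, check=True)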
@@ -452,9 +452,22 @@ accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_auto_dtype
 accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_auto_dtype[tp8-cuda_graph=False]
 accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_auto_dtype[tp8ep4-cuda_graph=True]
 accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_auto_dtype[tp8ep8-cuda_graph=True]
+accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_chunked_prefill[attn_backend=FLASHINFER]
+accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_chunked_prefill[attn_backend=TRTLLM]
+accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8_prequantized[tp8ep8-cuda_graph=True]
+accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8_prequantized[tp8ep4-cuda_graph=True]
+accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8_prequantized[tp8-cuda_graph=True]
+accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8_chunked_prefill[tp8ep8-cuda_graph=False]
+accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8_chunked_prefill[tp8ep8-cuda_graph=True]
 accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_auto_dtype[tp8-cuda_graph=False]
 accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_auto_dtype[tp8ep4-cuda_graph=True]
 accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_auto_dtype[tp8ep8-cuda_graph=True]
+accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp8_prequantized[tp4ep4-cuda_graph=True]
+accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp8_prequantized[tp4-cuda_graph=True]
+accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp8_chunked_prefill[tp4ep4-cuda_graph=True]
+accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp4_prequantized[tp4ep4-cuda_graph=True]
+accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp4_prequantized[tp4-cuda_graph=True]
+accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp4_chunked_prefill[tp4ep4-cuda_graph=True]
 accuracy/test_llm_api_pytorch.py::TestMixtral8x7B::test_fp8_tp2
 accuracy/test_llm_api_pytorch.py::TestMixtral8x7B::test_nvfp4_tp2
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
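Note: the entries above register the new cases in the test list by pytest node ID. Outside of that list, a single case can be selected directly by the same ID, as in this sketch (pytest.main is standard pytest; the node ID is taken verbatim from the hunk):

# Run one of the newly listed cases by its node ID.
import pytest

if __name__ == "__main__":
    pytest.main([
        "accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::"
        "test_fp8_prequantized[tp8ep8-cuda_graph=True]",
        "-q",
    ])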
@@ -517,6 +530,10 @@ test_e2e.py::test_ptp_quickstart_advanced_8gpus[Llama3.1-405B-FP8-llama-3.1-mode
 test_e2e.py::test_ptp_quickstart_advanced_8gpus[Mixtral-8x7B-BF16-Mixtral-8x7B-v0.1]
 test_e2e.py::test_ptp_quickstart_advanced_8gpus[Mixtral-8x7B-NVFP4-nvfp4-quantized/Mixtral-8x7B-Instruct-v0.1]
 test_e2e.py::test_ptp_quickstart_advanced_deepseek_r1_8gpus[DeepSeek-R1-DeepSeek-R1/DeepSeek-R1]
+test_e2e.py::test_ptp_quickstart_advanced_8gpus_chunked_prefill_sq_22k[Llama-4-Maverick-17B-128E-Instruct-FP8-llama4-models/nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8-False]
+test_e2e.py::test_ptp_quickstart_advanced_8gpus_chunked_prefill_sq_22k[Llama-4-Maverick-17B-128E-Instruct-FP8-llama4-models/nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8-True]
+test_e2e.py::test_ptp_quickstart_advanced_8gpus_chunked_prefill_sq_22k[Llama-4-Scout-17B-16E-Instruct-FP8-llama4-models/Llama-4-Scout-17B-16E-Instruct-FP8-True]
+test_e2e.py::test_ptp_quickstart_advanced_8gpus_chunked_prefill_sq_22k[Llama-4-Scout-17B-16E-Instruct-FP4-llama4-models/Llama-4-Scout-17B-16E-Instruct-FP4-True]
 test_e2e.py::test_relaxed_acceptance_quickstart_advanced_deepseek_r1_8gpus[DeepSeek-R1-DeepSeek-R1/DeepSeek-R1]
 test_e2e.py::test_ptp_quickstart_multimodal[NVILA-8B-FP16-vila/NVILA-8B-image]
 test_e2e.py::test_ptp_quickstart_multimodal[NVILA-8B-FP16-vila/NVILA-8B-video]