test: rcca https://nvbugs/5223130 (#4510)
* add rcca tests

Signed-off-by: xinhe-nv <200704525+xinhe-nv@users.noreply.github.com>

* skip tests on blackwell

Signed-off-by: xinhe-nv <200704525+xinhe-nv@users.noreply.github.com>

---------

Signed-off-by: xinhe-nv <200704525+xinhe-nv@users.noreply.github.com>
parent 157fe62965
commit 59f7622281
@@ -200,6 +200,7 @@ class TestNemotronMini4BInstruct(CliFlowAccuracyTestHarness):
         self.run(quant_algo=QuantAlgo.FP8, kv_cache_quant_algo=QuantAlgo.FP8)
 
 
+@skip_post_blackwell
 class TestPhi2(CliFlowAccuracyTestHarness):
     MODEL_NAME = "microsoft/phi-2"
     MODEL_PATH = f"{llm_models_root()}/phi-2"
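For reference, the `@skip_post_blackwell` marker added throughout this change comes from the suite's conftest. A minimal sketch of how such an architecture-gated skip can be built, assuming compute capability is read via `torch.cuda.get_device_capability` (the real conftest may detect the device differently):

```python
# Hedged sketch only: the actual skip_post_blackwell is defined in the
# repository's conftest; this illustrates the general pattern.
import pytest
import torch


def _compute_capability() -> int:
    """Return the GPU compute capability, e.g. 90 (Hopper) or 100 (Blackwell)."""
    if not torch.cuda.is_available():
        return 0
    major, minor = torch.cuda.get_device_capability()
    return major * 10 + minor


# Skips the decorated test on Blackwell (SM 10.x) and newer architectures.
skip_post_blackwell = pytest.mark.skipif(
    _compute_capability() >= 100,
    reason="not supported on Blackwell and later architectures")
```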
@@ -215,6 +216,7 @@ class TestPhi2(CliFlowAccuracyTestHarness):
         self.run(tp_size=2)
 
 
+@skip_post_blackwell
 class TestPhi3Mini4kInstruct(CliFlowAccuracyTestHarness):
     MODEL_NAME = "microsoft/Phi-3-mini-4k-instruct"
     MODEL_PATH = f"{llm_models_root()}/Phi-3/Phi-3-mini-4k-instruct"
@@ -224,6 +226,7 @@ class TestPhi3Mini4kInstruct(CliFlowAccuracyTestHarness):
         self.run(dtype='auto')
 
 
+@skip_post_blackwell
 class TestPhi3Mini128kInstruct(CliFlowAccuracyTestHarness):
     MODEL_NAME = "microsoft/Phi-3-mini-128k-instruct"
     MODEL_PATH = f"{llm_models_root()}/Phi-3/Phi-3-mini-128k-instruct"
@@ -233,6 +236,7 @@ class TestPhi3Mini128kInstruct(CliFlowAccuracyTestHarness):
         self.run(dtype='auto')
 
 
+@skip_post_blackwell
 class TestPhi3Small8kInstruct(CliFlowAccuracyTestHarness):
     MODEL_NAME = "microsoft/Phi-3-small-8k-instruct"
     MODEL_PATH = f"{llm_models_root()}/Phi-3/Phi-3-small-8k-instruct"
@@ -242,6 +246,7 @@ class TestPhi3Small8kInstruct(CliFlowAccuracyTestHarness):
         self.run(dtype='auto')
 
 
+@skip_post_blackwell
 class TestPhi3Small128kInstruct(CliFlowAccuracyTestHarness):
     MODEL_NAME = "microsoft/Phi-3-small-128k-instruct"
     MODEL_PATH = f"{llm_models_root()}/Phi-3/Phi-3-small-128k-instruct"
@@ -251,6 +256,7 @@ class TestPhi3Small128kInstruct(CliFlowAccuracyTestHarness):
         self.run(dtype='auto')
 
 
+@skip_post_blackwell
 class TestPhi3_5MiniInstruct(CliFlowAccuracyTestHarness):
     MODEL_NAME = "microsoft/Phi-3.5-mini-instruct"
     MODEL_PATH = f"{llm_models_root()}/Phi-3.5/Phi-3.5-mini-instruct"
@@ -21,8 +21,9 @@ from tensorrt_llm.llmapi import KvCacheConfig, MTPDecodingConfig, SamplingParams
 from tensorrt_llm.models.modeling_utils import QuantConfig
 from tensorrt_llm.quantization import QuantAlgo
 
-from ..conftest import (llm_models_root, parametrize_with_ids, skip_pre_ada,
-                        skip_pre_blackwell, skip_pre_hopper)
+from ..conftest import (llm_models_root, parametrize_with_ids,
+                        skip_post_blackwell, skip_pre_ada, skip_pre_blackwell,
+                        skip_pre_hopper)
 from .accuracy_core import (GSM8K, MMLU, CnnDailymail, GPQADiamond,
                             LlmapiAccuracyTestHarness)
 
@@ -1015,6 +1016,7 @@ class TestQwen3_30B_A3B(LlmapiAccuracyTestHarness):
     MODEL_NAME = "Qwen3/Qwen3-30B-A3B"
 
     @skip_pre_hopper
+    @skip_post_blackwell
     @pytest.mark.parametrize(
         "tp_size,pp_size,ep_size,attention_dp,cuda_graph,overlap_scheduler",
         [(1, 1, 1, False, False, True)],
@@ -14,11 +14,13 @@
 # limitations under the License.
 
 import csv
 import os
 from copy import deepcopy
 
 import pytest
-from defs.common import convert_weights, venv_check_call
-from defs.conftest import get_device_memory, skip_post_blackwell
+from defs.common import convert_weights, venv_check_call, venv_mpi_check_call
+from defs.conftest import (get_device_memory, llm_models_root,
+                           skip_post_blackwell, skip_pre_hopper)
 from defs.trt_test_alternative import check_call
 
@@ -144,3 +146,182 @@ def test_llm_draft_target_model_1gpu(batch_size, data_type, draft_len,
             assert (
                 int(dt) == int(b)
             ), f"Output at ({bs=}, {index=}) is different ({dt} v.s. {b})."
+
+
+@skip_post_blackwell
+def test_llm_draft_target_llama_1gpu(llama_example_root, llm_venv, cmodel_dir,
+                                     engine_dir):
+    "RCCA https://nvbugs/5223130"
+    data_type = "float16"
+    max_batch_size = 4
+    max_draft_len = 10
+    max_input_len = 3200
+    max_seq_len = 4800
+
+    draft_model = os.path.join(llm_models_root(), "llama-3.2-models",
+                               "Llama-3.2-1B")
+    target_model = os.path.join(llm_models_root(), "llama-3.1-model",
+                                "Meta-Llama-3.1-8B")
+
+    print("Build checkpoint ...")
+    draft_model_dir = convert_weights(llm_venv=llm_venv,
+                                      example_root=llama_example_root,
+                                      cmodel_dir=cmodel_dir,
+                                      model="llama3-1b",
+                                      model_path=draft_model,
+                                      data_type=data_type)
+
+    target_model_dir = convert_weights(llm_venv=llm_venv,
+                                       example_root=llama_example_root,
+                                       cmodel_dir=cmodel_dir,
+                                       model="llama3-8b",
+                                       model_path=target_model,
+                                       data_type=data_type)
+
+    print("Build engines ...")
+    draft_engine_dir = os.path.join(engine_dir, "draft")
+    target_engine_dir = os.path.join(engine_dir, "target")
+
+    base_build_cmd = [
+        "trtllm-build",
+        f"--max_batch_size={max_batch_size}",
+        f"--max_input_len={max_input_len}",
+        f"--max_seq_len={max_seq_len}",
+        "--use_paged_context_fmha=enable",
+        f"--gemm_plugin={data_type}",
+        "--gather_generation_logits",
+    ]
+
+    draft_model_build_cmd = base_build_cmd + [
+        f"--checkpoint_dir={draft_model_dir}",
+        f"--output_dir={draft_engine_dir}",
+    ]
+
+    target_model_build_cmd = base_build_cmd + [
+        f"--checkpoint_dir={target_model_dir}",
+        "--speculative_decoding_mode=draft_tokens_external",
+        f"--max_draft_len={max_draft_len}",
+        f"--output_dir={target_engine_dir}",
+    ]
+
+    check_call(" ".join(draft_model_build_cmd),
+               shell=True,
+               env=llm_venv._new_env)
+    check_call(" ".join(target_model_build_cmd),
+               shell=True,
+               env=llm_venv._new_env)
+
+    print("Run inferences ...")
+    run_cmd = [
+        f"{llama_example_root}/../../../run.py",
+        f"--tokenizer_dir={target_model}",
+        f"--draft_engine_dir={draft_engine_dir}",
+        f"--engine_dir={target_engine_dir}",
+        "--draft_target_model_config=[4,[0],[0],True]", "--max_output_len=256",
+        "--kv_cache_enable_block_reuse",
+        "--kv_cache_free_gpu_memory_fraction=0.4",
+        f"--input_text='how does draft-sampling work'"
+    ]
+
+    venv_check_call(llm_venv, run_cmd)
+
+
+@skip_post_blackwell
+@skip_pre_hopper
+@pytest.mark.skip_less_device(2)
+@pytest.mark.skip_less_device_memory(80000)
+def test_llm_draft_target_llama_fp8_2gpu(llama_example_root, llm_venv,
+                                         qcache_dir, engine_dir,
+                                         llm_datasets_root):
+    "RCCA https://nvbugs/5257681"
+    data_type = "bfloat16"
+    max_batch_size = 16
+    max_draft_len = 5
+    max_input_len = 2000
+    max_seq_len = 4000
+
+    draft_model = os.path.join(llm_models_root(), "llama-3.1-model",
+                               "Meta-Llama-3.1-8B")
+
+    target_model = os.path.join(llm_models_root(), "llama-3.3-models",
+                                "Llama-3.3-70B-Instruct")
+
+    draft_quantized_dir = os.path.join(qcache_dir, "draft")
+    target_quantized_dir = os.path.join(qcache_dir, "target")
+
+    print("Build checkpoint ...")
+    quantize_cmd = [
+        f"{llama_example_root}/../../../quantization/quantize.py",
+        f"--calib_dataset={llm_datasets_root}/cnn_dailymail",
+        f"--dtype={data_type}",
+        "--qformat=fp8",
+        "--calib_size=32",
+        "--kv_cache_dtype=fp8",
+        "--tp_size=2",
+    ]
+
+    draft_quantized_cmd = quantize_cmd + [
+        f"--model_dir={draft_model}",
+        f"--output_dir={draft_quantized_dir}",
+    ]
+
+    target_quantized_cmd = quantize_cmd + [
+        f"--model_dir={target_model}",
+        f"--output_dir={target_quantized_dir}",
+    ]
+
+    venv_check_call(llm_venv, draft_quantized_cmd)
+    venv_check_call(llm_venv, target_quantized_cmd)
+
+    print("Build engines ...")
+    draft_engine_dir = os.path.join(engine_dir, "draft")
+    target_engine_dir = os.path.join(engine_dir, "target")
+
+    base_build_cmd = [
+        "trtllm-build",
+        f"--max_batch_size={max_batch_size}",
+        f"--max_input_len={max_input_len}",
+        f"--max_seq_len={max_seq_len}",
+        "--use_paged_context_fmha=enable",
+        "--gemm_plugin=auto",
+        "--workers=2",
+        "--gather_generation_logits",
+    ]
+
+    draft_model_build_cmd = base_build_cmd + [
+        f"--checkpoint_dir={draft_quantized_dir}",
+        f"--output_dir={draft_engine_dir}",
+    ]
+
+    target_model_build_cmd = base_build_cmd + [
+        f"--checkpoint_dir={target_quantized_dir}",
+        f"--output_dir={target_engine_dir}",
+        f"--max_draft_len={max_draft_len}",
+        "--speculative_decoding_mode=draft_tokens_external",
+    ]
+
+    check_call(" ".join(draft_model_build_cmd),
+               shell=True,
+               env=llm_venv._new_env)
+    check_call(" ".join(target_model_build_cmd),
+               shell=True,
+               env=llm_venv._new_env)
+
+    INPUT_TEXT = "The United States of America (USA), also known as the United States (U.S.) or America, is a country located primarily in North America. It is a federal republic of 50 states and the federal capital district of Washington, D.C. The 48 contiguous states border Canada to the north and Mexico to the south, with the state of Alaska to the northwest and the islands of Hawaii in Oceania. Indian country includes 574 federally recognized tribes and 326 Indian reservations with tribal sovereignty rights. The U.S. asserts sovereignty over five major island territories and various uninhabited islands in the Pacific Ocean and the Caribbean. It has the world's third-largest land area[c] and third-largest population, exceeding 340 million. Paleo-Indians migrated to North America across"
+
+    print("Run inferences ...")
+    run_cmd = [
+        f"{llama_example_root}/../../../run.py",
+        f"--tokenizer_dir={draft_model}",
+        f"--draft_engine_dir={draft_engine_dir}",
+        f"--engine_dir={target_engine_dir}",
+        f"--input_text={INPUT_TEXT}",
+        "--draft_target_model_config=[3,[0,1],[0,1],False]",
+        "--max_output_len=800",
+        "--kv_cache_enable_block_reuse",
+        "--kv_cache_free_gpu_memory_fraction=0.3",
+        "--run_profiling",
+    ]
+
+    venv_mpi_check_call(llm_venv, ["mpirun", "-n", "1", "--allow-run-as-root"],
+                        run_cmd)
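The `--draft_target_model_config` strings used above ("[4,[0],[0],True]" and "[3,[0,1],[0,1],False]") pack the draft-target speculative-decoding setup into a single literal. A hedged sketch of decoding such a value, assuming the layout is (draft length, draft-engine device ids, target-engine device ids, use-logits flag); run.py's actual parsing may differ:

```python
# Hedged sketch: the field layout below is an assumption inferred from how
# the two tests above use the flag, not run.py's documented contract.
import ast
from typing import List, NamedTuple


class DraftTargetConfig(NamedTuple):
    max_draft_len: int            # tokens proposed by the draft model per step
    draft_device_ids: List[int]   # GPUs running the draft engine
    target_device_ids: List[int]  # GPUs running the target engine
    use_logits: bool              # accept drafts by logits rather than tokens


def parse_draft_target_config(raw: str) -> DraftTargetConfig:
    # The value is a Python-style list literal, so literal_eval handles it.
    draft_len, draft_ids, target_ids, use_logits = ast.literal_eval(raw)
    return DraftTargetConfig(int(draft_len), list(draft_ids),
                             list(target_ids), bool(use_logits))


# The 1-GPU test: 4 draft tokens, both engines on GPU 0, logit comparison on.
print(parse_draft_target_config("[4,[0],[0],True]"))
# The 2-GPU FP8 test: 3 draft tokens, both engines spanning GPUs 0 and 1.
print(parse_draft_target_config("[3,[0,1],[0,1],False]"))
```

Under that reading, the two tests differ only in scale: the 1-GPU case drives both engines on a single device and accepts drafts by comparing logits, while the FP8 case shards both engines across two devices and compares tokens instead.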
@@ -303,6 +303,7 @@ def test_mistral_eagle_1gpu(llm_mistral_model_root,
                             llm_rouge_root=llm_rouge_root)
 
 
+@skip_post_blackwell
 @skip_pre_ada
 @pytest.mark.parametrize("use_dynamic_tree", [False, True],
                          ids=['eagle1', 'eagle2'])
@@ -1065,6 +1065,7 @@ def test_llm_llama_v3_1_autoq_2gpu_mmlu(llama_example_root, llama_model_root,
     check_call(" ".join(mmlu_cmd), shell=True, env=llm_venv._new_env)
 
 
+@skip_post_blackwell
 @pytest.mark.skip_less_device(2)
 @pytest.mark.skip_less_device_memory(80000)
 @pytest.mark.parametrize("use_auto_parallel", [True, False],
@@ -621,12 +621,13 @@ def _test_llm_multimodal_general(llm_venv,
     'cogvlm-chat',
     'fuyu-8b',
     'deplot',
-    'neva-22b',
+    pytest.param('neva-22b',
+                 marks=pytest.mark.skip(reason="RCCA https://nvbugs/5220761")),
     'kosmos-2',
     'video-neva',
     pytest.param('Phi-3-vision-128k-instruct', marks=skip_post_blackwell),
-    'Phi-3.5-vision-instruct',
-    'Phi-4-multimodal-instruct',
+    pytest.param('Phi-3.5-vision-instruct', marks=skip_post_blackwell),
+    pytest.param('Phi-4-multimodal-instruct', marks=skip_post_blackwell),
     'Llama-3.2-11B-Vision',
     'Qwen2-VL-7B-Instruct',
     'internlm-xcomposer2-vl-7b',
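The edit above swaps plain string entries for `pytest.param(...)` wrappers, which is how pytest attaches a mark (a skip, an xfail, an architecture gate) to a single parametrized case without touching its neighbours. A self-contained illustration of the pattern, using a stand-in marker rather than the suite's real `skip_post_blackwell`:

```python
import pytest

# Stand-in for the suite's architecture gate; the condition is always False
# here so the example runs everywhere.
skip_post_blackwell = pytest.mark.skipif(False, reason="illustrative gate")


@pytest.mark.parametrize("model", [
    'fuyu-8b',  # plain entry: runs unconditionally
    pytest.param('neva-22b',
                 marks=pytest.mark.skip(reason="RCCA https://nvbugs/5220761")),
    pytest.param('Phi-4-multimodal-instruct', marks=skip_post_blackwell),
])
def test_model_name_is_string(model):
    assert isinstance(model, str)
```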
@@ -1639,6 +1639,7 @@ def test_ptp_quickstart_advanced_deepseek_r1_8gpus(llm_root, llm_venv,
         _check_mem_usage(running_log, [106.3, 0, 0, 0], 8)
 
 
+@skip_post_blackwell
 @pytest.mark.skip_less_device_memory(110000)
 @pytest.mark.skip_less_device(8)
 @pytest.mark.parametrize("model_name,model_path", [
@@ -95,6 +95,8 @@ examples/test_draft_target_model.py::test_llm_draft_target_model_1gpu[streaming-
 examples/test_draft_target_model.py::test_llm_draft_target_model_1gpu[streaming-gpt2-use_cpp_session-use_tokens-draft_len_8-float16-bs1]
 examples/test_draft_target_model.py::test_llm_draft_target_model_1gpu[no_streaming-llama_v2-use_cpp_session-use_tokens-draft_len_4-float16-bs2]
 examples/test_draft_target_model.py::test_llm_draft_target_model_1gpu[streaming-llama_v2-use_cpp_session-use_logits-draft_len_4-float16-bs2]
+examples/test_draft_target_model.py::test_llm_draft_target_llama_1gpu
+examples/test_draft_target_model.py::test_llm_draft_target_llama_fp8_2gpu
 examples/test_prompt_lookup.py::test_llm_prompt_lookup_1gpu[no_streaming-gpt2-use_cpp_session-use_tokens-max_matching_ngram_size_2-prompt_lookup_num_tokens_8-float16-bs1]
 examples/test_prompt_lookup.py::test_llm_prompt_lookup_1gpu[no_streaming-gpt2-use_cpp_session-use_tokens-max_matching_ngram_size_2-prompt_lookup_num_tokens_8-float16-bs2]
 examples/test_prompt_lookup.py::test_llm_prompt_lookup_1gpu[streaming-gpt2-use_cpp_session-use_tokens-max_matching_ngram_size_2-prompt_lookup_num_tokens_8-float16-bs1]
@@ -184,8 +186,6 @@ examples/test_multimodal.py::test_llm_multimodal_general[llava-v1.6-mistral-7b-h
 examples/test_multimodal.py::test_llm_multimodal_general[llava-onevision-qwen2-7b-ov-hf-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1]
 examples/test_multimodal.py::test_llm_multimodal_general[llava-onevision-qwen2-7b-ov-hf-video-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1]
 examples/test_multimodal.py::test_llm_multimodal_general[Mistral-Small-3.1-24B-Instruct-2503-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False-nb:1]
-examples/test_multimodal.py::test_llm_multimodal_general[neva-22b-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False-nb:1]
-examples/test_multimodal.py::test_llm_multimodal_general[neva-22b-pp:1-tp:1-bfloat16-bs:8-cpp_e2e:False-nb:1]
 examples/test_multimodal.py::test_llm_multimodal_general[nougat-base-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False-nb:1]
 examples/test_multimodal.py::test_llm_multimodal_general[nougat-base-pp:1-tp:1-bfloat16-bs:8-cpp_e2e:False-nb:1]
 examples/test_multimodal.py::test_llm_multimodal_general[video-neva-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False-nb:1]
@@ -199,7 +199,6 @@ examples/test_multimodal.py::test_llm_multimodal_general[VILA1.5-3b-pp:1-tp:1-fl
 examples/test_multimodal.py::test_llm_multimodal_general[fuyu-8b-pp:1-tp:1-float16-bs:8-cpp_e2e:True-nb:1]
 examples/test_multimodal.py::test_llm_multimodal_general[kosmos-2-pp:1-tp:1-float16-bs:8-cpp_e2e:True-nb:1]
 examples/test_multimodal.py::test_llm_multimodal_general[llava-1.5-7b-hf-pp:1-tp:1-float16-bs:8-cpp_e2e:True-nb:1]
-examples/test_multimodal.py::test_llm_multimodal_general[neva-22b-pp:1-tp:1-bfloat16-bs:8-cpp_e2e:True-nb:1]
 examples/test_multimodal.py::test_llm_fp8_multimodal_general[fp8-fp8-scienceqa-Llama-3.2-11B-Vision-Instruct-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False]
 examples/test_nemotron.py::test_llm_nemotron_3_8b_1gpu[bfloat16-full_prec]
 examples/test_nemotron.py::test_llm_nemotron_3_8b_1gpu[bfloat16-int4_awq]
@@ -46,7 +46,6 @@ examples/test_multimodal.py::test_llm_multimodal_general[llava-v1.6-mistral-7b-h
 examples/test_multimodal.py::test_llm_multimodal_general[llava-v1.6-mistral-7b-hf-vision-trtllm-pp:1-tp:2-float16-bs:1-cpp_e2e:False-nb:1]
 examples/test_multimodal.py::test_llm_multimodal_general[llava-onevision-qwen2-7b-ov-hf-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1]
 examples/test_multimodal.py::test_llm_multimodal_general[llava-onevision-qwen2-7b-ov-hf-video-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1]
-examples/test_multimodal.py::test_llm_multimodal_general[neva-22b-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False-nb:1]
 examples/test_multimodal.py::test_llm_multimodal_general[nougat-base-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False-nb:1]
 examples/test_multimodal.py::test_llm_multimodal_general[video-neva-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False-nb:1]
 examples/test_multimodal.py::test_llm_multimodal_general[Llama-3.2-11B-Vision-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False-nb:1]
@@ -56,7 +55,6 @@ examples/test_multimodal.py::test_llm_multimodal_general[VILA1.5-3b-pp:1-tp:1-fl
 examples/test_multimodal.py::test_llm_multimodal_general[fuyu-8b-pp:1-tp:1-float16-bs:1-cpp_e2e:True-nb:1]
 examples/test_multimodal.py::test_llm_multimodal_general[kosmos-2-pp:1-tp:1-float16-bs:1-cpp_e2e:True-nb:1]
 examples/test_multimodal.py::test_llm_multimodal_general[llava-1.5-7b-hf-pp:1-tp:1-float16-bs:1-cpp_e2e:True-nb:1]
-examples/test_multimodal.py::test_llm_multimodal_general[neva-22b-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:True-nb:1]
 
 examples/test_nemotron.py::test_llm_nemotron_3_8b_1gpu[bfloat16-full_prec]
 examples/test_nemotron.py::test_llm_nemotron_3_8b_1gpu[bfloat16-int4_awq]
@@ -236,7 +236,6 @@ l0_h100:
       - accuracy/test_llm_api.py::TestMistral_Nemo_12B_Base::test_fp8
       - examples/test_multimodal.py::test_llm_multimodal_general[llava-1.5-7b-hf-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1] # 7 mins
       - examples/test_multimodal.py::test_llm_multimodal_general[fuyu-8b-pp:1-tp:1-float16-bs:1-cpp_e2e:True-nb:1]
-      - examples/test_multimodal.py::test_llm_multimodal_general[neva-22b-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False-nb:1]
       - examples/test_multimodal.py::test_llm_multimodal_general[video-neva-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False-nb:1]
       - examples/test_multimodal.py::test_llm_multimodal_general[kosmos-2-pp:1-tp:1-float16-bs:1-cpp_e2e:True-nb:1]
       - examples/test_multimodal.py::test_llm_multimodal_general[Phi-3-vision-128k-instruct-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1]
@@ -96,7 +96,6 @@ l0_l40s:
       - examples/test_gpt.py::test_llm_gpt2_next_prompt_tuning[use_cpp_session-tp1] # 10 mins
       - examples/test_gpt.py::test_llm_gpt2_next_prompt_tuning[use_py_session-tp1]
       - examples/test_llama.py::test_llm_llama_1gpu_fp8_kv_cache[llama-v2-7b-hf-bfloat16] #4 mins
-      - examples/test_multimodal.py::test_llm_multimodal_general[neva-22b-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False-nb:1]
       - examples/test_multimodal.py::test_llm_multimodal_general[video-neva-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False-nb:1]
       - examples/test_multimodal.py::test_llm_multimodal_general[kosmos-2-pp:1-tp:1-float16-bs:1-cpp_e2e:True-nb:1]
       - examples/test_nemotron.py::test_llm_nemotron_3_8b_1gpu[bfloat16-fp8] # 18mins
@@ -31,7 +31,6 @@ full:L40S/examples/test_commandr.py::test_llm_commandr_plus_4gpus_summary[disabl
 examples/test_phi.py::test_llm_phi_quantization_1gpu[Phi-3-small-128k-instruct-fp8-bfloat16] SKIP (https://nvbugs/4955671)
 examples/test_whisper.py::test_llm_whisper_general[large-v3-enable_gemm_plugin-enable_attention_plugin-enable_weight_only-float16-nb:1-use_python_runtime] SKIP (https://nvbugs/4967883)
 full:GH200/cpp/test_e2e.py::test_model[fp8-gptj-90] SKIP (https://nvbugspro.nvidia.com/bug/4979920)
-full:GH200/examples/test_multimodal.py::test_llm_multimodal_general[neva-22b-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False-nb:1] SKIP (https://nvbugspro.nvidia.com/bug/4979845)
 full:GH200/unittest/trt/model_api/test_model_quantization.py SKIP (https://nvbugspro.nvidia.com/bug/4979955)
 examples/test_gemma.py::test_llm_hf_gemma_quantization_1gpu[gemma-7b-int8_sq-bfloat16-8] SKIP (https://nvbugs/4988782)
 examples/test_llama.py::test_llm_llama_v3_8b_1048k_long_context_ppl[SlimPajama-6B-Llama-3-8B-Instruct-Gradient-1048k] SKIP (https://nvbugs/4993898)
@@ -370,7 +369,6 @@ full:RTX_PRO_6000_Blackwell_Server_Edition/perf/test_perf.py::test_perf[quant:in
 full:RTX_PRO_6000_Blackwell_Server_Edition/perf/test_perf.py::test_perf[quant:int8_sq_per_token_channel] SKIP (https://nvbugspro.nvidia.com/bug/5161074)
 examples/test_recurrentgemma.py::test_llm_recurrentgemma_1gpu[use_cpp_session-recurrentgemma-2b-use_paged_cache-disable_quant-float16-enable_attn_plugin-enable_gemm_plugin] SKIP (https://nvbugs/5174573)
 examples/test_qwen.py::test_llm_qwen_moe_single_gpu_summary[qwen1.5_moe_a2.7b_chat-enable_paged_kv_cache-enable_remove_input_padding-enable_weight_only-enable_fmha] SKIP (https://nvbugs/5180961)
-examples/test_multimodal.py::test_llm_multimodal_general[neva-22b-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5214245)
 examples/test_recurrentgemma.py::test_llm_recurrentgemma_1gpu[use_py_session-recurrentgemma-2b-no_paged_cache-disable_quant-float16-disable_attn_plugin-enable_gemm_plugin] SKIP (https://nvbugs/5214221)
 examples/test_recurrentgemma.py::test_llm_recurrentgemma_1gpu[use_py_session-recurrentgemma-2b-no_paged_cache-disable_quant-float16-enable_attn_plugin-enable_gemm_plugin] SKIP (https://nvbugs/5214221)
 examples/test_recurrentgemma.py::test_llm_recurrentgemma_1gpu[use_py_session-recurrentgemma-2b-use_paged_cache-disable_quant-float16-enable_attn_plugin-enable_gemm_plugin] SKIP (https://nvbugs/5214221)
@@ -386,12 +384,9 @@ examples/test_eagle.py::test_mistral_eagle_1gpu[mistral-7b-v0.1-eagle2] SKIP (ht
 examples/test_multimodal.py::test_llm_multimodal_general[VILA1.5-3b-pp:1-tp:1-float16-bs:8-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5214239)
 examples/test_multimodal.py::test_llm_multimodal_general[VILA1.5-3b-pp:1-tp:1-float16-bs:1-cpp_e2e:True-nb:1] SKIP (https://nvbugs/5214239)
 examples/test_multimodal.py::test_llm_multimodal_general[VILA1.5-3b-pp:1-tp:1-float16-bs:8-cpp_e2e:True-nb:1] SKIP (https://nvbugs/5214239)
-examples/test_multimodal.py::test_llm_multimodal_general[neva-22b-pp:1-tp:1-bfloat16-bs:8-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5220761)
-examples/test_multimodal.py::test_llm_multimodal_general[neva-22b-pp:1-tp:1-bfloat16-bs:8-cpp_e2e:True-nb:1] SKIP (https://nvbugs/5220761)
 examples/test_cli_flow.py::TestSantacoder::test_auto_dtype SKIP (https://nvbugs/5219531)
 examples/test_medusa.py::test_llama_medusa_1gpu[llama-3.1-8b] SKIP (https://nvbugs/5219535)
 examples/test_phi.py::test_llm_phi_quantization_1gpu[Phi-4-mini-instruct-fp8-bfloat16] SKIP (https://nvbugspro.nvidia.com/bug/5226339)
-examples/test_multimodal.py::test_llm_multimodal_general[neva-22b-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:True-nb:1] SKIP (https://nvbugs/5227342)
 perf/test_perf.py::test_perf[t5-bench-float16-input_output_len:128,20] SKIP # https://nvbugspro.nvidia.com/bug/5207477
 perf/test_perf.py::test_perf[flan_t5_base-bench-float16-input_output_len:128,20] SKIP
 perf/test_perf.py::test_perf[flan_t5_large-bench-float16-input_output_len:128,20] SKIP