mirror of
https://github.com/NVIDIA/TensorRT-LLM.git
synced 2026-02-03 09:41:30 +08:00
clean (#3061)
Signed-off-by: Enwei Zhu <21126786+syuoni@users.noreply.github.com> Co-authored-by: QI JUN <22017000+QiJune@users.noreply.github.com>
This commit is contained in:
parent
8ee840159b
commit
f93ac9672e
@ -21,17 +21,6 @@ gpt2-medium:
|
||||
accuracy: 22.249
|
||||
gpt-next:
|
||||
- accuracy: 25.516
|
||||
EleutherAI/gpt-j-6b:
|
||||
- accuracy: 27.883
|
||||
- dtype: float32
|
||||
accuracy: 26.449
|
||||
- quant_algo: FP8
|
||||
kv_cache_quant_algo: FP8
|
||||
accuracy: 27.883
|
||||
- extra_acc_spec: max_attention_window_size=960
|
||||
accuracy: 27.822
|
||||
- extra_acc_spec: max_attention_window_size=960;beam_width=4
|
||||
accuracy: 0
|
||||
microsoft/phi-2:
|
||||
- accuracy: 31.255
|
||||
microsoft/Phi-3-mini-4k-instruct:
|
||||
@ -77,6 +66,8 @@ meta-llama/Llama-2-7b-hf:
|
||||
accuracy: 27.999
|
||||
TinyLlama/TinyLlama-1.1B-Chat-v1.0:
|
||||
- accuracy: 28.328
|
||||
- dtype: float32
|
||||
accuracy: 28.082
|
||||
- quant_algo: W8A16
|
||||
accuracy: 28.003
|
||||
- quant_algo: W8A16
|
||||
@ -141,6 +132,10 @@ meta-llama/Llama-3.2-1B:
|
||||
- quant_algo: FP8_PER_CHANNEL_PER_TOKEN
|
||||
extra_acc_spec: meta_recipe
|
||||
accuracy: 27.614
|
||||
- extra_acc_spec: max_attention_window_size=960
|
||||
accuracy: 27.259
|
||||
- extra_acc_spec: max_attention_window_size=960;beam_width=4
|
||||
accuracy: 0
|
||||
mistralai/Mixtral-8x7B-v0.1:
|
||||
- accuracy: 28.810
|
||||
- quant_algo: FP8
|
||||
|
||||
@ -174,36 +174,6 @@ class TestMinitron4BBase(AccuracyTestHarness):
|
||||
kv_cache_quant_algo=QuantAlgo.FP8)
|
||||
|
||||
|
||||
class TestGptJ6B(AccuracyTestHarness):
|
||||
MODEL_NAME = "EleutherAI/gpt-j-6b"
|
||||
MODEL_PATH = f"{llm_models_root()}/gpt-j-6b"
|
||||
EXAMPLE_FOLDER = "models/contrib/gptj"
|
||||
|
||||
def test_auto_dtype(self):
|
||||
# float16
|
||||
self.run(dtype='auto')
|
||||
|
||||
def test_float32(self):
|
||||
self.run(dtype='float32')
|
||||
|
||||
@skip_pre_ada
|
||||
def test_fp8(self):
|
||||
self.run(quant_algo=QuantAlgo.FP8, kv_cache_quant_algo=QuantAlgo.FP8)
|
||||
|
||||
@pytest.mark.skip(reason="https://nvbugspro.nvidia.com/bug/5166352")
|
||||
def test_cyclic_kv_cache(self):
|
||||
self.run(extra_acc_spec="max_attention_window_size=960",
|
||||
extra_summarize_args=["--max_attention_window_size=960"])
|
||||
|
||||
@pytest.mark.skip(reason="https://nvbugspro.nvidia.com/bug/5166352")
|
||||
def test_cyclic_kv_cache_beam_search(self):
|
||||
self.run(extra_acc_spec="max_attention_window_size=960;beam_width=4",
|
||||
extra_build_args=["--max_beam_width=4"],
|
||||
extra_summarize_args=[
|
||||
"--max_attention_window_size=960", "--num_beams=4"
|
||||
])
|
||||
|
||||
|
||||
class TestPhi2(AccuracyTestHarness):
|
||||
MODEL_NAME = "microsoft/phi-2"
|
||||
MODEL_PATH = f"{llm_models_root()}/phi-2"
|
||||
@ -486,6 +456,9 @@ class TestTinyLlama1_1BChat(AccuracyTestHarness):
|
||||
def test_auto_dtype(self):
|
||||
self.run(dtype='auto')
|
||||
|
||||
def test_float32(self):
|
||||
self.run(dtype='float32')
|
||||
|
||||
@pytest.mark.parametrize("precision", ["int8", "int4"])
|
||||
def test_weight_only(self, precision: str):
|
||||
quant_algo = QuantAlgo.W8A16 if precision == "int8" else QuantAlgo.W4A16
|
||||
@ -768,6 +741,18 @@ class TestLlama3_2_1B(AccuracyTestHarness):
|
||||
self.extra_summarize_args = [f"--gpu_weights_percent={gpu_percent}"]
|
||||
self.evaluate()
|
||||
|
||||
def test_cyclic_kv_cache(self):
|
||||
self.run(extra_acc_spec="max_attention_window_size=960",
|
||||
extra_summarize_args=["--max_attention_window_size=960"])
|
||||
|
||||
@pytest.mark.skip(reason="https://nvbugspro.nvidia.com/bug/5166352")
|
||||
def test_cyclic_kv_cache_beam_search(self):
|
||||
self.run(extra_acc_spec="max_attention_window_size=960;beam_width=4",
|
||||
extra_build_args=["--max_beam_width=4"],
|
||||
extra_summarize_args=[
|
||||
"--max_attention_window_size=960", "--num_beams=4"
|
||||
])
|
||||
|
||||
|
||||
class TestMixtral8x7B(AccuracyTestHarness):
|
||||
MODEL_NAME = "mistralai/Mixtral-8x7B-v0.1"
|
||||
|
||||
@ -349,9 +349,6 @@ accuracy/test_accuracy.py::TestStarcoder2_15B::test_smooth_quant_ootb
|
||||
accuracy/test_accuracy.py::TestGptNext::test_auto_dtype
|
||||
accuracy/test_accuracy.py::TestMinitron4BBase::test_auto_dtype
|
||||
accuracy/test_accuracy.py::TestMinitron4BBase::test_fp8
|
||||
accuracy/test_accuracy.py::TestGptJ6B::test_float32
|
||||
accuracy/test_accuracy.py::TestGptJ6B::test_cyclic_kv_cache
|
||||
accuracy/test_accuracy.py::TestGptJ6B::test_cyclic_kv_cache_beam_search
|
||||
accuracy/test_accuracy.py::TestPhi2::test_auto_dtype
|
||||
accuracy/test_accuracy.py::TestPhi2::test_tp2
|
||||
accuracy/test_accuracy.py::TestPhi3Mini4kInstruct::test_auto_dtype
|
||||
@ -388,6 +385,7 @@ accuracy/test_accuracy.py::TestLlama2_7B::test_int4_awq_tp2
|
||||
accuracy/test_accuracy.py::TestLlama2_7B::test_int4_awq_pre_quantized_tp2
|
||||
accuracy/test_accuracy.py::TestLlama2_7B::test_int4_gptq_pre_quantized_tp2
|
||||
accuracy/test_accuracy.py::TestLlama2_7B::test_weight_sparsity
|
||||
accuracy/test_accuracy.py::TestTinyLlama1_1BChat::test_float32
|
||||
accuracy/test_accuracy.py::TestTinyLlama1_1BChat::test_weight_only[int8]
|
||||
accuracy/test_accuracy.py::TestTinyLlama1_1BChat::test_weight_only[int4]
|
||||
accuracy/test_accuracy.py::TestTinyLlama1_1BChat::test_weight_only_int8_kv_cache[int8]
|
||||
@ -422,6 +420,8 @@ accuracy/test_accuracy.py::TestLlama3_2_1B::test_fp8
|
||||
accuracy/test_accuracy.py::TestLlama3_2_1B::test_fp8_pp2
|
||||
accuracy/test_accuracy.py::TestLlama3_2_1B::test_fp8_rowwise
|
||||
accuracy/test_accuracy.py::TestLlama3_2_1B::test_weight_streaming[1.0]
|
||||
accuracy/test_accuracy.py::TestLlama3_2_1B::test_cyclic_kv_cache
|
||||
accuracy/test_accuracy.py::TestLlama3_2_1B::test_cyclic_kv_cache_beam_search
|
||||
accuracy/test_accuracy.py::TestMixtral8x7B::test_fp8_tp2pp2
|
||||
accuracy/test_accuracy.py::TestMixtral8x7B::test_fp8_tp2pp2_manage_weights
|
||||
accuracy/test_accuracy.py::TestMixtral8x7B::test_nvfp4_pre_quantized
|
||||
|
||||
@ -112,9 +112,9 @@ l0_a10:
|
||||
- examples/test_mamba.py::test_llm_mamba_1gpu[mamba2-130m-float16-disable_gemm_plugin]
|
||||
- examples/test_mamba.py::test_llm_mamba_1gpu[mamba-codestral-7B-v0.1-float16-disable_gemm_plugin] # 4 mins
|
||||
- accuracy/test_accuracy.py::TestGpt2::test_context_fmha_disabled # 1 min
|
||||
- accuracy/test_accuracy.py::TestGptJ6B::test_cyclic_kv_cache
|
||||
- accuracy/test_accuracy.py::TestGptJ6B::test_cyclic_kv_cache_beam_search
|
||||
- accuracy/test_accuracy.py::TestLlama7B::test_auto_dtype # 2 mins
|
||||
- accuracy/test_accuracy.py::TestLlama3_2_1B::test_cyclic_kv_cache
|
||||
- accuracy/test_accuracy.py::TestLlama3_2_1B::test_cyclic_kv_cache_beam_search
|
||||
- accuracy/test_accuracy.py::TestGpt2::test_attention_ootb
|
||||
- accuracy/test_accuracy.py::TestStarcoder2_3B::test_auto_dtype
|
||||
- accuracy/test_accuracy.py::TestPhi2::test_auto_dtype # 2 mins
|
||||
|
||||
@ -123,9 +123,6 @@ l0_h100:
|
||||
- accuracy/test_accuracy.py::TestGpt2::test_cuda_graph # 1 min
|
||||
- accuracy/test_accuracy.py::TestGpt2::test_context_fmha_disabled # 1 min
|
||||
- accuracy/test_accuracy.py::TestGptNext::test_auto_dtype # 1.5 mins
|
||||
- accuracy/test_accuracy.py::TestGptJ6B::test_float32 # 4 mins
|
||||
- accuracy/test_accuracy.py::TestGptJ6B::test_cyclic_kv_cache
|
||||
- accuracy/test_accuracy.py::TestGptJ6B::test_cyclic_kv_cache_beam_search
|
||||
- accuracy/test_accuracy.py::TestSantacoder::test_auto_dtype # 1.5 mins
|
||||
- accuracy/test_accuracy.py::TestMamba130M::test_auto_dtype # 1 min
|
||||
- accuracy/test_accuracy.py::TestVicuna7B::test_lookahead # 5 mins
|
||||
@ -139,11 +136,14 @@ l0_h100:
|
||||
- accuracy/test_accuracy.py::TestGpt2::test_attention_ootb
|
||||
- accuracy/test_accuracy.py::TestStarcoder2_3B::test_auto_dtype
|
||||
- accuracy/test_accuracy.py::TestMinitron4BBase::test_fp8
|
||||
- accuracy/test_accuracy.py::TestTinyLlama1_1BChat::test_float32
|
||||
- accuracy/test_accuracy.py::TestLlama3_8BInstructGradient1048k::test_long_context
|
||||
- accuracy/test_accuracy.py::TestLlama3_8BInstructGradient1048k::test_long_context_ppl
|
||||
- accuracy/test_accuracy.py::TestLlama3_1_8B::test_fp8
|
||||
- accuracy/test_accuracy.py::TestLlama3_1_8B::test_autoq
|
||||
- accuracy/test_accuracy.py::TestLlama3_2_1B::test_auto_dtype
|
||||
- accuracy/test_accuracy.py::TestLlama3_2_1B::test_cyclic_kv_cache
|
||||
- accuracy/test_accuracy.py::TestLlama3_2_1B::test_cyclic_kv_cache_beam_search
|
||||
- accuracy/test_accuracy.py::TestGemma2_9BIt::test_auto_dtype
|
||||
- examples/test_gpt.py::test_llm_minitron_fp8_with_pseudo_loras[4b]
|
||||
- examples/test_chatglm.py::test_llm_glm_4_9b_single_gpu_summary[glm-4-9b-disable_weight_only]
|
||||
|
||||
Loading…
Reference in New Issue
Block a user