Signed-off-by: Enwei Zhu <21126786+syuoni@users.noreply.github.com>
Co-authored-by: QI JUN <22017000+QiJune@users.noreply.github.com>
This commit is contained in:
Enwei Zhu 2025-03-25 21:55:08 +08:00 committed by GitHub
parent 8ee840159b
commit f93ac9672e
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 29 additions and 49 deletions

View File

@@ -21,17 +21,6 @@ gpt2-medium:
accuracy: 22.249
gpt-next:
- accuracy: 25.516
EleutherAI/gpt-j-6b:
- accuracy: 27.883
- dtype: float32
accuracy: 26.449
- quant_algo: FP8
kv_cache_quant_algo: FP8
accuracy: 27.883
- extra_acc_spec: max_attention_window_size=960
accuracy: 27.822
- extra_acc_spec: max_attention_window_size=960;beam_width=4
accuracy: 0
microsoft/phi-2:
- accuracy: 31.255
microsoft/Phi-3-mini-4k-instruct:
@@ -77,6 +66,8 @@ meta-llama/Llama-2-7b-hf:
accuracy: 27.999
TinyLlama/TinyLlama-1.1B-Chat-v1.0:
- accuracy: 28.328
- dtype: float32
accuracy: 28.082
- quant_algo: W8A16
accuracy: 28.003
- quant_algo: W8A16
@@ -141,6 +132,10 @@ meta-llama/Llama-3.2-1B:
- quant_algo: FP8_PER_CHANNEL_PER_TOKEN
extra_acc_spec: meta_recipe
accuracy: 27.614
- extra_acc_spec: max_attention_window_size=960
accuracy: 27.259
- extra_acc_spec: max_attention_window_size=960;beam_width=4
accuracy: 0
mistralai/Mixtral-8x7B-v0.1:
- accuracy: 28.810
- quant_algo: FP8

View File

@@ -174,36 +174,6 @@ class TestMinitron4BBase(AccuracyTestHarness):
kv_cache_quant_algo=QuantAlgo.FP8)
class TestGptJ6B(AccuracyTestHarness):
MODEL_NAME = "EleutherAI/gpt-j-6b"
MODEL_PATH = f"{llm_models_root()}/gpt-j-6b"
EXAMPLE_FOLDER = "models/contrib/gptj"
def test_auto_dtype(self):
# float16
self.run(dtype='auto')
def test_float32(self):
self.run(dtype='float32')
@skip_pre_ada
def test_fp8(self):
self.run(quant_algo=QuantAlgo.FP8, kv_cache_quant_algo=QuantAlgo.FP8)
@pytest.mark.skip(reason="https://nvbugspro.nvidia.com/bug/5166352")
def test_cyclic_kv_cache(self):
self.run(extra_acc_spec="max_attention_window_size=960",
extra_summarize_args=["--max_attention_window_size=960"])
@pytest.mark.skip(reason="https://nvbugspro.nvidia.com/bug/5166352")
def test_cyclic_kv_cache_beam_search(self):
self.run(extra_acc_spec="max_attention_window_size=960;beam_width=4",
extra_build_args=["--max_beam_width=4"],
extra_summarize_args=[
"--max_attention_window_size=960", "--num_beams=4"
])
class TestPhi2(AccuracyTestHarness):
MODEL_NAME = "microsoft/phi-2"
MODEL_PATH = f"{llm_models_root()}/phi-2"
@@ -486,6 +456,9 @@ class TestTinyLlama1_1BChat(AccuracyTestHarness):
def test_auto_dtype(self):
self.run(dtype='auto')
def test_float32(self):
self.run(dtype='float32')
@pytest.mark.parametrize("precision", ["int8", "int4"])
def test_weight_only(self, precision: str):
quant_algo = QuantAlgo.W8A16 if precision == "int8" else QuantAlgo.W4A16
@@ -768,6 +741,18 @@ class TestLlama3_2_1B(AccuracyTestHarness):
self.extra_summarize_args = [f"--gpu_weights_percent={gpu_percent}"]
self.evaluate()
def test_cyclic_kv_cache(self):
self.run(extra_acc_spec="max_attention_window_size=960",
extra_summarize_args=["--max_attention_window_size=960"])
@pytest.mark.skip(reason="https://nvbugspro.nvidia.com/bug/5166352")
def test_cyclic_kv_cache_beam_search(self):
self.run(extra_acc_spec="max_attention_window_size=960;beam_width=4",
extra_build_args=["--max_beam_width=4"],
extra_summarize_args=[
"--max_attention_window_size=960", "--num_beams=4"
])
class TestMixtral8x7B(AccuracyTestHarness):
MODEL_NAME = "mistralai/Mixtral-8x7B-v0.1"

View File

@@ -349,9 +349,6 @@ accuracy/test_accuracy.py::TestStarcoder2_15B::test_smooth_quant_ootb
accuracy/test_accuracy.py::TestGptNext::test_auto_dtype
accuracy/test_accuracy.py::TestMinitron4BBase::test_auto_dtype
accuracy/test_accuracy.py::TestMinitron4BBase::test_fp8
accuracy/test_accuracy.py::TestGptJ6B::test_float32
accuracy/test_accuracy.py::TestGptJ6B::test_cyclic_kv_cache
accuracy/test_accuracy.py::TestGptJ6B::test_cyclic_kv_cache_beam_search
accuracy/test_accuracy.py::TestPhi2::test_auto_dtype
accuracy/test_accuracy.py::TestPhi2::test_tp2
accuracy/test_accuracy.py::TestPhi3Mini4kInstruct::test_auto_dtype
@@ -388,6 +385,7 @@ accuracy/test_accuracy.py::TestLlama2_7B::test_int4_awq_tp2
accuracy/test_accuracy.py::TestLlama2_7B::test_int4_awq_pre_quantized_tp2
accuracy/test_accuracy.py::TestLlama2_7B::test_int4_gptq_pre_quantized_tp2
accuracy/test_accuracy.py::TestLlama2_7B::test_weight_sparsity
accuracy/test_accuracy.py::TestTinyLlama1_1BChat::test_float32
accuracy/test_accuracy.py::TestTinyLlama1_1BChat::test_weight_only[int8]
accuracy/test_accuracy.py::TestTinyLlama1_1BChat::test_weight_only[int4]
accuracy/test_accuracy.py::TestTinyLlama1_1BChat::test_weight_only_int8_kv_cache[int8]
@@ -422,6 +420,8 @@ accuracy/test_accuracy.py::TestLlama3_2_1B::test_fp8
accuracy/test_accuracy.py::TestLlama3_2_1B::test_fp8_pp2
accuracy/test_accuracy.py::TestLlama3_2_1B::test_fp8_rowwise
accuracy/test_accuracy.py::TestLlama3_2_1B::test_weight_streaming[1.0]
accuracy/test_accuracy.py::TestLlama3_2_1B::test_cyclic_kv_cache
accuracy/test_accuracy.py::TestLlama3_2_1B::test_cyclic_kv_cache_beam_search
accuracy/test_accuracy.py::TestMixtral8x7B::test_fp8_tp2pp2
accuracy/test_accuracy.py::TestMixtral8x7B::test_fp8_tp2pp2_manage_weights
accuracy/test_accuracy.py::TestMixtral8x7B::test_nvfp4_pre_quantized

View File

@@ -112,9 +112,9 @@ l0_a10:
- examples/test_mamba.py::test_llm_mamba_1gpu[mamba2-130m-float16-disable_gemm_plugin]
- examples/test_mamba.py::test_llm_mamba_1gpu[mamba-codestral-7B-v0.1-float16-disable_gemm_plugin] # 4 mins
- accuracy/test_accuracy.py::TestGpt2::test_context_fmha_disabled # 1 min
- accuracy/test_accuracy.py::TestGptJ6B::test_cyclic_kv_cache
- accuracy/test_accuracy.py::TestGptJ6B::test_cyclic_kv_cache_beam_search
- accuracy/test_accuracy.py::TestLlama7B::test_auto_dtype # 2 mins
- accuracy/test_accuracy.py::TestLlama3_2_1B::test_cyclic_kv_cache
- accuracy/test_accuracy.py::TestLlama3_2_1B::test_cyclic_kv_cache_beam_search
- accuracy/test_accuracy.py::TestGpt2::test_attention_ootb
- accuracy/test_accuracy.py::TestStarcoder2_3B::test_auto_dtype
- accuracy/test_accuracy.py::TestPhi2::test_auto_dtype # 2 mins

View File

@@ -123,9 +123,6 @@ l0_h100:
- accuracy/test_accuracy.py::TestGpt2::test_cuda_graph # 1 min
- accuracy/test_accuracy.py::TestGpt2::test_context_fmha_disabled # 1 min
- accuracy/test_accuracy.py::TestGptNext::test_auto_dtype # 1.5 mins
- accuracy/test_accuracy.py::TestGptJ6B::test_float32 # 4 mins
- accuracy/test_accuracy.py::TestGptJ6B::test_cyclic_kv_cache
- accuracy/test_accuracy.py::TestGptJ6B::test_cyclic_kv_cache_beam_search
- accuracy/test_accuracy.py::TestSantacoder::test_auto_dtype # 1.5 mins
- accuracy/test_accuracy.py::TestMamba130M::test_auto_dtype # 1 min
- accuracy/test_accuracy.py::TestVicuna7B::test_lookahead # 5 mins
@@ -139,11 +136,14 @@ l0_h100:
- accuracy/test_accuracy.py::TestGpt2::test_attention_ootb
- accuracy/test_accuracy.py::TestStarcoder2_3B::test_auto_dtype
- accuracy/test_accuracy.py::TestMinitron4BBase::test_fp8
- accuracy/test_accuracy.py::TestTinyLlama1_1BChat::test_float32
- accuracy/test_accuracy.py::TestLlama3_8BInstructGradient1048k::test_long_context
- accuracy/test_accuracy.py::TestLlama3_8BInstructGradient1048k::test_long_context_ppl
- accuracy/test_accuracy.py::TestLlama3_1_8B::test_fp8
- accuracy/test_accuracy.py::TestLlama3_1_8B::test_autoq
- accuracy/test_accuracy.py::TestLlama3_2_1B::test_auto_dtype
- accuracy/test_accuracy.py::TestLlama3_2_1B::test_cyclic_kv_cache
- accuracy/test_accuracy.py::TestLlama3_2_1B::test_cyclic_kv_cache_beam_search
- accuracy/test_accuracy.py::TestGemma2_9BIt::test_auto_dtype
- examples/test_gpt.py::test_llm_minitron_fp8_with_pseudo_loras[4b]
- examples/test_chatglm.py::test_llm_glm_4_9b_single_gpu_summary[glm-4-9b-disable_weight_only]