diff --git a/tests/integration/defs/perf/test_perf.py b/tests/integration/defs/perf/test_perf.py
index 82891ca847..f6d81460fe 100644
--- a/tests/integration/defs/perf/test_perf.py
+++ b/tests/integration/defs/perf/test_perf.py
@@ -111,6 +111,7 @@ MODEL_PATH_DICT = {
     "deepseek_v3_lite_nvfp4": "DeepSeek-V3-Lite/nvfp4_moe_only",
     "qwen2_7b_instruct": "Qwen2-7B-Instruct",
     "qwen_14b_chat": "Qwen-14B-Chat",
+    "qwen3_0.6b": "Qwen3/Qwen3-0.6B",
     "qwen3_4b_eagle3": "Qwen3/Qwen3-4B",
     "qwen3_235b_a22b_fp8": "Qwen3/saved_models_Qwen3-235B-A22B_fp8_hf",
     "qwen3_235b_a22b_fp4": "Qwen3/saved_models_Qwen3-235B-A22B_nvfp4_hf",
diff --git a/tests/integration/test_lists/qa/llm_perf_core.yml b/tests/integration/test_lists/qa/llm_perf_core.yml
index b8f8b1f222..2db0b307b2 100644
--- a/tests/integration/test_lists/qa/llm_perf_core.yml
+++ b/tests/integration/test_lists/qa/llm_perf_core.yml
@@ -94,7 +94,6 @@ llm_perf_core:
   - perf/test_perf.py::test_perf[mistral_small_v3.1_24b-bench-pytorch-bfloat16-maxbs:48-maxnt:256-input_output_len:1000,2000-reqs:500-con:200]
   - perf/test_perf.py::test_perf[mistral_small_v3.1_24b-bench-pytorch-bfloat16-maxbs:128-maxnt:512-input_output_len:1000,2000-reqs:500-con:200]
   #llama_v3.1_8b
-  - perf/test_perf.py::test_perf[llama_v3.1_8b-bench-pytorch-bfloat16-mp-maxbs:256-input_output_len:128,128-pp:2]
   - perf/test_perf.py::test_perf[llama_v3.1_8b-bench-pytorch-streaming-bfloat16-input_output_len:128,128-gpus:2]
   - perf/test_perf.py::test_perf[llama_v3.1_8b-bench-pytorch-bfloat16-maxbs:256-input_output_len:128,128-gpus:2]
   #mixtral_8x7b_v0.1
diff --git a/tests/integration/test_lists/qa/llm_perf_sanity.yml b/tests/integration/test_lists/qa/llm_perf_sanity.yml
index 069bd02ea2..0348cef095 100644
--- a/tests/integration/test_lists/qa/llm_perf_sanity.yml
+++ b/tests/integration/test_lists/qa/llm_perf_sanity.yml
@@ -1,6 +1,18 @@
 version: 0.0.1
 llm_perf_sanity:
-# A100, L40S, L20, H20, H100, H200, Blackwell
+# ===============================================================================
+# Test Conditions Index
+# ===============================================================================
+# 1: All GPUs
+# 2: L40S, L20, H20, H100, H200, B200, B300, GB200, GB300, RTX6000-Server
+# 3: A100, H20, H100, H200, B200, B300, RTX6000-Server
+# 4: A100, L40S, H20, H100, H200, B200, B300, GB200, GB300, RTX6000-Server
+# 5: L40S, H20, H100, H200, B200, B300, RTX6000-Server
+# 6: H20, H100, H200, B200, B300, RTX6000-Server
+# 7: H20, H100, H200, B200, B300
+# ===============================================================================
+
+# 1: All GPUs
 - condition:
     ranges:
       system_gpu_count:
@@ -28,10 +40,13 @@ llm_perf_sanity:
   - perf/test_perf.py::test_perf[phi_4_mini_instruct-bench-pytorch-bfloat16-input_output_len:500,2000]
   - perf/test_perf.py::test_perf[nemotron_nano_9b_v2-bench-pytorch-bfloat16-input_output_len:512,512]
   - perf/test_perf.py::test_perf[qwen3_4b_eagle3-bench-pytorch-streaming-bfloat16-maxbs:4-kv_frac:0.6-input_output_len:500,100-reqs:200-con:4]
+  - perf/test_perf.py::test_perf[llama_v3.1_8b-bench-pytorch-bfloat16-maxbs:256-input_output_len:128,128-gpus:2]
+  - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-pytorch-streaming-bfloat16-input_output_len:128,128-gpus:2]
+  - perf/test_perf.py::test_perf[llama_v3.2_1b-bench-bfloat16-maxbs:1-maxnt:256-input_output_len:128,128-reqs:10-gpus:2]
+  - perf/test_perf.py::test_perf[llama_v3.2_1b-bench-pytorch-bfloat16-input_output_len:128,128-gpus:2]


-# FP8 specific tests
-# A100, L40S, L20, H20, H100, H200, Blackwell
+# 2: L40S, L20, H20, H100, H200, B200, B300, GB200, GB300, RTX6000-Server
 - condition:
     terms:
       supports_fp8: true
@@ -45,94 +60,40 @@ tests:
   - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b_fp8-bench-pytorch-float8-maxbs:512-input_output_len:1000,1000-reqs:8-con:1]
   - perf/test_perf.py::test_perf[bielik_11b_v2.2_instruct_fp8-bench-pytorch-float8-input_output_len:1000,1000-con:250]
   - perf/test_perf.py::test_perf[ministral_8b_fp8-bench-pytorch-float8-input_output_len:500,2000-reqs:500-con:250]
-
-# Tests for ALL systems with 2+ GPUs
-# A100, L40S, L20, H20, H100, H200, Blackwell
-- condition:
-    ranges:
-      system_gpu_count:
-        gte: 2
-  tests:
-  #llama_v3.1_8b_instruct
-  #pytorch backend
-  - perf/test_perf.py::test_perf[llama_v3.1_8b-bench-pytorch-bfloat16-maxbs:256-input_output_len:128,128-gpus:2]
-  - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-pytorch-streaming-bfloat16-input_output_len:128,128-gpus:2]
-  - perf/test_perf.py::test_perf[llama_v3.2_1b-bench-bfloat16-maxbs:1-maxnt:256-input_output_len:128,128-reqs:10-gpus:2]
-  - perf/test_perf.py::test_perf[llama_v3.2_1b-bench-pytorch-bfloat16-input_output_len:128,128-gpus:2]
-
-# FP8 tests for systems with 2+ GPUs
-# A100, L40S, L20, H20, H100, H200, Blackwell
-- condition:
-    terms:
-      supports_fp8: true
-    ranges:
-      system_gpu_count:
-        gte: 2
-
-  tests:
-  #mixtral_8x7b_v0.1_fp8 pytorch backend
   - perf/test_perf.py::test_perf[mixtral_8x7b_v0.1_instruct_fp8-bench-pytorch-float8-input_output_len:128,128-gpus:2]

-# Tests for systems with 2+ GPUs and high memory
-# A100, L40S, H20, H100, H200, Blackwell
+
+# 3: A100, H20, H100, H200, B200, B300, RTX6000-Server
 - condition:
     ranges:
-      system_gpu_count:
-        gte: 2
       gpu_memory:
         gt: 80000
-
   tests:
   - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-pytorch-bfloat16-maxbs:1-input_output_len:128,128-reqs:10-gpus:2]
   - perf/test_perf.py::test_perf[mixtral_8x7b_v0.1-bench-pytorch-float16-input_output_len:128,128-gpus:2]
-
-# Tests for systems with 4+ GPUs
-# A100, L40S, H20, H100, H200, Blackwell
-- condition:
-    ranges:
-      system_gpu_count:
-        gte: 4
-
-  tests:
-  #llama_v3.1_70b
-  #trt backend
-  #pytorch backend
-  - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-pytorch-bfloat16-input_output_len:128,128-gpus:4]
-  - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-pytorch-bfloat16-maxbs:1-input_output_len:128,128-reqs:10-gpus:4]
-
-# FP8 specific tests
-# L40S, H20, H100, H200, Blackwell
-- condition:
-    terms:
-      supports_fp8: true
-    ranges:
-      system_gpu_count:
-        gte: 4
-
-  tests:
-  - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-input_output_len:2000,500-gpus:4]
-  - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-maxbs:256-input_output_len:500,2000-reqs:250-con:250-tp:4-gpus:4]
-  - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b_fp8-bench-pytorch-float8-maxbs:256-input_output_len:500,2000-reqs:250-con:250-tp:4-gpus:4]
-
-# Tests for systems with 8+ GPUs
-# A100, L40S, H20, H100, H200, Blackwell
-- condition:
-    ranges:
-      system_gpu_count:
-        gte: 8
-      gpu_memory:
-        gt: 46000
-
-  tests:
-  #llama_v3.1_70b
-  #pytorch backend
   - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-pytorch-bfloat16-maxbs:1-input_output_len:2000,200-reqs:10-gpus:8]
   - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-pytorch-bfloat16-maxbs:1-input_output_len:200,2000-reqs:10-gpus:8]
   - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct-bench-pytorch-bfloat16-input_output_len:500,2000-gpus:8]
   - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct-bench-pytorch-bfloat16-input_output_len:2000,500-gpus:8]

-# FP8 tests for systems with 8+ GPUs
-# L40S, H20, H100, H200, Blackwell
+# 4: A100, L40S, H20, H100, H200, B200, B300, GB200, GB300, RTX6000-Server
+- condition:
+    ranges:
+      system_gpu_count:
+        gte: 4
+
+  tests:
+  # llama_v3.1_70b
+  # trt backend
+  # pytorch backend
+  - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-pytorch-bfloat16-input_output_len:128,128-gpus:4]
+  - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-pytorch-bfloat16-maxbs:1-input_output_len:128,128-reqs:10-gpus:4]
+  # test overlap scheduler
+  - perf/test_perf.py::test_perf[qwen3_0.6b-bench-pytorch-bfloat16-maxnt:2048-input_output_len:8000,1000-reqs:256-con:1-pp:4-gpus:4]
+  - perf/test_perf.py::test_perf[llama_v3.1_8b-bench-pytorch-streaming-bfloat16-input_output_len:1000,1000-reqs:1000-con:200]
+
+
+# 5: L40S, H20, H100, H200, B200, B300, RTX6000-Server
 - condition:
     terms:
       supports_fp8: true
@@ -141,17 +102,15 @@
         gte: 8

   tests:
-  #llama_v3.1_70b
-  #trt backend
+  - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-input_output_len:2000,500-gpus:4]
+  - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-maxbs:256-input_output_len:500,2000-reqs:250-con:250-tp:4-gpus:4]
+  - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b_fp8-bench-pytorch-float8-maxbs:256-input_output_len:500,2000-reqs:250-con:250-tp:4-gpus:4]
   - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-bfloat16-maxbs:1-maxnt:544-input_output_len:512,32-quant:fp8-gpus:8]
-  #llama_v3.3_70b_instruct_fp8
-  #pytorch backend
   - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-input_output_len:128,128-gpus:8]
   - perf/test_perf.py::test_perf[mixtral_8x7b_v0.1_instruct-bench-pytorch-float16-input_output_len:128,128-reqs:64-gpus:8]


-# FP4, FP8 tests for systems with 8+ GPUs
-# H20, H100, H200, Blackwell
+# 6: H20, H100, H200, B200, B300, RTX6000-Server
 - condition:
     ranges:
       system_gpu_count:
@@ -171,10 +130,11 @@
   - perf/test_perf.py::test_perf[qwen3_235b_a22b_fp8-bench-pytorch-float8-input_output_len:1000,2000-con:256-ep:8-gpus:8] TIMEOUT(60)
   - perf/test_perf.py::test_perf[deepseek_v3_lite_fp8-disagg_server-ctx_dp:4-gen_tp:4]
   - perf/test_perf.py::test_perf[llama_v3.1_8b-disagg_server-ctx_dp:4-gen_tp:4]
-    # gpt_oss_20b_fp4
+  # gpt_oss_20b_fp4
   - perf/test_perf.py::test_perf[gpt_oss_20b_fp4-bench-pytorch-float4-input_output_len:512,512]

-# gpu_arch > Hopper, exclude GB20X, RTX 6000 for not supported
+
+# 7: H20, H100, H200, B200, B300
 - condition:
     ranges:
       system_gpu_count: