diff --git a/jenkins/L0_Test.groovy b/jenkins/L0_Test.groovy index 7964657561..1d7583a57e 100644 --- a/jenkins/L0_Test.groovy +++ b/jenkins/L0_Test.groovy @@ -918,7 +918,7 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG // Create a unique suffix for the job name String customSuffix = "${env.BUILD_TAG}-${UUID.randomUUID().toString().replaceAll("-", "").substring(0, 6)}".toLowerCase() def jobUID = "${cluster.host}-multi_node_test-${customSuffix}" - def disaggMode = stageName.contains("PerfSanity-Disagg") + def disaggMode = stageName.contains("Disagg-PerfSanity") Utils.exec(pipeline, script: "env | sort && pwd && ls -alh") @@ -3151,6 +3151,15 @@ def runInKubernetes(pipeline, podSpec, containerName) } } +def buildStageConfigs(stageName, platform, testlist, testCount, gpuCount, nodeCount, runWithSbatch=false) { + def configs = [:] + for (int k = 1; k <= testCount; k++) { + def key = "${stageName}-${k}" + configs[key] = [platform, testlist, k, testCount, gpuCount, nodeCount, runWithSbatch] + } + return configs +} + def launchTestJobs(pipeline, testFilter) { // IMPORTANT: Stage Configuration Syntax Requirement @@ -3354,18 +3363,57 @@ def launchTestJobs(pipeline, testFilter) "GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-1": ["auto:gb200-flex", "l0_gb200_multi_nodes", 1, 3, 8, 2], "GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-2": ["auto:gb200-flex", "l0_gb200_multi_nodes", 2, 3, 8, 2], "GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-3": ["auto:gb200-flex", "l0_gb200_multi_nodes", 3, 3, 8, 2], - // PerfSanity post-merge tests - "GB200-8_GPUs-2_Nodes-PyTorch-PerfSanity-Post-Merge-1": ["auto:gb200-flex", "l0_gb200_multi_nodes_aggr_perf_sanity_2_nodes", 1, 5, 8, 2], - "GB200-8_GPUs-2_Nodes-PyTorch-PerfSanity-Post-Merge-2": ["auto:gb200-flex", "l0_gb200_multi_nodes_aggr_perf_sanity_2_nodes", 2, 5, 8, 2], - "GB200-8_GPUs-2_Nodes-PyTorch-PerfSanity-Post-Merge-3": ["auto:gb200-flex", "l0_gb200_multi_nodes_aggr_perf_sanity_2_nodes", 3, 5, 8, 2], - "GB200-8_GPUs-2_Nodes-PyTorch-PerfSanity-Post-Merge-4": ["auto:gb200-flex", "l0_gb200_multi_nodes_aggr_perf_sanity_2_nodes", 4, 5, 8, 2], - "GB200-8_GPUs-2_Nodes-PyTorch-PerfSanity-Post-Merge-5": ["auto:gb200-flex", "l0_gb200_multi_nodes_aggr_perf_sanity_2_nodes", 5, 5, 8, 2], - // Disable stage 'GB200-12_GPUs-3_Nodes-PyTorch-PerfSanity-Disagg-Post-Merge-1' due to https://nvbugs/5819053 - // "GB200-12_GPUs-3_Nodes-PyTorch-PerfSanity-Disagg-Post-Merge-1": ["auto:gb200-flex", "l0_gb200_multi_nodes_disagg_perf_sanity_3_nodes", 1, 1, 12, 3], - // "GB200-24_GPUs-6_Nodes-PyTorch-PerfSanity-Disagg-Post-Merge-1": ["auto:gb200-flex", "l0_gb200_multi_nodes_disagg_perf_sanity_6_nodes", 1, 2, 24, 6], - // "GB200-24_GPUs-6_Nodes-PyTorch-PerfSanity-Disagg-Post-Merge-2": ["auto:gb200-flex", "l0_gb200_multi_nodes_disagg_perf_sanity_6_nodes", 2, 2, 24, 6], - // "GB200-32_GPUs-8_Nodes-PyTorch-PerfSanity-Disagg-Post-Merge-1": ["auto:gb200-flex", "l0_gb200_multi_nodes_disagg_perf_sanity_8_nodes", 1, 1, 32, 8], ] + // PerfSanity post-merge aggr tests + multiNodesSBSAConfigs += buildStageConfigs( + "GB200-8_GPUs-2_Nodes-PyTorch-Aggr-PerfSanity-Node2-GPU8-Post-Merge", + "auto:gb200-flex", + "l0_gb200_multi_nodes_aggr_perf_sanity_node2_gpu8", + 5, + 8, + 2 + ) + // PerfSanity post-merge disagg tests + multiNodesSBSAConfigs += buildStageConfigs( + "GB200-8_GPUs-2_Nodes-PyTorch-Disagg-PerfSanity-CTX1-NODE1-GPU1-GEN1-NODE1-GPU4-Post-Merge", + "auto:gb200-flex", + "l0_gb200_multi_nodes_disagg_perf_sanity_ctx1_node1_gpu1_gen1_node1_gpu4", + 1, + 8, + 2 + ) + 
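    // Illustration: each buildStageConfigs call above expands into testCount keyed
    // entries, matching the removed hand-written table plus the new trailing
    // runWithSbatch flag (default false). For example, the aggr call yields:
    //   "GB200-8_GPUs-2_Nodes-PyTorch-Aggr-PerfSanity-Node2-GPU8-Post-Merge-1":
    //       ["auto:gb200-flex", "l0_gb200_multi_nodes_aggr_perf_sanity_node2_gpu8", 1, 5, 8, 2, false]
    //   ...
    //   "GB200-8_GPUs-2_Nodes-PyTorch-Aggr-PerfSanity-Node2-GPU8-Post-Merge-5":
    //       ["auto:gb200-flex", "l0_gb200_multi_nodes_aggr_perf_sanity_node2_gpu8", 5, 5, 8, 2, false]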
multiNodesSBSAConfigs += buildStageConfigs( + "GB200-8_GPUs-2_Nodes-PyTorch-Disagg-PerfSanity-CTX1-NODE1-GPU4-GEN1-NODE1-GPU4-Post-Merge", + "auto:gb200-flex", + "l0_gb200_multi_nodes_disagg_perf_sanity_ctx1_node1_gpu4_gen1_node1_gpu4", + 3, + 8, + 2 + ) + multiNodesSBSAConfigs += buildStageConfigs( + "GB200-12_GPUs-3_Nodes-PyTorch-Disagg-PerfSanity-CTX1-NODE1-GPU1-GEN1-NODE2-GPU8-Post-Merge", + "auto:gb200-flex", + "l0_gb200_multi_nodes_disagg_perf_sanity_ctx1_node1_gpu1_gen1_node2_gpu8", + 1, + 12, + 3 + ) + multiNodesSBSAConfigs += buildStageConfigs( + "GB200-12_GPUs-3_Nodes-PyTorch-Disagg-PerfSanity-CTX1-NODE1-GPU4-GEN1-NODE2-GPU8-Post-Merge", + "auto:gb200-flex", + "l0_gb200_multi_nodes_disagg_perf_sanity_ctx1_node1_gpu4_gen1_node2_gpu8", + 5, + 12, + 3 + ) + multiNodesSBSAConfigs += buildStageConfigs( + "GB200-16_GPUs-4_Nodes-PyTorch-Disagg-PerfSanity-CTX1-NODE2-GPU8-GEN1-NODE2-GPU8-Post-Merge", + "auto:gb200-flex", + "l0_gb200_multi_nodes_disagg_perf_sanity_ctx1_node2_gpu8_gen1_node2_gpu8", + 1, + 16, + 4 + ) fullSet += multiNodesSBSAConfigs.keySet() if (env.targetArch == AARCH64_TRIPLE) { @@ -3610,9 +3658,9 @@ def launchTestJobs(pipeline, testFilter) }, {}, true) }]} - multiGpuJobs = parallelJobs.findAll{(it.key.contains("2_GPUs") || it.key.contains("4_GPUs") || it.key.contains("8_GPUs")) && !it.key.contains("Post-Merge")} + multiGpuJobs = parallelJobs.findAll{(it.key =~ /\d+_GPUs/) && !it.key.contains("Post-Merge")} println multiGpuJobs.keySet() - multiGpuJobsPostMerge = parallelJobs.findAll{(it.key.contains("2_GPUs") || it.key.contains("4_GPUs") || it.key.contains("8_GPUs")) && it.key.contains("Post-Merge")} + multiGpuJobsPostMerge = parallelJobs.findAll{(it.key =~ /\d+_GPUs/) && it.key.contains("Post-Merge")} parallelJobs += docBuildJobs parallelJobs += sanityCheckJobs @@ -3927,9 +3975,9 @@ pipeline { def testPhase2StageName = env.testPhase2StageName if (testPhase2StageName) { - def dgxSigns = ["2_GPUs", "4_GPUs", "8_GPUs"] - singleGpuJobs = parallelJobs.findAll{!dgxSigns.any{sign -> it.key.contains(sign)}} - dgxJobs = parallelJobs.findAll{dgxSigns.any{sign -> it.key.contains(sign)}} + def multiGpuPattern = /\d+_GPUs/ + singleGpuJobs = parallelJobs.findAll{!(it.key =~ multiGpuPattern)} + dgxJobs = parallelJobs.findAll{it.key =~ multiGpuPattern} } if (env.JOB_NAME ==~ /.*Single-GPU.*/) { diff --git a/jenkins/scripts/perf/disaggregated/slurm_launch_draft.sh b/jenkins/scripts/perf/disaggregated/slurm_launch_draft.sh index c5dc80c971..f6c1e05d13 100644 --- a/jenkins/scripts/perf/disaggregated/slurm_launch_draft.sh +++ b/jenkins/scripts/perf/disaggregated/slurm_launch_draft.sh @@ -19,13 +19,13 @@ echo "Installation completed on all nodes" # Start gen servers echo "Starting gen servers..." for i in $(seq 0 $((numGenServers - 1))); do - gen_world_size=$((nodesPerGenServer * gpusPerNode)) + gen_world_size=$((nodesPerGenServer * gpusPerfNodePerfGenServer)) export DISAGG_SERVING_TYPE="GEN_$i" export pytestCommand="$pytestCommandWorker" srun "${srunArgs[@]}" --kill-on-bad-exit=1 \ -N $nodesPerGenServer \ --ntasks=$gen_world_size \ - --ntasks-per-node=$gpusPerNode \ + --ntasks-per-node=$gpusPerfNodePerfGenServer \ $runScript &> $jobWorkspace/gen_server_$i.log & echo "Started gen server $i" done @@ -34,13 +34,13 @@ done if [ "${TRTLLM_DISAGG_BENCHMARK_GEN_ONLY:-0}" != "1" ]; then echo "Starting ctx servers..." 
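    # Illustration (values from the GB200 configs, where gpusPerNode=4) of the
    # per-server world-size arithmetic used for both server types: a gen server
    # with gpusPerGenServer=8 spans nodesPerGenServer=2 nodes at
    # gpusPerfNodePerfGenServer=min(8,4)=4 ranks per node, so its world size is
    # 2*4=8. A ctx server with gpusPerCtxServer=1 now gets
    # ctx_world_size=1*min(1,4)=1 rank, whereas the old
    # nodesPerCtxServer*gpusPerNode formula would have launched 4 ranks for a
    # 1-GPU server.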
 for i in $(seq 0 $((numCtxServers - 1))); do
-    ctx_world_size=$((nodesPerCtxServer * gpusPerNode))
+    ctx_world_size=$((nodesPerCtxServer * gpusPerfNodePerfCtxServer))
     export DISAGG_SERVING_TYPE="CTX_$i"
     export pytestCommand="$pytestCommandWorker"
     srun "${srunArgs[@]}" --kill-on-bad-exit=1 \
         -N $nodesPerCtxServer \
-        --ntasks=$ctx_world_size \
-        --ntasks-per-node=$gpusPerNode \
+        --ntasks=$ctx_world_size \
+        --ntasks-per-node=$gpusPerfNodePerfCtxServer \
         $runScript &> $jobWorkspace/ctx_server_$i.log &
     echo "Started ctx server $i"
 done
diff --git a/jenkins/scripts/perf/disaggregated/submit.py b/jenkins/scripts/perf/disaggregated/submit.py
index 5e8e374f4f..4208f809ab 100644
--- a/jenkins/scripts/perf/disaggregated/submit.py
+++ b/jenkins/scripts/perf/disaggregated/submit.py
@@ -38,6 +38,9 @@ def get_hardware_config(config, benchmark_mode):
     nodes_per_ctx_server = (gpus_per_ctx_server + gpus_per_node - 1) // gpus_per_node
     nodes_per_gen_server = (gpus_per_gen_server + gpus_per_node - 1) // gpus_per_node
 
+    gpus_per_node_per_ctx_server = min(gpus_per_ctx_server, gpus_per_node)
+    gpus_per_node_per_gen_server = min(gpus_per_gen_server, gpus_per_node)
+
     total_nodes = num_ctx_servers * nodes_per_ctx_server + num_gen_servers * nodes_per_gen_server
     total_gpus = total_nodes * gpus_per_node
 
@@ -49,6 +52,8 @@ def get_hardware_config(config, benchmark_mode):
         "gpus_per_gen_server": gpus_per_gen_server,
         "nodes_per_ctx_server": nodes_per_ctx_server,
         "nodes_per_gen_server": nodes_per_gen_server,
+        "gpus_per_node_per_ctx_server": gpus_per_node_per_ctx_server,
+        "gpus_per_node_per_gen_server": gpus_per_node_per_gen_server,
         "total_nodes": total_nodes,
         "total_gpus": total_gpus,
     }
@@ -102,7 +107,14 @@ def remove_whitespace_lines(lines):
     return [line.strip() for line in lines if line.strip()]
 
 
-def get_pytest_command_no_llmapilaunch(script_prefix_lines):
+def get_pytest_commands(script_prefix_lines):
+    # Derive the worker, disagg_server, and benchmark pytest commands from the
+    # base pytest command:
+    # - worker: keeps trtllm-llmapi-launch, drops the --csv, --cov, and
+    #   --periodic flags.
+    # - disagg_server: drops trtllm-llmapi-launch and the --csv, --cov, and
+    #   --periodic flags.
+    # - benchmark: drops trtllm-llmapi-launch, keeps those flags.
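    # Illustration with a hypothetical command shape (the real pytestCommand is
    # assembled by the Jenkins pipeline): given the line
    #   export pytestCommand="LLM_ROOT=. trtllm-llmapi-launch pytest -v --csv report.csv test_disagg.py"
    # the three returned lines would be
    #   export partialPytestCommandWorker="LLM_ROOT=. trtllm-llmapi-launch pytest -v test_disagg.py"
    #   export partialPytestCommandDisaggServer="LLM_ROOT=. pytest -v test_disagg.py"
    #   export partialPytestCommandBenchmark="LLM_ROOT=. pytest -v --csv report.csv test_disagg.py"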
pytest_command_line = None for line in script_prefix_lines: if "export pytestCommand=" in line: @@ -110,17 +122,102 @@ def get_pytest_command_no_llmapilaunch(script_prefix_lines): break if not pytest_command_line: - return "" + return "", "", "" - # Replace pytestCommand with pytestCommandNoLLMAPILaunch - replaced_line = pytest_command_line.replace("pytestCommand", "pytestCommandNoLLMAPILaunch") + def split_pytest_command_line(command_line): + # After pytest, there are six types of substrings: + # Type 1: --xxx=yyy (long option with value, self-contained) + # Type 2: --xxx= (long option with empty value, self-contained) + # Type 3: --xxx (long option flag, no value) + # Type 4: --xxx yyy (long option with value as next arg) + # Type 5: -x yyy (short single-letter option with value as next arg) + # Type 6: -x (short option flag, e.g., -v, -vv) + parts = command_line.split() + pytest_index = None + for idx, part in enumerate(parts): + if "pytest" == part: + pytest_index = idx + break + if pytest_index is None: + return parts - # Split by space, find and remove the substring with trtllm-llmapi-launch - replaced_line_parts = replaced_line.split() - replaced_line_parts_no_llmapi = [ - part for part in replaced_line_parts if "trtllm-llmapi-launch" not in part + grouped_parts = parts[: pytest_index + 1] + i = pytest_index + 1 + while i < len(parts): + part = parts[i] + has_next = i + 1 < len(parts) + next_is_value = has_next and not parts[i + 1].startswith("-") + + # Type 1 & 2: --xxx=yyy or --xxx= (self-contained, has '=') + if part.startswith("--") and "=" in part: + grouped_parts.append(part) + i += 1 + continue + + # Type 4: --xxx yyy (long option with value as next arg) + if part.startswith("--") and next_is_value: + grouped_parts.append(f"{part} {parts[i + 1]}") + i += 2 + continue + + # Type 3: --xxx (long option flag) + if part.startswith("--"): + grouped_parts.append(part) + i += 1 + continue + + # Type 5: -x yyy (short single-letter option with value as next arg) + # Only single letter after dash, e.g., -o, not -vv + if part.startswith("-") and len(part) == 2 and next_is_value: + grouped_parts.append(f"{part} {parts[i + 1]}") + i += 2 + continue + + # Type 6: -x (short option flag, including combined like -vv) + if part.startswith("-"): + grouped_parts.append(part) + i += 1 + continue + + # Other parts (shouldn't happen after pytest, but handle gracefully) + grouped_parts.append(part) + i += 1 + + return grouped_parts + + def is_llmapi_launch(part): + return "trtllm-llmapi-launch" in part + + def is_output_file_part(part): + return any(flag in part for flag in ("--csv", "--cov", "--periodic")) + + worker_line = pytest_command_line.replace("pytestCommand", "partialPytestCommandWorker") + worker_parts = [ + part for part in split_pytest_command_line(worker_line) if not is_output_file_part(part) ] - return " ".join(replaced_line_parts_no_llmapi) + worker_pytest_command = " ".join(worker_parts) + + disagg_server_line = pytest_command_line.replace( + "pytestCommand", "partialPytestCommandDisaggServer" + ) + disagg_server_parts = [ + part + for part in split_pytest_command_line(disagg_server_line) + if not is_llmapi_launch(part) and not is_output_file_part(part) + ] + disagg_server_pytest_command = " ".join(disagg_server_parts) + + benchmark_line = pytest_command_line.replace("pytestCommand", "partialPytestCommandBenchmark") + benchmark_parts = [ + part for part in split_pytest_command_line(benchmark_line) if not is_llmapi_launch(part) + ] + benchmark_pytest_command = " 
".join(benchmark_parts) + + return ( + worker_pytest_command, + disagg_server_pytest_command, + benchmark_pytest_command, + ) def get_config_yaml(test_list_path, llm_src): @@ -153,7 +250,7 @@ def get_config_yaml(test_list_path, llm_src): "disagg", "test_configs", "disagg", - "perf", + "perf-sanity", f"{config_base_name}.yaml", ) if not os.path.exists(config_yaml_path): @@ -225,8 +322,12 @@ def main(): srun_args_lines = srun_args_content.split() - # Extract pytestCommand and generate pytestCommandNoLLMAPILaunch - pytest_command_no_llmapi_launch = get_pytest_command_no_llmapilaunch(script_prefix_lines) + # Extract pytestCommand and generate partial pytest commands + ( + worker_pytest_command, + disagg_server_pytest_command, + benchmark_pytest_command, + ) = get_pytest_commands(script_prefix_lines) # Build worker env vars, add extra env vars for gen_only mode worker_env_vars = env_config["worker_env_var"] @@ -244,12 +345,15 @@ def main(): script_prefix_lines.extend( [ - pytest_command_no_llmapi_launch, - f'export pytestCommandWorker="unset UCX_TLS && {worker_env_vars} $pytestCommand"', - f'export pytestCommandDisaggServer="{server_env_vars} $pytestCommandNoLLMAPILaunch"', - f'export pytestCommandBenchmark="{env_config["benchmark_env_var"]} $pytestCommandNoLLMAPILaunch"', + worker_pytest_command, + disagg_server_pytest_command, + benchmark_pytest_command, + f'export pytestCommandWorker="unset UCX_TLS && {worker_env_vars} $partialPytestCommandWorker"', + f'export pytestCommandDisaggServer="{server_env_vars} $partialPytestCommandDisaggServer"', + f'export pytestCommandBenchmark="{env_config["benchmark_env_var"]} $partialPytestCommandBenchmark"', f"export runScript={args.run_sh}", f"export installScript={install_script}", + f"export configYamlPath={config_yaml}", f"export numCtxServers={hardware_config['num_ctx_servers']}", f"export numGenServers={hardware_config['num_gen_servers']}", f"export gpusPerNode={hardware_config['gpus_per_node']}", @@ -257,6 +361,8 @@ def main(): f"export gpusPerGenServer={hardware_config['gpus_per_gen_server']}", f"export nodesPerCtxServer={hardware_config['nodes_per_ctx_server']}", f"export nodesPerGenServer={hardware_config['nodes_per_gen_server']}", + f"export gpusPerfNodePerfCtxServer={hardware_config['gpus_per_node_per_ctx_server']}", + f"export gpusPerfNodePerfGenServer={hardware_config['gpus_per_node_per_gen_server']}", f"export totalNodes={hardware_config['total_nodes']}", f"export totalGpus={hardware_config['total_gpus']}", ] diff --git a/jenkins/scripts/perf/perf_sanity_triage.py b/jenkins/scripts/perf/perf_sanity_triage.py index 50094b93a3..dbe0961119 100644 --- a/jenkins/scripts/perf/perf_sanity_triage.py +++ b/jenkins/scripts/perf/perf_sanity_triage.py @@ -13,8 +13,9 @@ sys.path.insert(0, sys.path[0] + "/..") from open_search_db import OpenSearchDB QUERY_LOOKBACK_DAYS = 90 +LOOKBACK_JOBS = 30 MAX_QUERY_SIZE = 3000 -MAX_TEST_CASES_PER_MSG = 5 +MAX_TEST_CASES_PER_MSG = 4 POST_SLACK_MSG_RETRY_TIMES = 5 @@ -99,42 +100,74 @@ def post_perf_data(data_list, project_name): return False -def get_regression_data_by_job_id(data_list, query_job_number): - """Returns a dict with job_id as key and list of regression data as value. +def get_regression_dict(data_list, query_job_number, lookback_job_number=LOOKBACK_JOBS): + """Returns a dict with job_id as key and list of regression tuples as value. + Each tuple is (test_case_name, gpu_type, runtime, history_regression_job_ids, data). Only returns the latest query_job_number jobs. 
""" if data_list is None or len(data_list) == 0: return {} # Group data by job_id - job_data_dict = {} + job_test_dict = {} for data in data_list: - job_id = data.get("s_job_id", "") - if job_id == "": + raw_job_id = data.get("s_job_id", "") + if raw_job_id == "": continue - if job_id not in job_data_dict: - job_data_dict[job_id] = [] - job_data_dict[job_id].append(data) + try: + job_id = int(raw_job_id) + except (TypeError, ValueError): + continue + job_test_dict.setdefault(job_id, []).append(data) - # Sort job_ids by the latest ts_created in each group (descending) - def get_latest_timestamp(job_id): - timestamps = [d.get("ts_created", 0) for d in job_data_dict[job_id]] - return max(timestamps) if timestamps else 0 + if not job_test_dict: + return {} - sorted_job_ids = sorted(job_data_dict.keys(), key=get_latest_timestamp, reverse=True) + # Sort job_ids (descending: latest -> oldest) + sorted_job_id_list = sorted(job_test_dict.keys(), reverse=True) - # Only keep the latest query_job_number jobs - latest_job_ids = sorted_job_ids[:query_job_number] + # Build (test_case_name, gpu_type, runtime) -> job_ids dict + test_job_dict = {} + for job_id, data_list in job_test_dict.items(): + for data in data_list: + test_case_name = data.get("s_test_case_name") or "" + gpu_type = data.get("s_gpu_type") or "" + runtime = data.get("s_runtime") or "" + if not test_case_name or not gpu_type or not runtime: + continue + key = (test_case_name, gpu_type, runtime) + test_job_dict.setdefault(key, set()).add(job_id) - result = {} + # Sort job ids for each test case (descending: latest -> oldest) + for key, job_id_set in list(test_job_dict.items()): + test_job_dict[key] = sorted(job_id_set, reverse=True) + + # Only keep the latest query_job_number jobs in the result + latest_job_ids = sorted_job_id_list[:query_job_number] + + regression_dict = {} for job_id in latest_job_ids: - result[job_id] = job_data_dict[job_id] + entries = [] + for data in job_test_dict.get(job_id, []): + test_case_name = data.get("s_test_case_name") or "" + gpu_type = data.get("s_gpu_type") or "" + runtime = data.get("s_runtime") or "" + if not test_case_name or not gpu_type or not runtime: + continue + key = (test_case_name, gpu_type, runtime) + history_ids = test_job_dict.get(key, []) + lower_bound = job_id - lookback_job_number + 1 + history_regression_job_ids = [ + jid for jid in history_ids if lower_bound <= jid <= job_id + ] + entries.append((test_case_name, gpu_type, runtime, history_regression_job_ids, data)) + regression_dict[job_id] = entries - return result + return regression_dict -def process_regression_message(regression_dict): +def split_regression_message(regression_dict): """Process regression data into message chunks. Returns a list of messages, each containing at most MAX_TEST_CASES_PER_MSG test cases. 
@@ -142,12 +175,17 @@ def process_regression_message(regression_dict): if not regression_dict: return [] - # Flatten all test cases into a list with (job_id, idx, data) tuples + # Flatten all test cases into a list with + # (job_id, idx, test_case_name, gpu_type, runtime, history_regression_job_ids, data) tuples all_test_cases = [] for job_id, data_list in regression_dict.items(): - sorted_data_list = sorted(data_list, key=lambda x: x.get("s_test_case_name", "")) - for idx, data in enumerate(sorted_data_list, start=1): - all_test_cases.append((job_id, idx, data)) + sorted_data_list = sorted(data_list, key=lambda x: x[0]) + for idx, (test_case_name, gpu_type, runtime, history_regression_job_ids, data) in enumerate( + sorted_data_list, start=1 + ): + all_test_cases.append( + (job_id, idx, test_case_name, gpu_type, runtime, history_regression_job_ids, data) + ) # Split into chunks of MAX_TEST_CASES_PER_MSG chunks = [] @@ -159,7 +197,15 @@ def process_regression_message(regression_dict): for chunk in chunks: msg_parts = [] current_job_id = None - for job_id, idx, data in chunk: + for ( + job_id, + idx, + test_case_name, + gpu_type, + runtime, + history_regression_job_ids, + data, + ) in chunk: # Add job header when switching to a new job_id if job_id != current_job_id: if msg_parts: @@ -168,12 +214,46 @@ def process_regression_message(regression_dict): msg_parts.append(job_header) current_job_id = job_id - test_case_name = data.get("s_test_case_name", "N/A") regression_info = data.get("s_regression_info", "N/A") + history_text = ( + ", ".join(str(jid) for jid in history_regression_job_ids) + if history_regression_job_ids + else "N/A" + ) msg_parts.append(f"*REGRESSION TEST CASE {idx}: {test_case_name}*\n") + msg_parts.append(f"*GPU: {gpu_type} Mode: {runtime}*\n") + msg_parts.append(f"*History Regression Post-Merge Job IDs: {history_text}*\n") + + # Parse regression_info to extract baseline info and metrics + baseline_date = "N/A" + baseline_branch = "N/A" + baseline_commit = "N/A" for part in regression_info.split(","): part = part.strip() - if part and "baseline_id" not in part: + if "baseline_date:" in part: + baseline_date = part.split(":", 1)[-1].strip() + elif "baseline_branch:" in part: + baseline_branch = part.split(":", 1)[-1].strip() + elif "baseline_commit:" in part: + baseline_commit = part.split(":", 1)[-1].strip() + + # Get regression branch and commit from data + regression_date = data.get("ts_created", "N/A") + regression_branch = data.get("s_branch", "N/A") + regression_commit = data.get("s_commit", "N/A") + + msg_parts.append( + f"*Baseline date, branch and commit: " + f"{baseline_date} {baseline_branch} {baseline_commit}*\n" + ) + msg_parts.append( + f"*Regression date, branch and commit: " + f"{regression_date} {regression_branch} {regression_commit}*\n" + ) + + for part in regression_info.split(","): + part = part.strip() + if part and "baseline_" not in part: msg_parts.append(f" {part}\n") msg = "".join(msg_parts).strip() @@ -288,8 +368,8 @@ def main(): print("Failed to query regression data") return - regression_dict = get_regression_data_by_job_id(data_list, args.query_job_number) - messages = process_regression_message(regression_dict) + regression_dict = get_regression_dict(data_list, args.query_job_number) + messages = split_regression_message(regression_dict) send_regression_message(messages, args.channel_id, args.bot_token) elif args.operation.strip().upper().startswith("UPDATE"): set_values, where_values, error = parse_update_operation(args.operation) diff --git 
a/jenkins/scripts/slurm_run.sh b/jenkins/scripts/slurm_run.sh index dec9d20fcd..3d8c709826 100755 --- a/jenkins/scripts/slurm_run.sh +++ b/jenkins/scripts/slurm_run.sh @@ -58,7 +58,10 @@ cd $llmSrcNode/tests/integration/defs trtllmWhlPath=$(pip3 show tensorrt_llm | grep Location | cut -d ' ' -f 2) trtllmWhlPath=$(echo "$trtllmWhlPath" | sed 's/[[:space:]]+/_/g') echo "TRTLLM WHEEL PATH: $trtllmWhlPath" -pytestCommand=$(set_value_in_command "TRTLLM_WHL_PATH" "$trtllmWhlPath" "$pytestCommand") +# In disaggregated mode, we only set coverage config file in benchmark pytest. +if [[ -z "${DISAGG_SERVING_TYPE:-}" || "${DISAGG_SERVING_TYPE}" == "BENCHMARK" ]]; then + pytestCommand=$(set_value_in_command "TRTLLM_WHL_PATH" "$trtllmWhlPath" "$pytestCommand") +fi # Only the first process will save the coverage config file if [ $SLURM_PROCID -eq 0 ]; then diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf-sanity/b200-deepseek-r1-fp4_1k1k_ctx1_dep4_gen1_dep8_eplb0_mtp1_ccb-UCX.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf-sanity/b200-deepseek-r1-fp4_1k1k_ctx1_dep4_gen1_dep8_eplb0_mtp1_ccb-UCX.yaml new file mode 100644 index 0000000000..6deca9de6b --- /dev/null +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf-sanity/b200-deepseek-r1-fp4_1k1k_ctx1_dep4_gen1_dep8_eplb0_mtp1_ccb-UCX.yaml @@ -0,0 +1,97 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - B200 + script_file: disaggr_torch.slurm + benchmark_type: 1k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:8 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 5 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '2048' + input_length: 1024 + output_length: 1024 + dataset_file: datasets/perf-ci/deepseek_r1-1k1k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 8 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + print_iter_log: true + max_batch_size: 256 + max_num_tokens: 512 + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: + enable_padding: true + max_batch_size: 256 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: CUTEDSL + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + disable_overlap_scheduler: true + speculative_config: &id001 + decoding_type: MTP + num_nextn_predict_layers: 1 + num_postprocess_workers: 4 + stream_interval: 20 + ctx: + print_iter_log: true + max_batch_size: 16 + max_num_tokens: 16384 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: null + 
kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + disable_overlap_scheduler: true + speculative_config: *id001 diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf-sanity/b200-deepseek-r1-fp4_1k1k_ctx1_dep4_gen1_dep8_eplb0_mtp3_ccb-UCX.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf-sanity/b200-deepseek-r1-fp4_1k1k_ctx1_dep4_gen1_dep8_eplb0_mtp3_ccb-UCX.yaml new file mode 100644 index 0000000000..deb467d605 --- /dev/null +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf-sanity/b200-deepseek-r1-fp4_1k1k_ctx1_dep4_gen1_dep8_eplb0_mtp3_ccb-UCX.yaml @@ -0,0 +1,97 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - B200 + script_file: disaggr_torch.slurm + benchmark_type: 1k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:8 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 10 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '256' + input_length: 1024 + output_length: 1024 + dataset_file: datasets/perf-ci/deepseek_r1-1k1k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 8 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + print_iter_log: true + max_batch_size: 32 + max_num_tokens: 256 + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: + enable_padding: true + max_batch_size: 32 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: CUTEDSL + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + disable_overlap_scheduler: true + speculative_config: &id001 + decoding_type: MTP + num_nextn_predict_layers: 3 + num_postprocess_workers: 4 + stream_interval: 20 + ctx: + print_iter_log: true + max_batch_size: 16 + max_num_tokens: 16384 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: null + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + disable_overlap_scheduler: true + speculative_config: *id001 diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf-sanity/b200-deepseek-r1-fp4_1k1k_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf-sanity/b200-deepseek-r1-fp4_1k1k_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX.yaml new file mode 100644 index 0000000000..7c21e51ae8 --- 
/dev/null +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf-sanity/b200-deepseek-r1-fp4_1k1k_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX.yaml @@ -0,0 +1,96 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - B200 + script_file: disaggr_torch.slurm + benchmark_type: 1k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:8 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 10 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '1' + input_length: 1024 + output_length: 1024 + dataset_file: datasets/perf-ci/deepseek_r1-1k1k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 8 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + print_iter_log: true + max_batch_size: 1 + max_num_tokens: 256 + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: false + cuda_graph_config: + enable_padding: true + max_batch_size: 1 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + disable_overlap_scheduler: true + speculative_config: &id001 + decoding_type: MTP + num_nextn_predict_layers: 3 + num_postprocess_workers: 4 + stream_interval: 20 + ctx: + print_iter_log: true + max_batch_size: 16 + max_num_tokens: 16384 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: null + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + disable_overlap_scheduler: true + speculative_config: *id001 diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf-sanity/b200-deepseek-r1-fp4_8k1k_con1536_ctx1_dep4_gen1_dep8_eplb0_mtp1_ccb-UCX.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf-sanity/b200-deepseek-r1-fp4_8k1k_con1536_ctx1_dep4_gen1_dep8_eplb0_mtp1_ccb-UCX.yaml new file mode 100644 index 0000000000..7bed6bb3df --- /dev/null +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf-sanity/b200-deepseek-r1-fp4_8k1k_con1536_ctx1_dep4_gen1_dep8_eplb0_mtp1_ccb-UCX.yaml @@ -0,0 +1,97 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - B200 + script_file: disaggr_torch.slurm + benchmark_type: 8k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:8 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 10 + 
benchmark_ratio: 0.0 + streaming: true + concurrency_list: '1536' + input_length: 8192 + output_length: 1024 + dataset_file: datasets/perf-ci/deepseek_r1-8k1k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 8 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + print_iter_log: true + max_batch_size: 192 + max_num_tokens: 384 + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: + enable_padding: true + max_batch_size: 192 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: CUTEDSL + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + disable_overlap_scheduler: true + speculative_config: &id001 + decoding_type: MTP + num_nextn_predict_layers: 1 + num_postprocess_workers: 4 + stream_interval: 20 + ctx: + print_iter_log: true + max_batch_size: 2 + max_num_tokens: 16384 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: null + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + disable_overlap_scheduler: true + speculative_config: *id001 diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf-sanity/b200-deepseek-r1-fp4_8k1k_con256_ctx1_dep4_gen1_dep8_eplb0_mtp1_ccb-UCX.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf-sanity/b200-deepseek-r1-fp4_8k1k_con256_ctx1_dep4_gen1_dep8_eplb0_mtp1_ccb-UCX.yaml new file mode 100644 index 0000000000..5394e90236 --- /dev/null +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf-sanity/b200-deepseek-r1-fp4_8k1k_con256_ctx1_dep4_gen1_dep8_eplb0_mtp1_ccb-UCX.yaml @@ -0,0 +1,97 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - B200 + script_file: disaggr_torch.slurm + benchmark_type: 8k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:8 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 10 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '256' + input_length: 8192 + output_length: 1024 + dataset_file: datasets/perf-ci/deepseek_r1-8k1k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 8 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: 
false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + print_iter_log: true + max_batch_size: 32 + max_num_tokens: 128 + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: + enable_padding: true + max_batch_size: 32 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: CUTEDSL + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + disable_overlap_scheduler: true + speculative_config: &id001 + decoding_type: MTP + num_nextn_predict_layers: 1 + num_postprocess_workers: 4 + stream_interval: 20 + ctx: + print_iter_log: true + max_batch_size: 2 + max_num_tokens: 16384 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: null + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + disable_overlap_scheduler: true + speculative_config: *id001 diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf-sanity/b200-deepseek-r1-fp4_8k1k_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf-sanity/b200-deepseek-r1-fp4_8k1k_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX.yaml new file mode 100644 index 0000000000..63b38743fc --- /dev/null +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf-sanity/b200-deepseek-r1-fp4_8k1k_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX.yaml @@ -0,0 +1,96 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - B200 + script_file: disaggr_torch.slurm + benchmark_type: 8k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:8 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 10 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '1' + input_length: 8192 + output_length: 1024 + dataset_file: datasets/perf-ci/deepseek_r1-8k1k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 8 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + print_iter_log: true + max_batch_size: 1 + max_num_tokens: 256 + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: false + cuda_graph_config: + enable_padding: true + max_batch_size: 1 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + 
moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + disable_overlap_scheduler: true + speculative_config: &id001 + decoding_type: MTP + num_nextn_predict_layers: 3 + num_postprocess_workers: 4 + stream_interval: 20 + ctx: + print_iter_log: true + max_batch_size: 2 + max_num_tokens: 16384 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: null + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + disable_overlap_scheduler: true + speculative_config: *id001 diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf-sanity/gb200-deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_dep16_eplb0_mtp2_ccb-UCX.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf-sanity/gb200-deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_dep16_eplb0_mtp2_ccb-UCX.yaml new file mode 100644 index 0000000000..1bb024f5f1 --- /dev/null +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf-sanity/gb200-deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_dep16_eplb0_mtp2_ccb-UCX.yaml @@ -0,0 +1,96 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - GB200 + script_file: disaggr_torch.slurm + benchmark_type: 128k8k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 10 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '128' + input_length: 131072 + output_length: 8192 + dataset_file: datasets/perf-ci/deepseek_r1-128k8k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + print_iter_log: true + max_batch_size: 8 + max_num_tokens: 32 + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: + enable_padding: true + max_batch_size: 8 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: CUTEDSL + cache_transceiver_config: + max_tokens_in_buffer: 131104 + backend: UCX + disable_overlap_scheduler: true + speculative_config: &id001 + decoding_type: MTP + num_nextn_predict_layers: 2 + num_postprocess_workers: 4 + stream_interval: 20 + ctx: + print_iter_log: true + max_batch_size: 1 + max_num_tokens: 131104 + tensor_parallel_size: 1 + moe_expert_parallel_size: 1 + pipeline_parallel_size: 8 + context_parallel_size: 1 + enable_attention_dp: false + cuda_graph_config: null + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 
0.3 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 131104 + backend: UCX + disable_overlap_scheduler: true + speculative_config: *id001 diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf-sanity/gb200-deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_dep32_eplb0_mtp3_ccb-UCX.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf-sanity/gb200-deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_dep32_eplb0_mtp3_ccb-UCX.yaml new file mode 100644 index 0000000000..ac3050722d --- /dev/null +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf-sanity/gb200-deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_dep32_eplb0_mtp3_ccb-UCX.yaml @@ -0,0 +1,96 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - GB200 + script_file: disaggr_torch.slurm + benchmark_type: 128k8k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 10 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '64' + input_length: 131072 + output_length: 8192 + dataset_file: datasets/perf-ci/deepseek_r1-128k8k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + print_iter_log: true + max_batch_size: 2 + max_num_tokens: 8 + tensor_parallel_size: 32 + moe_expert_parallel_size: 32 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: + enable_padding: true + max_batch_size: 2 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + moe_config: + backend: CUTEDSL + cache_transceiver_config: + max_tokens_in_buffer: 131104 + backend: UCX + disable_overlap_scheduler: true + speculative_config: &id001 + decoding_type: MTP + num_nextn_predict_layers: 3 + num_postprocess_workers: 4 + stream_interval: 20 + ctx: + print_iter_log: true + max_batch_size: 1 + max_num_tokens: 131104 + tensor_parallel_size: 1 + moe_expert_parallel_size: 1 + pipeline_parallel_size: 8 + context_parallel_size: 1 + enable_attention_dp: false + cuda_graph_config: null + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.3 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 131104 + backend: UCX + disable_overlap_scheduler: true + speculative_config: *id001 diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf-sanity/gb200-deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_tep8_eplb0_mtp3_ccb-UCX.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf-sanity/gb200-deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_tep8_eplb0_mtp3_ccb-UCX.yaml new file mode 100644 index 0000000000..0106c78459 --- /dev/null +++ 
b/tests/integration/defs/perf/disagg/test_configs/disagg/perf-sanity/gb200-deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_tep8_eplb0_mtp3_ccb-UCX.yaml @@ -0,0 +1,96 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - GB200 + script_file: disaggr_torch.slurm + benchmark_type: 128k8k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 10 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '1' + input_length: 131072 + output_length: 8192 + dataset_file: datasets/perf-ci/deepseek_r1-128k8k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + print_iter_log: true + max_batch_size: 1 + max_num_tokens: 4 + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: false + cuda_graph_config: + enable_padding: true + max_batch_size: 1 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 131104 + backend: UCX + disable_overlap_scheduler: true + speculative_config: &id001 + decoding_type: MTP + num_nextn_predict_layers: 3 + num_postprocess_workers: 4 + stream_interval: 20 + allreduce_strategy: MNNVL + ctx: + print_iter_log: true + max_batch_size: 1 + max_num_tokens: 131104 + tensor_parallel_size: 1 + moe_expert_parallel_size: 1 + pipeline_parallel_size: 8 + context_parallel_size: 1 + enable_attention_dp: false + cuda_graph_config: null + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.3 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 131104 + backend: UCX + disable_overlap_scheduler: true + speculative_config: *id001 diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf-sanity/gb200-deepseek-r1-fp4_1k1k_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-UCX.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf-sanity/gb200-deepseek-r1-fp4_1k1k_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-UCX.yaml new file mode 100644 index 0000000000..21269ad60d --- /dev/null +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf-sanity/gb200-deepseek-r1-fp4_1k1k_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-UCX.yaml @@ -0,0 +1,97 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - GB200 + script_file: disaggr_torch.slurm + benchmark_type: 1k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 10 + benchmark_ratio: 0.0 + streaming: true + 
concurrency_list: '1024' + input_length: 1024 + output_length: 1024 + dataset_file: datasets/perf-ci/deepseek_r1-1k1k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + print_iter_log: true + max_batch_size: 32 + max_num_tokens: 128 + tensor_parallel_size: 32 + moe_expert_parallel_size: 32 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: + enable_padding: true + max_batch_size: 32 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + moe_config: + backend: CUTEDSL + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + disable_overlap_scheduler: true + speculative_config: &id001 + decoding_type: MTP + num_nextn_predict_layers: 3 + num_postprocess_workers: 4 + stream_interval: 20 + ctx: + print_iter_log: true + max_batch_size: 16 + max_num_tokens: 16384 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: null + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + disable_overlap_scheduler: true + speculative_config: *id001 diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf-sanity/gb200-deepseek-r1-fp4_1k1k_ctx1_dep4_gen1_dep4_eplb0_mtp1_ccb-UCX.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf-sanity/gb200-deepseek-r1-fp4_1k1k_ctx1_dep4_gen1_dep4_eplb0_mtp1_ccb-UCX.yaml new file mode 100644 index 0000000000..6ab0b332fe --- /dev/null +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf-sanity/gb200-deepseek-r1-fp4_1k1k_ctx1_dep4_gen1_dep4_eplb0_mtp1_ccb-UCX.yaml @@ -0,0 +1,97 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - GB200 + script_file: disaggr_torch.slurm + benchmark_type: 1k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 5 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '3072' + input_length: 1024 + output_length: 1024 + dataset_file: datasets/perf-ci/deepseek_r1-1k1k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: 
local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + print_iter_log: true + max_batch_size: 768 + max_num_tokens: 1536 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: + enable_padding: true + max_batch_size: 768 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: CUTEDSL + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + disable_overlap_scheduler: true + speculative_config: &id001 + decoding_type: MTP + num_nextn_predict_layers: 1 + num_postprocess_workers: 4 + stream_interval: 20 + ctx: + print_iter_log: true + max_batch_size: 16 + max_num_tokens: 16384 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: null + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + disable_overlap_scheduler: true + speculative_config: *id001 diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen1_dep8_bs768_eplb0_mtp0_ccb-UCX.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf-sanity/gb200-deepseek-r1-fp4_1k1k_ctx1_dep4_gen1_dep8_eplb0_mtp0_ccb-UCX.yaml similarity index 93% rename from tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen1_dep8_bs768_eplb0_mtp0_ccb-UCX.yaml rename to tests/integration/defs/perf/disagg/test_configs/disagg/perf-sanity/gb200-deepseek-r1-fp4_1k1k_ctx1_dep4_gen1_dep8_eplb0_mtp0_ccb-UCX.yaml index be9dc6556d..434906488f 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen1_dep8_bs768_eplb0_mtp0_ccb-UCX.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf-sanity/gb200-deepseek-r1-fp4_1k1k_ctx1_dep4_gen1_dep8_eplb0_mtp0_ccb-UCX.yaml @@ -25,7 +25,7 @@ benchmark: concurrency_list: '1024' input_length: 1024 output_length: 1024 - dataset_file: + dataset_file: datasets/perf-ci/deepseek_r1-1k1k-20480-ratio-1_for_serve.json hardware: gpus_per_node: 4 num_ctx_servers: 1 @@ -56,18 +56,7 @@ worker_config: max_seq_len: 2068 cuda_graph_config: enable_padding: true - batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - - 768 + max_batch_size: 768 print_iter_log: true kv_cache_config: enable_block_reuse: false diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf-sanity/gb200-deepseek-r1-fp4_1k1k_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf-sanity/gb200-deepseek-r1-fp4_1k1k_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX.yaml new file mode 100644 index 0000000000..9f63539ec5 --- /dev/null +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf-sanity/gb200-deepseek-r1-fp4_1k1k_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX.yaml @@ -0,0 +1,97 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - GB200 + script_file: disaggr_torch.slurm + benchmark_type: 1k1k +slurm: + script_file: disaggr_torch.slurm + partition: + 
account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 10 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '1' + input_length: 1024 + output_length: 1024 + dataset_file: datasets/perf-ci/deepseek_r1-1k1k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + print_iter_log: true + max_batch_size: 128 + max_num_tokens: 512 + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: false + cuda_graph_config: + enable_padding: true + max_batch_size: 128 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + disable_overlap_scheduler: true + speculative_config: &id001 + decoding_type: MTP + num_nextn_predict_layers: 3 + num_postprocess_workers: 4 + stream_interval: 20 + allreduce_strategy: MNNVL + ctx: + print_iter_log: true + max_batch_size: 16 + max_num_tokens: 16384 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: null + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + disable_overlap_scheduler: true + speculative_config: *id001 diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf-sanity/gb200-deepseek-r1-fp4_8k1k_ctx1_dep4_gen1_dep16_eplb0_mtp1_ccb-UCX.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf-sanity/gb200-deepseek-r1-fp4_8k1k_ctx1_dep4_gen1_dep16_eplb0_mtp1_ccb-UCX.yaml new file mode 100644 index 0000000000..0bcd5e9310 --- /dev/null +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf-sanity/gb200-deepseek-r1-fp4_8k1k_ctx1_dep4_gen1_dep16_eplb0_mtp1_ccb-UCX.yaml @@ -0,0 +1,97 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - GB200 + script_file: disaggr_torch.slurm + benchmark_type: 8k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 5 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '4096' + input_length: 8192 + output_length: 1024 + dataset_file: datasets/perf-ci/deepseek_r1-8k1k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO 
TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + print_iter_log: true + max_batch_size: 256 + max_num_tokens: 512 + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: + enable_padding: true + max_batch_size: 256 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: CUTEDSL + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + disable_overlap_scheduler: true + speculative_config: &id001 + decoding_type: MTP + num_nextn_predict_layers: 1 + num_postprocess_workers: 4 + stream_interval: 20 + ctx: + print_iter_log: true + max_batch_size: 2 + max_num_tokens: 16384 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: null + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + disable_overlap_scheduler: true + speculative_config: *id001 diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf-sanity/gb200-deepseek-r1-fp4_8k1k_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-UCX.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf-sanity/gb200-deepseek-r1-fp4_8k1k_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-UCX.yaml new file mode 100644 index 0000000000..d9517f6fe4 --- /dev/null +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf-sanity/gb200-deepseek-r1-fp4_8k1k_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-UCX.yaml @@ -0,0 +1,97 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - GB200 + script_file: disaggr_torch.slurm + benchmark_type: 8k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 10 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '1024' + input_length: 8192 + output_length: 1024 + dataset_file: datasets/perf-ci/deepseek_r1-8k1k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + print_iter_log: true + max_batch_size: 32 + max_num_tokens: 128 + tensor_parallel_size: 32 + moe_expert_parallel_size: 32 + pipeline_parallel_size: 1 + context_parallel_size: 1 + 
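With `gpus_per_node: 4` fixed in the hardware block, the parallel sizes of each worker block determine how many nodes that server spans: the dep16 gen worker above (tensor_parallel_size 16) needs four nodes, while its 4-way ctx worker fits on one. A hedged sketch of that arithmetic, assuming one rank per GPU (the helper name is illustrative, not harness code):

```python
import math

def worker_nodes(worker: dict, gpus_per_node: int) -> int:
    """Number of nodes a disagg worker spans, assuming one rank per GPU."""
    world_size = (worker["tensor_parallel_size"]
                  * worker.get("pipeline_parallel_size", 1)
                  * worker.get("context_parallel_size", 1))
    return math.ceil(world_size / gpus_per_node)

gen = {"tensor_parallel_size": 16, "pipeline_parallel_size": 1}
ctx = {"tensor_parallel_size": 4, "pipeline_parallel_size": 1}
print(worker_nodes(gen, 4), worker_nodes(ctx, 4))  # -> 4 1
```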
enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: + enable_padding: true + max_batch_size: 32 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + moe_config: + backend: CUTEDSL + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + disable_overlap_scheduler: true + speculative_config: &id001 + decoding_type: MTP + num_nextn_predict_layers: 3 + num_postprocess_workers: 4 + stream_interval: 20 + ctx: + print_iter_log: true + max_batch_size: 2 + max_num_tokens: 16384 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: null + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + disable_overlap_scheduler: true + speculative_config: *id001 diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf-sanity/gb200-deepseek-r1-fp4_8k1k_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf-sanity/gb200-deepseek-r1-fp4_8k1k_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX.yaml new file mode 100644 index 0000000000..2c6b9c8b40 --- /dev/null +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf-sanity/gb200-deepseek-r1-fp4_8k1k_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX.yaml @@ -0,0 +1,97 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - GB200 + script_file: disaggr_torch.slurm + benchmark_type: 8k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 10 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '1' + input_length: 8192 + output_length: 1024 + dataset_file: datasets/perf-ci/deepseek_r1-8k1k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + print_iter_log: true + max_batch_size: 128 + max_num_tokens: 512 + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: false + cuda_graph_config: + enable_padding: true + max_batch_size: 128 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + disable_overlap_scheduler: true + speculative_config: &id001 + decoding_type: MTP + num_nextn_predict_layers: 3 + num_postprocess_workers: 4 + stream_interval: 20 + allreduce_strategy: MNNVL + ctx: + print_iter_log: true + max_batch_size: 2 + max_num_tokens: 16384 + tensor_parallel_size: 4 + 
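A sizing pattern visible across these gen workers: `max_num_tokens` is typically `max_batch_size * (num_nextn_predict_layers + 1)`, one verified token plus the MTP draft tokens per sequence per decode step (32 * 4 = 128 in the dep32/mtp3 block above, 768 * 2 = 1536 in the earlier dep4/mtp1 config). Treat this as an observed convention rather than a hard rule, since a few configs deviate from it:

```python
def decode_token_budget(max_batch_size: int, num_nextn_predict_layers: int) -> int:
    # One verified token plus the MTP draft tokens, per sequence, per step.
    return max_batch_size * (num_nextn_predict_layers + 1)

assert decode_token_budget(32, 3) == 128    # dep32 mtp3 gen worker
assert decode_token_budget(768, 1) == 1536  # dep4 mtp1 gen worker
```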
moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: null + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + disable_overlap_scheduler: true + speculative_config: *id001 diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf-sanity/gb200-deepseek-v32-fp4_1k1k_ctx1_dep4_gen1_dep32_eplb256_mtp3_ccb-UCX.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf-sanity/gb200-deepseek-v32-fp4_1k1k_ctx1_dep4_gen1_dep32_eplb256_mtp3_ccb-UCX.yaml new file mode 100644 index 0000000000..d673a4dbe8 --- /dev/null +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf-sanity/gb200-deepseek-v32-fp4_1k1k_ctx1_dep4_gen1_dep32_eplb256_mtp3_ccb-UCX.yaml @@ -0,0 +1,108 @@ +metadata: + model_name: deepseek_v32_fp4 + precision: fp4 + model_dir_name: DeepSeek-V3.2-FP4-v2 + supported_gpus: + - GB200 + script_file: disaggr_torch.slurm + benchmark_type: 1k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 10 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '1024' + input_length: 1024 + output_length: 1024 + dataset_file: datasets/perf-ci/deepseek_v32-1k1k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + print_iter_log: true + max_batch_size: 32 + max_num_tokens: 128 + tensor_parallel_size: 32 + moe_expert_parallel_size: 32 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: + enable_padding: true + max_batch_size: 32 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + tokens_per_block: 64 + moe_config: + backend: CUTEDSL + load_balancer: + num_slots: 256 + layer_updates_per_iter: 1 + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + disable_overlap_scheduler: true + speculative_config: &id001 + decoding_type: MTP + num_nextn_predict_layers: 3 + num_postprocess_workers: 4 + stream_interval: 20 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + ctx: + print_iter_log: true + max_batch_size: 16 + max_num_tokens: 16384 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: null + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + tokens_per_block: 64 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + 
disable_overlap_scheduler: true + speculative_config: *id001 diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf-sanity/gb200-deepseek-v32-fp4_1k1k_ctx1_dep4_gen1_dep4_eplb0_mtp1_ccb-UCX.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf-sanity/gb200-deepseek-v32-fp4_1k1k_ctx1_dep4_gen1_dep4_eplb0_mtp1_ccb-UCX.yaml new file mode 100644 index 0000000000..4a1e5529d4 --- /dev/null +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf-sanity/gb200-deepseek-v32-fp4_1k1k_ctx1_dep4_gen1_dep4_eplb0_mtp1_ccb-UCX.yaml @@ -0,0 +1,105 @@ +metadata: + model_name: deepseek_v32_fp4 + precision: fp4 + model_dir_name: DeepSeek-V3.2-FP4-v2 + supported_gpus: + - GB200 + script_file: disaggr_torch.slurm + benchmark_type: 1k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 5 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '2048' + input_length: 1024 + output_length: 1024 + dataset_file: datasets/perf-ci/deepseek_v32-1k1k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + print_iter_log: true + max_batch_size: 512 + max_num_tokens: 1024 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: + enable_padding: true + max_batch_size: 512 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + tokens_per_block: 64 + moe_config: + backend: CUTEDSL + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + disable_overlap_scheduler: true + speculative_config: &id001 + decoding_type: MTP + num_nextn_predict_layers: 1 + num_postprocess_workers: 4 + stream_interval: 20 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + ctx: + print_iter_log: true + max_batch_size: 16 + max_num_tokens: 16384 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: null + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + tokens_per_block: 64 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + disable_overlap_scheduler: true + speculative_config: *id001 diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf-sanity/gb200-deepseek-v32-fp4_1k1k_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf-sanity/gb200-deepseek-v32-fp4_1k1k_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX.yaml new file mode 100644 index 0000000000..8249b0f1e8 --- /dev/null +++ 
b/tests/integration/defs/perf/disagg/test_configs/disagg/perf-sanity/gb200-deepseek-v32-fp4_1k1k_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX.yaml @@ -0,0 +1,105 @@ +metadata: + model_name: deepseek_v32_fp4 + precision: fp4 + model_dir_name: DeepSeek-V3.2-FP4-v2 + supported_gpus: + - GB200 + script_file: disaggr_torch.slurm + benchmark_type: 1k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 10 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '1' + input_length: 1024 + output_length: 1024 + dataset_file: datasets/perf-ci/deepseek_v32-1k1k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + print_iter_log: true + max_batch_size: 1 + max_num_tokens: 512 + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: false + cuda_graph_config: + enable_padding: true + max_batch_size: 1 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + tokens_per_block: 64 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + disable_overlap_scheduler: true + speculative_config: &id001 + decoding_type: MTP + num_nextn_predict_layers: 3 + num_postprocess_workers: 4 + stream_interval: 20 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + allreduce_strategy: MNNVL + ctx: + print_iter_log: true + max_batch_size: 16 + max_num_tokens: 16384 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: null + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + tokens_per_block: 64 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + disable_overlap_scheduler: true + speculative_config: *id001 diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf-sanity/gb200-deepseek-v32-fp4_32k4k_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-UCX.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf-sanity/gb200-deepseek-v32-fp4_32k4k_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-UCX.yaml new file mode 100644 index 0000000000..a2d5589c61 --- /dev/null +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf-sanity/gb200-deepseek-v32-fp4_32k4k_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-UCX.yaml @@ -0,0 +1,105 @@ +metadata: + model_name: deepseek_v32_fp4 + precision: fp4 + model_dir_name: DeepSeek-V3.2-FP4-v2 + supported_gpus: + - GB200 + script_file: disaggr_torch.slurm + benchmark_type: 32k4k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + 
extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 10 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '256' + input_length: 32768 + output_length: 4096 + dataset_file: datasets/perf-ci/deepseek_v32-32k4k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + print_iter_log: true + max_batch_size: 8 + max_num_tokens: 256 + tensor_parallel_size: 32 + moe_expert_parallel_size: 32 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: + enable_padding: true + max_batch_size: 8 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + dtype: fp8 + tokens_per_block: 64 + moe_config: + backend: CUTEDSL + cache_transceiver_config: + max_tokens_in_buffer: 32768 + backend: UCX + disable_overlap_scheduler: true + speculative_config: &id001 + decoding_type: MTP + num_nextn_predict_layers: 3 + num_postprocess_workers: 4 + stream_interval: 20 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + ctx: + print_iter_log: true + max_batch_size: 1 + max_num_tokens: 32784 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: null + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + tokens_per_block: 64 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 32768 + backend: UCX + disable_overlap_scheduler: true + speculative_config: *id001 diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf-sanity/gb200-deepseek-v32-fp4_32k4k_ctx1_dep4_gen1_dep32_eplb288_mtp1_ccb-UCX.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf-sanity/gb200-deepseek-v32-fp4_32k4k_ctx1_dep4_gen1_dep32_eplb288_mtp1_ccb-UCX.yaml new file mode 100644 index 0000000000..7198cf0b58 --- /dev/null +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf-sanity/gb200-deepseek-v32-fp4_32k4k_ctx1_dep4_gen1_dep32_eplb288_mtp1_ccb-UCX.yaml @@ -0,0 +1,108 @@ +metadata: + model_name: deepseek_v32_fp4 + precision: fp4 + model_dir_name: DeepSeek-V3.2-FP4-v2 + supported_gpus: + - GB200 + script_file: disaggr_torch.slurm + benchmark_type: 32k4k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 5 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '2048' + input_length: 32768 + output_length: 4096 + dataset_file: datasets/perf-ci/deepseek_v32-32k4k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + 
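The `eplbN` configs turn on the online MoE expert load balancer: `num_slots` is the total number of expert slots spread across the EP ranks, so it must be at least the model's routed-expert count, with the surplus holding redundant replicas of hot experts. That reading is an inference from the counts used in these files (256 slots for DeepSeek's 256 routed experts; the 288 just below adds one extra slot per rank at dep32; the Kimi configs use 384/416 for 384 experts). A small bookkeeping sketch:

```python
def eplb_layout(num_slots: int, num_experts: int, ep_size: int) -> dict:
    # num_slots must cover every routed expert and divide evenly over ranks.
    assert num_slots >= num_experts and num_slots % ep_size == 0
    return {
        "slots_per_rank": num_slots // ep_size,
        "redundant_slots": num_slots - num_experts,  # replicas of hot experts
    }

print(eplb_layout(288, 256, 32))  # {'slots_per_rank': 9, 'redundant_slots': 32}
print(eplb_layout(416, 384, 32))  # {'slots_per_rank': 13, 'redundant_slots': 32}
```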
trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + print_iter_log: true + max_batch_size: 64 + max_num_tokens: 128 + tensor_parallel_size: 32 + moe_expert_parallel_size: 32 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: + enable_padding: true + max_batch_size: 64 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + dtype: fp8 + tokens_per_block: 64 + moe_config: + backend: CUTEDSL + load_balancer: + num_slots: 288 + layer_updates_per_iter: 1 + cache_transceiver_config: + max_tokens_in_buffer: 32768 + backend: UCX + disable_overlap_scheduler: true + speculative_config: &id001 + decoding_type: MTP + num_nextn_predict_layers: 1 + num_postprocess_workers: 4 + stream_interval: 20 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + ctx: + print_iter_log: true + max_batch_size: 1 + max_num_tokens: 32784 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: null + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + tokens_per_block: 64 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 32768 + backend: UCX + disable_overlap_scheduler: true + speculative_config: *id001 diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf-sanity/gb200-deepseek-v32-fp4_32k4k_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf-sanity/gb200-deepseek-v32-fp4_32k4k_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX.yaml new file mode 100644 index 0000000000..4770b7e722 --- /dev/null +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf-sanity/gb200-deepseek-v32-fp4_32k4k_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX.yaml @@ -0,0 +1,105 @@ +metadata: + model_name: deepseek_v32_fp4 + precision: fp4 + model_dir_name: DeepSeek-V3.2-FP4-v2 + supported_gpus: + - GB200 + script_file: disaggr_torch.slurm + benchmark_type: 32k4k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 10 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '1' + input_length: 32768 + output_length: 4096 + dataset_file: datasets/perf-ci/deepseek_v32-32k4k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: 
num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + print_iter_log: true + max_batch_size: 1 + max_num_tokens: 256 + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: false + cuda_graph_config: + enable_padding: true + max_batch_size: 1 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + tokens_per_block: 64 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 32768 + backend: UCX + disable_overlap_scheduler: true + speculative_config: &id001 + decoding_type: MTP + num_nextn_predict_layers: 3 + num_postprocess_workers: 4 + stream_interval: 20 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + allreduce_strategy: MNNVL + ctx: + print_iter_log: true + max_batch_size: 1 + max_num_tokens: 32784 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: null + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + tokens_per_block: 64 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 32768 + backend: UCX + disable_overlap_scheduler: true + speculative_config: *id001 diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf-sanity/gb200-deepseek-v32-fp4_8k1k_ctx1_dep4_gen1_dep32_eplb256_mtp0_ccb-UCX.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf-sanity/gb200-deepseek-v32-fp4_8k1k_ctx1_dep4_gen1_dep32_eplb256_mtp0_ccb-UCX.yaml new file mode 100644 index 0000000000..d43b566f61 --- /dev/null +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf-sanity/gb200-deepseek-v32-fp4_8k1k_ctx1_dep4_gen1_dep32_eplb256_mtp0_ccb-UCX.yaml @@ -0,0 +1,104 @@ +metadata: + model_name: deepseek_v32_fp4 + precision: fp4 + model_dir_name: DeepSeek-V3.2-FP4-v2 + supported_gpus: + - GB200 + script_file: disaggr_torch.slurm + benchmark_type: 8k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 5 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '4096' + input_length: 8192 + output_length: 1024 + dataset_file: datasets/perf-ci/deepseek_v32-8k1k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + print_iter_log: true + max_batch_size: 128 + max_num_tokens: 128 + tensor_parallel_size: 32 + moe_expert_parallel_size: 32 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: + enable_padding: true + max_batch_size: 128 + kv_cache_config: + 
enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + tokens_per_block: 64 + moe_config: + backend: CUTEDSL + load_balancer: + num_slots: 256 + layer_updates_per_iter: 1 + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + disable_overlap_scheduler: true + num_postprocess_workers: 4 + stream_interval: 20 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + ctx: + print_iter_log: true + max_batch_size: 2 + max_num_tokens: 16384 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: null + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + tokens_per_block: 64 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + disable_overlap_scheduler: true diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf-sanity/gb200-deepseek-v32-fp4_8k1k_ctx1_dep4_gen1_dep32_eplb256_mtp3_ccb-UCX.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf-sanity/gb200-deepseek-v32-fp4_8k1k_ctx1_dep4_gen1_dep32_eplb256_mtp3_ccb-UCX.yaml new file mode 100644 index 0000000000..671a790156 --- /dev/null +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf-sanity/gb200-deepseek-v32-fp4_8k1k_ctx1_dep4_gen1_dep32_eplb256_mtp3_ccb-UCX.yaml @@ -0,0 +1,108 @@ +metadata: + model_name: deepseek_v32_fp4 + precision: fp4 + model_dir_name: DeepSeek-V3.2-FP4-v2 + supported_gpus: + - GB200 + script_file: disaggr_torch.slurm + benchmark_type: 8k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 10 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '1024' + input_length: 8192 + output_length: 1024 + dataset_file: datasets/perf-ci/deepseek_v32-8k1k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + print_iter_log: true + max_batch_size: 32 + max_num_tokens: 128 + tensor_parallel_size: 32 + moe_expert_parallel_size: 32 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: + enable_padding: true + max_batch_size: 32 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + tokens_per_block: 64 + moe_config: + backend: CUTEDSL + load_balancer: + num_slots: 256 + layer_updates_per_iter: 1 + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + disable_overlap_scheduler: true + speculative_config: &id001 + decoding_type: MTP + num_nextn_predict_layers: 3 + num_postprocess_workers: 4 + stream_interval: 20 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - 
cutedsl + - cuda_core + ctx: + print_iter_log: true + max_batch_size: 2 + max_num_tokens: 16384 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: null + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + tokens_per_block: 64 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + disable_overlap_scheduler: true + speculative_config: *id001 diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf-sanity/gb200-deepseek-v32-fp4_8k1k_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf-sanity/gb200-deepseek-v32-fp4_8k1k_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX.yaml new file mode 100644 index 0000000000..02db3e9fb0 --- /dev/null +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf-sanity/gb200-deepseek-v32-fp4_8k1k_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX.yaml @@ -0,0 +1,105 @@ +metadata: + model_name: deepseek_v32_fp4 + precision: fp4 + model_dir_name: DeepSeek-V3.2-FP4-v2 + supported_gpus: + - GB200 + script_file: disaggr_torch.slurm + benchmark_type: 8k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 10 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '1' + input_length: 8192 + output_length: 1024 + dataset_file: datasets/perf-ci/deepseek_v32-8k1k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + print_iter_log: true + max_batch_size: 1 + max_num_tokens: 512 + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: false + cuda_graph_config: + enable_padding: true + max_batch_size: 1 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + tokens_per_block: 64 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + disable_overlap_scheduler: true + speculative_config: &id001 + decoding_type: MTP + num_nextn_predict_layers: 3 + num_postprocess_workers: 4 + stream_interval: 20 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + allreduce_strategy: MNNVL + ctx: + print_iter_log: true + max_batch_size: 2 + max_num_tokens: 16384 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: null + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + tokens_per_block: 64 + moe_config: + backend: TRTLLM + cache_transceiver_config: + 
max_tokens_in_buffer: 16384 + backend: UCX + disable_overlap_scheduler: true + speculative_config: *id001 diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf-sanity/gb200-kimi-k2-thinking-fp4_1k1k_ctx1_dep4_gen1_dep32_eplb384_mtp0_ccb-UCX.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf-sanity/gb200-kimi-k2-thinking-fp4_1k1k_ctx1_dep4_gen1_dep32_eplb384_mtp0_ccb-UCX.yaml new file mode 100644 index 0000000000..57ec85d9ed --- /dev/null +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf-sanity/gb200-kimi-k2-thinking-fp4_1k1k_ctx1_dep4_gen1_dep32_eplb384_mtp0_ccb-UCX.yaml @@ -0,0 +1,98 @@ +metadata: + model_name: k2_thinking_fp4 + precision: fp4 + model_dir_name: Kimi-K2-Thinking-NVFP4 + supported_gpus: + - GB200 + script_file: disaggr_torch.slurm + benchmark_type: 1k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 5 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '2048' + input_length: 1024 + output_length: 1024 + dataset_file: datasets/perf-ci/k2_thinking-1k1k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + print_iter_log: true + max_batch_size: 64 + max_num_tokens: 128 + tensor_parallel_size: 32 + moe_expert_parallel_size: 32 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: + enable_padding: true + max_batch_size: 64 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: WIDEEP + load_balancer: + num_slots: 384 + layer_updates_per_iter: 1 + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + disable_overlap_scheduler: true + trust_remote_code: true + num_postprocess_workers: 4 + stream_interval: 20 + ctx: + print_iter_log: true + max_batch_size: 16 + max_num_tokens: 8192 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: null + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + disable_overlap_scheduler: true + trust_remote_code: true diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf-sanity/gb200-kimi-k2-thinking-fp4_1k1k_ctx1_dep4_gen1_dep8_eplb0_mtp0_ccb-UCX.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf-sanity/gb200-kimi-k2-thinking-fp4_1k1k_ctx1_dep4_gen1_dep8_eplb0_mtp0_ccb-UCX.yaml new file mode 100644 index 0000000000..3b4dc353de --- /dev/null +++ 
b/tests/integration/defs/perf/disagg/test_configs/disagg/perf-sanity/gb200-kimi-k2-thinking-fp4_1k1k_ctx1_dep4_gen1_dep8_eplb0_mtp0_ccb-UCX.yaml @@ -0,0 +1,95 @@ +metadata: + model_name: k2_thinking_fp4 + precision: fp4 + model_dir_name: Kimi-K2-Thinking-NVFP4 + supported_gpus: + - GB200 + script_file: disaggr_torch.slurm + benchmark_type: 1k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 5 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '4096' + input_length: 1024 + output_length: 1024 + dataset_file: datasets/perf-ci/k2_thinking-1k1k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + print_iter_log: true + max_batch_size: 512 + max_num_tokens: 512 + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: + enable_padding: true + max_batch_size: 512 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: CUTLASS + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + disable_overlap_scheduler: true + trust_remote_code: true + num_postprocess_workers: 4 + stream_interval: 20 + ctx: + print_iter_log: true + max_batch_size: 16 + max_num_tokens: 8192 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: null + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + disable_overlap_scheduler: true + trust_remote_code: true diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf-sanity/gb200-kimi-k2-thinking-fp4_1k1k_ctx1_dep4_gen1_tep4_eplb0_mtp0_ccb-UCX.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf-sanity/gb200-kimi-k2-thinking-fp4_1k1k_ctx1_dep4_gen1_tep4_eplb0_mtp0_ccb-UCX.yaml new file mode 100644 index 0000000000..bcd22793a8 --- /dev/null +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf-sanity/gb200-kimi-k2-thinking-fp4_1k1k_ctx1_dep4_gen1_tep4_eplb0_mtp0_ccb-UCX.yaml @@ -0,0 +1,94 @@ +metadata: + model_name: k2_thinking_fp4 + precision: fp4 + model_dir_name: Kimi-K2-Thinking-NVFP4 + supported_gpus: + - GB200 + script_file: disaggr_torch.slurm + benchmark_type: 1k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 10 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '4' 
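The `depN`/`tepN` tokens in these filenames distinguish how attention is parallelized at a given expert-parallel width: `enable_attention_dp: true` marks DEP (attention runs data-parallel across ranks, used by the high-concurrency throughput configs), while `enable_attention_dp: false` marks TEP (attention stays tensor-parallel, used by the low-concurrency latency configs such as this tep4 one, which drives only 4 concurrent requests). A hedged helper that recovers the label from a worker block:

```python
def parallel_label(worker: dict) -> str:
    """Reconstruct the dep/tep filename token from a worker block."""
    prefix = "dep" if worker.get("enable_attention_dp") else "tep"
    return f"{prefix}{worker['moe_expert_parallel_size']}"

assert parallel_label({"moe_expert_parallel_size": 4,
                       "enable_attention_dp": False}) == "tep4"
assert parallel_label({"moe_expert_parallel_size": 32,
                       "enable_attention_dp": True}) == "dep32"
```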
+ input_length: 1024 + output_length: 1024 + dataset_file: datasets/perf-ci/k2_thinking-1k1k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + print_iter_log: true + max_batch_size: 4 + max_num_tokens: 128 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: false + cuda_graph_config: + enable_padding: true + max_batch_size: 4 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + disable_overlap_scheduler: true + trust_remote_code: true + num_postprocess_workers: 4 + stream_interval: 20 + ctx: + print_iter_log: true + max_batch_size: 16 + max_num_tokens: 8192 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: null + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + disable_overlap_scheduler: true + trust_remote_code: true diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf-sanity/gb200-kimi-k2-thinking-fp4_8k1k_ctx1_dep4_gen1_dep16_eplb384_mtp0_ccb-UCX.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf-sanity/gb200-kimi-k2-thinking-fp4_8k1k_ctx1_dep4_gen1_dep16_eplb384_mtp0_ccb-UCX.yaml new file mode 100644 index 0000000000..1b6848ad7f --- /dev/null +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf-sanity/gb200-kimi-k2-thinking-fp4_8k1k_ctx1_dep4_gen1_dep16_eplb384_mtp0_ccb-UCX.yaml @@ -0,0 +1,98 @@ +metadata: + model_name: k2_thinking_fp4 + precision: fp4 + model_dir_name: Kimi-K2-Thinking-NVFP4 + supported_gpus: + - GB200 + script_file: disaggr_torch.slurm + benchmark_type: 8k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 5 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '4096' + input_length: 8192 + output_length: 1024 + dataset_file: datasets/perf-ci/k2_thinking-8k1k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: 
num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + print_iter_log: true + max_batch_size: 256 + max_num_tokens: 256 + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: + enable_padding: true + max_batch_size: 256 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: WIDEEP + load_balancer: + num_slots: 384 + layer_updates_per_iter: 1 + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + disable_overlap_scheduler: true + trust_remote_code: true + num_postprocess_workers: 4 + stream_interval: 20 + ctx: + print_iter_log: true + max_batch_size: 2 + max_num_tokens: 8192 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: null + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + disable_overlap_scheduler: true + trust_remote_code: true diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf-sanity/gb200-kimi-k2-thinking-fp4_8k1k_ctx1_dep4_gen1_dep32_eplb416_mtp3_ccb-UCX.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf-sanity/gb200-kimi-k2-thinking-fp4_8k1k_ctx1_dep4_gen1_dep32_eplb416_mtp3_ccb-UCX.yaml new file mode 100644 index 0000000000..8e2bf42a29 --- /dev/null +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf-sanity/gb200-kimi-k2-thinking-fp4_8k1k_ctx1_dep4_gen1_dep32_eplb416_mtp3_ccb-UCX.yaml @@ -0,0 +1,104 @@ +metadata: + model_name: k2_thinking_fp4 + precision: fp4 + model_dir_name: Kimi-K2-Thinking-NVFP4 + supported_gpus: + - GB200 + script_file: disaggr_torch.slurm + benchmark_type: 8k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 10 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '1024' + input_length: 8192 + output_length: 1024 + dataset_file: datasets/perf-ci/k2_thinking-8k1k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + print_iter_log: true + max_batch_size: 32 + max_num_tokens: 128 + tensor_parallel_size: 32 + moe_expert_parallel_size: 32 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: + enable_padding: true + max_batch_size: 32 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: WIDEEP + load_balancer: 
+ num_slots: 416 + layer_updates_per_iter: 1 + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + disable_overlap_scheduler: true + speculative_config: &id001 + decoding_type: Eagle + max_draft_len: 3 + eagle3_one_model: true + speculative_model: Kimi-K2-Thinking-NVFP4-Eagle3 + trust_remote_code: true + num_postprocess_workers: 4 + stream_interval: 20 + ctx: + print_iter_log: true + max_batch_size: 2 + max_num_tokens: 8192 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: null + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + disable_overlap_scheduler: true + speculative_config: *id001 + trust_remote_code: true diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf-sanity/gb200-kimi-k2-thinking-fp4_8k1k_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf-sanity/gb200-kimi-k2-thinking-fp4_8k1k_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX.yaml new file mode 100644 index 0000000000..531ea93e2d --- /dev/null +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf-sanity/gb200-kimi-k2-thinking-fp4_8k1k_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX.yaml @@ -0,0 +1,101 @@ +metadata: + model_name: k2_thinking_fp4 + precision: fp4 + model_dir_name: Kimi-K2-Thinking-NVFP4 + supported_gpus: + - GB200 + script_file: disaggr_torch.slurm + benchmark_type: 8k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 10 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '4' + input_length: 8192 + output_length: 1024 + dataset_file: datasets/perf-ci/k2_thinking-8k1k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + print_iter_log: true + max_batch_size: 4 + max_num_tokens: 128 + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: false + cuda_graph_config: + enable_padding: true + max_batch_size: 4 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + disable_overlap_scheduler: true + speculative_config: &id001 + decoding_type: Eagle + max_draft_len: 3 + eagle3_one_model: true + speculative_model: Kimi-K2-Thinking-NVFP4-Eagle3 + trust_remote_code: true + num_postprocess_workers: 4 + stream_interval: 20 + allreduce_strategy: MNNVL + ctx: + print_iter_log: true + max_batch_size: 2 + max_num_tokens: 8192 + tensor_parallel_size: 4 + 
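Each new filename encodes the whole scenario: GPU, model, ISL/OSL, per-role server count and parallelism, EPLB slot count, draft depth, and the cache-transceiver backend. One illustrative way to recover the fields (the regex and field names are mine, not harness code); note that in the Kimi-K2 files the `mtp3` slot is actually served by an Eagle3 draft model (`decoding_type: Eagle`, `max_draft_len: 3`) rather than MTP heads:

```python
import re

SCENARIO = re.compile(
    r"(?P<gpu>[^-]+)-(?P<model>.+)_(?P<iosl>\d+k\d+k)"
    r"_ctx(?P<ctx>\d+)_(?P<ctx_par>\w+?)"
    r"_gen(?P<gen>\d+)_(?P<gen_par>\w+?)"
    r"_eplb(?P<eplb>\d+)_mtp(?P<mtp>\d+)_ccb-(?P<ccb>\w+)\.yaml"
)

m = SCENARIO.match(
    "gb200-kimi-k2-thinking-fp4_8k1k_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX.yaml")
print(m.groupdict())
# {'gpu': 'gb200', 'model': 'kimi-k2-thinking-fp4', 'iosl': '8k1k', 'ctx': '1',
#  'ctx_par': 'dep4', 'gen': '1', 'gen_par': 'tep8', 'eplb': '0', 'mtp': '3',
#  'ccb': 'UCX'}
```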
moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: null + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + disable_overlap_scheduler: true + speculative_config: *id001 + trust_remote_code: true diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf-sanity/gb200-qwen3-235b-fp4_8k1k_ctx1_tp1_gen1_dep8_eplb0_mtp0_ccb-UCX.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf-sanity/gb200-qwen3-235b-fp4_8k1k_ctx1_tp1_gen1_dep8_eplb0_mtp0_ccb-UCX.yaml new file mode 100644 index 0000000000..1f1540fc43 --- /dev/null +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf-sanity/gb200-qwen3-235b-fp4_8k1k_ctx1_tp1_gen1_dep8_eplb0_mtp0_ccb-UCX.yaml @@ -0,0 +1,92 @@ +metadata: + model_name: qwen3_235b_a22b_fp4 + precision: fp4 + model_dir_name: Qwen3-235B-A22B-FP4 + supported_gpus: + - GB200 + script_file: disaggr_torch.slurm + benchmark_type: 8k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 10 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '1024' + input_length: 8192 + output_length: 1024 + dataset_file: datasets/perf-ci/qwen3_235b-8k1k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + print_iter_log: true + max_batch_size: 128 + max_num_tokens: 128 + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: + enable_padding: true + max_batch_size: 128 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: CUTEDSL + cache_transceiver_config: + max_tokens_in_buffer: 32768 + backend: UCX + disable_overlap_scheduler: true + num_postprocess_workers: 4 + stream_interval: 20 + ctx: + print_iter_log: true + max_batch_size: 4 + max_num_tokens: 32768 + tensor_parallel_size: 1 + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: false + cuda_graph_config: null + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 32768 + backend: UCX + disable_overlap_scheduler: true diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf-sanity/gb200-qwen3-235b-fp4_8k1k_ctx1_tp1_gen1_tep4_eplb0_mtp0_ccb-UCX.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf-sanity/gb200-qwen3-235b-fp4_8k1k_ctx1_tp1_gen1_tep4_eplb0_mtp0_ccb-UCX.yaml new 
file mode 100644 index 0000000000..20c68321bb --- /dev/null +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf-sanity/gb200-qwen3-235b-fp4_8k1k_ctx1_tp1_gen1_tep4_eplb0_mtp0_ccb-UCX.yaml @@ -0,0 +1,91 @@ +metadata: + model_name: qwen3_235b_a22b_fp4 + precision: fp4 + model_dir_name: Qwen3-235B-A22B-FP4 + supported_gpus: + - GB200 + script_file: disaggr_torch.slurm + benchmark_type: 8k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 10 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '64' + input_length: 8192 + output_length: 1024 + dataset_file: datasets/perf-ci/qwen3_235b-8k1k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + print_iter_log: true + max_batch_size: 64 + max_num_tokens: 64 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: false + cuda_graph_config: + enable_padding: true + max_batch_size: 64 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 32768 + backend: UCX + disable_overlap_scheduler: true + num_postprocess_workers: 4 + stream_interval: 20 + ctx: + print_iter_log: true + max_batch_size: 4 + max_num_tokens: 32768 + tensor_parallel_size: 1 + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: false + cuda_graph_config: null + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 32768 + backend: UCX + disable_overlap_scheduler: true diff --git a/tests/integration/defs/perf/open_search_db_utils.py b/tests/integration/defs/perf/open_search_db_utils.py index c1dd130593..53d849bc47 100644 --- a/tests/integration/defs/perf/open_search_db_utils.py +++ b/tests/integration/defs/perf/open_search_db_utils.py @@ -448,9 +448,12 @@ def prepare_regressive_test_cases(history_baseline_dict, new_data_dict): continue is_post_merge = new_data.get("b_is_post_merge", False) - baseline_id = history_baseline.get("_id", "") - - info_parts = [f"baseline_id: {baseline_id}"] + info_parts = [ + f"baseline_id: {history_baseline.get('_id', '')}", + f"baseline_branch: {history_baseline.get('s_branch', '')}", + f"baseline_commit: {history_baseline.get('s_commit', '')}", + f"baseline_date: {history_baseline.get('ts_created', '')}", + ] regressive_metrics = [] # Check all metrics and build info string for metric in MAXIMIZE_METRICS + MINIMIZE_METRICS: diff --git a/tests/integration/defs/perf/test_perf_sanity.py b/tests/integration/defs/perf/test_perf_sanity.py index 2dfb7b0515..593e6b93b4 100644 --- 
a/tests/integration/defs/perf/test_perf_sanity.py +++ b/tests/integration/defs/perf/test_perf_sanity.py @@ -56,6 +56,7 @@ MODEL_PATH_DICT = { "deepseek_v32_fp4": "DeepSeek-V3.2-Exp-FP4-v2", "gpt_oss_120b_fp4": "gpt_oss/gpt-oss-120b", "k2_thinking_fp4": "Kimi-K2-Thinking-NVFP4", + "qwen3_235b_a22b_fp4": "Qwen3/saved_models_Qwen3-235B-A22B_nvfp4_hf", # Qwen3-235B-A22B-FP4 } SUPPORTED_GPU_MAPPING = { @@ -68,6 +69,9 @@ SUPPORTED_GPU_MAPPING = { DEFAULT_TIMEOUT = 7200 +AGGR_CONFIG_FOLDER = "tests/scripts/perf-sanity" +DISAGG_CONFIG_FOLDER = "tests/integration/defs/perf/disagg/test_configs/disagg/perf-sanity" + # Regex patterns for parsing benchmark output metrics # Key is the metric name used in database (e.g., "mean_e2el", "seq_throughput") PERF_METRIC_LOG_QUERIES = { @@ -97,9 +101,20 @@ def get_model_dir(model_name: str) -> str: return "" -def get_dataset_path() -> str: - """Get dataset path for benchmark.""" - return os.path.join(llm_models_root(), "datasets", "ShareGPT_V3_unfiltered_cleaned_split.json") +def get_dataset_dir(dataset_file: Optional[str]) -> str: + """Resolve the configured dataset file to an existing path; return "" if unset or missing.""" + if not dataset_file: + return "" + + # Prefer the copy under llm_models_root(); fall back to the literal path. + llm_models_path = os.path.join(llm_models_root(), dataset_file) + if os.path.exists(llm_models_path): + return llm_models_path + elif os.path.exists(dataset_file): + return dataset_file + else: + print_info(f"Dataset file not found at {llm_models_path} or {dataset_file}") + return "" def to_env_dict(env_vars: str) -> Dict[str, str]: @@ -141,6 +156,7 @@ class ServerConfig: self.disable_overlap_scheduler = server_config_data.get("disable_overlap_scheduler", False) self.num_postprocess_workers = server_config_data.get("num_postprocess_workers", 0) self.stream_interval = server_config_data.get("stream_interval", 10) + self.print_iter_log = server_config_data.get("print_iter_log", False) self.attn_backend = server_config_data.get("attn_backend", "TRTLLM") self.enable_chunked_prefill = server_config_data.get("enable_chunked_prefill", False) self.enable_attention_dp = server_config_data.get("enable_attention_dp", False) @@ -213,6 +229,7 @@ class ServerConfig: self.eagle3_layers_to_capture = [] self.max_draft_len = speculative_config.get("max_draft_len", 0) self.speculative_model = speculative_config.get("speculative_model", "") + self.eagle3_one_model = speculative_config.get("eagle3_one_model", False) # match_mode: "config" (default) or "scenario" self.match_mode = server_config_data.get("match_mode", "config") @@ -340,6 +357,7 @@ class ServerConfig: "s_eagle3_layers_to_capture": ",".join(map(str, self.eagle3_layers_to_capture)), "l_max_draft_len": self.max_draft_len, "s_speculative_model_dir": self.speculative_model, + "b_eagle3_one_model": self.eagle3_one_model, "s_server_log_link": "", "s_server_env_var": self.env_vars, } @@ -366,7 +384,12 @@ class ServerConfig: class ClientConfig: """Configurations of benchmark client.""" - def __init__(self, client_config_data: dict, model_name: str, env_vars: str = ""): + def __init__( + self, + client_config_data: dict, + model_name: str, + env_vars: str = "", + ): self.model_name = model_name self.concurrency = client_config_data.get("concurrency", 1) self.iterations = client_config_data.get("iterations", 1) @@ -378,6 +401,7 @@ class ClientConfig: self.streaming = client_config_data.get("streaming", True) self.trust_remote_code = client_config_data.get("trust_remote_code", True)
self.model_path = "" + self.dataset_file = client_config_data.get("dataset_file", "") self.env_vars = env_vars # Generate default name if not provided @@ -389,7 +413,7 @@ class ClientConfig: """Generate benchmark command.""" model_dir = get_model_dir(self.model_name) self.model_path = model_dir if os.path.exists(model_dir) else self.model_name - dataset_path = get_dataset_path() + dataset_path = get_dataset_dir(self.dataset_file) benchmark_cmd = [ "python", "-m", @@ -398,9 +422,6 @@ class ClientConfig: self.model_path, "--tokenizer", self.model_path, - "--dataset-name", - "random", - "--random-ids", "--num-prompts", str(self.concurrency * self.iterations), "--max-concurrency", @@ -409,15 +430,27 @@ class ClientConfig: str(self.isl), "--random-output-len", str(self.osl), - "--random-range-ratio", - str(self.random_range_ratio), "--ignore-eos", + "--no-test-input", "--percentile-metrics", "ttft,tpot,itl,e2el", ] - if dataset_path and os.path.exists(dataset_path): + if dataset_path: + benchmark_cmd.append("--dataset-name") + benchmark_cmd.append("trtllm_custom") benchmark_cmd.append("--dataset-path") benchmark_cmd.append(dataset_path) + print_info(f"Dataset: {dataset_path} exists. Use trtllm_custom dataset for benchmark.") + else: + benchmark_cmd.append("--dataset-name") + benchmark_cmd.append("random") + benchmark_cmd.append("--random-ids") + benchmark_cmd.append("--random-range-ratio") + benchmark_cmd.append(str(self.random_range_ratio)) + print_info( + f"Dataset file '{self.dataset_file}' is not provided or does not exist. " + f"Use random dataset (random_range_ratio={self.random_range_ratio}) for benchmark." ) if self.backend: benchmark_cmd.append("--backend") benchmark_cmd.append(self.backend) @@ -453,6 +486,7 @@ "l_isl": self.isl, "l_osl": self.osl, "d_random_range_ratio": self.random_range_ratio, + "s_dataset_file": self.dataset_file, "s_backend": self.backend, "b_use_chat_template": self.use_chat_template, "b_streaming": self.streaming, @@ -840,7 +874,7 @@ class PerfSanityTestConfig: if is_disagg: # For disagg: disagg_upload-deepseek-r1-fp4_8k1k_ctx1_gen1_dep32_bs128_eplb0_mtp0_ccb-UCX self.runtime = "multi_node_disagg_server" - self.config_dir = "tests/integration/defs/perf/disagg/test_configs/disagg/perf" + self.config_dir = DISAGG_CONFIG_FOLDER config_base = "-".join(labels[1:]) self.config_file = ( f"{config_base}.yaml" if not config_base.endswith(".yaml") else config_base ) @@ -849,7 +883,7 @@ else: # For aggr: aggr_upload-config_yml or aggr_upload-config_yml-server_config_name self.runtime = "aggr_server" - self.config_dir = "tests/scripts/perf-sanity" + self.config_dir = AGGR_CONFIG_FOLDER config_base = labels[1] self.config_file = ( f"{config_base}.yaml" @@ -922,7 +956,9 @@ client_configs = [] for client_config_data in server_config_data["client_configs"]: client_config = ClientConfig( - client_config_data, server_config_data["model_name"], client_env_var + client_config_data, + server_config_data["model_name"], + env_vars=client_env_var, ) client_configs.append(client_config) @@ -1026,8 +1062,13 @@ "backend": "openai", "use_chat_template": False, "streaming": benchmark.get("streaming", True), + "dataset_file": benchmark.get("dataset_file", ""), } - client_config = ClientConfig(client_config_data, model_name, client_env_var) + client_config = ClientConfig( + client_config_data, + model_name, + env_vars=client_env_var, + ) client_configs.append(client_config) self.server_client_configs
= {0: client_configs} @@ -1417,9 +1458,6 @@ class PerfSanityTestConfig: AGG_TEST_TYPES = ["aggr_upload", "aggr"] DISAGG_TEST_TYPES = ["disagg_upload", "disagg"] -AGGR_CONFIG_FOLDER = "tests/scripts/perf-sanity" -DISAGG_CONFIG_FOLDER = "tests/integration/defs/perf/disagg/test_configs/disagg/perf" - def get_server_config_names(yaml_path: str) -> List[str]: """Read a YAML file and return the list of server_config names.""" diff --git a/tests/integration/test_lists/test-db/l0_dgx_b200_multi_nodes_disagg_perf_sanity_ctx1_node1_gpu4_gen1_node1_gpu8.yml b/tests/integration/test_lists/test-db/l0_dgx_b200_multi_nodes_disagg_perf_sanity_ctx1_node1_gpu4_gen1_node1_gpu8.yml new file mode 100644 index 0000000000..49fb86c52e --- /dev/null +++ b/tests/integration/test_lists/test-db/l0_dgx_b200_multi_nodes_disagg_perf_sanity_ctx1_node1_gpu4_gen1_node1_gpu8.yml @@ -0,0 +1,21 @@ +version: 0.0.1 +l0_dgx_b200_multi_nodes_disagg_perf_sanity_ctx1_node1_gpu4_gen1_node1_gpu8: +- condition: + ranges: + # 2 nodes with each node has 8 GPUs + system_gpu_count: + gte: 16 + lte: 16 + wildcards: + gpu: + - '*b200*' + terms: + stage: post_merge + backend: pytorch + tests: + - perf/test_perf_sanity.py::test_e2e[disagg_upload-b200-deepseek-r1-fp4_1k1k_ctx1_dep4_gen1_dep8_eplb0_mtp1_ccb-UCX] TIMEOUT (90) + - perf/test_perf_sanity.py::test_e2e[disagg_upload-b200-deepseek-r1-fp4_1k1k_ctx1_dep4_gen1_dep8_eplb0_mtp3_ccb-UCX] TIMEOUT (90) + - perf/test_perf_sanity.py::test_e2e[disagg_upload-b200-deepseek-r1-fp4_1k1k_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX] TIMEOUT (90) + - perf/test_perf_sanity.py::test_e2e[disagg_upload-b200-deepseek-r1-fp4_8k1k_con256_ctx1_dep4_gen1_dep8_eplb0_mtp1_ccb-UCX] TIMEOUT (90) + - perf/test_perf_sanity.py::test_e2e[disagg_upload-b200-deepseek-r1-fp4_8k1k_con1536_ctx1_dep4_gen1_dep8_eplb0_mtp1_ccb-UCX] TIMEOUT (90) + - perf/test_perf_sanity.py::test_e2e[disagg_upload-b200-deepseek-r1-fp4_8k1k_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX] TIMEOUT (90) diff --git a/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_aggr_perf_sanity_2_nodes.yml b/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_aggr_perf_sanity_node2_gpu8.yml similarity index 95% rename from tests/integration/test_lists/test-db/l0_gb200_multi_nodes_aggr_perf_sanity_2_nodes.yml rename to tests/integration/test_lists/test-db/l0_gb200_multi_nodes_aggr_perf_sanity_node2_gpu8.yml index d46b3835e2..a55a573352 100644 --- a/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_aggr_perf_sanity_2_nodes.yml +++ b/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_aggr_perf_sanity_node2_gpu8.yml @@ -1,5 +1,5 @@ version: 0.0.1 -l0_gb200_multi_nodes_aggr_perf_sanity_2_nodes: +l0_gb200_multi_nodes_aggr_perf_sanity_node2_gpu8: - condition: ranges: # 2 nodes with each node has 4 GPUs diff --git a/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_disagg_perf_sanity_3_nodes.yml b/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_disagg_perf_sanity_3_nodes.yml deleted file mode 100644 index eb0aeebd90..0000000000 --- a/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_disagg_perf_sanity_3_nodes.yml +++ /dev/null @@ -1,16 +0,0 @@ -version: 0.0.1 -l0_gb200_multi_nodes_disagg_perf_sanity_3_nodes: -- condition: - ranges: - # 3 nodes with each node has 4 GPUs - system_gpu_count: - gte: 12 - lte: 12 - wildcards: - gpu: - - '*gb200*' - terms: - stage: post_merge - backend: pytorch - tests: - - perf/test_perf_sanity.py::test_e2e[disagg_upload-deepseek-r1-fp4_1k1k_ctx1_gen1_dep8_bs768_eplb0_mtp0_ccb-UCX] TIMEOUT (90) diff 
--git a/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_disagg_perf_sanity_6_nodes.yml b/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_disagg_perf_sanity_6_nodes.yml deleted file mode 100644 index 55ad5690c6..0000000000 --- a/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_disagg_perf_sanity_6_nodes.yml +++ /dev/null @@ -1,17 +0,0 @@ -version: 0.0.1 -l0_gb200_multi_nodes_disagg_perf_sanity_6_nodes: -- condition: - ranges: - # 6 nodes with each node has 4 GPUs - system_gpu_count: - gte: 24 - lte: 24 - wildcards: - gpu: - - '*gb200*' - terms: - stage: post_merge - backend: pytorch - tests: - - perf/test_perf_sanity.py::test_e2e[disagg_upload-deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp3_ccb-UCX] TIMEOUT (90) - - perf/test_perf_sanity.py::test_e2e[disagg_upload-deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp3_ccb-NIXL] TIMEOUT (90) diff --git a/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_disagg_perf_sanity_8_nodes.yml b/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_disagg_perf_sanity_8_nodes.yml deleted file mode 100644 index 196c76a669..0000000000 --- a/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_disagg_perf_sanity_8_nodes.yml +++ /dev/null @@ -1,16 +0,0 @@ -version: 0.0.1 -l0_gb200_multi_nodes_disagg_perf_sanity_8_nodes: -- condition: - ranges: - # 8 nodes with each node has 4 GPUs - system_gpu_count: - gte: 32 - lte: 32 - wildcards: - gpu: - - '*gb200*' - terms: - stage: post_merge - backend: pytorch - tests: - - perf/test_perf_sanity.py::test_e2e[disagg_upload-deepseek-r1-fp4_8k1k_ctx1_gen1_dep32_bs128_eplb0_mtp3_ccb-UCX] TIMEOUT (90) diff --git a/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_disagg_perf_sanity_ctx1_node1_gpu1_gen1_node1_gpu4.yml b/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_disagg_perf_sanity_ctx1_node1_gpu1_gen1_node1_gpu4.yml new file mode 100644 index 0000000000..8100ebedfd --- /dev/null +++ b/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_disagg_perf_sanity_ctx1_node1_gpu1_gen1_node1_gpu4.yml @@ -0,0 +1,17 @@ +version: 0.0.1 +l0_gb200_multi_nodes_disagg_perf_sanity_ctx1_node1_gpu1_gen1_node1_gpu4: +- condition: + ranges: + # 1 ctx worker with each 1 node and 1 GPU + # 1 gen worker with each 1 node and 4 GPUs + system_gpu_count: + gte: 8 + lte: 8 + wildcards: + gpu: + - '*gb200*' + terms: + stage: post_merge + backend: pytorch + tests: + - perf/test_perf_sanity.py::test_e2e[disagg_upload-gb200-qwen3-235b-fp4_8k1k_ctx1_tp1_gen1_tep4_eplb0_mtp0_ccb-UCX] TIMEOUT (120) diff --git a/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_disagg_perf_sanity_ctx1_node1_gpu1_gen1_node2_gpu8.yml b/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_disagg_perf_sanity_ctx1_node1_gpu1_gen1_node2_gpu8.yml new file mode 100644 index 0000000000..46b48ac4f5 --- /dev/null +++ b/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_disagg_perf_sanity_ctx1_node1_gpu1_gen1_node2_gpu8.yml @@ -0,0 +1,17 @@ +version: 0.0.1 +l0_gb200_multi_nodes_disagg_perf_sanity_ctx1_node1_gpu1_gen1_node2_gpu8: +- condition: + ranges: + # 1 ctx worker with each 1 node and 1 GPU + # 1 gen worker with each 2 nodes and 8 GPUs + system_gpu_count: + gte: 12 + lte: 12 + wildcards: + gpu: + - '*gb200*' + terms: + stage: post_merge + backend: pytorch + tests: + - perf/test_perf_sanity.py::test_e2e[disagg_upload-gb200-qwen3-235b-fp4_8k1k_ctx1_tp1_gen1_dep8_eplb0_mtp0_ccb-UCX] TIMEOUT (120) diff --git 
a/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_disagg_perf_sanity_ctx1_node1_gpu4_gen1_node1_gpu4.yml b/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_disagg_perf_sanity_ctx1_node1_gpu4_gen1_node1_gpu4.yml new file mode 100644 index 0000000000..47bfe2a50b --- /dev/null +++ b/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_disagg_perf_sanity_ctx1_node1_gpu4_gen1_node1_gpu4.yml @@ -0,0 +1,19 @@ +version: 0.0.1 +l0_gb200_multi_nodes_disagg_perf_sanity_ctx1_node1_gpu4_gen1_node1_gpu4: +- condition: + ranges: + # 1 ctx worker with each 1 node and 4 GPUs + # 1 gen worker with each 1 node and 4 GPUs + system_gpu_count: + gte: 8 + lte: 8 + wildcards: + gpu: + - '*gb200*' + terms: + stage: post_merge + backend: pytorch + tests: + - perf/test_perf_sanity.py::test_e2e[disagg_upload-gb200-deepseek-r1-fp4_1k1k_ctx1_dep4_gen1_dep4_eplb0_mtp1_ccb-UCX] TIMEOUT (120) + - perf/test_perf_sanity.py::test_e2e[disagg_upload-gb200-deepseek-v32-fp4_1k1k_ctx1_dep4_gen1_dep4_eplb0_mtp1_ccb-UCX] TIMEOUT (120) + - perf/test_perf_sanity.py::test_e2e[disagg_upload-gb200-kimi-k2-thinking-fp4_1k1k_ctx1_dep4_gen1_tep4_eplb0_mtp0_ccb-UCX] TIMEOUT (120) diff --git a/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_disagg_perf_sanity_ctx1_node1_gpu4_gen1_node2_gpu8.yml b/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_disagg_perf_sanity_ctx1_node1_gpu4_gen1_node2_gpu8.yml new file mode 100644 index 0000000000..497223f5b4 --- /dev/null +++ b/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_disagg_perf_sanity_ctx1_node1_gpu4_gen1_node2_gpu8.yml @@ -0,0 +1,24 @@ +version: 0.0.1 +l0_gb200_multi_nodes_disagg_perf_sanity_ctx1_node1_gpu4_gen1_node2_gpu8: +- condition: + ranges: + # 1 ctx worker with each 1 node and 4 GPUs + # 1 gen worker with each 2 nodes and 8 GPUs + system_gpu_count: + gte: 12 + lte: 12 + wildcards: + gpu: + - '*gb200*' + terms: + stage: post_merge + backend: pytorch + tests: + - perf/test_perf_sanity.py::test_e2e[disagg_upload-gb200-deepseek-r1-fp4_1k1k_ctx1_dep4_gen1_dep8_eplb0_mtp0_ccb-UCX] TIMEOUT (120) + - perf/test_perf_sanity.py::test_e2e[disagg_upload-gb200-deepseek-r1-fp4_1k1k_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX] TIMEOUT (120) + - perf/test_perf_sanity.py::test_e2e[disagg_upload-gb200-deepseek-r1-fp4_8k1k_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX] TIMEOUT (120) + # - perf/test_perf_sanity.py::test_e2e[disagg_upload-gb200-deepseek-v32-fp4_1k1k_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX] TIMEOUT (120) + # - perf/test_perf_sanity.py::test_e2e[disagg_upload-gb200-deepseek-v32-fp4_32k4k_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX] TIMEOUT (120) + # - perf/test_perf_sanity.py::test_e2e[disagg_upload-gb200-deepseek-v32-fp4_8k1k_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX] TIMEOUT (120) + - perf/test_perf_sanity.py::test_e2e[disagg_upload-gb200-kimi-k2-thinking-fp4_1k1k_ctx1_dep4_gen1_dep8_eplb0_mtp0_ccb-UCX] TIMEOUT (120) + - perf/test_perf_sanity.py::test_e2e[disagg_upload-gb200-kimi-k2-thinking-fp4_8k1k_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX] TIMEOUT (120) diff --git a/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_disagg_perf_sanity_ctx1_node1_gpu4_gen1_node4_gpu16.yml b/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_disagg_perf_sanity_ctx1_node1_gpu4_gen1_node4_gpu16.yml new file mode 100644 index 0000000000..e05302f6ae --- /dev/null +++ b/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_disagg_perf_sanity_ctx1_node1_gpu4_gen1_node4_gpu16.yml @@ -0,0 +1,18 @@ +version: 0.0.1 
+l0_gb200_multi_nodes_disagg_perf_sanity_ctx1_node1_gpu4_gen1_node4_gpu16: +- condition: + ranges: + # 1 ctx worker with each 1 node and 4 GPUs + # 1 gen worker with each 4 nodes and 16 GPUs + system_gpu_count: + gte: 20 + lte: 20 + wildcards: + gpu: + - '*gb200*' + terms: + stage: post_merge + backend: pytorch + tests: + - perf/test_perf_sanity.py::test_e2e[disagg_upload-gb200-deepseek-r1-fp4_8k1k_ctx1_dep4_gen1_dep16_eplb0_mtp1_ccb-UCX] TIMEOUT (120) + - perf/test_perf_sanity.py::test_e2e[disagg_upload-gb200-kimi-k2-thinking-fp4_8k1k_ctx1_dep4_gen1_dep16_eplb384_mtp0_ccb-UCX] TIMEOUT (120) diff --git a/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_disagg_perf_sanity_ctx1_node1_gpu4_gen1_node8_gpu32.yml b/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_disagg_perf_sanity_ctx1_node1_gpu4_gen1_node8_gpu32.yml new file mode 100644 index 0000000000..629d42fceb --- /dev/null +++ b/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_disagg_perf_sanity_ctx1_node1_gpu4_gen1_node8_gpu32.yml @@ -0,0 +1,25 @@ +version: 0.0.1 +l0_gb200_multi_nodes_disagg_perf_sanity_ctx1_node1_gpu4_gen1_node8_gpu32: +- condition: + ranges: + # 1 ctx worker with each 1 node and 4 GPUs + # 1 gen worker with each 8 nodes and 32 GPUs + system_gpu_count: + gte: 36 + lte: 36 + wildcards: + gpu: + - '*gb200*' + terms: + stage: post_merge + backend: pytorch + tests: + - perf/test_perf_sanity.py::test_e2e[disagg_upload-gb200-deepseek-r1-fp4_1k1k_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-UCX] TIMEOUT (120) + - perf/test_perf_sanity.py::test_e2e[disagg_upload-gb200-deepseek-r1-fp4_8k1k_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-UCX] TIMEOUT (120) + - perf/test_perf_sanity.py::test_e2e[disagg_upload-gb200-deepseek-v32-fp4_1k1k_ctx1_dep4_gen1_dep32_eplb256_mtp3_ccb-UCX] TIMEOUT (120) + - perf/test_perf_sanity.py::test_e2e[disagg_upload-gb200-deepseek-v32-fp4_32k4k_ctx1_dep4_gen1_dep32_eplb288_mtp1_ccb-UCX] TIMEOUT (120) + - perf/test_perf_sanity.py::test_e2e[disagg_upload-gb200-deepseek-v32-fp4_32k4k_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-UCX] TIMEOUT (120) + - perf/test_perf_sanity.py::test_e2e[disagg_upload-gb200-deepseek-v32-fp4_8k1k_ctx1_dep4_gen1_dep32_eplb256_mtp0_ccb-UCX] TIMEOUT (120) + - perf/test_perf_sanity.py::test_e2e[disagg_upload-gb200-deepseek-v32-fp4_8k1k_ctx1_dep4_gen1_dep32_eplb256_mtp3_ccb-UCX] TIMEOUT (120) + - perf/test_perf_sanity.py::test_e2e[disagg_upload-gb200-kimi-k2-thinking-fp4_1k1k_ctx1_dep4_gen1_dep32_eplb384_mtp0_ccb-UCX] TIMEOUT (120) + - perf/test_perf_sanity.py::test_e2e[disagg_upload-gb200-kimi-k2-thinking-fp4_8k1k_ctx1_dep4_gen1_dep32_eplb416_mtp3_ccb-UCX] TIMEOUT (120) diff --git a/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_disagg_perf_sanity_ctx1_node2_gpu8_gen1_node2_gpu8.yml b/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_disagg_perf_sanity_ctx1_node2_gpu8_gen1_node2_gpu8.yml new file mode 100644 index 0000000000..df3a41d66f --- /dev/null +++ b/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_disagg_perf_sanity_ctx1_node2_gpu8_gen1_node2_gpu8.yml @@ -0,0 +1,17 @@ +version: 0.0.1 +l0_gb200_multi_nodes_disagg_perf_sanity_ctx1_node2_gpu8_gen1_node2_gpu8: +- condition: + ranges: + # 1 ctx worker with each 2 nodes and 8 GPUs + # 1 gen worker with each 2 nodes and 8 GPUs + system_gpu_count: + gte: 16 + lte: 16 + wildcards: + gpu: + - '*gb200*' + terms: + stage: post_merge + backend: pytorch + tests: + - perf/test_perf_sanity.py::test_e2e[disagg_upload-gb200-deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_tep8_eplb0_mtp3_ccb-UCX] TIMEOUT (120) diff --git 
a/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_disagg_perf_sanity_ctx1_node2_gpu8_gen1_node4_gpu16.yml b/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_disagg_perf_sanity_ctx1_node2_gpu8_gen1_node4_gpu16.yml new file mode 100644 index 0000000000..df9304b4f4 --- /dev/null +++ b/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_disagg_perf_sanity_ctx1_node2_gpu8_gen1_node4_gpu16.yml @@ -0,0 +1,17 @@ +version: 0.0.1 +l0_gb200_multi_nodes_disagg_perf_sanity_ctx1_node2_gpu8_gen1_node4_gpu16: +- condition: + ranges: + # 1 ctx worker with each 2 nodes and 8 GPUs + # 1 gen worker with each 4 nodes and 16 GPUs + system_gpu_count: + gte: 24 + lte: 24 + wildcards: + gpu: + - '*gb200*' + terms: + stage: post_merge + backend: pytorch + tests: + - perf/test_perf_sanity.py::test_e2e[disagg_upload-gb200-deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_dep16_eplb0_mtp2_ccb-UCX] TIMEOUT (120) diff --git a/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_disagg_perf_sanity_ctx1_node2_gpu8_gen1_node8_gpu32.yml b/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_disagg_perf_sanity_ctx1_node2_gpu8_gen1_node8_gpu32.yml new file mode 100644 index 0000000000..2b58246456 --- /dev/null +++ b/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_disagg_perf_sanity_ctx1_node2_gpu8_gen1_node8_gpu32.yml @@ -0,0 +1,17 @@ +version: 0.0.1 +l0_gb200_multi_nodes_disagg_perf_sanity_ctx1_node2_gpu8_gen1_node8_gpu32: +- condition: + ranges: + # 1 ctx worker with each 2 nodes and 8 GPUs + # 1 gen worker with each 8 nodes and 32 GPUs + system_gpu_count: + gte: 40 + lte: 40 + wildcards: + gpu: + - '*gb200*' + terms: + stage: post_merge + backend: pytorch + tests: + - perf/test_perf_sanity.py::test_e2e[disagg_upload-gb200-deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_dep32_eplb0_mtp3_ccb-UCX] TIMEOUT (120) diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt index bcef706097..dc164d0063 100644 --- a/tests/integration/test_lists/waives.txt +++ b/tests/integration/test_lists/waives.txt @@ -291,9 +291,7 @@ unittest/llmapi/test_mpi_session.py::test_llmapi_launch_multiple_tasks SKIP (htt accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[ep4-mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5819019) unittest/_torch/thop/serial/test_moe.py::TestMoeFp4::test_gptoss_style_nvfp4[limit1-beta0-alpha1-RoutingGPTOSS-512-512-1] SKIP (https://nvbugs/5819042) accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4_4gpus[latency_moe_trtllm_eagle3] SKIP (https://nvbugs/5819048) -perf/test_perf_sanity.py::test_e2e[aggr_upload-k2_thinking_fp4_2_nodes_grace_blackwell-k2_thinking_fp4_dep8_32k8k] SKIP (https://nvbugs/5819053) -perf/test_perf_sanity.py::test_e2e[aggr_upload-k2_thinking_fp4_2_nodes_grace_blackwell-k2_thinking_fp4_tep8_32k8k] SKIP (https://nvbugs/5819053) -perf/test_perf_sanity.py::test_e2e[aggr_upload-k2_thinking_fp4_grace_blackwell-k2_thinking_fp4_tep4_8k1k] SKIP (https://nvbugs/5820541) +accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[tep4_latency_moe_cutlass-torch_compile=False] SKIP (https://nvbugs/5819021) disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_bf16_cache_aware_balance[DeepSeek-V3-Lite-bf16] SKIP (https://nvbugs/5820576) llmapi/test_llm_examples.py::test_llmapi_tensorrt_engine SKIP (https://nvbugs/5820553) accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_auto_dtype[tp4-cuda_graph=False] SKIP 
(https://nvbugs/5820938) @@ -317,10 +315,8 @@ accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype[mtp_ accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-mtp_nextn=2-fp8kv=True-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False-sampler_async_worker=False] SKIP (https://nvbugs/5701445) accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp4[tp4-cuda_graph=True] SKIP (https://nvbugs/5820734) accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-ep4-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/5800672) -perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_2_nodes_grace_blackwell-r1_fp4_v2_tep8_mtp3] SKIP (https://nvbugs/5819053) +accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[ep4-mtp_nextn=2-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/5823284) accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-tp2pp2-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/5800672) -perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tp4_mtp3_1k8k] SKIP (https://nvbugs/5819053) -perf/test_perf_sanity.py::test_e2e[disagg_upload-deepseek-r1-fp4_1k1k_ctx1_gen1_dep8_bs768_eplb0_mtp0_ccb-UCX] SKIP (https://nvbugs/5819053) accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_bfloat16_4gpus[tp4-attn_backend=TRTLLM-torch_compile=True] SKIP (https://nvbugs/5826604) disaggregated/test_disaggregated.py::test_disaggregated_mixed[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/5834212) accuracy/test_llm_api_pytorch.py::TestGLM4_5Air::test_nvfp4_multi_gpus[throughput_trtllm] SKIP (https://nvbugs/5837275) @@ -337,7 +333,6 @@ test_e2e.py::test_openai_chat_harmony SKIP (https://nvbugs/5819444) accuracy/test_cli_flow.py::TestLlama2_7B::test_fp8 SKIP (https://nvbugs/5819452) accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[ep4-mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5800646) accuracy/test_disaggregated_serving.py::TestQwen3_30B_A3B::test_mixed_ctx_gen_model[ctxpp2gentp2] SKIP (https://nvbugs/5748664) -perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tep4_mtp3_8k1k] SKIP (https://nvbugs/5819053) examples/test_llama.py::test_llama_3_x_with_bf16_lora_torch[llama-3.2-1b-instruct] SKIP (https://nvbugs/5838178) accuracy/test_llm_api_autodeploy.py::TestNemotronSuperV3::test_bf16 SKIP (https://nvbugs/5838184) cpp/test_multi_gpu.py::test_cache_transceiver[8proc-mooncake_kvcache-90] SKIP (https://nvbugs/5838199) @@ -362,7 +357,6 @@ full:RTXPro6000D/accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bflo full:RTXPro6000D/accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-ep4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5846154) full:RTXPro6000D/accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-tp2pp2-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5846154) 
full:RTXPro6000D/accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-tp4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5846154) -perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_dep4_mtp1_1k8k] SKIP (https://nvbugs/5846166) accuracy/test_disaggregated_serving.py::TestDeepSeekV32Exp::test_auto_dtype[False] SKIP (https://nvbugs/5847284) accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v1_kv_cache-dp4-trtllm-fp8] SKIP (https://nvbugs/5850183) accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v2_kv_cache-dp4-trtllm-fp8] SKIP (https://nvbugs/5850183) diff --git a/tests/scripts/perf-sanity/deepseek_r1_fp4_v2_2_nodes_grace_blackwell.yaml b/tests/scripts/perf-sanity/deepseek_r1_fp4_v2_2_nodes_grace_blackwell.yaml index 3adee07a4e..937356b9ce 100644 --- a/tests/scripts/perf-sanity/deepseek_r1_fp4_v2_2_nodes_grace_blackwell.yaml +++ b/tests/scripts/perf-sanity/deepseek_r1_fp4_v2_2_nodes_grace_blackwell.yaml @@ -38,8 +38,8 @@ server_configs: iterations: 10 isl: 1024 osl: 1024 - random_range_ratio: 0.2 backend: "openai" + dataset_file: datasets/perf-ci/deepseek_r1-1k1k-20480-ratio-1_for_serve.json # 8k1k configs - DEP8 with CUTLASS, MTP1 - name: "r1_fp4_v2_dep8_mtp1_8k1k" @@ -74,8 +74,8 @@ server_configs: iterations: 10 isl: 8192 osl: 1024 - random_range_ratio: 0.2 backend: "openai" + dataset_file: datasets/perf-ci/deepseek_r1-8k1k-20480-ratio-1_for_serve.json # 1k1k configs - TEP8 with TRTLLM, MTP3 - name: "r1_fp4_v2_tep8_mtp3" @@ -105,5 +105,5 @@ server_configs: iterations: 12 isl: 1024 osl: 1024 - random_range_ratio: 0.8 backend: "openai" + dataset_file: datasets/perf-ci/deepseek_r1-1k1k-20480-ratio-1_for_serve.json diff --git a/tests/scripts/perf-sanity/deepseek_r1_fp4_v2_blackwell.yaml b/tests/scripts/perf-sanity/deepseek_r1_fp4_v2_blackwell.yaml index f396c1405b..3ffd67e371 100644 --- a/tests/scripts/perf-sanity/deepseek_r1_fp4_v2_blackwell.yaml +++ b/tests/scripts/perf-sanity/deepseek_r1_fp4_v2_blackwell.yaml @@ -31,8 +31,8 @@ server_configs: iterations: 10 isl: 1024 osl: 1024 - random_range_ratio: 0.2 backend: "openai" + dataset_file: datasets/perf-ci/deepseek_r1-1k1k-20480-ratio-1_for_serve.json # 1k1k configs - DEP8 with CUTLASS, MTP1 - name: "r1_fp4_v2_dep8_mtp1_1k1k" @@ -66,8 +66,8 @@ server_configs: iterations: 10 isl: 1024 osl: 1024 - random_range_ratio: 0.2 backend: "openai" + dataset_file: datasets/perf-ci/deepseek_r1-1k1k-20480-ratio-1_for_serve.json # 8k1k configs - TP4 with TRTLLM, MTP3 - name: "r1_fp4_v2_tp4_mtp3_8k1k" @@ -97,8 +97,8 @@ server_configs: iterations: 10 isl: 8192 osl: 1024 - random_range_ratio: 0.2 backend: "openai" + dataset_file: datasets/perf-ci/deepseek_r1-8k1k-20480-ratio-1_for_serve.json # 8k1k configs - DEP8 with CUTLASS, MTP1 - name: "r1_fp4_v2_dep8_mtp1_8k1k" @@ -132,5 +132,5 @@ server_configs: iterations: 10 isl: 8192 osl: 1024 - random_range_ratio: 0.2 backend: "openai" + dataset_file: datasets/perf-ci/deepseek_r1-8k1k-20480-ratio-1_for_serve.json diff --git a/tests/scripts/perf-sanity/deepseek_r1_fp4_v2_grace_blackwell.yaml b/tests/scripts/perf-sanity/deepseek_r1_fp4_v2_grace_blackwell.yaml index 3398d7b945..323a44fdfc 100644 --- a/tests/scripts/perf-sanity/deepseek_r1_fp4_v2_grace_blackwell.yaml +++ b/tests/scripts/perf-sanity/deepseek_r1_fp4_v2_grace_blackwell.yaml @@ -31,20 +31,13 @@ server_configs: decoding_type: 'MTP' num_nextn_predict_layers: 1 client_configs: - - name: 
"con2048_iter5_1k1k" - concurrency: 2048 - iterations: 5 - isl: 1024 - osl: 1024 - random_range_ratio: 0.8 - backend: "openai" - name: "con1024_iter10_1k1k" concurrency: 1024 iterations: 10 isl: 1024 osl: 1024 - random_range_ratio: 0.2 backend: "openai" + dataset_file: datasets/perf-ci/deepseek_r1-1k1k-20480-ratio-1_for_serve.json # 1k1k configs - TEP4 with TRTLLM, MTP3 - name: "r1_fp4_v2_tep4_mtp3_1k1k" @@ -74,8 +67,8 @@ server_configs: iterations: 10 isl: 1024 osl: 1024 - random_range_ratio: 0.8 backend: "openai" + dataset_file: datasets/perf-ci/deepseek_r1-1k1k-20480-ratio-1_for_serve.json # 1k1k configs - TP4 with TRTLLM, MTP3 - name: "r1_fp4_v2_tp4_mtp3_1k1k" @@ -100,20 +93,13 @@ server_configs: decoding_type: 'MTP' num_nextn_predict_layers: 3 client_configs: - - name: "con4_iter10_1k1k" - concurrency: 4 - iterations: 10 - isl: 1024 - osl: 1024 - random_range_ratio: 0.8 - backend: "openai" - name: "con2_iter10_1k1k" concurrency: 2 iterations: 10 isl: 1024 osl: 1024 - random_range_ratio: 0.2 backend: "openai" + dataset_file: datasets/perf-ci/deepseek_r1-1k1k-20480-ratio-1_for_serve.json # 8k1k configs - DEP4 with CUTLASS, MTP1 - name: "r1_fp4_v2_dep4_mtp1_8k1k" @@ -142,20 +128,13 @@ server_configs: decoding_type: 'MTP' num_nextn_predict_layers: 1 client_configs: - - name: "con2048_iter5_8k1k" - concurrency: 2048 - iterations: 5 - isl: 8192 - osl: 1024 - random_range_ratio: 0.2 - backend: "openai" - name: "con256_iter10_8k1k" concurrency: 256 iterations: 10 isl: 8192 osl: 1024 - random_range_ratio: 0.2 backend: "openai" + dataset_file: datasets/perf-ci/deepseek_r1-8k1k-20480-ratio-1_for_serve.json # 8k1k configs - TEP4 with TRTLLM, MTP3 - name: "r1_fp4_v2_tep4_mtp3_8k1k" @@ -185,8 +164,8 @@ server_configs: iterations: 10 isl: 8192 osl: 1024 - random_range_ratio: 0.2 backend: "openai" + dataset_file: datasets/perf-ci/deepseek_r1-8k1k-20480-ratio-1_for_serve.json # 8k1k configs - TP4 with TRTLLM, MTP3 - name: "r1_fp4_v2_tp4_mtp3_8k1k" @@ -211,20 +190,13 @@ server_configs: decoding_type: 'MTP' num_nextn_predict_layers: 3 client_configs: - - name: "con4_iter10_8k1k" - concurrency: 4 - iterations: 10 - isl: 8192 - osl: 1024 - random_range_ratio: 0.2 - backend: "openai" - name: "con2_iter10_8k1k" concurrency: 2 iterations: 10 isl: 8192 osl: 1024 - random_range_ratio: 0.2 backend: "openai" + dataset_file: datasets/perf-ci/deepseek_r1-8k1k-20480-ratio-1_for_serve.json # 1k8k configs - DEP4 with CUTLASS, MTP1 - name: "r1_fp4_v2_dep4_mtp1_1k8k" @@ -258,8 +230,8 @@ server_configs: iterations: 5 isl: 1024 osl: 8192 - random_range_ratio: 0.8 backend: "openai" + dataset_file: datasets/perf-ci/deepseek_r1-1k8k-20480-ratio-1_for_serve.json # 1k8k configs - TEP4 with TRTLLM, MTP3 - name: "r1_fp4_v2_tep4_mtp3_1k8k" @@ -289,8 +261,8 @@ server_configs: iterations: 10 isl: 1024 osl: 8192 - random_range_ratio: 0.8 backend: "openai" + dataset_file: datasets/perf-ci/deepseek_r1-1k8k-20480-ratio-1_for_serve.json # 1k8k configs - TP4 with TRTLLM, MTP3 - name: "r1_fp4_v2_tp4_mtp3_1k8k" @@ -320,5 +292,5 @@ server_configs: iterations: 10 isl: 1024 osl: 8192 - random_range_ratio: 0.8 backend: "openai" + dataset_file: datasets/perf-ci/deepseek_r1-1k8k-20480-ratio-1_for_serve.json diff --git a/tests/scripts/perf-sanity/deepseek_r1_fp8_blackwell.yaml b/tests/scripts/perf-sanity/deepseek_r1_fp8_blackwell.yaml index fc1ad5e7ca..f4dd195328 100644 --- a/tests/scripts/perf-sanity/deepseek_r1_fp8_blackwell.yaml +++ b/tests/scripts/perf-sanity/deepseek_r1_fp8_blackwell.yaml @@ -31,8 +31,8 @@ server_configs: iterations: 10 
isl: 1024 osl: 1024 - random_range_ratio: 0.2 backend: "openai" + dataset_file: datasets/perf-ci/deepseek_r1-1k1k-20480-ratio-1_for_serve.json # 1k1k configs - DEP8 with DEEPGEMM, MTP1 - name: "r1_fp8_dep8_mtp1_1k1k" @@ -66,8 +66,8 @@ server_configs: iterations: 10 isl: 1024 osl: 1024 - random_range_ratio: 0.2 backend: "openai" + dataset_file: datasets/perf-ci/deepseek_r1-1k1k-20480-ratio-1_for_serve.json # 8k1k configs - TP8 with TRTLLM, MTP3 - name: "r1_fp8_tp8_mtp3_8k1k" @@ -97,8 +97,8 @@ server_configs: iterations: 10 isl: 8192 osl: 1024 - random_range_ratio: 0.2 backend: "openai" + dataset_file: datasets/perf-ci/deepseek_r1-8k1k-20480-ratio-1_for_serve.json # 8k1k configs - DEP8 with DEEPGEMM, MTP1 - name: "r1_fp8_dep8_mtp1_8k1k" @@ -132,5 +132,5 @@ server_configs: iterations: 10 isl: 8192 osl: 1024 - random_range_ratio: 0.2 backend: "openai" + dataset_file: datasets/perf-ci/deepseek_r1-8k1k-20480-ratio-1_for_serve.json diff --git a/tests/scripts/perf-sanity/deepseek_v32_fp4_blackwell.yaml b/tests/scripts/perf-sanity/deepseek_v32_fp4_blackwell.yaml index e3f58d14dc..bb98ee5545 100644 --- a/tests/scripts/perf-sanity/deepseek_v32_fp4_blackwell.yaml +++ b/tests/scripts/perf-sanity/deepseek_v32_fp4_blackwell.yaml @@ -31,8 +31,8 @@ server_configs: iterations: 10 isl: 8192 osl: 1024 - random_range_ratio: 0.2 backend: "openai" + dataset_file: datasets/perf-ci/deepseek_v32-8k1k-20480-ratio-1_for_serve.json # 8k1k configs - DEP8 with CUTLASS, MTP1 - name: "v32_fp4_dep8_mtp1_8k1k" @@ -66,5 +66,5 @@ server_configs: iterations: 10 isl: 8192 osl: 1024 - random_range_ratio: 0.2 backend: "openai" + dataset_file: datasets/perf-ci/deepseek_v32-8k1k-20480-ratio-1_for_serve.json diff --git a/tests/scripts/perf-sanity/deepseek_v32_fp4_grace_blackwell.yaml b/tests/scripts/perf-sanity/deepseek_v32_fp4_grace_blackwell.yaml index 5efca68fd6..38ac77fabc 100644 --- a/tests/scripts/perf-sanity/deepseek_v32_fp4_grace_blackwell.yaml +++ b/tests/scripts/perf-sanity/deepseek_v32_fp4_grace_blackwell.yaml @@ -31,8 +31,8 @@ server_configs: iterations: 10 isl: 1024 osl: 1024 - random_range_ratio: 0.2 backend: "openai" + dataset_file: datasets/perf-ci/deepseek_v32-1k1k-20480-ratio-1_for_serve.json # 1k1k configs - DEP4 with CUTLASS, MTP1 - name: "v32_fp4_dep4_mtp1_1k1k" @@ -66,8 +66,8 @@ server_configs: iterations: 10 isl: 1024 osl: 1024 - random_range_ratio: 0.2 backend: "openai" + dataset_file: datasets/perf-ci/deepseek_v32-1k1k-20480-ratio-1_for_serve.json # 8k1k configs - TEP4 with TRTLLM, MTP3 - name: "v32_fp4_tep4_mtp3_8k1k" @@ -97,8 +97,8 @@ server_configs: iterations: 10 isl: 8192 osl: 1024 - random_range_ratio: 0.2 backend: "openai" + dataset_file: datasets/perf-ci/deepseek_v32-8k1k-20480-ratio-1_for_serve.json # 8k1k configs - DEP4 with CUTLASS, MTP1 - name: "v32_fp4_dep4_mtp1_8k1k" @@ -132,5 +132,5 @@ server_configs: iterations: 10 isl: 8192 osl: 1024 - random_range_ratio: 0.2 backend: "openai" + dataset_file: datasets/perf-ci/deepseek_v32-8k1k-20480-ratio-1_for_serve.json diff --git a/tests/scripts/perf-sanity/gpt_oss_120b_fp4_grace_blackwell.yaml b/tests/scripts/perf-sanity/gpt_oss_120b_fp4_grace_blackwell.yaml index d5993c46de..cddd185c3d 100644 --- a/tests/scripts/perf-sanity/gpt_oss_120b_fp4_grace_blackwell.yaml +++ b/tests/scripts/perf-sanity/gpt_oss_120b_fp4_grace_blackwell.yaml @@ -32,7 +32,7 @@ server_configs: iterations: 5 isl: 1024 osl: 8192 - random_range_ratio: 0.8 + random_range_ratio: 0.0 backend: "openai" - name: "gpt_oss_fp4_dep2_1k1k" @@ -63,7 +63,7 @@ server_configs: iterations: 5 isl: 1024 
osl: 1024 - random_range_ratio: 0.8 + random_range_ratio: 0.0 backend: "openai" - name: "gpt_oss_fp4_tep2_1k8k" @@ -92,7 +92,7 @@ server_configs: iterations: 10 isl: 1024 osl: 8192 - random_range_ratio: 0.8 + random_range_ratio: 0.0 backend: "openai" - name: "gpt_oss_fp4_tp2_1k8k" @@ -121,7 +121,7 @@ server_configs: iterations: 10 isl: 1024 osl: 8192 - random_range_ratio: 0.8 + random_range_ratio: 0.0 backend: "openai" - name: "gpt_oss_fp4_tp4_eagle3_1k1k" @@ -155,5 +155,5 @@ server_configs: iterations: 32 isl: 1024 osl: 1024 - random_range_ratio: 0.8 + random_range_ratio: 0.0 backend: "openai" diff --git a/tests/scripts/perf-sanity/k2_thinking_fp4_2_nodes_grace_blackwell.yaml b/tests/scripts/perf-sanity/k2_thinking_fp4_2_nodes_grace_blackwell.yaml index 780d5fa10f..d7f9a5cc72 100644 --- a/tests/scripts/perf-sanity/k2_thinking_fp4_2_nodes_grace_blackwell.yaml +++ b/tests/scripts/perf-sanity/k2_thinking_fp4_2_nodes_grace_blackwell.yaml @@ -32,9 +32,9 @@ server_configs: iterations: 10 isl: 32768 osl: 8192 - random_range_ratio: 0.2 backend: "openai" trust_remote_code: true + dataset_file: datasets/perf-ci/k2_thinking-32k8k-20480-ratio-1_for_serve.json # 32k8k configs - DEP8 with CUTLASS - name: "k2_thinking_fp4_dep8_32k8k" @@ -67,6 +67,6 @@ server_configs: iterations: 10 isl: 32768 osl: 8192 - random_range_ratio: 0.2 backend: "openai" trust_remote_code: true + dataset_file: datasets/perf-ci/k2_thinking-32k8k-20480-ratio-1_for_serve.json diff --git a/tests/scripts/perf-sanity/k2_thinking_fp4_blackwell.yaml b/tests/scripts/perf-sanity/k2_thinking_fp4_blackwell.yaml index 780be6fbe6..fa7bb13ed4 100644 --- a/tests/scripts/perf-sanity/k2_thinking_fp4_blackwell.yaml +++ b/tests/scripts/perf-sanity/k2_thinking_fp4_blackwell.yaml @@ -29,9 +29,9 @@ server_configs: iterations: 10 isl: 8192 osl: 1024 - random_range_ratio: 0.2 backend: "openai" trust_remote_code: true + dataset_file: datasets/perf-ci/k2_thinking-8k1k-20480-ratio-1_for_serve.json # 8k1k configs - DEP8 with CUTLASS - name: "k2_thinking_fp4_dep8_8k1k" @@ -63,9 +63,9 @@ server_configs: iterations: 10 isl: 8192 osl: 1024 - random_range_ratio: 0.2 backend: "openai" trust_remote_code: true + dataset_file: datasets/perf-ci/k2_thinking-8k1k-20480-ratio-1_for_serve.json # 32k8k configs - TEP8 with TRTLLM - name: "k2_thinking_fp4_tep8_32k8k" @@ -94,9 +94,9 @@ server_configs: iterations: 10 isl: 32768 osl: 8192 - random_range_ratio: 0.2 backend: "openai" trust_remote_code: true + dataset_file: datasets/perf-ci/k2_thinking-32k8k-20480-ratio-1_for_serve.json # 32k8k configs - DEP8 with CUTLASS - name: "k2_thinking_fp4_dep8_32k8k" @@ -129,6 +129,6 @@ server_configs: iterations: 10 isl: 32768 osl: 8192 - random_range_ratio: 0.2 backend: "openai" trust_remote_code: true + dataset_file: datasets/perf-ci/k2_thinking-32k8k-20480-ratio-1_for_serve.json diff --git a/tests/scripts/perf-sanity/k2_thinking_fp4_grace_blackwell.yaml b/tests/scripts/perf-sanity/k2_thinking_fp4_grace_blackwell.yaml index 6e180d2bfe..033f377acf 100644 --- a/tests/scripts/perf-sanity/k2_thinking_fp4_grace_blackwell.yaml +++ b/tests/scripts/perf-sanity/k2_thinking_fp4_grace_blackwell.yaml @@ -29,9 +29,9 @@ server_configs: iterations: 10 isl: 8192 osl: 1024 - random_range_ratio: 0.2 backend: "openai" trust_remote_code: true + dataset_file: datasets/perf-ci/k2_thinking-8k1k-20480-ratio-1_for_serve.json # 8k1k configs - DEP4 with CUTLASS - name: "k2_thinking_fp4_dep4_8k1k" @@ -63,6 +63,6 @@ server_configs: iterations: 10 isl: 8192 osl: 1024 - random_range_ratio: 0.2 backend: "openai" 
trust_remote_code: true + dataset_file: datasets/perf-ci/k2_thinking-8k1k-20480-ratio-1_for_serve.json
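
In the new disagg perf-sanity YAMLs, the gen worker's speculative_config defines a YAML anchor (&id001) that the ctx worker references via the *id001 alias, so both workers deserialize to the identical Eagle speculative-decoding block. A minimal sketch of that mechanism with PyYAML; the anchor name and trimmed key set here are illustrative, not the full config:

import yaml

# Anchor (&spec) under gen, alias (*spec) under ctx: one block, two consumers.
doc = yaml.safe_load("""
gen:
  speculative_config: &spec
    decoding_type: Eagle
    max_draft_len: 3
    eagle3_one_model: true
ctx:
  speculative_config: *spec
""")
assert doc["gen"]["speculative_config"] == doc["ctx"]["speculative_config"]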
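The test_perf_sanity.py change switches the client from always-random prompts to a configured dataset when one resolves. A compact sketch of the resolution-plus-fallback logic; models_root stands in for llm_models_root(), and the function names here are illustrative rather than the repo's API:

import os

def resolve_dataset(dataset_file: str, models_root: str) -> str:
    # Prefer the copy under the models root, then the literal path; "" means fallback.
    if not dataset_file:
        return ""
    for path in (os.path.join(models_root, dataset_file), dataset_file):
        if os.path.exists(path):
            return path
    return ""

def dataset_args(dataset_file: str, models_root: str, random_range_ratio: float) -> list:
    # Mirrors the benchmark_cmd branch: trtllm_custom when a dataset resolves,
    # random prompts with the configured range ratio otherwise.
    path = resolve_dataset(dataset_file, models_root)
    if path:
        return ["--dataset-name", "trtllm_custom", "--dataset-path", path]
    return ["--dataset-name", "random", "--random-ids",
            "--random-range-ratio", str(random_range_ratio)]

With dataset_file set to one of the datasets/perf-ci/*_for_serve.json files above, the client pins an exact prompt distribution instead of sampling random lengths, which is why the random_range_ratio entries could be dropped from the aggr YAMLs.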
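Each new l0_gb200_multi_nodes_disagg_perf_sanity_ctx*_gen*.yml gates on an exact system_gpu_count equal to the ctx GPUs plus the gen GPUs encoded in the file name. The arithmetic, as a hypothetical helper (the name and signature are not part of this change):

def required_gpu_count(ctx_workers: int, ctx_gpus: int,
                       gen_workers: int, gen_gpus: int) -> int:
    # Both the gte and lte bounds of system_gpu_count use this total.
    return ctx_workers * ctx_gpus + gen_workers * gen_gpus

# ctx1_node1_gpu4_gen1_node2_gpu8 -> 4 + 8 = 12 GPUs (3 GB200 nodes at 4 GPUs each)
assert required_gpu_count(1, 4, 1, 8) == 12
# ctx1_node2_gpu8_gen1_node8_gpu32 -> 8 + 32 = 40 GPUs (10 nodes)
assert required_gpu_count(1, 8, 1, 32) == 40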