Mirror of https://github.com/NVIDIA/TensorRT-LLM.git, synced 2026-02-16 07:53:55 +08:00

[TRTLLM-8263][feat] Add Disagg Perf Tests (#10912)

Signed-off-by: Chenfei Zhang <chenfeiz@nvidia.com>

Parent: 588db0ed64 · Commit: 04b7db3ab5
@@ -918,7 +918,7 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
     // Create a unique suffix for the job name
     String customSuffix = "${env.BUILD_TAG}-${UUID.randomUUID().toString().replaceAll("-", "").substring(0, 6)}".toLowerCase()
     def jobUID = "${cluster.host}-multi_node_test-${customSuffix}"
-    def disaggMode = stageName.contains("PerfSanity-Disagg")
+    def disaggMode = stageName.contains("Disagg-PerfSanity")

    Utils.exec(pipeline, script: "env | sort && pwd && ls -alh")

@@ -3151,6 +3151,15 @@ def runInKubernetes(pipeline, podSpec, containerName)
     }
 }

+def buildStageConfigs(stageName, platform, testlist, testCount, gpuCount, nodeCount, runWithSbatch=false) {
+    def configs = [:]
+    for (int k = 1; k <= testCount; k++) {
+        def key = "${stageName}-${k}"
+        configs[key] = [platform, testlist, k, testCount, gpuCount, nodeCount, runWithSbatch]
+    }
+    return configs
+}
+
 def launchTestJobs(pipeline, testFilter)
 {
     // IMPORTANT: Stage Configuration Syntax Requirement
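The new `buildStageConfigs` helper replaces the hand-enumerated per-shard map entries seen in the next hunk. A minimal Python sketch of the same fan-out, with snake_case names invented for illustration:

```python
def build_stage_configs(stage_name, platform, testlist, test_count,
                        gpu_count, node_count, run_with_sbatch=False):
    # One entry per shard k, keyed "<stage_name>-<k>", mirroring the Groovy helper.
    return {
        f"{stage_name}-{k}": [platform, testlist, k, test_count,
                              gpu_count, node_count, run_with_sbatch]
        for k in range(1, test_count + 1)
    }

# Five shards of the aggregated perf-sanity list on 2 nodes / 8 GPUs:
configs = build_stage_configs(
    "GB200-8_GPUs-2_Nodes-PyTorch-Aggr-PerfSanity-Node2-GPU8-Post-Merge",
    "auto:gb200-flex", "l0_gb200_multi_nodes_aggr_perf_sanity_node2_gpu8",
    5, 8, 2)
assert len(configs) == 5
```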
@@ -3354,18 +3363,57 @@ def launchTestJobs(pipeline, testFilter)
         "GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-1": ["auto:gb200-flex", "l0_gb200_multi_nodes", 1, 3, 8, 2],
         "GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-2": ["auto:gb200-flex", "l0_gb200_multi_nodes", 2, 3, 8, 2],
         "GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-3": ["auto:gb200-flex", "l0_gb200_multi_nodes", 3, 3, 8, 2],
-        // PerfSanity post-merge tests
-        "GB200-8_GPUs-2_Nodes-PyTorch-PerfSanity-Post-Merge-1": ["auto:gb200-flex", "l0_gb200_multi_nodes_aggr_perf_sanity_2_nodes", 1, 5, 8, 2],
-        "GB200-8_GPUs-2_Nodes-PyTorch-PerfSanity-Post-Merge-2": ["auto:gb200-flex", "l0_gb200_multi_nodes_aggr_perf_sanity_2_nodes", 2, 5, 8, 2],
-        "GB200-8_GPUs-2_Nodes-PyTorch-PerfSanity-Post-Merge-3": ["auto:gb200-flex", "l0_gb200_multi_nodes_aggr_perf_sanity_2_nodes", 3, 5, 8, 2],
-        "GB200-8_GPUs-2_Nodes-PyTorch-PerfSanity-Post-Merge-4": ["auto:gb200-flex", "l0_gb200_multi_nodes_aggr_perf_sanity_2_nodes", 4, 5, 8, 2],
-        "GB200-8_GPUs-2_Nodes-PyTorch-PerfSanity-Post-Merge-5": ["auto:gb200-flex", "l0_gb200_multi_nodes_aggr_perf_sanity_2_nodes", 5, 5, 8, 2],
-        // Disable stage 'GB200-12_GPUs-3_Nodes-PyTorch-PerfSanity-Disagg-Post-Merge-1' due to https://nvbugs/5819053
-        // "GB200-12_GPUs-3_Nodes-PyTorch-PerfSanity-Disagg-Post-Merge-1": ["auto:gb200-flex", "l0_gb200_multi_nodes_disagg_perf_sanity_3_nodes", 1, 1, 12, 3],
-        // "GB200-24_GPUs-6_Nodes-PyTorch-PerfSanity-Disagg-Post-Merge-1": ["auto:gb200-flex", "l0_gb200_multi_nodes_disagg_perf_sanity_6_nodes", 1, 2, 24, 6],
-        // "GB200-24_GPUs-6_Nodes-PyTorch-PerfSanity-Disagg-Post-Merge-2": ["auto:gb200-flex", "l0_gb200_multi_nodes_disagg_perf_sanity_6_nodes", 2, 2, 24, 6],
-        // "GB200-32_GPUs-8_Nodes-PyTorch-PerfSanity-Disagg-Post-Merge-1": ["auto:gb200-flex", "l0_gb200_multi_nodes_disagg_perf_sanity_8_nodes", 1, 1, 32, 8],
     ]
+    // PerfSanity post-merge aggr tests
+    multiNodesSBSAConfigs += buildStageConfigs(
+        "GB200-8_GPUs-2_Nodes-PyTorch-Aggr-PerfSanity-Node2-GPU8-Post-Merge",
+        "auto:gb200-flex",
+        "l0_gb200_multi_nodes_aggr_perf_sanity_node2_gpu8",
+        5,
+        8,
+        2
+    )
+    // PerfSanity post-merge disagg tests
+    multiNodesSBSAConfigs += buildStageConfigs(
+        "GB200-8_GPUs-2_Nodes-PyTorch-Disagg-PerfSanity-CTX1-NODE1-GPU1-GEN1-NODE1-GPU4-Post-Merge",
+        "auto:gb200-flex",
+        "l0_gb200_multi_nodes_disagg_perf_sanity_ctx1_node1_gpu1_gen1_node1_gpu4",
+        1,
+        8,
+        2
+    )
+    multiNodesSBSAConfigs += buildStageConfigs(
+        "GB200-8_GPUs-2_Nodes-PyTorch-Disagg-PerfSanity-CTX1-NODE1-GPU4-GEN1-NODE1-GPU4-Post-Merge",
+        "auto:gb200-flex",
+        "l0_gb200_multi_nodes_disagg_perf_sanity_ctx1_node1_gpu4_gen1_node1_gpu4",
+        3,
+        8,
+        2
+    )
+    multiNodesSBSAConfigs += buildStageConfigs(
+        "GB200-12_GPUs-3_Nodes-PyTorch-Disagg-PerfSanity-CTX1-NODE1-GPU1-GEN1-NODE2-GPU8-Post-Merge",
+        "auto:gb200-flex",
+        "l0_gb200_multi_nodes_disagg_perf_sanity_ctx1_node1_gpu1_gen1_node2_gpu8",
+        1,
+        12,
+        3
+    )
+    multiNodesSBSAConfigs += buildStageConfigs(
+        "GB200-12_GPUs-3_Nodes-PyTorch-Disagg-PerfSanity-CTX1-NODE1-GPU4-GEN1-NODE2-GPU8-Post-Merge",
+        "auto:gb200-flex",
+        "l0_gb200_multi_nodes_disagg_perf_sanity_ctx1_node1_gpu4_gen1_node2_gpu8",
+        5,
+        12,
+        3
+    )
+    multiNodesSBSAConfigs += buildStageConfigs(
+        "GB200-16_GPUs-4_Nodes-PyTorch-Disagg-PerfSanity-CTX1-NODE2-GPU8-GEN1-NODE2-GPU8-Post-Merge",
+        "auto:gb200-flex",
+        "l0_gb200_multi_nodes_disagg_perf_sanity_ctx1_node2_gpu8_gen1_node2_gpu8",
+        1,
+        16,
+        4
+    )
     fullSet += multiNodesSBSAConfigs.keySet()

     if (env.targetArch == AARCH64_TRIPLE) {
@@ -3610,9 +3658,9 @@ def launchTestJobs(pipeline, testFilter)
        }, {}, true)
    }]}

-    multiGpuJobs = parallelJobs.findAll{(it.key.contains("2_GPUs") || it.key.contains("4_GPUs") || it.key.contains("8_GPUs")) && !it.key.contains("Post-Merge")}
+    multiGpuJobs = parallelJobs.findAll{(it.key =~ /\d+_GPUs/) && !it.key.contains("Post-Merge")}
     println multiGpuJobs.keySet()
-    multiGpuJobsPostMerge = parallelJobs.findAll{(it.key.contains("2_GPUs") || it.key.contains("4_GPUs") || it.key.contains("8_GPUs")) && it.key.contains("Post-Merge")}
+    multiGpuJobsPostMerge = parallelJobs.findAll{(it.key =~ /\d+_GPUs/) && it.key.contains("Post-Merge")}

     parallelJobs += docBuildJobs
     parallelJobs += sanityCheckJobs
@@ -3927,9 +3975,9 @@ pipeline {
         def testPhase2StageName = env.testPhase2StageName
         if (testPhase2StageName) {
-            def dgxSigns = ["2_GPUs", "4_GPUs", "8_GPUs"]
-            singleGpuJobs = parallelJobs.findAll{!dgxSigns.any{sign -> it.key.contains(sign)}}
-            dgxJobs = parallelJobs.findAll{dgxSigns.any{sign -> it.key.contains(sign)}}
+            def multiGpuPattern = /\d+_GPUs/
+            singleGpuJobs = parallelJobs.findAll{!(it.key =~ multiGpuPattern)}
+            dgxJobs = parallelJobs.findAll{it.key =~ multiGpuPattern}
         }

         if (env.JOB_NAME ==~ /.*Single-GPU.*/) {

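Both this hunk and the `multiGpuJobs` hunk above swap a fixed `2_GPUs/4_GPUs/8_GPUs` sign list for the `\d+_GPUs` pattern, so the new 12-, 16-, 24-, and 32-GPU disagg stages are bucketed as multi-GPU jobs as well. A quick check of the same pattern in Python (stage keys shortened for illustration):

```python
import re

multi_gpu_pattern = re.compile(r"\d+_GPUs")

keys = [
    "B200-PyTorch-1",                                                # single GPU: no match
    "GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-1",                     # matched by old and new logic
    "GB200-16_GPUs-4_Nodes-PyTorch-Disagg-PerfSanity-Post-Merge-1",  # missed by the old sign list
]
assert [bool(multi_gpu_pattern.search(k)) for k in keys] == [False, True, True]
```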
@@ -19,13 +19,13 @@ echo "Installation completed on all nodes"
 # Start gen servers
 echo "Starting gen servers..."
 for i in $(seq 0 $((numGenServers - 1))); do
-    gen_world_size=$((nodesPerGenServer * gpusPerNode))
+    gen_world_size=$((nodesPerGenServer * gpusPerfNodePerfGenServer))
     export DISAGG_SERVING_TYPE="GEN_$i"
     export pytestCommand="$pytestCommandWorker"
     srun "${srunArgs[@]}" --kill-on-bad-exit=1 \
         -N $nodesPerGenServer \
         --ntasks=$gen_world_size \
-        --ntasks-per-node=$gpusPerNode \
+        --ntasks-per-node=$gpusPerfNodePerfGenServer \
         $runScript &> $jobWorkspace/gen_server_$i.log &
     echo "Started gen server $i"
 done
@@ -34,13 +34,13 @@ done
 if [ "${TRTLLM_DISAGG_BENCHMARK_GEN_ONLY:-0}" != "1" ]; then
     echo "Starting ctx servers..."
     for i in $(seq 0 $((numCtxServers - 1))); do
-        ctx_world_size=$((nodesPerCtxServer * gpusPerNode))
+        ctx_world_size=$((nodesPerCtxServer * gpusPerfNodePerfCtxServer))
         export DISAGG_SERVING_TYPE="CTX_$i"
         export pytestCommand="$pytestCommandWorker"
         srun "${srunArgs[@]}" --kill-on-bad-exit=1 \
             -N $nodesPerCtxServer \
-            --ntasks=$ctx_world_size \
-            --ntasks-per-node=$gpusPerNode \
+            --ntasks=$ctx_world_size \
+            --ntasks-per-node=$gpusPerfNodePerfCtxServer \
             $runScript &> $jobWorkspace/ctx_server_$i.log &
         echo "Started ctx server $i"
     done

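The sizing fix matters whenever a ctx/gen server occupies fewer GPUs than a node offers: sizing ranks by `gpusPerNode` would launch more ranks than the server actually has GPUs. Illustrative arithmetic (numbers assumed):

```python
gpus_per_node = 8           # physical GPUs per node
gpus_per_ctx_server = 4     # a ctx server that only needs half a node
nodes_per_ctx_server = 1

old_world_size = nodes_per_ctx_server * gpus_per_node  # 8 ranks: oversubscribed
new_world_size = nodes_per_ctx_server * min(gpus_per_ctx_server, gpus_per_node)
assert (old_world_size, new_world_size) == (8, 4)      # one rank per GPU actually used
```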
@@ -38,6 +38,9 @@ def get_hardware_config(config, benchmark_mode):
     nodes_per_ctx_server = (gpus_per_ctx_server + gpus_per_node - 1) // gpus_per_node
     nodes_per_gen_server = (gpus_per_gen_server + gpus_per_node - 1) // gpus_per_node

+    gpus_per_node_per_ctx_server = min(gpus_per_ctx_server, gpus_per_node)
+    gpus_per_node_per_gen_server = min(gpus_per_gen_server, gpus_per_node)
+
     total_nodes = num_ctx_servers * nodes_per_ctx_server + num_gen_servers * nodes_per_gen_server
     total_gpus = total_nodes * gpus_per_node

@@ -49,6 +52,8 @@ def get_hardware_config(config, benchmark_mode):
         "gpus_per_gen_server": gpus_per_gen_server,
         "nodes_per_ctx_server": nodes_per_ctx_server,
         "nodes_per_gen_server": nodes_per_gen_server,
+        "gpus_per_node_per_ctx_server": gpus_per_node_per_ctx_server,
+        "gpus_per_node_per_gen_server": gpus_per_node_per_gen_server,
         "total_nodes": total_nodes,
         "total_gpus": total_gpus,
     }
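Worked example of the two formulas above, for servers placed on 8-GPU nodes (sizes assumed):

```python
gpus_per_node = 8
for gpus_per_server in (4, 16):
    # Ceiling division: how many nodes the server spans.
    nodes_per_server = (gpus_per_server + gpus_per_node - 1) // gpus_per_node
    # Ranks per node: a sub-node server uses only its own GPUs;
    # a multi-node server fills each node it occupies.
    gpus_per_node_per_server = min(gpus_per_server, gpus_per_node)
    print(gpus_per_server, "->", nodes_per_server, "node(s) x", gpus_per_node_per_server, "ranks")
# 4 -> 1 node(s) x 4 ranks; 16 -> 2 node(s) x 8 ranks
```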
@@ -102,7 +107,14 @@ def remove_whitespace_lines(lines):
     return [line.strip() for line in lines if line.strip()]


-def get_pytest_command_no_llmapilaunch(script_prefix_lines):
+def get_pytest_commands(script_prefix_lines):
+    # Get worker, disagg_server, benchmark pytest commands from pytest command.
+    # Worker pytest command is pytest command with trtllm-llmapi-launch and
+    # without --csv, --cov, --periodic flags.
+    # Disagg_server pytest command is pytest command without trtllm-llmapi-launch
+    # and without --csv, --cov, --periodic flags.
+    # Benchmark pytest command is pytest command without trtllm-llmapi-launch
+    # and with --csv, --cov, --periodic flags.
     pytest_command_line = None
     for line in script_prefix_lines:
         if "export pytestCommand=" in line:
@@ -110,17 +122,102 @@ def get_pytest_command_no_llmapilaunch(script_prefix_lines):
             break

     if not pytest_command_line:
-        return ""
+        return "", "", ""

-    # Replace pytestCommand with pytestCommandNoLLMAPILaunch
-    replaced_line = pytest_command_line.replace("pytestCommand", "pytestCommandNoLLMAPILaunch")
-
-    # Split by space, find and remove the substring with trtllm-llmapi-launch
-    replaced_line_parts = replaced_line.split()
-    replaced_line_parts_no_llmapi = [
-        part for part in replaced_line_parts if "trtllm-llmapi-launch" not in part
-    ]
-    return " ".join(replaced_line_parts_no_llmapi)
+    def split_pytest_command_line(command_line):
+        # After pytest, there are six types of substrings:
+        # Type 1: --xxx=yyy (long option with value, self-contained)
+        # Type 2: --xxx= (long option with empty value, self-contained)
+        # Type 3: --xxx (long option flag, no value)
+        # Type 4: --xxx yyy (long option with value as next arg)
+        # Type 5: -x yyy (short single-letter option with value as next arg)
+        # Type 6: -x (short option flag, e.g., -v, -vv)
+        parts = command_line.split()
+        pytest_index = None
+        for idx, part in enumerate(parts):
+            if "pytest" == part:
+                pytest_index = idx
+                break
+        if pytest_index is None:
+            return parts
+
+        grouped_parts = parts[: pytest_index + 1]
+        i = pytest_index + 1
+        while i < len(parts):
+            part = parts[i]
+            has_next = i + 1 < len(parts)
+            next_is_value = has_next and not parts[i + 1].startswith("-")
+
+            # Type 1 & 2: --xxx=yyy or --xxx= (self-contained, has '=')
+            if part.startswith("--") and "=" in part:
+                grouped_parts.append(part)
+                i += 1
+                continue
+
+            # Type 4: --xxx yyy (long option with value as next arg)
+            if part.startswith("--") and next_is_value:
+                grouped_parts.append(f"{part} {parts[i + 1]}")
+                i += 2
+                continue
+
+            # Type 3: --xxx (long option flag)
+            if part.startswith("--"):
+                grouped_parts.append(part)
+                i += 1
+                continue
+
+            # Type 5: -x yyy (short single-letter option with value as next arg)
+            # Only single letter after dash, e.g., -o, not -vv
+            if part.startswith("-") and len(part) == 2 and next_is_value:
+                grouped_parts.append(f"{part} {parts[i + 1]}")
+                i += 2
+                continue
+
+            # Type 6: -x (short option flag, including combined like -vv)
+            if part.startswith("-"):
+                grouped_parts.append(part)
+                i += 1
+                continue
+
+            # Other parts (shouldn't happen after pytest, but handle gracefully)
+            grouped_parts.append(part)
+            i += 1
+
+        return grouped_parts
+
+    def is_llmapi_launch(part):
+        return "trtllm-llmapi-launch" in part
+
+    def is_output_file_part(part):
+        return any(flag in part for flag in ("--csv", "--cov", "--periodic"))
+
+    worker_line = pytest_command_line.replace("pytestCommand", "partialPytestCommandWorker")
+    worker_parts = [
+        part for part in split_pytest_command_line(worker_line) if not is_output_file_part(part)
+    ]
+    worker_pytest_command = " ".join(worker_parts)
+
+    disagg_server_line = pytest_command_line.replace(
+        "pytestCommand", "partialPytestCommandDisaggServer"
+    )
+    disagg_server_parts = [
+        part
+        for part in split_pytest_command_line(disagg_server_line)
+        if not is_llmapi_launch(part) and not is_output_file_part(part)
+    ]
+    disagg_server_pytest_command = " ".join(disagg_server_parts)
+
+    benchmark_line = pytest_command_line.replace("pytestCommand", "partialPytestCommandBenchmark")
+    benchmark_parts = [
+        part for part in split_pytest_command_line(benchmark_line) if not is_llmapi_launch(part)
+    ]
+    benchmark_pytest_command = " ".join(benchmark_parts)
+
+    return (
+        worker_pytest_command,
+        disagg_server_pytest_command,
+        benchmark_pytest_command,
+    )


 def get_config_yaml(test_list_path, llm_src):
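The grouping matters because an option and its value must be dropped or kept together; filtering raw whitespace tokens would leave an orphaned value behind. A standalone sketch of the same idea, using the same heuristics as `split_pytest_command_line` (sample command invented for illustration):

```python
def split_cli(parts):
    """Group an argv tail so an option and its value travel as one unit."""
    grouped, i = [], 0
    while i < len(parts):
        p = parts[i]
        nxt = parts[i + 1] if i + 1 < len(parts) else None
        # '--opt value' or single-letter '-o value'; '--opt=value' is self-contained.
        takes_value = (nxt is not None and not nxt.startswith("-") and "=" not in p
                       and (p.startswith("--") or (p.startswith("-") and len(p) == 2)))
        grouped.append(f"{p} {nxt}" if takes_value else p)
        i += 2 if takes_value else 1
    return grouped

tail = "--csv results.csv --timeout=3600 -vv test_perf.py".split()
grouped = split_cli(tail)
assert grouped == ["--csv results.csv", "--timeout=3600", "-vv", "test_perf.py"]
# Filtering whole groups keeps option/value pairs intact:
assert [g for g in grouped if "--csv" not in g] == ["--timeout=3600", "-vv", "test_perf.py"]
```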
@@ -153,7 +250,7 @@ def get_config_yaml(test_list_path, llm_src):
         "disagg",
         "test_configs",
         "disagg",
-        "perf",
+        "perf-sanity",
         f"{config_base_name}.yaml",
     )
     if not os.path.exists(config_yaml_path):
@@ -225,8 +322,12 @@ def main():

     srun_args_lines = srun_args_content.split()

-    # Extract pytestCommand and generate pytestCommandNoLLMAPILaunch
-    pytest_command_no_llmapi_launch = get_pytest_command_no_llmapilaunch(script_prefix_lines)
+    # Extract pytestCommand and generate partial pytest commands
+    (
+        worker_pytest_command,
+        disagg_server_pytest_command,
+        benchmark_pytest_command,
+    ) = get_pytest_commands(script_prefix_lines)

     # Build worker env vars, add extra env vars for gen_only mode
     worker_env_vars = env_config["worker_env_var"]
@@ -244,12 +345,15 @@ def main():

     script_prefix_lines.extend(
         [
-            pytest_command_no_llmapi_launch,
-            f'export pytestCommandWorker="unset UCX_TLS && {worker_env_vars} $pytestCommand"',
-            f'export pytestCommandDisaggServer="{server_env_vars} $pytestCommandNoLLMAPILaunch"',
-            f'export pytestCommandBenchmark="{env_config["benchmark_env_var"]} $pytestCommandNoLLMAPILaunch"',
+            worker_pytest_command,
+            disagg_server_pytest_command,
+            benchmark_pytest_command,
+            f'export pytestCommandWorker="unset UCX_TLS && {worker_env_vars} $partialPytestCommandWorker"',
+            f'export pytestCommandDisaggServer="{server_env_vars} $partialPytestCommandDisaggServer"',
+            f'export pytestCommandBenchmark="{env_config["benchmark_env_var"]} $partialPytestCommandBenchmark"',
             f"export runScript={args.run_sh}",
             f"export installScript={install_script}",
             f"export configYamlPath={config_yaml}",
             f"export numCtxServers={hardware_config['num_ctx_servers']}",
             f"export numGenServers={hardware_config['num_gen_servers']}",
             f"export gpusPerNode={hardware_config['gpus_per_node']}",
@@ -257,6 +361,8 @@ def main():
             f"export gpusPerGenServer={hardware_config['gpus_per_gen_server']}",
             f"export nodesPerCtxServer={hardware_config['nodes_per_ctx_server']}",
             f"export nodesPerGenServer={hardware_config['nodes_per_gen_server']}",
+            f"export gpusPerfNodePerfCtxServer={hardware_config['gpus_per_node_per_ctx_server']}",
+            f"export gpusPerfNodePerfGenServer={hardware_config['gpus_per_node_per_gen_server']}",
             f"export totalNodes={hardware_config['total_nodes']}",
             f"export totalGpus={hardware_config['total_gpus']}",
         ]

@@ -13,8 +13,9 @@ sys.path.insert(0, sys.path[0] + "/..")
 from open_search_db import OpenSearchDB

 QUERY_LOOKBACK_DAYS = 90
+LOOKBACK_JOBS = 30
 MAX_QUERY_SIZE = 3000
-MAX_TEST_CASES_PER_MSG = 5
+MAX_TEST_CASES_PER_MSG = 4
 POST_SLACK_MSG_RETRY_TIMES = 5


@@ -99,42 +100,74 @@ def post_perf_data(data_list, project_name):
     return False


-def get_regression_data_by_job_id(data_list, query_job_number):
-    """Returns a dict with job_id as key and list of regression data as value.
+def get_regression_dict(data_list, query_job_number, lookback_job_number=LOOKBACK_JOBS):
+    """Returns a dict with job_id as key and list of regression tuples as value.

+    Each tuple is (test_case_name, gpu_type, runtime, history_regression_job_ids, data).
     Only returns the latest query_job_number jobs.
     """
     if data_list is None or len(data_list) == 0:
         return {}

     # Group data by job_id
-    job_data_dict = {}
+    job_test_dict = {}
     for data in data_list:
-        job_id = data.get("s_job_id", "")
-        if job_id == "":
+        raw_job_id = data.get("s_job_id", "")
+        if raw_job_id == "":
             continue
-        if job_id not in job_data_dict:
-            job_data_dict[job_id] = []
-        job_data_dict[job_id].append(data)
+        try:
+            job_id = int(raw_job_id)
+        except (TypeError, ValueError):
+            continue
+        job_test_dict.setdefault(job_id, []).append(data)

-    # Sort job_ids by the latest ts_created in each group (descending)
-    def get_latest_timestamp(job_id):
-        timestamps = [d.get("ts_created", 0) for d in job_data_dict[job_id]]
-        return max(timestamps) if timestamps else 0
+    if not job_test_dict:
+        return {}

-    sorted_job_ids = sorted(job_data_dict.keys(), key=get_latest_timestamp, reverse=True)
+    # Sort job_ids (descending: latest -> oldest)
+    sorted_job_id_list = sorted(job_test_dict.keys(), reverse=True)

-    # Only keep the latest query_job_number jobs
-    latest_job_ids = sorted_job_ids[:query_job_number]
+    # Build (test_case_name, gpu_type, runtime) -> job_ids dict
+    test_job_dict = {}
+    for job_id, data_list in job_test_dict.items():
+        for data in data_list:
+            test_case_name = data.get("s_test_case_name") or ""
+            gpu_type = data.get("s_gpu_type") or ""
+            runtime = data.get("s_runtime") or ""
+            if not test_case_name or not gpu_type or not runtime:
+                continue
+            key = (test_case_name, gpu_type, runtime)
+            test_job_dict.setdefault(key, set()).add(job_id)

-    result = {}
+    # Sort job ids for each test case (descending: latest -> oldest)
+    for key, job_id_set in list(test_job_dict.items()):
+        test_job_dict[key] = sorted(job_id_set, reverse=True)
+
+    # Only keep the latest query_job_number jobs in the result
+    latest_job_ids = sorted_job_id_list[:query_job_number]
+
+    regression_dict = {}
     for job_id in latest_job_ids:
-        result[job_id] = job_data_dict[job_id]
+        entries = []
+        for data in job_test_dict.get(job_id, []):
+            test_case_name = data.get("s_test_case_name") or ""
+            gpu_type = data.get("s_gpu_type") or ""
+            runtime = data.get("s_runtime") or ""
+            if not test_case_name or not gpu_type or not runtime:
+                continue
+            key = (test_case_name, gpu_type, runtime)
+            history_ids = test_job_dict.get(key, [])
+            lower_bound = job_id - lookback_job_number + 1
+            history_regression_job_ids = [
+                jid for jid in history_ids if lower_bound <= jid <= job_id
+            ]
+            entries.append((test_case_name, gpu_type, runtime, history_regression_job_ids, data))
+        regression_dict[job_id] = entries

-    return result
+    return regression_dict


-def process_regression_message(regression_dict):
+def split_regression_message(regression_dict):
     """Process regression data into message chunks.

     Returns a list of messages, each containing at most MAX_TEST_CASES_PER_MSG test cases.
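With `LOOKBACK_JOBS = 30`, the history window for a regression reported in job N covers jobs N-29 through N. A small illustration (job IDs invented):

```python
lookback_job_number = 30
# Post-merge job IDs (newest first) in which one test case regressed:
history_ids = [120, 118, 95, 80]

job_id = 120                                     # job being reported
lower_bound = job_id - lookback_job_number + 1   # 91
recent = [jid for jid in history_ids if lower_bound <= jid <= job_id]
assert recent == [120, 118, 95]                  # 80 falls outside the 30-job window
```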
@@ -142,12 +175,17 @@ def process_regression_message(regression_dict):
     if not regression_dict:
         return []

-    # Flatten all test cases into a list with (job_id, idx, data) tuples
+    # Flatten all test cases into a list with
+    # (job_id, idx, test_case_name, gpu_type, runtime, history_regression_job_ids, data) tuples
     all_test_cases = []
     for job_id, data_list in regression_dict.items():
-        sorted_data_list = sorted(data_list, key=lambda x: x.get("s_test_case_name", ""))
-        for idx, data in enumerate(sorted_data_list, start=1):
-            all_test_cases.append((job_id, idx, data))
+        sorted_data_list = sorted(data_list, key=lambda x: x[0])
+        for idx, (test_case_name, gpu_type, runtime, history_regression_job_ids, data) in enumerate(
+            sorted_data_list, start=1
+        ):
+            all_test_cases.append(
+                (job_id, idx, test_case_name, gpu_type, runtime, history_regression_job_ids, data)
+            )

     # Split into chunks of MAX_TEST_CASES_PER_MSG
     chunks = []
@@ -159,7 +197,15 @@ def process_regression_message(regression_dict):
     for chunk in chunks:
         msg_parts = []
         current_job_id = None
-        for job_id, idx, data in chunk:
+        for (
+            job_id,
+            idx,
+            test_case_name,
+            gpu_type,
+            runtime,
+            history_regression_job_ids,
+            data,
+        ) in chunk:
             # Add job header when switching to a new job_id
             if job_id != current_job_id:
                 if msg_parts:
@@ -168,12 +214,46 @@ def process_regression_message(regression_dict):
                 msg_parts.append(job_header)
                 current_job_id = job_id

-            test_case_name = data.get("s_test_case_name", "N/A")
             regression_info = data.get("s_regression_info", "N/A")
+            history_text = (
+                ", ".join(str(jid) for jid in history_regression_job_ids)
+                if history_regression_job_ids
+                else "N/A"
+            )
             msg_parts.append(f"*REGRESSION TEST CASE {idx}: {test_case_name}*\n")
+            msg_parts.append(f"*GPU: {gpu_type} Mode: {runtime}*\n")
+            msg_parts.append(f"*History Regression Post-Merge Job IDs: {history_text}*\n")

+            # Parse regression_info to extract baseline info and metrics
+            baseline_date = "N/A"
+            baseline_branch = "N/A"
+            baseline_commit = "N/A"
             for part in regression_info.split(","):
                 part = part.strip()
                 if part and "baseline_id" not in part:
-                    msg_parts.append(f"  {part}\n")
+                    if "baseline_date:" in part:
+                        baseline_date = part.split(":", 1)[-1].strip()
+                    elif "baseline_branch:" in part:
+                        baseline_branch = part.split(":", 1)[-1].strip()
+                    elif "baseline_commit:" in part:
+                        baseline_commit = part.split(":", 1)[-1].strip()
+
+            # Get regression branch and commit from data
+            regression_date = data.get("ts_created", "N/A")
+            regression_branch = data.get("s_branch", "N/A")
+            regression_commit = data.get("s_commit", "N/A")
+
+            msg_parts.append(
+                f"*Baseline date, branch and commit: "
+                f"{baseline_date} {baseline_branch} {baseline_commit}*\n"
+            )
+            msg_parts.append(
+                f"*Regression date, branch and commit: "
+                f"{regression_date} {regression_branch} {regression_commit}*\n"
+            )
+
+            for part in regression_info.split(","):
+                part = part.strip()
+                if part and "baseline_" not in part:
+                    msg_parts.append(f"  {part}\n")

         msg = "".join(msg_parts).strip()
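The parsing above treats `s_regression_info` as a comma-separated list of `key: value` parts, where `baseline_*` parts carry provenance and the remaining parts are the regressed metrics. Roughly, for an invented payload:

```python
# Invented example of an s_regression_info payload and the fields pulled from it.
regression_info = (
    "baseline_id: 42, baseline_date: 2026-01-10, baseline_branch: main, "
    "baseline_commit: 588db0ed64, seq_throughput: -6.2%"
)

fields = {}
metrics = []
for part in regression_info.split(","):
    part = part.strip()
    if part.startswith("baseline_") and ":" in part:
        key, value = part.split(":", 1)
        fields[key] = value.strip()
    elif part:
        metrics.append(part)  # non-baseline parts are the regressed metrics

assert fields["baseline_commit"] == "588db0ed64"
assert metrics == ["seq_throughput: -6.2%"]
```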
@@ -288,8 +368,8 @@ def main():
             print("Failed to query regression data")
             return

-        regression_dict = get_regression_data_by_job_id(data_list, args.query_job_number)
-        messages = process_regression_message(regression_dict)
+        regression_dict = get_regression_dict(data_list, args.query_job_number)
+        messages = split_regression_message(regression_dict)
         send_regression_message(messages, args.channel_id, args.bot_token)
     elif args.operation.strip().upper().startswith("UPDATE"):
         set_values, where_values, error = parse_update_operation(args.operation)

@@ -58,7 +58,10 @@ cd $llmSrcNode/tests/integration/defs
 trtllmWhlPath=$(pip3 show tensorrt_llm | grep Location | cut -d ' ' -f 2)
 trtllmWhlPath=$(echo "$trtllmWhlPath" | sed 's/[[:space:]]+/_/g')
 echo "TRTLLM WHEEL PATH: $trtllmWhlPath"
-pytestCommand=$(set_value_in_command "TRTLLM_WHL_PATH" "$trtllmWhlPath" "$pytestCommand")
+# In disaggregated mode, we only set coverage config file in benchmark pytest.
+if [[ -z "${DISAGG_SERVING_TYPE:-}" || "${DISAGG_SERVING_TYPE}" == "BENCHMARK" ]]; then
+    pytestCommand=$(set_value_in_command "TRTLLM_WHL_PATH" "$trtllmWhlPath" "$pytestCommand")
+fi

 # Only the first process will save the coverage config file
 if [ $SLURM_PROCID -eq 0 ]; then

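Per the srun wrappers earlier in the diff, `DISAGG_SERVING_TYPE` takes values like `CTX_0` or `GEN_1` for workers and is unset (or `BENCHMARK`) for the measuring process; only the latter gets the coverage path. A sketch of the gate (illustrative, not part of the commit):

```python
def should_set_coverage(serving_type):
    # Mirror of the bash gate: aggregated runs (unset) and the benchmark
    # process collect coverage; ctx/gen workers skip it.
    return serving_type is None or serving_type == "BENCHMARK"

assert should_set_coverage(None)          # aggregated (non-disagg) run
assert should_set_coverage("BENCHMARK")   # disagg benchmark process
assert not should_set_coverage("CTX_0")   # disagg context worker
assert not should_set_coverage("GEN_1")   # disagg generation worker
```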
@@ -0,0 +1,97 @@ (new file; all lines added)
metadata:
  model_name: deepseek_r1_0528_fp4_v2
  precision: fp4
  model_dir_name: DeepSeek-R1-0528-FP4-v2
  supported_gpus:
  - B200
  script_file: disaggr_torch.slurm
  benchmark_type: 1k1k
slurm:
  script_file: disaggr_torch.slurm
  partition: <partition>
  account: <account>
  job_time: 02:00:00
  job_name: unified-benchmark
  extra_args: --gres=gpu:8
  numa_bind: true
benchmark:
  mode: e2e
  use_nv_sa_benchmark: false
  multi_round: 5
  benchmark_ratio: 0.0
  streaming: true
  concurrency_list: '2048'
  input_length: 1024
  output_length: 1024
  dataset_file: datasets/perf-ci/deepseek_r1-1k1k-20480-ratio-1_for_serve.json
hardware:
  gpus_per_node: 8
  num_ctx_servers: 1
  num_gen_servers: 1
environment:
  container_mount: <container_mount>
  container_image: <container_image>
  model_path: <model_path>
  trtllm_repo: ''
  build_wheel: false
  work_dir: <full_path_to_work_dir>
  worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
  server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
  nsys_on: false
accuracy:
  enable_accuracy_test: false
  model: local-completions
  tasks: gsm8k
  model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
  gen:
    print_iter_log: true
    max_batch_size: 256
    max_num_tokens: 512
    tensor_parallel_size: 8
    moe_expert_parallel_size: 8
    pipeline_parallel_size: 1
    context_parallel_size: 1
    enable_attention_dp: true
    enable_lm_head_tp_in_adp: true
    cuda_graph_config:
      enable_padding: true
      max_batch_size: 256
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.8
      dtype: fp8
    moe_config:
      backend: CUTEDSL
    cache_transceiver_config:
      max_tokens_in_buffer: 16384
      backend: UCX
    disable_overlap_scheduler: true
    speculative_config: &id001
      decoding_type: MTP
      num_nextn_predict_layers: 1
    num_postprocess_workers: 4
    stream_interval: 20
  ctx:
    print_iter_log: true
    max_batch_size: 16
    max_num_tokens: 16384
    tensor_parallel_size: 4
    moe_expert_parallel_size: 4
    pipeline_parallel_size: 1
    context_parallel_size: 1
    enable_attention_dp: true
    enable_lm_head_tp_in_adp: true
    cuda_graph_config: null
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.6
      dtype: fp8
    moe_config:
      backend: TRTLLM
    cache_transceiver_config:
      max_tokens_in_buffer: 16384
      backend: UCX
    disable_overlap_scheduler: true
    speculative_config: *id001
@@ -0,0 +1,97 @@ (new file; all lines added)
metadata:
  model_name: deepseek_r1_0528_fp4_v2
  precision: fp4
  model_dir_name: DeepSeek-R1-0528-FP4-v2
  supported_gpus:
  - B200
  script_file: disaggr_torch.slurm
  benchmark_type: 1k1k
slurm:
  script_file: disaggr_torch.slurm
  partition: <partition>
  account: <account>
  job_time: 02:00:00
  job_name: unified-benchmark
  extra_args: --gres=gpu:8
  numa_bind: true
benchmark:
  mode: e2e
  use_nv_sa_benchmark: false
  multi_round: 10
  benchmark_ratio: 0.0
  streaming: true
  concurrency_list: '256'
  input_length: 1024
  output_length: 1024
  dataset_file: datasets/perf-ci/deepseek_r1-1k1k-20480-ratio-1_for_serve.json
hardware:
  gpus_per_node: 8
  num_ctx_servers: 1
  num_gen_servers: 1
environment:
  container_mount: <container_mount>
  container_image: <container_image>
  model_path: <model_path>
  trtllm_repo: ''
  build_wheel: false
  work_dir: <full_path_to_work_dir>
  worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
  server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
  nsys_on: false
accuracy:
  enable_accuracy_test: false
  model: local-completions
  tasks: gsm8k
  model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
  gen:
    print_iter_log: true
    max_batch_size: 32
    max_num_tokens: 256
    tensor_parallel_size: 8
    moe_expert_parallel_size: 8
    pipeline_parallel_size: 1
    context_parallel_size: 1
    enable_attention_dp: true
    enable_lm_head_tp_in_adp: true
    cuda_graph_config:
      enable_padding: true
      max_batch_size: 32
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.8
      dtype: fp8
    moe_config:
      backend: CUTEDSL
    cache_transceiver_config:
      max_tokens_in_buffer: 16384
      backend: UCX
    disable_overlap_scheduler: true
    speculative_config: &id001
      decoding_type: MTP
      num_nextn_predict_layers: 3
    num_postprocess_workers: 4
    stream_interval: 20
  ctx:
    print_iter_log: true
    max_batch_size: 16
    max_num_tokens: 16384
    tensor_parallel_size: 4
    moe_expert_parallel_size: 4
    pipeline_parallel_size: 1
    context_parallel_size: 1
    enable_attention_dp: true
    enable_lm_head_tp_in_adp: true
    cuda_graph_config: null
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.6
      dtype: fp8
    moe_config:
      backend: TRTLLM
    cache_transceiver_config:
      max_tokens_in_buffer: 16384
      backend: UCX
    disable_overlap_scheduler: true
    speculative_config: *id001
@@ -0,0 +1,96 @@ (new file; all lines added)
metadata:
  model_name: deepseek_r1_0528_fp4_v2
  precision: fp4
  model_dir_name: DeepSeek-R1-0528-FP4-v2
  supported_gpus:
  - B200
  script_file: disaggr_torch.slurm
  benchmark_type: 1k1k
slurm:
  script_file: disaggr_torch.slurm
  partition: <partition>
  account: <account>
  job_time: 02:00:00
  job_name: unified-benchmark
  extra_args: --gres=gpu:8
  numa_bind: true
benchmark:
  mode: e2e
  use_nv_sa_benchmark: false
  multi_round: 10
  benchmark_ratio: 0.0
  streaming: true
  concurrency_list: '1'
  input_length: 1024
  output_length: 1024
  dataset_file: datasets/perf-ci/deepseek_r1-1k1k-20480-ratio-1_for_serve.json
hardware:
  gpus_per_node: 8
  num_ctx_servers: 1
  num_gen_servers: 1
environment:
  container_mount: <container_mount>
  container_image: <container_image>
  model_path: <model_path>
  trtllm_repo: ''
  build_wheel: false
  work_dir: <full_path_to_work_dir>
  worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
  server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
  nsys_on: false
accuracy:
  enable_accuracy_test: false
  model: local-completions
  tasks: gsm8k
  model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
  gen:
    print_iter_log: true
    max_batch_size: 1
    max_num_tokens: 256
    tensor_parallel_size: 8
    moe_expert_parallel_size: 8
    pipeline_parallel_size: 1
    context_parallel_size: 1
    enable_attention_dp: false
    cuda_graph_config:
      enable_padding: true
      max_batch_size: 1
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.9
      dtype: fp8
    moe_config:
      backend: TRTLLM
    cache_transceiver_config:
      max_tokens_in_buffer: 16384
      backend: UCX
    disable_overlap_scheduler: true
    speculative_config: &id001
      decoding_type: MTP
      num_nextn_predict_layers: 3
    num_postprocess_workers: 4
    stream_interval: 20
  ctx:
    print_iter_log: true
    max_batch_size: 16
    max_num_tokens: 16384
    tensor_parallel_size: 4
    moe_expert_parallel_size: 4
    pipeline_parallel_size: 1
    context_parallel_size: 1
    enable_attention_dp: true
    enable_lm_head_tp_in_adp: true
    cuda_graph_config: null
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.6
      dtype: fp8
    moe_config:
      backend: TRTLLM
    cache_transceiver_config:
      max_tokens_in_buffer: 16384
      backend: UCX
    disable_overlap_scheduler: true
    speculative_config: *id001
@@ -0,0 +1,97 @@ (new file; all lines added)
metadata:
  model_name: deepseek_r1_0528_fp4_v2
  precision: fp4
  model_dir_name: DeepSeek-R1-0528-FP4-v2
  supported_gpus:
  - B200
  script_file: disaggr_torch.slurm
  benchmark_type: 8k1k
slurm:
  script_file: disaggr_torch.slurm
  partition: <partition>
  account: <account>
  job_time: 02:00:00
  job_name: unified-benchmark
  extra_args: --gres=gpu:8
  numa_bind: true
benchmark:
  mode: e2e
  use_nv_sa_benchmark: false
  multi_round: 10
  benchmark_ratio: 0.0
  streaming: true
  concurrency_list: '1536'
  input_length: 8192
  output_length: 1024
  dataset_file: datasets/perf-ci/deepseek_r1-8k1k-20480-ratio-1_for_serve.json
hardware:
  gpus_per_node: 8
  num_ctx_servers: 1
  num_gen_servers: 1
environment:
  container_mount: <container_mount>
  container_image: <container_image>
  model_path: <model_path>
  trtllm_repo: ''
  build_wheel: false
  work_dir: <full_path_to_work_dir>
  worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
  server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
  nsys_on: false
accuracy:
  enable_accuracy_test: false
  model: local-completions
  tasks: gsm8k
  model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
  gen:
    print_iter_log: true
    max_batch_size: 192
    max_num_tokens: 384
    tensor_parallel_size: 8
    moe_expert_parallel_size: 8
    pipeline_parallel_size: 1
    context_parallel_size: 1
    enable_attention_dp: true
    enable_lm_head_tp_in_adp: true
    cuda_graph_config:
      enable_padding: true
      max_batch_size: 192
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.8
      dtype: fp8
    moe_config:
      backend: CUTEDSL
    cache_transceiver_config:
      max_tokens_in_buffer: 16384
      backend: UCX
    disable_overlap_scheduler: true
    speculative_config: &id001
      decoding_type: MTP
      num_nextn_predict_layers: 1
    num_postprocess_workers: 4
    stream_interval: 20
  ctx:
    print_iter_log: true
    max_batch_size: 2
    max_num_tokens: 16384
    tensor_parallel_size: 4
    moe_expert_parallel_size: 4
    pipeline_parallel_size: 1
    context_parallel_size: 1
    enable_attention_dp: true
    enable_lm_head_tp_in_adp: true
    cuda_graph_config: null
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.6
      dtype: fp8
    moe_config:
      backend: TRTLLM
    cache_transceiver_config:
      max_tokens_in_buffer: 16384
      backend: UCX
    disable_overlap_scheduler: true
    speculative_config: *id001
@@ -0,0 +1,97 @@ (new file; all lines added)
metadata:
  model_name: deepseek_r1_0528_fp4_v2
  precision: fp4
  model_dir_name: DeepSeek-R1-0528-FP4-v2
  supported_gpus:
  - B200
  script_file: disaggr_torch.slurm
  benchmark_type: 8k1k
slurm:
  script_file: disaggr_torch.slurm
  partition: <partition>
  account: <account>
  job_time: 02:00:00
  job_name: unified-benchmark
  extra_args: --gres=gpu:8
  numa_bind: true
benchmark:
  mode: e2e
  use_nv_sa_benchmark: false
  multi_round: 10
  benchmark_ratio: 0.0
  streaming: true
  concurrency_list: '256'
  input_length: 8192
  output_length: 1024
  dataset_file: datasets/perf-ci/deepseek_r1-8k1k-20480-ratio-1_for_serve.json
hardware:
  gpus_per_node: 8
  num_ctx_servers: 1
  num_gen_servers: 1
environment:
  container_mount: <container_mount>
  container_image: <container_image>
  model_path: <model_path>
  trtllm_repo: ''
  build_wheel: false
  work_dir: <full_path_to_work_dir>
  worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
  server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
  nsys_on: false
accuracy:
  enable_accuracy_test: false
  model: local-completions
  tasks: gsm8k
  model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
  gen:
    print_iter_log: true
    max_batch_size: 32
    max_num_tokens: 128
    tensor_parallel_size: 8
    moe_expert_parallel_size: 8
    pipeline_parallel_size: 1
    context_parallel_size: 1
    enable_attention_dp: true
    enable_lm_head_tp_in_adp: true
    cuda_graph_config:
      enable_padding: true
      max_batch_size: 32
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.8
      dtype: fp8
    moe_config:
      backend: CUTEDSL
    cache_transceiver_config:
      max_tokens_in_buffer: 16384
      backend: UCX
    disable_overlap_scheduler: true
    speculative_config: &id001
      decoding_type: MTP
      num_nextn_predict_layers: 1
    num_postprocess_workers: 4
    stream_interval: 20
  ctx:
    print_iter_log: true
    max_batch_size: 2
    max_num_tokens: 16384
    tensor_parallel_size: 4
    moe_expert_parallel_size: 4
    pipeline_parallel_size: 1
    context_parallel_size: 1
    enable_attention_dp: true
    enable_lm_head_tp_in_adp: true
    cuda_graph_config: null
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.6
      dtype: fp8
    moe_config:
      backend: TRTLLM
    cache_transceiver_config:
      max_tokens_in_buffer: 16384
      backend: UCX
    disable_overlap_scheduler: true
    speculative_config: *id001
@@ -0,0 +1,96 @@ (new file; all lines added)
metadata:
  model_name: deepseek_r1_0528_fp4_v2
  precision: fp4
  model_dir_name: DeepSeek-R1-0528-FP4-v2
  supported_gpus:
  - B200
  script_file: disaggr_torch.slurm
  benchmark_type: 8k1k
slurm:
  script_file: disaggr_torch.slurm
  partition: <partition>
  account: <account>
  job_time: 02:00:00
  job_name: unified-benchmark
  extra_args: --gres=gpu:8
  numa_bind: true
benchmark:
  mode: e2e
  use_nv_sa_benchmark: false
  multi_round: 10
  benchmark_ratio: 0.0
  streaming: true
  concurrency_list: '1'
  input_length: 8192
  output_length: 1024
  dataset_file: datasets/perf-ci/deepseek_r1-8k1k-20480-ratio-1_for_serve.json
hardware:
  gpus_per_node: 8
  num_ctx_servers: 1
  num_gen_servers: 1
environment:
  container_mount: <container_mount>
  container_image: <container_image>
  model_path: <model_path>
  trtllm_repo: ''
  build_wheel: false
  work_dir: <full_path_to_work_dir>
  worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
  server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
  nsys_on: false
accuracy:
  enable_accuracy_test: false
  model: local-completions
  tasks: gsm8k
  model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
  gen:
    print_iter_log: true
    max_batch_size: 1
    max_num_tokens: 256
    tensor_parallel_size: 8
    moe_expert_parallel_size: 8
    pipeline_parallel_size: 1
    context_parallel_size: 1
    enable_attention_dp: false
    cuda_graph_config:
      enable_padding: true
      max_batch_size: 1
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.9
      dtype: fp8
    moe_config:
      backend: TRTLLM
    cache_transceiver_config:
      max_tokens_in_buffer: 16384
      backend: UCX
    disable_overlap_scheduler: true
    speculative_config: &id001
      decoding_type: MTP
      num_nextn_predict_layers: 3
    num_postprocess_workers: 4
    stream_interval: 20
  ctx:
    print_iter_log: true
    max_batch_size: 2
    max_num_tokens: 16384
    tensor_parallel_size: 4
    moe_expert_parallel_size: 4
    pipeline_parallel_size: 1
    context_parallel_size: 1
    enable_attention_dp: true
    enable_lm_head_tp_in_adp: true
    cuda_graph_config: null
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.6
      dtype: fp8
    moe_config:
      backend: TRTLLM
    cache_transceiver_config:
      max_tokens_in_buffer: 16384
      backend: UCX
    disable_overlap_scheduler: true
    speculative_config: *id001
@@ -0,0 +1,96 @@ (new file; all lines added)
metadata:
  model_name: deepseek_r1_0528_fp4_v2
  precision: fp4
  model_dir_name: DeepSeek-R1-0528-FP4-v2
  supported_gpus:
  - GB200
  script_file: disaggr_torch.slurm
  benchmark_type: 128k8k
slurm:
  script_file: disaggr_torch.slurm
  partition: <partition>
  account: <account>
  job_time: 02:00:00
  job_name: unified-benchmark
  extra_args: --gres=gpu:4
  numa_bind: true
benchmark:
  mode: e2e
  use_nv_sa_benchmark: false
  multi_round: 10
  benchmark_ratio: 0.0
  streaming: true
  concurrency_list: '128'
  input_length: 131072
  output_length: 8192
  dataset_file: datasets/perf-ci/deepseek_r1-128k8k-20480-ratio-1_for_serve.json
hardware:
  gpus_per_node: 4
  num_ctx_servers: 1
  num_gen_servers: 1
environment:
  container_mount: <container_mount>
  container_image: <container_image>
  model_path: <model_path>
  trtllm_repo: ''
  build_wheel: false
  work_dir: <full_path_to_work_dir>
  worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
  server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
  nsys_on: false
accuracy:
  enable_accuracy_test: false
  model: local-completions
  tasks: gsm8k
  model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
  gen:
    print_iter_log: true
    max_batch_size: 8
    max_num_tokens: 32
    tensor_parallel_size: 16
    moe_expert_parallel_size: 16
    pipeline_parallel_size: 1
    context_parallel_size: 1
    enable_attention_dp: true
    enable_lm_head_tp_in_adp: true
    cuda_graph_config:
      enable_padding: true
      max_batch_size: 8
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.8
      dtype: fp8
    moe_config:
      backend: CUTEDSL
    cache_transceiver_config:
      max_tokens_in_buffer: 131104
      backend: UCX
    disable_overlap_scheduler: true
    speculative_config: &id001
      decoding_type: MTP
      num_nextn_predict_layers: 2
    num_postprocess_workers: 4
    stream_interval: 20
  ctx:
    print_iter_log: true
    max_batch_size: 1
    max_num_tokens: 131104
    tensor_parallel_size: 1
    moe_expert_parallel_size: 1
    pipeline_parallel_size: 8
    context_parallel_size: 1
    enable_attention_dp: false
    cuda_graph_config: null
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.3
      dtype: fp8
    moe_config:
      backend: TRTLLM
    cache_transceiver_config:
      max_tokens_in_buffer: 131104
      backend: UCX
    disable_overlap_scheduler: true
    speculative_config: *id001
@@ -0,0 +1,96 @@ (new file; all lines added)
metadata:
  model_name: deepseek_r1_0528_fp4_v2
  precision: fp4
  model_dir_name: DeepSeek-R1-0528-FP4-v2
  supported_gpus:
  - GB200
  script_file: disaggr_torch.slurm
  benchmark_type: 128k8k
slurm:
  script_file: disaggr_torch.slurm
  partition: <partition>
  account: <account>
  job_time: 02:00:00
  job_name: unified-benchmark
  extra_args: --gres=gpu:4
  numa_bind: true
benchmark:
  mode: e2e
  use_nv_sa_benchmark: false
  multi_round: 10
  benchmark_ratio: 0.0
  streaming: true
  concurrency_list: '64'
  input_length: 131072
  output_length: 8192
  dataset_file: datasets/perf-ci/deepseek_r1-128k8k-20480-ratio-1_for_serve.json
hardware:
  gpus_per_node: 4
  num_ctx_servers: 1
  num_gen_servers: 1
environment:
  container_mount: <container_mount>
  container_image: <container_image>
  model_path: <model_path>
  trtllm_repo: ''
  build_wheel: false
  work_dir: <full_path_to_work_dir>
  worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
  server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
  nsys_on: false
accuracy:
  enable_accuracy_test: false
  model: local-completions
  tasks: gsm8k
  model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
  gen:
    print_iter_log: true
    max_batch_size: 2
    max_num_tokens: 8
    tensor_parallel_size: 32
    moe_expert_parallel_size: 32
    pipeline_parallel_size: 1
    context_parallel_size: 1
    enable_attention_dp: true
    enable_lm_head_tp_in_adp: true
    cuda_graph_config:
      enable_padding: true
      max_batch_size: 2
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.6
      dtype: fp8
    moe_config:
      backend: CUTEDSL
    cache_transceiver_config:
      max_tokens_in_buffer: 131104
      backend: UCX
    disable_overlap_scheduler: true
    speculative_config: &id001
      decoding_type: MTP
      num_nextn_predict_layers: 3
    num_postprocess_workers: 4
    stream_interval: 20
  ctx:
    print_iter_log: true
    max_batch_size: 1
    max_num_tokens: 131104
    tensor_parallel_size: 1
    moe_expert_parallel_size: 1
    pipeline_parallel_size: 8
    context_parallel_size: 1
    enable_attention_dp: false
    cuda_graph_config: null
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.3
      dtype: fp8
    moe_config:
      backend: TRTLLM
    cache_transceiver_config:
      max_tokens_in_buffer: 131104
      backend: UCX
    disable_overlap_scheduler: true
    speculative_config: *id001
@@ -0,0 +1,96 @@ (new file; all lines added)
metadata:
  model_name: deepseek_r1_0528_fp4_v2
  precision: fp4
  model_dir_name: DeepSeek-R1-0528-FP4-v2
  supported_gpus:
  - GB200
  script_file: disaggr_torch.slurm
  benchmark_type: 128k8k
slurm:
  script_file: disaggr_torch.slurm
  partition: <partition>
  account: <account>
  job_time: 02:00:00
  job_name: unified-benchmark
  extra_args: --gres=gpu:4
  numa_bind: true
benchmark:
  mode: e2e
  use_nv_sa_benchmark: false
  multi_round: 10
  benchmark_ratio: 0.0
  streaming: true
  concurrency_list: '1'
  input_length: 131072
  output_length: 8192
  dataset_file: datasets/perf-ci/deepseek_r1-128k8k-20480-ratio-1_for_serve.json
hardware:
  gpus_per_node: 4
  num_ctx_servers: 1
  num_gen_servers: 1
environment:
  container_mount: <container_mount>
  container_image: <container_image>
  model_path: <model_path>
  trtllm_repo: ''
  build_wheel: false
  work_dir: <full_path_to_work_dir>
  worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
  server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
  nsys_on: false
accuracy:
  enable_accuracy_test: false
  model: local-completions
  tasks: gsm8k
  model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
  gen:
    print_iter_log: true
    max_batch_size: 1
    max_num_tokens: 4
    tensor_parallel_size: 8
    moe_expert_parallel_size: 8
    pipeline_parallel_size: 1
    context_parallel_size: 1
    enable_attention_dp: false
    cuda_graph_config:
      enable_padding: true
      max_batch_size: 1
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.8
      dtype: fp8
    moe_config:
      backend: TRTLLM
    cache_transceiver_config:
      max_tokens_in_buffer: 131104
      backend: UCX
    disable_overlap_scheduler: true
    speculative_config: &id001
      decoding_type: MTP
      num_nextn_predict_layers: 3
    num_postprocess_workers: 4
    stream_interval: 20
    allreduce_strategy: MNNVL
  ctx:
    print_iter_log: true
    max_batch_size: 1
    max_num_tokens: 131104
    tensor_parallel_size: 1
    moe_expert_parallel_size: 1
    pipeline_parallel_size: 8
    context_parallel_size: 1
    enable_attention_dp: false
    cuda_graph_config: null
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.3
      dtype: fp8
    moe_config:
      backend: TRTLLM
    cache_transceiver_config:
      max_tokens_in_buffer: 131104
      backend: UCX
    disable_overlap_scheduler: true
    speculative_config: *id001
@@ -0,0 +1,97 @@ (new file; all lines added)
metadata:
  model_name: deepseek_r1_0528_fp4_v2
  precision: fp4
  model_dir_name: DeepSeek-R1-0528-FP4-v2
  supported_gpus:
  - GB200
  script_file: disaggr_torch.slurm
  benchmark_type: 1k1k
slurm:
  script_file: disaggr_torch.slurm
  partition: <partition>
  account: <account>
  job_time: 02:00:00
  job_name: unified-benchmark
  extra_args: --gres=gpu:4
  numa_bind: true
benchmark:
  mode: e2e
  use_nv_sa_benchmark: false
  multi_round: 10
  benchmark_ratio: 0.0
  streaming: true
  concurrency_list: '1024'
  input_length: 1024
  output_length: 1024
  dataset_file: datasets/perf-ci/deepseek_r1-1k1k-20480-ratio-1_for_serve.json
hardware:
  gpus_per_node: 4
  num_ctx_servers: 1
  num_gen_servers: 1
environment:
  container_mount: <container_mount>
  container_image: <container_image>
  model_path: <model_path>
  trtllm_repo: ''
  build_wheel: false
  work_dir: <full_path_to_work_dir>
  worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
  server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
  nsys_on: false
accuracy:
  enable_accuracy_test: false
  model: local-completions
  tasks: gsm8k
  model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
  gen:
    print_iter_log: true
    max_batch_size: 32
    max_num_tokens: 128
    tensor_parallel_size: 32
    moe_expert_parallel_size: 32
    pipeline_parallel_size: 1
    context_parallel_size: 1
    enable_attention_dp: true
    enable_lm_head_tp_in_adp: true
    cuda_graph_config:
      enable_padding: true
      max_batch_size: 32
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.6
      dtype: fp8
    moe_config:
      backend: CUTEDSL
    cache_transceiver_config:
      max_tokens_in_buffer: 16384
      backend: UCX
    disable_overlap_scheduler: true
    speculative_config: &id001
      decoding_type: MTP
      num_nextn_predict_layers: 3
    num_postprocess_workers: 4
    stream_interval: 20
  ctx:
    print_iter_log: true
    max_batch_size: 16
    max_num_tokens: 16384
    tensor_parallel_size: 4
    moe_expert_parallel_size: 4
    pipeline_parallel_size: 1
    context_parallel_size: 1
    enable_attention_dp: true
    enable_lm_head_tp_in_adp: true
    cuda_graph_config: null
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.6
      dtype: fp8
    moe_config:
      backend: TRTLLM
    cache_transceiver_config:
      max_tokens_in_buffer: 16384
      backend: UCX
    disable_overlap_scheduler: true
    speculative_config: *id001
@ -0,0 +1,97 @@
metadata:
  model_name: deepseek_r1_0528_fp4_v2
  precision: fp4
  model_dir_name: DeepSeek-R1-0528-FP4-v2
  supported_gpus:
  - GB200
  script_file: disaggr_torch.slurm
  benchmark_type: 1k1k
slurm:
  script_file: disaggr_torch.slurm
  partition: <partition>
  account: <account>
  job_time: 02:00:00
  job_name: unified-benchmark
  extra_args: --gres=gpu:4
  numa_bind: true
benchmark:
  mode: e2e
  use_nv_sa_benchmark: false
  multi_round: 5
  benchmark_ratio: 0.0
  streaming: true
  concurrency_list: '3072'
  input_length: 1024
  output_length: 1024
  dataset_file: datasets/perf-ci/deepseek_r1-1k1k-20480-ratio-1_for_serve.json
hardware:
  gpus_per_node: 4
  num_ctx_servers: 1
  num_gen_servers: 1
environment:
  container_mount: <container_mount>
  container_image: <container_image>
  model_path: <model_path>
  trtllm_repo: ''
  build_wheel: false
  work_dir: <full_path_to_work_dir>
  worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
  server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
  nsys_on: false
accuracy:
  enable_accuracy_test: false
  model: local-completions
  tasks: gsm8k
  model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
  gen:
    print_iter_log: true
    max_batch_size: 768
    max_num_tokens: 1536
    tensor_parallel_size: 4
    moe_expert_parallel_size: 4
    pipeline_parallel_size: 1
    context_parallel_size: 1
    enable_attention_dp: true
    enable_lm_head_tp_in_adp: true
    cuda_graph_config:
      enable_padding: true
      max_batch_size: 768
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.8
      dtype: fp8
    moe_config:
      backend: CUTEDSL
    cache_transceiver_config:
      max_tokens_in_buffer: 16384
      backend: UCX
    disable_overlap_scheduler: true
    speculative_config: &id001
      decoding_type: MTP
      num_nextn_predict_layers: 1
    num_postprocess_workers: 4
    stream_interval: 20
  ctx:
    print_iter_log: true
    max_batch_size: 16
    max_num_tokens: 16384
    tensor_parallel_size: 4
    moe_expert_parallel_size: 4
    pipeline_parallel_size: 1
    context_parallel_size: 1
    enable_attention_dp: true
    enable_lm_head_tp_in_adp: true
    cuda_graph_config: null
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.6
      dtype: fp8
    moe_config:
      backend: TRTLLM
    cache_transceiver_config:
      max_tokens_in_buffer: 16384
      backend: UCX
    disable_overlap_scheduler: true
    speculative_config: *id001
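In these throughput configs the gen worker's max_batch_size tracks the target concurrency divided by the number of attention-DP ranks: 3072 / 4 = 768 here, and 1024 / 32 = 32 in the config above it. A sketch of that arithmetic, under the assumption that each attention-DP rank serves an equal share of the in-flight requests:

import math

def per_rank_batch(concurrency: int, tp_size: int, enable_attention_dp: bool) -> int:
    # With attention DP each of the tp_size ranks holds its own batch,
    # so the concurrency is split evenly across ranks; without it the
    # whole batch lands on one replica.
    ranks = tp_size if enable_attention_dp else 1
    return math.ceil(concurrency / ranks)

assert per_rank_batch(3072, 4, True) == 768   # this config
assert per_rank_batch(1024, 32, True) == 32   # the 1k1k config above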
@ -25,7 +25,7 @@ benchmark:
  concurrency_list: '1024'
  input_length: 1024
  output_length: 1024
  dataset_file: <dataset_file>
  dataset_file: datasets/perf-ci/deepseek_r1-1k1k-20480-ratio-1_for_serve.json
hardware:
  gpus_per_node: 4
  num_ctx_servers: 1
@ -56,18 +56,7 @@ worker_config:
    max_seq_len: 2068
    cuda_graph_config:
      enable_padding: true
      batch_sizes:
      - 1
      - 2
      - 4
      - 8
      - 16
      - 32
      - 64
      - 128
      - 256
      - 512
      - 768
      max_batch_size: 768
    print_iter_log: true
    kv_cache_config:
      enable_block_reuse: false
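This hunk drops the explicit cuda-graph batch_sizes ladder and keeps only max_batch_size plus enable_padding. The removed list was simply powers of two up to the cap; the sketch below reproduces it. Whether the runtime derives exactly this ladder when batch_sizes is omitted is an assumption, not something the diff states:

def capture_sizes(max_batch_size: int) -> list[int]:
    # Powers of two below the cap, then the cap itself -- the same
    # ladder the removed batch_sizes list spelled out.
    sizes = []
    n = 1
    while n < max_batch_size:
        sizes.append(n)
        n *= 2
    sizes.append(max_batch_size)
    return sizes

assert capture_sizes(768) == [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 768]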
@ -0,0 +1,97 @@
metadata:
  model_name: deepseek_r1_0528_fp4_v2
  precision: fp4
  model_dir_name: DeepSeek-R1-0528-FP4-v2
  supported_gpus:
  - GB200
  script_file: disaggr_torch.slurm
  benchmark_type: 1k1k
slurm:
  script_file: disaggr_torch.slurm
  partition: <partition>
  account: <account>
  job_time: 02:00:00
  job_name: unified-benchmark
  extra_args: --gres=gpu:4
  numa_bind: true
benchmark:
  mode: e2e
  use_nv_sa_benchmark: false
  multi_round: 10
  benchmark_ratio: 0.0
  streaming: true
  concurrency_list: '1'
  input_length: 1024
  output_length: 1024
  dataset_file: datasets/perf-ci/deepseek_r1-1k1k-20480-ratio-1_for_serve.json
hardware:
  gpus_per_node: 4
  num_ctx_servers: 1
  num_gen_servers: 1
environment:
  container_mount: <container_mount>
  container_image: <container_image>
  model_path: <model_path>
  trtllm_repo: ''
  build_wheel: false
  work_dir: <full_path_to_work_dir>
  worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
  server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
  nsys_on: false
accuracy:
  enable_accuracy_test: false
  model: local-completions
  tasks: gsm8k
  model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
  gen:
    print_iter_log: true
    max_batch_size: 128
    max_num_tokens: 512
    tensor_parallel_size: 8
    moe_expert_parallel_size: 8
    pipeline_parallel_size: 1
    context_parallel_size: 1
    enable_attention_dp: false
    cuda_graph_config:
      enable_padding: true
      max_batch_size: 128
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.9
      dtype: fp8
    moe_config:
      backend: TRTLLM
    cache_transceiver_config:
      max_tokens_in_buffer: 16384
      backend: UCX
    disable_overlap_scheduler: true
    speculative_config: &id001
      decoding_type: MTP
      num_nextn_predict_layers: 3
    num_postprocess_workers: 4
    stream_interval: 20
    allreduce_strategy: MNNVL
  ctx:
    print_iter_log: true
    max_batch_size: 16
    max_num_tokens: 16384
    tensor_parallel_size: 4
    moe_expert_parallel_size: 4
    pipeline_parallel_size: 1
    context_parallel_size: 1
    enable_attention_dp: true
    enable_lm_head_tp_in_adp: true
    cuda_graph_config: null
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.6
      dtype: fp8
    moe_config:
      backend: TRTLLM
    cache_transceiver_config:
      max_tokens_in_buffer: 16384
      backend: UCX
    disable_overlap_scheduler: true
    speculative_config: *id001
@ -0,0 +1,97 @@
metadata:
  model_name: deepseek_r1_0528_fp4_v2
  precision: fp4
  model_dir_name: DeepSeek-R1-0528-FP4-v2
  supported_gpus:
  - GB200
  script_file: disaggr_torch.slurm
  benchmark_type: 8k1k
slurm:
  script_file: disaggr_torch.slurm
  partition: <partition>
  account: <account>
  job_time: 02:00:00
  job_name: unified-benchmark
  extra_args: --gres=gpu:4
  numa_bind: true
benchmark:
  mode: e2e
  use_nv_sa_benchmark: false
  multi_round: 5
  benchmark_ratio: 0.0
  streaming: true
  concurrency_list: '4096'
  input_length: 8192
  output_length: 1024
  dataset_file: datasets/perf-ci/deepseek_r1-8k1k-20480-ratio-1_for_serve.json
hardware:
  gpus_per_node: 4
  num_ctx_servers: 1
  num_gen_servers: 1
environment:
  container_mount: <container_mount>
  container_image: <container_image>
  model_path: <model_path>
  trtllm_repo: ''
  build_wheel: false
  work_dir: <full_path_to_work_dir>
  worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
  server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
  nsys_on: false
accuracy:
  enable_accuracy_test: false
  model: local-completions
  tasks: gsm8k
  model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
  gen:
    print_iter_log: true
    max_batch_size: 256
    max_num_tokens: 512
    tensor_parallel_size: 16
    moe_expert_parallel_size: 16
    pipeline_parallel_size: 1
    context_parallel_size: 1
    enable_attention_dp: true
    enable_lm_head_tp_in_adp: true
    cuda_graph_config:
      enable_padding: true
      max_batch_size: 256
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.8
      dtype: fp8
    moe_config:
      backend: CUTEDSL
    cache_transceiver_config:
      max_tokens_in_buffer: 16384
      backend: UCX
    disable_overlap_scheduler: true
    speculative_config: &id001
      decoding_type: MTP
      num_nextn_predict_layers: 1
    num_postprocess_workers: 4
    stream_interval: 20
  ctx:
    print_iter_log: true
    max_batch_size: 2
    max_num_tokens: 16384
    tensor_parallel_size: 4
    moe_expert_parallel_size: 4
    pipeline_parallel_size: 1
    context_parallel_size: 1
    enable_attention_dp: true
    enable_lm_head_tp_in_adp: true
    cuda_graph_config: null
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.6
      dtype: fp8
    moe_config:
      backend: TRTLLM
    cache_transceiver_config:
      max_tokens_in_buffer: 16384
      backend: UCX
    disable_overlap_scheduler: true
    speculative_config: *id001
@ -0,0 +1,97 @@
metadata:
  model_name: deepseek_r1_0528_fp4_v2
  precision: fp4
  model_dir_name: DeepSeek-R1-0528-FP4-v2
  supported_gpus:
  - GB200
  script_file: disaggr_torch.slurm
  benchmark_type: 8k1k
slurm:
  script_file: disaggr_torch.slurm
  partition: <partition>
  account: <account>
  job_time: 02:00:00
  job_name: unified-benchmark
  extra_args: --gres=gpu:4
  numa_bind: true
benchmark:
  mode: e2e
  use_nv_sa_benchmark: false
  multi_round: 10
  benchmark_ratio: 0.0
  streaming: true
  concurrency_list: '1024'
  input_length: 8192
  output_length: 1024
  dataset_file: datasets/perf-ci/deepseek_r1-8k1k-20480-ratio-1_for_serve.json
hardware:
  gpus_per_node: 4
  num_ctx_servers: 1
  num_gen_servers: 1
environment:
  container_mount: <container_mount>
  container_image: <container_image>
  model_path: <model_path>
  trtllm_repo: ''
  build_wheel: false
  work_dir: <full_path_to_work_dir>
  worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
  server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
  nsys_on: false
accuracy:
  enable_accuracy_test: false
  model: local-completions
  tasks: gsm8k
  model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
  gen:
    print_iter_log: true
    max_batch_size: 32
    max_num_tokens: 128
    tensor_parallel_size: 32
    moe_expert_parallel_size: 32
    pipeline_parallel_size: 1
    context_parallel_size: 1
    enable_attention_dp: true
    enable_lm_head_tp_in_adp: true
    cuda_graph_config:
      enable_padding: true
      max_batch_size: 32
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.75
      dtype: fp8
    moe_config:
      backend: CUTEDSL
    cache_transceiver_config:
      max_tokens_in_buffer: 16384
      backend: UCX
    disable_overlap_scheduler: true
    speculative_config: &id001
      decoding_type: MTP
      num_nextn_predict_layers: 3
    num_postprocess_workers: 4
    stream_interval: 20
  ctx:
    print_iter_log: true
    max_batch_size: 2
    max_num_tokens: 16384
    tensor_parallel_size: 4
    moe_expert_parallel_size: 4
    pipeline_parallel_size: 1
    context_parallel_size: 1
    enable_attention_dp: true
    enable_lm_head_tp_in_adp: true
    cuda_graph_config: null
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.6
      dtype: fp8
    moe_config:
      backend: TRTLLM
    cache_transceiver_config:
      max_tokens_in_buffer: 16384
      backend: UCX
    disable_overlap_scheduler: true
    speculative_config: *id001
@ -0,0 +1,97 @@
metadata:
  model_name: deepseek_r1_0528_fp4_v2
  precision: fp4
  model_dir_name: DeepSeek-R1-0528-FP4-v2
  supported_gpus:
  - GB200
  script_file: disaggr_torch.slurm
  benchmark_type: 8k1k
slurm:
  script_file: disaggr_torch.slurm
  partition: <partition>
  account: <account>
  job_time: 02:00:00
  job_name: unified-benchmark
  extra_args: --gres=gpu:4
  numa_bind: true
benchmark:
  mode: e2e
  use_nv_sa_benchmark: false
  multi_round: 10
  benchmark_ratio: 0.0
  streaming: true
  concurrency_list: '1'
  input_length: 8192
  output_length: 1024
  dataset_file: datasets/perf-ci/deepseek_r1-8k1k-20480-ratio-1_for_serve.json
hardware:
  gpus_per_node: 4
  num_ctx_servers: 1
  num_gen_servers: 1
environment:
  container_mount: <container_mount>
  container_image: <container_image>
  model_path: <model_path>
  trtllm_repo: ''
  build_wheel: false
  work_dir: <full_path_to_work_dir>
  worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
  server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
  nsys_on: false
accuracy:
  enable_accuracy_test: false
  model: local-completions
  tasks: gsm8k
  model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
  gen:
    print_iter_log: true
    max_batch_size: 128
    max_num_tokens: 512
    tensor_parallel_size: 8
    moe_expert_parallel_size: 8
    pipeline_parallel_size: 1
    context_parallel_size: 1
    enable_attention_dp: false
    cuda_graph_config:
      enable_padding: true
      max_batch_size: 128
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.9
      dtype: fp8
    moe_config:
      backend: TRTLLM
    cache_transceiver_config:
      max_tokens_in_buffer: 16384
      backend: UCX
    disable_overlap_scheduler: true
    speculative_config: &id001
      decoding_type: MTP
      num_nextn_predict_layers: 3
    num_postprocess_workers: 4
    stream_interval: 20
    allreduce_strategy: MNNVL
  ctx:
    print_iter_log: true
    max_batch_size: 2
    max_num_tokens: 16384
    tensor_parallel_size: 4
    moe_expert_parallel_size: 4
    pipeline_parallel_size: 1
    context_parallel_size: 1
    enable_attention_dp: true
    enable_lm_head_tp_in_adp: true
    cuda_graph_config: null
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.6
      dtype: fp8
    moe_config:
      backend: TRTLLM
    cache_transceiver_config:
      max_tokens_in_buffer: 16384
      backend: UCX
    disable_overlap_scheduler: true
    speculative_config: *id001
@ -0,0 +1,108 @@
metadata:
  model_name: deepseek_v32_fp4
  precision: fp4
  model_dir_name: DeepSeek-V3.2-FP4-v2
  supported_gpus:
  - GB200
  script_file: disaggr_torch.slurm
  benchmark_type: 1k1k
slurm:
  script_file: disaggr_torch.slurm
  partition: <partition>
  account: <account>
  job_time: 02:00:00
  job_name: unified-benchmark
  extra_args: --gres=gpu:4
  numa_bind: true
benchmark:
  mode: e2e
  use_nv_sa_benchmark: false
  multi_round: 10
  benchmark_ratio: 0.0
  streaming: true
  concurrency_list: '1024'
  input_length: 1024
  output_length: 1024
  dataset_file: datasets/perf-ci/deepseek_v32-1k1k-20480-ratio-1_for_serve.json
hardware:
  gpus_per_node: 4
  num_ctx_servers: 1
  num_gen_servers: 1
environment:
  container_mount: <container_mount>
  container_image: <container_image>
  model_path: <model_path>
  trtllm_repo: ''
  build_wheel: false
  work_dir: <full_path_to_work_dir>
  worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
  server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
  nsys_on: false
accuracy:
  enable_accuracy_test: false
  model: local-completions
  tasks: gsm8k
  model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
  gen:
    print_iter_log: true
    max_batch_size: 32
    max_num_tokens: 128
    tensor_parallel_size: 32
    moe_expert_parallel_size: 32
    pipeline_parallel_size: 1
    context_parallel_size: 1
    enable_attention_dp: true
    enable_lm_head_tp_in_adp: true
    cuda_graph_config:
      enable_padding: true
      max_batch_size: 32
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.6
      dtype: fp8
      tokens_per_block: 64
    moe_config:
      backend: CUTEDSL
      load_balancer:
        num_slots: 256
        layer_updates_per_iter: 1
    cache_transceiver_config:
      max_tokens_in_buffer: 16384
      backend: UCX
    disable_overlap_scheduler: true
    speculative_config: &id001
      decoding_type: MTP
      num_nextn_predict_layers: 3
    num_postprocess_workers: 4
    stream_interval: 20
    nvfp4_gemm_config:
      allowed_backends:
      - cutlass
      - cublaslt
      - cutedsl
      - cuda_core
  ctx:
    print_iter_log: true
    max_batch_size: 16
    max_num_tokens: 16384
    tensor_parallel_size: 4
    moe_expert_parallel_size: 4
    pipeline_parallel_size: 1
    context_parallel_size: 1
    enable_attention_dp: true
    enable_lm_head_tp_in_adp: true
    cuda_graph_config: null
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.6
      dtype: fp8
      tokens_per_block: 64
    moe_config:
      backend: TRTLLM
    cache_transceiver_config:
      max_tokens_in_buffer: 16384
      backend: UCX
    disable_overlap_scheduler: true
    speculative_config: *id001
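Each disagg config implies a node count that the pipeline's stage definitions have to match: a server needs tensor_parallel_size x pipeline_parallel_size x context_parallel_size GPUs, and the ctx and gen pools add up. A sketch of that bookkeeping (illustrative arithmetic only; the real allocation is done by disaggr_torch.slurm):

import math

def gpus_for(server: dict) -> int:
    return (server["tensor_parallel_size"]
            * server["pipeline_parallel_size"]
            * server["context_parallel_size"])

def nodes_needed(cfg: dict) -> int:
    hw = cfg["hardware"]
    total = (hw["num_ctx_servers"] * gpus_for(cfg["worker_config"]["ctx"])
             + hw["num_gen_servers"] * gpus_for(cfg["worker_config"]["gen"]))
    return math.ceil(total / hw["gpus_per_node"])

# For the config above: one ctx server on TP4 plus one gen server on TP32
# is 36 GPUs, i.e. 9 nodes at 4 GPUs per GB200 node.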
@ -0,0 +1,105 @@
metadata:
  model_name: deepseek_v32_fp4
  precision: fp4
  model_dir_name: DeepSeek-V3.2-FP4-v2
  supported_gpus:
  - GB200
  script_file: disaggr_torch.slurm
  benchmark_type: 1k1k
slurm:
  script_file: disaggr_torch.slurm
  partition: <partition>
  account: <account>
  job_time: 02:00:00
  job_name: unified-benchmark
  extra_args: --gres=gpu:4
  numa_bind: true
benchmark:
  mode: e2e
  use_nv_sa_benchmark: false
  multi_round: 5
  benchmark_ratio: 0.0
  streaming: true
  concurrency_list: '2048'
  input_length: 1024
  output_length: 1024
  dataset_file: datasets/perf-ci/deepseek_v32-1k1k-20480-ratio-1_for_serve.json
hardware:
  gpus_per_node: 4
  num_ctx_servers: 1
  num_gen_servers: 1
environment:
  container_mount: <container_mount>
  container_image: <container_image>
  model_path: <model_path>
  trtllm_repo: ''
  build_wheel: false
  work_dir: <full_path_to_work_dir>
  worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
  server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
  nsys_on: false
accuracy:
  enable_accuracy_test: false
  model: local-completions
  tasks: gsm8k
  model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
  gen:
    print_iter_log: true
    max_batch_size: 512
    max_num_tokens: 1024
    tensor_parallel_size: 4
    moe_expert_parallel_size: 4
    pipeline_parallel_size: 1
    context_parallel_size: 1
    enable_attention_dp: true
    enable_lm_head_tp_in_adp: true
    cuda_graph_config:
      enable_padding: true
      max_batch_size: 512
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.8
      dtype: fp8
      tokens_per_block: 64
    moe_config:
      backend: CUTEDSL
    cache_transceiver_config:
      max_tokens_in_buffer: 16384
      backend: UCX
    disable_overlap_scheduler: true
    speculative_config: &id001
      decoding_type: MTP
      num_nextn_predict_layers: 1
    num_postprocess_workers: 4
    stream_interval: 20
    nvfp4_gemm_config:
      allowed_backends:
      - cutlass
      - cublaslt
      - cutedsl
      - cuda_core
  ctx:
    print_iter_log: true
    max_batch_size: 16
    max_num_tokens: 16384
    tensor_parallel_size: 4
    moe_expert_parallel_size: 4
    pipeline_parallel_size: 1
    context_parallel_size: 1
    enable_attention_dp: true
    enable_lm_head_tp_in_adp: true
    cuda_graph_config: null
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.6
      dtype: fp8
      tokens_per_block: 64
    moe_config:
      backend: TRTLLM
    cache_transceiver_config:
      max_tokens_in_buffer: 16384
      backend: UCX
    disable_overlap_scheduler: true
    speculative_config: *id001
@ -0,0 +1,105 @@
metadata:
  model_name: deepseek_v32_fp4
  precision: fp4
  model_dir_name: DeepSeek-V3.2-FP4-v2
  supported_gpus:
  - GB200
  script_file: disaggr_torch.slurm
  benchmark_type: 1k1k
slurm:
  script_file: disaggr_torch.slurm
  partition: <partition>
  account: <account>
  job_time: 02:00:00
  job_name: unified-benchmark
  extra_args: --gres=gpu:4
  numa_bind: true
benchmark:
  mode: e2e
  use_nv_sa_benchmark: false
  multi_round: 10
  benchmark_ratio: 0.0
  streaming: true
  concurrency_list: '1'
  input_length: 1024
  output_length: 1024
  dataset_file: datasets/perf-ci/deepseek_v32-1k1k-20480-ratio-1_for_serve.json
hardware:
  gpus_per_node: 4
  num_ctx_servers: 1
  num_gen_servers: 1
environment:
  container_mount: <container_mount>
  container_image: <container_image>
  model_path: <model_path>
  trtllm_repo: ''
  build_wheel: false
  work_dir: <full_path_to_work_dir>
  worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
  server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
  nsys_on: false
accuracy:
  enable_accuracy_test: false
  model: local-completions
  tasks: gsm8k
  model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
  gen:
    print_iter_log: true
    max_batch_size: 1
    max_num_tokens: 512
    tensor_parallel_size: 8
    moe_expert_parallel_size: 8
    pipeline_parallel_size: 1
    context_parallel_size: 1
    enable_attention_dp: false
    cuda_graph_config:
      enable_padding: true
      max_batch_size: 1
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.9
      dtype: fp8
      tokens_per_block: 64
    moe_config:
      backend: TRTLLM
    cache_transceiver_config:
      max_tokens_in_buffer: 16384
      backend: UCX
    disable_overlap_scheduler: true
    speculative_config: &id001
      decoding_type: MTP
      num_nextn_predict_layers: 3
    num_postprocess_workers: 4
    stream_interval: 20
    nvfp4_gemm_config:
      allowed_backends:
      - cutlass
      - cublaslt
      - cutedsl
      - cuda_core
    allreduce_strategy: MNNVL
  ctx:
    print_iter_log: true
    max_batch_size: 16
    max_num_tokens: 16384
    tensor_parallel_size: 4
    moe_expert_parallel_size: 4
    pipeline_parallel_size: 1
    context_parallel_size: 1
    enable_attention_dp: true
    enable_lm_head_tp_in_adp: true
    cuda_graph_config: null
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.6
      dtype: fp8
      tokens_per_block: 64
    moe_config:
      backend: TRTLLM
    cache_transceiver_config:
      max_tokens_in_buffer: 16384
      backend: UCX
    disable_overlap_scheduler: true
    speculative_config: *id001
@ -0,0 +1,105 @@
metadata:
  model_name: deepseek_v32_fp4
  precision: fp4
  model_dir_name: DeepSeek-V3.2-FP4-v2
  supported_gpus:
  - GB200
  script_file: disaggr_torch.slurm
  benchmark_type: 32k4k
slurm:
  script_file: disaggr_torch.slurm
  partition: <partition>
  account: <account>
  job_time: 02:00:00
  job_name: unified-benchmark
  extra_args: --gres=gpu:4
  numa_bind: true
benchmark:
  mode: e2e
  use_nv_sa_benchmark: false
  multi_round: 10
  benchmark_ratio: 0.0
  streaming: true
  concurrency_list: '256'
  input_length: 32768
  output_length: 4096
  dataset_file: datasets/perf-ci/deepseek_v32-32k4k-20480-ratio-1_for_serve.json
hardware:
  gpus_per_node: 4
  num_ctx_servers: 1
  num_gen_servers: 1
environment:
  container_mount: <container_mount>
  container_image: <container_image>
  model_path: <model_path>
  trtllm_repo: ''
  build_wheel: false
  work_dir: <full_path_to_work_dir>
  worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
  server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
  nsys_on: false
accuracy:
  enable_accuracy_test: false
  model: local-completions
  tasks: gsm8k
  model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
  gen:
    print_iter_log: true
    max_batch_size: 8
    max_num_tokens: 256
    tensor_parallel_size: 32
    moe_expert_parallel_size: 32
    pipeline_parallel_size: 1
    context_parallel_size: 1
    enable_attention_dp: true
    enable_lm_head_tp_in_adp: true
    cuda_graph_config:
      enable_padding: true
      max_batch_size: 8
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.85
      dtype: fp8
      tokens_per_block: 64
    moe_config:
      backend: CUTEDSL
    cache_transceiver_config:
      max_tokens_in_buffer: 32768
      backend: UCX
    disable_overlap_scheduler: true
    speculative_config: &id001
      decoding_type: MTP
      num_nextn_predict_layers: 3
    num_postprocess_workers: 4
    stream_interval: 20
    nvfp4_gemm_config:
      allowed_backends:
      - cutlass
      - cublaslt
      - cutedsl
      - cuda_core
  ctx:
    print_iter_log: true
    max_batch_size: 1
    max_num_tokens: 32784
    tensor_parallel_size: 4
    moe_expert_parallel_size: 4
    pipeline_parallel_size: 1
    context_parallel_size: 1
    enable_attention_dp: true
    enable_lm_head_tp_in_adp: true
    cuda_graph_config: null
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.6
      dtype: fp8
      tokens_per_block: 64
    moe_config:
      backend: TRTLLM
    cache_transceiver_config:
      max_tokens_in_buffer: 32768
      backend: UCX
    disable_overlap_scheduler: true
    speculative_config: *id001
@ -0,0 +1,108 @@
metadata:
  model_name: deepseek_v32_fp4
  precision: fp4
  model_dir_name: DeepSeek-V3.2-FP4-v2
  supported_gpus:
  - GB200
  script_file: disaggr_torch.slurm
  benchmark_type: 32k4k
slurm:
  script_file: disaggr_torch.slurm
  partition: <partition>
  account: <account>
  job_time: 02:00:00
  job_name: unified-benchmark
  extra_args: --gres=gpu:4
  numa_bind: true
benchmark:
  mode: e2e
  use_nv_sa_benchmark: false
  multi_round: 5
  benchmark_ratio: 0.0
  streaming: true
  concurrency_list: '2048'
  input_length: 32768
  output_length: 4096
  dataset_file: datasets/perf-ci/deepseek_v32-32k4k-20480-ratio-1_for_serve.json
hardware:
  gpus_per_node: 4
  num_ctx_servers: 1
  num_gen_servers: 1
environment:
  container_mount: <container_mount>
  container_image: <container_image>
  model_path: <model_path>
  trtllm_repo: ''
  build_wheel: false
  work_dir: <full_path_to_work_dir>
  worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
  server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
  nsys_on: false
accuracy:
  enable_accuracy_test: false
  model: local-completions
  tasks: gsm8k
  model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
  gen:
    print_iter_log: true
    max_batch_size: 64
    max_num_tokens: 128
    tensor_parallel_size: 32
    moe_expert_parallel_size: 32
    pipeline_parallel_size: 1
    context_parallel_size: 1
    enable_attention_dp: true
    enable_lm_head_tp_in_adp: true
    cuda_graph_config:
      enable_padding: true
      max_batch_size: 64
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.85
      dtype: fp8
      tokens_per_block: 64
    moe_config:
      backend: CUTEDSL
      load_balancer:
        num_slots: 288
        layer_updates_per_iter: 1
    cache_transceiver_config:
      max_tokens_in_buffer: 32768
      backend: UCX
    disable_overlap_scheduler: true
    speculative_config: &id001
      decoding_type: MTP
      num_nextn_predict_layers: 1
    num_postprocess_workers: 4
    stream_interval: 20
    nvfp4_gemm_config:
      allowed_backends:
      - cutlass
      - cublaslt
      - cutedsl
      - cuda_core
  ctx:
    print_iter_log: true
    max_batch_size: 1
    max_num_tokens: 32784
    tensor_parallel_size: 4
    moe_expert_parallel_size: 4
    pipeline_parallel_size: 1
    context_parallel_size: 1
    enable_attention_dp: true
    enable_lm_head_tp_in_adp: true
    cuda_graph_config: null
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.6
      dtype: fp8
      tokens_per_block: 64
    moe_config:
      backend: TRTLLM
    cache_transceiver_config:
      max_tokens_in_buffer: 32768
      backend: UCX
    disable_overlap_scheduler: true
    speculative_config: *id001
@ -0,0 +1,105 @@
metadata:
  model_name: deepseek_v32_fp4
  precision: fp4
  model_dir_name: DeepSeek-V3.2-FP4-v2
  supported_gpus:
  - GB200
  script_file: disaggr_torch.slurm
  benchmark_type: 32k4k
slurm:
  script_file: disaggr_torch.slurm
  partition: <partition>
  account: <account>
  job_time: 02:00:00
  job_name: unified-benchmark
  extra_args: --gres=gpu:4
  numa_bind: true
benchmark:
  mode: e2e
  use_nv_sa_benchmark: false
  multi_round: 10
  benchmark_ratio: 0.0
  streaming: true
  concurrency_list: '1'
  input_length: 32768
  output_length: 4096
  dataset_file: datasets/perf-ci/deepseek_v32-32k4k-20480-ratio-1_for_serve.json
hardware:
  gpus_per_node: 4
  num_ctx_servers: 1
  num_gen_servers: 1
environment:
  container_mount: <container_mount>
  container_image: <container_image>
  model_path: <model_path>
  trtllm_repo: ''
  build_wheel: false
  work_dir: <full_path_to_work_dir>
  worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
  server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
  nsys_on: false
accuracy:
  enable_accuracy_test: false
  model: local-completions
  tasks: gsm8k
  model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
  gen:
    print_iter_log: true
    max_batch_size: 1
    max_num_tokens: 256
    tensor_parallel_size: 8
    moe_expert_parallel_size: 8
    pipeline_parallel_size: 1
    context_parallel_size: 1
    enable_attention_dp: false
    cuda_graph_config:
      enable_padding: true
      max_batch_size: 1
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.9
      dtype: fp8
      tokens_per_block: 64
    moe_config:
      backend: TRTLLM
    cache_transceiver_config:
      max_tokens_in_buffer: 32768
      backend: UCX
    disable_overlap_scheduler: true
    speculative_config: &id001
      decoding_type: MTP
      num_nextn_predict_layers: 3
    num_postprocess_workers: 4
    stream_interval: 20
    nvfp4_gemm_config:
      allowed_backends:
      - cutlass
      - cublaslt
      - cutedsl
      - cuda_core
    allreduce_strategy: MNNVL
  ctx:
    print_iter_log: true
    max_batch_size: 1
    max_num_tokens: 32784
    tensor_parallel_size: 4
    moe_expert_parallel_size: 4
    pipeline_parallel_size: 1
    context_parallel_size: 1
    enable_attention_dp: true
    enable_lm_head_tp_in_adp: true
    cuda_graph_config: null
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.6
      dtype: fp8
      tokens_per_block: 64
    moe_config:
      backend: TRTLLM
    cache_transceiver_config:
      max_tokens_in_buffer: 32768
      backend: UCX
    disable_overlap_scheduler: true
    speculative_config: *id001
@ -0,0 +1,104 @@
metadata:
  model_name: deepseek_v32_fp4
  precision: fp4
  model_dir_name: DeepSeek-V3.2-FP4-v2
  supported_gpus:
  - GB200
  script_file: disaggr_torch.slurm
  benchmark_type: 8k1k
slurm:
  script_file: disaggr_torch.slurm
  partition: <partition>
  account: <account>
  job_time: 02:00:00
  job_name: unified-benchmark
  extra_args: --gres=gpu:4
  numa_bind: true
benchmark:
  mode: e2e
  use_nv_sa_benchmark: false
  multi_round: 5
  benchmark_ratio: 0.0
  streaming: true
  concurrency_list: '4096'
  input_length: 8192
  output_length: 1024
  dataset_file: datasets/perf-ci/deepseek_v32-8k1k-20480-ratio-1_for_serve.json
hardware:
  gpus_per_node: 4
  num_ctx_servers: 1
  num_gen_servers: 1
environment:
  container_mount: <container_mount>
  container_image: <container_image>
  model_path: <model_path>
  trtllm_repo: ''
  build_wheel: false
  work_dir: <full_path_to_work_dir>
  worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
  server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
  nsys_on: false
accuracy:
  enable_accuracy_test: false
  model: local-completions
  tasks: gsm8k
  model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
  gen:
    print_iter_log: true
    max_batch_size: 128
    max_num_tokens: 128
    tensor_parallel_size: 32
    moe_expert_parallel_size: 32
    pipeline_parallel_size: 1
    context_parallel_size: 1
    enable_attention_dp: true
    enable_lm_head_tp_in_adp: true
    cuda_graph_config:
      enable_padding: true
      max_batch_size: 128
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.8
      dtype: fp8
      tokens_per_block: 64
    moe_config:
      backend: CUTEDSL
      load_balancer:
        num_slots: 256
        layer_updates_per_iter: 1
    cache_transceiver_config:
      max_tokens_in_buffer: 16384
      backend: UCX
    disable_overlap_scheduler: true
    num_postprocess_workers: 4
    stream_interval: 20
    nvfp4_gemm_config:
      allowed_backends:
      - cutlass
      - cublaslt
      - cutedsl
      - cuda_core
  ctx:
    print_iter_log: true
    max_batch_size: 2
    max_num_tokens: 16384
    tensor_parallel_size: 4
    moe_expert_parallel_size: 4
    pipeline_parallel_size: 1
    context_parallel_size: 1
    enable_attention_dp: true
    enable_lm_head_tp_in_adp: true
    cuda_graph_config: null
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.6
      dtype: fp8
      tokens_per_block: 64
    moe_config:
      backend: TRTLLM
    cache_transceiver_config:
      max_tokens_in_buffer: 16384
      backend: UCX
    disable_overlap_scheduler: true
@ -0,0 +1,108 @@
metadata:
  model_name: deepseek_v32_fp4
  precision: fp4
  model_dir_name: DeepSeek-V3.2-FP4-v2
  supported_gpus:
  - GB200
  script_file: disaggr_torch.slurm
  benchmark_type: 8k1k
slurm:
  script_file: disaggr_torch.slurm
  partition: <partition>
  account: <account>
  job_time: 02:00:00
  job_name: unified-benchmark
  extra_args: --gres=gpu:4
  numa_bind: true
benchmark:
  mode: e2e
  use_nv_sa_benchmark: false
  multi_round: 10
  benchmark_ratio: 0.0
  streaming: true
  concurrency_list: '1024'
  input_length: 8192
  output_length: 1024
  dataset_file: datasets/perf-ci/deepseek_v32-8k1k-20480-ratio-1_for_serve.json
hardware:
  gpus_per_node: 4
  num_ctx_servers: 1
  num_gen_servers: 1
environment:
  container_mount: <container_mount>
  container_image: <container_image>
  model_path: <model_path>
  trtllm_repo: ''
  build_wheel: false
  work_dir: <full_path_to_work_dir>
  worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
  server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
  nsys_on: false
accuracy:
  enable_accuracy_test: false
  model: local-completions
  tasks: gsm8k
  model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
  gen:
    print_iter_log: true
    max_batch_size: 32
    max_num_tokens: 128
    tensor_parallel_size: 32
    moe_expert_parallel_size: 32
    pipeline_parallel_size: 1
    context_parallel_size: 1
    enable_attention_dp: true
    enable_lm_head_tp_in_adp: true
    cuda_graph_config:
      enable_padding: true
      max_batch_size: 32
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.75
      dtype: fp8
      tokens_per_block: 64
    moe_config:
      backend: CUTEDSL
      load_balancer:
        num_slots: 256
        layer_updates_per_iter: 1
    cache_transceiver_config:
      max_tokens_in_buffer: 16384
      backend: UCX
    disable_overlap_scheduler: true
    speculative_config: &id001
      decoding_type: MTP
      num_nextn_predict_layers: 3
    num_postprocess_workers: 4
    stream_interval: 20
    nvfp4_gemm_config:
      allowed_backends:
      - cutlass
      - cublaslt
      - cutedsl
      - cuda_core
  ctx:
    print_iter_log: true
    max_batch_size: 2
    max_num_tokens: 16384
    tensor_parallel_size: 4
    moe_expert_parallel_size: 4
    pipeline_parallel_size: 1
    context_parallel_size: 1
    enable_attention_dp: true
    enable_lm_head_tp_in_adp: true
    cuda_graph_config: null
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.6
      dtype: fp8
      tokens_per_block: 64
    moe_config:
      backend: TRTLLM
    cache_transceiver_config:
      max_tokens_in_buffer: 16384
      backend: UCX
    disable_overlap_scheduler: true
    speculative_config: *id001
@ -0,0 +1,105 @@
metadata:
  model_name: deepseek_v32_fp4
  precision: fp4
  model_dir_name: DeepSeek-V3.2-FP4-v2
  supported_gpus:
  - GB200
  script_file: disaggr_torch.slurm
  benchmark_type: 8k1k
slurm:
  script_file: disaggr_torch.slurm
  partition: <partition>
  account: <account>
  job_time: 02:00:00
  job_name: unified-benchmark
  extra_args: --gres=gpu:4
  numa_bind: true
benchmark:
  mode: e2e
  use_nv_sa_benchmark: false
  multi_round: 10
  benchmark_ratio: 0.0
  streaming: true
  concurrency_list: '1'
  input_length: 8192
  output_length: 1024
  dataset_file: datasets/perf-ci/deepseek_v32-8k1k-20480-ratio-1_for_serve.json
hardware:
  gpus_per_node: 4
  num_ctx_servers: 1
  num_gen_servers: 1
environment:
  container_mount: <container_mount>
  container_image: <container_image>
  model_path: <model_path>
  trtllm_repo: ''
  build_wheel: false
  work_dir: <full_path_to_work_dir>
  worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
  server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
  nsys_on: false
accuracy:
  enable_accuracy_test: false
  model: local-completions
  tasks: gsm8k
  model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
  gen:
    print_iter_log: true
    max_batch_size: 1
    max_num_tokens: 512
    tensor_parallel_size: 8
    moe_expert_parallel_size: 8
    pipeline_parallel_size: 1
    context_parallel_size: 1
    enable_attention_dp: false
    cuda_graph_config:
      enable_padding: true
      max_batch_size: 1
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.9
      dtype: fp8
      tokens_per_block: 64
    moe_config:
      backend: TRTLLM
    cache_transceiver_config:
      max_tokens_in_buffer: 16384
      backend: UCX
    disable_overlap_scheduler: true
    speculative_config: &id001
      decoding_type: MTP
      num_nextn_predict_layers: 3
    num_postprocess_workers: 4
    stream_interval: 20
    nvfp4_gemm_config:
      allowed_backends:
      - cutlass
      - cublaslt
      - cutedsl
      - cuda_core
    allreduce_strategy: MNNVL
  ctx:
    print_iter_log: true
    max_batch_size: 2
    max_num_tokens: 16384
    tensor_parallel_size: 4
    moe_expert_parallel_size: 4
    pipeline_parallel_size: 1
    context_parallel_size: 1
    enable_attention_dp: true
    enable_lm_head_tp_in_adp: true
    cuda_graph_config: null
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.6
      dtype: fp8
      tokens_per_block: 64
    moe_config:
      backend: TRTLLM
    cache_transceiver_config:
      max_tokens_in_buffer: 16384
      backend: UCX
    disable_overlap_scheduler: true
    speculative_config: *id001
@ -0,0 +1,98 @@
metadata:
  model_name: k2_thinking_fp4
  precision: fp4
  model_dir_name: Kimi-K2-Thinking-NVFP4
  supported_gpus:
  - GB200
  script_file: disaggr_torch.slurm
  benchmark_type: 1k1k
slurm:
  script_file: disaggr_torch.slurm
  partition: <partition>
  account: <account>
  job_time: 02:00:00
  job_name: unified-benchmark
  extra_args: --gres=gpu:4
  numa_bind: true
benchmark:
  mode: e2e
  use_nv_sa_benchmark: false
  multi_round: 5
  benchmark_ratio: 0.0
  streaming: true
  concurrency_list: '2048'
  input_length: 1024
  output_length: 1024
  dataset_file: datasets/perf-ci/k2_thinking-1k1k-20480-ratio-1_for_serve.json
hardware:
  gpus_per_node: 4
  num_ctx_servers: 1
  num_gen_servers: 1
environment:
  container_mount: <container_mount>
  container_image: <container_image>
  model_path: <model_path>
  trtllm_repo: ''
  build_wheel: false
  work_dir: <full_path_to_work_dir>
  worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
  server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
  nsys_on: false
accuracy:
  enable_accuracy_test: false
  model: local-completions
  tasks: gsm8k
  model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
  gen:
    print_iter_log: true
    max_batch_size: 64
    max_num_tokens: 128
    tensor_parallel_size: 32
    moe_expert_parallel_size: 32
    pipeline_parallel_size: 1
    context_parallel_size: 1
    enable_attention_dp: true
    enable_lm_head_tp_in_adp: true
    cuda_graph_config:
      enable_padding: true
      max_batch_size: 64
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.8
      dtype: fp8
    moe_config:
      backend: WIDEEP
      load_balancer:
        num_slots: 384
        layer_updates_per_iter: 1
    cache_transceiver_config:
      max_tokens_in_buffer: 16384
      backend: UCX
    disable_overlap_scheduler: true
    trust_remote_code: true
    num_postprocess_workers: 4
    stream_interval: 20
  ctx:
    print_iter_log: true
    max_batch_size: 16
    max_num_tokens: 8192
    tensor_parallel_size: 4
    moe_expert_parallel_size: 4
    pipeline_parallel_size: 1
    context_parallel_size: 1
    enable_attention_dp: true
    enable_lm_head_tp_in_adp: true
    cuda_graph_config: null
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.6
      dtype: fp8
    moe_config:
      backend: TRTLLM
    cache_transceiver_config:
      max_tokens_in_buffer: 16384
      backend: UCX
    disable_overlap_scheduler: true
    trust_remote_code: true
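In the WIDEEP gen config above, load_balancer.num_slots is 384, which equals Kimi-K2's routed-expert count, so there is no slack for replicating hot experts; the 8k1k Eagle variant further down raises it to 416, leaving 32 redundant slots. The expert count is an assumption from the model card, not something this diff states. A tiny sketch of the bookkeeping:

def redundant_slots(num_slots: int, num_experts: int) -> int:
    # Slots beyond one-per-expert are available for replicas of hot experts.
    assert num_slots >= num_experts, "need at least one slot per routed expert"
    return num_slots - num_experts

KIMI_K2_ROUTED_EXPERTS = 384  # assumption, taken from the model card
print(redundant_slots(384, KIMI_K2_ROUTED_EXPERTS))  # 0 -> static placement only
print(redundant_slots(416, KIMI_K2_ROUTED_EXPERTS))  # 32 -> room for replicas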
@ -0,0 +1,95 @@
metadata:
  model_name: k2_thinking_fp4
  precision: fp4
  model_dir_name: Kimi-K2-Thinking-NVFP4
  supported_gpus:
  - GB200
  script_file: disaggr_torch.slurm
  benchmark_type: 1k1k
slurm:
  script_file: disaggr_torch.slurm
  partition: <partition>
  account: <account>
  job_time: 02:00:00
  job_name: unified-benchmark
  extra_args: --gres=gpu:4
  numa_bind: true
benchmark:
  mode: e2e
  use_nv_sa_benchmark: false
  multi_round: 5
  benchmark_ratio: 0.0
  streaming: true
  concurrency_list: '4096'
  input_length: 1024
  output_length: 1024
  dataset_file: datasets/perf-ci/k2_thinking-1k1k-20480-ratio-1_for_serve.json
hardware:
  gpus_per_node: 4
  num_ctx_servers: 1
  num_gen_servers: 1
environment:
  container_mount: <container_mount>
  container_image: <container_image>
  model_path: <model_path>
  trtllm_repo: ''
  build_wheel: false
  work_dir: <full_path_to_work_dir>
  worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
  server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
  nsys_on: false
accuracy:
  enable_accuracy_test: false
  model: local-completions
  tasks: gsm8k
  model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
  gen:
    print_iter_log: true
    max_batch_size: 512
    max_num_tokens: 512
    tensor_parallel_size: 8
    moe_expert_parallel_size: 8
    pipeline_parallel_size: 1
    context_parallel_size: 1
    enable_attention_dp: true
    enable_lm_head_tp_in_adp: true
    cuda_graph_config:
      enable_padding: true
      max_batch_size: 512
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.8
      dtype: fp8
    moe_config:
      backend: CUTLASS
    cache_transceiver_config:
      max_tokens_in_buffer: 16384
      backend: UCX
    disable_overlap_scheduler: true
    trust_remote_code: true
    num_postprocess_workers: 4
    stream_interval: 20
  ctx:
    print_iter_log: true
    max_batch_size: 16
    max_num_tokens: 8192
    tensor_parallel_size: 4
    moe_expert_parallel_size: 4
    pipeline_parallel_size: 1
    context_parallel_size: 1
    enable_attention_dp: true
    enable_lm_head_tp_in_adp: true
    cuda_graph_config: null
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.6
      dtype: fp8
    moe_config:
      backend: TRTLLM
    cache_transceiver_config:
      max_tokens_in_buffer: 16384
      backend: UCX
    disable_overlap_scheduler: true
    trust_remote_code: true
@ -0,0 +1,94 @@
metadata:
  model_name: k2_thinking_fp4
  precision: fp4
  model_dir_name: Kimi-K2-Thinking-NVFP4
  supported_gpus:
  - GB200
  script_file: disaggr_torch.slurm
  benchmark_type: 1k1k
slurm:
  script_file: disaggr_torch.slurm
  partition: <partition>
  account: <account>
  job_time: 02:00:00
  job_name: unified-benchmark
  extra_args: --gres=gpu:4
  numa_bind: true
benchmark:
  mode: e2e
  use_nv_sa_benchmark: false
  multi_round: 10
  benchmark_ratio: 0.0
  streaming: true
  concurrency_list: '4'
  input_length: 1024
  output_length: 1024
  dataset_file: datasets/perf-ci/k2_thinking-1k1k-20480-ratio-1_for_serve.json
hardware:
  gpus_per_node: 4
  num_ctx_servers: 1
  num_gen_servers: 1
environment:
  container_mount: <container_mount>
  container_image: <container_image>
  model_path: <model_path>
  trtllm_repo: ''
  build_wheel: false
  work_dir: <full_path_to_work_dir>
  worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
  server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
  nsys_on: false
accuracy:
  enable_accuracy_test: false
  model: local-completions
  tasks: gsm8k
  model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
  gen:
    print_iter_log: true
    max_batch_size: 4
    max_num_tokens: 128
    tensor_parallel_size: 4
    moe_expert_parallel_size: 4
    pipeline_parallel_size: 1
    context_parallel_size: 1
    enable_attention_dp: false
    cuda_graph_config:
      enable_padding: true
      max_batch_size: 4
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.8
      dtype: fp8
    moe_config:
      backend: TRTLLM
    cache_transceiver_config:
      max_tokens_in_buffer: 16384
      backend: UCX
    disable_overlap_scheduler: true
    trust_remote_code: true
    num_postprocess_workers: 4
    stream_interval: 20
  ctx:
    print_iter_log: true
    max_batch_size: 16
    max_num_tokens: 8192
    tensor_parallel_size: 4
    moe_expert_parallel_size: 4
    pipeline_parallel_size: 1
    context_parallel_size: 1
    enable_attention_dp: true
    enable_lm_head_tp_in_adp: true
    cuda_graph_config: null
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.6
      dtype: fp8
    moe_config:
      backend: TRTLLM
    cache_transceiver_config:
      max_tokens_in_buffer: 16384
      backend: UCX
    disable_overlap_scheduler: true
    trust_remote_code: true
@ -0,0 +1,98 @@
metadata:
  model_name: k2_thinking_fp4
  precision: fp4
  model_dir_name: Kimi-K2-Thinking-NVFP4
  supported_gpus:
  - GB200
  script_file: disaggr_torch.slurm
  benchmark_type: 8k1k
slurm:
  script_file: disaggr_torch.slurm
  partition: <partition>
  account: <account>
  job_time: 02:00:00
  job_name: unified-benchmark
  extra_args: --gres=gpu:4
  numa_bind: true
benchmark:
  mode: e2e
  use_nv_sa_benchmark: false
  multi_round: 5
  benchmark_ratio: 0.0
  streaming: true
  concurrency_list: '4096'
  input_length: 8192
  output_length: 1024
  dataset_file: datasets/perf-ci/k2_thinking-8k1k-20480-ratio-1_for_serve.json
hardware:
  gpus_per_node: 4
  num_ctx_servers: 1
  num_gen_servers: 1
environment:
  container_mount: <container_mount>
  container_image: <container_image>
  model_path: <model_path>
  trtllm_repo: ''
  build_wheel: false
  work_dir: <full_path_to_work_dir>
  worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
  server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
  nsys_on: false
accuracy:
  enable_accuracy_test: false
  model: local-completions
  tasks: gsm8k
  model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
  gen:
    print_iter_log: true
    max_batch_size: 256
    max_num_tokens: 256
    tensor_parallel_size: 16
    moe_expert_parallel_size: 16
    pipeline_parallel_size: 1
    context_parallel_size: 1
    enable_attention_dp: true
    enable_lm_head_tp_in_adp: true
    cuda_graph_config:
      enable_padding: true
      max_batch_size: 256
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.8
      dtype: fp8
    moe_config:
      backend: WIDEEP
      load_balancer:
        num_slots: 384
        layer_updates_per_iter: 1
    cache_transceiver_config:
      max_tokens_in_buffer: 16384
      backend: UCX
    disable_overlap_scheduler: true
    trust_remote_code: true
    num_postprocess_workers: 4
    stream_interval: 20
  ctx:
    print_iter_log: true
    max_batch_size: 2
    max_num_tokens: 8192
    tensor_parallel_size: 4
    moe_expert_parallel_size: 4
    pipeline_parallel_size: 1
    context_parallel_size: 1
    enable_attention_dp: true
    enable_lm_head_tp_in_adp: true
    cuda_graph_config: null
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.6
      dtype: fp8
    moe_config:
      backend: TRTLLM
    cache_transceiver_config:
      max_tokens_in_buffer: 16384
      backend: UCX
    disable_overlap_scheduler: true
    trust_remote_code: true
@ -0,0 +1,104 @@
metadata:
  model_name: k2_thinking_fp4
  precision: fp4
  model_dir_name: Kimi-K2-Thinking-NVFP4
  supported_gpus:
  - GB200
  script_file: disaggr_torch.slurm
  benchmark_type: 8k1k
slurm:
  script_file: disaggr_torch.slurm
  partition: <partition>
  account: <account>
  job_time: 02:00:00
  job_name: unified-benchmark
  extra_args: --gres=gpu:4
  numa_bind: true
benchmark:
  mode: e2e
  use_nv_sa_benchmark: false
  multi_round: 10
  benchmark_ratio: 0.0
  streaming: true
  concurrency_list: '1024'
  input_length: 8192
  output_length: 1024
  dataset_file: datasets/perf-ci/k2_thinking-8k1k-20480-ratio-1_for_serve.json
hardware:
  gpus_per_node: 4
  num_ctx_servers: 1
  num_gen_servers: 1
environment:
  container_mount: <container_mount>
  container_image: <container_image>
  model_path: <model_path>
  trtllm_repo: ''
  build_wheel: false
  work_dir: <full_path_to_work_dir>
  worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
  server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
  nsys_on: false
accuracy:
  enable_accuracy_test: false
  model: local-completions
  tasks: gsm8k
  model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
  gen:
    print_iter_log: true
    max_batch_size: 32
    max_num_tokens: 128
    tensor_parallel_size: 32
    moe_expert_parallel_size: 32
    pipeline_parallel_size: 1
    context_parallel_size: 1
    enable_attention_dp: true
    enable_lm_head_tp_in_adp: true
    cuda_graph_config:
      enable_padding: true
      max_batch_size: 32
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.8
      dtype: fp8
    moe_config:
      backend: WIDEEP
      load_balancer:
        num_slots: 416
        layer_updates_per_iter: 1
    cache_transceiver_config:
      max_tokens_in_buffer: 16384
      backend: UCX
    disable_overlap_scheduler: true
    speculative_config: &id001
      decoding_type: Eagle
      max_draft_len: 3
      eagle3_one_model: true
      speculative_model: Kimi-K2-Thinking-NVFP4-Eagle3
    trust_remote_code: true
    num_postprocess_workers: 4
    stream_interval: 20
  ctx:
    print_iter_log: true
    max_batch_size: 2
    max_num_tokens: 8192
    tensor_parallel_size: 4
    moe_expert_parallel_size: 4
    pipeline_parallel_size: 1
    context_parallel_size: 1
    enable_attention_dp: true
    enable_lm_head_tp_in_adp: true
    cuda_graph_config: null
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.6
      dtype: fp8
    moe_config:
      backend: TRTLLM
    cache_transceiver_config:
      max_tokens_in_buffer: 16384
      backend: UCX
    disable_overlap_scheduler: true
    speculative_config: *id001
    trust_remote_code: true
@@ -0,0 +1,101 @@
metadata:
  model_name: k2_thinking_fp4
  precision: fp4
  model_dir_name: Kimi-K2-Thinking-NVFP4
  supported_gpus:
    - GB200
  script_file: disaggr_torch.slurm
  benchmark_type: 8k1k
slurm:
  script_file: disaggr_torch.slurm
  partition: <partition>
  account: <account>
  job_time: 02:00:00
  job_name: unified-benchmark
  extra_args: --gres=gpu:4
  numa_bind: true
benchmark:
  mode: e2e
  use_nv_sa_benchmark: false
  multi_round: 10
  benchmark_ratio: 0.0
  streaming: true
  concurrency_list: '4'
  input_length: 8192
  output_length: 1024
  dataset_file: datasets/perf-ci/k2_thinking-8k1k-20480-ratio-1_for_serve.json
hardware:
  gpus_per_node: 4
  num_ctx_servers: 1
  num_gen_servers: 1
environment:
  container_mount: <container_mount>
  container_image: <container_image>
  model_path: <model_path>
  trtllm_repo: ''
  build_wheel: false
  work_dir: <full_path_to_work_dir>
  worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
  server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
  nsys_on: false
accuracy:
  enable_accuracy_test: false
  model: local-completions
  tasks: gsm8k
  model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
  gen:
    print_iter_log: true
    max_batch_size: 4
    max_num_tokens: 128
    tensor_parallel_size: 8
    moe_expert_parallel_size: 8
    pipeline_parallel_size: 1
    context_parallel_size: 1
    enable_attention_dp: false
    cuda_graph_config:
      enable_padding: true
      max_batch_size: 4
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.8
      dtype: fp8
    moe_config:
      backend: TRTLLM
    cache_transceiver_config:
      max_tokens_in_buffer: 16384
      backend: UCX
    disable_overlap_scheduler: true
    speculative_config: &id001
      decoding_type: Eagle
      max_draft_len: 3
      eagle3_one_model: true
      speculative_model: Kimi-K2-Thinking-NVFP4-Eagle3
    trust_remote_code: true
    num_postprocess_workers: 4
    stream_interval: 20
    allreduce_strategy: MNNVL
  ctx:
    print_iter_log: true
    max_batch_size: 2
    max_num_tokens: 8192
    tensor_parallel_size: 4
    moe_expert_parallel_size: 4
    pipeline_parallel_size: 1
    context_parallel_size: 1
    enable_attention_dp: true
    enable_lm_head_tp_in_adp: true
    cuda_graph_config: null
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.6
      dtype: fp8
    moe_config:
      backend: TRTLLM
    cache_transceiver_config:
      max_tokens_in_buffer: 16384
      backend: UCX
    disable_overlap_scheduler: true
    speculative_config: *id001
    trust_remote_code: true
@@ -0,0 +1,92 @@
metadata:
  model_name: qwen3_235b_a22b_fp4
  precision: fp4
  model_dir_name: Qwen3-235B-A22B-FP4
  supported_gpus:
    - GB200
  script_file: disaggr_torch.slurm
  benchmark_type: 8k1k
slurm:
  script_file: disaggr_torch.slurm
  partition: <partition>
  account: <account>
  job_time: 02:00:00
  job_name: unified-benchmark
  extra_args: --gres=gpu:4
  numa_bind: true
benchmark:
  mode: e2e
  use_nv_sa_benchmark: false
  multi_round: 10
  benchmark_ratio: 0.0
  streaming: true
  concurrency_list: '1024'
  input_length: 8192
  output_length: 1024
  dataset_file: datasets/perf-ci/qwen3_235b-8k1k-20480-ratio-1_for_serve.json
hardware:
  gpus_per_node: 4
  num_ctx_servers: 1
  num_gen_servers: 1
environment:
  container_mount: <container_mount>
  container_image: <container_image>
  model_path: <model_path>
  trtllm_repo: ''
  build_wheel: false
  work_dir: <full_path_to_work_dir>
  worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
  server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
  nsys_on: false
accuracy:
  enable_accuracy_test: false
  model: local-completions
  tasks: gsm8k
  model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
  gen:
    print_iter_log: true
    max_batch_size: 128
    max_num_tokens: 128
    tensor_parallel_size: 8
    moe_expert_parallel_size: 8
    pipeline_parallel_size: 1
    context_parallel_size: 1
    enable_attention_dp: true
    enable_lm_head_tp_in_adp: true
    cuda_graph_config:
      enable_padding: true
      max_batch_size: 128
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.9
      dtype: fp8
    moe_config:
      backend: CUTEDSL
    cache_transceiver_config:
      max_tokens_in_buffer: 32768
      backend: UCX
    disable_overlap_scheduler: true
    num_postprocess_workers: 4
    stream_interval: 20
  ctx:
    print_iter_log: true
    max_batch_size: 4
    max_num_tokens: 32768
    tensor_parallel_size: 1
    moe_expert_parallel_size: 1
    pipeline_parallel_size: 1
    context_parallel_size: 1
    enable_attention_dp: false
    cuda_graph_config: null
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.6
      dtype: fp8
    moe_config:
      backend: TRTLLM
    cache_transceiver_config:
      max_tokens_in_buffer: 32768
      backend: UCX
    disable_overlap_scheduler: true
@@ -0,0 +1,91 @@
metadata:
  model_name: qwen3_235b_a22b_fp4
  precision: fp4
  model_dir_name: Qwen3-235B-A22B-FP4
  supported_gpus:
    - GB200
  script_file: disaggr_torch.slurm
  benchmark_type: 8k1k
slurm:
  script_file: disaggr_torch.slurm
  partition: <partition>
  account: <account>
  job_time: 02:00:00
  job_name: unified-benchmark
  extra_args: --gres=gpu:4
  numa_bind: true
benchmark:
  mode: e2e
  use_nv_sa_benchmark: false
  multi_round: 10
  benchmark_ratio: 0.0
  streaming: true
  concurrency_list: '64'
  input_length: 8192
  output_length: 1024
  dataset_file: datasets/perf-ci/qwen3_235b-8k1k-20480-ratio-1_for_serve.json
hardware:
  gpus_per_node: 4
  num_ctx_servers: 1
  num_gen_servers: 1
environment:
  container_mount: <container_mount>
  container_image: <container_image>
  model_path: <model_path>
  trtllm_repo: ''
  build_wheel: false
  work_dir: <full_path_to_work_dir>
  worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
  server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
  nsys_on: false
accuracy:
  enable_accuracy_test: false
  model: local-completions
  tasks: gsm8k
  model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
  gen:
    print_iter_log: true
    max_batch_size: 64
    max_num_tokens: 64
    tensor_parallel_size: 4
    moe_expert_parallel_size: 4
    pipeline_parallel_size: 1
    context_parallel_size: 1
    enable_attention_dp: false
    cuda_graph_config:
      enable_padding: true
      max_batch_size: 64
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.9
      dtype: fp8
    moe_config:
      backend: TRTLLM
    cache_transceiver_config:
      max_tokens_in_buffer: 32768
      backend: UCX
    disable_overlap_scheduler: true
    num_postprocess_workers: 4
    stream_interval: 20
  ctx:
    print_iter_log: true
    max_batch_size: 4
    max_num_tokens: 32768
    tensor_parallel_size: 1
    moe_expert_parallel_size: 1
    pipeline_parallel_size: 1
    context_parallel_size: 1
    enable_attention_dp: false
    cuda_graph_config: null
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.6
      dtype: fp8
    moe_config:
      backend: TRTLLM
    cache_transceiver_config:
      max_tokens_in_buffer: 32768
      backend: UCX
    disable_overlap_scheduler: true
@@ -448,9 +448,12 @@ def prepare_regressive_test_cases(history_baseline_dict, new_data_dict):
            continue

        is_post_merge = new_data.get("b_is_post_merge", False)
        baseline_id = history_baseline.get("_id", "")

        info_parts = [f"baseline_id: {baseline_id}"]
        info_parts = [
            f"baseline_id: {history_baseline.get('_id', '')}",
            f"baseline_branch: {history_baseline.get('s_branch', '')}",
            f"baseline_commit: {history_baseline.get('s_commit', '')}",
            f"baseline_date: {history_baseline.get('ts_created', '')}",
        ]
        regressive_metrics = []
        # Check all metrics and build info string
        for metric in MAXIMIZE_METRICS + MINIMIZE_METRICS:

@@ -56,6 +56,7 @@ MODEL_PATH_DICT = {
    "deepseek_v32_fp4": "DeepSeek-V3.2-Exp-FP4-v2",
    "gpt_oss_120b_fp4": "gpt_oss/gpt-oss-120b",
    "k2_thinking_fp4": "Kimi-K2-Thinking-NVFP4",
    "qwen3_235b_a22b_fp4": "Qwen3/saved_models_Qwen3-235B-A22B_nvfp4_hf",  # Qwen3-235B-A22B-FP4
}

SUPPORTED_GPU_MAPPING = {
@@ -68,6 +69,9 @@ SUPPORTED_GPU_MAPPING = {

DEFAULT_TIMEOUT = 7200

AGGR_CONFIG_FOLDER = "tests/scripts/perf-sanity"
DISAGG_CONFIG_FOLDER = "tests/integration/defs/perf/disagg/test_configs/disagg/perf-sanity"

# Regex patterns for parsing benchmark output metrics
# Key is the metric name used in database (e.g., "mean_e2el", "seq_throughput")
PERF_METRIC_LOG_QUERIES = {
@@ -97,9 +101,20 @@ def get_model_dir(model_name: str) -> str:
    return ""


def get_dataset_path() -> str:
    """Get dataset path for benchmark."""
    return os.path.join(llm_models_root(), "datasets", "ShareGPT_V3_unfiltered_cleaned_split.json")
def get_dataset_dir(dataset_file: Optional[str]) -> str:
    """Get dataset directory path from dataset file."""
    if not dataset_file or dataset_file == "<dataset_file>":
        return ""

    # return os.path.join(llm_models_root(), "datasets", "ShareGPT_V3_unfiltered_cleaned_split.json")
    llm_models_path = os.path.join(llm_models_root(), dataset_file)
    if os.path.exists(llm_models_path):
        return llm_models_path
    elif os.path.exists(dataset_file):
        return dataset_file
    else:
        print_info(f"Dataset file not found in {llm_models_path} and {dataset_file}")
        return ""


def to_env_dict(env_vars: str) -> Dict[str, str]:
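The lookup order in get_dataset_dir above is the whole contract: prefer the copy under llm_models_root(), fall back to the literal path, otherwise return "" so the caller switches to a random dataset. A standalone sketch of the same logic, for illustration only (paths are placeholders):

    import os

    def resolve_dataset(models_root: str, dataset_file: str) -> str:
        # Mirrors get_dataset_dir's resolution order.
        if not dataset_file or dataset_file == "<dataset_file>":
            return ""
        candidate = os.path.join(models_root, dataset_file)
        if os.path.exists(candidate):
            return candidate  # shared models root wins
        if os.path.exists(dataset_file):
            return dataset_file  # literal absolute/relative path
        return ""  # caller falls back to random prompts

    # e.g. resolve_dataset("/models", "datasets/perf-ci/k2_thinking-8k1k-20480-ratio-1_for_serve.json")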
@@ -141,6 +156,7 @@ class ServerConfig:
        self.disable_overlap_scheduler = server_config_data.get("disable_overlap_scheduler", False)
        self.num_postprocess_workers = server_config_data.get("num_postprocess_workers", 0)
        self.stream_interval = server_config_data.get("stream_interval", 10)
        self.print_iter_log = server_config_data.get("print_iter_log", False)
        self.attn_backend = server_config_data.get("attn_backend", "TRTLLM")
        self.enable_chunked_prefill = server_config_data.get("enable_chunked_prefill", False)
        self.enable_attention_dp = server_config_data.get("enable_attention_dp", False)
@@ -213,6 +229,7 @@ class ServerConfig:
        self.eagle3_layers_to_capture = []
        self.max_draft_len = speculative_config.get("max_draft_len", 0)
        self.speculative_model = speculative_config.get("speculative_model", "")
        self.eagle3_one_model = speculative_config.get("eagle3_one_model", False)

        # match_mode: "config" (default) or "scenario"
        self.match_mode = server_config_data.get("match_mode", "config")
@@ -340,6 +357,7 @@ class ServerConfig:
            "s_eagle3_layers_to_capture": ",".join(map(str, self.eagle3_layers_to_capture)),
            "l_max_draft_len": self.max_draft_len,
            "s_speculative_model_dir": self.speculative_model,
            "b_eagle3_one_model": self.eagle3_one_model,
            "s_server_log_link": "",
            "s_server_env_var": self.env_vars,
        }
@@ -366,7 +384,12 @@
class ClientConfig:
    """Configurations of benchmark client."""

    def __init__(self, client_config_data: dict, model_name: str, env_vars: str = ""):
    def __init__(
        self,
        client_config_data: dict,
        model_name: str,
        env_vars: str = "",
    ):
        self.model_name = model_name
        self.concurrency = client_config_data.get("concurrency", 1)
        self.iterations = client_config_data.get("iterations", 1)
@@ -378,6 +401,7 @@ class ClientConfig:
        self.streaming = client_config_data.get("streaming", True)
        self.trust_remote_code = client_config_data.get("trust_remote_code", True)
        self.model_path = ""
        self.dataset_file = client_config_data.get("dataset_file", "")
        self.env_vars = env_vars

        # Generate default name if not provided
@@ -389,7 +413,7 @@ class ClientConfig:
        """Generate benchmark command."""
        model_dir = get_model_dir(self.model_name)
        self.model_path = model_dir if os.path.exists(model_dir) else self.model_name
        dataset_path = get_dataset_path()
        dataset_path = get_dataset_dir(self.dataset_file)
        benchmark_cmd = [
            "python",
            "-m",
@@ -398,9 +422,6 @@ class ClientConfig:
            self.model_path,
            "--tokenizer",
            self.model_path,
            "--dataset-name",
            "random",
            "--random-ids",
            "--num-prompts",
            str(self.concurrency * self.iterations),
            "--max-concurrency",
@@ -409,15 +430,27 @@ class ClientConfig:
            str(self.isl),
            "--random-output-len",
            str(self.osl),
            "--random-range-ratio",
            str(self.random_range_ratio),
            "--ignore-eos",
            "--no-test-input",
            "--percentile-metrics",
            "ttft,tpot,itl,e2el",
        ]
        if dataset_path and os.path.exists(dataset_path):
        if dataset_path:
            benchmark_cmd.append("--dataset-name")
            benchmark_cmd.append("trtllm_custom")
            benchmark_cmd.append("--dataset-path")
            benchmark_cmd.append(dataset_path)
            print_info(f"Dataset: {dataset_path} exists. Use trtllm_custom dataset for benchmark.")
        else:
            benchmark_cmd.append("--dataset-name")
            benchmark_cmd.append("random")
            benchmark_cmd.append("--random-ids")
            benchmark_cmd.append("--random-range-ratio")
            benchmark_cmd.append(str(self.random_range_ratio))
            print_info(
                f"Dataset: {dataset_path} is not provided or does not exist. "
                f"Use random dataset (random_range_ratio={self.random_range_ratio}) for benchmark."
            )
        if self.backend:
            benchmark_cmd.append("--backend")
            benchmark_cmd.append(self.backend)
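The effect of the branch above is easiest to see in the generated command line; roughly (flag spellings taken from the diff; the module name and server arguments are elided here):

    # dataset file resolved -> replay a fixed request trace
    python -m ... --model <model_path> --dataset-name trtllm_custom \
        --dataset-path datasets/perf-ci/deepseek_r1-8k1k-20480-ratio-1_for_serve.json

    # no dataset file -> synthetic random prompts
    python -m ... --model <model_path> --dataset-name random --random-ids \
        --random-range-ratio 0.2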
@@ -453,6 +486,7 @@ class ClientConfig:
            "l_isl": self.isl,
            "l_osl": self.osl,
            "d_random_range_ratio": self.random_range_ratio,
            "s_dataset_file": self.dataset_file,
            "s_backend": self.backend,
            "b_use_chat_template": self.use_chat_template,
            "b_streaming": self.streaming,
@@ -840,7 +874,7 @@ class PerfSanityTestConfig:
        if is_disagg:
            # For disagg: disagg_upload-deepseek-r1-fp4_8k1k_ctx1_gen1_dep32_bs128_eplb0_mtp0_ccb-UCX
            self.runtime = "multi_node_disagg_server"
            self.config_dir = "tests/integration/defs/perf/disagg/test_configs/disagg/perf"
            self.config_dir = DISAGG_CONFIG_FOLDER
            config_base = "-".join(labels[1:])
            self.config_file = (
                f"{config_base}.yaml" if not config_base.endswith(".yaml") else config_base
@@ -849,7 +883,7 @@ class PerfSanityTestConfig:
        else:
            # For aggr: aggr_upload-config_yml or aggr_upload-config_yml-server_config_name
            self.runtime = "aggr_server"
            self.config_dir = "tests/scripts/perf-sanity"
            self.config_dir = AGGR_CONFIG_FOLDER
            config_base = labels[1]
            self.config_file = (
                f"{config_base}.yaml"
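Concretely, with the constants above a test label is mapped to its YAML config by dropping the leading test type and re-joining the rest; a sketch of the disagg case (label borrowed from the test lists below):

    label = "disagg_upload-gb200-deepseek-r1-fp4_8k1k_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX"
    labels = label.split("-")
    config_base = "-".join(labels[1:])
    config_file = config_base if config_base.endswith(".yaml") else f"{config_base}.yaml"
    # -> tests/integration/defs/perf/disagg/test_configs/disagg/perf-sanity/
    #    gb200-deepseek-r1-fp4_8k1k_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX.yaml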
@@ -922,7 +956,9 @@ class PerfSanityTestConfig:
        client_configs = []
        for client_config_data in server_config_data["client_configs"]:
            client_config = ClientConfig(
                client_config_data, server_config_data["model_name"], client_env_var
                client_config_data,
                server_config_data["model_name"],
                env_vars=client_env_var,
            )
            client_configs.append(client_config)

@@ -1026,8 +1062,13 @@ class PerfSanityTestConfig:
            "backend": "openai",
            "use_chat_template": False,
            "streaming": benchmark.get("streaming", True),
            "dataset_file": benchmark.get("dataset_file", ""),
        }
        client_config = ClientConfig(client_config_data, model_name, client_env_var)
        client_config = ClientConfig(
            client_config_data,
            model_name,
            env_vars=client_env_var,
        )
        client_configs.append(client_config)

        self.server_client_configs = {0: client_configs}
@@ -1417,9 +1458,6 @@
AGG_TEST_TYPES = ["aggr_upload", "aggr"]
DISAGG_TEST_TYPES = ["disagg_upload", "disagg"]

AGGR_CONFIG_FOLDER = "tests/scripts/perf-sanity"
DISAGG_CONFIG_FOLDER = "tests/integration/defs/perf/disagg/test_configs/disagg/perf"


def get_server_config_names(yaml_path: str) -> List[str]:
    """Read a YAML file and return the list of server_config names."""

@@ -0,0 +1,21 @@
version: 0.0.1
l0_dgx_b200_multi_nodes_disagg_perf_sanity_ctx1_node1_gpu4_gen1_node1_gpu8:
- condition:
    ranges:
      # 2 nodes with each node has 8 GPUs
      system_gpu_count:
        gte: 16
        lte: 16
    wildcards:
      gpu:
      - '*b200*'
    terms:
      stage: post_merge
      backend: pytorch
  tests:
  - perf/test_perf_sanity.py::test_e2e[disagg_upload-b200-deepseek-r1-fp4_1k1k_ctx1_dep4_gen1_dep8_eplb0_mtp1_ccb-UCX] TIMEOUT (90)
  - perf/test_perf_sanity.py::test_e2e[disagg_upload-b200-deepseek-r1-fp4_1k1k_ctx1_dep4_gen1_dep8_eplb0_mtp3_ccb-UCX] TIMEOUT (90)
  - perf/test_perf_sanity.py::test_e2e[disagg_upload-b200-deepseek-r1-fp4_1k1k_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX] TIMEOUT (90)
  - perf/test_perf_sanity.py::test_e2e[disagg_upload-b200-deepseek-r1-fp4_8k1k_con256_ctx1_dep4_gen1_dep8_eplb0_mtp1_ccb-UCX] TIMEOUT (90)
  - perf/test_perf_sanity.py::test_e2e[disagg_upload-b200-deepseek-r1-fp4_8k1k_con1536_ctx1_dep4_gen1_dep8_eplb0_mtp1_ccb-UCX] TIMEOUT (90)
  - perf/test_perf_sanity.py::test_e2e[disagg_upload-b200-deepseek-r1-fp4_8k1k_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX] TIMEOUT (90)
@@ -1,5 +1,5 @@
version: 0.0.1
l0_gb200_multi_nodes_aggr_perf_sanity_2_nodes:
l0_gb200_multi_nodes_aggr_perf_sanity_node2_gpu8:
- condition:
    ranges:
      # 2 nodes with each node has 4 GPUs
@@ -1,16 +0,0 @@
version: 0.0.1
l0_gb200_multi_nodes_disagg_perf_sanity_3_nodes:
- condition:
    ranges:
      # 3 nodes with each node has 4 GPUs
      system_gpu_count:
        gte: 12
        lte: 12
    wildcards:
      gpu:
      - '*gb200*'
    terms:
      stage: post_merge
      backend: pytorch
  tests:
  - perf/test_perf_sanity.py::test_e2e[disagg_upload-deepseek-r1-fp4_1k1k_ctx1_gen1_dep8_bs768_eplb0_mtp0_ccb-UCX] TIMEOUT (90)
@@ -1,17 +0,0 @@
version: 0.0.1
l0_gb200_multi_nodes_disagg_perf_sanity_6_nodes:
- condition:
    ranges:
      # 6 nodes with each node has 4 GPUs
      system_gpu_count:
        gte: 24
        lte: 24
    wildcards:
      gpu:
      - '*gb200*'
    terms:
      stage: post_merge
      backend: pytorch
  tests:
  - perf/test_perf_sanity.py::test_e2e[disagg_upload-deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp3_ccb-UCX] TIMEOUT (90)
  - perf/test_perf_sanity.py::test_e2e[disagg_upload-deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp3_ccb-NIXL] TIMEOUT (90)
@@ -1,16 +0,0 @@
version: 0.0.1
l0_gb200_multi_nodes_disagg_perf_sanity_8_nodes:
- condition:
    ranges:
      # 8 nodes with each node has 4 GPUs
      system_gpu_count:
        gte: 32
        lte: 32
    wildcards:
      gpu:
      - '*gb200*'
    terms:
      stage: post_merge
      backend: pytorch
  tests:
  - perf/test_perf_sanity.py::test_e2e[disagg_upload-deepseek-r1-fp4_8k1k_ctx1_gen1_dep32_bs128_eplb0_mtp3_ccb-UCX] TIMEOUT (90)
@@ -0,0 +1,17 @@
version: 0.0.1
l0_gb200_multi_nodes_disagg_perf_sanity_ctx1_node1_gpu1_gen1_node1_gpu4:
- condition:
    ranges:
      # 1 ctx worker with each 1 node and 1 GPU
      # 1 gen worker with each 1 node and 4 GPUs
      system_gpu_count:
        gte: 8
        lte: 8
    wildcards:
      gpu:
      - '*gb200*'
    terms:
      stage: post_merge
      backend: pytorch
  tests:
  - perf/test_perf_sanity.py::test_e2e[disagg_upload-gb200-qwen3-235b-fp4_8k1k_ctx1_tp1_gen1_tep4_eplb0_mtp0_ccb-UCX] TIMEOUT (120)
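The new file names encode the worker topology directly (ctx{C}_node{X}_gpu{Y}_gen{G}_node{Z}_gpu{W}), and each list is gated on system_gpu_count. Judging from the files in this change, the gate appears to be the whole-node allocation for both sides times the GPUs per node (4 on GB200, 8 on DGX B200); a hedged sketch of that arithmetic, with a hypothetical helper:

    import re

    def required_gpus(stage: str, gpus_per_node: int) -> int:
        # Assumption: every worker is allocated whole nodes.
        m = re.search(r"ctx(\d+)_node(\d+)_gpu\d+_gen(\d+)_node(\d+)_gpu\d+", stage)
        ctx_workers, ctx_nodes, gen_workers, gen_nodes = map(int, m.groups())
        return (ctx_workers * ctx_nodes + gen_workers * gen_nodes) * gpus_per_node

    assert required_gpus("ctx1_node1_gpu1_gen1_node1_gpu4", gpus_per_node=4) == 8
    assert required_gpus("ctx1_node2_gpu8_gen1_node8_gpu32", gpus_per_node=4) == 40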
@@ -0,0 +1,17 @@
version: 0.0.1
l0_gb200_multi_nodes_disagg_perf_sanity_ctx1_node1_gpu1_gen1_node2_gpu8:
- condition:
    ranges:
      # 1 ctx worker with each 1 node and 1 GPU
      # 1 gen worker with each 2 nodes and 8 GPUs
      system_gpu_count:
        gte: 12
        lte: 12
    wildcards:
      gpu:
      - '*gb200*'
    terms:
      stage: post_merge
      backend: pytorch
  tests:
  - perf/test_perf_sanity.py::test_e2e[disagg_upload-gb200-qwen3-235b-fp4_8k1k_ctx1_tp1_gen1_dep8_eplb0_mtp0_ccb-UCX] TIMEOUT (120)
@@ -0,0 +1,19 @@
version: 0.0.1
l0_gb200_multi_nodes_disagg_perf_sanity_ctx1_node1_gpu4_gen1_node1_gpu4:
- condition:
    ranges:
      # 1 ctx worker with each 1 node and 4 GPUs
      # 1 gen worker with each 1 node and 4 GPUs
      system_gpu_count:
        gte: 8
        lte: 8
    wildcards:
      gpu:
      - '*gb200*'
    terms:
      stage: post_merge
      backend: pytorch
  tests:
  - perf/test_perf_sanity.py::test_e2e[disagg_upload-gb200-deepseek-r1-fp4_1k1k_ctx1_dep4_gen1_dep4_eplb0_mtp1_ccb-UCX] TIMEOUT (120)
  - perf/test_perf_sanity.py::test_e2e[disagg_upload-gb200-deepseek-v32-fp4_1k1k_ctx1_dep4_gen1_dep4_eplb0_mtp1_ccb-UCX] TIMEOUT (120)
  - perf/test_perf_sanity.py::test_e2e[disagg_upload-gb200-kimi-k2-thinking-fp4_1k1k_ctx1_dep4_gen1_tep4_eplb0_mtp0_ccb-UCX] TIMEOUT (120)
@@ -0,0 +1,24 @@
version: 0.0.1
l0_gb200_multi_nodes_disagg_perf_sanity_ctx1_node1_gpu4_gen1_node2_gpu8:
- condition:
    ranges:
      # 1 ctx worker with each 1 node and 4 GPUs
      # 1 gen worker with each 2 nodes and 8 GPUs
      system_gpu_count:
        gte: 12
        lte: 12
    wildcards:
      gpu:
      - '*gb200*'
    terms:
      stage: post_merge
      backend: pytorch
  tests:
  - perf/test_perf_sanity.py::test_e2e[disagg_upload-gb200-deepseek-r1-fp4_1k1k_ctx1_dep4_gen1_dep8_eplb0_mtp0_ccb-UCX] TIMEOUT (120)
  - perf/test_perf_sanity.py::test_e2e[disagg_upload-gb200-deepseek-r1-fp4_1k1k_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX] TIMEOUT (120)
  - perf/test_perf_sanity.py::test_e2e[disagg_upload-gb200-deepseek-r1-fp4_8k1k_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX] TIMEOUT (120)
  # - perf/test_perf_sanity.py::test_e2e[disagg_upload-gb200-deepseek-v32-fp4_1k1k_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX] TIMEOUT (120)
  # - perf/test_perf_sanity.py::test_e2e[disagg_upload-gb200-deepseek-v32-fp4_32k4k_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX] TIMEOUT (120)
  # - perf/test_perf_sanity.py::test_e2e[disagg_upload-gb200-deepseek-v32-fp4_8k1k_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX] TIMEOUT (120)
  - perf/test_perf_sanity.py::test_e2e[disagg_upload-gb200-kimi-k2-thinking-fp4_1k1k_ctx1_dep4_gen1_dep8_eplb0_mtp0_ccb-UCX] TIMEOUT (120)
  - perf/test_perf_sanity.py::test_e2e[disagg_upload-gb200-kimi-k2-thinking-fp4_8k1k_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX] TIMEOUT (120)
@@ -0,0 +1,18 @@
version: 0.0.1
l0_gb200_multi_nodes_disagg_perf_sanity_ctx1_node1_gpu4_gen1_node4_gpu16:
- condition:
    ranges:
      # 1 ctx worker with each 1 node and 4 GPUs
      # 1 gen worker with each 4 nodes and 16 GPUs
      system_gpu_count:
        gte: 20
        lte: 20
    wildcards:
      gpu:
      - '*gb200*'
    terms:
      stage: post_merge
      backend: pytorch
  tests:
  - perf/test_perf_sanity.py::test_e2e[disagg_upload-gb200-deepseek-r1-fp4_8k1k_ctx1_dep4_gen1_dep16_eplb0_mtp1_ccb-UCX] TIMEOUT (120)
  - perf/test_perf_sanity.py::test_e2e[disagg_upload-gb200-kimi-k2-thinking-fp4_8k1k_ctx1_dep4_gen1_dep16_eplb384_mtp0_ccb-UCX] TIMEOUT (120)
@@ -0,0 +1,25 @@
version: 0.0.1
l0_gb200_multi_nodes_disagg_perf_sanity_ctx1_node1_gpu4_gen1_node8_gpu32:
- condition:
    ranges:
      # 1 ctx worker with each 1 node and 4 GPUs
      # 1 gen worker with each 8 nodes and 32 GPUs
      system_gpu_count:
        gte: 36
        lte: 36
    wildcards:
      gpu:
      - '*gb200*'
    terms:
      stage: post_merge
      backend: pytorch
  tests:
  - perf/test_perf_sanity.py::test_e2e[disagg_upload-gb200-deepseek-r1-fp4_1k1k_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-UCX] TIMEOUT (120)
  - perf/test_perf_sanity.py::test_e2e[disagg_upload-gb200-deepseek-r1-fp4_8k1k_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-UCX] TIMEOUT (120)
  - perf/test_perf_sanity.py::test_e2e[disagg_upload-gb200-deepseek-v32-fp4_1k1k_ctx1_dep4_gen1_dep32_eplb256_mtp3_ccb-UCX] TIMEOUT (120)
  - perf/test_perf_sanity.py::test_e2e[disagg_upload-gb200-deepseek-v32-fp4_32k4k_ctx1_dep4_gen1_dep32_eplb288_mtp1_ccb-UCX] TIMEOUT (120)
  - perf/test_perf_sanity.py::test_e2e[disagg_upload-gb200-deepseek-v32-fp4_32k4k_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-UCX] TIMEOUT (120)
  - perf/test_perf_sanity.py::test_e2e[disagg_upload-gb200-deepseek-v32-fp4_8k1k_ctx1_dep4_gen1_dep32_eplb256_mtp0_ccb-UCX] TIMEOUT (120)
  - perf/test_perf_sanity.py::test_e2e[disagg_upload-gb200-deepseek-v32-fp4_8k1k_ctx1_dep4_gen1_dep32_eplb256_mtp3_ccb-UCX] TIMEOUT (120)
  - perf/test_perf_sanity.py::test_e2e[disagg_upload-gb200-kimi-k2-thinking-fp4_1k1k_ctx1_dep4_gen1_dep32_eplb384_mtp0_ccb-UCX] TIMEOUT (120)
  - perf/test_perf_sanity.py::test_e2e[disagg_upload-gb200-kimi-k2-thinking-fp4_8k1k_ctx1_dep4_gen1_dep32_eplb416_mtp3_ccb-UCX] TIMEOUT (120)
@@ -0,0 +1,17 @@
version: 0.0.1
l0_gb200_multi_nodes_disagg_perf_sanity_ctx1_node2_gpu8_gen1_node2_gpu8:
- condition:
    ranges:
      # 1 ctx worker with each 2 nodes and 8 GPUs
      # 1 gen worker with each 2 nodes and 8 GPUs
      system_gpu_count:
        gte: 16
        lte: 16
    wildcards:
      gpu:
      - '*gb200*'
    terms:
      stage: post_merge
      backend: pytorch
  tests:
  - perf/test_perf_sanity.py::test_e2e[disagg_upload-gb200-deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_tep8_eplb0_mtp3_ccb-UCX] TIMEOUT (120)
@@ -0,0 +1,17 @@
version: 0.0.1
l0_gb200_multi_nodes_disagg_perf_sanity_ctx1_node2_gpu8_gen1_node4_gpu16:
- condition:
    ranges:
      # 1 ctx worker with each 2 nodes and 8 GPUs
      # 1 gen worker with each 4 nodes and 16 GPUs
      system_gpu_count:
        gte: 24
        lte: 24
    wildcards:
      gpu:
      - '*gb200*'
    terms:
      stage: post_merge
      backend: pytorch
  tests:
  - perf/test_perf_sanity.py::test_e2e[disagg_upload-gb200-deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_dep16_eplb0_mtp2_ccb-UCX] TIMEOUT (120)
@@ -0,0 +1,17 @@
version: 0.0.1
l0_gb200_multi_nodes_disagg_perf_sanity_ctx1_node2_gpu8_gen1_node8_gpu32:
- condition:
    ranges:
      # 1 ctx worker with each 2 nodes and 8 GPUs
      # 1 gen worker with each 8 nodes and 32 GPUs
      system_gpu_count:
        gte: 40
        lte: 40
    wildcards:
      gpu:
      - '*gb200*'
    terms:
      stage: post_merge
      backend: pytorch
  tests:
  - perf/test_perf_sanity.py::test_e2e[disagg_upload-gb200-deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_dep32_eplb0_mtp3_ccb-UCX] TIMEOUT (120)
@@ -291,9 +291,7 @@ unittest/llmapi/test_mpi_session.py::test_llmapi_launch_multiple_tasks SKIP (htt
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[ep4-mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5819019)
unittest/_torch/thop/serial/test_moe.py::TestMoeFp4::test_gptoss_style_nvfp4[limit1-beta0-alpha1-RoutingGPTOSS-512-512-1] SKIP (https://nvbugs/5819042)
accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4_4gpus[latency_moe_trtllm_eagle3] SKIP (https://nvbugs/5819048)
perf/test_perf_sanity.py::test_e2e[aggr_upload-k2_thinking_fp4_2_nodes_grace_blackwell-k2_thinking_fp4_dep8_32k8k] SKIP (https://nvbugs/5819053)
perf/test_perf_sanity.py::test_e2e[aggr_upload-k2_thinking_fp4_2_nodes_grace_blackwell-k2_thinking_fp4_tep8_32k8k] SKIP (https://nvbugs/5819053)
perf/test_perf_sanity.py::test_e2e[aggr_upload-k2_thinking_fp4_grace_blackwell-k2_thinking_fp4_tep4_8k1k] SKIP (https://nvbugs/5820541)
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[tep4_latency_moe_cutlass-torch_compile=False] SKIP (https://nvbugs/5819021)
disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_bf16_cache_aware_balance[DeepSeek-V3-Lite-bf16] SKIP (https://nvbugs/5820576)
llmapi/test_llm_examples.py::test_llmapi_tensorrt_engine SKIP (https://nvbugs/5820553)
accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_auto_dtype[tp4-cuda_graph=False] SKIP (https://nvbugs/5820938)
@@ -317,10 +315,8 @@ accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype[mtp_
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-mtp_nextn=2-fp8kv=True-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False-sampler_async_worker=False] SKIP (https://nvbugs/5701445)
accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp4[tp4-cuda_graph=True] SKIP (https://nvbugs/5820734)
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-ep4-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/5800672)
perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_2_nodes_grace_blackwell-r1_fp4_v2_tep8_mtp3] SKIP (https://nvbugs/5819053)
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[ep4-mtp_nextn=2-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/5823284)
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-tp2pp2-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/5800672)
perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tp4_mtp3_1k8k] SKIP (https://nvbugs/5819053)
perf/test_perf_sanity.py::test_e2e[disagg_upload-deepseek-r1-fp4_1k1k_ctx1_gen1_dep8_bs768_eplb0_mtp0_ccb-UCX] SKIP (https://nvbugs/5819053)
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_bfloat16_4gpus[tp4-attn_backend=TRTLLM-torch_compile=True] SKIP (https://nvbugs/5826604)
disaggregated/test_disaggregated.py::test_disaggregated_mixed[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/5834212)
accuracy/test_llm_api_pytorch.py::TestGLM4_5Air::test_nvfp4_multi_gpus[throughput_trtllm] SKIP (https://nvbugs/5837275)
@@ -337,7 +333,6 @@ test_e2e.py::test_openai_chat_harmony SKIP (https://nvbugs/5819444)
accuracy/test_cli_flow.py::TestLlama2_7B::test_fp8 SKIP (https://nvbugs/5819452)
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[ep4-mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5800646)
accuracy/test_disaggregated_serving.py::TestQwen3_30B_A3B::test_mixed_ctx_gen_model[ctxpp2gentp2] SKIP (https://nvbugs/5748664)
perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tep4_mtp3_8k1k] SKIP (https://nvbugs/5819053)
examples/test_llama.py::test_llama_3_x_with_bf16_lora_torch[llama-3.2-1b-instruct] SKIP (https://nvbugs/5838178)
accuracy/test_llm_api_autodeploy.py::TestNemotronSuperV3::test_bf16 SKIP (https://nvbugs/5838184)
cpp/test_multi_gpu.py::test_cache_transceiver[8proc-mooncake_kvcache-90] SKIP (https://nvbugs/5838199)
@@ -362,7 +357,6 @@ full:RTXPro6000D/accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bflo
full:RTXPro6000D/accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-ep4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5846154)
full:RTXPro6000D/accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-tp2pp2-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5846154)
full:RTXPro6000D/accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-tp4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5846154)
perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_dep4_mtp1_1k8k] SKIP (https://nvbugs/5846166)
accuracy/test_disaggregated_serving.py::TestDeepSeekV32Exp::test_auto_dtype[False] SKIP (https://nvbugs/5847284)
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v1_kv_cache-dp4-trtllm-fp8] SKIP (https://nvbugs/5850183)
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v2_kv_cache-dp4-trtllm-fp8] SKIP (https://nvbugs/5850183)

@@ -38,8 +38,8 @@ server_configs:
        iterations: 10
        isl: 1024
        osl: 1024
        random_range_ratio: 0.2
        backend: "openai"
        dataset_file: datasets/perf-ci/deepseek_r1-1k1k-20480-ratio-1_for_serve.json

  # 8k1k configs - DEP8 with CUTLASS, MTP1
  - name: "r1_fp4_v2_dep8_mtp1_8k1k"
@@ -74,8 +74,8 @@ server_configs:
        iterations: 10
        isl: 8192
        osl: 1024
        random_range_ratio: 0.2
        backend: "openai"
        dataset_file: datasets/perf-ci/deepseek_r1-8k1k-20480-ratio-1_for_serve.json

  # 1k1k configs - TEP8 with TRTLLM, MTP3
  - name: "r1_fp4_v2_tep8_mtp3"
@@ -105,5 +105,5 @@ server_configs:
        iterations: 12
        isl: 1024
        osl: 1024
        random_range_ratio: 0.8
        backend: "openai"
        dataset_file: datasets/perf-ci/deepseek_r1-1k1k-20480-ratio-1_for_serve.json

@@ -31,8 +31,8 @@ server_configs:
        iterations: 10
        isl: 1024
        osl: 1024
        random_range_ratio: 0.2
        backend: "openai"
        dataset_file: datasets/perf-ci/deepseek_r1-1k1k-20480-ratio-1_for_serve.json

  # 1k1k configs - DEP8 with CUTLASS, MTP1
  - name: "r1_fp4_v2_dep8_mtp1_1k1k"
@@ -66,8 +66,8 @@ server_configs:
        iterations: 10
        isl: 1024
        osl: 1024
        random_range_ratio: 0.2
        backend: "openai"
        dataset_file: datasets/perf-ci/deepseek_r1-1k1k-20480-ratio-1_for_serve.json

  # 8k1k configs - TP4 with TRTLLM, MTP3
  - name: "r1_fp4_v2_tp4_mtp3_8k1k"
@@ -97,8 +97,8 @@ server_configs:
        iterations: 10
        isl: 8192
        osl: 1024
        random_range_ratio: 0.2
        backend: "openai"
        dataset_file: datasets/perf-ci/deepseek_r1-8k1k-20480-ratio-1_for_serve.json

  # 8k1k configs - DEP8 with CUTLASS, MTP1
  - name: "r1_fp4_v2_dep8_mtp1_8k1k"
@@ -132,5 +132,5 @@ server_configs:
        iterations: 10
        isl: 8192
        osl: 1024
        random_range_ratio: 0.2
        backend: "openai"
        dataset_file: datasets/perf-ci/deepseek_r1-8k1k-20480-ratio-1_for_serve.json

@@ -31,20 +31,13 @@ server_configs:
      decoding_type: 'MTP'
      num_nextn_predict_layers: 1
    client_configs:
      - name: "con2048_iter5_1k1k"
        concurrency: 2048
        iterations: 5
        isl: 1024
        osl: 1024
        random_range_ratio: 0.8
        backend: "openai"
      - name: "con1024_iter10_1k1k"
        concurrency: 1024
        iterations: 10
        isl: 1024
        osl: 1024
        random_range_ratio: 0.2
        backend: "openai"
        dataset_file: datasets/perf-ci/deepseek_r1-1k1k-20480-ratio-1_for_serve.json

  # 1k1k configs - TEP4 with TRTLLM, MTP3
  - name: "r1_fp4_v2_tep4_mtp3_1k1k"
@@ -74,8 +67,8 @@ server_configs:
        iterations: 10
        isl: 1024
        osl: 1024
        random_range_ratio: 0.8
        backend: "openai"
        dataset_file: datasets/perf-ci/deepseek_r1-1k1k-20480-ratio-1_for_serve.json

  # 1k1k configs - TP4 with TRTLLM, MTP3
  - name: "r1_fp4_v2_tp4_mtp3_1k1k"
@@ -100,20 +93,13 @@ server_configs:
      decoding_type: 'MTP'
      num_nextn_predict_layers: 3
    client_configs:
      - name: "con4_iter10_1k1k"
        concurrency: 4
        iterations: 10
        isl: 1024
        osl: 1024
        random_range_ratio: 0.8
        backend: "openai"
      - name: "con2_iter10_1k1k"
        concurrency: 2
        iterations: 10
        isl: 1024
        osl: 1024
        random_range_ratio: 0.2
        backend: "openai"
        dataset_file: datasets/perf-ci/deepseek_r1-1k1k-20480-ratio-1_for_serve.json

  # 8k1k configs - DEP4 with CUTLASS, MTP1
  - name: "r1_fp4_v2_dep4_mtp1_8k1k"
@@ -142,20 +128,13 @@ server_configs:
      decoding_type: 'MTP'
      num_nextn_predict_layers: 1
    client_configs:
      - name: "con2048_iter5_8k1k"
        concurrency: 2048
        iterations: 5
        isl: 8192
        osl: 1024
        random_range_ratio: 0.2
        backend: "openai"
      - name: "con256_iter10_8k1k"
        concurrency: 256
        iterations: 10
        isl: 8192
        osl: 1024
        random_range_ratio: 0.2
        backend: "openai"
        dataset_file: datasets/perf-ci/deepseek_r1-8k1k-20480-ratio-1_for_serve.json

  # 8k1k configs - TEP4 with TRTLLM, MTP3
  - name: "r1_fp4_v2_tep4_mtp3_8k1k"
@@ -185,8 +164,8 @@ server_configs:
        iterations: 10
        isl: 8192
        osl: 1024
        random_range_ratio: 0.2
        backend: "openai"
        dataset_file: datasets/perf-ci/deepseek_r1-8k1k-20480-ratio-1_for_serve.json

  # 8k1k configs - TP4 with TRTLLM, MTP3
  - name: "r1_fp4_v2_tp4_mtp3_8k1k"
@@ -211,20 +190,13 @@ server_configs:
      decoding_type: 'MTP'
      num_nextn_predict_layers: 3
    client_configs:
      - name: "con4_iter10_8k1k"
        concurrency: 4
        iterations: 10
        isl: 8192
        osl: 1024
        random_range_ratio: 0.2
        backend: "openai"
      - name: "con2_iter10_8k1k"
        concurrency: 2
        iterations: 10
        isl: 8192
        osl: 1024
        random_range_ratio: 0.2
        backend: "openai"
        dataset_file: datasets/perf-ci/deepseek_r1-8k1k-20480-ratio-1_for_serve.json

  # 1k8k configs - DEP4 with CUTLASS, MTP1
  - name: "r1_fp4_v2_dep4_mtp1_1k8k"
@@ -258,8 +230,8 @@ server_configs:
        iterations: 5
        isl: 1024
        osl: 8192
        random_range_ratio: 0.8
        backend: "openai"
        dataset_file: datasets/perf-ci/deepseek_r1-1k8k-20480-ratio-1_for_serve.json

  # 1k8k configs - TEP4 with TRTLLM, MTP3
  - name: "r1_fp4_v2_tep4_mtp3_1k8k"
@@ -289,8 +261,8 @@ server_configs:
        iterations: 10
        isl: 1024
        osl: 8192
        random_range_ratio: 0.8
        backend: "openai"
        dataset_file: datasets/perf-ci/deepseek_r1-1k8k-20480-ratio-1_for_serve.json

  # 1k8k configs - TP4 with TRTLLM, MTP3
  - name: "r1_fp4_v2_tp4_mtp3_1k8k"
@@ -320,5 +292,5 @@ server_configs:
        iterations: 10
        isl: 1024
        osl: 8192
        random_range_ratio: 0.8
        backend: "openai"
        dataset_file: datasets/perf-ci/deepseek_r1-1k8k-20480-ratio-1_for_serve.json

@@ -31,8 +31,8 @@ server_configs:
        iterations: 10
        isl: 1024
        osl: 1024
        random_range_ratio: 0.2
        backend: "openai"
        dataset_file: datasets/perf-ci/deepseek_r1-1k1k-20480-ratio-1_for_serve.json

  # 1k1k configs - DEP8 with DEEPGEMM, MTP1
  - name: "r1_fp8_dep8_mtp1_1k1k"
@@ -66,8 +66,8 @@ server_configs:
        iterations: 10
        isl: 1024
        osl: 1024
        random_range_ratio: 0.2
        backend: "openai"
        dataset_file: datasets/perf-ci/deepseek_r1-1k1k-20480-ratio-1_for_serve.json

  # 8k1k configs - TP8 with TRTLLM, MTP3
  - name: "r1_fp8_tp8_mtp3_8k1k"
@@ -97,8 +97,8 @@ server_configs:
        iterations: 10
        isl: 8192
        osl: 1024
        random_range_ratio: 0.2
        backend: "openai"
        dataset_file: datasets/perf-ci/deepseek_r1-8k1k-20480-ratio-1_for_serve.json

  # 8k1k configs - DEP8 with DEEPGEMM, MTP1
  - name: "r1_fp8_dep8_mtp1_8k1k"
@@ -132,5 +132,5 @@ server_configs:
        iterations: 10
        isl: 8192
        osl: 1024
        random_range_ratio: 0.2
        backend: "openai"
        dataset_file: datasets/perf-ci/deepseek_r1-8k1k-20480-ratio-1_for_serve.json

@@ -31,8 +31,8 @@ server_configs:
        iterations: 10
        isl: 8192
        osl: 1024
        random_range_ratio: 0.2
        backend: "openai"
        dataset_file: datasets/perf-ci/deepseek_v32-8k1k-20480-ratio-1_for_serve.json

  # 8k1k configs - DEP8 with CUTLASS, MTP1
  - name: "v32_fp4_dep8_mtp1_8k1k"
@@ -66,5 +66,5 @@ server_configs:
        iterations: 10
        isl: 8192
        osl: 1024
        random_range_ratio: 0.2
        backend: "openai"
        dataset_file: datasets/perf-ci/deepseek_v32-8k1k-20480-ratio-1_for_serve.json

@@ -31,8 +31,8 @@ server_configs:
        iterations: 10
        isl: 1024
        osl: 1024
        random_range_ratio: 0.2
        backend: "openai"
        dataset_file: datasets/perf-ci/deepseek_v32-1k1k-20480-ratio-1_for_serve.json

  # 1k1k configs - DEP4 with CUTLASS, MTP1
  - name: "v32_fp4_dep4_mtp1_1k1k"
@@ -66,8 +66,8 @@ server_configs:
        iterations: 10
        isl: 1024
        osl: 1024
        random_range_ratio: 0.2
        backend: "openai"
        dataset_file: datasets/perf-ci/deepseek_v32-1k1k-20480-ratio-1_for_serve.json

  # 8k1k configs - TEP4 with TRTLLM, MTP3
  - name: "v32_fp4_tep4_mtp3_8k1k"
@@ -97,8 +97,8 @@ server_configs:
        iterations: 10
        isl: 8192
        osl: 1024
        random_range_ratio: 0.2
        backend: "openai"
        dataset_file: datasets/perf-ci/deepseek_v32-8k1k-20480-ratio-1_for_serve.json

  # 8k1k configs - DEP4 with CUTLASS, MTP1
  - name: "v32_fp4_dep4_mtp1_8k1k"
@@ -132,5 +132,5 @@ server_configs:
        iterations: 10
        isl: 8192
        osl: 1024
        random_range_ratio: 0.2
        backend: "openai"
        dataset_file: datasets/perf-ci/deepseek_v32-8k1k-20480-ratio-1_for_serve.json

@@ -32,7 +32,7 @@ server_configs:
        iterations: 5
        isl: 1024
        osl: 8192
        random_range_ratio: 0.8
        random_range_ratio: 0.0
        backend: "openai"

  - name: "gpt_oss_fp4_dep2_1k1k"
@@ -63,7 +63,7 @@ server_configs:
        iterations: 5
        isl: 1024
        osl: 1024
        random_range_ratio: 0.8
        random_range_ratio: 0.0
        backend: "openai"

  - name: "gpt_oss_fp4_tep2_1k8k"
@@ -92,7 +92,7 @@ server_configs:
        iterations: 10
        isl: 1024
        osl: 8192
        random_range_ratio: 0.8
        random_range_ratio: 0.0
        backend: "openai"

  - name: "gpt_oss_fp4_tp2_1k8k"
@@ -121,7 +121,7 @@ server_configs:
        iterations: 10
        isl: 1024
        osl: 8192
        random_range_ratio: 0.8
        random_range_ratio: 0.0
        backend: "openai"

  - name: "gpt_oss_fp4_tp4_eagle3_1k1k"
@@ -155,5 +155,5 @@ server_configs:
        iterations: 32
        isl: 1024
        osl: 1024
        random_range_ratio: 0.8
        random_range_ratio: 0.0
        backend: "openai"

@@ -32,9 +32,9 @@ server_configs:
        iterations: 10
        isl: 32768
        osl: 8192
        random_range_ratio: 0.2
        backend: "openai"
        trust_remote_code: true
        dataset_file: datasets/perf-ci/k2_thinking-32k8k-20480-ratio-1_for_serve.json

  # 32k8k configs - DEP8 with CUTLASS
  - name: "k2_thinking_fp4_dep8_32k8k"
@@ -67,6 +67,6 @@ server_configs:
        iterations: 10
        isl: 32768
        osl: 8192
        random_range_ratio: 0.2
        backend: "openai"
        trust_remote_code: true
        dataset_file: datasets/perf-ci/k2_thinking-32k8k-20480-ratio-1_for_serve.json

@@ -29,9 +29,9 @@ server_configs:
        iterations: 10
        isl: 8192
        osl: 1024
        random_range_ratio: 0.2
        backend: "openai"
        trust_remote_code: true
        dataset_file: datasets/perf-ci/k2_thinking-8k1k-20480-ratio-1_for_serve.json

  # 8k1k configs - DEP8 with CUTLASS
  - name: "k2_thinking_fp4_dep8_8k1k"
@@ -63,9 +63,9 @@ server_configs:
        iterations: 10
        isl: 8192
        osl: 1024
        random_range_ratio: 0.2
        backend: "openai"
        trust_remote_code: true
        dataset_file: datasets/perf-ci/k2_thinking-8k1k-20480-ratio-1_for_serve.json

  # 32k8k configs - TEP8 with TRTLLM
  - name: "k2_thinking_fp4_tep8_32k8k"
@@ -94,9 +94,9 @@ server_configs:
        iterations: 10
        isl: 32768
        osl: 8192
        random_range_ratio: 0.2
        backend: "openai"
        trust_remote_code: true
        dataset_file: datasets/perf-ci/k2_thinking-32k8k-20480-ratio-1_for_serve.json

  # 32k8k configs - DEP8 with CUTLASS
  - name: "k2_thinking_fp4_dep8_32k8k"
@@ -129,6 +129,6 @@ server_configs:
        iterations: 10
        isl: 32768
        osl: 8192
        random_range_ratio: 0.2
        backend: "openai"
        trust_remote_code: true
        dataset_file: datasets/perf-ci/k2_thinking-32k8k-20480-ratio-1_for_serve.json

@@ -29,9 +29,9 @@ server_configs:
        iterations: 10
        isl: 8192
        osl: 1024
        random_range_ratio: 0.2
        backend: "openai"
        trust_remote_code: true
        dataset_file: datasets/perf-ci/k2_thinking-8k1k-20480-ratio-1_for_serve.json

  # 8k1k configs - DEP4 with CUTLASS
  - name: "k2_thinking_fp4_dep4_8k1k"
@@ -63,6 +63,6 @@ server_configs:
        iterations: 10
        isl: 8192
        osl: 1024
        random_range_ratio: 0.2
        backend: "openai"
        trust_remote_code: true
        dataset_file: datasets/perf-ci/k2_thinking-8k1k-20480-ratio-1_for_serve.json
