[TRTLLM-8263][feat] Add Disagg Perf Tests (#10912)

Signed-off-by: Chenfei Zhang <chenfeiz@nvidia.com>
This commit is contained in:
chenfeiz0326 2026-02-04 10:16:11 +08:00 committed by GitHub
parent 588db0ed64
commit 04b7db3ab5
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
65 changed files with 3779 additions and 227 deletions

View File

@ -918,7 +918,7 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
// Create a unique suffix for the job name
String customSuffix = "${env.BUILD_TAG}-${UUID.randomUUID().toString().replaceAll("-", "").substring(0, 6)}".toLowerCase()
def jobUID = "${cluster.host}-multi_node_test-${customSuffix}"
def disaggMode = stageName.contains("PerfSanity-Disagg")
def disaggMode = stageName.contains("Disagg-PerfSanity")
Utils.exec(pipeline, script: "env | sort && pwd && ls -alh")
@ -3151,6 +3151,15 @@ def runInKubernetes(pipeline, podSpec, containerName)
}
}
def buildStageConfigs(stageName, platform, testlist, testCount, gpuCount, nodeCount, runWithSbatch=false) {
def configs = [:]
for (int k = 1; k <= testCount; k++) {
def key = "${stageName}-${k}"
configs[key] = [platform, testlist, k, testCount, gpuCount, nodeCount, runWithSbatch]
}
return configs
}
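// For illustration, a call such as
//   buildStageConfigs("GB200-8_GPUs-2_Nodes-PyTorch-Aggr-PerfSanity-Node2-GPU8-Post-Merge",
//                     "auto:gb200-flex", "l0_gb200_multi_nodes_aggr_perf_sanity_node2_gpu8", 5, 8, 2)
// produces the keys "...-Post-Merge-1" through "...-Post-Merge-5", each mapped to
// ["auto:gb200-flex", "l0_gb200_multi_nodes_aggr_perf_sanity_node2_gpu8", k, 5, 8, 2],
// i.e. one stage entry per test-list split.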
def launchTestJobs(pipeline, testFilter)
{
// IMPORTANT: Stage Configuration Syntax Requirement
@ -3354,18 +3363,57 @@ def launchTestJobs(pipeline, testFilter)
"GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-1": ["auto:gb200-flex", "l0_gb200_multi_nodes", 1, 3, 8, 2],
"GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-2": ["auto:gb200-flex", "l0_gb200_multi_nodes", 2, 3, 8, 2],
"GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-3": ["auto:gb200-flex", "l0_gb200_multi_nodes", 3, 3, 8, 2],
// PerfSanity post-merge tests
"GB200-8_GPUs-2_Nodes-PyTorch-PerfSanity-Post-Merge-1": ["auto:gb200-flex", "l0_gb200_multi_nodes_aggr_perf_sanity_2_nodes", 1, 5, 8, 2],
"GB200-8_GPUs-2_Nodes-PyTorch-PerfSanity-Post-Merge-2": ["auto:gb200-flex", "l0_gb200_multi_nodes_aggr_perf_sanity_2_nodes", 2, 5, 8, 2],
"GB200-8_GPUs-2_Nodes-PyTorch-PerfSanity-Post-Merge-3": ["auto:gb200-flex", "l0_gb200_multi_nodes_aggr_perf_sanity_2_nodes", 3, 5, 8, 2],
"GB200-8_GPUs-2_Nodes-PyTorch-PerfSanity-Post-Merge-4": ["auto:gb200-flex", "l0_gb200_multi_nodes_aggr_perf_sanity_2_nodes", 4, 5, 8, 2],
"GB200-8_GPUs-2_Nodes-PyTorch-PerfSanity-Post-Merge-5": ["auto:gb200-flex", "l0_gb200_multi_nodes_aggr_perf_sanity_2_nodes", 5, 5, 8, 2],
// Disable stage 'GB200-12_GPUs-3_Nodes-PyTorch-PerfSanity-Disagg-Post-Merge-1' due to https://nvbugs/5819053
// "GB200-12_GPUs-3_Nodes-PyTorch-PerfSanity-Disagg-Post-Merge-1": ["auto:gb200-flex", "l0_gb200_multi_nodes_disagg_perf_sanity_3_nodes", 1, 1, 12, 3],
// "GB200-24_GPUs-6_Nodes-PyTorch-PerfSanity-Disagg-Post-Merge-1": ["auto:gb200-flex", "l0_gb200_multi_nodes_disagg_perf_sanity_6_nodes", 1, 2, 24, 6],
// "GB200-24_GPUs-6_Nodes-PyTorch-PerfSanity-Disagg-Post-Merge-2": ["auto:gb200-flex", "l0_gb200_multi_nodes_disagg_perf_sanity_6_nodes", 2, 2, 24, 6],
// "GB200-32_GPUs-8_Nodes-PyTorch-PerfSanity-Disagg-Post-Merge-1": ["auto:gb200-flex", "l0_gb200_multi_nodes_disagg_perf_sanity_8_nodes", 1, 1, 32, 8],
]
// PerfSanity post-merge aggr tests
multiNodesSBSAConfigs += buildStageConfigs(
"GB200-8_GPUs-2_Nodes-PyTorch-Aggr-PerfSanity-Node2-GPU8-Post-Merge",
"auto:gb200-flex",
"l0_gb200_multi_nodes_aggr_perf_sanity_node2_gpu8",
5,
8,
2
)
// PerfSanity post-merge disagg tests
multiNodesSBSAConfigs += buildStageConfigs(
"GB200-8_GPUs-2_Nodes-PyTorch-Disagg-PerfSanity-CTX1-NODE1-GPU1-GEN1-NODE1-GPU4-Post-Merge",
"auto:gb200-flex",
"l0_gb200_multi_nodes_disagg_perf_sanity_ctx1_node1_gpu1_gen1_node1_gpu4",
1,
8,
2
)
multiNodesSBSAConfigs += buildStageConfigs(
"GB200-8_GPUs-2_Nodes-PyTorch-Disagg-PerfSanity-CTX1-NODE1-GPU4-GEN1-NODE1-GPU4-Post-Merge",
"auto:gb200-flex",
"l0_gb200_multi_nodes_disagg_perf_sanity_ctx1_node1_gpu4_gen1_node1_gpu4",
3,
8,
2
)
multiNodesSBSAConfigs += buildStageConfigs(
"GB200-12_GPUs-3_Nodes-PyTorch-Disagg-PerfSanity-CTX1-NODE1-GPU1-GEN1-NODE2-GPU8-Post-Merge",
"auto:gb200-flex",
"l0_gb200_multi_nodes_disagg_perf_sanity_ctx1_node1_gpu1_gen1_node2_gpu8",
1,
12,
3
)
multiNodesSBSAConfigs += buildStageConfigs(
"GB200-12_GPUs-3_Nodes-PyTorch-Disagg-PerfSanity-CTX1-NODE1-GPU4-GEN1-NODE2-GPU8-Post-Merge",
"auto:gb200-flex",
"l0_gb200_multi_nodes_disagg_perf_sanity_ctx1_node1_gpu4_gen1_node2_gpu8",
5,
12,
3
)
multiNodesSBSAConfigs += buildStageConfigs(
"GB200-16_GPUs-4_Nodes-PyTorch-Disagg-PerfSanity-CTX1-NODE2-GPU8-GEN1-NODE2-GPU8-Post-Merge",
"auto:gb200-flex",
"l0_gb200_multi_nodes_disagg_perf_sanity_ctx1_node2_gpu8_gen1_node2_gpu8",
1,
16,
4
)
fullSet += multiNodesSBSAConfigs.keySet()
if (env.targetArch == AARCH64_TRIPLE) {
@ -3610,9 +3658,9 @@ def launchTestJobs(pipeline, testFilter)
}, {}, true)
}]}
multiGpuJobs = parallelJobs.findAll{(it.key.contains("2_GPUs") || it.key.contains("4_GPUs") || it.key.contains("8_GPUs")) && !it.key.contains("Post-Merge")}
multiGpuJobs = parallelJobs.findAll{(it.key =~ /\d+_GPUs/) && !it.key.contains("Post-Merge")}
println multiGpuJobs.keySet()
multiGpuJobsPostMerge = parallelJobs.findAll{(it.key.contains("2_GPUs") || it.key.contains("4_GPUs") || it.key.contains("8_GPUs")) && it.key.contains("Post-Merge")}
multiGpuJobsPostMerge = parallelJobs.findAll{(it.key =~ /\d+_GPUs/) && it.key.contains("Post-Merge")}
parallelJobs += docBuildJobs
parallelJobs += sanityCheckJobs
@ -3927,9 +3975,9 @@ pipeline {
def testPhase2StageName = env.testPhase2StageName
if (testPhase2StageName) {
def dgxSigns = ["2_GPUs", "4_GPUs", "8_GPUs"]
singleGpuJobs = parallelJobs.findAll{!dgxSigns.any{sign -> it.key.contains(sign)}}
dgxJobs = parallelJobs.findAll{dgxSigns.any{sign -> it.key.contains(sign)}}
def multiGpuPattern = /\d+_GPUs/
singleGpuJobs = parallelJobs.findAll{!(it.key =~ multiGpuPattern)}
dgxJobs = parallelJobs.findAll{it.key =~ multiGpuPattern}
}
if (env.JOB_NAME ==~ /.*Single-GPU.*/) {

View File

@ -19,13 +19,13 @@ echo "Installation completed on all nodes"
# Start gen servers
echo "Starting gen servers..."
for i in $(seq 0 $((numGenServers - 1))); do
gen_world_size=$((nodesPerGenServer * gpusPerNode))
gen_world_size=$((nodesPerGenServer * gpusPerfNodePerfGenServer))
export DISAGG_SERVING_TYPE="GEN_$i"
export pytestCommand="$pytestCommandWorker"
srun "${srunArgs[@]}" --kill-on-bad-exit=1 \
-N $nodesPerGenServer \
--ntasks=$gen_world_size \
--ntasks-per-node=$gpusPerNode \
--ntasks-per-node=$gpusPerfNodePerfGenServer \
$runScript &> $jobWorkspace/gen_server_$i.log &
echo "Started gen server $i"
done
@ -34,13 +34,13 @@ done
if [ "${TRTLLM_DISAGG_BENCHMARK_GEN_ONLY:-0}" != "1" ]; then
echo "Starting ctx servers..."
for i in $(seq 0 $((numCtxServers - 1))); do
ctx_world_size=$((nodesPerCtxServer * gpusPerNode))
ctx_world_size=$((nodesPerCtxServer * gpusPerfNodePerfCtxServer))
export DISAGG_SERVING_TYPE="CTX_$i"
export pytestCommand="$pytestCommandWorker"
srun "${srunArgs[@]}" --kill-on-bad-exit=1 \
-N $nodesPerCtxServer \
--ntasks=$ctx_world_size \
--ntasks-per-node=$gpusPerNode \
--ntasks=$ctx_world_size \
--ntasks-per-node=$gpusPerfNodePerfCtxServer \
$runScript &> $jobWorkspace/ctx_server_$i.log &
echo "Started ctx server $i"
done

View File

@ -38,6 +38,9 @@ def get_hardware_config(config, benchmark_mode):
nodes_per_ctx_server = (gpus_per_ctx_server + gpus_per_node - 1) // gpus_per_node
nodes_per_gen_server = (gpus_per_gen_server + gpus_per_node - 1) // gpus_per_node
gpus_per_node_per_ctx_server = min(gpus_per_ctx_server, gpus_per_node)
gpus_per_node_per_gen_server = min(gpus_per_gen_server, gpus_per_node)
total_nodes = num_ctx_servers * nodes_per_ctx_server + num_gen_servers * nodes_per_gen_server
total_gpus = total_nodes * gpus_per_node
@ -49,6 +52,8 @@ def get_hardware_config(config, benchmark_mode):
"gpus_per_gen_server": gpus_per_gen_server,
"nodes_per_ctx_server": nodes_per_ctx_server,
"nodes_per_gen_server": nodes_per_gen_server,
"gpus_per_node_per_ctx_server": gpus_per_node_per_ctx_server,
"gpus_per_node_per_gen_server": gpus_per_node_per_gen_server,
"total_nodes": total_nodes,
"total_gpus": total_gpus,
}
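A quick sketch of the node arithmetic above, using the 12-GPU / 3-node disaggregated layout from the pipeline stages as a worked example (values assumed for illustration):
# Worked example of get_hardware_config's node math (illustrative values).
gpus_per_node = 4
gpus_per_ctx_server = 4
gpus_per_gen_server = 8
nodes_per_ctx_server = (gpus_per_ctx_server + gpus_per_node - 1) // gpus_per_node   # ceil(4/4) = 1
nodes_per_gen_server = (gpus_per_gen_server + gpus_per_node - 1) // gpus_per_node   # ceil(8/4) = 2
gpus_per_node_per_ctx_server = min(gpus_per_ctx_server, gpus_per_node)              # 4
gpus_per_node_per_gen_server = min(gpus_per_gen_server, gpus_per_node)              # 4
# With num_ctx_servers = num_gen_servers = 1: total_nodes = 1*1 + 1*2 = 3 and total_gpus = 3*4 = 12,
# matching the CTX1-NODE1-GPU4-GEN1-NODE2-GPU8 stage (12 GPUs, 3 nodes).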
@ -102,7 +107,14 @@ def remove_whitespace_lines(lines):
return [line.strip() for line in lines if line.strip()]
def get_pytest_command_no_llmapilaunch(script_prefix_lines):
def get_pytest_commands(script_prefix_lines):
# Derive the worker, disagg_server, and benchmark pytest commands from the original pytest command.
# The worker command keeps trtllm-llmapi-launch but drops the --csv, --cov, and --periodic flags.
# The disagg_server command drops trtllm-llmapi-launch as well as the --csv, --cov, and --periodic flags.
# The benchmark command drops trtllm-llmapi-launch but keeps the --csv, --cov, and --periodic flags.
pytest_command_line = None
for line in script_prefix_lines:
if "export pytestCommand=" in line:
@ -110,17 +122,102 @@ def get_pytest_command_no_llmapilaunch(script_prefix_lines):
break
if not pytest_command_line:
return ""
return "", "", ""
# Replace pytestCommand with pytestCommandNoLLMAPILaunch
replaced_line = pytest_command_line.replace("pytestCommand", "pytestCommandNoLLMAPILaunch")
def split_pytest_command_line(command_line):
# After pytest, there are six types of substrings:
# Type 1: --xxx=yyy (long option with value, self-contained)
# Type 2: --xxx= (long option with empty value, self-contained)
# Type 3: --xxx (long option flag, no value)
# Type 4: --xxx yyy (long option with value as next arg)
# Type 5: -x yyy (short single-letter option with value as next arg)
# Type 6: -x (short option flag, e.g., -v, -vv)
parts = command_line.split()
pytest_index = None
for idx, part in enumerate(parts):
if "pytest" == part:
pytest_index = idx
break
if pytest_index is None:
return parts
# Split by space, find and remove the substring with trtllm-llmapi-launch
replaced_line_parts = replaced_line.split()
replaced_line_parts_no_llmapi = [
part for part in replaced_line_parts if "trtllm-llmapi-launch" not in part
grouped_parts = parts[: pytest_index + 1]
i = pytest_index + 1
while i < len(parts):
part = parts[i]
has_next = i + 1 < len(parts)
next_is_value = has_next and not parts[i + 1].startswith("-")
# Type 1 & 2: --xxx=yyy or --xxx= (self-contained, has '=')
if part.startswith("--") and "=" in part:
grouped_parts.append(part)
i += 1
continue
# Type 4: --xxx yyy (long option with value as next arg)
if part.startswith("--") and next_is_value:
grouped_parts.append(f"{part} {parts[i + 1]}")
i += 2
continue
# Type 3: --xxx (long option flag)
if part.startswith("--"):
grouped_parts.append(part)
i += 1
continue
# Type 5: -x yyy (short single-letter option with value as next arg)
# Only single letter after dash, e.g., -o, not -vv
if part.startswith("-") and len(part) == 2 and next_is_value:
grouped_parts.append(f"{part} {parts[i + 1]}")
i += 2
continue
# Type 6: -x (short option flag, including combined like -vv)
if part.startswith("-"):
grouped_parts.append(part)
i += 1
continue
# Other parts (shouldn't happen after pytest, but handle gracefully)
grouped_parts.append(part)
i += 1
return grouped_parts
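# For illustration (hypothetical flags, not the real pipeline command), splitting
#   "trtllm-llmapi-launch pytest -v --csv report.csv --cov=tensorrt_llm --timeout 3600 tests/perf"
# groups the arguments after "pytest" as
#   ["trtllm-llmapi-launch", "pytest", "-v", "--csv report.csv", "--cov=tensorrt_llm",
#    "--timeout 3600", "tests/perf"],
# so an option and its separate value stay together and can be kept or dropped as one unit.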
def is_llmapi_launch(part):
return "trtllm-llmapi-launch" in part
def is_output_file_part(part):
return any(flag in part for flag in ("--csv", "--cov", "--periodic"))
worker_line = pytest_command_line.replace("pytestCommand", "partialPytestCommandWorker")
worker_parts = [
part for part in split_pytest_command_line(worker_line) if not is_output_file_part(part)
]
return " ".join(replaced_line_parts_no_llmapi)
worker_pytest_command = " ".join(worker_parts)
disagg_server_line = pytest_command_line.replace(
"pytestCommand", "partialPytestCommandDisaggServer"
)
disagg_server_parts = [
part
for part in split_pytest_command_line(disagg_server_line)
if not is_llmapi_launch(part) and not is_output_file_part(part)
]
disagg_server_pytest_command = " ".join(disagg_server_parts)
benchmark_line = pytest_command_line.replace("pytestCommand", "partialPytestCommandBenchmark")
benchmark_parts = [
part for part in split_pytest_command_line(benchmark_line) if not is_llmapi_launch(part)
]
benchmark_pytest_command = " ".join(benchmark_parts)
return (
worker_pytest_command,
disagg_server_pytest_command,
benchmark_pytest_command,
)
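# For illustration (hypothetical flags, not the real pipeline command): if pytestCommand invokes
#   trtllm-llmapi-launch pytest --csv=report.csv -v tests/perf
# then the derived commands behave as follows:
#   - partialPytestCommandWorker keeps trtllm-llmapi-launch and -v but drops --csv=report.csv;
#   - partialPytestCommandDisaggServer drops both trtllm-llmapi-launch and --csv=report.csv;
#   - partialPytestCommandBenchmark drops trtllm-llmapi-launch but keeps --csv=report.csv.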
def get_config_yaml(test_list_path, llm_src):
@ -153,7 +250,7 @@ def get_config_yaml(test_list_path, llm_src):
"disagg",
"test_configs",
"disagg",
"perf",
"perf-sanity",
f"{config_base_name}.yaml",
)
if not os.path.exists(config_yaml_path):
@ -225,8 +322,12 @@ def main():
srun_args_lines = srun_args_content.split()
# Extract pytestCommand and generate pytestCommandNoLLMAPILaunch
pytest_command_no_llmapi_launch = get_pytest_command_no_llmapilaunch(script_prefix_lines)
# Extract pytestCommand and generate partial pytest commands
(
worker_pytest_command,
disagg_server_pytest_command,
benchmark_pytest_command,
) = get_pytest_commands(script_prefix_lines)
# Build worker env vars, add extra env vars for gen_only mode
worker_env_vars = env_config["worker_env_var"]
@ -244,12 +345,15 @@ def main():
script_prefix_lines.extend(
[
pytest_command_no_llmapi_launch,
f'export pytestCommandWorker="unset UCX_TLS && {worker_env_vars} $pytestCommand"',
f'export pytestCommandDisaggServer="{server_env_vars} $pytestCommandNoLLMAPILaunch"',
f'export pytestCommandBenchmark="{env_config["benchmark_env_var"]} $pytestCommandNoLLMAPILaunch"',
worker_pytest_command,
disagg_server_pytest_command,
benchmark_pytest_command,
f'export pytestCommandWorker="unset UCX_TLS && {worker_env_vars} $partialPytestCommandWorker"',
f'export pytestCommandDisaggServer="{server_env_vars} $partialPytestCommandDisaggServer"',
f'export pytestCommandBenchmark="{env_config["benchmark_env_var"]} $partialPytestCommandBenchmark"',
f"export runScript={args.run_sh}",
f"export installScript={install_script}",
f"export configYamlPath={config_yaml}",
f"export numCtxServers={hardware_config['num_ctx_servers']}",
f"export numGenServers={hardware_config['num_gen_servers']}",
f"export gpusPerNode={hardware_config['gpus_per_node']}",
@ -257,6 +361,8 @@ def main():
f"export gpusPerGenServer={hardware_config['gpus_per_gen_server']}",
f"export nodesPerCtxServer={hardware_config['nodes_per_ctx_server']}",
f"export nodesPerGenServer={hardware_config['nodes_per_gen_server']}",
f"export gpusPerfNodePerfCtxServer={hardware_config['gpus_per_node_per_ctx_server']}",
f"export gpusPerfNodePerfGenServer={hardware_config['gpus_per_node_per_gen_server']}",
f"export totalNodes={hardware_config['total_nodes']}",
f"export totalGpus={hardware_config['total_gpus']}",
]

View File

@ -13,8 +13,9 @@ sys.path.insert(0, sys.path[0] + "/..")
from open_search_db import OpenSearchDB
QUERY_LOOKBACK_DAYS = 90
LOOKBACK_JOBS = 30
MAX_QUERY_SIZE = 3000
MAX_TEST_CASES_PER_MSG = 5
MAX_TEST_CASES_PER_MSG = 4
POST_SLACK_MSG_RETRY_TIMES = 5
@ -99,42 +100,74 @@ def post_perf_data(data_list, project_name):
return False
def get_regression_data_by_job_id(data_list, query_job_number):
"""Returns a dict with job_id as key and list of regression data as value.
def get_regression_dict(data_list, query_job_number, lookback_job_number=LOOKBACK_JOBS):
"""Returns a dict with job_id as key and list of regression tuples as value.
Each tuple is (test_case_name, gpu_type, runtime, history_regression_job_ids, data).
Only returns the latest query_job_number jobs.
"""
if data_list is None or len(data_list) == 0:
return {}
# Group data by job_id
job_data_dict = {}
job_test_dict = {}
for data in data_list:
job_id = data.get("s_job_id", "")
if job_id == "":
raw_job_id = data.get("s_job_id", "")
if raw_job_id == "":
continue
if job_id not in job_data_dict:
job_data_dict[job_id] = []
job_data_dict[job_id].append(data)
try:
job_id = int(raw_job_id)
except (TypeError, ValueError):
continue
job_test_dict.setdefault(job_id, []).append(data)
# Sort job_ids by the latest ts_created in each group (descending)
def get_latest_timestamp(job_id):
timestamps = [d.get("ts_created", 0) for d in job_data_dict[job_id]]
return max(timestamps) if timestamps else 0
if not job_test_dict:
return {}
sorted_job_ids = sorted(job_data_dict.keys(), key=get_latest_timestamp, reverse=True)
# Sort job_ids (descending: latest -> oldest)
sorted_job_id_list = sorted(job_test_dict.keys(), reverse=True)
# Only keep the latest query_job_number jobs
latest_job_ids = sorted_job_ids[:query_job_number]
# Build (test_case_name, gpu_type, runtime) -> job_ids dict
test_job_dict = {}
for job_id, data_list in job_test_dict.items():
for data in data_list:
test_case_name = data.get("s_test_case_name") or ""
gpu_type = data.get("s_gpu_type") or ""
runtime = data.get("s_runtime") or ""
if not test_case_name or not gpu_type or not runtime:
continue
key = (test_case_name, gpu_type, runtime)
test_job_dict.setdefault(key, set()).add(job_id)
result = {}
# Sort job ids for each test case (descending: latest -> oldest)
for key, job_id_set in list(test_job_dict.items()):
test_job_dict[key] = sorted(job_id_set, reverse=True)
# Only keep the latest query_job_number jobs in the result
latest_job_ids = sorted_job_id_list[:query_job_number]
regression_dict = {}
for job_id in latest_job_ids:
result[job_id] = job_data_dict[job_id]
entries = []
for data in job_test_dict.get(job_id, []):
test_case_name = data.get("s_test_case_name") or ""
gpu_type = data.get("s_gpu_type") or ""
runtime = data.get("s_runtime") or ""
if not test_case_name or not gpu_type or not runtime:
continue
key = (test_case_name, gpu_type, runtime)
history_ids = test_job_dict.get(key, [])
lower_bound = job_id - lookback_job_number + 1
history_regression_job_ids = [
jid for jid in history_ids if lower_bound <= jid <= job_id
]
entries.append((test_case_name, gpu_type, runtime, history_regression_job_ids, data))
regression_dict[job_id] = entries
return result
return regression_dict
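# For illustration (hypothetical records, not real OpenSearch data): given
#   data_list = [
#       {"s_job_id": "102", "s_test_case_name": "t1", "s_gpu_type": "GB200", "s_runtime": "pytorch"},
#       {"s_job_id": "101", "s_test_case_name": "t1", "s_gpu_type": "GB200", "s_runtime": "pytorch"},
#   ]
# get_regression_dict(data_list, query_job_number=1) keeps only the latest job and returns
#   {102: [("t1", "GB200", "pytorch", [102, 101], <record for job 102>)]}
# because both regressions of ("t1", "GB200", "pytorch") fall inside the default 30-job lookback window.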
def process_regression_message(regression_dict):
def split_regression_message(regression_dict):
"""Process regression data into message chunks.
Returns a list of messages, each containing at most MAX_TEST_CASES_PER_MSG test cases.
@ -142,12 +175,17 @@ def process_regression_message(regression_dict):
if not regression_dict:
return []
# Flatten all test cases into a list with (job_id, idx, data) tuples
# Flatten all test cases into a list with
# (job_id, idx, test_case_name, gpu_type, runtime, history_regression_job_ids, data) tuples
all_test_cases = []
for job_id, data_list in regression_dict.items():
sorted_data_list = sorted(data_list, key=lambda x: x.get("s_test_case_name", ""))
for idx, data in enumerate(sorted_data_list, start=1):
all_test_cases.append((job_id, idx, data))
sorted_data_list = sorted(data_list, key=lambda x: x[0])
for idx, (test_case_name, gpu_type, runtime, history_regression_job_ids, data) in enumerate(
sorted_data_list, start=1
):
all_test_cases.append(
(job_id, idx, test_case_name, gpu_type, runtime, history_regression_job_ids, data)
)
# Split into chunks of MAX_TEST_CASES_PER_MSG
chunks = []
@ -159,7 +197,15 @@ def process_regression_message(regression_dict):
for chunk in chunks:
msg_parts = []
current_job_id = None
for job_id, idx, data in chunk:
for (
job_id,
idx,
test_case_name,
gpu_type,
runtime,
history_regression_job_ids,
data,
) in chunk:
# Add job header when switching to a new job_id
if job_id != current_job_id:
if msg_parts:
@ -168,12 +214,46 @@ def process_regression_message(regression_dict):
msg_parts.append(job_header)
current_job_id = job_id
test_case_name = data.get("s_test_case_name", "N/A")
regression_info = data.get("s_regression_info", "N/A")
history_text = (
", ".join(str(jid) for jid in history_regression_job_ids)
if history_regression_job_ids
else "N/A"
)
msg_parts.append(f"*REGRESSION TEST CASE {idx}: {test_case_name}*\n")
msg_parts.append(f"*GPU: {gpu_type} Mode: {runtime}*\n")
msg_parts.append(f"*History Regression Post-Merge Job IDs: {history_text}*\n")
# Parse regression_info to extract baseline info and metrics
baseline_date = "N/A"
baseline_branch = "N/A"
baseline_commit = "N/A"
for part in regression_info.split(","):
part = part.strip()
if part and "baseline_id" not in part:
if "baseline_date:" in part:
baseline_date = part.split(":", 1)[-1].strip()
elif "baseline_branch:" in part:
baseline_branch = part.split(":", 1)[-1].strip()
elif "baseline_commit:" in part:
baseline_commit = part.split(":", 1)[-1].strip()
# Get regression branch and commit from data
regression_date = data.get("ts_created", "N/A")
regression_branch = data.get("s_branch", "N/A")
regression_commit = data.get("s_commit", "N/A")
msg_parts.append(
f"*Baseline date, branch and commit: "
f"{baseline_date} {baseline_branch} {baseline_commit}*\n"
)
msg_parts.append(
f"*Regression date, branch and commit: "
f"{regression_date} {regression_branch} {regression_commit}*\n"
)
for part in regression_info.split(","):
part = part.strip()
if part and "baseline_" not in part:
msg_parts.append(f" {part}\n")
msg = "".join(msg_parts).strip()
@ -288,8 +368,8 @@ def main():
print("Failed to query regression data")
return
regression_dict = get_regression_data_by_job_id(data_list, args.query_job_number)
messages = process_regression_message(regression_dict)
regression_dict = get_regression_dict(data_list, args.query_job_number)
messages = split_regression_message(regression_dict)
send_regression_message(messages, args.channel_id, args.bot_token)
elif args.operation.strip().upper().startswith("UPDATE"):
set_values, where_values, error = parse_update_operation(args.operation)

View File

@ -58,7 +58,10 @@ cd $llmSrcNode/tests/integration/defs
trtllmWhlPath=$(pip3 show tensorrt_llm | grep Location | cut -d ' ' -f 2)
trtllmWhlPath=$(echo "$trtllmWhlPath" | sed 's/[[:space:]]+/_/g')
echo "TRTLLM WHEEL PATH: $trtllmWhlPath"
pytestCommand=$(set_value_in_command "TRTLLM_WHL_PATH" "$trtllmWhlPath" "$pytestCommand")
# In disaggregated mode, the coverage config file is only set for the benchmark pytest command.
if [[ -z "${DISAGG_SERVING_TYPE:-}" || "${DISAGG_SERVING_TYPE}" == "BENCHMARK" ]]; then
pytestCommand=$(set_value_in_command "TRTLLM_WHL_PATH" "$trtllmWhlPath" "$pytestCommand")
fi
# Only the first process will save the coverage config file
if [ $SLURM_PROCID -eq 0 ]; then

View File

@ -0,0 +1,97 @@
metadata:
model_name: deepseek_r1_0528_fp4_v2
precision: fp4
model_dir_name: DeepSeek-R1-0528-FP4-v2
supported_gpus:
- B200
script_file: disaggr_torch.slurm
benchmark_type: 1k1k
slurm:
script_file: disaggr_torch.slurm
partition: <partition>
account: <account>
job_time: 02:00:00
job_name: unified-benchmark
extra_args: --gres=gpu:8
numa_bind: true
benchmark:
mode: e2e
use_nv_sa_benchmark: false
multi_round: 5
benchmark_ratio: 0.0
streaming: true
concurrency_list: '2048'
input_length: 1024
output_length: 1024
dataset_file: datasets/perf-ci/deepseek_r1-1k1k-20480-ratio-1_for_serve.json
hardware:
gpus_per_node: 8
num_ctx_servers: 1
num_gen_servers: 1
environment:
container_mount: <container_mount>
container_image: <container_image>
model_path: <model_path>
trtllm_repo: ''
build_wheel: false
work_dir: <full_path_to_work_dir>
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
model: local-completions
tasks: gsm8k
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
print_iter_log: true
max_batch_size: 256
max_num_tokens: 512
tensor_parallel_size: 8
moe_expert_parallel_size: 8
pipeline_parallel_size: 1
context_parallel_size: 1
enable_attention_dp: true
enable_lm_head_tp_in_adp: true
cuda_graph_config:
enable_padding: true
max_batch_size: 256
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.8
dtype: fp8
moe_config:
backend: CUTEDSL
cache_transceiver_config:
max_tokens_in_buffer: 16384
backend: UCX
disable_overlap_scheduler: true
speculative_config: &id001
decoding_type: MTP
num_nextn_predict_layers: 1
num_postprocess_workers: 4
stream_interval: 20
ctx:
print_iter_log: true
max_batch_size: 16
max_num_tokens: 16384
tensor_parallel_size: 4
moe_expert_parallel_size: 4
pipeline_parallel_size: 1
context_parallel_size: 1
enable_attention_dp: true
enable_lm_head_tp_in_adp: true
cuda_graph_config: null
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.6
dtype: fp8
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 16384
backend: UCX
disable_overlap_scheduler: true
speculative_config: *id001

View File

@ -0,0 +1,97 @@
metadata:
model_name: deepseek_r1_0528_fp4_v2
precision: fp4
model_dir_name: DeepSeek-R1-0528-FP4-v2
supported_gpus:
- B200
script_file: disaggr_torch.slurm
benchmark_type: 1k1k
slurm:
script_file: disaggr_torch.slurm
partition: <partition>
account: <account>
job_time: 02:00:00
job_name: unified-benchmark
extra_args: --gres=gpu:8
numa_bind: true
benchmark:
mode: e2e
use_nv_sa_benchmark: false
multi_round: 10
benchmark_ratio: 0.0
streaming: true
concurrency_list: '256'
input_length: 1024
output_length: 1024
dataset_file: datasets/perf-ci/deepseek_r1-1k1k-20480-ratio-1_for_serve.json
hardware:
gpus_per_node: 8
num_ctx_servers: 1
num_gen_servers: 1
environment:
container_mount: <container_mount>
container_image: <container_image>
model_path: <model_path>
trtllm_repo: ''
build_wheel: false
work_dir: <full_path_to_work_dir>
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
model: local-completions
tasks: gsm8k
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
print_iter_log: true
max_batch_size: 32
max_num_tokens: 256
tensor_parallel_size: 8
moe_expert_parallel_size: 8
pipeline_parallel_size: 1
context_parallel_size: 1
enable_attention_dp: true
enable_lm_head_tp_in_adp: true
cuda_graph_config:
enable_padding: true
max_batch_size: 32
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.8
dtype: fp8
moe_config:
backend: CUTEDSL
cache_transceiver_config:
max_tokens_in_buffer: 16384
backend: UCX
disable_overlap_scheduler: true
speculative_config: &id001
decoding_type: MTP
num_nextn_predict_layers: 3
num_postprocess_workers: 4
stream_interval: 20
ctx:
print_iter_log: true
max_batch_size: 16
max_num_tokens: 16384
tensor_parallel_size: 4
moe_expert_parallel_size: 4
pipeline_parallel_size: 1
context_parallel_size: 1
enable_attention_dp: true
enable_lm_head_tp_in_adp: true
cuda_graph_config: null
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.6
dtype: fp8
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 16384
backend: UCX
disable_overlap_scheduler: true
speculative_config: *id001

View File

@ -0,0 +1,96 @@
metadata:
model_name: deepseek_r1_0528_fp4_v2
precision: fp4
model_dir_name: DeepSeek-R1-0528-FP4-v2
supported_gpus:
- B200
script_file: disaggr_torch.slurm
benchmark_type: 1k1k
slurm:
script_file: disaggr_torch.slurm
partition: <partition>
account: <account>
job_time: 02:00:00
job_name: unified-benchmark
extra_args: --gres=gpu:8
numa_bind: true
benchmark:
mode: e2e
use_nv_sa_benchmark: false
multi_round: 10
benchmark_ratio: 0.0
streaming: true
concurrency_list: '1'
input_length: 1024
output_length: 1024
dataset_file: datasets/perf-ci/deepseek_r1-1k1k-20480-ratio-1_for_serve.json
hardware:
gpus_per_node: 8
num_ctx_servers: 1
num_gen_servers: 1
environment:
container_mount: <container_mount>
container_image: <container_image>
model_path: <model_path>
trtllm_repo: ''
build_wheel: false
work_dir: <full_path_to_work_dir>
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
model: local-completions
tasks: gsm8k
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
print_iter_log: true
max_batch_size: 1
max_num_tokens: 256
tensor_parallel_size: 8
moe_expert_parallel_size: 8
pipeline_parallel_size: 1
context_parallel_size: 1
enable_attention_dp: false
cuda_graph_config:
enable_padding: true
max_batch_size: 1
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.9
dtype: fp8
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 16384
backend: UCX
disable_overlap_scheduler: true
speculative_config: &id001
decoding_type: MTP
num_nextn_predict_layers: 3
num_postprocess_workers: 4
stream_interval: 20
ctx:
print_iter_log: true
max_batch_size: 16
max_num_tokens: 16384
tensor_parallel_size: 4
moe_expert_parallel_size: 4
pipeline_parallel_size: 1
context_parallel_size: 1
enable_attention_dp: true
enable_lm_head_tp_in_adp: true
cuda_graph_config: null
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.6
dtype: fp8
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 16384
backend: UCX
disable_overlap_scheduler: true
speculative_config: *id001

View File

@ -0,0 +1,97 @@
metadata:
model_name: deepseek_r1_0528_fp4_v2
precision: fp4
model_dir_name: DeepSeek-R1-0528-FP4-v2
supported_gpus:
- B200
script_file: disaggr_torch.slurm
benchmark_type: 8k1k
slurm:
script_file: disaggr_torch.slurm
partition: <partition>
account: <account>
job_time: 02:00:00
job_name: unified-benchmark
extra_args: --gres=gpu:8
numa_bind: true
benchmark:
mode: e2e
use_nv_sa_benchmark: false
multi_round: 10
benchmark_ratio: 0.0
streaming: true
concurrency_list: '1536'
input_length: 8192
output_length: 1024
dataset_file: datasets/perf-ci/deepseek_r1-8k1k-20480-ratio-1_for_serve.json
hardware:
gpus_per_node: 8
num_ctx_servers: 1
num_gen_servers: 1
environment:
container_mount: <container_mount>
container_image: <container_image>
model_path: <model_path>
trtllm_repo: ''
build_wheel: false
work_dir: <full_path_to_work_dir>
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
model: local-completions
tasks: gsm8k
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
print_iter_log: true
max_batch_size: 192
max_num_tokens: 384
tensor_parallel_size: 8
moe_expert_parallel_size: 8
pipeline_parallel_size: 1
context_parallel_size: 1
enable_attention_dp: true
enable_lm_head_tp_in_adp: true
cuda_graph_config:
enable_padding: true
max_batch_size: 192
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.8
dtype: fp8
moe_config:
backend: CUTEDSL
cache_transceiver_config:
max_tokens_in_buffer: 16384
backend: UCX
disable_overlap_scheduler: true
speculative_config: &id001
decoding_type: MTP
num_nextn_predict_layers: 1
num_postprocess_workers: 4
stream_interval: 20
ctx:
print_iter_log: true
max_batch_size: 2
max_num_tokens: 16384
tensor_parallel_size: 4
moe_expert_parallel_size: 4
pipeline_parallel_size: 1
context_parallel_size: 1
enable_attention_dp: true
enable_lm_head_tp_in_adp: true
cuda_graph_config: null
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.6
dtype: fp8
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 16384
backend: UCX
disable_overlap_scheduler: true
speculative_config: *id001

View File

@ -0,0 +1,97 @@
metadata:
model_name: deepseek_r1_0528_fp4_v2
precision: fp4
model_dir_name: DeepSeek-R1-0528-FP4-v2
supported_gpus:
- B200
script_file: disaggr_torch.slurm
benchmark_type: 8k1k
slurm:
script_file: disaggr_torch.slurm
partition: <partition>
account: <account>
job_time: 02:00:00
job_name: unified-benchmark
extra_args: --gres=gpu:8
numa_bind: true
benchmark:
mode: e2e
use_nv_sa_benchmark: false
multi_round: 10
benchmark_ratio: 0.0
streaming: true
concurrency_list: '256'
input_length: 8192
output_length: 1024
dataset_file: datasets/perf-ci/deepseek_r1-8k1k-20480-ratio-1_for_serve.json
hardware:
gpus_per_node: 8
num_ctx_servers: 1
num_gen_servers: 1
environment:
container_mount: <container_mount>
container_image: <container_image>
model_path: <model_path>
trtllm_repo: ''
build_wheel: false
work_dir: <full_path_to_work_dir>
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
model: local-completions
tasks: gsm8k
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
print_iter_log: true
max_batch_size: 32
max_num_tokens: 128
tensor_parallel_size: 8
moe_expert_parallel_size: 8
pipeline_parallel_size: 1
context_parallel_size: 1
enable_attention_dp: true
enable_lm_head_tp_in_adp: true
cuda_graph_config:
enable_padding: true
max_batch_size: 32
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.8
dtype: fp8
moe_config:
backend: CUTEDSL
cache_transceiver_config:
max_tokens_in_buffer: 16384
backend: UCX
disable_overlap_scheduler: true
speculative_config: &id001
decoding_type: MTP
num_nextn_predict_layers: 1
num_postprocess_workers: 4
stream_interval: 20
ctx:
print_iter_log: true
max_batch_size: 2
max_num_tokens: 16384
tensor_parallel_size: 4
moe_expert_parallel_size: 4
pipeline_parallel_size: 1
context_parallel_size: 1
enable_attention_dp: true
enable_lm_head_tp_in_adp: true
cuda_graph_config: null
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.6
dtype: fp8
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 16384
backend: UCX
disable_overlap_scheduler: true
speculative_config: *id001

View File

@ -0,0 +1,96 @@
metadata:
model_name: deepseek_r1_0528_fp4_v2
precision: fp4
model_dir_name: DeepSeek-R1-0528-FP4-v2
supported_gpus:
- B200
script_file: disaggr_torch.slurm
benchmark_type: 8k1k
slurm:
script_file: disaggr_torch.slurm
partition: <partition>
account: <account>
job_time: 02:00:00
job_name: unified-benchmark
extra_args: --gres=gpu:8
numa_bind: true
benchmark:
mode: e2e
use_nv_sa_benchmark: false
multi_round: 10
benchmark_ratio: 0.0
streaming: true
concurrency_list: '1'
input_length: 8192
output_length: 1024
dataset_file: datasets/perf-ci/deepseek_r1-8k1k-20480-ratio-1_for_serve.json
hardware:
gpus_per_node: 8
num_ctx_servers: 1
num_gen_servers: 1
environment:
container_mount: <container_mount>
container_image: <container_image>
model_path: <model_path>
trtllm_repo: ''
build_wheel: false
work_dir: <full_path_to_work_dir>
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
model: local-completions
tasks: gsm8k
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
print_iter_log: true
max_batch_size: 1
max_num_tokens: 256
tensor_parallel_size: 8
moe_expert_parallel_size: 8
pipeline_parallel_size: 1
context_parallel_size: 1
enable_attention_dp: false
cuda_graph_config:
enable_padding: true
max_batch_size: 1
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.9
dtype: fp8
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 16384
backend: UCX
disable_overlap_scheduler: true
speculative_config: &id001
decoding_type: MTP
num_nextn_predict_layers: 3
num_postprocess_workers: 4
stream_interval: 20
ctx:
print_iter_log: true
max_batch_size: 2
max_num_tokens: 16384
tensor_parallel_size: 4
moe_expert_parallel_size: 4
pipeline_parallel_size: 1
context_parallel_size: 1
enable_attention_dp: true
enable_lm_head_tp_in_adp: true
cuda_graph_config: null
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.6
dtype: fp8
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 16384
backend: UCX
disable_overlap_scheduler: true
speculative_config: *id001

View File

@ -0,0 +1,96 @@
metadata:
model_name: deepseek_r1_0528_fp4_v2
precision: fp4
model_dir_name: DeepSeek-R1-0528-FP4-v2
supported_gpus:
- GB200
script_file: disaggr_torch.slurm
benchmark_type: 128k8k
slurm:
script_file: disaggr_torch.slurm
partition: <partition>
account: <account>
job_time: 02:00:00
job_name: unified-benchmark
extra_args: --gres=gpu:4
numa_bind: true
benchmark:
mode: e2e
use_nv_sa_benchmark: false
multi_round: 10
benchmark_ratio: 0.0
streaming: true
concurrency_list: '128'
input_length: 131072
output_length: 8192
dataset_file: datasets/perf-ci/deepseek_r1-128k8k-20480-ratio-1_for_serve.json
hardware:
gpus_per_node: 4
num_ctx_servers: 1
num_gen_servers: 1
environment:
container_mount: <container_mount>
container_image: <container_image>
model_path: <model_path>
trtllm_repo: ''
build_wheel: false
work_dir: <full_path_to_work_dir>
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
model: local-completions
tasks: gsm8k
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
print_iter_log: true
max_batch_size: 8
max_num_tokens: 32
tensor_parallel_size: 16
moe_expert_parallel_size: 16
pipeline_parallel_size: 1
context_parallel_size: 1
enable_attention_dp: true
enable_lm_head_tp_in_adp: true
cuda_graph_config:
enable_padding: true
max_batch_size: 8
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.8
dtype: fp8
moe_config:
backend: CUTEDSL
cache_transceiver_config:
max_tokens_in_buffer: 131104
backend: UCX
disable_overlap_scheduler: true
speculative_config: &id001
decoding_type: MTP
num_nextn_predict_layers: 2
num_postprocess_workers: 4
stream_interval: 20
ctx:
print_iter_log: true
max_batch_size: 1
max_num_tokens: 131104
tensor_parallel_size: 1
moe_expert_parallel_size: 1
pipeline_parallel_size: 8
context_parallel_size: 1
enable_attention_dp: false
cuda_graph_config: null
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.3
dtype: fp8
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 131104
backend: UCX
disable_overlap_scheduler: true
speculative_config: *id001

View File

@ -0,0 +1,96 @@
metadata:
model_name: deepseek_r1_0528_fp4_v2
precision: fp4
model_dir_name: DeepSeek-R1-0528-FP4-v2
supported_gpus:
- GB200
script_file: disaggr_torch.slurm
benchmark_type: 128k8k
slurm:
script_file: disaggr_torch.slurm
partition: <partition>
account: <account>
job_time: 02:00:00
job_name: unified-benchmark
extra_args: --gres=gpu:4
numa_bind: true
benchmark:
mode: e2e
use_nv_sa_benchmark: false
multi_round: 10
benchmark_ratio: 0.0
streaming: true
concurrency_list: '64'
input_length: 131072
output_length: 8192
dataset_file: datasets/perf-ci/deepseek_r1-128k8k-20480-ratio-1_for_serve.json
hardware:
gpus_per_node: 4
num_ctx_servers: 1
num_gen_servers: 1
environment:
container_mount: <container_mount>
container_image: <container_image>
model_path: <model_path>
trtllm_repo: ''
build_wheel: false
work_dir: <full_path_to_work_dir>
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
model: local-completions
tasks: gsm8k
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
print_iter_log: true
max_batch_size: 2
max_num_tokens: 8
tensor_parallel_size: 32
moe_expert_parallel_size: 32
pipeline_parallel_size: 1
context_parallel_size: 1
enable_attention_dp: true
enable_lm_head_tp_in_adp: true
cuda_graph_config:
enable_padding: true
max_batch_size: 2
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.6
dtype: fp8
moe_config:
backend: CUTEDSL
cache_transceiver_config:
max_tokens_in_buffer: 131104
backend: UCX
disable_overlap_scheduler: true
speculative_config: &id001
decoding_type: MTP
num_nextn_predict_layers: 3
num_postprocess_workers: 4
stream_interval: 20
ctx:
print_iter_log: true
max_batch_size: 1
max_num_tokens: 131104
tensor_parallel_size: 1
moe_expert_parallel_size: 1
pipeline_parallel_size: 8
context_parallel_size: 1
enable_attention_dp: false
cuda_graph_config: null
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.3
dtype: fp8
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 131104
backend: UCX
disable_overlap_scheduler: true
speculative_config: *id001

View File

@ -0,0 +1,96 @@
metadata:
model_name: deepseek_r1_0528_fp4_v2
precision: fp4
model_dir_name: DeepSeek-R1-0528-FP4-v2
supported_gpus:
- GB200
script_file: disaggr_torch.slurm
benchmark_type: 128k8k
slurm:
script_file: disaggr_torch.slurm
partition: <partition>
account: <account>
job_time: 02:00:00
job_name: unified-benchmark
extra_args: --gres=gpu:4
numa_bind: true
benchmark:
mode: e2e
use_nv_sa_benchmark: false
multi_round: 10
benchmark_ratio: 0.0
streaming: true
concurrency_list: '1'
input_length: 131072
output_length: 8192
dataset_file: datasets/perf-ci/deepseek_r1-128k8k-20480-ratio-1_for_serve.json
hardware:
gpus_per_node: 4
num_ctx_servers: 1
num_gen_servers: 1
environment:
container_mount: <container_mount>
container_image: <container_image>
model_path: <model_path>
trtllm_repo: ''
build_wheel: false
work_dir: <full_path_to_work_dir>
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
model: local-completions
tasks: gsm8k
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
print_iter_log: true
max_batch_size: 1
max_num_tokens: 4
tensor_parallel_size: 8
moe_expert_parallel_size: 8
pipeline_parallel_size: 1
context_parallel_size: 1
enable_attention_dp: false
cuda_graph_config:
enable_padding: true
max_batch_size: 1
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.8
dtype: fp8
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 131104
backend: UCX
disable_overlap_scheduler: true
speculative_config: &id001
decoding_type: MTP
num_nextn_predict_layers: 3
num_postprocess_workers: 4
stream_interval: 20
allreduce_strategy: MNNVL
ctx:
print_iter_log: true
max_batch_size: 1
max_num_tokens: 131104
tensor_parallel_size: 1
moe_expert_parallel_size: 1
pipeline_parallel_size: 8
context_parallel_size: 1
enable_attention_dp: false
cuda_graph_config: null
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.3
dtype: fp8
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 131104
backend: UCX
disable_overlap_scheduler: true
speculative_config: *id001

View File

@ -0,0 +1,97 @@
metadata:
model_name: deepseek_r1_0528_fp4_v2
precision: fp4
model_dir_name: DeepSeek-R1-0528-FP4-v2
supported_gpus:
- GB200
script_file: disaggr_torch.slurm
benchmark_type: 1k1k
slurm:
script_file: disaggr_torch.slurm
partition: <partition>
account: <account>
job_time: 02:00:00
job_name: unified-benchmark
extra_args: --gres=gpu:4
numa_bind: true
benchmark:
mode: e2e
use_nv_sa_benchmark: false
multi_round: 10
benchmark_ratio: 0.0
streaming: true
concurrency_list: '1024'
input_length: 1024
output_length: 1024
dataset_file: datasets/perf-ci/deepseek_r1-1k1k-20480-ratio-1_for_serve.json
hardware:
gpus_per_node: 4
num_ctx_servers: 1
num_gen_servers: 1
environment:
container_mount: <container_mount>
container_image: <container_image>
model_path: <model_path>
trtllm_repo: ''
build_wheel: false
work_dir: <full_path_to_work_dir>
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
model: local-completions
tasks: gsm8k
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
print_iter_log: true
max_batch_size: 32
max_num_tokens: 128
tensor_parallel_size: 32
moe_expert_parallel_size: 32
pipeline_parallel_size: 1
context_parallel_size: 1
enable_attention_dp: true
enable_lm_head_tp_in_adp: true
cuda_graph_config:
enable_padding: true
max_batch_size: 32
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.6
dtype: fp8
moe_config:
backend: CUTEDSL
cache_transceiver_config:
max_tokens_in_buffer: 16384
backend: UCX
disable_overlap_scheduler: true
speculative_config: &id001
decoding_type: MTP
num_nextn_predict_layers: 3
num_postprocess_workers: 4
stream_interval: 20
ctx:
print_iter_log: true
max_batch_size: 16
max_num_tokens: 16384
tensor_parallel_size: 4
moe_expert_parallel_size: 4
pipeline_parallel_size: 1
context_parallel_size: 1
enable_attention_dp: true
enable_lm_head_tp_in_adp: true
cuda_graph_config: null
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.6
dtype: fp8
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 16384
backend: UCX
disable_overlap_scheduler: true
speculative_config: *id001

View File

@ -0,0 +1,97 @@
metadata:
model_name: deepseek_r1_0528_fp4_v2
precision: fp4
model_dir_name: DeepSeek-R1-0528-FP4-v2
supported_gpus:
- GB200
script_file: disaggr_torch.slurm
benchmark_type: 1k1k
slurm:
script_file: disaggr_torch.slurm
partition: <partition>
account: <account>
job_time: 02:00:00
job_name: unified-benchmark
extra_args: --gres=gpu:4
numa_bind: true
benchmark:
mode: e2e
use_nv_sa_benchmark: false
multi_round: 5
benchmark_ratio: 0.0
streaming: true
concurrency_list: '3072'
input_length: 1024
output_length: 1024
dataset_file: datasets/perf-ci/deepseek_r1-1k1k-20480-ratio-1_for_serve.json
hardware:
gpus_per_node: 4
num_ctx_servers: 1
num_gen_servers: 1
environment:
container_mount: <container_mount>
container_image: <container_image>
model_path: <model_path>
trtllm_repo: ''
build_wheel: false
work_dir: <full_path_to_work_dir>
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
model: local-completions
tasks: gsm8k
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
print_iter_log: true
max_batch_size: 768
max_num_tokens: 1536
tensor_parallel_size: 4
moe_expert_parallel_size: 4
pipeline_parallel_size: 1
context_parallel_size: 1
enable_attention_dp: true
enable_lm_head_tp_in_adp: true
cuda_graph_config:
enable_padding: true
max_batch_size: 768
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.8
dtype: fp8
moe_config:
backend: CUTEDSL
cache_transceiver_config:
max_tokens_in_buffer: 16384
backend: UCX
disable_overlap_scheduler: true
speculative_config: &id001
decoding_type: MTP
num_nextn_predict_layers: 1
num_postprocess_workers: 4
stream_interval: 20
ctx:
print_iter_log: true
max_batch_size: 16
max_num_tokens: 16384
tensor_parallel_size: 4
moe_expert_parallel_size: 4
pipeline_parallel_size: 1
context_parallel_size: 1
enable_attention_dp: true
enable_lm_head_tp_in_adp: true
cuda_graph_config: null
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.6
dtype: fp8
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 16384
backend: UCX
disable_overlap_scheduler: true
speculative_config: *id001

View File

@ -25,7 +25,7 @@ benchmark:
concurrency_list: '1024'
input_length: 1024
output_length: 1024
dataset_file: <dataset_file>
dataset_file: datasets/perf-ci/deepseek_r1-1k1k-20480-ratio-1_for_serve.json
hardware:
gpus_per_node: 4
num_ctx_servers: 1
@ -56,18 +56,7 @@ worker_config:
max_seq_len: 2068
cuda_graph_config:
enable_padding: true
batch_sizes:
- 1
- 2
- 4
- 8
- 16
- 32
- 64
- 128
- 256
- 512
- 768
max_batch_size: 768
print_iter_log: true
kv_cache_config:
enable_block_reuse: false

View File

@ -0,0 +1,97 @@
metadata:
model_name: deepseek_r1_0528_fp4_v2
precision: fp4
model_dir_name: DeepSeek-R1-0528-FP4-v2
supported_gpus:
- GB200
script_file: disaggr_torch.slurm
benchmark_type: 1k1k
slurm:
script_file: disaggr_torch.slurm
partition: <partition>
account: <account>
job_time: 02:00:00
job_name: unified-benchmark
extra_args: --gres=gpu:4
numa_bind: true
benchmark:
mode: e2e
use_nv_sa_benchmark: false
multi_round: 10
benchmark_ratio: 0.0
streaming: true
concurrency_list: '1'
input_length: 1024
output_length: 1024
dataset_file: datasets/perf-ci/deepseek_r1-1k1k-20480-ratio-1_for_serve.json
hardware:
gpus_per_node: 4
num_ctx_servers: 1
num_gen_servers: 1
environment:
container_mount: <container_mount>
container_image: <container_image>
model_path: <model_path>
trtllm_repo: ''
build_wheel: false
work_dir: <full_path_to_work_dir>
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
model: local-completions
tasks: gsm8k
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
print_iter_log: true
max_batch_size: 128
max_num_tokens: 512
tensor_parallel_size: 8
moe_expert_parallel_size: 8
pipeline_parallel_size: 1
context_parallel_size: 1
enable_attention_dp: false
cuda_graph_config:
enable_padding: true
max_batch_size: 128
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.9
dtype: fp8
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 16384
backend: UCX
disable_overlap_scheduler: true
speculative_config: &id001
decoding_type: MTP
num_nextn_predict_layers: 3
num_postprocess_workers: 4
stream_interval: 20
allreduce_strategy: MNNVL
ctx:
print_iter_log: true
max_batch_size: 16
max_num_tokens: 16384
tensor_parallel_size: 4
moe_expert_parallel_size: 4
pipeline_parallel_size: 1
context_parallel_size: 1
enable_attention_dp: true
enable_lm_head_tp_in_adp: true
cuda_graph_config: null
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.6
dtype: fp8
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 16384
backend: UCX
disable_overlap_scheduler: true
speculative_config: *id001

View File

@ -0,0 +1,97 @@
metadata:
model_name: deepseek_r1_0528_fp4_v2
precision: fp4
model_dir_name: DeepSeek-R1-0528-FP4-v2
supported_gpus:
- GB200
script_file: disaggr_torch.slurm
benchmark_type: 8k1k
slurm:
script_file: disaggr_torch.slurm
partition: <partition>
account: <account>
job_time: 02:00:00
job_name: unified-benchmark
extra_args: --gres=gpu:4
numa_bind: true
benchmark:
mode: e2e
use_nv_sa_benchmark: false
multi_round: 5
benchmark_ratio: 0.0
streaming: true
concurrency_list: '4096'
input_length: 8192
output_length: 1024
dataset_file: datasets/perf-ci/deepseek_r1-8k1k-20480-ratio-1_for_serve.json
hardware:
gpus_per_node: 4
num_ctx_servers: 1
num_gen_servers: 1
environment:
container_mount: <container_mount>
container_image: <container_image>
model_path: <model_path>
trtllm_repo: ''
build_wheel: false
work_dir: <full_path_to_work_dir>
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
model: local-completions
tasks: gsm8k
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
print_iter_log: true
max_batch_size: 256
max_num_tokens: 512
tensor_parallel_size: 16
moe_expert_parallel_size: 16
pipeline_parallel_size: 1
context_parallel_size: 1
enable_attention_dp: true
enable_lm_head_tp_in_adp: true
cuda_graph_config:
enable_padding: true
max_batch_size: 256
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.8
dtype: fp8
moe_config:
backend: CUTEDSL
cache_transceiver_config:
max_tokens_in_buffer: 16384
backend: UCX
disable_overlap_scheduler: true
speculative_config: &id001
decoding_type: MTP
num_nextn_predict_layers: 1
num_postprocess_workers: 4
stream_interval: 20
ctx:
print_iter_log: true
max_batch_size: 2
max_num_tokens: 16384
tensor_parallel_size: 4
moe_expert_parallel_size: 4
pipeline_parallel_size: 1
context_parallel_size: 1
enable_attention_dp: true
enable_lm_head_tp_in_adp: true
cuda_graph_config: null
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.6
dtype: fp8
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 16384
backend: UCX
disable_overlap_scheduler: true
speculative_config: *id001

View File

@ -0,0 +1,97 @@
metadata:
model_name: deepseek_r1_0528_fp4_v2
precision: fp4
model_dir_name: DeepSeek-R1-0528-FP4-v2
supported_gpus:
- GB200
script_file: disaggr_torch.slurm
benchmark_type: 8k1k
slurm:
script_file: disaggr_torch.slurm
partition: <partition>
account: <account>
job_time: 02:00:00
job_name: unified-benchmark
extra_args: --gres=gpu:4
numa_bind: true
benchmark:
mode: e2e
use_nv_sa_benchmark: false
multi_round: 10
benchmark_ratio: 0.0
streaming: true
concurrency_list: '1024'
input_length: 8192
output_length: 1024
dataset_file: datasets/perf-ci/deepseek_r1-8k1k-20480-ratio-1_for_serve.json
hardware:
gpus_per_node: 4
num_ctx_servers: 1
num_gen_servers: 1
environment:
container_mount: <container_mount>
container_image: <container_image>
model_path: <model_path>
trtllm_repo: ''
build_wheel: false
work_dir: <full_path_to_work_dir>
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
model: local-completions
tasks: gsm8k
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
print_iter_log: true
max_batch_size: 32
max_num_tokens: 128
tensor_parallel_size: 32
moe_expert_parallel_size: 32
pipeline_parallel_size: 1
context_parallel_size: 1
enable_attention_dp: true
enable_lm_head_tp_in_adp: true
cuda_graph_config:
enable_padding: true
max_batch_size: 32
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.75
dtype: fp8
moe_config:
backend: CUTEDSL
cache_transceiver_config:
max_tokens_in_buffer: 16384
backend: UCX
disable_overlap_scheduler: true
speculative_config: &id001
decoding_type: MTP
num_nextn_predict_layers: 3
num_postprocess_workers: 4
stream_interval: 20
ctx:
print_iter_log: true
max_batch_size: 2
max_num_tokens: 16384
tensor_parallel_size: 4
moe_expert_parallel_size: 4
pipeline_parallel_size: 1
context_parallel_size: 1
enable_attention_dp: true
enable_lm_head_tp_in_adp: true
cuda_graph_config: null
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.6
dtype: fp8
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 16384
backend: UCX
disable_overlap_scheduler: true
speculative_config: *id001

View File

@ -0,0 +1,97 @@
metadata:
model_name: deepseek_r1_0528_fp4_v2
precision: fp4
model_dir_name: DeepSeek-R1-0528-FP4-v2
supported_gpus:
- GB200
script_file: disaggr_torch.slurm
benchmark_type: 8k1k
slurm:
script_file: disaggr_torch.slurm
partition: <partition>
account: <account>
job_time: 02:00:00
job_name: unified-benchmark
extra_args: --gres=gpu:4
numa_bind: true
benchmark:
mode: e2e
use_nv_sa_benchmark: false
multi_round: 10
benchmark_ratio: 0.0
streaming: true
concurrency_list: '1'
input_length: 8192
output_length: 1024
dataset_file: datasets/perf-ci/deepseek_r1-8k1k-20480-ratio-1_for_serve.json
hardware:
gpus_per_node: 4
num_ctx_servers: 1
num_gen_servers: 1
environment:
container_mount: <container_mount>
container_image: <container_image>
model_path: <model_path>
trtllm_repo: ''
build_wheel: false
work_dir: <full_path_to_work_dir>
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
model: local-completions
tasks: gsm8k
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
print_iter_log: true
max_batch_size: 128
max_num_tokens: 512
tensor_parallel_size: 8
moe_expert_parallel_size: 8
pipeline_parallel_size: 1
context_parallel_size: 1
enable_attention_dp: false
cuda_graph_config:
enable_padding: true
max_batch_size: 128
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.9
dtype: fp8
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 16384
backend: UCX
disable_overlap_scheduler: true
speculative_config: &id001
decoding_type: MTP
num_nextn_predict_layers: 3
num_postprocess_workers: 4
stream_interval: 20
allreduce_strategy: MNNVL
ctx:
print_iter_log: true
max_batch_size: 2
max_num_tokens: 16384
tensor_parallel_size: 4
moe_expert_parallel_size: 4
pipeline_parallel_size: 1
context_parallel_size: 1
enable_attention_dp: true
enable_lm_head_tp_in_adp: true
cuda_graph_config: null
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.6
dtype: fp8
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 16384
backend: UCX
disable_overlap_scheduler: true
speculative_config: *id001

View File

@ -0,0 +1,108 @@
metadata:
model_name: deepseek_v32_fp4
precision: fp4
model_dir_name: DeepSeek-V3.2-FP4-v2
supported_gpus:
- GB200
script_file: disaggr_torch.slurm
benchmark_type: 1k1k
slurm:
script_file: disaggr_torch.slurm
partition: <partition>
account: <account>
job_time: 02:00:00
job_name: unified-benchmark
extra_args: --gres=gpu:4
numa_bind: true
benchmark:
mode: e2e
use_nv_sa_benchmark: false
multi_round: 10
benchmark_ratio: 0.0
streaming: true
concurrency_list: '1024'
input_length: 1024
output_length: 1024
dataset_file: datasets/perf-ci/deepseek_v32-1k1k-20480-ratio-1_for_serve.json
hardware:
gpus_per_node: 4
num_ctx_servers: 1
num_gen_servers: 1
environment:
container_mount: <container_mount>
container_image: <container_image>
model_path: <model_path>
trtllm_repo: ''
build_wheel: false
work_dir: <full_path_to_work_dir>
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
model: local-completions
tasks: gsm8k
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
print_iter_log: true
max_batch_size: 32
max_num_tokens: 128
tensor_parallel_size: 32
moe_expert_parallel_size: 32
pipeline_parallel_size: 1
context_parallel_size: 1
enable_attention_dp: true
enable_lm_head_tp_in_adp: true
cuda_graph_config:
enable_padding: true
max_batch_size: 32
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.6
dtype: fp8
tokens_per_block: 64
moe_config:
backend: CUTEDSL
load_balancer:
num_slots: 256
layer_updates_per_iter: 1
cache_transceiver_config:
max_tokens_in_buffer: 16384
backend: UCX
disable_overlap_scheduler: true
speculative_config: &id001
decoding_type: MTP
num_nextn_predict_layers: 3
num_postprocess_workers: 4
stream_interval: 20
nvfp4_gemm_config:
allowed_backends:
- cutlass
- cublaslt
- cutedsl
- cuda_core
ctx:
print_iter_log: true
max_batch_size: 16
max_num_tokens: 16384
tensor_parallel_size: 4
moe_expert_parallel_size: 4
pipeline_parallel_size: 1
context_parallel_size: 1
enable_attention_dp: true
enable_lm_head_tp_in_adp: true
cuda_graph_config: null
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.6
dtype: fp8
tokens_per_block: 64
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 16384
backend: UCX
disable_overlap_scheduler: true
speculative_config: *id001

View File

@ -0,0 +1,105 @@
metadata:
model_name: deepseek_v32_fp4
precision: fp4
model_dir_name: DeepSeek-V3.2-FP4-v2
supported_gpus:
- GB200
script_file: disaggr_torch.slurm
benchmark_type: 1k1k
slurm:
script_file: disaggr_torch.slurm
partition: <partition>
account: <account>
job_time: 02:00:00
job_name: unified-benchmark
extra_args: --gres=gpu:4
numa_bind: true
benchmark:
mode: e2e
use_nv_sa_benchmark: false
multi_round: 5
benchmark_ratio: 0.0
streaming: true
concurrency_list: '2048'
input_length: 1024
output_length: 1024
dataset_file: datasets/perf-ci/deepseek_v32-1k1k-20480-ratio-1_for_serve.json
hardware:
gpus_per_node: 4
num_ctx_servers: 1
num_gen_servers: 1
environment:
container_mount: <container_mount>
container_image: <container_image>
model_path: <model_path>
trtllm_repo: ''
build_wheel: false
work_dir: <full_path_to_work_dir>
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
model: local-completions
tasks: gsm8k
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
print_iter_log: true
max_batch_size: 512
max_num_tokens: 1024
tensor_parallel_size: 4
moe_expert_parallel_size: 4
pipeline_parallel_size: 1
context_parallel_size: 1
enable_attention_dp: true
enable_lm_head_tp_in_adp: true
cuda_graph_config:
enable_padding: true
max_batch_size: 512
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.8
dtype: fp8
tokens_per_block: 64
moe_config:
backend: CUTEDSL
cache_transceiver_config:
max_tokens_in_buffer: 16384
backend: UCX
disable_overlap_scheduler: true
speculative_config: &id001
decoding_type: MTP
num_nextn_predict_layers: 1
num_postprocess_workers: 4
stream_interval: 20
nvfp4_gemm_config:
allowed_backends:
- cutlass
- cublaslt
- cutedsl
- cuda_core
ctx:
print_iter_log: true
max_batch_size: 16
max_num_tokens: 16384
tensor_parallel_size: 4
moe_expert_parallel_size: 4
pipeline_parallel_size: 1
context_parallel_size: 1
enable_attention_dp: true
enable_lm_head_tp_in_adp: true
cuda_graph_config: null
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.6
dtype: fp8
tokens_per_block: 64
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 16384
backend: UCX
disable_overlap_scheduler: true
speculative_config: *id001

View File

@ -0,0 +1,105 @@
metadata:
model_name: deepseek_v32_fp4
precision: fp4
model_dir_name: DeepSeek-V3.2-FP4-v2
supported_gpus:
- GB200
script_file: disaggr_torch.slurm
benchmark_type: 1k1k
slurm:
script_file: disaggr_torch.slurm
partition: <partition>
account: <account>
job_time: 02:00:00
job_name: unified-benchmark
extra_args: --gres=gpu:4
numa_bind: true
benchmark:
mode: e2e
use_nv_sa_benchmark: false
multi_round: 10
benchmark_ratio: 0.0
streaming: true
concurrency_list: '1'
input_length: 1024
output_length: 1024
dataset_file: datasets/perf-ci/deepseek_v32-1k1k-20480-ratio-1_for_serve.json
hardware:
gpus_per_node: 4
num_ctx_servers: 1
num_gen_servers: 1
environment:
container_mount: <container_mount>
container_image: <container_image>
model_path: <model_path>
trtllm_repo: ''
build_wheel: false
work_dir: <full_path_to_work_dir>
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
model: local-completions
tasks: gsm8k
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
print_iter_log: true
max_batch_size: 1
max_num_tokens: 512
tensor_parallel_size: 8
moe_expert_parallel_size: 8
pipeline_parallel_size: 1
context_parallel_size: 1
enable_attention_dp: false
cuda_graph_config:
enable_padding: true
max_batch_size: 1
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.9
dtype: fp8
tokens_per_block: 64
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 16384
backend: UCX
disable_overlap_scheduler: true
speculative_config: &id001
decoding_type: MTP
num_nextn_predict_layers: 3
num_postprocess_workers: 4
stream_interval: 20
nvfp4_gemm_config:
allowed_backends:
- cutlass
- cublaslt
- cutedsl
- cuda_core
allreduce_strategy: MNNVL
ctx:
print_iter_log: true
max_batch_size: 16
max_num_tokens: 16384
tensor_parallel_size: 4
moe_expert_parallel_size: 4
pipeline_parallel_size: 1
context_parallel_size: 1
enable_attention_dp: true
enable_lm_head_tp_in_adp: true
cuda_graph_config: null
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.6
dtype: fp8
tokens_per_block: 64
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 16384
backend: UCX
disable_overlap_scheduler: true
speculative_config: *id001

View File

@ -0,0 +1,105 @@
metadata:
model_name: deepseek_v32_fp4
precision: fp4
model_dir_name: DeepSeek-V3.2-FP4-v2
supported_gpus:
- GB200
script_file: disaggr_torch.slurm
benchmark_type: 32k4k
slurm:
script_file: disaggr_torch.slurm
partition: <partition>
account: <account>
job_time: 02:00:00
job_name: unified-benchmark
extra_args: --gres=gpu:4
numa_bind: true
benchmark:
mode: e2e
use_nv_sa_benchmark: false
multi_round: 10
benchmark_ratio: 0.0
streaming: true
concurrency_list: '256'
input_length: 32768
output_length: 4096
dataset_file: datasets/perf-ci/deepseek_v32-32k4k-20480-ratio-1_for_serve.json
hardware:
gpus_per_node: 4
num_ctx_servers: 1
num_gen_servers: 1
environment:
container_mount: <container_mount>
container_image: <container_image>
model_path: <model_path>
trtllm_repo: ''
build_wheel: false
work_dir: <full_path_to_work_dir>
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
model: local-completions
tasks: gsm8k
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
print_iter_log: true
max_batch_size: 8
max_num_tokens: 256
tensor_parallel_size: 32
moe_expert_parallel_size: 32
pipeline_parallel_size: 1
context_parallel_size: 1
enable_attention_dp: true
enable_lm_head_tp_in_adp: true
cuda_graph_config:
enable_padding: true
max_batch_size: 8
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.85
dtype: fp8
tokens_per_block: 64
moe_config:
backend: CUTEDSL
cache_transceiver_config:
max_tokens_in_buffer: 32768
backend: UCX
disable_overlap_scheduler: true
speculative_config: &id001
decoding_type: MTP
num_nextn_predict_layers: 3
num_postprocess_workers: 4
stream_interval: 20
nvfp4_gemm_config:
allowed_backends:
- cutlass
- cublaslt
- cutedsl
- cuda_core
ctx:
print_iter_log: true
max_batch_size: 1
max_num_tokens: 32784
tensor_parallel_size: 4
moe_expert_parallel_size: 4
pipeline_parallel_size: 1
context_parallel_size: 1
enable_attention_dp: true
enable_lm_head_tp_in_adp: true
cuda_graph_config: null
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.6
dtype: fp8
tokens_per_block: 64
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 32768
backend: UCX
disable_overlap_scheduler: true
speculative_config: *id001

View File

@ -0,0 +1,108 @@
metadata:
model_name: deepseek_v32_fp4
precision: fp4
model_dir_name: DeepSeek-V3.2-FP4-v2
supported_gpus:
- GB200
script_file: disaggr_torch.slurm
benchmark_type: 32k4k
slurm:
script_file: disaggr_torch.slurm
partition: <partition>
account: <account>
job_time: 02:00:00
job_name: unified-benchmark
extra_args: --gres=gpu:4
numa_bind: true
benchmark:
mode: e2e
use_nv_sa_benchmark: false
multi_round: 5
benchmark_ratio: 0.0
streaming: true
concurrency_list: '2048'
input_length: 32768
output_length: 4096
dataset_file: datasets/perf-ci/deepseek_v32-32k4k-20480-ratio-1_for_serve.json
hardware:
gpus_per_node: 4
num_ctx_servers: 1
num_gen_servers: 1
environment:
container_mount: <container_mount>
container_image: <container_image>
model_path: <model_path>
trtllm_repo: ''
build_wheel: false
work_dir: <full_path_to_work_dir>
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
model: local-completions
tasks: gsm8k
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
print_iter_log: true
max_batch_size: 64
max_num_tokens: 128
tensor_parallel_size: 32
moe_expert_parallel_size: 32
pipeline_parallel_size: 1
context_parallel_size: 1
enable_attention_dp: true
enable_lm_head_tp_in_adp: true
cuda_graph_config:
enable_padding: true
max_batch_size: 64
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.85
dtype: fp8
tokens_per_block: 64
moe_config:
backend: CUTEDSL
load_balancer:
num_slots: 288
layer_updates_per_iter: 1
cache_transceiver_config:
max_tokens_in_buffer: 32768
backend: UCX
disable_overlap_scheduler: true
speculative_config: &id001
decoding_type: MTP
num_nextn_predict_layers: 1
num_postprocess_workers: 4
stream_interval: 20
nvfp4_gemm_config:
allowed_backends:
- cutlass
- cublaslt
- cutedsl
- cuda_core
ctx:
print_iter_log: true
max_batch_size: 1
max_num_tokens: 32784
tensor_parallel_size: 4
moe_expert_parallel_size: 4
pipeline_parallel_size: 1
context_parallel_size: 1
enable_attention_dp: true
enable_lm_head_tp_in_adp: true
cuda_graph_config: null
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.6
dtype: fp8
tokens_per_block: 64
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 32768
backend: UCX
disable_overlap_scheduler: true
speculative_config: *id001

View File

@ -0,0 +1,105 @@
metadata:
model_name: deepseek_v32_fp4
precision: fp4
model_dir_name: DeepSeek-V3.2-FP4-v2
supported_gpus:
- GB200
script_file: disaggr_torch.slurm
benchmark_type: 32k4k
slurm:
script_file: disaggr_torch.slurm
partition: <partition>
account: <account>
job_time: 02:00:00
job_name: unified-benchmark
extra_args: --gres=gpu:4
numa_bind: true
benchmark:
mode: e2e
use_nv_sa_benchmark: false
multi_round: 10
benchmark_ratio: 0.0
streaming: true
concurrency_list: '1'
input_length: 32768
output_length: 4096
dataset_file: datasets/perf-ci/deepseek_v32-32k4k-20480-ratio-1_for_serve.json
hardware:
gpus_per_node: 4
num_ctx_servers: 1
num_gen_servers: 1
environment:
container_mount: <container_mount>
container_image: <container_image>
model_path: <model_path>
trtllm_repo: ''
build_wheel: false
work_dir: <full_path_to_work_dir>
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
model: local-completions
tasks: gsm8k
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
print_iter_log: true
max_batch_size: 1
max_num_tokens: 256
tensor_parallel_size: 8
moe_expert_parallel_size: 8
pipeline_parallel_size: 1
context_parallel_size: 1
enable_attention_dp: false
cuda_graph_config:
enable_padding: true
max_batch_size: 1
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.9
dtype: fp8
tokens_per_block: 64
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 32768
backend: UCX
disable_overlap_scheduler: true
speculative_config: &id001
decoding_type: MTP
num_nextn_predict_layers: 3
num_postprocess_workers: 4
stream_interval: 20
nvfp4_gemm_config:
allowed_backends:
- cutlass
- cublaslt
- cutedsl
- cuda_core
allreduce_strategy: MNNVL
ctx:
print_iter_log: true
max_batch_size: 1
max_num_tokens: 32784
tensor_parallel_size: 4
moe_expert_parallel_size: 4
pipeline_parallel_size: 1
context_parallel_size: 1
enable_attention_dp: true
enable_lm_head_tp_in_adp: true
cuda_graph_config: null
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.6
dtype: fp8
tokens_per_block: 64
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 32768
backend: UCX
disable_overlap_scheduler: true
speculative_config: *id001

View File

@ -0,0 +1,104 @@
metadata:
model_name: deepseek_v32_fp4
precision: fp4
model_dir_name: DeepSeek-V3.2-FP4-v2
supported_gpus:
- GB200
script_file: disaggr_torch.slurm
benchmark_type: 8k1k
slurm:
script_file: disaggr_torch.slurm
partition: <partition>
account: <account>
job_time: 02:00:00
job_name: unified-benchmark
extra_args: --gres=gpu:4
numa_bind: true
benchmark:
mode: e2e
use_nv_sa_benchmark: false
multi_round: 5
benchmark_ratio: 0.0
streaming: true
concurrency_list: '4096'
input_length: 8192
output_length: 1024
dataset_file: datasets/perf-ci/deepseek_v32-8k1k-20480-ratio-1_for_serve.json
hardware:
gpus_per_node: 4
num_ctx_servers: 1
num_gen_servers: 1
environment:
container_mount: <container_mount>
container_image: <container_image>
model_path: <model_path>
trtllm_repo: ''
build_wheel: false
work_dir: <full_path_to_work_dir>
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
model: local-completions
tasks: gsm8k
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
print_iter_log: true
max_batch_size: 128
max_num_tokens: 128
tensor_parallel_size: 32
moe_expert_parallel_size: 32
pipeline_parallel_size: 1
context_parallel_size: 1
enable_attention_dp: true
enable_lm_head_tp_in_adp: true
cuda_graph_config:
enable_padding: true
max_batch_size: 128
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.8
dtype: fp8
tokens_per_block: 64
moe_config:
backend: CUTEDSL
load_balancer:
num_slots: 256
layer_updates_per_iter: 1
cache_transceiver_config:
max_tokens_in_buffer: 16384
backend: UCX
disable_overlap_scheduler: true
num_postprocess_workers: 4
stream_interval: 20
nvfp4_gemm_config:
allowed_backends:
- cutlass
- cublaslt
- cutedsl
- cuda_core
ctx:
print_iter_log: true
max_batch_size: 2
max_num_tokens: 16384
tensor_parallel_size: 4
moe_expert_parallel_size: 4
pipeline_parallel_size: 1
context_parallel_size: 1
enable_attention_dp: true
enable_lm_head_tp_in_adp: true
cuda_graph_config: null
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.6
dtype: fp8
tokens_per_block: 64
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 16384
backend: UCX
disable_overlap_scheduler: true

View File

@ -0,0 +1,108 @@
metadata:
model_name: deepseek_v32_fp4
precision: fp4
model_dir_name: DeepSeek-V3.2-FP4-v2
supported_gpus:
- GB200
script_file: disaggr_torch.slurm
benchmark_type: 8k1k
slurm:
script_file: disaggr_torch.slurm
partition: <partition>
account: <account>
job_time: 02:00:00
job_name: unified-benchmark
extra_args: --gres=gpu:4
numa_bind: true
benchmark:
mode: e2e
use_nv_sa_benchmark: false
multi_round: 10
benchmark_ratio: 0.0
streaming: true
concurrency_list: '1024'
input_length: 8192
output_length: 1024
dataset_file: datasets/perf-ci/deepseek_v32-8k1k-20480-ratio-1_for_serve.json
hardware:
gpus_per_node: 4
num_ctx_servers: 1
num_gen_servers: 1
environment:
container_mount: <container_mount>
container_image: <container_image>
model_path: <model_path>
trtllm_repo: ''
build_wheel: false
work_dir: <full_path_to_work_dir>
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
model: local-completions
tasks: gsm8k
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
print_iter_log: true
max_batch_size: 32
max_num_tokens: 128
tensor_parallel_size: 32
moe_expert_parallel_size: 32
pipeline_parallel_size: 1
context_parallel_size: 1
enable_attention_dp: true
enable_lm_head_tp_in_adp: true
cuda_graph_config:
enable_padding: true
max_batch_size: 32
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.75
dtype: fp8
tokens_per_block: 64
moe_config:
backend: CUTEDSL
load_balancer:
num_slots: 256
layer_updates_per_iter: 1
cache_transceiver_config:
max_tokens_in_buffer: 16384
backend: UCX
disable_overlap_scheduler: true
speculative_config: &id001
decoding_type: MTP
num_nextn_predict_layers: 3
num_postprocess_workers: 4
stream_interval: 20
nvfp4_gemm_config:
allowed_backends:
- cutlass
- cublaslt
- cutedsl
- cuda_core
ctx:
print_iter_log: true
max_batch_size: 2
max_num_tokens: 16384
tensor_parallel_size: 4
moe_expert_parallel_size: 4
pipeline_parallel_size: 1
context_parallel_size: 1
enable_attention_dp: true
enable_lm_head_tp_in_adp: true
cuda_graph_config: null
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.6
dtype: fp8
tokens_per_block: 64
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 16384
backend: UCX
disable_overlap_scheduler: true
speculative_config: *id001

View File

@ -0,0 +1,105 @@
metadata:
model_name: deepseek_v32_fp4
precision: fp4
model_dir_name: DeepSeek-V3.2-FP4-v2
supported_gpus:
- GB200
script_file: disaggr_torch.slurm
benchmark_type: 8k1k
slurm:
script_file: disaggr_torch.slurm
partition: <partition>
account: <account>
job_time: 02:00:00
job_name: unified-benchmark
extra_args: --gres=gpu:4
numa_bind: true
benchmark:
mode: e2e
use_nv_sa_benchmark: false
multi_round: 10
benchmark_ratio: 0.0
streaming: true
concurrency_list: '1'
input_length: 8192
output_length: 1024
dataset_file: datasets/perf-ci/deepseek_v32-8k1k-20480-ratio-1_for_serve.json
hardware:
gpus_per_node: 4
num_ctx_servers: 1
num_gen_servers: 1
environment:
container_mount: <container_mount>
container_image: <container_image>
model_path: <model_path>
trtllm_repo: ''
build_wheel: false
work_dir: <full_path_to_work_dir>
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
model: local-completions
tasks: gsm8k
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
print_iter_log: true
max_batch_size: 1
max_num_tokens: 512
tensor_parallel_size: 8
moe_expert_parallel_size: 8
pipeline_parallel_size: 1
context_parallel_size: 1
enable_attention_dp: false
cuda_graph_config:
enable_padding: true
max_batch_size: 1
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.9
dtype: fp8
tokens_per_block: 64
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 16384
backend: UCX
disable_overlap_scheduler: true
speculative_config: &id001
decoding_type: MTP
num_nextn_predict_layers: 3
num_postprocess_workers: 4
stream_interval: 20
nvfp4_gemm_config:
allowed_backends:
- cutlass
- cublaslt
- cutedsl
- cuda_core
allreduce_strategy: MNNVL
ctx:
print_iter_log: true
max_batch_size: 2
max_num_tokens: 16384
tensor_parallel_size: 4
moe_expert_parallel_size: 4
pipeline_parallel_size: 1
context_parallel_size: 1
enable_attention_dp: true
enable_lm_head_tp_in_adp: true
cuda_graph_config: null
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.6
dtype: fp8
tokens_per_block: 64
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 16384
backend: UCX
disable_overlap_scheduler: true
speculative_config: *id001

View File

@ -0,0 +1,98 @@
metadata:
model_name: k2_thinking_fp4
precision: fp4
model_dir_name: Kimi-K2-Thinking-NVFP4
supported_gpus:
- GB200
script_file: disaggr_torch.slurm
benchmark_type: 1k1k
slurm:
script_file: disaggr_torch.slurm
partition: <partition>
account: <account>
job_time: 02:00:00
job_name: unified-benchmark
extra_args: --gres=gpu:4
numa_bind: true
benchmark:
mode: e2e
use_nv_sa_benchmark: false
multi_round: 5
benchmark_ratio: 0.0
streaming: true
concurrency_list: '2048'
input_length: 1024
output_length: 1024
dataset_file: datasets/perf-ci/k2_thinking-1k1k-20480-ratio-1_for_serve.json
hardware:
gpus_per_node: 4
num_ctx_servers: 1
num_gen_servers: 1
environment:
container_mount: <container_mount>
container_image: <container_image>
model_path: <model_path>
trtllm_repo: ''
build_wheel: false
work_dir: <full_path_to_work_dir>
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
model: local-completions
tasks: gsm8k
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
print_iter_log: true
max_batch_size: 64
max_num_tokens: 128
tensor_parallel_size: 32
moe_expert_parallel_size: 32
pipeline_parallel_size: 1
context_parallel_size: 1
enable_attention_dp: true
enable_lm_head_tp_in_adp: true
cuda_graph_config:
enable_padding: true
max_batch_size: 64
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.8
dtype: fp8
moe_config:
backend: WIDEEP
load_balancer:
num_slots: 384
layer_updates_per_iter: 1
cache_transceiver_config:
max_tokens_in_buffer: 16384
backend: UCX
disable_overlap_scheduler: true
trust_remote_code: true
num_postprocess_workers: 4
stream_interval: 20
ctx:
print_iter_log: true
max_batch_size: 16
max_num_tokens: 8192
tensor_parallel_size: 4
moe_expert_parallel_size: 4
pipeline_parallel_size: 1
context_parallel_size: 1
enable_attention_dp: true
enable_lm_head_tp_in_adp: true
cuda_graph_config: null
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.6
dtype: fp8
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 16384
backend: UCX
disable_overlap_scheduler: true
trust_remote_code: true

View File

@ -0,0 +1,95 @@
metadata:
model_name: k2_thinking_fp4
precision: fp4
model_dir_name: Kimi-K2-Thinking-NVFP4
supported_gpus:
- GB200
script_file: disaggr_torch.slurm
benchmark_type: 1k1k
slurm:
script_file: disaggr_torch.slurm
partition: <partition>
account: <account>
job_time: 02:00:00
job_name: unified-benchmark
extra_args: --gres=gpu:4
numa_bind: true
benchmark:
mode: e2e
use_nv_sa_benchmark: false
multi_round: 5
benchmark_ratio: 0.0
streaming: true
concurrency_list: '4096'
input_length: 1024
output_length: 1024
dataset_file: datasets/perf-ci/k2_thinking-1k1k-20480-ratio-1_for_serve.json
hardware:
gpus_per_node: 4
num_ctx_servers: 1
num_gen_servers: 1
environment:
container_mount: <container_mount>
container_image: <container_image>
model_path: <model_path>
trtllm_repo: ''
build_wheel: false
work_dir: <full_path_to_work_dir>
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
model: local-completions
tasks: gsm8k
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
print_iter_log: true
max_batch_size: 512
max_num_tokens: 512
tensor_parallel_size: 8
moe_expert_parallel_size: 8
pipeline_parallel_size: 1
context_parallel_size: 1
enable_attention_dp: true
enable_lm_head_tp_in_adp: true
cuda_graph_config:
enable_padding: true
max_batch_size: 512
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.8
dtype: fp8
moe_config:
backend: CUTLASS
cache_transceiver_config:
max_tokens_in_buffer: 16384
backend: UCX
disable_overlap_scheduler: true
trust_remote_code: true
num_postprocess_workers: 4
stream_interval: 20
ctx:
print_iter_log: true
max_batch_size: 16
max_num_tokens: 8192
tensor_parallel_size: 4
moe_expert_parallel_size: 4
pipeline_parallel_size: 1
context_parallel_size: 1
enable_attention_dp: true
enable_lm_head_tp_in_adp: true
cuda_graph_config: null
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.6
dtype: fp8
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 16384
backend: UCX
disable_overlap_scheduler: true
trust_remote_code: true

View File

@ -0,0 +1,94 @@
metadata:
model_name: k2_thinking_fp4
precision: fp4
model_dir_name: Kimi-K2-Thinking-NVFP4
supported_gpus:
- GB200
script_file: disaggr_torch.slurm
benchmark_type: 1k1k
slurm:
script_file: disaggr_torch.slurm
partition: <partition>
account: <account>
job_time: 02:00:00
job_name: unified-benchmark
extra_args: --gres=gpu:4
numa_bind: true
benchmark:
mode: e2e
use_nv_sa_benchmark: false
multi_round: 10
benchmark_ratio: 0.0
streaming: true
concurrency_list: '4'
input_length: 1024
output_length: 1024
dataset_file: datasets/perf-ci/k2_thinking-1k1k-20480-ratio-1_for_serve.json
hardware:
gpus_per_node: 4
num_ctx_servers: 1
num_gen_servers: 1
environment:
container_mount: <container_mount>
container_image: <container_image>
model_path: <model_path>
trtllm_repo: ''
build_wheel: false
work_dir: <full_path_to_work_dir>
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
model: local-completions
tasks: gsm8k
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
print_iter_log: true
max_batch_size: 4
max_num_tokens: 128
tensor_parallel_size: 4
moe_expert_parallel_size: 4
pipeline_parallel_size: 1
context_parallel_size: 1
enable_attention_dp: false
cuda_graph_config:
enable_padding: true
max_batch_size: 4
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.8
dtype: fp8
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 16384
backend: UCX
disable_overlap_scheduler: true
trust_remote_code: true
num_postprocess_workers: 4
stream_interval: 20
ctx:
print_iter_log: true
max_batch_size: 16
max_num_tokens: 8192
tensor_parallel_size: 4
moe_expert_parallel_size: 4
pipeline_parallel_size: 1
context_parallel_size: 1
enable_attention_dp: true
enable_lm_head_tp_in_adp: true
cuda_graph_config: null
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.6
dtype: fp8
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 16384
backend: UCX
disable_overlap_scheduler: true
trust_remote_code: true

View File

@ -0,0 +1,98 @@
metadata:
model_name: k2_thinking_fp4
precision: fp4
model_dir_name: Kimi-K2-Thinking-NVFP4
supported_gpus:
- GB200
script_file: disaggr_torch.slurm
benchmark_type: 8k1k
slurm:
script_file: disaggr_torch.slurm
partition: <partition>
account: <account>
job_time: 02:00:00
job_name: unified-benchmark
extra_args: --gres=gpu:4
numa_bind: true
benchmark:
mode: e2e
use_nv_sa_benchmark: false
multi_round: 5
benchmark_ratio: 0.0
streaming: true
concurrency_list: '4096'
input_length: 8192
output_length: 1024
dataset_file: datasets/perf-ci/k2_thinking-8k1k-20480-ratio-1_for_serve.json
hardware:
gpus_per_node: 4
num_ctx_servers: 1
num_gen_servers: 1
environment:
container_mount: <container_mount>
container_image: <container_image>
model_path: <model_path>
trtllm_repo: ''
build_wheel: false
work_dir: <full_path_to_work_dir>
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
model: local-completions
tasks: gsm8k
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
print_iter_log: true
max_batch_size: 256
max_num_tokens: 256
tensor_parallel_size: 16
moe_expert_parallel_size: 16
pipeline_parallel_size: 1
context_parallel_size: 1
enable_attention_dp: true
enable_lm_head_tp_in_adp: true
cuda_graph_config:
enable_padding: true
max_batch_size: 256
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.8
dtype: fp8
moe_config:
backend: WIDEEP
load_balancer:
num_slots: 384
layer_updates_per_iter: 1
cache_transceiver_config:
max_tokens_in_buffer: 16384
backend: UCX
disable_overlap_scheduler: true
trust_remote_code: true
num_postprocess_workers: 4
stream_interval: 20
ctx:
print_iter_log: true
max_batch_size: 2
max_num_tokens: 8192
tensor_parallel_size: 4
moe_expert_parallel_size: 4
pipeline_parallel_size: 1
context_parallel_size: 1
enable_attention_dp: true
enable_lm_head_tp_in_adp: true
cuda_graph_config: null
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.6
dtype: fp8
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 16384
backend: UCX
disable_overlap_scheduler: true
trust_remote_code: true

View File

@ -0,0 +1,104 @@
metadata:
model_name: k2_thinking_fp4
precision: fp4
model_dir_name: Kimi-K2-Thinking-NVFP4
supported_gpus:
- GB200
script_file: disaggr_torch.slurm
benchmark_type: 8k1k
slurm:
script_file: disaggr_torch.slurm
partition: <partition>
account: <account>
job_time: 02:00:00
job_name: unified-benchmark
extra_args: --gres=gpu:4
numa_bind: true
benchmark:
mode: e2e
use_nv_sa_benchmark: false
multi_round: 10
benchmark_ratio: 0.0
streaming: true
concurrency_list: '1024'
input_length: 8192
output_length: 1024
dataset_file: datasets/perf-ci/k2_thinking-8k1k-20480-ratio-1_for_serve.json
hardware:
gpus_per_node: 4
num_ctx_servers: 1
num_gen_servers: 1
environment:
container_mount: <container_mount>
container_image: <container_image>
model_path: <model_path>
trtllm_repo: ''
build_wheel: false
work_dir: <full_path_to_work_dir>
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
model: local-completions
tasks: gsm8k
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
print_iter_log: true
max_batch_size: 32
max_num_tokens: 128
tensor_parallel_size: 32
moe_expert_parallel_size: 32
pipeline_parallel_size: 1
context_parallel_size: 1
enable_attention_dp: true
enable_lm_head_tp_in_adp: true
cuda_graph_config:
enable_padding: true
max_batch_size: 32
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.8
dtype: fp8
moe_config:
backend: WIDEEP
load_balancer:
num_slots: 416
layer_updates_per_iter: 1
cache_transceiver_config:
max_tokens_in_buffer: 16384
backend: UCX
disable_overlap_scheduler: true
speculative_config: &id001
decoding_type: Eagle
max_draft_len: 3
eagle3_one_model: true
speculative_model: Kimi-K2-Thinking-NVFP4-Eagle3
trust_remote_code: true
num_postprocess_workers: 4
stream_interval: 20
ctx:
print_iter_log: true
max_batch_size: 2
max_num_tokens: 8192
tensor_parallel_size: 4
moe_expert_parallel_size: 4
pipeline_parallel_size: 1
context_parallel_size: 1
enable_attention_dp: true
enable_lm_head_tp_in_adp: true
cuda_graph_config: null
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.6
dtype: fp8
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 16384
backend: UCX
disable_overlap_scheduler: true
speculative_config: *id001
trust_remote_code: true

View File

@ -0,0 +1,101 @@
metadata:
model_name: k2_thinking_fp4
precision: fp4
model_dir_name: Kimi-K2-Thinking-NVFP4
supported_gpus:
- GB200
script_file: disaggr_torch.slurm
benchmark_type: 8k1k
slurm:
script_file: disaggr_torch.slurm
partition: <partition>
account: <account>
job_time: 02:00:00
job_name: unified-benchmark
extra_args: --gres=gpu:4
numa_bind: true
benchmark:
mode: e2e
use_nv_sa_benchmark: false
multi_round: 10
benchmark_ratio: 0.0
streaming: true
concurrency_list: '4'
input_length: 8192
output_length: 1024
dataset_file: datasets/perf-ci/k2_thinking-8k1k-20480-ratio-1_for_serve.json
hardware:
gpus_per_node: 4
num_ctx_servers: 1
num_gen_servers: 1
environment:
container_mount: <container_mount>
container_image: <container_image>
model_path: <model_path>
trtllm_repo: ''
build_wheel: false
work_dir: <full_path_to_work_dir>
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
model: local-completions
tasks: gsm8k
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
print_iter_log: true
max_batch_size: 4
max_num_tokens: 128
tensor_parallel_size: 8
moe_expert_parallel_size: 8
pipeline_parallel_size: 1
context_parallel_size: 1
enable_attention_dp: false
cuda_graph_config:
enable_padding: true
max_batch_size: 4
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.8
dtype: fp8
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 16384
backend: UCX
disable_overlap_scheduler: true
speculative_config: &id001
decoding_type: Eagle
max_draft_len: 3
eagle3_one_model: true
speculative_model: Kimi-K2-Thinking-NVFP4-Eagle3
trust_remote_code: true
num_postprocess_workers: 4
stream_interval: 20
allreduce_strategy: MNNVL
ctx:
print_iter_log: true
max_batch_size: 2
max_num_tokens: 8192
tensor_parallel_size: 4
moe_expert_parallel_size: 4
pipeline_parallel_size: 1
context_parallel_size: 1
enable_attention_dp: true
enable_lm_head_tp_in_adp: true
cuda_graph_config: null
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.6
dtype: fp8
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 16384
backend: UCX
disable_overlap_scheduler: true
speculative_config: *id001
trust_remote_code: true

View File

@ -0,0 +1,92 @@
metadata:
model_name: qwen3_235b_a22b_fp4
precision: fp4
model_dir_name: Qwen3-235B-A22B-FP4
supported_gpus:
- GB200
script_file: disaggr_torch.slurm
benchmark_type: 8k1k
slurm:
script_file: disaggr_torch.slurm
partition: <partition>
account: <account>
job_time: 02:00:00
job_name: unified-benchmark
extra_args: --gres=gpu:4
numa_bind: true
benchmark:
mode: e2e
use_nv_sa_benchmark: false
multi_round: 10
benchmark_ratio: 0.0
streaming: true
concurrency_list: '1024'
input_length: 8192
output_length: 1024
dataset_file: datasets/perf-ci/qwen3_235b-8k1k-20480-ratio-1_for_serve.json
hardware:
gpus_per_node: 4
num_ctx_servers: 1
num_gen_servers: 1
environment:
container_mount: <container_mount>
container_image: <container_image>
model_path: <model_path>
trtllm_repo: ''
build_wheel: false
work_dir: <full_path_to_work_dir>
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
model: local-completions
tasks: gsm8k
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
print_iter_log: true
max_batch_size: 128
max_num_tokens: 128
tensor_parallel_size: 8
moe_expert_parallel_size: 8
pipeline_parallel_size: 1
context_parallel_size: 1
enable_attention_dp: true
enable_lm_head_tp_in_adp: true
cuda_graph_config:
enable_padding: true
max_batch_size: 128
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.9
dtype: fp8
moe_config:
backend: CUTEDSL
cache_transceiver_config:
max_tokens_in_buffer: 32768
backend: UCX
disable_overlap_scheduler: true
num_postprocess_workers: 4
stream_interval: 20
ctx:
print_iter_log: true
max_batch_size: 4
max_num_tokens: 32768
tensor_parallel_size: 1
moe_expert_parallel_size: 1
pipeline_parallel_size: 1
context_parallel_size: 1
enable_attention_dp: false
cuda_graph_config: null
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.6
dtype: fp8
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 32768
backend: UCX
disable_overlap_scheduler: true

View File

@ -0,0 +1,91 @@
metadata:
model_name: qwen3_235b_a22b_fp4
precision: fp4
model_dir_name: Qwen3-235B-A22B-FP4
supported_gpus:
- GB200
script_file: disaggr_torch.slurm
benchmark_type: 8k1k
slurm:
script_file: disaggr_torch.slurm
partition: <partition>
account: <account>
job_time: 02:00:00
job_name: unified-benchmark
extra_args: --gres=gpu:4
numa_bind: true
benchmark:
mode: e2e
use_nv_sa_benchmark: false
multi_round: 10
benchmark_ratio: 0.0
streaming: true
concurrency_list: '64'
input_length: 8192
output_length: 1024
dataset_file: datasets/perf-ci/qwen3_235b-8k1k-20480-ratio-1_for_serve.json
hardware:
gpus_per_node: 4
num_ctx_servers: 1
num_gen_servers: 1
environment:
container_mount: <container_mount>
container_image: <container_image>
model_path: <model_path>
trtllm_repo: ''
build_wheel: false
work_dir: <full_path_to_work_dir>
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
model: local-completions
tasks: gsm8k
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
print_iter_log: true
max_batch_size: 64
max_num_tokens: 64
tensor_parallel_size: 4
moe_expert_parallel_size: 4
pipeline_parallel_size: 1
context_parallel_size: 1
enable_attention_dp: false
cuda_graph_config:
enable_padding: true
max_batch_size: 64
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.9
dtype: fp8
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 32768
backend: UCX
disable_overlap_scheduler: true
num_postprocess_workers: 4
stream_interval: 20
ctx:
print_iter_log: true
max_batch_size: 4
max_num_tokens: 32768
tensor_parallel_size: 1
moe_expert_parallel_size: 1
pipeline_parallel_size: 1
context_parallel_size: 1
enable_attention_dp: false
cuda_graph_config: null
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.6
dtype: fp8
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 32768
backend: UCX
disable_overlap_scheduler: true

View File

@ -448,9 +448,12 @@ def prepare_regressive_test_cases(history_baseline_dict, new_data_dict):
continue
is_post_merge = new_data.get("b_is_post_merge", False)
baseline_id = history_baseline.get("_id", "")
info_parts = [f"baseline_id: {baseline_id}"]
info_parts = [
f"baseline_id: {history_baseline.get('_id', '')}",
f"baseline_branch: {history_baseline.get('s_branch', '')}",
f"baseline_commit: {history_baseline.get('s_commit', '')}",
f"baseline_date: {history_baseline.get('ts_created', '')}",
]
regressive_metrics = []
# Check all metrics and build info string
for metric in MAXIMIZE_METRICS + MINIMIZE_METRICS:

View File

@ -56,6 +56,7 @@ MODEL_PATH_DICT = {
"deepseek_v32_fp4": "DeepSeek-V3.2-Exp-FP4-v2",
"gpt_oss_120b_fp4": "gpt_oss/gpt-oss-120b",
"k2_thinking_fp4": "Kimi-K2-Thinking-NVFP4",
"qwen3_235b_a22b_fp4": "Qwen3/saved_models_Qwen3-235B-A22B_nvfp4_hf", # Qwen3-235B-A22B-FP4
}
SUPPORTED_GPU_MAPPING = {
@ -68,6 +69,9 @@ SUPPORTED_GPU_MAPPING = {
DEFAULT_TIMEOUT = 7200
AGGR_CONFIG_FOLDER = "tests/scripts/perf-sanity"
DISAGG_CONFIG_FOLDER = "tests/integration/defs/perf/disagg/test_configs/disagg/perf-sanity"
# Regex patterns for parsing benchmark output metrics
# Key is the metric name used in database (e.g., "mean_e2el", "seq_throughput")
PERF_METRIC_LOG_QUERIES = {
@ -97,9 +101,20 @@ def get_model_dir(model_name: str) -> str:
return ""
def get_dataset_path() -> str:
"""Get dataset path for benchmark."""
return os.path.join(llm_models_root(), "datasets", "ShareGPT_V3_unfiltered_cleaned_split.json")
def get_dataset_dir(dataset_file: Optional[str]) -> str:
"""Get dataset directory path from dataset file."""
if not dataset_file or dataset_file == "<dataset_file>":
return ""
# return os.path.join(llm_models_root(), "datasets", "ShareGPT_V3_unfiltered_cleaned_split.json")
llm_models_path = os.path.join(llm_models_root(), dataset_file)
if os.path.exists(llm_models_path):
return llm_models_path
elif os.path.exists(dataset_file):
return dataset_file
else:
print_info(f"Dataset file not found in {llm_models_path} and {dataset_file}")
return ""
def to_env_dict(env_vars: str) -> Dict[str, str]:
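For context, the dataset_file values used throughout the configs above (datasets/perf-ci/*.json) are resolved by get_dataset_dir() in this order: under llm_models_root(), then as a literal path, otherwise empty so the client falls back to the random dataset. A standalone sketch of that order (hypothetical helper; llm_models_root() belongs to the test utilities):

import os

def resolve_dataset_path(models_root: str, dataset_file: str) -> str:
    # Sketch of get_dataset_dir(): prefer the copy under the models root,
    # then a literal path, otherwise return "" so the random dataset is used.
    if not dataset_file or dataset_file == "<dataset_file>":
        return ""
    candidate = os.path.join(models_root, dataset_file)
    if os.path.exists(candidate):
        return candidate
    if os.path.exists(dataset_file):
        return dataset_file
    return ""

# The "<dataset_file>" placeholder resolves to "" (caller falls back to the random dataset);
# a real value such as "datasets/perf-ci/deepseek_r1-8k1k-20480-ratio-1_for_serve.json"
# resolves to "<models_root>/datasets/perf-ci/..." when that file exists.
print(resolve_dataset_path("/models", "<dataset_file>"))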
@ -141,6 +156,7 @@ class ServerConfig:
self.disable_overlap_scheduler = server_config_data.get("disable_overlap_scheduler", False)
self.num_postprocess_workers = server_config_data.get("num_postprocess_workers", 0)
self.stream_interval = server_config_data.get("stream_interval", 10)
self.print_iter_log = server_config_data.get("print_iter_log", False)
self.attn_backend = server_config_data.get("attn_backend", "TRTLLM")
self.enable_chunked_prefill = server_config_data.get("enable_chunked_prefill", False)
self.enable_attention_dp = server_config_data.get("enable_attention_dp", False)
@ -213,6 +229,7 @@ class ServerConfig:
self.eagle3_layers_to_capture = []
self.max_draft_len = speculative_config.get("max_draft_len", 0)
self.speculative_model = speculative_config.get("speculative_model", "")
self.eagle3_one_model = speculative_config.get("eagle3_one_model", False)
# match_mode: "config" (default) or "scenario"
self.match_mode = server_config_data.get("match_mode", "config")
@ -340,6 +357,7 @@ class ServerConfig:
"s_eagle3_layers_to_capture": ",".join(map(str, self.eagle3_layers_to_capture)),
"l_max_draft_len": self.max_draft_len,
"s_speculative_model_dir": self.speculative_model,
"b_eagle3_one_model": self.eagle3_one_model,
"s_server_log_link": "",
"s_server_env_var": self.env_vars,
}
@ -366,7 +384,12 @@ class ServerConfig:
class ClientConfig:
"""Configurations of benchmark client."""
def __init__(self, client_config_data: dict, model_name: str, env_vars: str = ""):
def __init__(
self,
client_config_data: dict,
model_name: str,
env_vars: str = "",
):
self.model_name = model_name
self.concurrency = client_config_data.get("concurrency", 1)
self.iterations = client_config_data.get("iterations", 1)
@ -378,6 +401,7 @@ class ClientConfig:
self.streaming = client_config_data.get("streaming", True)
self.trust_remote_code = client_config_data.get("trust_remote_code", True)
self.model_path = ""
self.dataset_file = client_config_data.get("dataset_file", "")
self.env_vars = env_vars
# Generate default name if not provided
@ -389,7 +413,7 @@ class ClientConfig:
"""Generate benchmark command."""
model_dir = get_model_dir(self.model_name)
self.model_path = model_dir if os.path.exists(model_dir) else self.model_name
dataset_path = get_dataset_path()
dataset_path = get_dataset_dir(self.dataset_file)
benchmark_cmd = [
"python",
"-m",
@ -398,9 +422,6 @@ class ClientConfig:
self.model_path,
"--tokenizer",
self.model_path,
"--dataset-name",
"random",
"--random-ids",
"--num-prompts",
str(self.concurrency * self.iterations),
"--max-concurrency",
@ -409,15 +430,27 @@ class ClientConfig:
str(self.isl),
"--random-output-len",
str(self.osl),
"--random-range-ratio",
str(self.random_range_ratio),
"--ignore-eos",
"--no-test-input",
"--percentile-metrics",
"ttft,tpot,itl,e2el",
]
if dataset_path and os.path.exists(dataset_path):
if dataset_path:
benchmark_cmd.append("--dataset-name")
benchmark_cmd.append("trtllm_custom")
benchmark_cmd.append("--dataset-path")
benchmark_cmd.append(dataset_path)
print_info(f"Dataset: {dataset_path} exists. Use trtllm_custom dataset for benchmark.")
else:
benchmark_cmd.append("--dataset-name")
benchmark_cmd.append("random")
benchmark_cmd.append("--random-ids")
benchmark_cmd.append("--random-range-ratio")
benchmark_cmd.append(str(self.random_range_ratio))
print_info(
f"Dataset file '{self.dataset_file}' is not provided or does not exist. "
f"Using random dataset (random_range_ratio={self.random_range_ratio}) for benchmark."
)
if self.backend:
benchmark_cmd.append("--backend")
benchmark_cmd.append(self.backend)
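The net effect on the client command line is one of two flag sets, sketched below (illustrative; it mirrors the branch above rather than reproducing the full benchmark invocation):

from typing import List

def dataset_flags(dataset_path: str, random_range_ratio: float) -> List[str]:
    # A resolved dataset file switches the client to the trtllm_custom dataset;
    # otherwise the random dataset with --random-ids is used.
    if dataset_path:
        return ["--dataset-name", "trtllm_custom", "--dataset-path", dataset_path]
    return ["--dataset-name", "random", "--random-ids",
            "--random-range-ratio", str(random_range_ratio)]

print(dataset_flags("datasets/perf-ci/deepseek_v32-8k1k-20480-ratio-1_for_serve.json", 0.0))
print(dataset_flags("", 0.0))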
@ -453,6 +486,7 @@ class ClientConfig:
"l_isl": self.isl,
"l_osl": self.osl,
"d_random_range_ratio": self.random_range_ratio,
"s_dataset_file": self.dataset_file,
"s_backend": self.backend,
"b_use_chat_template": self.use_chat_template,
"b_streaming": self.streaming,
@ -840,7 +874,7 @@ class PerfSanityTestConfig:
if is_disagg:
# For disagg: disagg_upload-deepseek-r1-fp4_8k1k_ctx1_gen1_dep32_bs128_eplb0_mtp0_ccb-UCX
self.runtime = "multi_node_disagg_server"
self.config_dir = "tests/integration/defs/perf/disagg/test_configs/disagg/perf"
self.config_dir = DISAGG_CONFIG_FOLDER
config_base = "-".join(labels[1:])
self.config_file = (
f"{config_base}.yaml" if not config_base.endswith(".yaml") else config_base
@ -849,7 +883,7 @@ class PerfSanityTestConfig:
else:
# For aggr: aggr_upload-config_yml or aggr_upload-config_yml-server_config_name
self.runtime = "aggr_server"
self.config_dir = "tests/scripts/perf-sanity"
self.config_dir = AGGR_CONFIG_FOLDER
config_base = labels[1]
self.config_file = (
f"{config_base}.yaml"
@ -922,7 +956,9 @@ class PerfSanityTestConfig:
client_configs = []
for client_config_data in server_config_data["client_configs"]:
client_config = ClientConfig(
client_config_data, server_config_data["model_name"], client_env_var
client_config_data,
server_config_data["model_name"],
env_vars=client_env_var,
)
client_configs.append(client_config)
@ -1026,8 +1062,13 @@ class PerfSanityTestConfig:
"backend": "openai",
"use_chat_template": False,
"streaming": benchmark.get("streaming", True),
"dataset_file": benchmark.get("dataset_file", ""),
}
client_config = ClientConfig(client_config_data, model_name, client_env_var)
client_config = ClientConfig(
client_config_data,
model_name,
env_vars=client_env_var,
)
client_configs.append(client_config)
self.server_client_configs = {0: client_configs}
@ -1417,9 +1458,6 @@ class PerfSanityTestConfig:
AGG_TEST_TYPES = ["aggr_upload", "aggr"]
DISAGG_TEST_TYPES = ["disagg_upload", "disagg"]
AGGR_CONFIG_FOLDER = "tests/scripts/perf-sanity"
DISAGG_CONFIG_FOLDER = "tests/integration/defs/perf/disagg/test_configs/disagg/perf"
def get_server_config_names(yaml_path: str) -> List[str]:
"""Read a YAML file and return the list of server_config names."""

View File

@ -0,0 +1,21 @@
version: 0.0.1
l0_dgx_b200_multi_nodes_disagg_perf_sanity_ctx1_node1_gpu4_gen1_node1_gpu8:
- condition:
ranges:
# 2 nodes, each with 8 GPUs
system_gpu_count:
gte: 16
lte: 16
wildcards:
gpu:
- '*b200*'
terms:
stage: post_merge
backend: pytorch
tests:
- perf/test_perf_sanity.py::test_e2e[disagg_upload-b200-deepseek-r1-fp4_1k1k_ctx1_dep4_gen1_dep8_eplb0_mtp1_ccb-UCX] TIMEOUT (90)
- perf/test_perf_sanity.py::test_e2e[disagg_upload-b200-deepseek-r1-fp4_1k1k_ctx1_dep4_gen1_dep8_eplb0_mtp3_ccb-UCX] TIMEOUT (90)
- perf/test_perf_sanity.py::test_e2e[disagg_upload-b200-deepseek-r1-fp4_1k1k_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX] TIMEOUT (90)
- perf/test_perf_sanity.py::test_e2e[disagg_upload-b200-deepseek-r1-fp4_8k1k_con256_ctx1_dep4_gen1_dep8_eplb0_mtp1_ccb-UCX] TIMEOUT (90)
- perf/test_perf_sanity.py::test_e2e[disagg_upload-b200-deepseek-r1-fp4_8k1k_con1536_ctx1_dep4_gen1_dep8_eplb0_mtp1_ccb-UCX] TIMEOUT (90)
- perf/test_perf_sanity.py::test_e2e[disagg_upload-b200-deepseek-r1-fp4_8k1k_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX] TIMEOUT (90)

View File

@ -1,5 +1,5 @@
version: 0.0.1
l0_gb200_multi_nodes_aggr_perf_sanity_2_nodes:
l0_gb200_multi_nodes_aggr_perf_sanity_node2_gpu8:
- condition:
ranges:
# 2 nodes, each with 4 GPUs

View File

@ -1,16 +0,0 @@
version: 0.0.1
l0_gb200_multi_nodes_disagg_perf_sanity_3_nodes:
- condition:
ranges:
# 3 nodes, each with 4 GPUs
system_gpu_count:
gte: 12
lte: 12
wildcards:
gpu:
- '*gb200*'
terms:
stage: post_merge
backend: pytorch
tests:
- perf/test_perf_sanity.py::test_e2e[disagg_upload-deepseek-r1-fp4_1k1k_ctx1_gen1_dep8_bs768_eplb0_mtp0_ccb-UCX] TIMEOUT (90)

View File

@ -1,17 +0,0 @@
version: 0.0.1
l0_gb200_multi_nodes_disagg_perf_sanity_6_nodes:
- condition:
ranges:
# 6 nodes, each with 4 GPUs
system_gpu_count:
gte: 24
lte: 24
wildcards:
gpu:
- '*gb200*'
terms:
stage: post_merge
backend: pytorch
tests:
- perf/test_perf_sanity.py::test_e2e[disagg_upload-deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp3_ccb-UCX] TIMEOUT (90)
- perf/test_perf_sanity.py::test_e2e[disagg_upload-deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp3_ccb-NIXL] TIMEOUT (90)

View File

@ -1,16 +0,0 @@
version: 0.0.1
l0_gb200_multi_nodes_disagg_perf_sanity_8_nodes:
- condition:
ranges:
# 8 nodes, each with 4 GPUs
system_gpu_count:
gte: 32
lte: 32
wildcards:
gpu:
- '*gb200*'
terms:
stage: post_merge
backend: pytorch
tests:
- perf/test_perf_sanity.py::test_e2e[disagg_upload-deepseek-r1-fp4_8k1k_ctx1_gen1_dep32_bs128_eplb0_mtp3_ccb-UCX] TIMEOUT (90)

View File

@ -0,0 +1,17 @@
version: 0.0.1
l0_gb200_multi_nodes_disagg_perf_sanity_ctx1_node1_gpu1_gen1_node1_gpu4:
- condition:
ranges:
# 1 ctx worker: 1 node, 1 GPU
# 1 gen worker: 1 node, 4 GPUs
system_gpu_count:
gte: 8
lte: 8
wildcards:
gpu:
- '*gb200*'
terms:
stage: post_merge
backend: pytorch
tests:
- perf/test_perf_sanity.py::test_e2e[disagg_upload-gb200-qwen3-235b-fp4_8k1k_ctx1_tp1_gen1_tep4_eplb0_mtp0_ccb-UCX] TIMEOUT (120)

View File

@ -0,0 +1,17 @@
version: 0.0.1
l0_gb200_multi_nodes_disagg_perf_sanity_ctx1_node1_gpu1_gen1_node2_gpu8:
- condition:
ranges:
# 1 ctx worker: 1 node, 1 GPU
# 1 gen worker: 2 nodes, 8 GPUs
system_gpu_count:
gte: 12
lte: 12
wildcards:
gpu:
- '*gb200*'
terms:
stage: post_merge
backend: pytorch
tests:
- perf/test_perf_sanity.py::test_e2e[disagg_upload-gb200-qwen3-235b-fp4_8k1k_ctx1_tp1_gen1_dep8_eplb0_mtp0_ccb-UCX] TIMEOUT (120)

View File

@ -0,0 +1,19 @@
version: 0.0.1
l0_gb200_multi_nodes_disagg_perf_sanity_ctx1_node1_gpu4_gen1_node1_gpu4:
- condition:
ranges:
# 1 ctx worker: 1 node, 4 GPUs
# 1 gen worker: 1 node, 4 GPUs
system_gpu_count:
gte: 8
lte: 8
wildcards:
gpu:
- '*gb200*'
terms:
stage: post_merge
backend: pytorch
tests:
- perf/test_perf_sanity.py::test_e2e[disagg_upload-gb200-deepseek-r1-fp4_1k1k_ctx1_dep4_gen1_dep4_eplb0_mtp1_ccb-UCX] TIMEOUT (120)
- perf/test_perf_sanity.py::test_e2e[disagg_upload-gb200-deepseek-v32-fp4_1k1k_ctx1_dep4_gen1_dep4_eplb0_mtp1_ccb-UCX] TIMEOUT (120)
- perf/test_perf_sanity.py::test_e2e[disagg_upload-gb200-kimi-k2-thinking-fp4_1k1k_ctx1_dep4_gen1_tep4_eplb0_mtp0_ccb-UCX] TIMEOUT (120)
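The system_gpu_count bounds in these condition files follow from the topology encoded in the stage name: each worker is allocated whole nodes, so the bound is (ctx nodes + gen nodes) multiplied by the GPUs per node (4 on GB200, 8 on DGX B200), regardless of how many GPUs a worker actually uses on its nodes. A small sketch of that arithmetic (hypothetical helper, not part of the commit):

def required_system_gpus(ctx_nodes: int, gen_nodes: int, gpus_per_node: int) -> int:
    # Whole-node allocation: the condition must match every GPU on every allocated node.
    return (ctx_nodes + gen_nodes) * gpus_per_node

assert required_system_gpus(1, 1, 4) == 8    # ctx1_node1_gpu1 + gen1_node1_gpu4 (GB200)
assert required_system_gpus(1, 2, 4) == 12   # ctx1_node1_gpu1 + gen1_node2_gpu8 (GB200)
assert required_system_gpus(1, 1, 8) == 16   # ctx1_node1_gpu4 + gen1_node1_gpu8 (DGX B200)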

View File

@ -0,0 +1,24 @@
version: 0.0.1
l0_gb200_multi_nodes_disagg_perf_sanity_ctx1_node1_gpu4_gen1_node2_gpu8:
- condition:
ranges:
# 1 ctx worker: 1 node, 4 GPUs
# 1 gen worker: 2 nodes, 8 GPUs
system_gpu_count:
gte: 12
lte: 12
wildcards:
gpu:
- '*gb200*'
terms:
stage: post_merge
backend: pytorch
tests:
- perf/test_perf_sanity.py::test_e2e[disagg_upload-gb200-deepseek-r1-fp4_1k1k_ctx1_dep4_gen1_dep8_eplb0_mtp0_ccb-UCX] TIMEOUT (120)
- perf/test_perf_sanity.py::test_e2e[disagg_upload-gb200-deepseek-r1-fp4_1k1k_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX] TIMEOUT (120)
- perf/test_perf_sanity.py::test_e2e[disagg_upload-gb200-deepseek-r1-fp4_8k1k_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX] TIMEOUT (120)
# - perf/test_perf_sanity.py::test_e2e[disagg_upload-gb200-deepseek-v32-fp4_1k1k_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX] TIMEOUT (120)
# - perf/test_perf_sanity.py::test_e2e[disagg_upload-gb200-deepseek-v32-fp4_32k4k_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX] TIMEOUT (120)
# - perf/test_perf_sanity.py::test_e2e[disagg_upload-gb200-deepseek-v32-fp4_8k1k_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX] TIMEOUT (120)
- perf/test_perf_sanity.py::test_e2e[disagg_upload-gb200-kimi-k2-thinking-fp4_1k1k_ctx1_dep4_gen1_dep8_eplb0_mtp0_ccb-UCX] TIMEOUT (120)
- perf/test_perf_sanity.py::test_e2e[disagg_upload-gb200-kimi-k2-thinking-fp4_8k1k_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX] TIMEOUT (120)

View File

@ -0,0 +1,18 @@
version: 0.0.1
l0_gb200_multi_nodes_disagg_perf_sanity_ctx1_node1_gpu4_gen1_node4_gpu16:
- condition:
ranges:
# 1 ctx worker: 1 node, 4 GPUs
# 1 gen worker: 4 nodes, 16 GPUs
system_gpu_count:
gte: 20
lte: 20
wildcards:
gpu:
- '*gb200*'
terms:
stage: post_merge
backend: pytorch
tests:
- perf/test_perf_sanity.py::test_e2e[disagg_upload-gb200-deepseek-r1-fp4_8k1k_ctx1_dep4_gen1_dep16_eplb0_mtp1_ccb-UCX] TIMEOUT (120)
- perf/test_perf_sanity.py::test_e2e[disagg_upload-gb200-kimi-k2-thinking-fp4_8k1k_ctx1_dep4_gen1_dep16_eplb384_mtp0_ccb-UCX] TIMEOUT (120)

View File

@ -0,0 +1,25 @@
version: 0.0.1
l0_gb200_multi_nodes_disagg_perf_sanity_ctx1_node1_gpu4_gen1_node8_gpu32:
- condition:
ranges:
      # 1 ctx worker using 1 node and 4 GPUs
      # 1 gen worker using 8 nodes and 32 GPUs
system_gpu_count:
gte: 36
lte: 36
wildcards:
gpu:
- '*gb200*'
terms:
stage: post_merge
backend: pytorch
tests:
- perf/test_perf_sanity.py::test_e2e[disagg_upload-gb200-deepseek-r1-fp4_1k1k_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-UCX] TIMEOUT (120)
- perf/test_perf_sanity.py::test_e2e[disagg_upload-gb200-deepseek-r1-fp4_8k1k_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-UCX] TIMEOUT (120)
- perf/test_perf_sanity.py::test_e2e[disagg_upload-gb200-deepseek-v32-fp4_1k1k_ctx1_dep4_gen1_dep32_eplb256_mtp3_ccb-UCX] TIMEOUT (120)
- perf/test_perf_sanity.py::test_e2e[disagg_upload-gb200-deepseek-v32-fp4_32k4k_ctx1_dep4_gen1_dep32_eplb288_mtp1_ccb-UCX] TIMEOUT (120)
- perf/test_perf_sanity.py::test_e2e[disagg_upload-gb200-deepseek-v32-fp4_32k4k_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-UCX] TIMEOUT (120)
- perf/test_perf_sanity.py::test_e2e[disagg_upload-gb200-deepseek-v32-fp4_8k1k_ctx1_dep4_gen1_dep32_eplb256_mtp0_ccb-UCX] TIMEOUT (120)
- perf/test_perf_sanity.py::test_e2e[disagg_upload-gb200-deepseek-v32-fp4_8k1k_ctx1_dep4_gen1_dep32_eplb256_mtp3_ccb-UCX] TIMEOUT (120)
- perf/test_perf_sanity.py::test_e2e[disagg_upload-gb200-kimi-k2-thinking-fp4_1k1k_ctx1_dep4_gen1_dep32_eplb384_mtp0_ccb-UCX] TIMEOUT (120)
- perf/test_perf_sanity.py::test_e2e[disagg_upload-gb200-kimi-k2-thinking-fp4_8k1k_ctx1_dep4_gen1_dep32_eplb416_mtp3_ccb-UCX] TIMEOUT (120)

View File

@ -0,0 +1,17 @@
version: 0.0.1
l0_gb200_multi_nodes_disagg_perf_sanity_ctx1_node2_gpu8_gen1_node2_gpu8:
- condition:
ranges:
      # 1 ctx worker using 2 nodes and 8 GPUs
      # 1 gen worker using 2 nodes and 8 GPUs
system_gpu_count:
gte: 16
lte: 16
wildcards:
gpu:
- '*gb200*'
terms:
stage: post_merge
backend: pytorch
tests:
- perf/test_perf_sanity.py::test_e2e[disagg_upload-gb200-deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_tep8_eplb0_mtp3_ccb-UCX] TIMEOUT (120)

View File

@ -0,0 +1,17 @@
version: 0.0.1
l0_gb200_multi_nodes_disagg_perf_sanity_ctx1_node2_gpu8_gen1_node4_gpu16:
- condition:
ranges:
      # 1 ctx worker using 2 nodes and 8 GPUs
      # 1 gen worker using 4 nodes and 16 GPUs
system_gpu_count:
gte: 24
lte: 24
wildcards:
gpu:
- '*gb200*'
terms:
stage: post_merge
backend: pytorch
tests:
- perf/test_perf_sanity.py::test_e2e[disagg_upload-gb200-deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_dep16_eplb0_mtp2_ccb-UCX] TIMEOUT (120)

View File

@ -0,0 +1,17 @@
version: 0.0.1
l0_gb200_multi_nodes_disagg_perf_sanity_ctx1_node2_gpu8_gen1_node8_gpu32:
- condition:
ranges:
      # 1 ctx worker using 2 nodes and 8 GPUs
      # 1 gen worker using 8 nodes and 32 GPUs
system_gpu_count:
gte: 40
lte: 40
wildcards:
gpu:
- '*gb200*'
terms:
stage: post_merge
backend: pytorch
tests:
- perf/test_perf_sanity.py::test_e2e[disagg_upload-gb200-deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_dep32_eplb0_mtp3_ccb-UCX] TIMEOUT (120)

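Across the testlists above, the system_gpu_count bounds follow directly from the worker topology encoded in each file name: every node allocated to a ctx or gen worker appears to be counted in full at 4 GPUs per GB200 node, even when a worker uses fewer GPUs (ctx1_node1_gpu1 still accounts for a whole 4-GPU node). A minimal sketch of that arithmetic, with the 4-GPUs-per-node figure as an assumption:

GPUS_PER_NODE = 4  # assumed GB200 node size; every allocated node counts in full

def system_gpu_count(ctx_nodes: int, gen_nodes: int) -> int:
    # Total GPUs a stage must reserve, independent of how many GPUs
    # each worker actually uses on its nodes.
    return (ctx_nodes + gen_nodes) * GPUS_PER_NODE

# These match the gte/lte bounds in the testlists above.
assert system_gpu_count(ctx_nodes=1, gen_nodes=2) == 12   # ctx1_node1_gpu1 + gen1_node2_gpu8
assert system_gpu_count(ctx_nodes=1, gen_nodes=8) == 36   # ctx1_node1_gpu4 + gen1_node8_gpu32
assert system_gpu_count(ctx_nodes=2, gen_nodes=8) == 40   # ctx1_node2_gpu8 + gen1_node8_gpu32
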
View File

@ -291,9 +291,7 @@ unittest/llmapi/test_mpi_session.py::test_llmapi_launch_multiple_tasks SKIP (htt
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[ep4-mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5819019)
unittest/_torch/thop/serial/test_moe.py::TestMoeFp4::test_gptoss_style_nvfp4[limit1-beta0-alpha1-RoutingGPTOSS-512-512-1] SKIP (https://nvbugs/5819042)
accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4_4gpus[latency_moe_trtllm_eagle3] SKIP (https://nvbugs/5819048)
perf/test_perf_sanity.py::test_e2e[aggr_upload-k2_thinking_fp4_2_nodes_grace_blackwell-k2_thinking_fp4_dep8_32k8k] SKIP (https://nvbugs/5819053)
perf/test_perf_sanity.py::test_e2e[aggr_upload-k2_thinking_fp4_2_nodes_grace_blackwell-k2_thinking_fp4_tep8_32k8k] SKIP (https://nvbugs/5819053)
perf/test_perf_sanity.py::test_e2e[aggr_upload-k2_thinking_fp4_grace_blackwell-k2_thinking_fp4_tep4_8k1k] SKIP (https://nvbugs/5820541)
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[tep4_latency_moe_cutlass-torch_compile=False] SKIP (https://nvbugs/5819021)
disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_bf16_cache_aware_balance[DeepSeek-V3-Lite-bf16] SKIP (https://nvbugs/5820576)
llmapi/test_llm_examples.py::test_llmapi_tensorrt_engine SKIP (https://nvbugs/5820553)
accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_auto_dtype[tp4-cuda_graph=False] SKIP (https://nvbugs/5820938)
@ -317,10 +315,8 @@ accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype[mtp_
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-mtp_nextn=2-fp8kv=True-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False-sampler_async_worker=False] SKIP (https://nvbugs/5701445)
accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp4[tp4-cuda_graph=True] SKIP (https://nvbugs/5820734)
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-ep4-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/5800672)
perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_2_nodes_grace_blackwell-r1_fp4_v2_tep8_mtp3] SKIP (https://nvbugs/5819053)
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[ep4-mtp_nextn=2-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/5823284)
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-tp2pp2-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/5800672)
perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tp4_mtp3_1k8k] SKIP (https://nvbugs/5819053)
perf/test_perf_sanity.py::test_e2e[disagg_upload-deepseek-r1-fp4_1k1k_ctx1_gen1_dep8_bs768_eplb0_mtp0_ccb-UCX] SKIP (https://nvbugs/5819053)
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_bfloat16_4gpus[tp4-attn_backend=TRTLLM-torch_compile=True] SKIP (https://nvbugs/5826604)
disaggregated/test_disaggregated.py::test_disaggregated_mixed[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/5834212)
accuracy/test_llm_api_pytorch.py::TestGLM4_5Air::test_nvfp4_multi_gpus[throughput_trtllm] SKIP (https://nvbugs/5837275)
@ -337,7 +333,6 @@ test_e2e.py::test_openai_chat_harmony SKIP (https://nvbugs/5819444)
accuracy/test_cli_flow.py::TestLlama2_7B::test_fp8 SKIP (https://nvbugs/5819452)
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[ep4-mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5800646)
accuracy/test_disaggregated_serving.py::TestQwen3_30B_A3B::test_mixed_ctx_gen_model[ctxpp2gentp2] SKIP (https://nvbugs/5748664)
perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tep4_mtp3_8k1k] SKIP (https://nvbugs/5819053)
examples/test_llama.py::test_llama_3_x_with_bf16_lora_torch[llama-3.2-1b-instruct] SKIP (https://nvbugs/5838178)
accuracy/test_llm_api_autodeploy.py::TestNemotronSuperV3::test_bf16 SKIP (https://nvbugs/5838184)
cpp/test_multi_gpu.py::test_cache_transceiver[8proc-mooncake_kvcache-90] SKIP (https://nvbugs/5838199)
@ -362,7 +357,6 @@ full:RTXPro6000D/accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bflo
full:RTXPro6000D/accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-ep4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5846154)
full:RTXPro6000D/accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-tp2pp2-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5846154)
full:RTXPro6000D/accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-tp4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5846154)
perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_dep4_mtp1_1k8k] SKIP (https://nvbugs/5846166)
accuracy/test_disaggregated_serving.py::TestDeepSeekV32Exp::test_auto_dtype[False] SKIP (https://nvbugs/5847284)
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v1_kv_cache-dp4-trtllm-fp8] SKIP (https://nvbugs/5850183)
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v2_kv_cache-dp4-trtllm-fp8] SKIP (https://nvbugs/5850183)

View File

@ -38,8 +38,8 @@ server_configs:
iterations: 10
isl: 1024
osl: 1024
random_range_ratio: 0.2
backend: "openai"
dataset_file: datasets/perf-ci/deepseek_r1-1k1k-20480-ratio-1_for_serve.json
# 8k1k configs - DEP8 with CUTLASS, MTP1
- name: "r1_fp4_v2_dep8_mtp1_8k1k"
@ -74,8 +74,8 @@ server_configs:
iterations: 10
isl: 8192
osl: 1024
random_range_ratio: 0.2
backend: "openai"
dataset_file: datasets/perf-ci/deepseek_r1-8k1k-20480-ratio-1_for_serve.json
# 1k1k configs - TEP8 with TRTLLM, MTP3
- name: "r1_fp4_v2_tep8_mtp3"
@ -105,5 +105,5 @@ server_configs:
iterations: 12
isl: 1024
osl: 1024
random_range_ratio: 0.8
backend: "openai"
dataset_file: datasets/perf-ci/deepseek_r1-1k1k-20480-ratio-1_for_serve.json

View File

@ -31,8 +31,8 @@ server_configs:
iterations: 10
isl: 1024
osl: 1024
random_range_ratio: 0.2
backend: "openai"
dataset_file: datasets/perf-ci/deepseek_r1-1k1k-20480-ratio-1_for_serve.json
# 1k1k configs - DEP8 with CUTLASS, MTP1
- name: "r1_fp4_v2_dep8_mtp1_1k1k"
@ -66,8 +66,8 @@ server_configs:
iterations: 10
isl: 1024
osl: 1024
random_range_ratio: 0.2
backend: "openai"
dataset_file: datasets/perf-ci/deepseek_r1-1k1k-20480-ratio-1_for_serve.json
# 8k1k configs - TP4 with TRTLLM, MTP3
- name: "r1_fp4_v2_tp4_mtp3_8k1k"
@ -97,8 +97,8 @@ server_configs:
iterations: 10
isl: 8192
osl: 1024
random_range_ratio: 0.2
backend: "openai"
dataset_file: datasets/perf-ci/deepseek_r1-8k1k-20480-ratio-1_for_serve.json
# 8k1k configs - DEP8 with CUTLASS, MTP1
- name: "r1_fp4_v2_dep8_mtp1_8k1k"
@ -132,5 +132,5 @@ server_configs:
iterations: 10
isl: 8192
osl: 1024
random_range_ratio: 0.2
backend: "openai"
dataset_file: datasets/perf-ci/deepseek_r1-8k1k-20480-ratio-1_for_serve.json

View File

@ -31,20 +31,13 @@ server_configs:
decoding_type: 'MTP'
num_nextn_predict_layers: 1
client_configs:
- name: "con2048_iter5_1k1k"
concurrency: 2048
iterations: 5
isl: 1024
osl: 1024
random_range_ratio: 0.8
backend: "openai"
- name: "con1024_iter10_1k1k"
concurrency: 1024
iterations: 10
isl: 1024
osl: 1024
random_range_ratio: 0.2
backend: "openai"
dataset_file: datasets/perf-ci/deepseek_r1-1k1k-20480-ratio-1_for_serve.json
# 1k1k configs - TEP4 with TRTLLM, MTP3
- name: "r1_fp4_v2_tep4_mtp3_1k1k"
@ -74,8 +67,8 @@ server_configs:
iterations: 10
isl: 1024
osl: 1024
random_range_ratio: 0.8
backend: "openai"
dataset_file: datasets/perf-ci/deepseek_r1-1k1k-20480-ratio-1_for_serve.json
# 1k1k configs - TP4 with TRTLLM, MTP3
- name: "r1_fp4_v2_tp4_mtp3_1k1k"
@ -100,20 +93,13 @@ server_configs:
decoding_type: 'MTP'
num_nextn_predict_layers: 3
client_configs:
- name: "con4_iter10_1k1k"
concurrency: 4
iterations: 10
isl: 1024
osl: 1024
random_range_ratio: 0.8
backend: "openai"
- name: "con2_iter10_1k1k"
concurrency: 2
iterations: 10
isl: 1024
osl: 1024
random_range_ratio: 0.2
backend: "openai"
dataset_file: datasets/perf-ci/deepseek_r1-1k1k-20480-ratio-1_for_serve.json
# 8k1k configs - DEP4 with CUTLASS, MTP1
- name: "r1_fp4_v2_dep4_mtp1_8k1k"
@ -142,20 +128,13 @@ server_configs:
decoding_type: 'MTP'
num_nextn_predict_layers: 1
client_configs:
- name: "con2048_iter5_8k1k"
concurrency: 2048
iterations: 5
isl: 8192
osl: 1024
random_range_ratio: 0.2
backend: "openai"
- name: "con256_iter10_8k1k"
concurrency: 256
iterations: 10
isl: 8192
osl: 1024
random_range_ratio: 0.2
backend: "openai"
dataset_file: datasets/perf-ci/deepseek_r1-8k1k-20480-ratio-1_for_serve.json
# 8k1k configs - TEP4 with TRTLLM, MTP3
- name: "r1_fp4_v2_tep4_mtp3_8k1k"
@ -185,8 +164,8 @@ server_configs:
iterations: 10
isl: 8192
osl: 1024
random_range_ratio: 0.2
backend: "openai"
dataset_file: datasets/perf-ci/deepseek_r1-8k1k-20480-ratio-1_for_serve.json
# 8k1k configs - TP4 with TRTLLM, MTP3
- name: "r1_fp4_v2_tp4_mtp3_8k1k"
@ -211,20 +190,13 @@ server_configs:
decoding_type: 'MTP'
num_nextn_predict_layers: 3
client_configs:
- name: "con4_iter10_8k1k"
concurrency: 4
iterations: 10
isl: 8192
osl: 1024
random_range_ratio: 0.2
backend: "openai"
- name: "con2_iter10_8k1k"
concurrency: 2
iterations: 10
isl: 8192
osl: 1024
random_range_ratio: 0.2
backend: "openai"
dataset_file: datasets/perf-ci/deepseek_r1-8k1k-20480-ratio-1_for_serve.json
# 1k8k configs - DEP4 with CUTLASS, MTP1
- name: "r1_fp4_v2_dep4_mtp1_1k8k"
@ -258,8 +230,8 @@ server_configs:
iterations: 5
isl: 1024
osl: 8192
random_range_ratio: 0.8
backend: "openai"
dataset_file: datasets/perf-ci/deepseek_r1-1k8k-20480-ratio-1_for_serve.json
# 1k8k configs - TEP4 with TRTLLM, MTP3
- name: "r1_fp4_v2_tep4_mtp3_1k8k"
@ -289,8 +261,8 @@ server_configs:
iterations: 10
isl: 1024
osl: 8192
random_range_ratio: 0.8
backend: "openai"
dataset_file: datasets/perf-ci/deepseek_r1-1k8k-20480-ratio-1_for_serve.json
# 1k8k configs - TP4 with TRTLLM, MTP3
- name: "r1_fp4_v2_tp4_mtp3_1k8k"
@ -320,5 +292,5 @@ server_configs:
iterations: 10
isl: 1024
osl: 8192
random_range_ratio: 0.8
backend: "openai"
dataset_file: datasets/perf-ci/deepseek_r1-1k8k-20480-ratio-1_for_serve.json

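The recurring change across these perf-sanity server configs drops the synthetic random_range_ratio knob and points each client config at a fixed, pre-generated dataset_file, presumably so consecutive runs exercise the same request mix. A hypothetical post-change client_configs entry, written as a Python literal for compactness (field names and values mirror the entries above):

client_config = {
    "name": "con1024_iter10_1k1k",
    "concurrency": 1024,
    "iterations": 10,
    "isl": 1024,   # input sequence length
    "osl": 1024,   # output sequence length
    "backend": "openai",
    # replaces random_range_ratio: requests are replayed from a pre-generated dataset
    "dataset_file": "datasets/perf-ci/deepseek_r1-1k1k-20480-ratio-1_for_serve.json",
}
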
View File

@ -31,8 +31,8 @@ server_configs:
iterations: 10
isl: 1024
osl: 1024
random_range_ratio: 0.2
backend: "openai"
dataset_file: datasets/perf-ci/deepseek_r1-1k1k-20480-ratio-1_for_serve.json
# 1k1k configs - DEP8 with DEEPGEMM, MTP1
- name: "r1_fp8_dep8_mtp1_1k1k"
@ -66,8 +66,8 @@ server_configs:
iterations: 10
isl: 1024
osl: 1024
random_range_ratio: 0.2
backend: "openai"
dataset_file: datasets/perf-ci/deepseek_r1-1k1k-20480-ratio-1_for_serve.json
# 8k1k configs - TP8 with TRTLLM, MTP3
- name: "r1_fp8_tp8_mtp3_8k1k"
@ -97,8 +97,8 @@ server_configs:
iterations: 10
isl: 8192
osl: 1024
random_range_ratio: 0.2
backend: "openai"
dataset_file: datasets/perf-ci/deepseek_r1-8k1k-20480-ratio-1_for_serve.json
# 8k1k configs - DEP8 with DEEPGEMM, MTP1
- name: "r1_fp8_dep8_mtp1_8k1k"
@ -132,5 +132,5 @@ server_configs:
iterations: 10
isl: 8192
osl: 1024
random_range_ratio: 0.2
backend: "openai"
dataset_file: datasets/perf-ci/deepseek_r1-8k1k-20480-ratio-1_for_serve.json

View File

@ -31,8 +31,8 @@ server_configs:
iterations: 10
isl: 8192
osl: 1024
random_range_ratio: 0.2
backend: "openai"
dataset_file: datasets/perf-ci/deepseek_v32-8k1k-20480-ratio-1_for_serve.json
# 8k1k configs - DEP8 with CUTLASS, MTP1
- name: "v32_fp4_dep8_mtp1_8k1k"
@ -66,5 +66,5 @@ server_configs:
iterations: 10
isl: 8192
osl: 1024
random_range_ratio: 0.2
backend: "openai"
dataset_file: datasets/perf-ci/deepseek_v32-8k1k-20480-ratio-1_for_serve.json

View File

@ -31,8 +31,8 @@ server_configs:
iterations: 10
isl: 1024
osl: 1024
random_range_ratio: 0.2
backend: "openai"
dataset_file: datasets/perf-ci/deepseek_v32-1k1k-20480-ratio-1_for_serve.json
# 1k1k configs - DEP4 with CUTLASS, MTP1
- name: "v32_fp4_dep4_mtp1_1k1k"
@ -66,8 +66,8 @@ server_configs:
iterations: 10
isl: 1024
osl: 1024
random_range_ratio: 0.2
backend: "openai"
dataset_file: datasets/perf-ci/deepseek_v32-1k1k-20480-ratio-1_for_serve.json
# 8k1k configs - TEP4 with TRTLLM, MTP3
- name: "v32_fp4_tep4_mtp3_8k1k"
@ -97,8 +97,8 @@ server_configs:
iterations: 10
isl: 8192
osl: 1024
random_range_ratio: 0.2
backend: "openai"
dataset_file: datasets/perf-ci/deepseek_v32-8k1k-20480-ratio-1_for_serve.json
# 8k1k configs - DEP4 with CUTLASS, MTP1
- name: "v32_fp4_dep4_mtp1_8k1k"
@ -132,5 +132,5 @@ server_configs:
iterations: 10
isl: 8192
osl: 1024
random_range_ratio: 0.2
backend: "openai"
dataset_file: datasets/perf-ci/deepseek_v32-8k1k-20480-ratio-1_for_serve.json

View File

@ -32,7 +32,7 @@ server_configs:
iterations: 5
isl: 1024
osl: 8192
random_range_ratio: 0.8
random_range_ratio: 0.0
backend: "openai"
- name: "gpt_oss_fp4_dep2_1k1k"
@ -63,7 +63,7 @@ server_configs:
iterations: 5
isl: 1024
osl: 1024
random_range_ratio: 0.8
random_range_ratio: 0.0
backend: "openai"
- name: "gpt_oss_fp4_tep2_1k8k"
@ -92,7 +92,7 @@ server_configs:
iterations: 10
isl: 1024
osl: 8192
random_range_ratio: 0.8
random_range_ratio: 0.0
backend: "openai"
- name: "gpt_oss_fp4_tp2_1k8k"
@ -121,7 +121,7 @@ server_configs:
iterations: 10
isl: 1024
osl: 8192
random_range_ratio: 0.8
random_range_ratio: 0.0
backend: "openai"
- name: "gpt_oss_fp4_tp4_eagle3_1k1k"
@ -155,5 +155,5 @@ server_configs:
iterations: 32
isl: 1024
osl: 1024
random_range_ratio: 0.8
random_range_ratio: 0.0
backend: "openai"

View File

@ -32,9 +32,9 @@ server_configs:
iterations: 10
isl: 32768
osl: 8192
random_range_ratio: 0.2
backend: "openai"
trust_remote_code: true
dataset_file: datasets/perf-ci/k2_thinking-32k8k-20480-ratio-1_for_serve.json
# 32k8k configs - DEP8 with CUTLASS
- name: "k2_thinking_fp4_dep8_32k8k"
@ -67,6 +67,6 @@ server_configs:
iterations: 10
isl: 32768
osl: 8192
random_range_ratio: 0.2
backend: "openai"
trust_remote_code: true
dataset_file: datasets/perf-ci/k2_thinking-32k8k-20480-ratio-1_for_serve.json

View File

@ -29,9 +29,9 @@ server_configs:
iterations: 10
isl: 8192
osl: 1024
random_range_ratio: 0.2
backend: "openai"
trust_remote_code: true
dataset_file: datasets/perf-ci/k2_thinking-8k1k-20480-ratio-1_for_serve.json
# 8k1k configs - DEP8 with CUTLASS
- name: "k2_thinking_fp4_dep8_8k1k"
@ -63,9 +63,9 @@ server_configs:
iterations: 10
isl: 8192
osl: 1024
random_range_ratio: 0.2
backend: "openai"
trust_remote_code: true
dataset_file: datasets/perf-ci/k2_thinking-8k1k-20480-ratio-1_for_serve.json
# 32k8k configs - TEP8 with TRTLLM
- name: "k2_thinking_fp4_tep8_32k8k"
@ -94,9 +94,9 @@ server_configs:
iterations: 10
isl: 32768
osl: 8192
random_range_ratio: 0.2
backend: "openai"
trust_remote_code: true
dataset_file: datasets/perf-ci/k2_thinking-32k8k-20480-ratio-1_for_serve.json
# 32k8k configs - DEP8 with CUTLASS
- name: "k2_thinking_fp4_dep8_32k8k"
@ -129,6 +129,6 @@ server_configs:
iterations: 10
isl: 32768
osl: 8192
random_range_ratio: 0.2
backend: "openai"
trust_remote_code: true
dataset_file: datasets/perf-ci/k2_thinking-32k8k-20480-ratio-1_for_serve.json

View File

@ -29,9 +29,9 @@ server_configs:
iterations: 10
isl: 8192
osl: 1024
random_range_ratio: 0.2
backend: "openai"
trust_remote_code: true
dataset_file: datasets/perf-ci/k2_thinking-8k1k-20480-ratio-1_for_serve.json
# 8k1k configs - DEP4 with CUTLASS
- name: "k2_thinking_fp4_dep4_8k1k"
@ -63,6 +63,6 @@ server_configs:
iterations: 10
isl: 8192
osl: 1024
random_range_ratio: 0.2
backend: "openai"
trust_remote_code: true
dataset_file: datasets/perf-ci/k2_thinking-8k1k-20480-ratio-1_for_serve.json