Mirror of https://github.com/NVIDIA/TensorRT-LLM.git, synced 2026-01-13 22:18:36 +08:00
Test: Improve model re-use in C++ DGX tests for CI stability (#4263)
* Fix padded vocab size for Llama
* Refactor multi GPU llama executor tests, and reuse the built model engines
* Fix test list typo
* WIP
* Further WIP
* WIP
* Update test lists and readme
* Try parametrize for asymmetric
* Parametrize + skip unsupported combinations
* Update test list
* Reduce environment duplicated code

Signed-off-by: Dom Brown <3886319+DomBrown@users.noreply.github.com>
Signed-off-by: domb <3886319+DomBrown@users.noreply.github.com>
This commit is contained in:
parent 98018f3bb9
commit c45f414bbf
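The model reuse in this change rests on two stock pytest mechanisms: session-scoped fixtures build an engine at most once, and indirect parametrization routes the model name through such a fixture so several tests share one build. A minimal, self-contained sketch of the pattern (the engine-build function is a hypothetical stand-in, not code from this PR):

import pytest

_BUILT = set()  # tracks which engines were built in this session


def _build_engine(model_name: str) -> None:
    # Stand-in for the expensive TRT engine build; must run once per model.
    assert model_name not in _BUILT, "engine should only be built once"
    _BUILT.add(model_name)


@pytest.fixture(scope="session")
def multi_gpu_model(request):
    # Indirect parametrization: request.param carries the model name, and
    # session scope caches the fixture per parameter value.
    _build_engine(request.param)
    return request.param


@pytest.mark.parametrize("multi_gpu_model", ["llama"], indirect=True)
def test_first(multi_gpu_model):
    assert multi_gpu_model == "llama"  # engine built here, once


@pytest.mark.parametrize("multi_gpu_model", ["llama"], indirect=True)
def test_second(multi_gpu_model):
    assert multi_gpu_model == "llama"  # reuses the cached fixture, no rebuild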
@@ -31,7 +31,7 @@ pytest tests/integration/defs/cpp/test_e2e.py::test_model[llama-90]
 
 pytest tests/integration/defs/cpp/test_e2e.py::test_benchmarks[gpt-90]
 
-pytest tests/integration/defs/cpp/test_multi_gpu.py::test_disagg[90]
+pytest tests/integration/defs/cpp/test_multi_gpu.py::TestDisagg::test_symmetric_executor[gpt-mpi_kvcache-90]
 ```
 
 ## Manual steps
@@ -29,6 +29,7 @@ namespace
 {
 
 auto constexpr LLAMA_INPUT_FILE = "input_tokens_llama.npy";
+auto constexpr LLAMA_VOCAB_SIZE_PADDED = 128256;
 auto constexpr LLAMA_END_ID = 128001;
 auto constexpr LLAMA_PAD_ID = 128001;
 
@@ -520,6 +521,7 @@ TEST_P(DisaggParamsTest, DisaggTokenComparison)
     // set defaults and adjust if needed by different models
     fs::path inputPath = DATA_PATH / "input_tokens.npy";
     ModelIds modelIds{50256, 50256};
+    SizeType32 vocabSizePadded{50257}; // gpt vocabSizePadded
     bool isSpeculativeDecoding{false};
 
     // NOTE: This can be used to disable checks for certain prompt batch entries
@@ -557,6 +559,7 @@ TEST_P(DisaggParamsTest, DisaggTokenComparison)
         || modelName == "llama_tp1_pp2_cp1" || modelName == "llama_tp2_pp1_cp1" || modelName == "llama_tp1_pp1_cp1")
     {
         inputPath = DATA_PATH / LLAMA_INPUT_FILE;
+        vocabSizePadded = LLAMA_VOCAB_SIZE_PADDED;
 
         auto const resultsPath
             = LLAMA_DATA_PATH / ((beamWidth == 1) ? "sampling" : "beam_search_" + std::to_string(beamWidth));
@@ -619,8 +622,6 @@ TEST_P(DisaggParamsTest, DisaggTokenComparison)
         }
     }
 
-    SizeType32 constexpr vocabSizePadded{50257}; // gpt vocabSizePadded
-
     // Returning logits will bring higher latency
     if (streaming && (outConfig.returnContextLogits || outConfig.returnGenerationLogits))
     {
@@ -756,6 +757,7 @@ TEST_P(DisaggOrchestratorParamsTest, DisaggTokenComparison)
     // set defaults and adjust if needed by different models
     fs::path inputPath = DATA_PATH / "input_tokens.npy";
     ModelIds modelIds{50256, 50256};
+    SizeType32 vocabSizePadded{50257}; // gpt vocabSizePadded
    bool isSpeculativeDecoding{false};
 
     // NOTE: This can be used to disable checks for certain prompt batch entries
@@ -764,6 +766,7 @@ TEST_P(DisaggOrchestratorParamsTest, DisaggTokenComparison)
         || modelName == "llama_tp1_pp2" || modelName == "llama_tp2_pp1" || modelName == "llama_tp1_pp1")
     {
         inputPath = DATA_PATH / LLAMA_INPUT_FILE;
+        vocabSizePadded = LLAMA_VOCAB_SIZE_PADDED;
 
         auto const resultsPath
             = LLAMA_DATA_PATH / ((beamWidth == 1) ? "sampling" : "beam_search_" + std::to_string(beamWidth));
@@ -827,8 +830,6 @@ TEST_P(DisaggOrchestratorParamsTest, DisaggTokenComparison)
         }
     }
 
-    SizeType32 constexpr vocabSizePadded{50257}; // gpt vocabSizePadded
-
     // Returning logits will bring higher latency
     if (streaming && (outConfig.returnContextLogits || outConfig.returnGenerationLogits))
     {
@@ -920,6 +921,7 @@ TEST_P(ConditionalDisaggParamsTest, DisaggTokenComparison)
     // set defaults and adjust if needed by different models
     fs::path inputPath = DATA_PATH / "input_tokens.npy";
     ModelIds modelIds{50256, 50256};
+    SizeType32 vocabSizePadded{50257}; // gpt vocabSizePadded
     bool isSpeculativeDecoding{false};
 
     // NOTE: This can be used to disable checks for certain prompt batch entries
@@ -935,6 +937,7 @@ TEST_P(ConditionalDisaggParamsTest, DisaggTokenComparison)
     else if (modelName == "llama_tp1_pp1_cp1")
     {
         inputPath = DATA_PATH / LLAMA_INPUT_FILE;
+        vocabSizePadded = LLAMA_VOCAB_SIZE_PADDED;
 
         auto const resultsPath
             = LLAMA_DATA_PATH / ((beamWidth == 1) ? "sampling" : "beam_search_" + std::to_string(beamWidth));
@@ -948,8 +951,6 @@ TEST_P(ConditionalDisaggParamsTest, DisaggTokenComparison)
         TLLM_THROW("Unrecognized modelName");
     }
 
-    SizeType32 constexpr vocabSizePadded{50257}; // gpt vocabSizePadded
-
     auto executorConfig = ExecutorConfig(maxBeamWidth);
     FloatType freeGpuMemoryFraction = 0.9f;
     KvCacheConfig kvCacheConfig{true, std::nullopt, std::nullopt, std::nullopt, freeGpuMemoryFraction};
@@ -1115,8 +1116,11 @@ INSTANTIATE_TEST_SUITE_P(GptSingleDeviceDisaggSymmetricExecutorMixedTest, Disagg
         testing::Values(1)),
     generateTestNameDisaggParams);
 
-INSTANTIATE_TEST_SUITE_P(ConditionalDisaggSymmetricExecutorTest, ConditionalDisaggParamsTest,
-    testing::Combine(testing::Values("gpt", "llama_tp1_pp1_cp1")), generateTestNameCondDisaggParams);
+INSTANTIATE_TEST_SUITE_P(GptConditionalDisaggSymmetricExecutorTest, ConditionalDisaggParamsTest,
+    testing::Combine(testing::Values("gpt")), generateTestNameCondDisaggParams);
+
+INSTANTIATE_TEST_SUITE_P(LlamaConditionalDisaggSymmetricExecutorTest, ConditionalDisaggParamsTest,
+    testing::Combine(testing::Values("llama_tp1_pp1_cp1")), generateTestNameCondDisaggParams);
 
 INSTANTIATE_TEST_SUITE_P(LlamaTP2DisaggSymmetricExecutorTest, DisaggParamsTest,
     testing::Combine(testing::Values(4),
@@ -1,14 +1,49 @@
 import copy
 import os as _os
 import pathlib as _pl
 import platform
 import time
+from enum import Enum, auto
 from typing import List, Optional
 
 import defs.cpp.cpp_common as _cpp
 import pytest
 
 
+# Helper filter for disagg google tests
+def get_model_test_filter_prefix(model: str) -> str:
+    if model == "llama":
+        return "Llama"
+    elif model == "gpt":
+        return "Gpt"
+    else:
+        raise ValueError(f"Unsupported model: {model}")
+
+
+class KVCacheType(Enum):
+    NONE = auto()
+    MPI = auto()
+    UCX = auto()
+
+
+def get_multi_gpu_env(kv_cache_type=KVCacheType.NONE, llama_multi_gpu=False):
+    env = {**_os.environ}
+
+    match kv_cache_type:
+        case KVCacheType.MPI:
+            env["TRTLLM_USE_MPI_KVCACHE"] = "1"
+        case KVCacheType.UCX:
+            env["TRTLLM_USE_UCX_KVCACHE"] = "1"
+        case KVCacheType.NONE:
+            pass
+        case _:
+            raise ValueError(f"Unsupported KVCacheType: {kv_cache_type}")
+
+    if llama_multi_gpu:
+        env["RUN_LLAMA_MULTI_GPU"] = "true"
+
+    return env
+
+
 def run_simple_multi_gpu_tests(build_dir: _pl.Path, timeout=1500):
     tests_dir = build_dir / "tests"
     cpp_env = {**_os.environ}
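A usage note on the helper just added: get_multi_gpu_env folds the KV-cache transport flag and the llama multi-GPU switch into a copy of the process environment, which then goes straight to the command runners. A short sketch, given the definitions above (the assertions are illustrative, not from the PR):

env = get_multi_gpu_env(kv_cache_type=KVCacheType.UCX, llama_multi_gpu=True)
assert env["TRTLLM_USE_UCX_KVCACHE"] == "1"
assert env["RUN_LLAMA_MULTI_GPU"] == "true"
# The dict is then handed to the subprocess-style runner, e.g.:
# _cpp.run_command(cmd, cwd=tests_dir, env=env, timeout=300)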
@@ -22,9 +57,9 @@ def run_simple_multi_gpu_tests(build_dir: _pl.Path, timeout=1500):
     ]
     _cpp.run_command(mpi_utils_test, cwd=tests_dir, env=cpp_env, timeout=300)
 
-    # Cache transceiver tests
-    new_env = copy.copy(cpp_env)
-    new_env["TRTLLM_USE_MPI_KVCACHE"] = "1"
+    # Cache transceiver MPI tests
+    new_env = get_multi_gpu_env(kv_cache_type=KVCacheType.MPI)
+
     cache_trans_test = [
         "mpirun",
         "-n",
@@ -34,9 +69,6 @@
     ]
     _cpp.run_command(cache_trans_test, cwd=tests_dir, env=new_env, timeout=300)
 
-    new_env = copy.copy(cpp_env)
-    new_env["TRTLLM_USE_MPI_KVCACHE"] = "1"
-    # Cache transceiver tests
     cache_trans_test_8_proc = [
         "mpirun",
         "-n",
@@ -50,8 +82,8 @@
                      timeout=600)
 
     # Cache transceiver tests with UCX
-    new_env = copy.copy(cpp_env)
-    new_env["TRTLLM_USE_UCX_KVCACHE"] = "1"
+    new_env = get_multi_gpu_env(kv_cache_type=KVCacheType.UCX)
+
     cache_trans_test = [
         "mpirun",
         "-n",
@@ -61,8 +93,6 @@
     ]
     _cpp.run_command(cache_trans_test, cwd=tests_dir, env=new_env, timeout=300)
 
-    new_env = copy.copy(cpp_env)
-    new_env["TRTLLM_USE_UCX_KVCACHE"] = "1"
     # Cache transceiver tests
     cache_trans_test_8_proc = [
         "mpirun",
@@ -91,12 +121,10 @@
                      timeout=600)
 
 
-def run_llama_executor_multi_gpu_tests(build_dir: _pl.Path, timeout=1500):
+def run_llama_executor_leader_tests(build_dir: _pl.Path, timeout=1500):
     tests_dir = build_dir / "tests"
-    cpp_env = {**_os.environ}
 
-    mgpu_env = copy.copy(cpp_env)
-    mgpu_env["RUN_LLAMA_MULTI_GPU"] = "true"
+    mgpu_env = get_multi_gpu_env(llama_multi_gpu=True)
 
     #Executor test in leader mode
     xml_output_file = build_dir / "results-multi-gpu-llama-exec-leader-mode.xml"
@@ -108,12 +136,14 @@ def run_llama_executor_multi_gpu_tests(build_dir: _pl.Path, timeout=1500):
             "--gtest_filter=*LlamaExecutorTest*LeaderMode*:*LlamaMultiExecutorTest*LeaderMode*"
         ],
         leader_commands=[f"--gtest_output=xml:{xml_output_file}"])
-    # https://nvbugspro.nvidia.com/bug/5026255 disable below tests for now.
-    if False:
-        _cpp.run_command(trt_model_test,
-                         cwd=tests_dir,
-                         env=mgpu_env,
-                         timeout=1500)
+
+    _cpp.run_command(trt_model_test, cwd=tests_dir, env=mgpu_env, timeout=1500)
+
+
+def run_llama_executor_orchestrator_tests(build_dir: _pl.Path, timeout=1500):
+    tests_dir = build_dir / "tests"
+
+    mgpu_env = get_multi_gpu_env(llama_multi_gpu=True)
 
     #Executor test in orchestrator mode
     xml_output_file = build_dir / "results-multi-gpu-llama-exec-orch-mode.xml"
@@ -124,18 +154,23 @@
     ]
     _cpp.run_command(trt_model_test, cwd=tests_dir, env=mgpu_env, timeout=1500)
 
-    #Logits processor and guided decoding test in leader mode
+
+def run_llama_executor_logits_proc_tests(build_dir: _pl.Path, timeout=1500):
+    tests_dir = build_dir / "tests"
+
+    mgpu_env = get_multi_gpu_env(llama_multi_gpu=True)
+
+    #Logits processor test in leader mode
     xml_output_file = build_dir / "results-multi-gpu-logits-proc.xml"
 
     tp_pp_sizes = [(4, 1), (2, 2), (1, 4)]
     gtest_filter = [
         f"LlamaExecutorTest/LogitsProcParamsTest*tp{tp}_pp{pp}*"
         for tp, pp in tp_pp_sizes
     ]
-    gtest_filter.extend([
-        f"LlamaExecutorGuidedDecodingTest/GuidedDecodingParamsTest*tp{tp}_pp{pp}*"
-        for tp, pp in tp_pp_sizes
-    ])
 
     gtest_filter = ":".join(gtest_filter)
 
     trt_model_test = _cpp.produce_mpirun_command(
         global_commands=["mpirun", "--allow-run-as-root"],
         nranks=4,
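For reference, the comprehension-plus-join above yields a single colon-separated gtest filter; a standalone sketch of the same construction:

tp_pp_sizes = [(4, 1), (2, 2), (1, 4)]
gtest_filter = ":".join(
    f"LlamaExecutorTest/LogitsProcParamsTest*tp{tp}_pp{pp}*"
    for tp, pp in tp_pp_sizes)
# -> "LlamaExecutorTest/LogitsProcParamsTest*tp4_pp1*:
#     LlamaExecutorTest/LogitsProcParamsTest*tp2_pp2*:
#     LlamaExecutorTest/LogitsProcParamsTest*tp1_pp4*"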
@@ -143,10 +178,38 @@ def run_llama_executor_multi_gpu_tests(build_dir: _pl.Path,
             "executor/executorTest", f"--gtest_filter={gtest_filter}"
         ],
         leader_commands=[f"--gtest_output=xml:{xml_output_file}"])
 
     _cpp.run_command(trt_model_test, cwd=tests_dir, env=mgpu_env, timeout=1500)
 
 
-def run_t5_multi_gpu_tests(build_dir: _pl.Path, timeout=1500):
+def run_llama_executor_guided_decoding_tests(build_dir: _pl.Path, timeout=1500):
+    tests_dir = build_dir / "tests"
+
+    mgpu_env = get_multi_gpu_env(llama_multi_gpu=True)
+
+    #Guided decoding test in leader mode
+    xml_output_file = build_dir / "results-multi-gpu-guided-decoding.xml"
+
+    tp_pp_sizes = [(4, 1), (2, 2), (1, 4)]
+    gtest_filter = [
+        f"LlamaExecutorGuidedDecodingTest/GuidedDecodingParamsTest*tp{tp}_pp{pp}*"
+        for tp, pp in tp_pp_sizes
+    ]
+
+    gtest_filter = ":".join(gtest_filter)
+
+    trt_model_test = _cpp.produce_mpirun_command(
+        global_commands=["mpirun", "--allow-run-as-root"],
+        nranks=4,
+        local_commands=[
+            "executor/executorTest", f"--gtest_filter={gtest_filter}"
+        ],
+        leader_commands=[f"--gtest_output=xml:{xml_output_file}"])
+
+    _cpp.run_command(trt_model_test, cwd=tests_dir, env=mgpu_env, timeout=1500)
+
+
+def run_enc_dec_multi_gpu_tests(build_dir: _pl.Path, timeout=1500):
     tests_dir = build_dir / "tests"
     cpp_env = {**_os.environ}
@@ -184,82 +247,75 @@ def run_trt_gpt_model_real_decoder_multi_gpu_tests(build_dir: _pl.Path,
                      timeout=timeout)  # expecting ~ 1200s
 
 
-def run_disagg_multi_gpu_tests(build_dir: _pl.Path):
+def run_disagg_symmetric_executor_tests(build_dir: _pl.Path,
+                                        model: str,
+                                        nprocs=2,
+                                        kvcache_type=KVCacheType.MPI,
+                                        timeout=1500):
     tests_dir = build_dir / "tests"
+
+    prefix = get_model_test_filter_prefix(model)
+
+    mgpu_env = get_multi_gpu_env(kv_cache_type=kvcache_type,
+                                 llama_multi_gpu=True)
+
+    xml_output_file = build_dir / f"results-multi-gpu-disagg-executor-{nprocs}-process.xml"
+    trt_model_test = _cpp.produce_mpirun_command(
+        global_commands=["mpirun", "--allow-run-as-root"],
+        nranks=nprocs,
+        local_commands=[
+            "executor/disaggExecutorTest",
+            f"--gtest_filter=*{prefix}*DisaggSymmetricExecutorTest*"
+        ],
+        leader_commands=[f"--gtest_output=xml:{xml_output_file}"])
+
+    _cpp.run_command(trt_model_test,
+                     cwd=tests_dir,
+                     env=mgpu_env,
+                     timeout=timeout)
+
+
+def run_disagg_asymmetric_executor_tests(build_dir: _pl.Path,
+                                         model: str,
+                                         nprocs=4,
+                                         kvcache_type=KVCacheType.MPI,
+                                         timeout=1500):
+
+    tests_dir = build_dir / "tests"
-    cpp_env = {**_os.environ}
-
-    new_env = copy.copy(cpp_env)
-    new_env["TRTLLM_USE_MPI_KVCACHE"] = "1"
-    xml_output_file = build_dir / "results-multi-gpu-disagg-executor-2-process.xml"
+    prefix = get_model_test_filter_prefix(model)
+
+    mgpu_env = get_multi_gpu_env(kv_cache_type=kvcache_type,
+                                 llama_multi_gpu=True)
+
+    xml_output_file = build_dir / f"results-multi-gpu-disagg-asymmetric-executor-{nprocs}-process.xml"
+
     trt_model_test = _cpp.produce_mpirun_command(
         global_commands=["mpirun", "--allow-run-as-root"],
-        nranks=2,
+        nranks=nprocs,
         local_commands=[
             "executor/disaggExecutorTest",
-            "--gtest_filter=*DisaggSymmetricExecutorTest*"
+            f"--gtest_filter=*{prefix}*DisaggAsymmetricExecutorTest*"
         ],
         leader_commands=[f"--gtest_output=xml:{xml_output_file}"])
-    _cpp.run_command(trt_model_test, cwd=tests_dir, env=new_env, timeout=1500)
-
-    mgpu_env = copy.copy(cpp_env)
-    mgpu_env["RUN_LLAMA_MULTI_GPU"] = "true"
-    mgpu_env["TRTLLM_USE_MPI_KVCACHE"] = "1"
-    xml_output_file = build_dir / "results-multi-gpu-disagg-executor-4-process.xml"
-    trt_model_test = _cpp.produce_mpirun_command(
-        global_commands=["mpirun", "--allow-run-as-root"],
-        nranks=4,
-        local_commands=[
-            "executor/disaggExecutorTest",
-            "--gtest_filter=*DisaggSymmetricExecutorTest*"
-        ],
-        leader_commands=[f"--gtest_output=xml:{xml_output_file}"])
-    # https://nvbugspro.nvidia.com/bug/5026255 disable below tests for now.
-    _cpp.run_command(trt_model_test, cwd=tests_dir, env=mgpu_env, timeout=1500)
+    _cpp.run_command(trt_model_test,
+                     cwd=tests_dir,
+                     env=mgpu_env,
+                     timeout=timeout)
 
-    xml_output_file = build_dir / "results-multi-gpu-disagg-executor-8-process.xml"
-    trt_model_test = _cpp.produce_mpirun_command(
-        global_commands=["mpirun", "--allow-run-as-root"],
-        nranks=8,
-        local_commands=[
-            "executor/disaggExecutorTest",
-            "--gtest_filter=*LlamaTP2PP2DisaggSymmetricExecutorTest*"
-        ],
-        leader_commands=[f"--gtest_output=xml:{xml_output_file}"])
-    _cpp.run_command(trt_model_test, cwd=tests_dir, env=mgpu_env, timeout=1500)
 
-    xml_output_file = build_dir / "results-multi-gpu-disagg-asymmetric-executor-4-process.xml"
-    trt_model_test = _cpp.produce_mpirun_command(
-        global_commands=["mpirun", "--allow-run-as-root"],
-        nranks=4,
-        local_commands=[
-            "executor/disaggExecutorTest",
-            "--gtest_filter=*DisaggAsymmetricExecutorTest*"
-        ],
-        leader_commands=[f"--gtest_output=xml:{xml_output_file}"])
-    _cpp.run_command(trt_model_test, cwd=tests_dir, env=mgpu_env, timeout=1500)
+def run_disagg_orchestrator_params_tests(build_dir: _pl.Path,
+                                         model: str,
+                                         kvcache_type=KVCacheType.MPI,
+                                         timeout=1500):
 
-    xml_output_file = build_dir / "results-multi-gpu-disagg-asymmetric-executor-6-process.xml"
-    trt_model_test = _cpp.produce_mpirun_command(
-        global_commands=["mpirun", "--allow-run-as-root"],
-        nranks=6,
-        local_commands=[
-            "executor/disaggExecutorTest",
-            "--gtest_filter=*DisaggAsymmetricExecutorTest*"
-        ],
-        leader_commands=[f"--gtest_output=xml:{xml_output_file}"])
-    _cpp.run_command(trt_model_test, cwd=tests_dir, env=mgpu_env, timeout=1500)
+    tests_dir = build_dir / "tests"
 
-    xml_output_file = build_dir / "results-multi-gpu-disagg-asymmetric-executor-8-process.xml"
-    trt_model_test = _cpp.produce_mpirun_command(
-        global_commands=["mpirun", "--allow-run-as-root"],
-        nranks=8,
-        local_commands=[
-            "executor/disaggExecutorTest",
-            "--gtest_filter=*DisaggAsymmetricExecutorTest*"
-        ],
-        leader_commands=[f"--gtest_output=xml:{xml_output_file}"])
-    _cpp.run_command(trt_model_test, cwd=tests_dir, env=mgpu_env, timeout=1500)
+    prefix = get_model_test_filter_prefix(model)
+
+    mgpu_env = get_multi_gpu_env(kv_cache_type=kvcache_type,
+                                 llama_multi_gpu=True)
 
     xml_output_file = build_dir / "results-multi-gpu-disagg-asymmetric-orchestrator-executor-7-process.xml"
     trt_model_test = _cpp.produce_mpirun_command(
@@ -267,101 +323,35 @@ def run_disagg_multi_gpu_tests(build_dir: _pl.Path):
         nranks=7,
         local_commands=[
             "executor/disaggExecutorTest",
-            "--gtest_filter=*DisaggOrchestratorParamsTest*"
+            f"--gtest_filter=*{prefix}*DisaggOrchestratorParamsTest*"
         ],
         leader_commands=[f"--gtest_output=xml:{xml_output_file}"])
-    _cpp.run_command(trt_model_test, cwd=tests_dir, env=mgpu_env, timeout=1500)
+    _cpp.run_command(trt_model_test,
+                     cwd=tests_dir,
+                     env=mgpu_env,
+                     timeout=timeout)
 
-    # UCX transceiver tests, the test may not be built if ENABLE_UCX is 0
-    new_env = copy.copy(cpp_env)
-    new_env["TRTLLM_USE_UCX_KVCACHE"] = "1"
-    xml_output_file = build_dir / "results-multi-gpu-disagg-executor-2-process.xml"
-    trt_model_test = _cpp.produce_mpirun_command(
-        global_commands=["mpirun", "--allow-run-as-root"],
-        nranks=2,
-        local_commands=[
-            "executor/disaggExecutorTest",
-            "--gtest_filter=*DisaggSymmetricExecutorTest*"
-        ],
-        leader_commands=[f"--gtest_output=xml:{xml_output_file}"])
-    _cpp.run_command(trt_model_test, cwd=tests_dir, env=new_env, timeout=1500)
-
-    mgpu_env = copy.copy(cpp_env)
-    mgpu_env["RUN_LLAMA_MULTI_GPU"] = "true"
-    mgpu_env["TRTLLM_USE_UCX_KVCACHE"] = "1"
-    xml_output_file = build_dir / "results-multi-gpu-disagg-executor-4-process.xml"
-    trt_model_test = _cpp.produce_mpirun_command(
-        global_commands=["mpirun", "--allow-run-as-root"],
-        nranks=4,
-        local_commands=[
-            "executor/disaggExecutorTest",
-            "--gtest_filter=*DisaggSymmetricExecutorTest*"
-        ],
-        leader_commands=[f"--gtest_output=xml:{xml_output_file}"])
-    # https://nvbugspro.nvidia.com/bug/5026255 disable below tests for now.
-    _cpp.run_command(trt_model_test, cwd=tests_dir, env=mgpu_env, timeout=1500)
+
+def run_disagg_spawn_orchestrator_tests(build_dir: _pl.Path,
+                                        model: str,
+                                        kvcache_type=False,
+                                        timeout=1500):
 
-    xml_output_file = build_dir / "results-multi-gpu-disagg-executor-8-process.xml"
-    trt_model_test = _cpp.produce_mpirun_command(
-        global_commands=["mpirun", "--allow-run-as-root"],
-        nranks=8,
-        local_commands=[
-            "executor/disaggExecutorTest",
-            "--gtest_filter=*LlamaTP2PP2DisaggSymmetricExecutorTest*"
-        ],
-        leader_commands=[f"--gtest_output=xml:{xml_output_file}"])
-    _cpp.run_command(trt_model_test, cwd=tests_dir, env=mgpu_env, timeout=1500)
+    tests_dir = build_dir / "tests"
 
-    xml_output_file = build_dir / "results-multi-gpu-disagg-asymmetric-executor-4-process.xml"
-    trt_model_test = _cpp.produce_mpirun_command(
-        global_commands=["mpirun", "--allow-run-as-root"],
-        nranks=4,
-        local_commands=[
-            "executor/disaggExecutorTest",
-            "--gtest_filter=*DisaggAsymmetricExecutorTest*"
-        ],
-        leader_commands=[f"--gtest_output=xml:{xml_output_file}"])
-    _cpp.run_command(trt_model_test, cwd=tests_dir, env=mgpu_env, timeout=1500)
+    prefix = get_model_test_filter_prefix(model)
 
-    xml_output_file = build_dir / "results-multi-gpu-disagg-asymmetric-executor-6-process.xml"
-    trt_model_test = _cpp.produce_mpirun_command(
-        global_commands=["mpirun", "--allow-run-as-root"],
-        nranks=6,
-        local_commands=[
-            "executor/disaggExecutorTest",
-            "--gtest_filter=*DisaggAsymmetricExecutorTest*"
-        ],
-        leader_commands=[f"--gtest_output=xml:{xml_output_file}"])
-    _cpp.run_command(trt_model_test, cwd=tests_dir, env=mgpu_env, timeout=1500)
+    mgpu_env = get_multi_gpu_env(kv_cache_type=kvcache_type,
+                                 llama_multi_gpu=True)
 
-    xml_output_file = build_dir / "results-multi-gpu-disagg-asymmetric-executor-8-process.xml"
-    trt_model_test = _cpp.produce_mpirun_command(
-        global_commands=["mpirun", "--allow-run-as-root"],
-        nranks=8,
-        local_commands=[
-            "executor/disaggExecutorTest",
-            "--gtest_filter=*DisaggAsymmetricExecutorTest*"
-        ],
-        leader_commands=[f"--gtest_output=xml:{xml_output_file}"])
-    _cpp.run_command(trt_model_test, cwd=tests_dir, env=mgpu_env, timeout=1500)
-
-    xml_output_file = build_dir / "results-multi-gpu-disagg-asymmetric-orchestrator-executor-7-process.xml"
-    trt_model_test = _cpp.produce_mpirun_command(
-        global_commands=["mpirun", "--allow-run-as-root"],
-        nranks=7,
-        local_commands=[
-            "executor/disaggExecutorTest",
-            "--gtest_filter=*DisaggOrchestratorParamsTest*"
-        ],
-        leader_commands=[f"--gtest_output=xml:{xml_output_file}"])
-    _cpp.run_command(trt_model_test, cwd=tests_dir, env=mgpu_env, timeout=1500)
+    xml_output_file = build_dir / "results-multi-gpu-disagg-spawn-asymmetric-orchestrator-executor-1-process.xml"
 
     comms = [
         "executor/disaggExecutorTest",
-        "--gtest_filter=*DisaaggSpawnOrchestrator*"
+        f"--gtest_filter=*{prefix}*DisaaggSpawnOrchestrator*",
+        f"--gtest_output=xml:{xml_output_file}"
     ]
-    _cpp.run_command(comms, cwd=tests_dir, env=mgpu_env, timeout=1500)
+    _cpp.run_command(comms, cwd=tests_dir, env=mgpu_env, timeout=timeout)
 
 
 def prepare_multi_gpu_model_tests(test_list: List[str],
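produce_mpirun_command comes from defs.cpp.cpp_common; judging only from the call sites above, it presumably assembles an MPMD-style mpirun invocation in which rank 0 additionally receives the leader-only arguments such as the --gtest_output XML path. A hypothetical sketch of such a helper (an assumption about its shape, not the actual implementation):

from typing import List


def produce_mpirun_command(global_commands: List[str], nranks: int,
                           local_commands: List[str],
                           leader_commands: List[str]) -> List[str]:
    # Hypothetical: rank 0 (the leader) gets the extra arguments; the
    # remaining nranks - 1 ranks run the bare command, joined with
    # mpirun's ":" MPMD separator.
    command = global_commands + ["-n", "1"] + local_commands + leader_commands
    if nranks > 1:
        command += [":", "-n", str(nranks - 1)] + local_commands
    return command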
@@ -413,6 +403,62 @@ def prepare_model_multi_gpu(python_exe, root_dir, cpp_resources_dir,
     return _prepare
 
 
+@pytest.fixture(scope="session")
+def gpt_single_gpu_model(prepare_model):
+    prepare_model("gpt")
+    return "gpt"
+
+
+@pytest.fixture(scope="session")
+def llama_single_gpu_model(prepare_model):
+    prepare_model("llama")
+    return "llama"
+
+
+@pytest.fixture(scope="session")
+def llama_multi_gpu_model(prepare_model_multi_gpu):
+    prepare_model_multi_gpu("llama")
+    return "llama"
+
+
+# Allow us to dynamically choose a fixture at runtime
+# Combined with session scope fixtures above to ensure
+# that the model is built only once per pytest session
+@pytest.fixture
+def prepare_models_disagg(request):
+
+    def _prepare(model_name: str):
+        if model_name == "llama":
+            fixture_names = [
+                "llama_single_gpu_model",
+                "llama_multi_gpu_model",
+            ]
+        elif model_name == "gpt":
+            fixture_names = [
+                "gpt_single_gpu_model",
+            ]
+        else:
+            raise ValueError(f"Disagg tests don't support model: {model_name}")
+
+        print(f"Preparing models for disagg tests: {fixture_names}")
+        # Run the fixtures
+        for fixture_name in fixture_names:
+            request.getfixturevalue(fixture_name)
+
+    return _prepare
+
+
+# Use indirect parameterization to ensure that the model is built
+# only once per pytest session
+@pytest.fixture(scope="session")
+def multi_gpu_model(request, prepare_model_multi_gpu):
+
+    model_name = request.param
+    prepare_model_multi_gpu(model_name)
+
+    return model_name
+
+
 @pytest.mark.parametrize("build_google_tests", ["80", "86", "89", "90"],
                          indirect=True)
 def test_simple(build_google_tests, build_dir):
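The prepare_models_disagg fixture above leans on request.getfixturevalue to resolve fixtures by name at runtime, which keeps the session-scoped, build-once semantics while letting a plain string pick the model. A small standalone demonstration of that pytest mechanism:

import pytest


@pytest.fixture(scope="session")
def heavy_resource():
    return "built"  # imagine an expensive, build-once resource


@pytest.fixture
def chooser(request):
    def _get(name: str):
        # Resolves a fixture by name at runtime; session-scoped fixtures
        # are still created at most once and then cached.
        return request.getfixturevalue(name)
    return _get


def test_dynamic_fixture(chooser):
    assert chooser("heavy_resource") == "built"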
@@ -425,46 +471,135 @@ def test_simple(build_google_tests, build_dir):
 
 @pytest.mark.parametrize("build_google_tests", ["80", "86", "89", "90"],
                          indirect=True)
-def test_t5(build_google_tests, prepare_model_multi_gpu, build_dir):
+@pytest.mark.parametrize("multi_gpu_model", ["t5"], indirect=True)
+def test_enc_dec(build_google_tests, multi_gpu_model, build_dir):
 
     if platform.system() != "Windows":
-        prepare_model_multi_gpu("t5")
-        run_t5_multi_gpu_tests(build_dir=build_dir,
-                               timeout=_cpp.default_test_timeout)
+        run_enc_dec_multi_gpu_tests(build_dir=build_dir,
+                                    timeout=_cpp.default_test_timeout)
 
 
 @pytest.mark.parametrize("build_google_tests", ["80", "86", "89", "90"],
                          indirect=True)
-def test_llama_executor(build_google_tests, prepare_model_multi_gpu, lora_setup,
+@pytest.mark.parametrize("mode", [
+    "orchestrator",
+    pytest.param(
+        "leader",
+        marks=pytest.mark.skip("https://nvbugspro.nvidia.com/bug/5026255"))
+])
+@pytest.mark.parametrize("multi_gpu_model", ["llama"], indirect=True)
+def test_llama_executor(build_google_tests, multi_gpu_model, mode, lora_setup,
                         build_dir):
 
-    if platform.system() != "Windows":
-        prepare_model_multi_gpu("llama")
-        run_llama_executor_multi_gpu_tests(build_dir=build_dir,
-                                           timeout=_cpp.default_test_timeout)
+    if platform.system() == "Windows":
+        return
+
+    if mode == "orchestrator":
+        run_llama_executor_orchestrator_tests(build_dir=build_dir,
+                                              timeout=_cpp.default_test_timeout)
+    elif mode == "leader":
+        run_llama_executor_leader_tests(build_dir=build_dir,
+                                        timeout=_cpp.default_test_timeout)
+    else:
+        raise ValueError(f"Unsupported mode: {mode}")
 
 
 @pytest.mark.parametrize("build_google_tests", ["80", "86", "89", "90"],
                          indirect=True)
-def test_trt_gpt_real_decoder(build_google_tests, prepare_model_multi_gpu,
-                              lora_setup, build_dir):
+@pytest.mark.parametrize("multi_gpu_model", ["llama"], indirect=True)
+def test_llama_executor_logits_proc(build_google_tests, multi_gpu_model,
+                                    lora_setup, build_dir):
 
     if platform.system() != "Windows":
+        run_llama_executor_logits_proc_tests(build_dir=build_dir,
+                                             timeout=_cpp.default_test_timeout)
+
+
+@pytest.mark.parametrize("build_google_tests", ["80", "86", "89", "90"],
+                         indirect=True)
+@pytest.mark.parametrize("multi_gpu_model", ["llama"], indirect=True)
+def test_llama_executor_guided_decoding(build_google_tests, multi_gpu_model,
+                                        lora_setup, build_dir):
+
+    if platform.system() != "Windows":
+        run_llama_executor_guided_decoding_tests(
+            build_dir=build_dir, timeout=_cpp.default_test_timeout)
+
+
+@pytest.mark.parametrize("build_google_tests", ["80", "86", "89", "90"],
+                         indirect=True)
+@pytest.mark.parametrize("multi_gpu_model", ["llama"], indirect=True)
+def test_trt_gpt_real_decoder(build_google_tests, multi_gpu_model, lora_setup,
+                              build_dir):
 
     if platform.system() != "Windows":
-        prepare_model_multi_gpu("llama")
         run_trt_gpt_model_real_decoder_multi_gpu_tests(
             build_dir=build_dir, timeout=_cpp.default_test_timeout)
 
 
 @pytest.mark.parametrize("build_google_tests", ["80", "86", "89", "90"],
                          indirect=True)
-def test_disagg(prepare_model, prepare_model_multi_gpu, build_google_tests,
-                build_dir):
+class TestDisagg:
 
-    if platform.system() != "Windows":
-        # Disagg tests need single + multi GPU llama models.
-        prepare_model("llama")
-        prepare_model_multi_gpu("llama")
+    @pytest.mark.parametrize("kvcache_type", [KVCacheType.MPI, KVCacheType.UCX],
+                             ids=["mpi_kvcache", "ucx_kvcache"])
+    @pytest.mark.parametrize("nprocs", [2, 4, 8],
+                             ids=["2proc", "4proc", "8proc"])
+    @pytest.mark.parametrize("model", ["gpt", "llama"])
+    def test_symmetric_executor(self, build_google_tests, model, nprocs,
+                                kvcache_type, prepare_models_disagg, build_dir):
 
-        prepare_model("gpt")
+        if model == "gpt" and nprocs > 2:
+            pytest.skip(
+                "test_symmetric_executor only supports 2 processes for gpt")
 
-        run_disagg_multi_gpu_tests(build_dir=build_dir)
+        if platform.system() != "Windows":
+            prepare_models_disagg(model)
+
+            run_disagg_symmetric_executor_tests(build_dir=build_dir,
+                                                model=model,
+                                                nprocs=nprocs,
+                                                kvcache_type=kvcache_type)
+
+    @pytest.mark.parametrize("kvcache_type", [KVCacheType.MPI, KVCacheType.UCX],
+                             ids=["mpi_kvcache", "ucx_kvcache"])
+    @pytest.mark.parametrize("nprocs", [4, 6, 8],
+                             ids=["4proc", "6proc", "8proc"])
+    @pytest.mark.parametrize("model", ["llama"])
+    def test_asymmetric_executor(self, build_google_tests, model, nprocs,
+                                 kvcache_type, prepare_models_disagg,
+                                 build_dir):
+
+        if platform.system() != "Windows":
+            prepare_models_disagg(model_name=model)
+
+            run_disagg_asymmetric_executor_tests(build_dir=build_dir,
+                                                 model=model,
+                                                 nprocs=nprocs,
+                                                 kvcache_type=kvcache_type)
+
+    @pytest.mark.parametrize("kvcache_type", [KVCacheType.MPI, KVCacheType.UCX],
+                             ids=["mpi_kvcache", "ucx_kvcache"])
+    @pytest.mark.parametrize("model", ["llama"])
+    def test_orchestrator_params(self, build_google_tests, model, kvcache_type,
+                                 prepare_models_disagg, build_dir):
+
+        if platform.system() != "Windows":
+            prepare_models_disagg(model)
+
+            run_disagg_orchestrator_params_tests(build_dir=build_dir,
+                                                 model=model,
+                                                 kvcache_type=kvcache_type)
+
+    @pytest.mark.parametrize("kvcache_type", [KVCacheType.UCX],
+                             ids=["ucx_kvcache"])
+    @pytest.mark.parametrize("model", ["llama"])
+    def test_spawn_orchestrator(self, build_google_tests, model, kvcache_type,
+                                prepare_models_disagg, build_dir):
+
+        if platform.system() != "Windows":
+            prepare_models_disagg(model)
+
+            run_disagg_spawn_orchestrator_tests(build_dir=build_dir,
+                                                model=model,
+                                                kvcache_type=kvcache_type)
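The bracketed suffixes in the updated test list below are pytest's composed parametrize IDs: the decorator closest to the function contributes the leftmost segment, and the class-level build_google_tests value ("90") comes last. A compact sketch reproducing the ID shape (the real build_google_tests is an indirect fixture; "arch" here is a simplified stand-in):

import pytest


@pytest.mark.parametrize("arch", ["90"])
@pytest.mark.parametrize("kvcache", ["mpi_kvcache", "ucx_kvcache"])
@pytest.mark.parametrize("nprocs", [2], ids=["2proc"])
@pytest.mark.parametrize("model", ["gpt"])
def test_id_demo(model, nprocs, kvcache, arch):
    pass  # collected as test_id_demo[gpt-2proc-mpi_kvcache-90], etc.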
@@ -115,10 +115,29 @@ l0_dgx_h100:
   tests:
   # ------------- CPP tests ---------------
   - cpp/test_multi_gpu.py::test_simple[90]
-  - cpp/test_multi_gpu.py::test_t5[90]
-  - cpp/test_multi_gpu.py::test_llama_executor[90]
-  - cpp/test_multi_gpu.py::test_trt_gpt_real_decoder[90]
-  - cpp/test_multi_gpu.py::test_disagg[90]
+  - cpp/test_multi_gpu.py::test_enc_dec[t5-90]
+  - cpp/test_multi_gpu.py::test_llama_executor[llama-orchestrator-90]
+  - cpp/test_multi_gpu.py::test_llama_executor[llama-leader-90]
+  - cpp/test_multi_gpu.py::test_llama_executor_guided_decoding[llama-90]
+  - cpp/test_multi_gpu.py::test_llama_executor_logits_proc[llama-90]
+  - cpp/test_multi_gpu.py::test_trt_gpt_real_decoder[llama-90]
+  - cpp/test_multi_gpu.py::TestDisagg::test_symmetric_executor[gpt-2proc-mpi_kvcache-90]
+  - cpp/test_multi_gpu.py::TestDisagg::test_symmetric_executor[gpt-2proc-ucx_kvcache-90]
+  - cpp/test_multi_gpu.py::TestDisagg::test_symmetric_executor[llama-2proc-mpi_kvcache-90]
+  - cpp/test_multi_gpu.py::TestDisagg::test_symmetric_executor[llama-4proc-mpi_kvcache-90]
+  - cpp/test_multi_gpu.py::TestDisagg::test_symmetric_executor[llama-8proc-mpi_kvcache-90]
+  - cpp/test_multi_gpu.py::TestDisagg::test_symmetric_executor[llama-2proc-ucx_kvcache-90]
+  - cpp/test_multi_gpu.py::TestDisagg::test_symmetric_executor[llama-4proc-ucx_kvcache-90]
+  - cpp/test_multi_gpu.py::TestDisagg::test_symmetric_executor[llama-8proc-ucx_kvcache-90]
+  - cpp/test_multi_gpu.py::TestDisagg::test_asymmetric_executor[llama-4proc-mpi_kvcache-90]
+  - cpp/test_multi_gpu.py::TestDisagg::test_asymmetric_executor[llama-6proc-mpi_kvcache-90]
+  - cpp/test_multi_gpu.py::TestDisagg::test_asymmetric_executor[llama-8proc-mpi_kvcache-90]
+  - cpp/test_multi_gpu.py::TestDisagg::test_asymmetric_executor[llama-4proc-ucx_kvcache-90]
+  - cpp/test_multi_gpu.py::TestDisagg::test_asymmetric_executor[llama-6proc-ucx_kvcache-90]
+  - cpp/test_multi_gpu.py::TestDisagg::test_asymmetric_executor[llama-8proc-ucx_kvcache-90]
+  - cpp/test_multi_gpu.py::TestDisagg::test_orchestrator_params[llama-mpi_kvcache-90]
+  - cpp/test_multi_gpu.py::TestDisagg::test_orchestrator_params[llama-ucx_kvcache-90]
+  - cpp/test_multi_gpu.py::TestDisagg::test_spawn_orchestrator[llama-ucx_kvcache-90]
 - condition:
     ranges:
       system_gpu_count: