Mirror of https://github.com/NVIDIA/TensorRT-LLM.git, synced 2026-01-13 22:18:36 +08:00
Test: Improve model re-use in C++ DGX tests for CI stability (#4263)
* Fix padded vocab size for Llama
* Refactor multi GPU llama executor tests, and reuse the built model engines
* Fix test list typo
* WIP
* Further WIP
* WIP
* Update test lists and readme
* Try parametrize for asymmetric
* Parametrize + skip unsupported combinations
* Update test list
* Reduce environment duplicated code

Signed-off-by: Dom Brown <3886319+DomBrown@users.noreply.github.com>
Signed-off-by: domb <3886319+DomBrown@users.noreply.github.com>
This commit is contained in:
parent 98018f3bb9
commit c45f414bbf
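The model reuse in this change rests on two stock pytest mechanisms: session-scoped fixtures build an engine at most once, and indirect parametrization routes the model name through such a fixture so several tests share one build. A minimal, self-contained sketch of the pattern (the engine-build function is a hypothetical stand-in, not code from this PR):

import pytest

_BUILT = set()  # tracks which engines were built in this session


def _build_engine(model_name: str) -> None:
    # Stand-in for the expensive TRT engine build; must run once per model.
    assert model_name not in _BUILT, "engine should only be built once"
    _BUILT.add(model_name)


@pytest.fixture(scope="session")
def multi_gpu_model(request):
    # Indirect parametrization: request.param carries the model name, and
    # session scope caches the fixture per parameter value.
    _build_engine(request.param)
    return request.param


@pytest.mark.parametrize("multi_gpu_model", ["llama"], indirect=True)
def test_first(multi_gpu_model):
    assert multi_gpu_model == "llama"  # engine built here, once


@pytest.mark.parametrize("multi_gpu_model", ["llama"], indirect=True)
def test_second(multi_gpu_model):
    assert multi_gpu_model == "llama"  # reuses the cached fixture, no rebuild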
@@ -31,7 +31,7 @@ pytest tests/integration/defs/cpp/test_e2e.py::test_model[llama-90]
 
 pytest tests/integration/defs/cpp/test_e2e.py::test_benchmarks[gpt-90]
 
-pytest tests/integration/defs/cpp/test_multi_gpu.py::test_disagg[90]
+pytest tests/integration/defs/cpp/test_multi_gpu.py::TestDisagg::test_symmetric_executor[gpt-mpi_kvcache-90]
 ```
 
 ## Manual steps
@@ -29,6 +29,7 @@ namespace
 {
 
 auto constexpr LLAMA_INPUT_FILE = "input_tokens_llama.npy";
+auto constexpr LLAMA_VOCAB_SIZE_PADDED = 128256;
 auto constexpr LLAMA_END_ID = 128001;
 auto constexpr LLAMA_PAD_ID = 128001;
 
@@ -520,6 +521,7 @@ TEST_P(DisaggParamsTest, DisaggTokenComparison)
     // set defaults and adjust if needed by different models
     fs::path inputPath = DATA_PATH / "input_tokens.npy";
     ModelIds modelIds{50256, 50256};
+    SizeType32 vocabSizePadded{50257}; // gpt vocabSizePadded
     bool isSpeculativeDecoding{false};
 
     // NOTE: This can be used to disable checks for certain prompt batch entries
@@ -557,6 +559,7 @@ TEST_P(DisaggParamsTest, DisaggTokenComparison)
         || modelName == "llama_tp1_pp2_cp1" || modelName == "llama_tp2_pp1_cp1" || modelName == "llama_tp1_pp1_cp1")
     {
         inputPath = DATA_PATH / LLAMA_INPUT_FILE;
+        vocabSizePadded = LLAMA_VOCAB_SIZE_PADDED;
 
         auto const resultsPath
             = LLAMA_DATA_PATH / ((beamWidth == 1) ? "sampling" : "beam_search_" + std::to_string(beamWidth));
@@ -619,8 +622,6 @@ TEST_P(DisaggParamsTest, DisaggTokenComparison)
         }
     }
 
-    SizeType32 constexpr vocabSizePadded{50257}; // gpt vocabSizePadded
-
     // Returning logits will bring higher latency
     if (streaming && (outConfig.returnContextLogits || outConfig.returnGenerationLogits))
     {
@@ -756,6 +757,7 @@ TEST_P(DisaggOrchestratorParamsTest, DisaggTokenComparison)
     // set defaults and adjust if needed by different models
     fs::path inputPath = DATA_PATH / "input_tokens.npy";
     ModelIds modelIds{50256, 50256};
+    SizeType32 vocabSizePadded{50257}; // gpt vocabSizePadded
    bool isSpeculativeDecoding{false};
 
     // NOTE: This can be used to disable checks for certain prompt batch entries
@@ -764,6 +766,7 @@ TEST_P(DisaggOrchestratorParamsTest, DisaggTokenComparison)
         || modelName == "llama_tp1_pp2" || modelName == "llama_tp2_pp1" || modelName == "llama_tp1_pp1")
     {
         inputPath = DATA_PATH / LLAMA_INPUT_FILE;
+        vocabSizePadded = LLAMA_VOCAB_SIZE_PADDED;
 
         auto const resultsPath
             = LLAMA_DATA_PATH / ((beamWidth == 1) ? "sampling" : "beam_search_" + std::to_string(beamWidth));
@@ -827,8 +830,6 @@ TEST_P(DisaggOrchestratorParamsTest, DisaggTokenComparison)
         }
     }
 
-    SizeType32 constexpr vocabSizePadded{50257}; // gpt vocabSizePadded
-
     // Returning logits will bring higher latency
     if (streaming && (outConfig.returnContextLogits || outConfig.returnGenerationLogits))
     {
@@ -920,6 +921,7 @@ TEST_P(ConditionalDisaggParamsTest, DisaggTokenComparison)
     // set defaults and adjust if needed by different models
     fs::path inputPath = DATA_PATH / "input_tokens.npy";
     ModelIds modelIds{50256, 50256};
+    SizeType32 vocabSizePadded{50257}; // gpt vocabSizePadded
     bool isSpeculativeDecoding{false};
 
     // NOTE: This can be used to disable checks for certain prompt batch entries
@@ -935,6 +937,7 @@ TEST_P(ConditionalDisaggParamsTest, DisaggTokenComparison)
     else if (modelName == "llama_tp1_pp1_cp1")
     {
         inputPath = DATA_PATH / LLAMA_INPUT_FILE;
+        vocabSizePadded = LLAMA_VOCAB_SIZE_PADDED;
 
         auto const resultsPath
             = LLAMA_DATA_PATH / ((beamWidth == 1) ? "sampling" : "beam_search_" + std::to_string(beamWidth));
@@ -948,8 +951,6 @@ TEST_P(ConditionalDisaggParamsTest, DisaggTokenComparison)
         TLLM_THROW("Unrecognized modelName");
     }
 
-    SizeType32 constexpr vocabSizePadded{50257}; // gpt vocabSizePadded
-
     auto executorConfig = ExecutorConfig(maxBeamWidth);
     FloatType freeGpuMemoryFraction = 0.9f;
     KvCacheConfig kvCacheConfig{true, std::nullopt, std::nullopt, std::nullopt, freeGpuMemoryFraction};
@@ -1115,8 +1116,11 @@ INSTANTIATE_TEST_SUITE_P(GptSingleDeviceDisaggSymmetricExecutorMixedTest, Disagg
         testing::Values(1)),
     generateTestNameDisaggParams);
 
-INSTANTIATE_TEST_SUITE_P(ConditionalDisaggSymmetricExecutorTest, ConditionalDisaggParamsTest,
-    testing::Combine(testing::Values("gpt", "llama_tp1_pp1_cp1")), generateTestNameCondDisaggParams);
+INSTANTIATE_TEST_SUITE_P(GptConditionalDisaggSymmetricExecutorTest, ConditionalDisaggParamsTest,
+    testing::Combine(testing::Values("gpt")), generateTestNameCondDisaggParams);
+
+INSTANTIATE_TEST_SUITE_P(LlamaConditionalDisaggSymmetricExecutorTest, ConditionalDisaggParamsTest,
+    testing::Combine(testing::Values("llama_tp1_pp1_cp1")), generateTestNameCondDisaggParams);
 
 INSTANTIATE_TEST_SUITE_P(LlamaTP2DisaggSymmetricExecutorTest, DisaggParamsTest,
     testing::Combine(testing::Values(4),
@@ -1,14 +1,49 @@
 import copy
 import os as _os
 import pathlib as _pl
 import platform
 import time
+from enum import Enum, auto
 from typing import List, Optional
 
 import defs.cpp.cpp_common as _cpp
 import pytest
 
 
+# Helper filter for disagg google tests
+def get_model_test_filter_prefix(model: str) -> str:
+    if model == "llama":
+        return "Llama"
+    elif model == "gpt":
+        return "Gpt"
+    else:
+        raise ValueError(f"Unsupported model: {model}")
+
+
+class KVCacheType(Enum):
+    NONE = auto()
+    MPI = auto()
+    UCX = auto()
+
+
+def get_multi_gpu_env(kv_cache_type=KVCacheType.NONE, llama_multi_gpu=False):
+    env = {**_os.environ}
+
+    match kv_cache_type:
+        case KVCacheType.MPI:
+            env["TRTLLM_USE_MPI_KVCACHE"] = "1"
+        case KVCacheType.UCX:
+            env["TRTLLM_USE_UCX_KVCACHE"] = "1"
+        case KVCacheType.NONE:
+            pass
+        case _:
+            raise ValueError(f"Unsupported KVCacheType: {kv_cache_type}")
+
+    if llama_multi_gpu:
+        env["RUN_LLAMA_MULTI_GPU"] = "true"
+
+    return env
+
+
 def run_simple_multi_gpu_tests(build_dir: _pl.Path, timeout=1500):
     tests_dir = build_dir / "tests"
     cpp_env = {**_os.environ}
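A usage note on the helper just added: get_multi_gpu_env folds the KV-cache transport flag and the llama multi-GPU switch into a copy of the process environment, which then goes straight to the command runners. A short sketch, given the definitions above (the assertions are illustrative, not from the PR):

env = get_multi_gpu_env(kv_cache_type=KVCacheType.UCX, llama_multi_gpu=True)
assert env["TRTLLM_USE_UCX_KVCACHE"] == "1"
assert env["RUN_LLAMA_MULTI_GPU"] == "true"
# The dict is then handed to the subprocess-style runner, e.g.:
# _cpp.run_command(cmd, cwd=tests_dir, env=env, timeout=300)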
@@ -22,9 +57,9 @@ def run_simple_multi_gpu_tests(build_dir: _pl.Path, timeout=1500):
     ]
     _cpp.run_command(mpi_utils_test, cwd=tests_dir, env=cpp_env, timeout=300)
 
-    # Cache transceiver tests
-    new_env = copy.copy(cpp_env)
-    new_env["TRTLLM_USE_MPI_KVCACHE"] = "1"
+    # Cache transceiver MPI tests
+    new_env = get_multi_gpu_env(kv_cache_type=KVCacheType.MPI)
+
     cache_trans_test = [
         "mpirun",
         "-n",
@@ -34,9 +69,6 @@
     ]
     _cpp.run_command(cache_trans_test, cwd=tests_dir, env=new_env, timeout=300)
 
-    new_env = copy.copy(cpp_env)
-    new_env["TRTLLM_USE_MPI_KVCACHE"] = "1"
-    # Cache transceiver tests
     cache_trans_test_8_proc = [
         "mpirun",
         "-n",
@@ -50,8 +82,8 @@
                      timeout=600)
 
     # Cache transceiver tests with UCX
-    new_env = copy.copy(cpp_env)
-    new_env["TRTLLM_USE_UCX_KVCACHE"] = "1"
+    new_env = get_multi_gpu_env(kv_cache_type=KVCacheType.UCX)
+
     cache_trans_test = [
         "mpirun",
         "-n",
@@ -61,8 +93,6 @@
     ]
     _cpp.run_command(cache_trans_test, cwd=tests_dir, env=new_env, timeout=300)
 
-    new_env = copy.copy(cpp_env)
-    new_env["TRTLLM_USE_UCX_KVCACHE"] = "1"
     # Cache transceiver tests
     cache_trans_test_8_proc = [
         "mpirun",
@@ -91,12 +121,10 @@
                      timeout=600)
 
 
-def run_llama_executor_multi_gpu_tests(build_dir: _pl.Path, timeout=1500):
+def run_llama_executor_leader_tests(build_dir: _pl.Path, timeout=1500):
     tests_dir = build_dir / "tests"
-    cpp_env = {**_os.environ}
 
-    mgpu_env = copy.copy(cpp_env)
-    mgpu_env["RUN_LLAMA_MULTI_GPU"] = "true"
+    mgpu_env = get_multi_gpu_env(llama_multi_gpu=True)
 
     #Executor test in leader mode
     xml_output_file = build_dir / "results-multi-gpu-llama-exec-leader-mode.xml"
@@ -108,12 +136,14 @@ def run_llama_executor_multi_gpu_tests(build_dir: _pl.Path, timeout=1500):
             "--gtest_filter=*LlamaExecutorTest*LeaderMode*:*LlamaMultiExecutorTest*LeaderMode*"
         ],
         leader_commands=[f"--gtest_output=xml:{xml_output_file}"])
-    # https://nvbugspro.nvidia.com/bug/5026255 disable below tests for now.
-    if False:
-        _cpp.run_command(trt_model_test,
-                         cwd=tests_dir,
-                         env=mgpu_env,
-                         timeout=1500)
+
+    _cpp.run_command(trt_model_test, cwd=tests_dir, env=mgpu_env, timeout=1500)
+
+
+def run_llama_executor_orchestrator_tests(build_dir: _pl.Path, timeout=1500):
+    tests_dir = build_dir / "tests"
+
+    mgpu_env = get_multi_gpu_env(llama_multi_gpu=True)
 
     #Executor test in orchestrator mode
     xml_output_file = build_dir / "results-multi-gpu-llama-exec-orch-mode.xml"
@@ -124,18 +154,23 @@
     ]
     _cpp.run_command(trt_model_test, cwd=tests_dir, env=mgpu_env, timeout=1500)
 
-    #Logits processor and guided decoding test in leader mode
+
+def run_llama_executor_logits_proc_tests(build_dir: _pl.Path, timeout=1500):
+    tests_dir = build_dir / "tests"
+
+    mgpu_env = get_multi_gpu_env(llama_multi_gpu=True)
+
+    #Logits processor test in leader mode
     xml_output_file = build_dir / "results-multi-gpu-logits-proc.xml"
 
     tp_pp_sizes = [(4, 1), (2, 2), (1, 4)]
     gtest_filter = [
         f"LlamaExecutorTest/LogitsProcParamsTest*tp{tp}_pp{pp}*"
         for tp, pp in tp_pp_sizes
     ]
-    gtest_filter.extend([
-        f"LlamaExecutorGuidedDecodingTest/GuidedDecodingParamsTest*tp{tp}_pp{pp}*"
-        for tp, pp in tp_pp_sizes
-    ])
 
     gtest_filter = ":".join(gtest_filter)
 
     trt_model_test = _cpp.produce_mpirun_command(
         global_commands=["mpirun", "--allow-run-as-root"],
         nranks=4,
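For reference, the comprehension-plus-join above yields a single colon-separated gtest filter; a standalone sketch of the same construction:

tp_pp_sizes = [(4, 1), (2, 2), (1, 4)]
gtest_filter = ":".join(
    f"LlamaExecutorTest/LogitsProcParamsTest*tp{tp}_pp{pp}*"
    for tp, pp in tp_pp_sizes)
# -> "LlamaExecutorTest/LogitsProcParamsTest*tp4_pp1*:
#     LlamaExecutorTest/LogitsProcParamsTest*tp2_pp2*:
#     LlamaExecutorTest/LogitsProcParamsTest*tp1_pp4*"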
@@ -143,10 +178,38 @@ def run_llama_executor_multi_gpu_tests(build_dir: _pl.Path,
             "executor/executorTest", f"--gtest_filter={gtest_filter}"
         ],
         leader_commands=[f"--gtest_output=xml:{xml_output_file}"])
 
     _cpp.run_command(trt_model_test, cwd=tests_dir, env=mgpu_env, timeout=1500)
 
 
-def run_t5_multi_gpu_tests(build_dir: _pl.Path, timeout=1500):
+def run_llama_executor_guided_decoding_tests(build_dir: _pl.Path, timeout=1500):
+    tests_dir = build_dir / "tests"
+
+    mgpu_env = get_multi_gpu_env(llama_multi_gpu=True)
+
+    #Guided decoding test in leader mode
+    xml_output_file = build_dir / "results-multi-gpu-guided-decoding.xml"
+
+    tp_pp_sizes = [(4, 1), (2, 2), (1, 4)]
+    gtest_filter = [
+        f"LlamaExecutorGuidedDecodingTest/GuidedDecodingParamsTest*tp{tp}_pp{pp}*"
+        for tp, pp in tp_pp_sizes
+    ]
+
+    gtest_filter = ":".join(gtest_filter)
+
+    trt_model_test = _cpp.produce_mpirun_command(
+        global_commands=["mpirun", "--allow-run-as-root"],
+        nranks=4,
+        local_commands=[
+            "executor/executorTest", f"--gtest_filter={gtest_filter}"
+        ],
+        leader_commands=[f"--gtest_output=xml:{xml_output_file}"])
+
+    _cpp.run_command(trt_model_test, cwd=tests_dir, env=mgpu_env, timeout=1500)
+
+
+def run_enc_dec_multi_gpu_tests(build_dir: _pl.Path, timeout=1500):
     tests_dir = build_dir / "tests"
     cpp_env = {**_os.environ}
@@ -184,82 +247,75 @@ def run_trt_gpt_model_real_decoder_multi_gpu_tests(build_dir: _pl.Path,
                      timeout=timeout)  # expecting ~ 1200s
 
 
-def run_disagg_multi_gpu_tests(build_dir: _pl.Path):
+def run_disagg_symmetric_executor_tests(build_dir: _pl.Path,
+                                        model: str,
+                                        nprocs=2,
+                                        kvcache_type=KVCacheType.MPI,
+                                        timeout=1500):
     tests_dir = build_dir / "tests"
+
+    prefix = get_model_test_filter_prefix(model)
+
+    mgpu_env = get_multi_gpu_env(kv_cache_type=kvcache_type,
+                                 llama_multi_gpu=True)
+
+    xml_output_file = build_dir / f"results-multi-gpu-disagg-executor-{nprocs}-process.xml"
+    trt_model_test = _cpp.produce_mpirun_command(
+        global_commands=["mpirun", "--allow-run-as-root"],
+        nranks=nprocs,
+        local_commands=[
+            "executor/disaggExecutorTest",
+            f"--gtest_filter=*{prefix}*DisaggSymmetricExecutorTest*"
+        ],
+        leader_commands=[f"--gtest_output=xml:{xml_output_file}"])
+
+    _cpp.run_command(trt_model_test,
+                     cwd=tests_dir,
+                     env=mgpu_env,
+                     timeout=timeout)
+
+
+def run_disagg_asymmetric_executor_tests(build_dir: _pl.Path,
+                                         model: str,
+                                         nprocs=4,
+                                         kvcache_type=KVCacheType.MPI,
+                                         timeout=1500):
+
+    tests_dir = build_dir / "tests"
-    cpp_env = {**_os.environ}
-
-    new_env = copy.copy(cpp_env)
-    new_env["TRTLLM_USE_MPI_KVCACHE"] = "1"
-    xml_output_file = build_dir / "results-multi-gpu-disagg-executor-2-process.xml"
+    prefix = get_model_test_filter_prefix(model)
+
+    mgpu_env = get_multi_gpu_env(kv_cache_type=kvcache_type,
+                                 llama_multi_gpu=True)
+
+    xml_output_file = build_dir / f"results-multi-gpu-disagg-asymmetric-executor-{nprocs}-process.xml"
+
     trt_model_test = _cpp.produce_mpirun_command(
         global_commands=["mpirun", "--allow-run-as-root"],
-        nranks=2,
+        nranks=nprocs,
         local_commands=[
             "executor/disaggExecutorTest",
-            "--gtest_filter=*DisaggSymmetricExecutorTest*"
+            f"--gtest_filter=*{prefix}*DisaggAsymmetricExecutorTest*"
         ],
         leader_commands=[f"--gtest_output=xml:{xml_output_file}"])
-    _cpp.run_command(trt_model_test, cwd=tests_dir, env=new_env, timeout=1500)
-
-    mgpu_env = copy.copy(cpp_env)
-    mgpu_env["RUN_LLAMA_MULTI_GPU"] = "true"
-    mgpu_env["TRTLLM_USE_MPI_KVCACHE"] = "1"
-    xml_output_file = build_dir / "results-multi-gpu-disagg-executor-4-process.xml"
-    trt_model_test = _cpp.produce_mpirun_command(
-        global_commands=["mpirun", "--allow-run-as-root"],
-        nranks=4,
-        local_commands=[
-            "executor/disaggExecutorTest",
-            "--gtest_filter=*DisaggSymmetricExecutorTest*"
-        ],
-        leader_commands=[f"--gtest_output=xml:{xml_output_file}"])
-    # https://nvbugspro.nvidia.com/bug/5026255 disable below tests for now.
-    _cpp.run_command(trt_model_test, cwd=tests_dir, env=mgpu_env, timeout=1500)
+    _cpp.run_command(trt_model_test,
+                     cwd=tests_dir,
+                     env=mgpu_env,
+                     timeout=timeout)
 
-    xml_output_file = build_dir / "results-multi-gpu-disagg-executor-8-process.xml"
-    trt_model_test = _cpp.produce_mpirun_command(
-        global_commands=["mpirun", "--allow-run-as-root"],
-        nranks=8,
-        local_commands=[
-            "executor/disaggExecutorTest",
-            "--gtest_filter=*LlamaTP2PP2DisaggSymmetricExecutorTest*"
-        ],
-        leader_commands=[f"--gtest_output=xml:{xml_output_file}"])
-    _cpp.run_command(trt_model_test, cwd=tests_dir, env=mgpu_env, timeout=1500)
 
-    xml_output_file = build_dir / "results-multi-gpu-disagg-asymmetric-executor-4-process.xml"
-    trt_model_test = _cpp.produce_mpirun_command(
-        global_commands=["mpirun", "--allow-run-as-root"],
-        nranks=4,
-        local_commands=[
-            "executor/disaggExecutorTest",
-            "--gtest_filter=*DisaggAsymmetricExecutorTest*"
-        ],
-        leader_commands=[f"--gtest_output=xml:{xml_output_file}"])
-    _cpp.run_command(trt_model_test, cwd=tests_dir, env=mgpu_env, timeout=1500)
+def run_disagg_orchestrator_params_tests(build_dir: _pl.Path,
+                                         model: str,
+                                         kvcache_type=KVCacheType.MPI,
+                                         timeout=1500):
 
-    xml_output_file = build_dir / "results-multi-gpu-disagg-asymmetric-executor-6-process.xml"
-    trt_model_test = _cpp.produce_mpirun_command(
-        global_commands=["mpirun", "--allow-run-as-root"],
-        nranks=6,
-        local_commands=[
-            "executor/disaggExecutorTest",
-            "--gtest_filter=*DisaggAsymmetricExecutorTest*"
-        ],
-        leader_commands=[f"--gtest_output=xml:{xml_output_file}"])
-    _cpp.run_command(trt_model_test, cwd=tests_dir, env=mgpu_env, timeout=1500)
+    tests_dir = build_dir / "tests"
 
-    xml_output_file = build_dir / "results-multi-gpu-disagg-asymmetric-executor-8-process.xml"
-    trt_model_test = _cpp.produce_mpirun_command(
-        global_commands=["mpirun", "--allow-run-as-root"],
-        nranks=8,
-        local_commands=[
-            "executor/disaggExecutorTest",
-            "--gtest_filter=*DisaggAsymmetricExecutorTest*"
-        ],
-        leader_commands=[f"--gtest_output=xml:{xml_output_file}"])
-    _cpp.run_command(trt_model_test, cwd=tests_dir, env=mgpu_env, timeout=1500)
+    prefix = get_model_test_filter_prefix(model)
+
+    mgpu_env = get_multi_gpu_env(kv_cache_type=kvcache_type,
+                                 llama_multi_gpu=True)
 
     xml_output_file = build_dir / "results-multi-gpu-disagg-asymmetric-orchestrator-executor-7-process.xml"
     trt_model_test = _cpp.produce_mpirun_command(
@@ -267,101 +323,35 @@ def run_disagg_multi_gpu_tests(build_dir: _pl.Path):
         nranks=7,
         local_commands=[
             "executor/disaggExecutorTest",
-            "--gtest_filter=*DisaggOrchestratorParamsTest*"
+            f"--gtest_filter=*{prefix}*DisaggOrchestratorParamsTest*"
         ],
         leader_commands=[f"--gtest_output=xml:{xml_output_file}"])
-    _cpp.run_command(trt_model_test, cwd=tests_dir, env=mgpu_env, timeout=1500)
+    _cpp.run_command(trt_model_test,
+                     cwd=tests_dir,
+                     env=mgpu_env,
+                     timeout=timeout)
 
-    # UCX transceiver tests, the test may not be built if ENABLE_UCX is 0
-    new_env = copy.copy(cpp_env)
-    new_env["TRTLLM_USE_UCX_KVCACHE"] = "1"
-    xml_output_file = build_dir / "results-multi-gpu-disagg-executor-2-process.xml"
-    trt_model_test = _cpp.produce_mpirun_command(
-        global_commands=["mpirun", "--allow-run-as-root"],
-        nranks=2,
-        local_commands=[
-            "executor/disaggExecutorTest",
-            "--gtest_filter=*DisaggSymmetricExecutorTest*"
-        ],
-        leader_commands=[f"--gtest_output=xml:{xml_output_file}"])
-    _cpp.run_command(trt_model_test, cwd=tests_dir, env=new_env, timeout=1500)
-
-    mgpu_env = copy.copy(cpp_env)
-    mgpu_env["RUN_LLAMA_MULTI_GPU"] = "true"
-    mgpu_env["TRTLLM_USE_UCX_KVCACHE"] = "1"
-    xml_output_file = build_dir / "results-multi-gpu-disagg-executor-4-process.xml"
-    trt_model_test = _cpp.produce_mpirun_command(
-        global_commands=["mpirun", "--allow-run-as-root"],
-        nranks=4,
-        local_commands=[
-            "executor/disaggExecutorTest",
-            "--gtest_filter=*DisaggSymmetricExecutorTest*"
-        ],
-        leader_commands=[f"--gtest_output=xml:{xml_output_file}"])
-    # https://nvbugspro.nvidia.com/bug/5026255 disable below tests for now.
-    _cpp.run_command(trt_model_test, cwd=tests_dir, env=mgpu_env, timeout=1500)
+
+def run_disagg_spawn_orchestrator_tests(build_dir: _pl.Path,
+                                        model: str,
+                                        kvcache_type=False,
+                                        timeout=1500):
 
-    xml_output_file = build_dir / "results-multi-gpu-disagg-executor-8-process.xml"
-    trt_model_test = _cpp.produce_mpirun_command(
-        global_commands=["mpirun", "--allow-run-as-root"],
-        nranks=8,
-        local_commands=[
-            "executor/disaggExecutorTest",
-            "--gtest_filter=*LlamaTP2PP2DisaggSymmetricExecutorTest*"
-        ],
-        leader_commands=[f"--gtest_output=xml:{xml_output_file}"])
-    _cpp.run_command(trt_model_test, cwd=tests_dir, env=mgpu_env, timeout=1500)
+    tests_dir = build_dir / "tests"
 
-    xml_output_file = build_dir / "results-multi-gpu-disagg-asymmetric-executor-4-process.xml"
-    trt_model_test = _cpp.produce_mpirun_command(
-        global_commands=["mpirun", "--allow-run-as-root"],
-        nranks=4,
-        local_commands=[
-            "executor/disaggExecutorTest",
-            "--gtest_filter=*DisaggAsymmetricExecutorTest*"
-        ],
-        leader_commands=[f"--gtest_output=xml:{xml_output_file}"])
-    _cpp.run_command(trt_model_test, cwd=tests_dir, env=mgpu_env, timeout=1500)
+    prefix = get_model_test_filter_prefix(model)
 
-    xml_output_file = build_dir / "results-multi-gpu-disagg-asymmetric-executor-6-process.xml"
-    trt_model_test = _cpp.produce_mpirun_command(
-        global_commands=["mpirun", "--allow-run-as-root"],
-        nranks=6,
-        local_commands=[
-            "executor/disaggExecutorTest",
-            "--gtest_filter=*DisaggAsymmetricExecutorTest*"
-        ],
-        leader_commands=[f"--gtest_output=xml:{xml_output_file}"])
-    _cpp.run_command(trt_model_test, cwd=tests_dir, env=mgpu_env, timeout=1500)
+    mgpu_env = get_multi_gpu_env(kv_cache_type=kvcache_type,
+                                 llama_multi_gpu=True)
 
-    xml_output_file = build_dir / "results-multi-gpu-disagg-asymmetric-executor-8-process.xml"
-    trt_model_test = _cpp.produce_mpirun_command(
-        global_commands=["mpirun", "--allow-run-as-root"],
-        nranks=8,
-        local_commands=[
-            "executor/disaggExecutorTest",
-            "--gtest_filter=*DisaggAsymmetricExecutorTest*"
-        ],
-        leader_commands=[f"--gtest_output=xml:{xml_output_file}"])
-    _cpp.run_command(trt_model_test, cwd=tests_dir, env=mgpu_env, timeout=1500)
-
-    xml_output_file = build_dir / "results-multi-gpu-disagg-asymmetric-orchestrator-executor-7-process.xml"
-    trt_model_test = _cpp.produce_mpirun_command(
-        global_commands=["mpirun", "--allow-run-as-root"],
-        nranks=7,
-        local_commands=[
-            "executor/disaggExecutorTest",
-            "--gtest_filter=*DisaggOrchestratorParamsTest*"
-        ],
-        leader_commands=[f"--gtest_output=xml:{xml_output_file}"])
-    _cpp.run_command(trt_model_test, cwd=tests_dir, env=mgpu_env, timeout=1500)
+    xml_output_file = build_dir / "results-multi-gpu-disagg-spawn-asymmetric-orchestrator-executor-1-process.xml"
 
     comms = [
         "executor/disaggExecutorTest",
-        "--gtest_filter=*DisaaggSpawnOrchestrator*"
+        f"--gtest_filter=*{prefix}*DisaaggSpawnOrchestrator*",
+        f"--gtest_output=xml:{xml_output_file}"
     ]
-    _cpp.run_command(comms, cwd=tests_dir, env=mgpu_env, timeout=1500)
+    _cpp.run_command(comms, cwd=tests_dir, env=mgpu_env, timeout=timeout)
 
 
 def prepare_multi_gpu_model_tests(test_list: List[str],
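produce_mpirun_command comes from defs.cpp.cpp_common; judging only from the call sites above, it presumably assembles an MPMD-style mpirun invocation in which rank 0 additionally receives the leader-only arguments such as the --gtest_output XML path. A hypothetical sketch of such a helper (an assumption about its shape, not the actual implementation):

from typing import List


def produce_mpirun_command(global_commands: List[str], nranks: int,
                           local_commands: List[str],
                           leader_commands: List[str]) -> List[str]:
    # Hypothetical: rank 0 (the leader) gets the extra arguments; the
    # remaining nranks - 1 ranks run the bare command, joined with
    # mpirun's ":" MPMD separator.
    command = global_commands + ["-n", "1"] + local_commands + leader_commands
    if nranks > 1:
        command += [":", "-n", str(nranks - 1)] + local_commands
    return command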
@@ -413,6 +403,62 @@ def prepare_model_multi_gpu(python_exe, root_dir, cpp_resources_dir,
     return _prepare
 
 
+@pytest.fixture(scope="session")
+def gpt_single_gpu_model(prepare_model):
+    prepare_model("gpt")
+    return "gpt"
+
+
+@pytest.fixture(scope="session")
+def llama_single_gpu_model(prepare_model):
+    prepare_model("llama")
+    return "llama"
+
+
+@pytest.fixture(scope="session")
+def llama_multi_gpu_model(prepare_model_multi_gpu):
+    prepare_model_multi_gpu("llama")
+    return "llama"
+
+
+# Allow us to dynamically choose a fixture at runtime
+# Combined with session scope fixtures above to ensure
+# that the model is built only once per pytest session
+@pytest.fixture
+def prepare_models_disagg(request):
+
+    def _prepare(model_name: str):
+        if model_name == "llama":
+            fixture_names = [
+                "llama_single_gpu_model",
+                "llama_multi_gpu_model",
+            ]
+        elif model_name == "gpt":
+            fixture_names = [
+                "gpt_single_gpu_model",
+            ]
+        else:
+            raise ValueError(f"Disagg tests don't support model: {model_name}")
+
+        print(f"Preparing models for disagg tests: {fixture_names}")
+        # Run the fixtures
+        for fixture_name in fixture_names:
+            request.getfixturevalue(fixture_name)
+
+    return _prepare
+
+
+# Use indirect parameterization to ensure that the model is built
+# only once per pytest session
+@pytest.fixture(scope="session")
+def multi_gpu_model(request, prepare_model_multi_gpu):
+
+    model_name = request.param
+    prepare_model_multi_gpu(model_name)
+
+    return model_name
+
+
 @pytest.mark.parametrize("build_google_tests", ["80", "86", "89", "90"],
                          indirect=True)
 def test_simple(build_google_tests, build_dir):
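The prepare_models_disagg fixture above leans on request.getfixturevalue to resolve fixtures by name at runtime, which keeps the session-scoped, build-once semantics while letting a plain string pick the model. A small standalone demonstration of that pytest mechanism:

import pytest


@pytest.fixture(scope="session")
def heavy_resource():
    return "built"  # imagine an expensive, build-once resource


@pytest.fixture
def chooser(request):
    def _get(name: str):
        # Resolves a fixture by name at runtime; session-scoped fixtures
        # are still created at most once and then cached.
        return request.getfixturevalue(name)
    return _get


def test_dynamic_fixture(chooser):
    assert chooser("heavy_resource") == "built"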
@@ -425,46 +471,135 @@ def test_simple(build_google_tests, build_dir):
 
 @pytest.mark.parametrize("build_google_tests", ["80", "86", "89", "90"],
                          indirect=True)
-def test_t5(build_google_tests, prepare_model_multi_gpu, build_dir):
+@pytest.mark.parametrize("multi_gpu_model", ["t5"], indirect=True)
+def test_enc_dec(build_google_tests, multi_gpu_model, build_dir):
 
     if platform.system() != "Windows":
-        prepare_model_multi_gpu("t5")
-        run_t5_multi_gpu_tests(build_dir=build_dir,
-                               timeout=_cpp.default_test_timeout)
+        run_enc_dec_multi_gpu_tests(build_dir=build_dir,
+                                    timeout=_cpp.default_test_timeout)
 
 
 @pytest.mark.parametrize("build_google_tests", ["80", "86", "89", "90"],
                          indirect=True)
-def test_llama_executor(build_google_tests, prepare_model_multi_gpu, lora_setup,
+@pytest.mark.parametrize("mode", [
+    "orchestrator",
+    pytest.param(
+        "leader",
+        marks=pytest.mark.skip("https://nvbugspro.nvidia.com/bug/5026255"))
+])
+@pytest.mark.parametrize("multi_gpu_model", ["llama"], indirect=True)
+def test_llama_executor(build_google_tests, multi_gpu_model, mode, lora_setup,
                         build_dir):
 
-    if platform.system() != "Windows":
-        prepare_model_multi_gpu("llama")
-        run_llama_executor_multi_gpu_tests(build_dir=build_dir,
-                                           timeout=_cpp.default_test_timeout)
+    if platform.system() == "Windows":
+        return
+
+    if mode == "orchestrator":
+        run_llama_executor_orchestrator_tests(build_dir=build_dir,
+                                              timeout=_cpp.default_test_timeout)
+    elif mode == "leader":
+        run_llama_executor_leader_tests(build_dir=build_dir,
+                                        timeout=_cpp.default_test_timeout)
+    else:
+        raise ValueError(f"Unsupported mode: {mode}")
 
 
 @pytest.mark.parametrize("build_google_tests", ["80", "86", "89", "90"],
                          indirect=True)
-def test_trt_gpt_real_decoder(build_google_tests, prepare_model_multi_gpu,
-                              lora_setup, build_dir):
+@pytest.mark.parametrize("multi_gpu_model", ["llama"], indirect=True)
+def test_llama_executor_logits_proc(build_google_tests, multi_gpu_model,
+                                    lora_setup, build_dir):
 
     if platform.system() != "Windows":
+        run_llama_executor_logits_proc_tests(build_dir=build_dir,
+                                             timeout=_cpp.default_test_timeout)
+
+
+@pytest.mark.parametrize("build_google_tests", ["80", "86", "89", "90"],
+                         indirect=True)
+@pytest.mark.parametrize("multi_gpu_model", ["llama"], indirect=True)
+def test_llama_executor_guided_decoding(build_google_tests, multi_gpu_model,
+                                        lora_setup, build_dir):
+
+    if platform.system() != "Windows":
+        run_llama_executor_guided_decoding_tests(
+            build_dir=build_dir, timeout=_cpp.default_test_timeout)
+
+
+@pytest.mark.parametrize("build_google_tests", ["80", "86", "89", "90"],
+                         indirect=True)
+@pytest.mark.parametrize("multi_gpu_model", ["llama"], indirect=True)
+def test_trt_gpt_real_decoder(build_google_tests, multi_gpu_model, lora_setup,
+                              build_dir):
 
     if platform.system() != "Windows":
-        prepare_model_multi_gpu("llama")
         run_trt_gpt_model_real_decoder_multi_gpu_tests(
             build_dir=build_dir, timeout=_cpp.default_test_timeout)
 
 
 @pytest.mark.parametrize("build_google_tests", ["80", "86", "89", "90"],
                          indirect=True)
-def test_disagg(prepare_model, prepare_model_multi_gpu, build_google_tests,
-                build_dir):
+class TestDisagg:
 
-    if platform.system() != "Windows":
-        # Disagg tests need single + multi GPU llama models.
-        prepare_model("llama")
-        prepare_model_multi_gpu("llama")
+    @pytest.mark.parametrize("kvcache_type", [KVCacheType.MPI, KVCacheType.UCX],
+                             ids=["mpi_kvcache", "ucx_kvcache"])
+    @pytest.mark.parametrize("nprocs", [2, 4, 8],
+                             ids=["2proc", "4proc", "8proc"])
+    @pytest.mark.parametrize("model", ["gpt", "llama"])
+    def test_symmetric_executor(self, build_google_tests, model, nprocs,
+                                kvcache_type, prepare_models_disagg, build_dir):
 
-        prepare_model("gpt")
+        if model == "gpt" and nprocs > 2:
+            pytest.skip(
+                "test_symmetric_executor only supports 2 processes for gpt")
 
-        run_disagg_multi_gpu_tests(build_dir=build_dir)
+        if platform.system() != "Windows":
+            prepare_models_disagg(model)
+
+            run_disagg_symmetric_executor_tests(build_dir=build_dir,
+                                                model=model,
+                                                nprocs=nprocs,
+                                                kvcache_type=kvcache_type)
+
+    @pytest.mark.parametrize("kvcache_type", [KVCacheType.MPI, KVCacheType.UCX],
+                             ids=["mpi_kvcache", "ucx_kvcache"])
+    @pytest.mark.parametrize("nprocs", [4, 6, 8],
+                             ids=["4proc", "6proc", "8proc"])
+    @pytest.mark.parametrize("model", ["llama"])
+    def test_asymmetric_executor(self, build_google_tests, model, nprocs,
+                                 kvcache_type, prepare_models_disagg,
+                                 build_dir):
+
+        if platform.system() != "Windows":
+            prepare_models_disagg(model_name=model)
+
+            run_disagg_asymmetric_executor_tests(build_dir=build_dir,
+                                                 model=model,
+                                                 nprocs=nprocs,
+                                                 kvcache_type=kvcache_type)
+
+    @pytest.mark.parametrize("kvcache_type", [KVCacheType.MPI, KVCacheType.UCX],
+                             ids=["mpi_kvcache", "ucx_kvcache"])
+    @pytest.mark.parametrize("model", ["llama"])
+    def test_orchestrator_params(self, build_google_tests, model, kvcache_type,
+                                 prepare_models_disagg, build_dir):
+
+        if platform.system() != "Windows":
+            prepare_models_disagg(model)
+
+            run_disagg_orchestrator_params_tests(build_dir=build_dir,
+                                                 model=model,
+                                                 kvcache_type=kvcache_type)
+
+    @pytest.mark.parametrize("kvcache_type", [KVCacheType.UCX],
+                             ids=["ucx_kvcache"])
+    @pytest.mark.parametrize("model", ["llama"])
+    def test_spawn_orchestrator(self, build_google_tests, model, kvcache_type,
+                                prepare_models_disagg, build_dir):
+
+        if platform.system() != "Windows":
+            prepare_models_disagg(model)
+
+            run_disagg_spawn_orchestrator_tests(build_dir=build_dir,
+                                                model=model,
+                                                kvcache_type=kvcache_type)
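The bracketed suffixes in the updated test list below are pytest's composed parametrize IDs: the decorator closest to the function contributes the leftmost segment, and the class-level build_google_tests value ("90") comes last. A compact sketch reproducing the ID shape (the real build_google_tests is an indirect fixture; "arch" here is a simplified stand-in):

import pytest


@pytest.mark.parametrize("arch", ["90"])
@pytest.mark.parametrize("kvcache", ["mpi_kvcache", "ucx_kvcache"])
@pytest.mark.parametrize("nprocs", [2], ids=["2proc"])
@pytest.mark.parametrize("model", ["gpt"])
def test_id_demo(model, nprocs, kvcache, arch):
    pass  # collected as test_id_demo[gpt-2proc-mpi_kvcache-90], etc.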
@@ -115,10 +115,29 @@ l0_dgx_h100:
   tests:
   # ------------- CPP tests ---------------
   - cpp/test_multi_gpu.py::test_simple[90]
-  - cpp/test_multi_gpu.py::test_t5[90]
-  - cpp/test_multi_gpu.py::test_llama_executor[90]
-  - cpp/test_multi_gpu.py::test_trt_gpt_real_decoder[90]
-  - cpp/test_multi_gpu.py::test_disagg[90]
+  - cpp/test_multi_gpu.py::test_enc_dec[t5-90]
+  - cpp/test_multi_gpu.py::test_llama_executor[llama-orchestrator-90]
+  - cpp/test_multi_gpu.py::test_llama_executor[llama-leader-90]
+  - cpp/test_multi_gpu.py::test_llama_executor_guided_decoding[llama-90]
+  - cpp/test_multi_gpu.py::test_llama_executor_logits_proc[llama-90]
+  - cpp/test_multi_gpu.py::test_trt_gpt_real_decoder[llama-90]
+  - cpp/test_multi_gpu.py::TestDisagg::test_symmetric_executor[gpt-2proc-mpi_kvcache-90]
+  - cpp/test_multi_gpu.py::TestDisagg::test_symmetric_executor[gpt-2proc-ucx_kvcache-90]
+  - cpp/test_multi_gpu.py::TestDisagg::test_symmetric_executor[llama-2proc-mpi_kvcache-90]
+  - cpp/test_multi_gpu.py::TestDisagg::test_symmetric_executor[llama-4proc-mpi_kvcache-90]
+  - cpp/test_multi_gpu.py::TestDisagg::test_symmetric_executor[llama-8proc-mpi_kvcache-90]
+  - cpp/test_multi_gpu.py::TestDisagg::test_symmetric_executor[llama-2proc-ucx_kvcache-90]
+  - cpp/test_multi_gpu.py::TestDisagg::test_symmetric_executor[llama-4proc-ucx_kvcache-90]
+  - cpp/test_multi_gpu.py::TestDisagg::test_symmetric_executor[llama-8proc-ucx_kvcache-90]
+  - cpp/test_multi_gpu.py::TestDisagg::test_asymmetric_executor[llama-4proc-mpi_kvcache-90]
+  - cpp/test_multi_gpu.py::TestDisagg::test_asymmetric_executor[llama-6proc-mpi_kvcache-90]
+  - cpp/test_multi_gpu.py::TestDisagg::test_asymmetric_executor[llama-8proc-mpi_kvcache-90]
+  - cpp/test_multi_gpu.py::TestDisagg::test_asymmetric_executor[llama-4proc-ucx_kvcache-90]
+  - cpp/test_multi_gpu.py::TestDisagg::test_asymmetric_executor[llama-6proc-ucx_kvcache-90]
+  - cpp/test_multi_gpu.py::TestDisagg::test_asymmetric_executor[llama-8proc-ucx_kvcache-90]
+  - cpp/test_multi_gpu.py::TestDisagg::test_orchestrator_params[llama-mpi_kvcache-90]
+  - cpp/test_multi_gpu.py::TestDisagg::test_orchestrator_params[llama-ucx_kvcache-90]
+  - cpp/test_multi_gpu.py::TestDisagg::test_spawn_orchestrator[llama-ucx_kvcache-90]
 - condition:
     ranges:
       system_gpu_count: