fix [nvbug/5351244]: address remote mpi session submit (#5664)

Signed-off-by: Superjomn <328693+Superjomn@users.noreply.github.com>
This commit is contained in:
Yan Chunwei 2025-07-10 20:22:41 +08:00 committed by Zhenhuan Chen
parent f4f2176cd5
commit f194b65f3e
4 changed files with 8 additions and 5 deletions

View File

@ -14,6 +14,7 @@ l0_a100:
backend: "pytorch"
tests:
- unittest/llmapi/test_llm_pytorch.py
- unittest/llmapi/test_mpi_session.py # generic tests
- condition:
ranges:
system_gpu_count:
@ -27,7 +28,7 @@ l0_a100:
stage: post_merge
backend: tensorrt
tests:
- unittest/trt/attention/test_sage_attention.py unittest/llmapi/test_llm_download.py unittest/llmapi/test_llm_kv_cache_events.py unittest/llmapi/test_mpi_session.py unittest/trt/model/redrafter unittest/trt/model/test_phi.py unittest/trt/model/test_unet.py unittest/trt/python_plugin unittest/tools unittest/utils unittest/others
- unittest/trt/attention/test_sage_attention.py unittest/llmapi/test_llm_download.py unittest/llmapi/test_llm_kv_cache_events.py unittest/trt/model/redrafter unittest/trt/model/test_phi.py unittest/trt/model/test_unet.py unittest/trt/python_plugin unittest/tools unittest/utils unittest/others
- unittest/llmapi/test_llm_models.py -m "part1"
- unittest/llmapi/test_llm_models.py -m "not (part0 or part1)"
- unittest/llmapi/test_llm.py -m "part0"

View File

@ -83,7 +83,7 @@ full:B200_PCIe/unittest/trt/model/test_mamba.py SKIP (Disable for Blackwell)
full:B200_PCIe/examples/test_medusa.py::test_llm_medusa_with_qaunt_base_model_1gpu[fp8-use_cpp_session-medusa-vicuna-7b-v1.3-4-heads-float16-bs1] SKIP (Disable for Blackwell)
full:B200_PCIe/examples/test_medusa.py::test_llm_medusa_with_qaunt_base_model_1gpu[fp8-use_py_session-medusa-vicuna-7b-v1.3-4-heads-float16-bs1] SKIP (Disable for Blackwell)
full:B200_PCIe/unittest/bindings SKIP (Disable for Blackwell)
full:B200_PCIe/unittest/trt/attention/test_sage_attention.py unittest/llmapi/test_llm_download.py unittest/llmapi/test_llm_kv_cache_events.py unittest/llmapi/test_mpi_session.py unittest/trt/model/redrafter unittest/trt/model/test_phi.py unittest/trt/model/test_unet.py unittest/trt/python_plugin unittest/tools unittest/utils unittest/others SKIP (Disable for Blackwell)
full:B200_PCIe/unittest/trt/attention/test_sage_attention.py unittest/llmapi/test_llm_download.py unittest/llmapi/test_llm_kv_cache_events.py unittest/trt/model/redrafter unittest/trt/model/test_phi.py unittest/trt/model/test_unet.py unittest/trt/python_plugin unittest/tools unittest/utils unittest/others SKIP (Disable for Blackwell)
full:B200_PCIe/unittest/trt/quantization/test_weight_only_quant_matmul.py SKIP (Disable for Blackwell)
full:B200_PCIe/unittest/trt/quantization/test_weight_only_groupwise_quant_matmul.py SKIP (Disable for Blackwell)
full:B200_PCIe/examples/test_gpt.py::test_llm_gpt2_starcoder_weight_only[starcoder2-int8-float16] SKIP (Disable for Blackwell)
@ -155,7 +155,7 @@ full:B200/unittest/trt/model/test_mamba.py SKIP (Disable for Blackwell)
full:B200/examples/test_medusa.py::test_llm_medusa_with_qaunt_base_model_1gpu[fp8-use_cpp_session-medusa-vicuna-7b-v1.3-4-heads-float16-bs1] SKIP (Disable for Blackwell)
full:B200/examples/test_medusa.py::test_llm_medusa_with_qaunt_base_model_1gpu[fp8-use_py_session-medusa-vicuna-7b-v1.3-4-heads-float16-bs1] SKIP (Disable for Blackwell)
full:B200/unittest/bindings SKIP (Disable for Blackwell)
full:B200/unittest/trt/attention/test_sage_attention.py unittest/llmapi/test_llm_download.py unittest/llmapi/test_llm_kv_cache_events.py unittest/llmapi/test_mpi_session.py unittest/trt/model/redrafter unittest/trt/model/test_phi.py unittest/trt/model/test_unet.py unittest/trt/python_plugin unittest/tools unittest/utils unittest/others SKIP (Disable for Blackwell)
full:B200/unittest/trt/attention/test_sage_attention.py unittest/llmapi/test_llm_download.py unittest/llmapi/test_llm_kv_cache_events.py unittest/trt/model/redrafter unittest/trt/model/test_phi.py unittest/trt/model/test_unet.py unittest/trt/python_plugin unittest/tools unittest/utils unittest/others SKIP (Disable for Blackwell)
full:B200/unittest/trt/quantization/test_weight_only_quant_matmul.py SKIP (Disable for Blackwell)
full:B200/unittest/trt/quantization/test_weight_only_groupwise_quant_matmul.py SKIP (Disable for Blackwell)
full:B200/examples/test_gpt.py::test_llm_gpt2_starcoder_weight_only[starcoder2-int8-float16] SKIP (Disable for Blackwell)

View File

@ -7,6 +7,6 @@ echo "Starting remote MPI session test with task: $task"
echo "MPI processes: 2"
# Add timeout to prevent infinite hanging
timeout 60 mpirun -np 2 trtllm-llmapi-launch python3 _run_mpi_comm_task.py --task_type $task
timeout 60 mpirun --allow-run-as-root -np 2 trtllm-llmapi-launch python3 _run_mpi_comm_task.py --task_type $task
echo "Remote MPI session test completed"

View File

@ -60,13 +60,15 @@ def test_remote_mpi_session(task_type: Literal["submit", "submit_sync"]):
"""Test RemoteMpiPoolSessionClient and RemoteMpiPoolSessionServer interaction"""
command = ["bash", "_test_remote_mpi_session.sh", task_type]
print(' '.join(command))
with Popen(command,
env=os.environ,
stdout=PIPE,
stderr=PIPE,
bufsize=1,
start_new_session=True,
universal_newlines=True) as process:
universal_newlines=True,
cwd=os.path.dirname(os.path.abspath(__file__))) as process:
# Function to read from a stream and write to output
def read_stream(stream, output_stream):