mirror of
https://github.com/NVIDIA/TensorRT-LLM.git
synced 2026-01-13 22:18:36 +08:00
fix [nvbug/5351244]: address remote mpi session submit (#5664)
Signed-off-by: Superjomn <328693+Superjomn@users.noreply.github.com>
This commit is contained in:
parent
f4f2176cd5
commit
f194b65f3e
@ -14,6 +14,7 @@ l0_a100:
|
||||
backend: "pytorch"
|
||||
tests:
|
||||
- unittest/llmapi/test_llm_pytorch.py
|
||||
- unittest/llmapi/test_mpi_session.py # generic tests
|
||||
- condition:
|
||||
ranges:
|
||||
system_gpu_count:
|
||||
@ -27,7 +28,7 @@ l0_a100:
|
||||
stage: post_merge
|
||||
backend: tensorrt
|
||||
tests:
|
||||
- unittest/trt/attention/test_sage_attention.py unittest/llmapi/test_llm_download.py unittest/llmapi/test_llm_kv_cache_events.py unittest/llmapi/test_mpi_session.py unittest/trt/model/redrafter unittest/trt/model/test_phi.py unittest/trt/model/test_unet.py unittest/trt/python_plugin unittest/tools unittest/utils unittest/others
|
||||
- unittest/trt/attention/test_sage_attention.py unittest/llmapi/test_llm_download.py unittest/llmapi/test_llm_kv_cache_events.py unittest/trt/model/redrafter unittest/trt/model/test_phi.py unittest/trt/model/test_unet.py unittest/trt/python_plugin unittest/tools unittest/utils unittest/others
|
||||
- unittest/llmapi/test_llm_models.py -m "part1"
|
||||
- unittest/llmapi/test_llm_models.py -m "not (part0 or part1)"
|
||||
- unittest/llmapi/test_llm.py -m "part0"
|
||||
|
||||
@ -83,7 +83,7 @@ full:B200_PCIe/unittest/trt/model/test_mamba.py SKIP (Disable for Blackwell)
|
||||
full:B200_PCIe/examples/test_medusa.py::test_llm_medusa_with_qaunt_base_model_1gpu[fp8-use_cpp_session-medusa-vicuna-7b-v1.3-4-heads-float16-bs1] SKIP (Disable for Blackwell)
|
||||
full:B200_PCIe/examples/test_medusa.py::test_llm_medusa_with_qaunt_base_model_1gpu[fp8-use_py_session-medusa-vicuna-7b-v1.3-4-heads-float16-bs1] SKIP (Disable for Blackwell)
|
||||
full:B200_PCIe/unittest/bindings SKIP (Disable for Blackwell)
|
||||
full:B200_PCIe/unittest/trt/attention/test_sage_attention.py unittest/llmapi/test_llm_download.py unittest/llmapi/test_llm_kv_cache_events.py unittest/llmapi/test_mpi_session.py unittest/trt/model/redrafter unittest/trt/model/test_phi.py unittest/trt/model/test_unet.py unittest/trt/python_plugin unittest/tools unittest/utils unittest/others SKIP (Disable for Blackwell)
|
||||
full:B200_PCIe/unittest/trt/attention/test_sage_attention.py unittest/llmapi/test_llm_download.py unittest/llmapi/test_llm_kv_cache_events.py unittest/trt/model/redrafter unittest/trt/model/test_phi.py unittest/trt/model/test_unet.py unittest/trt/python_plugin unittest/tools unittest/utils unittest/others SKIP (Disable for Blackwell)
|
||||
full:B200_PCIe/unittest/trt/quantization/test_weight_only_quant_matmul.py SKIP (Disable for Blackwell)
|
||||
full:B200_PCIe/unittest/trt/quantization/test_weight_only_groupwise_quant_matmul.py SKIP (Disable for Blackwell)
|
||||
full:B200_PCIe/examples/test_gpt.py::test_llm_gpt2_starcoder_weight_only[starcoder2-int8-float16] SKIP (Disable for Blackwell)
|
||||
@ -155,7 +155,7 @@ full:B200/unittest/trt/model/test_mamba.py SKIP (Disable for Blackwell)
|
||||
full:B200/examples/test_medusa.py::test_llm_medusa_with_qaunt_base_model_1gpu[fp8-use_cpp_session-medusa-vicuna-7b-v1.3-4-heads-float16-bs1] SKIP (Disable for Blackwell)
|
||||
full:B200/examples/test_medusa.py::test_llm_medusa_with_qaunt_base_model_1gpu[fp8-use_py_session-medusa-vicuna-7b-v1.3-4-heads-float16-bs1] SKIP (Disable for Blackwell)
|
||||
full:B200/unittest/bindings SKIP (Disable for Blackwell)
|
||||
full:B200/unittest/trt/attention/test_sage_attention.py unittest/llmapi/test_llm_download.py unittest/llmapi/test_llm_kv_cache_events.py unittest/llmapi/test_mpi_session.py unittest/trt/model/redrafter unittest/trt/model/test_phi.py unittest/trt/model/test_unet.py unittest/trt/python_plugin unittest/tools unittest/utils unittest/others SKIP (Disable for Blackwell)
|
||||
full:B200/unittest/trt/attention/test_sage_attention.py unittest/llmapi/test_llm_download.py unittest/llmapi/test_llm_kv_cache_events.py unittest/trt/model/redrafter unittest/trt/model/test_phi.py unittest/trt/model/test_unet.py unittest/trt/python_plugin unittest/tools unittest/utils unittest/others SKIP (Disable for Blackwell)
|
||||
full:B200/unittest/trt/quantization/test_weight_only_quant_matmul.py SKIP (Disable for Blackwell)
|
||||
full:B200/unittest/trt/quantization/test_weight_only_groupwise_quant_matmul.py SKIP (Disable for Blackwell)
|
||||
full:B200/examples/test_gpt.py::test_llm_gpt2_starcoder_weight_only[starcoder2-int8-float16] SKIP (Disable for Blackwell)
|
||||
|
||||
@ -7,6 +7,6 @@ echo "Starting remote MPI session test with task: $task"
|
||||
echo "MPI processes: 2"
|
||||
|
||||
# Add timeout to prevent infinite hanging
|
||||
timeout 60 mpirun -np 2 trtllm-llmapi-launch python3 _run_mpi_comm_task.py --task_type $task
|
||||
timeout 60 mpirun --allow-run-as-root -np 2 trtllm-llmapi-launch python3 _run_mpi_comm_task.py --task_type $task
|
||||
|
||||
echo "Remote MPI session test completed"
|
||||
|
||||
@ -60,13 +60,15 @@ def test_remote_mpi_session(task_type: Literal["submit", "submit_sync"]):
|
||||
"""Test RemoteMpiPoolSessionClient and RemoteMpiPoolSessionServer interaction"""
|
||||
command = ["bash", "_test_remote_mpi_session.sh", task_type]
|
||||
print(' '.join(command))
|
||||
|
||||
with Popen(command,
|
||||
env=os.environ,
|
||||
stdout=PIPE,
|
||||
stderr=PIPE,
|
||||
bufsize=1,
|
||||
start_new_session=True,
|
||||
universal_newlines=True) as process:
|
||||
universal_newlines=True,
|
||||
cwd=os.path.dirname(os.path.abspath(__file__))) as process:
|
||||
|
||||
# Function to read from a stream and write to output
|
||||
def read_stream(stream, output_stream):
|
||||
|
||||
Loading…
Reference in New Issue
Block a user