mirror of
https://github.com/NVIDIA/TensorRT-LLM.git
synced 2026-01-13 22:18:36 +08:00
49 lines
1.3 KiB
Bash
Executable File
49 lines
1.3 KiB
Bash
Executable File
#!/bin/bash
|
|
|
|
set -euo pipefail
|
|
|
|
if [ -v OMPI_COMM_WORLD_SIZE ]; then
|
|
export WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
|
|
export RANK=$OMPI_COMM_WORLD_RANK
|
|
export LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
|
|
export NODE_RANK=$OMPI_COMM_WORLD_NODE_RANK
|
|
fi
|
|
|
|
if [ "$RANK" -eq 0 ]; then
|
|
export TLLM_LOG_LEVEL=INFO
|
|
fi
|
|
|
|
PROFILE_DIR=${PROFILE_DIR:-profiles}
|
|
mkdir -p ${PROFILE_DIR}
|
|
|
|
PROFILE=${PROFILE:-1}
|
|
BACKTRACE=${BACKTRACE:-0}
|
|
GPU_METRICS=${GPU_METRICS:-0}
|
|
if [ "$PROFILE" -eq 1 ]; then
|
|
PROFILE_CMD="nsys profile
|
|
-t cuda,nvtx
|
|
--cpuctxsw none --cuda-event-trace false
|
|
--cuda-graph-trace node
|
|
-c cudaProfilerApi --capture-range-end stop
|
|
-o ${PROFILE_DIR}/report_np${WORLD_SIZE}_rank${RANK}.nsys-rep
|
|
--force-overwrite true"
|
|
if [ "$BACKTRACE" -eq 1 ]; then
|
|
PROFILE_CMD+=" --python-backtrace=cuda --cudabacktrace all"
|
|
else
|
|
PROFILE_CMD+=" -s none"
|
|
fi
|
|
if [ "$GPU_METRICS" -eq 1 ]; then
|
|
PROFILE_CMD+=" --gpu-metrics-devices $LOCAL_RANK
|
|
--gpu-metrics-frequency 10000"
|
|
fi
|
|
else
|
|
PROFILE_CMD=
|
|
fi
|
|
|
|
SCRIPT_PATH=$(realpath --relative-to="$(pwd)" "$(dirname -- "$0")"/run.py)
|
|
|
|
set -x
|
|
$PROFILE_CMD bash -o pipefail -c \
|
|
"python3 -u \"\$1\" \"\${@:3}\" 2>&1 | tee \"\$2/report_np${WORLD_SIZE}_rank${RANK}.log\"" \
|
|
bash "$SCRIPT_PATH" "$PROFILE_DIR" "$@"
|