diff --git a/.gitignore b/.gitignore index 840a515397..40d086f60e 100644 --- a/.gitignore +++ b/.gitignore @@ -77,6 +77,7 @@ cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fmha_v2_cu/ cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_cubin.h cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_cubin.cpp .devcontainer/.env +/examples/layer_wise_benchmarks/autotuner_cache/ /examples/layer_wise_benchmarks/profiles/ # User config files diff --git a/examples/layer_wise_benchmarks/README.md b/examples/layer_wise_benchmarks/README.md index 63b8f6f3da..af2663c9aa 100644 --- a/examples/layer_wise_benchmarks/README.md +++ b/examples/layer_wise_benchmarks/README.md @@ -15,6 +15,9 @@ pip install -e ../.. **Step 3:** In the container, run benchmarks and generate profiles: ```bash +# Set autotune cache path +export TLLM_AUTOTUNER_CACHE_PATH=autotuner_cache/cache + # Run DeepSeek-R1 NVFP4 NP=4 ./mpi_launch.sh ./run.sh config_ctx.yaml NP=4 ./mpi_launch.sh ./run.sh config_gen.yaml @@ -24,16 +27,16 @@ NP=4 ./mpi_launch.sh ./run.sh config_ctx.yaml --model deepseek-ai/DeepSeek-V3.2- NP=4 ./mpi_launch.sh ./run.sh config_gen.yaml --model deepseek-ai/DeepSeek-V3.2-Exp --tokens-per-block 64 --moe-backend DEEPGEMM # Run DeepSeek-V3.2-Exp with 32k context length -NP=4 ./mpi_launch.sh ./run.sh config_ctx.yaml --model deepseek-ai/DeepSeek-V3.2-Exp --tokens-per-block 64 --max-seq-len $((32768 + 1024 + 4)) --moe-backend DEEPGEMM --batch-size 1 --seq-len-q 32769 -NP=4 ./mpi_launch.sh ./run.sh config_gen.yaml --model deepseek-ai/DeepSeek-V3.2-Exp --tokens-per-block 64 --max-seq-len $((32768 + 1024 + 4)) --moe-backend DEEPGEMM --seq-len-kv-cache 32769 +NP=4 ./mpi_launch.sh ./run.sh config_ctx.yaml --model deepseek-ai/DeepSeek-V3.2-Exp --tokens-per-block 64 --moe-backend DEEPGEMM --batch-size 1 --seq-len-q 32769 +NP=4 ./mpi_launch.sh ./run.sh config_gen.yaml --model deepseek-ai/DeepSeek-V3.2-Exp --tokens-per-block 64 --moe-backend DEEPGEMM --seq-len-kv-cache 
32769 # Run with attention TP NP=4 ./mpi_launch.sh ./run.sh config_ctx.yaml --no-enable-attention-dp NP=4 ./mpi_launch.sh ./run.sh config_gen.yaml --no-enable-attention-dp # Run with attention TP and TRTLLMGen -NP=4 ./mpi_launch.sh -x TRTLLM_ENABLE_PDL=1 ./run.sh config_ctx.yaml --no-enable-attention-dp --moe-backend TRTLLM --balance-method NotModified -NP=4 ./mpi_launch.sh -x TRTLLM_ENABLE_PDL=1 ./run.sh config_gen.yaml --no-enable-attention-dp --moe-backend TRTLLM --balance-method NotModified +NP=4 ./mpi_launch.sh -x TRTLLM_ENABLE_PDL=1 ./run.sh config_ctx.yaml --no-enable-attention-dp --moe-backend TRTLLM +NP=4 ./mpi_launch.sh -x TRTLLM_ENABLE_PDL=1 ./run.sh config_gen.yaml --no-enable-attention-dp --moe-backend TRTLLM # Run with MTP3 NP=4 ./mpi_launch.sh ./run.sh config_gen.yaml --batch-size 32 --seq-len-q 4 @@ -48,7 +51,7 @@ NP=4 ./mpi_launch.sh ./run.sh config_gen.yaml --scaled-from 16 --moe-backend WID # Scale TEP=16 to 4 GPUs: reduce the number of attention heads and experts NP=4 ./mpi_launch.sh ./run.sh config_gen.yaml --scaled-from 16 --no-enable-attention-dp -# Run Qwen3-Next (balanced routing is not implemented) +# Run Qwen3-Next NP=2 ./mpi_launch.sh ./run.sh config_ctx.yaml --model Qwen/Qwen3-Next-80B-A3B-Instruct --layer-indices 6,7 --no-enable-attention-dp --batch-size 4 NP=2 ./mpi_launch.sh ./run.sh config_gen.yaml --model Qwen/Qwen3-Next-80B-A3B-Instruct --layer-indices 6,7 --no-enable-attention-dp --batch-size 512 @@ -69,13 +72,13 @@ NP=4 ./mpi_launch.sh ./run.sh config_gen.yaml --balance-method ImbalancedExperts ### Run with Slurm > Tips: -> 1. If you have a running Slurm job, please skip step 1 and go straight to step 2 and 3. -> 2. Further, if you have installed `tensorrt_llm` in the Slurm job, you can also skip step 2 and run step 3 with `export CONTAINER_NAME=aaa` specified. If you don't know the container name, run `export CONTAINER_NAME=$(SLURM_JOB_ID=$SLURM_JOB_ID ./slurm_query_container_name.sh)` to get it. +> 1. 
If you have a running Slurm job, you can set environment variable `export SLURM_JOB_ID=aaa` and skip step 1. +> 2. Further, if you have installed `tensorrt_llm` in the Slurm job, you can also skip step 2. Just run step 3 with `export CONTAINER_NAME=aaa` specified. If you don't know the container name, run `export CONTAINER_NAME=$(./slurm_query_container_name.sh)` to get it. -**Step 1:** On the controller node, allocate one or multiple nodes, and record the `SLURM_JOB_ID`: +**Step 1:** On the controller node, allocate one or multiple nodes, and export the `SLURM_JOB_ID`: ```bash -SLURM_JOB_ID=$(NODES=4 TIME=02:00:00 ./slurm_alloc.sh) +export SLURM_JOB_ID=$(NODES=4 TIME=02:00:00 ./slurm_alloc.sh) ``` Please fill the variables in `./slurm_alloc.sh`. @@ -83,28 +86,47 @@ Please fill the variables in `./slurm_alloc.sh`. **Step 2:** Start a container and install `tensorrt_llm`. Run the following command on the controller node: ```bash -SLURM_JOB_ID=$SLURM_JOB_ID ./slurm_init_containers.sh +./slurm_init_containers.sh ``` It uses the image recorded in `../../jenkins/current_image_tags.properties`. The image will be downloaded to `../../enroot/` for once. > Tips: If you want to change the image, no need to reallocate Slurm jobs. Just start another container by running step 2 with `export CONTAINER_NAME=aaa`, and step 3 will run in the container specified by the `CONTAINER_NAME` env. +**(Optional) Get an interactive shell** + +```bash +NODES=1 NP=1 ./slurm_launch.sh --overlap --pty middleware/exclude_slurm_envs bash +``` + +The `--overlap` option allows this shell to share the node with other jobs. The middleware enables nested MPI process spawning from within Slurm jobs. + +You may compile C++ extensions in the interactive shell: + +```bash +cd ../.. +export CCACHE_DIR=$(realpath cpp/.ccache) +python3 scripts/build_wheel.py --cuda_architectures native --no-venv --skip_building_wheel -G Ninja --use_ccache --clean +``` + **Step 3:** Run benchmarks to generate profiles. 
Run the following command on the controller node, where `NODES` ≤ the number of allocated nodes: ```bash +# Set autotune cache path +export TLLM_AUTOTUNER_CACHE_PATH=autotuner_cache/cache + # Run DeepSeek-R1 NVFP4 with wide ep: uses MNNVL A2A if applicable -SLURM_JOB_ID=$SLURM_JOB_ID NODES=4 NP=16 ./slurm_launch.sh ./run.sh config_gen.yaml --moe-backend WIDEEP +NODES=4 NP=16 ./slurm_launch.sh ./run.sh config_gen.yaml --moe-backend WIDEEP # Run with TRTLLMGen -SLURM_JOB_ID=$SLURM_JOB_ID NODES=4 NP=16 TRTLLM_ENABLE_PDL=1 ./slurm_launch.sh ./run.sh config_gen.yaml --moe-backend TRTLLM +NODES=4 NP=16 TRTLLM_ENABLE_PDL=1 ./slurm_launch.sh ./run.sh config_gen.yaml --moe-backend TRTLLM # Run with DeepEPLowLatency -SLURM_JOB_ID=$SLURM_JOB_ID NODES=4 NP=16 TRTLLM_FORCE_ALLTOALL_METHOD=DeepEPLowLatency ./slurm_launch.sh ./run.sh config_gen.yaml --moe-backend WIDEEP +NODES=4 NP=16 TRTLLM_FORCE_ALLTOALL_METHOD=DeepEPLowLatency ./slurm_launch.sh ./run.sh config_gen.yaml --moe-backend WIDEEP -# You can run 4-GPU and 8-GPU tasks without reallocate the slurm job -SLURM_JOB_ID=$SLURM_JOB_ID NODES=1 NP=4 ./slurm_launch.sh ./run.sh config_ctx.yaml -SLURM_JOB_ID=$SLURM_JOB_ID NODES=2 NP=8 ./slurm_launch.sh ./run.sh config_gtx.yaml +# You can run 4-GPU and 8-GPU tasks without reallocating the slurm job +NODES=1 NP=4 ./slurm_launch.sh ./run.sh config_ctx.yaml +NODES=2 NP=8 ./slurm_launch.sh ./run.sh config_gen.yaml ``` ### Batched run @@ -131,9 +153,11 @@ NP=4 ./mpi_launch.sh ./run.sh config_gen.yaml --scaled-from 16 --moe-backend WID Run the following command in the container: ```bash +# Parse the profile at the default directory python3 parse.py --world-size 4 -# Specify the location of the .nsys-rep file +# Specify the file path +python3 parse.py --file-path profiles/report_np4_rank0.nsys-rep python3 parse.py --profile-dir ./profiles --world-size 4 --rank 0 # Parse a specific module. The module must appear exactly once in each run. 
@@ -145,6 +169,15 @@ You will receive three reports, each containing kernel timing statistics grouped 2. A CSV report at `profiles/report_np4_rank0.csv` 3. An HTML report at `profiles/report_np4_rank0.html` +## Developer utilities + +1. Reduce startup time when debugging a model + 1. Disable autotuner: add `--no-enable-autotuner` option + 2. Disable nsys profile: set `PROFILE=0` environment variable +2. Capture more information + 1. Enable GPU metrics: set `GPU_METRICS=1` environment variable + 2. Enable backtrace: set `BACKTRACE=1` environment variable + ## Trouble shooting 1. Error `fp8 blockscale gemm only support Hopper` on Blackwell. diff --git a/examples/layer_wise_benchmarks/middleware/exclude_slurm_envs b/examples/layer_wise_benchmarks/middleware/exclude_slurm_envs new file mode 100755 index 0000000000..1b0b504d6c --- /dev/null +++ b/examples/layer_wise_benchmarks/middleware/exclude_slurm_envs @@ -0,0 +1,8 @@ +#!/bin/bash + +set -euo pipefail + +# Clear slurm envs +unset $(env | awk -F'=' '{print $1}' | (grep -E "SLURM_|SLURMD_|slurm_|MPI_|PMIX_" || true)) + +"$@" diff --git a/examples/layer_wise_benchmarks/mpi_launch.sh b/examples/layer_wise_benchmarks/mpi_launch.sh index 24f7643beb..44f6d28dcf 100755 --- a/examples/layer_wise_benchmarks/mpi_launch.sh +++ b/examples/layer_wise_benchmarks/mpi_launch.sh @@ -3,8 +3,12 @@ set -euo pipefail # Clear slurm envs -unset $(env | grep -i slurm | awk -F'=' '{print $1}') -unset $(env | grep MPI | awk -F'=' '{print $1}') +unset $(env | awk -F'=' '{print $1}' | (grep -E "SLURM_|SLURMD_|slurm_|MPI_|PMIX_" || true)) + +extra_args= +if [ -v TLLM_AUTOTUNER_CACHE_PATH ]; then + extra_args+="-x TLLM_AUTOTUNER_CACHE_PATH" +fi set -x -mpirun --allow-run-as-root --np ${NP} "$@" +mpirun --allow-run-as-root --np ${NP} $extra_args "$@" diff --git a/examples/layer_wise_benchmarks/parse.py b/examples/layer_wise_benchmarks/parse.py index 9253a9115d..3ebd479922 100644 --- a/examples/layer_wise_benchmarks/parse.py +++ 
b/examples/layer_wise_benchmarks/parse.py @@ -5,6 +5,7 @@ import json import re import sqlite3 import subprocess +import sys from collections import defaultdict from pathlib import Path @@ -14,8 +15,9 @@ import pandas as pd # Parse cmdline parser = argparse.ArgumentParser() +parser.add_argument("--file-path", type=str) parser.add_argument("--profile-dir", type=str, default="profiles") -parser.add_argument("--world-size", "--np", type=int, required=True) +parser.add_argument("--world-size", "--np", type=int) parser.add_argument("--rank", type=int, default=0) parser.add_argument("--warmup-times", type=int) parser.add_argument("--module", type=str) @@ -27,6 +29,8 @@ group.add_argument( ) parser.set_defaults(error_on_unknown_kernel=False) args = parser.parse_args() +if (args.file_path is None) == (args.world_size is None): + parser.error("Please specify exactly one of --file-path and --world-size.") print(args) @@ -89,11 +93,20 @@ def shortest_common_supersequence(a, b): return res -profile_dir = Path(args.profile_dir) -nsys_rep_file_path = profile_dir / f"report_np{args.world_size}_rank{args.rank}.nsys-rep" -sqlite_file_path = profile_dir / f"report_np{args.world_size}_rank{args.rank}.sqlite" -csv_file_path = profile_dir / f"report_np{args.world_size}_rank{args.rank}.csv" -html_file_path = profile_dir / f"report_np{args.world_size}_rank{args.rank}.html" +if args.file_path is not None: + nsys_rep_file_path = Path(args.file_path) + if not nsys_rep_file_path.name.endswith(".nsys-rep"): + raise ValueError("Expect a .nsys-rep file") +else: + profile_dir = Path(args.profile_dir) + nsys_rep_file_path = profile_dir / f"report_np{args.world_size}_rank{args.rank}.nsys-rep" +sqlite_file_path = nsys_rep_file_path.parent / ( + nsys_rep_file_path.name[: -len(".nsys-rep")] + ".sqlite" +) +csv_file_path = nsys_rep_file_path.parent / (nsys_rep_file_path.name[: -len(".nsys-rep")] + ".csv") +html_file_path = nsys_rep_file_path.parent / ( + nsys_rep_file_path.name[: -len(".nsys-rep")] + 
".html" +) lazy_convert_sqlite(nsys_rep_file_path, sqlite_file_path) conn = sqlite3.connect(f"file:{sqlite_file_path}?mode=ro", uri=True) @@ -196,7 +209,8 @@ if "CUDA_GRAPH_NODE_EVENTS" in tables: R.start AS runtime_start, R.end AS runtime_end, CGE2.start AS capture_start, CGE2.end AS capture_end FROM ({unified_subquery}) AS unified - JOIN CUPTI_ACTIVITY_KIND_RUNTIME AS R ON unified.correlationId = R.correlationId + JOIN CUPTI_ACTIVITY_KIND_RUNTIME AS R ON unified.graphNodeId IS NOT NULL AND + unified.correlationId = R.correlationId LEFT JOIN CUDA_GRAPH_NODE_EVENTS AS CGE1 ON unified.graphNodeId = CGE1.graphNodeId AND CGE1.originalGraphNodeId IS NOT NULL LEFT JOIN CUDA_GRAPH_NODE_EVENTS AS CGE2 ON CGE1.originalGraphNodeId = CGE2.graphNodeId""" @@ -318,6 +332,7 @@ parser_keywords = [ ("routingInitExpertCounts", "routingInitExpertCounts"), ("routingIndicesCluster", "routingIndicesClusterKernel"), ("routingIndicesCoop", "routingIndicesCoopKernel"), + ("router_gemm", "router_gemm_kernel"), ("bmm_4_44_32", "bmm_E2m1_E2m1E2m1_Fp32_t"), ("finalize", "finalize::finalizeKernel"), ("bmm_16_44_32", "bmm_Bfloat16_E2m1E2m1_Fp32_"), @@ -381,7 +396,7 @@ def parse_kernel_name(demangledName): if all(keyword in name for keyword in src): return dst if name not in warned_names: - print(f"Unknown kernel name: {name}") + print(f"Unknown kernel name: {name}", file=sys.stderr) warned_names.add(name) if args.error_on_unknown_kernel: raise NotImplementedError(f"Unknown kernel name: {name}") @@ -491,11 +506,33 @@ with csv_file_path.open("w", newline="") as f: for row in csv_data: csv_writer.writerow(row) js_header_config = [{"name": problem["text"]} for problem in problem_set] +js_header_config = [] +for problem in problem_set: + innermost_children = js_header_config + for k, msg_prefix in [ + ("batch_size", "b="), + ("seq_len_q", "q="), + ("seq_len_kv_cache", "past="), + ]: + if len(run_args[k + "_list"]) > 1: + if len(innermost_children) == 0 or problem["spec"][k] != 
innermost_children[-1][k]: + innermost_children.append( + { + "name": msg_prefix + str(problem["spec"][k]), + "children": [], + k: problem["spec"][k], + } + ) + innermost_children = innermost_children[-1]["children"] + innermost_children.append({"name": problem["text"]}) loader = jinja2.FileSystemLoader(Path(__file__).parent) template = jinja2.Environment(loader=loader).get_template("template.html") with html_file_path.open("w") as f: configText = ( - "Run:\n" + json.dumps(run_args, indent=4) + "\n\nParse:\n" + json.dumps(args.__dict__) + "Run:\n" + + json.dumps(run_args, indent=4) + + "\n\nParse:\n" + + json.dumps(args.__dict__, indent=4) ) f.write(template.render(headerConfig=js_header_config, rawData=js_data, configText=configText)) diff --git a/examples/layer_wise_benchmarks/run.py b/examples/layer_wise_benchmarks/run.py index c1e3ab5133..1dc61fc9dc 100644 --- a/examples/layer_wise_benchmarks/run.py +++ b/examples/layer_wise_benchmarks/run.py @@ -2,6 +2,7 @@ import argparse import itertools import json import os +from unittest import mock import numpy as np import nvtx @@ -9,6 +10,8 @@ import torch import yaml from tensorrt_llm._torch.autotuner import AutoTuner, autotune +from tensorrt_llm._torch.modules.fused_moe.fused_moe_cutlass import CutlassFusedMoE +from tensorrt_llm._torch.modules.fused_moe.interface import AlltoallMethodType from tensorrt_llm._torch.modules.multi_stream_utils import with_multi_stream from tensorrt_llm._utils import local_mpi_rank, mpi_rank, mpi_world_size from tensorrt_llm.logger import logger @@ -108,17 +111,6 @@ if args.enable_attention_dp is None: args.enable_attention_dp = False if args.max_num_tokens is None: args.max_num_tokens = args.max_batch_size * max(args.seq_len_q_list) - if args.run_type == "GEN": - ctx_batch_size = max(1, max(20480, args.max_num_tokens) // max(args.seq_len_kv_cache_list)) - args.max_num_tokens = max( - args.max_num_tokens, ctx_batch_size * max(args.seq_len_kv_cache_list) - ) -else: - if args.run_type == 
"GEN": - ctx_batch_size = max(1, args.max_num_tokens // max(args.seq_len_kv_cache_list)) - assert args.max_num_tokens >= ctx_batch_size * max(args.seq_len_kv_cache_list), ( - "Max_num_tokens is too small to prefill KV cache" - ) if args.use_low_precision_moe_combine is None: args.use_low_precision_moe_combine = False if args.enable_autotuner is None: @@ -169,17 +161,75 @@ runner = Runner( ) logger.info("Layer-wise benchmarks: Create runner ... Done") +# Autotune +run_pack = runner.create_run_pack( + args.run_type, + batch_size=max(args.batch_size_list), + request_id_begin=0, + seq_len_q=max(args.seq_len_q_list), + seq_len_kv_cache=args.seq_len_kv_cache_list[0], + kv_cache_manager=kv_cache_manager, + attn_workspace=attn_workspace, +) +if args.enable_autotuner: + cache_path = os.getenv("TLLM_AUTOTUNER_CACHE_PATH") or None + with autotune(cache_path=cache_path): + run_pack() +else: + run_pack() + +# Prefill KV cache +if args.run_type == "GEN": + logger.info("Layer-wise benchmarks: Create runner for prefill") + ctx_seq_len_q = max(args.seq_len_kv_cache_list) + ctx_batch_size = min( + args.max_batch_size, + max(1, 20480 // ctx_seq_len_q), + ) + ctx_attn_workspace = torch.empty((0,), device="cuda", dtype=torch.int8) + with mock.patch.object( + CutlassFusedMoE, "select_alltoall_method_type", return_value=AlltoallMethodType.NotEnabled + ): + ctx_runner = Runner( + args.model, + mapping, + moe_backend="CUTLASS", + layer_indices=args.layer_indices, + scaled_from=args.scaled_from, + max_seq_len=args.max_seq_len, + max_num_tokens=ctx_batch_size * ctx_seq_len_q, + moe_max_num_tokens=16384, + use_low_precision_moe_combine=args.use_low_precision_moe_combine, + use_cuda_graph=False, + ) + logger.info("Layer-wise benchmarks: Create runner for prefill ... 
Done") + + logger.info("Layer-wise benchmarks: Prefill KV cache") + assert ctx_batch_size <= args.max_batch_size + assert ctx_seq_len_q + 0 <= args.max_seq_len + num_requests = max(args.batch_size_list) + for request_id_begin in range(0, num_requests, ctx_batch_size): + run_pack = ctx_runner.create_run_pack( + "CTX", + batch_size=min(ctx_batch_size, num_requests - request_id_begin), + request_id_begin=request_id_begin, + seq_len_q=ctx_seq_len_q, + seq_len_kv_cache=0, + kv_cache_manager=kv_cache_manager, + attn_workspace=ctx_attn_workspace, + ) + with ctx_runner.replace_routing_method_ctx( + balance_method=BalanceMethod.Balanced, balance_ratio=None + ): + run_pack(check=True) + del ctx_runner + del ctx_attn_workspace + logger.info("Layer-wise benchmarks: Prefill KV cache ... Done") + # Warm up -for autotune_flag, batch_size, seq_len_q, seq_len_kv_cache, balance_ratio in [ - [ - True, - max(args.batch_size_list), - max(args.seq_len_q_list), - args.seq_len_kv_cache_list[0], - args.balance_ratio_list[0], - ], +logger.info("Layer-wise benchmarks: Warmup") +for batch_size, seq_len_q, seq_len_kv_cache, balance_ratio in [ *itertools.product( - [False], args.batch_size_list, args.seq_len_q_list, args.seq_len_kv_cache_list, @@ -203,34 +253,10 @@ for autotune_flag, batch_size, seq_len_q, seq_len_kv_cache, balance_ratio in [ ): capture_stream.wait_stream(torch.cuda.current_stream()) with torch.cuda.stream(capture_stream): - if autotune_flag: - if args.enable_autotuner: - cache_path = os.getenv("TLLM_AUTOTUNER_CACHE_PATH") or None - with autotune(cache_path=cache_path): - run_pack() - if args.run_type == "GEN": - logger.info("Layer-wise benchmarks: Prefill KV cache") - ctx_seq_len_q = max(args.seq_len_kv_cache_list) - assert ctx_batch_size <= args.max_batch_size - assert ctx_seq_len_q + 0 <= args.max_seq_len - assert ctx_batch_size * ctx_seq_len_q <= args.max_num_tokens - max_batch_size = max(args.batch_size_list) - for request_id_begin in range(0, max_batch_size, 
ctx_batch_size): - ctx_run_pack = runner.create_run_pack( - "CTX", - batch_size=min(ctx_batch_size, max_batch_size - request_id_begin), - request_id_begin=request_id_begin, - seq_len_q=ctx_seq_len_q, - seq_len_kv_cache=0, - kv_cache_manager=kv_cache_manager, - attn_workspace=attn_workspace, - ) - ctx_run_pack(check=True) - logger.info("Layer-wise benchmarks: Prefill KV cache ... Done") - else: - run_pack(check=True) + run_pack(check=True) torch.cuda.current_stream().wait_stream(capture_stream) torch.cuda.synchronize() +logger.info("Layer-wise benchmarks: Warmup ... Done") events = [ torch.cuda.Event(enable_timing=True) for _ in range(args.warmup_times + args.run_times + 1) @@ -270,7 +296,7 @@ for batch_size, seq_len_q, seq_len_kv_cache, balance_ratio in itertools.product( with torch.cuda.graph(g, stream=capture_stream, capture_error_mode="global"): run_pack() - balance_ratio_str = "" if balance_ratio is None else f" balance={balance_ratio:.2g}" + balance_ratio_str = "" if balance_ratio is None else f" balance={balance_ratio:.2g}" nvtx_message = f"b={batch_size} s={seq_len_q} past={seq_len_kv_cache}{balance_ratio_str} NP{world_size}" for i in range(args.warmup_times + args.run_times): events[i].record() diff --git a/examples/layer_wise_benchmarks/run.sh b/examples/layer_wise_benchmarks/run.sh index a96265d8f3..4f20394d84 100755 --- a/examples/layer_wise_benchmarks/run.sh +++ b/examples/layer_wise_benchmarks/run.sh @@ -40,7 +40,9 @@ else PROFILE_CMD= fi +SCRIPT_PATH=$(realpath --relative-to="$(pwd)" "$(dirname -- "$0")"/run.py) + set -x -$PROFILE_CMD bash -c \ - "python3 -u run.py \"\$@\" 2>&1 | tee \"$PROFILE_DIR/report_np${WORLD_SIZE}_rank${RANK}.log\"" \ - bash "$@" +$PROFILE_CMD bash -o pipefail -c \ + "python3 -u \"\$1\" \"\${@:3}\" 2>&1 | tee \"\$2/report_np${WORLD_SIZE}_rank${RANK}.log\"" \ + bash "$SCRIPT_PATH" "$PROFILE_DIR" "$@" diff --git a/examples/layer_wise_benchmarks/slurm_init_containers.sh b/examples/layer_wise_benchmarks/slurm_init_containers.sh 
index 08e77c5623..7e97505bc1 100755 --- a/examples/layer_wise_benchmarks/slurm_init_containers.sh +++ b/examples/layer_wise_benchmarks/slurm_init_containers.sh @@ -4,23 +4,23 @@ set -euo pipefail # CONTAINER_IMAGE= CONTAINER_NAME=${CONTAINER_NAME:-layer_wise_benchmarks} -CONTAINER_MOUNTS=$(realpath "$(pwd)/../.."):$(realpath "$(pwd)/../..") +TRTLLM_ROOT=$(realpath "$(dirname -- "$0")"/../..) +CONTAINER_MOUNTS=$TRTLLM_ROOT:$TRTLLM_ROOT -if [ "${SLURM_JOB_ID:-}" == "" ]; then +if [ -z "${SLURM_JOB_ID:-}" ]; then echo "Please set SLURM_JOB_ID" exit 1 fi NODES=$(squeue -j $SLURM_JOB_ID -h -o "%D") -if [ "${CONTAINER_IMAGE:-}" == "" ]; then +if [ -z "${CONTAINER_IMAGE:-}" ]; then # Read Docker image from current_image_tags.properties - source ../../jenkins/current_image_tags.properties MACHINE="$(uname -m)" if [ "$MACHINE" == "x86_64" ]; then - DOCKER_IMAGE=$LLM_DOCKER_IMAGE + DOCKER_IMAGE=$(source "$TRTLLM_ROOT/jenkins/current_image_tags.properties" && echo $LLM_DOCKER_IMAGE) elif [ "$MACHINE" == "aarch64" ]; then - DOCKER_IMAGE=$LLM_SBSA_DOCKER_IMAGE + DOCKER_IMAGE=$(source "$TRTLLM_ROOT/jenkins/current_image_tags.properties" && echo $LLM_SBSA_DOCKER_IMAGE) else echo "Unsupported machine hardware name \"$MACHINE\"" exit 1 @@ -34,7 +34,7 @@ if [ "${CONTAINER_IMAGE:-}" == "" ]; then SQSH_FILE_NAME=$(echo "$DOCKER_IMAGE" | awk -F'#' '{print $2}' | awk -F':' '{gsub(/\//,"+",$1); print $1"+"$2".sqsh"}') - CONTAINER_IMAGE="../../enroot/$SQSH_FILE_NAME" + CONTAINER_IMAGE="$TRTLLM_ROOT/enroot/$SQSH_FILE_NAME" if [ ! -f "$CONTAINER_IMAGE" ]; then echo "Container image file $CONTAINER_IMAGE does not exist, importing ..." srun -N 1 --pty enroot import -o "$CONTAINER_IMAGE" "docker://$DOCKER_IMAGE" @@ -50,6 +50,7 @@ srun -N "$NODES" \ --container-name "$CONTAINER_NAME" \ --container-mounts "$CONTAINER_MOUNTS" \ --container-workdir "$WORKDIR" \ -bash -c "pip install -U packaging && - pip install -r ../../requirements.txt --no-build-isolation && - pip install -e ../.." 
+bash -c "cd \"\$1\" && + pip install -U packaging && + pip install -r requirements.txt --no-build-isolation && + pip install -e ." bash "$TRTLLM_ROOT" diff --git a/examples/layer_wise_benchmarks/slurm_launch.sh b/examples/layer_wise_benchmarks/slurm_launch.sh index 36fe0a9a00..64857b2877 100755 --- a/examples/layer_wise_benchmarks/slurm_launch.sh +++ b/examples/layer_wise_benchmarks/slurm_launch.sh @@ -3,9 +3,10 @@ set -euo pipefail CONTAINER_NAME=${CONTAINER_NAME:-layer_wise_benchmarks} -CONTAINER_MOUNTS=$(realpath "$(pwd)/../.."):$(realpath "$(pwd)/../..") +TRTLLM_ROOT=$(realpath "$(dirname -- "$0")"/../..) +CONTAINER_MOUNTS=$TRTLLM_ROOT:$TRTLLM_ROOT -if [ "${SLURM_JOB_ID:-}" == "" ]; then +if [ -z "${SLURM_JOB_ID:-}" ]; then echo "Please set SLURM_JOB_ID" exit 1 fi diff --git a/examples/layer_wise_benchmarks/slurm_query_container_name.sh b/examples/layer_wise_benchmarks/slurm_query_container_name.sh index a7f6b1ba81..6d3192da0b 100755 --- a/examples/layer_wise_benchmarks/slurm_query_container_name.sh +++ b/examples/layer_wise_benchmarks/slurm_query_container_name.sh @@ -2,6 +2,11 @@ set -euo pipefail +if [ -z "${SLURM_JOB_ID:-}" ]; then + echo "Please set SLURM_JOB_ID" + exit 1 +fi + prefix="pyxis_${SLURM_JOB_ID}_" matches=$(printf "%s\n" "$(srun -N 1 enroot list)" | grep "^${prefix}" || true) count=$(printf "%s\n" "$matches" | wc -l) diff --git a/examples/layer_wise_benchmarks/template.html b/examples/layer_wise_benchmarks/template.html index 6f190ed664..ac35ee2ce1 100644 --- a/examples/layer_wise_benchmarks/template.html +++ b/examples/layer_wise_benchmarks/template.html @@ -313,6 +313,7 @@ const columnLabels = []; let columnTotals = []; let totalNode = null; // Special node for the "Total" row + let numCols = null; const fmt = (num) => num.toFixed(1); @@ -332,7 +333,7 @@ const topLevelTimes = rawData.map(aggregate); // Calculate Totals - const numCols = topLevelTimes[0].length; + numCols = topLevelTimes[0].length; columnTotals = new Array(numCols).fill(0); 
for (let c = 0; c < numCols; c++) { columnTotals[c] = topLevelTimes.reduce((sum, row) => sum + row[c], 0); @@ -420,7 +421,8 @@ const total = columnTotals[colIdx] || 1; const pct = (val / total) * 100; - html += `