Mirror of https://github.com/NVIDIA/TensorRT-LLM.git

[None][chore] Clean up layer-wise benchmarks code (#11092)
Signed-off-by: Tailing Yuan <yuantailing@gmail.com>

Parent: ab7dd34bbe
Commit: 4345636b04
@ -1,10 +1,12 @@
|
||||
# Layer-wise Benchmarks
|
||||
|
||||
This tool profiles individual layers of LLM models to help understand the performance characteristics of each layer and compare layer-wise benchmarks with end-to-end profiling results.
|
||||
|
||||
## Generate profiles
|
||||
|
||||
### Run with OpenMPI
|
||||
|
||||
**Step 1:** Start a container using Docker, Enroot or others. Please refer to `../../jenkins/current_image_tags.properties` for the Docker image URI.
|
||||
**Step 1:** Start a container using Docker, Enroot, or other container runtimes. Please refer to `../../jenkins/current_image_tags.properties` for the Docker image URI.
|
||||
|
||||
**Step 2:** In the container, install `tensorrt_llm`:
|
||||
|
||||
@ -19,7 +21,7 @@ pip install -e ../..
|
||||
NP=4 ./mpi_launch.sh ./run.sh config_ctx.yaml
|
||||
NP=4 ./mpi_launch.sh ./run.sh config_gen.yaml
|
||||
|
||||
# Run with weights loaded. Requires local model directory
|
||||
# Run with weights loaded (requires a local model directory)
|
||||
NP=4 ./mpi_launch.sh ./run.sh config_ctx.yaml --model "$LLM_MODELS_ROOT/DeepSeek-R1/DeepSeek-R1-0528-FP4-v2" --load-format AUTO
|
||||
NP=4 ./mpi_launch.sh ./run.sh config_gen.yaml --model "$LLM_MODELS_ROOT/DeepSeek-R1/DeepSeek-R1-0528-FP4-v2" --load-format AUTO
|
||||
|
||||
@ -46,10 +48,10 @@ NP=4 ./mpi_launch.sh ./run.sh config_gen.yaml --batch-size 32 --seq-len-q 4
|
||||
NP=4 ./mpi_launch.sh ./run.sh config_ctx.yaml --layer-indices 5,6,7,8
|
||||
NP=4 ./mpi_launch.sh ./run.sh config_gen.yaml --layer-indices 5,6,7,8
|
||||
|
||||
# Scale DEP=16 to 4 GPUs: reduce the number of experts, uses MNNVL A2A if applicable
|
||||
# Scale DEP=16 to 4 GPUs: reduces the number of experts; uses MNNVL A2A if applicable
|
||||
NP=4 ./mpi_launch.sh ./run.sh config_gen.yaml --scaled-from 16 --moe-backend WIDEEP
|
||||
|
||||
# Scale TEP=16 to 4 GPUs: reduce the number of attention heads and experts
|
||||
# Scale TEP=16 to 4 GPUs: reduces the number of attention heads and experts
|
||||
NP=4 ./mpi_launch.sh ./run.sh config_gen.yaml --scaled-from 16 --no-enable-attention-dp
|
||||
|
||||
# Run Nemotron-3-Nano
|
||||
@ -64,12 +66,12 @@ NP=2 ./mpi_launch.sh ./run.sh config_gen.yaml --model Qwen/Qwen3-Next-80B-A3B-In
|
||||
NP=4 ./mpi_launch.sh -x TRTLLM_FORCE_ALLTOALL_METHOD=DeepEP ./run.sh config_ctx.yaml --moe-backend WIDEEP
|
||||
NP=4 ./mpi_launch.sh -x TRTLLM_FORCE_ALLTOALL_METHOD=DeepEP ./run.sh config_gen.yaml --moe-backend WIDEEP
|
||||
|
||||
# Run with imbalanced ranks: except for activating all experts, a% of the tokens are sent to the 1st rank
|
||||
# Note: if balance ratio is 0, ignore activating all experts
|
||||
# Run with imbalanced ranks: in addition to activating all experts, the specified ratio of tokens is sent to rank 0
|
||||
# Note: if balance ratio is 0, the "activate all experts" behavior is not applied
|
||||
NP=4 ./mpi_launch.sh ./run.sh config_ctx.yaml --balance-method ImbalancedRanks --balance-ratio 0.5
|
||||
NP=4 ./mpi_launch.sh ./run.sh config_gen.yaml --balance-method ImbalancedRanks --balance-ratio 0.5
|
||||
|
||||
# Run with imbalanced experts and balanced ranks: except for activating all experts, a% of the tokens are sent to the front experts on each rank
|
||||
# Run with imbalanced experts and balanced ranks: in addition to activating all experts, the specified ratio of tokens is sent to the front experts on each rank
|
||||
NP=4 ./mpi_launch.sh ./run.sh config_ctx.yaml --balance-method ImbalancedExperts --balance-ratio 0.5
|
||||
NP=4 ./mpi_launch.sh ./run.sh config_gen.yaml --balance-method ImbalancedExperts --balance-ratio 0.5
|
||||
```
|
||||
@ -77,8 +79,8 @@ NP=4 ./mpi_launch.sh ./run.sh config_gen.yaml --balance-method ImbalancedExperts
|
||||
### Run with Slurm
|
||||
|
||||
> Tips:
|
||||
> 1. If you have a running Slurm job, you can set environment variable `export SLURM_JOB_ID=aaa` and skip step 1.
|
||||
> 2. Further, if you have installed `tensorrt_llm` in the Slurm job, you can also skip step 2. Just run step 3 with `export CONTAINER_NAME=aaa` specified. If you don't know the container name, run `export CONTAINER_NAME=$(./slurm_query_container_name.sh)` to get it.
|
||||
> 1. If you have a running Slurm job, you can set the environment variable by running `export SLURM_JOB_ID=<job_id>` and skip Step 1.
|
||||
> 2. Further, if you have already installed `tensorrt_llm` in the Slurm job, you can also skip Step 2. Just run Step 3 with `export CONTAINER_NAME=<name>` specified. If you don't know the container name, run `export CONTAINER_NAME=$(./slurm_query_container_name.sh)` to get it.
|
||||
|
||||
**Step 1:** On the controller node, allocate one or more nodes, and export the `SLURM_JOB_ID`:
|
||||
|
||||
@ -86,7 +88,7 @@ NP=4 ./mpi_launch.sh ./run.sh config_gen.yaml --balance-method ImbalancedExperts
|
||||
export SLURM_JOB_ID=$(NODES=4 TIME=02:00:00 ./slurm_alloc.sh)
|
||||
```
|
||||
|
||||
Please fill the variables in `./slurm_alloc.sh`.
|
||||
Please set the variables in `./slurm_alloc.sh` before running.
|
||||
|
||||
**Step 2:** Start a container and install `tensorrt_llm`. Run the following command on the controller node:
|
||||
|
||||
@ -94,9 +96,9 @@ Please fill the variables in `./slurm_alloc.sh`.
|
||||
./slurm_init_containers.sh
|
||||
```
|
||||
|
||||
It uses the image recorded in `../../jenkins/current_image_tags.properties`. The image will be downloaded to `../../enroot/` for once.
|
||||
This script uses the image recorded in `../../jenkins/current_image_tags.properties`. The image will be downloaded to `../../enroot/` once.
|
||||
|
||||
> Tips: If you want to change the image, no need to reallocate Slurm jobs. Just start another container by running step 2 with `export CONTAINER_NAME=aaa`, and step 3 will run in the container specified by the `CONTAINER_NAME` env.
|
||||
> Tip: If you want to change the image, there is no need to reallocate Slurm jobs. Just start another container by running Step 2 with `export CONTAINER_NAME=<new_name>`, and Step 3 will run in the container specified by the `CONTAINER_NAME` environment variable.
|
||||
|
||||
**(Optional) Get an interactive shell**
|
||||
|
||||
@ -117,7 +119,7 @@ python3 scripts/build_wheel.py --cuda_architectures native --no-venv --skip_buil
|
||||
**Step 3:** Run benchmarks to generate profiles. Run the following command on the controller node, where `NODES` ≤ the number of allocated nodes:
|
||||
|
||||
```bash
|
||||
# Run DeepSeek-R1 NVFP4 with wide ep: uses MNNVL A2A if applicable
|
||||
# Run DeepSeek-R1 NVFP4 with wide EP; uses MNNVL A2A if applicable
|
||||
NODES=4 NP=16 ./slurm_launch.sh ./run.sh config_gen.yaml --moe-backend WIDEEP
|
||||
|
||||
# Run with TRTLLMGen
|
||||
@ -126,7 +128,7 @@ NODES=4 NP=16 ./slurm_launch.sh ./run.sh config_gen.yaml --moe-backend TRTLLM
|
||||
# Run with DeepEPLowLatency
|
||||
NODES=4 NP=16 TRTLLM_FORCE_ALLTOALL_METHOD=DeepEPLowLatency ./slurm_launch.sh ./run.sh config_gen.yaml --moe-backend WIDEEP
|
||||
|
||||
# You can run 4-GPU and 8-GPU tasks without reallocating the slurm job
|
||||
# You can run 4-GPU and 8-GPU tasks without reallocating the Slurm job
|
||||
NODES=1 NP=4 ./slurm_launch.sh ./run.sh config_ctx.yaml
|
||||
NODES=2 NP=8 ./slurm_launch.sh ./run.sh config_gen.yaml
|
||||
```
|
||||
@ -141,7 +143,7 @@ Supported list arguments:
|
||||
- `--seq-len-kv-cache` (or `seq_len_kv_cache` in YAML)
|
||||
- `--balance-ratio` (or `balance_ratio` in YAML)
|
||||
|
||||
Command line arguments are comma separated, for example, `--batch-size 1,2,4`. Configs in the YAML file are lists, for example, `batch_size: [1, 2, 4]`.
|
||||
Command-line arguments are comma-separated, for example, `--batch-size 1,2,4`. Values in the YAML file are lists, for example, `batch_size: [1, 2, 4]`.
|
||||
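As an illustration of the two notations, the sketch below parses a comma-separated command-line value and a YAML list into the same Python list. It borrows the `comma_separated_ints` idea from `parse_e2e.py` later in this commit; `run.py`'s actual argument handling may differ.

```python
# Illustrative only: a comma-separated CLI value and a YAML list express
# the same sweep. Requires PyYAML.
import argparse

import yaml


def comma_separated_ints(s: str) -> list[int]:
    return [int(x) for x in s.split(",")]


parser = argparse.ArgumentParser()
parser.add_argument("--batch-size", type=comma_separated_ints)
cli_args = parser.parse_args(["--batch-size", "1,2,4"])

yaml_cfg = yaml.safe_load("batch_size: [1, 2, 4]")
assert cli_args.batch_size == yaml_cfg["batch_size"] == [1, 2, 4]
```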
|
||||
Run with OpenMPI:
|
||||
|
||||
@ -166,18 +168,19 @@ python3 parse.py --profile-dir ./profiles --world-size 4 --rank 0
|
||||
python3 parse.py --world-size 4 --module MoE
|
||||
```
|
||||
|
||||
You will receive three reports, each containing kernel timing statistics grouped by module:
|
||||
You will receive four reports, each containing kernel timing statistics grouped by module:
|
||||
1. A printed report on stdout
|
||||
2. A CSV report at `profiles/report_np4_rank0.csv`
|
||||
3. An HTML report at `profiles/report_np4_rank0.html`
|
||||
4. A JSON report at `profiles/report_np4_rank0.json` (for correlation analysis)
|
||||
|
||||
## Performance alignment between end-to-end performance and layer-wise benchmarks
|
||||
|
||||
An overall example can be found in `sample_performance_alignment.sh`. Here is an abstract of the main steps.
|
||||
A complete example can be found in `sample_performance_alignment.sh`. Below is an overview of the main steps.
|
||||
|
||||
1. Run end-to-end serving in **COLLECT** mode, and capture nsys profiles. This step generates a calibration file.
|
||||
1. Run end-to-end serving in **COLLECT** mode and capture nsys profiles. This step generates a calibration file.
|
||||
|
||||
Please meet the following requirements.
|
||||
Requirements:
|
||||
|
||||
1. Add the following fields to `config.yaml`.
|
||||
|
||||
@ -187,13 +190,13 @@ An overall example can be found in `sample_performance_alignment.sh`. Here is an
|
||||
calibration_file_path: profiles/calibration_data.json
|
||||
```
|
||||
|
||||
2. Set `TLLM_PROFILE_START_STOP` to a range that can capture some iterations (typically tens of iterations) of GEN phase. Ensure every iteration has the same batch size. Please capture 5 more iterations at beginning, because the first 5 iterations are regarded as warm-ups and will be dropped by the parser by default.
|
||||
2. Set `TLLM_PROFILE_START_STOP` to a range that captures some iterations (typically tens of iterations) of the GEN phase. Ensure that every iteration has the same batch size. Capture 5 extra iterations at the beginning, because the first 5 iterations are treated as warm-ups and will be dropped by the parser by default.
|
||||
|
||||
3. Capture per-rank nsys profiles, and every rank should produce a separate file.
|
||||
3. Capture per-rank nsys profiles; each rank should produce a separate file.
|
||||
|
||||
You need to put `nsys profile` behind `mpirun` or `srun`. To minimize profile overhead and file size, there is no need to capture samples and GPU metrics.
|
||||
Place `nsys profile` after `mpirun` or `srun`. To minimize profiling overhead and file size, there is no need to capture samples or GPU metrics.
|
||||
|
||||
If you use `trtllm-serve` or `trtllm-bench`, please follow the following command order. If you use `examples/disaggregated/slurm/benchmark/submit.py`, setting `gen_profile_range` is enough.
|
||||
If you use `trtllm-serve` or `trtllm-bench`, use the following command order. If you use `examples/disaggregated/slurm/benchmark/submit.py`, setting `gen_profile_range` is sufficient.
|
||||
|
||||
```bash
|
||||
NP=$NP ./mpi_launch.sh middleware/mpi_env_from_ompi \
|
||||
@ -209,11 +212,11 @@ An overall example can be found in `sample_performance_alignment.sh`. Here is an
|
||||
--model ...
|
||||
```
|
||||
|
||||
4. To be more precise, set the same `TLLM_AUTOTUNER_CACHE_PATH` for all the steps. The autotuner cache file should be generated by Step 1, and be reused by Step 2 and Step 3.
|
||||
4. For more accurate results, set the same `TLLM_AUTOTUNER_CACHE_PATH` for all steps. The autotuner cache file should be generated in Step 1 and reused in Steps 2 and 3.
|
||||
|
||||
2. If the end-to-end serving uses CUDA Graphs, run Step 1 again in **MARK** mode without CUDA Graphs, and also capture nsys profiles.
|
||||
2. If the end-to-end serving uses CUDA Graphs, run Step 1 again in **MARK** mode without CUDA Graphs and capture nsys profiles.
|
||||
|
||||
The differences are as follows.
|
||||
The differences from Step 1 are as follows:
|
||||
|
||||
1. Add the following fields to `config.yaml`.
|
||||
|
||||
@ -241,18 +244,18 @@ An overall example can be found in `sample_performance_alignment.sh`. Here is an
|
||||
--replay-stop 67
|
||||
```
|
||||
|
||||
Here are explanations of every argument:
|
||||
Argument explanations:
|
||||
|
||||
| Argument/Parameter | Explanation |
|
||||
|-------------------|-------------|
|
||||
| ------------------ | ----------- |
|
||||
| `NP=4` | Should match the end-to-end run. |
|
||||
| `--load-format AUTO` | Instruct the benchmark to load model weights instead of initializing random weights. |
|
||||
| `--layer-indices 5,6,7` | A list of contiguous layers you want to calibrate. |
|
||||
| `--load-format AUTO` | Instructs the benchmark to load model weights instead of using random weights. |
|
||||
| `--layer-indices 5,6,7` | A list of contiguous layers to calibrate. |
|
||||
| `--batch-size 32` | Should match the end-to-end run. |
|
||||
| `--seq-len-q 1` | Should match (1+MTP) of the end-to-end run. |
|
||||
| `--seq-len-kv-cache 2090` | Estimation of the average context length for iterations you captured. The first 5 iterations should be excluded from the estimation, because they will be dropped by parser. |
|
||||
| `--replay-file-path` | The calibration file obtained by Step 1. |
|
||||
| `--replay-start` and `--replay-stop` | Should match the end-to-end `TLLM_PROFILE_START_STOP`. Do not replay the first 5 iterations, because they will be dropped by parser. |
|
||||
| `--seq-len-q 1` | Should match (1 + MTP) of the end-to-end run. |
|
||||
| `--seq-len-kv-cache 2090` | An estimate of the average context length for the captured iterations (a sketch of this estimate follows the table). The first 5 iterations should be excluded from this estimate because they will be dropped by the parser. |
|
||||
| `--replay-file-path` | The calibration file obtained from Step 1. |
|
||||
| `--replay-start` and `--replay-stop` | Should match the end-to-end `TLLM_PROFILE_START_STOP`. Do not replay the first 5 iterations because they will be dropped by the parser. |
|
||||
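As a rough way to derive the `--seq-len-kv-cache` value above: with a fixed input length, the KV length at iteration *i* is roughly `input_len + i * (1 + MTP)`, so the average over the captured non-warm-up iterations can be approximated as in this sketch. Treat it as an approximation under those assumptions (fixed input length, iteration counter roughly tracking decode steps), not an exact rule or a project utility.

```python
# Hypothetical estimate of --seq-len-kv-cache (not part of this repository).
# Assumes a fixed input length and that every request appends (1 + MTP)
# tokens per iteration, so the KV length grows linearly with the iteration.
def estimate_seq_len_kv_cache(input_len: int, profile_start: int, profile_stop: int,
                              mtp: int = 0, warmup_iters: int = 5) -> int:
    # The parser drops the first `warmup_iters` captured iterations,
    # so the average should start after them.
    first = profile_start + warmup_iters
    iters = range(first, profile_stop)
    avg_decoded = sum(iters) / len(iters) * (1 + mtp)
    return round(input_len + avg_decoded)


# Example: TLLM_PROFILE_START_STOP=30-70, input length 2048, MTP disabled.
print(estimate_seq_len_kv_cache(2048, 30, 70))
```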
|
||||
4. Parse end-to-end profiles with `parse_e2e.py`, and parse layer-wise benchmark profiles with `parse.py`.
|
||||
|
||||
@ -278,30 +281,30 @@ An overall example can be found in `sample_performance_alignment.sh`. Here is an
|
||||
-o profiles/correlation.html
|
||||
```
|
||||
|
||||
Please find `profiles/correlation.html` for the report.
|
||||
The report can be found at `profiles/correlation.html`.
|
||||
|
||||
Limitations:
|
||||
|
||||
1. Pipeline parallelism is not supported.
|
||||
2. MoE backends CUTLASS and WIDEEP are supported.
|
||||
3. Only tested with GEN phase and attention DP.
|
||||
2. Only the CUTLASS and WIDEEP MoE backends are supported.
|
||||
3. Only tested with the GEN phase and attention DP.
|
||||
|
||||
## Developer utilities
|
||||
|
||||
1. Less startup time when debug a model
|
||||
1. Reduce startup time when debugging a model
|
||||
1. Set autotuner cache or disable autotuner
|
||||
1. Set autotuner cache: add `TLLM_AUTOTUNER_CACHE_PATH=autotuner_cache/cache` environment variable. This is enabled at your own risk, and you may need to delete the cache if `NP` changes or the code changes
|
||||
2. Disable autotuner: add `--no-enable-autotuner` option
|
||||
2. Disable nsys profile: set `PROFILE=0` environment variable
|
||||
1. Set autotuner cache: set the `TLLM_AUTOTUNER_CACHE_PATH=autotuner_cache/cache` environment variable. Use this at your own risk; you may need to delete the cache if `NP` changes or the code changes
|
||||
2. Disable autotuner: add the `--no-enable-autotuner` option
|
||||
2. Disable nsys profiling: set the `PROFILE=0` environment variable
|
||||
2. Capture more information
|
||||
1. Enable GPU metrics: set `GPU_METRICS=1` environment variable
|
||||
2. Enable backtrace: set `BACKTRACE=1` environment variable
|
||||
1. Enable GPU metrics: set the `GPU_METRICS=1` environment variable
|
||||
2. Enable backtrace: set the `BACKTRACE=1` environment variable
|
||||
|
||||
## Trouble shooting
|
||||
## Troubleshooting
|
||||
|
||||
1. Error `fp8 blockscale gemm only support Hopper` on Blackwell.
|
||||
|
||||
The default MoE backend "CUTLASS" does not support FP8 weights. Please choose the same MoE backend as your end-to-end config. A typical choice is adding `--moe-backend DEEPGEMM` (or `TRTLLM`, `WIDEEP`) and `--moe-backend-for-prefill DEEPGEMM` (or `WIDEEP`) option.
|
||||
The default MoE backend "CUTLASS" does not support FP8 weights. Please choose the same MoE backend as your end-to-end config. A typical solution is to add the `--moe-backend DEEPGEMM` (or `TRTLLM`, `WIDEEP`) and `--moe-backend-for-prefill DEEPGEMM` (or `WIDEEP`) options.
|
||||
|
||||
2. Error `huggingface_hub.errors.HfHubHTTPError: 429 Client Error: Too Many Requests for url: https://huggingface.co/nvidia/DeepSeek-R1-0528-FP4-v2/resolve/main/config.json`.
|
||||
|
||||
|
||||
@ -5,10 +5,10 @@ set -euo pipefail
|
||||
# Clear slurm envs
|
||||
unset $(env | awk -F'=' '{print $1}' | (grep -E "SLURM_|SLURMD_|slurm_|MPI_|PMIX_" || true))
|
||||
|
||||
extra_args=
|
||||
extra_args=()
|
||||
if [ -v TLLM_AUTOTUNER_CACHE_PATH ]; then
|
||||
extra_args+="-x TLLM_AUTOTUNER_CACHE_PATH"
|
||||
extra_args+=(-x TLLM_AUTOTUNER_CACHE_PATH)
|
||||
fi
|
||||
|
||||
set -x
|
||||
mpirun --allow-run-as-root --np ${NP} $extra_args "$@"
|
||||
mpirun --allow-run-as-root --np $NP ${extra_args[@]+"${extra_args[@]}"} "$@"
|
||||
|
||||
@ -5,6 +5,7 @@ import json
|
||||
import re
|
||||
import sqlite3
|
||||
from pathlib import Path
|
||||
from typing import NamedTuple
|
||||
|
||||
import jinja2
|
||||
import numpy as np
|
||||
@ -16,6 +17,50 @@ from parser_utils import (
|
||||
warned_names,
|
||||
)
|
||||
|
||||
|
||||
class NvtxRange(NamedTuple):
|
||||
"""Represents an NVTX range with start/end times and text label."""
|
||||
|
||||
start: int
|
||||
end: int
|
||||
text: str
|
||||
|
||||
|
||||
class KernelRecord(NamedTuple):
|
||||
"""Represents a kernel record from the database query.
|
||||
|
||||
Used for sorting and grouping kernels by runtime and capture time.
|
||||
"""
|
||||
|
||||
problem_id: int
|
||||
run_id: int
|
||||
range_names: tuple[str, ...]
|
||||
kernel_start: int
|
||||
kernel_end: int
|
||||
demangled_name: int # String ID reference
|
||||
runtime_start: int
|
||||
capture_start: int
|
||||
|
||||
|
||||
class KernelTiming(NamedTuple):
|
||||
"""Represents a kernel's timing within a run.
|
||||
|
||||
Used after sorting and grouping for per-run analysis.
|
||||
"""
|
||||
|
||||
demangled_name: int # String ID reference
|
||||
kernel_start: int
|
||||
kernel_end: int
|
||||
range_names: tuple[str, ...]
|
||||
|
||||
|
||||
class CategoryTime(NamedTuple):
|
||||
"""Represents a category (hierarchical path) and its associated time."""
|
||||
|
||||
category: tuple[str, ...]
|
||||
time_ns: float
|
||||
|
||||
|
||||
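# These NamedTuples replace the bare tuples used previously so that the code
# below can sort and group records by field name instead of positional index.
# For example, kernel records are later ordered by (runtime_start, capture_start):
# kernels launched from the same CUDA graph share a runtime_start (the graph
# launch time), and capture_start breaks the tie in original capture order.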
# Parse cmdline
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--file-path", type=str)
|
||||
@ -71,19 +116,19 @@ query = """SELECT T1.start, T2.value AS text
|
||||
JOIN StringIds AS T2 ON T1.textId = T2.id
|
||||
WHERE eventType = ? AND T2.value LIKE ?"""
|
||||
df = pd.read_sql_query(query, conn, params=(event_id_NvtxPushPopRange, "layer_wise_benchmarks %"))
|
||||
problem_start = []
|
||||
problem_set = []
|
||||
problem_start_times: list[int] = []
|
||||
problem_set: list[dict] = []
|
||||
for start, text in df.itertuples(index=False):
|
||||
if text.startswith("layer_wise_benchmarks args {"):
|
||||
run_args = json.loads(text[len("layer_wise_benchmarks args") :])
|
||||
elif text.startswith("layer_wise_benchmarks problem_spec {"):
|
||||
problem_start.append(start)
|
||||
problem_start_times.append(start)
|
||||
problem_set.append(
|
||||
{
|
||||
"spec": json.loads(text[len("layer_wise_benchmarks problem_spec ") :]),
|
||||
"text": "",
|
||||
"runs": [],
|
||||
"runs_end": [],
|
||||
"run_starts": [],
|
||||
"run_ends": [],
|
||||
"ranges": [],
|
||||
"kernel_count_per_range": [],
|
||||
}
|
||||
@ -99,7 +144,7 @@ df = pd.read_sql_query(
|
||||
params=(event_id_NvtxPushPopRange, "[DG]%", nccl_domain_id),
|
||||
)
|
||||
for start, end, text in df.itertuples(index=False):
|
||||
problem_id = bisect.bisect(problem_start, start) - 1
|
||||
problem_id = bisect.bisect(problem_start_times, start) - 1
|
||||
if text.startswith("layer_wise_benchmarks "):
|
||||
if text != "layer_wise_benchmarks ignore":
|
||||
continue
|
||||
@ -107,10 +152,10 @@ for start, end, text in df.itertuples(index=False):
|
||||
assert problem_id != -1
|
||||
if re.match(r"b=\d+ s=\d+ ", text):
|
||||
problem_set[problem_id]["text"] = text
|
||||
problem_set[problem_id]["runs"].append(start)
|
||||
problem_set[problem_id]["runs_end"].append(end)
|
||||
problem_set[problem_id]["run_starts"].append(start)
|
||||
problem_set[problem_id]["run_ends"].append(end)
|
||||
else:
|
||||
problem_set[problem_id]["ranges"].append((start, end, text))
|
||||
problem_set[problem_id]["ranges"].append(NvtxRange(start, end, text))
|
||||
problem_set[problem_id]["kernel_count_per_range"].append(0)
|
||||
|
||||
query = """SELECT name FROM sqlite_master WHERE type = ?"""
|
||||
@ -127,16 +172,14 @@ if "CUPTI_ACTIVITY_KIND_MEMSET" in tables:
|
||||
SELECT T3.start, T3.end, -3 AS demangledName, T3.correlationId, T3.graphNodeId
|
||||
FROM CUPTI_ACTIVITY_KIND_MEMSET AS T3"""
|
||||
query = f"""SELECT unified.start, unified.end, unified.demangledName,
|
||||
R.start AS runtime_start, R.end AS runtime_end,
|
||||
R.start AS capture_start, R.end AS capture_end
|
||||
R.start AS runtime_start, R.start AS capture_start, R.end AS capture_end
|
||||
FROM ({unified_subquery}) AS unified
|
||||
JOIN CUPTI_ACTIVITY_KIND_RUNTIME AS R ON unified.correlationId = R.correlationId
|
||||
WHERE unified.graphNodeId IS NULL"""
|
||||
if "CUDA_GRAPH_NODE_EVENTS" in tables:
|
||||
query += f""" UNION ALL
|
||||
SELECT unified.start, unified.end, unified.demangledName,
|
||||
R.start AS runtime_start, R.end AS runtime_end,
|
||||
CGE2.start AS capture_start, CGE2.end AS capture_end
|
||||
R.start AS runtime_start, CGE2.start AS capture_start, CGE2.end AS capture_end
|
||||
FROM ({unified_subquery}) AS unified
|
||||
JOIN CUPTI_ACTIVITY_KIND_RUNTIME AS R ON unified.graphNodeId IS NOT NULL AND
|
||||
unified.correlationId = R.correlationId
|
||||
@ -144,44 +187,41 @@ if "CUDA_GRAPH_NODE_EVENTS" in tables:
|
||||
CGE1.originalGraphNodeId IS NOT NULL
|
||||
LEFT JOIN CUDA_GRAPH_NODE_EVENTS AS CGE2 ON CGE1.originalGraphNodeId = CGE2.graphNodeId"""
|
||||
df = pd.read_sql_query(query, conn)
|
||||
kernel_list = []
|
||||
kernel_records: list[KernelRecord] = []
|
||||
for (
|
||||
start,
|
||||
end,
|
||||
demangledName,
|
||||
kernel_start,
|
||||
kernel_end,
|
||||
demangled_name,
|
||||
runtime_start,
|
||||
runtime_end,
|
||||
capture_start,
|
||||
capture_end,
|
||||
) in df.itertuples(index=False):
|
||||
problem_id = bisect.bisect(problem_start, start) - 1
|
||||
problem_id = bisect.bisect(problem_start_times, kernel_start) - 1
|
||||
problem = problem_set[problem_id]
|
||||
run_id = bisect.bisect(problem["runs"], runtime_start) - 1
|
||||
if run_id == -1 or runtime_start >= problem["runs_end"][run_id]:
|
||||
run_id = bisect.bisect(problem["run_starts"], runtime_start) - 1
|
||||
if run_id == -1 or runtime_start >= problem["run_ends"][run_id]:
|
||||
continue
|
||||
ranges = [
|
||||
matching_range_indices = [
|
||||
i
|
||||
for i, (range_start, range_end, text) in enumerate(problem["ranges"])
|
||||
if capture_start >= range_start and capture_end <= range_end
|
||||
for i, nvtx_range in enumerate(problem["ranges"])
|
||||
if capture_start >= nvtx_range.start and capture_end <= nvtx_range.end
|
||||
]
|
||||
for range_id in ranges:
|
||||
problem["kernel_count_per_range"][range_id] += 1
|
||||
range_names = [problem["ranges"][i][2] for i in ranges]
|
||||
for range_idx in matching_range_indices:
|
||||
problem["kernel_count_per_range"][range_idx] += 1
|
||||
range_names = tuple(problem["ranges"][i].text for i in matching_range_indices)
|
||||
if (
|
||||
args.module is None or args.module in range_names
|
||||
) and "layer_wise_benchmarks ignore" not in range_names:
|
||||
kernel_list.append(
|
||||
(
|
||||
problem_id,
|
||||
run_id,
|
||||
range_names,
|
||||
start,
|
||||
end,
|
||||
demangledName,
|
||||
runtime_start,
|
||||
runtime_end,
|
||||
capture_start,
|
||||
capture_end,
|
||||
kernel_records.append(
|
||||
KernelRecord(
|
||||
problem_id=problem_id,
|
||||
run_id=run_id,
|
||||
range_names=range_names,
|
||||
kernel_start=kernel_start,
|
||||
kernel_end=kernel_end,
|
||||
demangled_name=demangled_name,
|
||||
runtime_start=runtime_start,
|
||||
capture_start=capture_start,
|
||||
)
|
||||
)
|
||||
|
||||
@ -195,12 +235,10 @@ conn.close()
|
||||
# Check ambiguous modules
|
||||
if args.module:
|
||||
for problem in problem_set:
|
||||
num_matches_per_run = [0] * (len(problem["runs"]) + 1)
|
||||
for (range_start, _, text), kernel_count in zip(
|
||||
problem["ranges"], problem["kernel_count_per_range"]
|
||||
):
|
||||
if text == args.module and kernel_count > 0:
|
||||
num_matches_per_run[bisect.bisect(problem["runs"], range_start)] += 1
|
||||
num_matches_per_run = [0] * (len(problem["run_starts"]) + 1)
|
||||
for nvtx_range, kernel_count in zip(problem["ranges"], problem["kernel_count_per_range"]):
|
||||
if nvtx_range.text == args.module and kernel_count > 0:
|
||||
num_matches_per_run[bisect.bisect(problem["run_starts"], nvtx_range.start)] += 1
|
||||
for run_id_plus_one, num_matches in enumerate(num_matches_per_run):
|
||||
if num_matches > 1:
|
||||
raise ValueError(
|
||||
@ -208,72 +246,70 @@ if args.module:
|
||||
f' in "{problem["text"]}"\'s {run_id_plus_one}-th run'
|
||||
)
|
||||
|
||||
kernel_list.sort(key=lambda t: (t[6], t[8]))
|
||||
kernels = [[[] for _ in problem["runs"]] for problem in problem_set]
|
||||
for (
|
||||
problem_id,
|
||||
run_id,
|
||||
ranges,
|
||||
start,
|
||||
end,
|
||||
demangledName,
|
||||
runtime_start,
|
||||
runtime_end,
|
||||
capture_start,
|
||||
capture_end,
|
||||
) in kernel_list:
|
||||
kernels[problem_id][run_id].append((demangledName, start, end, ranges))
|
||||
for problem_id in range(len(kernels)):
|
||||
required_seq = [demangledName for demangledName, _, _, _ in kernels[problem_id][0]]
|
||||
for run_id in range(len(kernels[problem_id])):
|
||||
seq = [demangledName for demangledName, _, _, _ in kernels[problem_id][run_id]]
|
||||
kernel_records.sort(key=lambda rec: (rec.runtime_start, rec.capture_start))
|
||||
kernels_per_problem: list[list[list[KernelTiming]]] = [
|
||||
[[] for _ in problem["run_starts"]] for problem in problem_set
|
||||
]
|
||||
for rec in kernel_records:
|
||||
kernels_per_problem[rec.problem_id][rec.run_id].append(
|
||||
KernelTiming(
|
||||
demangled_name=rec.demangled_name,
|
||||
kernel_start=rec.kernel_start,
|
||||
kernel_end=rec.kernel_end,
|
||||
range_names=rec.range_names,
|
||||
)
|
||||
)
|
||||
for problem_id, runs in enumerate(kernels_per_problem):
|
||||
required_seq = [kernel.demangled_name for kernel in runs[0]]
|
||||
for run_id, run in enumerate(runs):
|
||||
seq = [kernel.demangled_name for kernel in run]
|
||||
assert seq == required_seq
|
||||
|
||||
converted_seqs = []
|
||||
converted_seqs: list[list[CategoryTime]] = []
|
||||
warmup_times = run_args["warmup_times"] if args.warmup_times is None else args.warmup_times
|
||||
for runs in kernels:
|
||||
converted_seq = []
|
||||
for runs in kernels_per_problem:
|
||||
converted_seq: list[CategoryTime] = []
|
||||
# Kernel time
|
||||
for i, (demangledName, _, _, ranges) in enumerate(runs[0]):
|
||||
name = kernel_short_name(string_ids[demangledName])
|
||||
category = (*ranges, name)
|
||||
time_list = [run[i][2] - run[i][1] for run in runs]
|
||||
t = np.mean(time_list[warmup_times:]).tolist()
|
||||
converted_seq.append((category, t))
|
||||
for i, kernel in enumerate(runs[0]):
|
||||
name = kernel_short_name(string_ids[kernel.demangled_name])
|
||||
category = (*kernel.range_names, name)
|
||||
time_list = [run[i].kernel_end - run[i].kernel_start for run in runs]
|
||||
time_ns = np.mean(time_list[warmup_times:]).tolist()
|
||||
converted_seq.append(CategoryTime(category, time_ns))
|
||||
# Space and Overlap
|
||||
overlap_list = []
|
||||
space_list = []
|
||||
for run in runs:
|
||||
sorted_run = sorted(run, key=lambda op: op[1])
|
||||
last_end = sorted_run[0][1]
|
||||
sorted_run = sorted(run, key=lambda k: k.kernel_start)
|
||||
last_end = sorted_run[0].kernel_start
|
||||
overlap_time = 0
|
||||
space_time = 0
|
||||
for _, start, end, _ in sorted_run:
|
||||
if start > last_end:
|
||||
space_time += start - last_end
|
||||
for kernel in sorted_run:
|
||||
if kernel.kernel_start > last_end:
|
||||
space_time += kernel.kernel_start - last_end
|
||||
else:
|
||||
overlap_time += min(last_end, end) - start
|
||||
last_end = max(last_end, end)
|
||||
overlap_time += min(last_end, kernel.kernel_end) - kernel.kernel_start
|
||||
last_end = max(last_end, kernel.kernel_end)
|
||||
overlap_list.append(-overlap_time)
|
||||
space_list.append(space_time)
|
||||
converted_seq.append((("Overlap",), np.mean(overlap_list[warmup_times:]).tolist()))
|
||||
converted_seq.append((("Space",), np.mean(space_list[warmup_times:]).tolist()))
|
||||
converted_seq.append((("Total",), sum(t for _, t in converted_seq)))
|
||||
converted_seq.append(CategoryTime(("Overlap",), np.mean(overlap_list[warmup_times:]).tolist()))
|
||||
converted_seq.append(CategoryTime(("Space",), np.mean(space_list[warmup_times:]).tolist()))
|
||||
converted_seq.append(CategoryTime(("Total",), sum(ct.time_ns for ct in converted_seq)))
|
||||
converted_seqs.append(converted_seq)
|
||||
if args.error_on_unknown_kernel and warned_names:
|
||||
raise ValueError("Unknown kernel names encountered")
|
||||
|
||||
merged_title = []
|
||||
merged_title: list[tuple[str, ...]] = []
|
||||
for converted_seq in converted_seqs:
|
||||
title = [name for name, _ in converted_seq]
|
||||
title = [ct.category for ct in converted_seq]
|
||||
merged_title = shortest_common_supersequence(merged_title, title)
|
||||
|
||||
merged_data = [[0.0] * len(problem_set) for _ in merged_title]
|
||||
merged_data: list[list[float]] = [[0.0] * len(problem_set) for _ in merged_title]
|
||||
for problem_id, converted_seq in enumerate(converted_seqs):
|
||||
cur = 0
|
||||
for category, t in converted_seq:
|
||||
cur = merged_title.index(category, cur)
|
||||
merged_data[cur][problem_id] = t
|
||||
for ct in converted_seq:
|
||||
cur = merged_title.index(ct.category, cur)
|
||||
merged_data[cur][problem_id] = ct.time_ns
|
||||
cur += 1
|
||||
|
||||
print("Run args:")
|
||||
@ -282,14 +318,14 @@ print(run_args)
|
||||
print("Problem set:")
|
||||
for problem in problem_set:
|
||||
print(
|
||||
f'- "{problem["text"]}" {len(problem["runs"])} runs'
|
||||
f" Ranges: [{', '.join(text for _, end, text in problem['ranges'] if end <= problem['runs_end'][0])}]"
|
||||
f'- "{problem["text"]}" {len(problem["run_starts"])} runs'
|
||||
f" Ranges: [{', '.join(r.text for r in problem['ranges'] if r.end <= problem['run_ends'][0])}]"
|
||||
)
|
||||
|
||||
stack = []
|
||||
csv_data = [["", *[problem["text"] for problem in problem_set]]]
|
||||
js_data = []
|
||||
js_stack = [js_data]
|
||||
stack: list[str] = []
|
||||
csv_data: list[list[str]] = [["", *[problem["text"] for problem in problem_set]]]
|
||||
js_data: list[dict] = []
|
||||
js_stack: list[list[dict]] = [js_data]
|
||||
max_title_len = max((len(title) - 1) * 3 + len(title[-1][:40]) for title in merged_title)
|
||||
print("-" * (max_title_len + 1 + 6 * len(problem_set)))
|
||||
for title, time_data in zip(merged_title, merged_data):
|
||||
@ -330,8 +366,7 @@ with csv_file_path.open("w", newline="") as f:
|
||||
csv_writer = csv.writer(f, quoting=csv.QUOTE_MINIMAL)
|
||||
for row in csv_data:
|
||||
csv_writer.writerow(row)
|
||||
js_header_config = [{"name": problem["text"]} for problem in problem_set]
|
||||
js_header_config = []
|
||||
js_header_config: list[dict] = []
|
||||
for problem in problem_set:
|
||||
innermost_children = js_header_config
|
||||
for k, msg_prefix in [
|
||||
@ -353,35 +388,35 @@ for problem in problem_set:
|
||||
loader = jinja2.FileSystemLoader(Path(__file__).parent)
|
||||
template = jinja2.Environment(loader=loader).get_template("breakdown_template.html")
|
||||
with html_file_path.open("w") as f:
|
||||
configText = (
|
||||
config_text = (
|
||||
"Run:\n"
|
||||
+ json.dumps(run_args, indent=4)
|
||||
+ "\n\nParse:\n"
|
||||
+ json.dumps(args.__dict__, indent=4)
|
||||
)
|
||||
f.write(template.render(headerConfig=js_header_config, rawData=js_data, configText=configText))
|
||||
f.write(template.render(headerConfig=js_header_config, rawData=js_data, configText=config_text))
|
||||
|
||||
if args.query is not None:
|
||||
print("Query:")
|
||||
for query in args.query.split(","):
|
||||
query = query.strip()
|
||||
for query_str in args.query.split(","):
|
||||
query_str = query_str.strip()
|
||||
query_matched = [0.0] * len(problem_set)
|
||||
for title, time_data in zip(merged_title, merged_data):
|
||||
if query in ".".join(title):
|
||||
if query_str in ".".join(title):
|
||||
for i, x in enumerate(time_data):
|
||||
query_matched[i] += x
|
||||
print(
|
||||
query + " " * (max_title_len - len(query)),
|
||||
query_str + " " * (max_title_len - len(query_str)),
|
||||
*[f"{x / 1000:-6.1f}" for x in query_matched],
|
||||
)
|
||||
|
||||
correlation = []
|
||||
for problem, runs in zip(problem_set, kernels):
|
||||
timeline = []
|
||||
for i, (demangledName, _, _, _) in enumerate(runs[0]):
|
||||
name = string_ids[demangledName]
|
||||
duration_list = [run[i][2] - run[i][1] for run in runs]
|
||||
end_list = [run[i][2] - run[0][1] for run in runs]
|
||||
correlation: list[dict] = []
|
||||
for problem, runs in zip(problem_set, kernels_per_problem):
|
||||
timeline: list[dict] = []
|
||||
for i, kernel in enumerate(runs[0]):
|
||||
name = string_ids[kernel.demangled_name]
|
||||
duration_list = [run[i].kernel_end - run[i].kernel_start for run in runs]
|
||||
end_list = [run[i].kernel_end - run[0].kernel_start for run in runs]
|
||||
timeline.append(
|
||||
{
|
||||
"name": name,
|
||||
|
||||
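The parse.py changes above keep the existing "Overlap" and "Space" bookkeeping: within each run, kernels are swept in start order, idle gaps accumulate into Space and concurrent execution into Overlap (negated when appended, so the "Total" row approximates the wall-clock span of the run). A minimal standalone sketch of that sweep on toy intervals, not tied to an nsys export:

```python
# Standalone illustration of the Overlap/Space sweep used in parse.py,
# applied to toy (start, end) intervals instead of kernel records.
from typing import NamedTuple


class Interval(NamedTuple):
    start: int
    end: int


def overlap_and_space(kernels: list[Interval]) -> tuple[int, int]:
    run = sorted(kernels, key=lambda k: k.start)
    last_end = run[0].start
    overlap = space = 0
    for k in run:
        if k.start > last_end:
            space += k.start - last_end                # idle gap on the timeline
        else:
            overlap += min(last_end, k.end) - k.start  # concurrent execution
        last_end = max(last_end, k.end)
    return overlap, space


# Two overlapping kernels followed by a gap: 2 units of overlap, 8 of space.
print(overlap_and_space([Interval(0, 10), Interval(8, 12), Interval(20, 25)]))
```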
@ -4,6 +4,7 @@ import json
|
||||
import re
|
||||
import sqlite3
|
||||
from pathlib import Path
|
||||
from typing import NamedTuple
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
@ -15,7 +16,36 @@ from parser_utils import (
|
||||
)
|
||||
|
||||
|
||||
def comma_separated_ints(s):
|
||||
class IterInfo(NamedTuple):
|
||||
"""Represents an iteration's timing information."""
|
||||
|
||||
start: int
|
||||
end: int
|
||||
iter_id: int
|
||||
|
||||
|
||||
class LayerInfo(NamedTuple):
|
||||
"""Represents a layer's timing information within an iteration."""
|
||||
|
||||
start: int
|
||||
end: int
|
||||
layer_idx: int
|
||||
|
||||
|
||||
class KernelQueryResult(NamedTuple):
|
||||
"""Represents a kernel query result for e2e parsing.
|
||||
|
||||
Sorted by runtime_start for consistent ordering.
|
||||
"""
|
||||
|
||||
runtime_start: int
|
||||
graph_node_id: int | None
|
||||
kernel_start: int
|
||||
kernel_end: int
|
||||
demangled_name: int # String ID reference
|
||||
|
||||
|
||||
def comma_separated_ints(s: str) -> list[int]:
|
||||
return [int(x) for x in s.split(",")]
|
||||
|
||||
|
||||
@ -41,12 +71,12 @@ if args.graph_trace is not None and not args.graph_trace.endswith(".nsys-rep"):
|
||||
print(args)
|
||||
|
||||
|
||||
def is_gemm(name):
|
||||
def is_gemm(name: str) -> bool:
|
||||
return "nvjet" in name or "gemm" in name.lower()
|
||||
|
||||
|
||||
eager_nsys_rep_file_path = Path(args.eager_trace)
|
||||
# For CTX phase which does not use CUDA Graphs, analysis the eager trace instead.
|
||||
# For CTX phase which does not use CUDA Graphs, analyze the eager trace instead.
|
||||
# Here we do not change the identifier name "graph_*" for convenience.
|
||||
graph_nsys_rep_file_path = Path(args.graph_trace or args.eager_trace)
|
||||
eager_sqlite_file_path = eager_nsys_rep_file_path.parent / (
|
||||
@ -89,47 +119,47 @@ if target_gen_reqs is None:
|
||||
else:
|
||||
target_gen_reqs = 0
|
||||
print(f"{target_ctx_reqs=} {target_gen_reqs=}")
|
||||
eager_iters = []
|
||||
eager_iters: list[IterInfo] = []
|
||||
for start, end, text in df.itertuples(index=False):
|
||||
if m := re.match(r"^\[Executor\] _forward_step (\d+): (\d+) ctx reqs, (\d+) gen reqs", text):
|
||||
it = int(m.group(1))
|
||||
iter_id = int(m.group(1))
|
||||
ctx_reqs = int(m.group(2))
|
||||
gen_reqs = int(m.group(3))
|
||||
if ctx_reqs == target_ctx_reqs and gen_reqs == target_gen_reqs:
|
||||
eager_iters.append((start, end, it))
|
||||
eager_iters.append(IterInfo(start, end, iter_id))
|
||||
eager_iters = sorted(eager_iters)[args.warmup_times :]
|
||||
iter_list = [t[2] for t in eager_iters]
|
||||
print("Iters (eager)", *iter_list)
|
||||
per_iter_eager_layers = [[] for _ in iter_list]
|
||||
iter_id_list = [it.iter_id for it in eager_iters]
|
||||
print("Iters (eager)", *iter_id_list)
|
||||
per_iter_eager_layers: list[list[LayerInfo]] = [[] for _ in iter_id_list]
|
||||
for start, end, text in df.itertuples(index=False):
|
||||
if m := re.match(r"^layer_wise_benchmarks layer_idx (\d+)$", text):
|
||||
layer_idx = int(m.group(1))
|
||||
it_idx = bisect.bisect(eager_iters, (start,)) - 1
|
||||
if it_idx < 0 or end > eager_iters[it_idx][1]:
|
||||
iter_idx = bisect.bisect(eager_iters, (start,)) - 1
|
||||
if iter_idx < 0 or end > eager_iters[iter_idx].end:
|
||||
continue
|
||||
assert end <= eager_iters[it_idx][1], "Not belong to any iter"
|
||||
per_iter_eager_layers[it_idx].append((start, end, it_idx, layer_idx))
|
||||
layer_list = [t[3] for t in per_iter_eager_layers[0]]
|
||||
print("Layers (eager)", *layer_list)
|
||||
assert end <= eager_iters[iter_idx].end, "Not belong to any iter"
|
||||
per_iter_eager_layers[iter_idx].append(LayerInfo(start, end, layer_idx))
|
||||
layer_idx_list = [layer.layer_idx for layer in per_iter_eager_layers[0]]
|
||||
print("Layers (eager)", *layer_idx_list)
|
||||
for eager_layers in per_iter_eager_layers:
|
||||
assert [t[3] for t in eager_layers] == layer_list, "inconsistent layer idx"
|
||||
assert [layer.layer_idx for layer in eager_layers] == layer_idx_list, "inconsistent layer idx"
|
||||
df = pd.read_sql_query(query, graph_conn, params=(graph_event_id_NvtxPushPopRange,))
|
||||
graph_iters = []
|
||||
graph_iters: list[IterInfo] = []
|
||||
for start, end, text in df.itertuples(index=False):
|
||||
if m := re.match(r"^\[Executor\] _forward_step (\d+): (\d+) ctx reqs, (\d+) gen reqs", text):
|
||||
it = int(m.group(1))
|
||||
iter_id = int(m.group(1))
|
||||
ctx_reqs = int(m.group(2))
|
||||
gen_reqs = int(m.group(3))
|
||||
if ctx_reqs == target_ctx_reqs and gen_reqs == target_gen_reqs:
|
||||
graph_iters.append((start, end, it))
|
||||
graph_iters.append(IterInfo(start, end, iter_id))
|
||||
graph_iters = sorted(graph_iters)[args.warmup_times :]
|
||||
graph_iter_list = [t[2] for t in graph_iters]
|
||||
print("Iters (graph)", *graph_iter_list)
|
||||
if iter_list != graph_iter_list:
|
||||
graph_iter_id_list = [it.iter_id for it in graph_iters]
|
||||
print("Iters (graph)", *graph_iter_id_list)
|
||||
if iter_id_list != graph_iter_id_list:
|
||||
raise ValueError("The ID of iterations do not match")
|
||||
|
||||
|
||||
def query_kernels(conn, iters):
|
||||
def query_kernels(conn: sqlite3.Connection, iters: list[IterInfo]) -> list[list[KernelQueryResult]]:
|
||||
query = """SELECT name FROM sqlite_master WHERE type = ?"""
|
||||
df = pd.read_sql_query(query, conn, params=("table",))
|
||||
tables = df["name"].tolist()
|
||||
@ -148,16 +178,25 @@ def query_kernels(conn, iters):
|
||||
FROM ({unified_subquery}) AS unified
|
||||
JOIN CUPTI_ACTIVITY_KIND_RUNTIME AS R ON unified.correlationId = R.correlationId"""
|
||||
df = pd.read_sql_query(query, conn)
|
||||
per_iter_kernels = [[] for _ in iters]
|
||||
for start, end, graphNodeId, demangledName, runtime_start, runtime_end in df.itertuples(
|
||||
index=False
|
||||
):
|
||||
it_idx = bisect.bisect(iters, (runtime_start,)) - 1
|
||||
if it_idx < 0 or runtime_end > iters[it_idx][1]:
|
||||
per_iter_kernels: list[list[KernelQueryResult]] = [[] for _ in iters]
|
||||
for (
|
||||
kernel_start,
|
||||
kernel_end,
|
||||
graph_node_id,
|
||||
demangled_name,
|
||||
runtime_start,
|
||||
runtime_end,
|
||||
) in df.itertuples(index=False):
|
||||
iter_idx = bisect.bisect(iters, (runtime_start,)) - 1
|
||||
if iter_idx < 0 or runtime_end > iters[iter_idx].end:
|
||||
continue
|
||||
per_iter_kernels[it_idx].append((runtime_start, graphNodeId, start, end, demangledName))
|
||||
per_iter_kernels[iter_idx].append(
|
||||
KernelQueryResult(
|
||||
runtime_start, graph_node_id, kernel_start, kernel_end, demangled_name
|
||||
)
|
||||
)
|
||||
for kernels in per_iter_kernels:
|
||||
kernels.sort()
|
||||
kernels.sort(key=lambda k: (k.runtime_start, k.graph_node_id))
|
||||
return per_iter_kernels
|
||||
|
||||
|
||||
@ -166,12 +205,14 @@ graph_per_iter_kernels = query_kernels(graph_conn, graph_iters)
|
||||
print("#Kernels (eager)", *[len(kernels) for kernels in eager_per_iter_kernels])
|
||||
print("#Kernels (graph)", *[len(kernels) for kernels in graph_per_iter_kernels])
|
||||
for eager_kernels, graph_kernels in zip(eager_per_iter_kernels, graph_per_iter_kernels):
|
||||
assert all(a[4] == eager_per_iter_kernels[0][i][4] for i, a in enumerate(eager_kernels)), (
|
||||
"eager kernels change across iterations"
|
||||
)
|
||||
assert all(a[4] == graph_per_iter_kernels[0][i][4] for i, a in enumerate(graph_kernels)), (
|
||||
"graph kernels change across iterations"
|
||||
)
|
||||
assert all(
|
||||
kernel.demangled_name == eager_per_iter_kernels[0][i].demangled_name
|
||||
for i, kernel in enumerate(eager_kernels)
|
||||
), "eager kernels change across iterations"
|
||||
assert all(
|
||||
kernel.demangled_name == graph_per_iter_kernels[0][i].demangled_name
|
||||
for i, kernel in enumerate(graph_kernels)
|
||||
), "graph kernels change across iterations"
|
||||
|
||||
query = "SELECT * FROM StringIds"
|
||||
df = pd.read_sql_query(query, eager_conn)
|
||||
@ -184,26 +225,33 @@ graph_string_ids.update({-2: "Memcpy", -3: "Memset"})
|
||||
eager_conn.close()
|
||||
graph_conn.close()
|
||||
|
||||
eager_kernel_names = [eager_string_ids[kernel[4]] for kernel in eager_per_iter_kernels[0]]
|
||||
graph_kernel_names = [graph_string_ids[kernel[4]] for kernel in graph_per_iter_kernels[0]]
|
||||
eager_kernel_names = [
|
||||
eager_string_ids[kernel.demangled_name] for kernel in eager_per_iter_kernels[0]
|
||||
]
|
||||
graph_kernel_names = [
|
||||
graph_string_ids[kernel.demangled_name] for kernel in graph_per_iter_kernels[0]
|
||||
]
|
||||
super_kernel_names = shortest_common_supersequence(eager_kernel_names, graph_kernel_names)
|
||||
print(f"#Kernels (supersequence) {len(super_kernel_names)}")
|
||||
eager_per_layer_kernels = [[] for _ in layer_list]
|
||||
eager_per_layer_kernels: list[list[int]] = [[] for _ in layer_idx_list]
|
||||
for i, eager_kernel in enumerate(eager_per_iter_kernels[0]):
|
||||
eager_layers_idx = bisect.bisect(per_iter_eager_layers[0], (eager_kernel[0],)) - 1
|
||||
if eager_layers_idx < 0 or eager_kernel[0] > per_iter_eager_layers[0][eager_layers_idx][1]:
|
||||
eager_layer_idx = bisect.bisect(per_iter_eager_layers[0], (eager_kernel.runtime_start,)) - 1
|
||||
if (
|
||||
eager_layer_idx < 0
|
||||
or eager_kernel.runtime_start > per_iter_eager_layers[0][eager_layer_idx].end
|
||||
):
|
||||
continue
|
||||
eager_per_layer_kernels[eager_layers_idx].append(i)
|
||||
eager2super = []
|
||||
eager_per_layer_kernels[eager_layer_idx].append(i)
|
||||
eager2super: list[int] = []
|
||||
j = 0
|
||||
for i, eager_kernel_name in enumerate(eager_kernel_names):
|
||||
for eager_kernel_name in eager_kernel_names:
|
||||
while eager_kernel_name != super_kernel_names[j]:
|
||||
j += 1
|
||||
eager2super.append(j)
|
||||
j += 1
|
||||
super_per_layer_starts = [eager2super[a[0]] for a in eager_per_layer_kernels]
|
||||
super_per_layer_ends = [eager2super[a[-1]] for a in eager_per_layer_kernels]
|
||||
graph_per_layer_kernels = [[] for _ in layer_list]
|
||||
super_per_layer_starts = [eager2super[indices[0]] for indices in eager_per_layer_kernels]
|
||||
super_per_layer_ends = [eager2super[indices[-1]] for indices in eager_per_layer_kernels]
|
||||
graph_per_layer_kernels: list[list[int]] = [[] for _ in layer_idx_list]
|
||||
j = 0
|
||||
for i, graph_kernel_name in enumerate(graph_kernel_names):
|
||||
while graph_kernel_name != super_kernel_names[j]:
|
||||
@ -212,16 +260,16 @@ for i, graph_kernel_name in enumerate(graph_kernel_names):
|
||||
if layer_idx >= 0 and j <= super_per_layer_ends[layer_idx]:
|
||||
graph_per_layer_kernels[layer_idx].append(i)
|
||||
j += 1
|
||||
timeline = []
|
||||
timeline: list[dict] = []
|
||||
first_kernel_idx = min(graph_per_layer_kernels[layer_idx][0] for layer_idx in args.layer_indices)
|
||||
for layer_idx in args.layer_indices:
|
||||
for kernel_idx in graph_per_layer_kernels[layer_idx]:
|
||||
duration_list = []
|
||||
end_list = []
|
||||
for it_idx in range(len(graph_per_iter_kernels)):
|
||||
layer_start_time = graph_per_iter_kernels[it_idx][first_kernel_idx][2]
|
||||
kernel_start_time = graph_per_iter_kernels[it_idx][kernel_idx][2]
|
||||
kernel_end_time = graph_per_iter_kernels[it_idx][kernel_idx][3]
|
||||
duration_list: list[int] = []
|
||||
end_list: list[int] = []
|
||||
for iter_idx in range(len(graph_per_iter_kernels)):
|
||||
layer_start_time = graph_per_iter_kernels[iter_idx][first_kernel_idx].kernel_start
|
||||
kernel_start_time = graph_per_iter_kernels[iter_idx][kernel_idx].kernel_start
|
||||
kernel_end_time = graph_per_iter_kernels[iter_idx][kernel_idx].kernel_end
|
||||
duration_list.append(kernel_end_time - kernel_start_time)
|
||||
end_list.append(kernel_end_time - layer_start_time)
|
||||
timeline.append(
|
||||
@ -233,9 +281,11 @@ for layer_idx in args.layer_indices:
|
||||
)
|
||||
print(f"{'Kernel':40s} {'Duration':>8s} {'End':>8s}")
|
||||
print("-" * (40 + 1 + 8 + 1 + 8))
|
||||
for o in timeline:
|
||||
for entry in timeline:
|
||||
print(
|
||||
f"{kernel_short_name(o['name'])[:40]:40s} {o['duration'] / 1000.0:-8.1f} {o['end'] / 1000.0:-8.1f}"
|
||||
f"{kernel_short_name(entry['name'])[:40]:40s} "
|
||||
f"{entry['duration'] / 1000.0:-8.1f} "
|
||||
f"{entry['end'] / 1000.0:-8.1f}"
|
||||
)
|
||||
if args.error_on_unknown_kernel and warned_names:
|
||||
raise ValueError("Unknown kernel names encountered")
|
||||
|
||||
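parse_e2e.py aligns the eager and CUDA-graph kernel-name sequences with a `shortest_common_supersequence` helper imported from `parser_utils` (not shown in this diff). For reference, here is a minimal sketch of such a function built on the standard LCS dynamic program; the real helper may differ in details.

```python
# Minimal sketch of a shortest-common-supersequence helper; the actual
# implementation lives in parser_utils.py and may differ.
def shortest_common_supersequence(a: list, b: list) -> list:
    n, m = len(a), len(b)
    # dp[i][j] = length of the longest common subsequence of a[:i] and b[:j]
    dp = [[0] * (m + 1) for _ in range(n + 1)]
    for i in range(n):
        for j in range(m):
            dp[i + 1][j + 1] = dp[i][j] + 1 if a[i] == b[j] else max(dp[i][j + 1], dp[i + 1][j])
    # Backtrack, emitting shared elements once and unshared elements from both.
    out, i, j = [], n, m
    while i > 0 and j > 0:
        if a[i - 1] == b[j - 1]:
            out.append(a[i - 1])
            i -= 1
            j -= 1
        elif dp[i - 1][j] >= dp[i][j - 1]:
            out.append(a[i - 1])
            i -= 1
        else:
            out.append(b[j - 1])
            j -= 1
    out.extend(reversed(a[:i]))
    out.extend(reversed(b[:j]))
    return out[::-1]


print(shortest_common_supersequence(["gemm", "moe", "allreduce"], ["gemm", "copy", "allreduce"]))
# -> ['gemm', 'copy', 'moe', 'allreduce']: both inputs appear in order in the result.
```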
@ -14,35 +14,39 @@ if [ "$RANK" -eq 0 ]; then
|
||||
fi
|
||||
|
||||
PROFILE_DIR=${PROFILE_DIR:-profiles}
|
||||
mkdir -p ${PROFILE_DIR}
|
||||
mkdir -p -- "$PROFILE_DIR"
|
||||
|
||||
PROFILE=${PROFILE:-1}
|
||||
BACKTRACE=${BACKTRACE:-0}
|
||||
GPU_METRICS=${GPU_METRICS:-0}
|
||||
if [ "$PROFILE" -eq 1 ]; then
|
||||
PROFILE_CMD="nsys profile
|
||||
PROFILE_CMD=(
|
||||
nsys profile
|
||||
-t cuda,nvtx
|
||||
--cpuctxsw none --cuda-event-trace false
|
||||
--cuda-graph-trace node
|
||||
-c cudaProfilerApi --capture-range-end stop
|
||||
-o ${PROFILE_DIR}/report_np${WORLD_SIZE}_rank${RANK}.nsys-rep
|
||||
--force-overwrite true"
|
||||
-o "${PROFILE_DIR}/report_np${WORLD_SIZE}_rank${RANK}.nsys-rep"
|
||||
--force-overwrite true
|
||||
)
|
||||
if [ "$BACKTRACE" -eq 1 ]; then
|
||||
PROFILE_CMD+=" --python-backtrace=cuda --cudabacktrace all"
|
||||
PROFILE_CMD+=(--python-backtrace=cuda --cudabacktrace all)
|
||||
else
|
||||
PROFILE_CMD+=" -s none"
|
||||
PROFILE_CMD+=(-s none)
|
||||
fi
|
||||
if [ "$GPU_METRICS" -eq 1 ]; then
|
||||
PROFILE_CMD+=" --gpu-metrics-devices $LOCAL_RANK
|
||||
--gpu-metrics-frequency 10000"
|
||||
PROFILE_CMD+=(
|
||||
--gpu-metrics-devices $LOCAL_RANK
|
||||
--gpu-metrics-frequency 10000
|
||||
)
|
||||
fi
|
||||
else
|
||||
PROFILE_CMD=
|
||||
PROFILE_CMD=()
|
||||
fi
|
||||
|
||||
SCRIPT_PATH=$(realpath --relative-to="$(pwd)" "$(dirname -- "$0")"/run.py)
|
||||
SCRIPT_PATH=$(realpath --relative-to="$(pwd)" -- "$(dirname -- "$0")"/run.py)
|
||||
|
||||
set -x
|
||||
$PROFILE_CMD bash -o pipefail -c \
|
||||
"python3 -u \"\$1\" \"\${@:3}\" 2>&1 | tee \"\$2/report_np${WORLD_SIZE}_rank${RANK}.log\"" \
|
||||
${PROFILE_CMD[@]+"${PROFILE_CMD[@]}"} bash -o pipefail -c \
|
||||
'python3 -u "$1" "${@:3}" 2>&1 | tee "$2/report_np'"${WORLD_SIZE}"'_rank'"${RANK}"'.log"' \
|
||||
bash "$SCRIPT_PATH" "$PROFILE_DIR" "$@"
|
||||
|
||||
@ -12,14 +12,14 @@ export PROFILE_DIR="${PROFILE_DIR:-profiles}"
|
||||
export TLLM_AUTOTUNER_CACHE_PATH="$PROFILE_DIR/sample_performance_alignment_cache.json"
|
||||
|
||||
mkdir -p -- "$PROFILE_DIR"
|
||||
mkdir -p -- "$(dirname "$TLLM_AUTOTUNER_CACHE_PATH")"
|
||||
mkdir -p -- "$(dirname -- "$TLLM_AUTOTUNER_CACHE_PATH")"
|
||||
|
||||
python3 ../../benchmarks/cpp/prepare_dataset.py \
|
||||
--tokenizer "$MODEL" \
|
||||
--stdout \
|
||||
--random-seed 42 \
|
||||
token-norm-dist \
|
||||
--num-requests $((BATCH_SIZE*NP)) \
|
||||
--num-requests $((BATCH_SIZE * NP)) \
|
||||
--input-mean 2048 \
|
||||
--input-stdev 0 \
|
||||
--output-mean 256 \
|
||||
@ -61,8 +61,8 @@ trtllm-bench \
|
||||
--max_batch_size $BATCH_SIZE \
|
||||
--max_num_tokens 3072 \
|
||||
--disable_chunked_context \
|
||||
--num_requests $((BATCH_SIZE*NP)) \
|
||||
--concurrency $((BATCH_SIZE*NP)) \
|
||||
--num_requests $((BATCH_SIZE * NP)) \
|
||||
--concurrency $((BATCH_SIZE * NP)) \
|
||||
--config /tmp/config_collect.yaml
|
||||
|
||||
# Step 2
|
||||
@ -98,8 +98,8 @@ trtllm-bench \
|
||||
--max_batch_size $BATCH_SIZE \
|
||||
--max_num_tokens 3072 \
|
||||
--disable_chunked_context \
|
||||
--num_requests $((BATCH_SIZE*NP)) \
|
||||
--concurrency $((BATCH_SIZE*NP)) \
|
||||
--num_requests $((BATCH_SIZE * NP)) \
|
||||
--concurrency $((BATCH_SIZE * NP)) \
|
||||
--config /tmp/config_mark.yaml
|
||||
|
||||
# Step 3
|
||||
|
||||
@ -4,8 +4,8 @@ set -euo pipefail
|
||||
|
||||
# CONTAINER_IMAGE=
|
||||
CONTAINER_NAME=${CONTAINER_NAME:-layer_wise_benchmarks}
|
||||
TRTLLM_ROOT=$(realpath "$(dirname -- "$0")"/../..)
|
||||
CONTAINER_MOUNTS=$TRTLLM_ROOT:$TRTLLM_ROOT
|
||||
TRTLLM_ROOT=$(realpath -- "$(dirname -- "$0")"/../..)
|
||||
CONTAINER_MOUNTS="$TRTLLM_ROOT:$TRTLLM_ROOT"
|
||||
|
||||
if [ -z "${SLURM_JOB_ID:-}" ]; then
|
||||
echo "Please set SLURM_JOB_ID"
|
||||
@ -18,9 +18,9 @@ if [ -z "${CONTAINER_IMAGE:-}" ]; then
|
||||
# Read Docker image from current_image_tags.properties
|
||||
MACHINE="$(srun -N 1 uname -m)"
|
||||
if [ "$MACHINE" == "x86_64" ]; then
|
||||
DOCKER_IMAGE=$(source "$TRTLLM_ROOT/jenkins/current_image_tags.properties" && echo $LLM_DOCKER_IMAGE)
|
||||
DOCKER_IMAGE=$(source "$TRTLLM_ROOT/jenkins/current_image_tags.properties" && echo "$LLM_DOCKER_IMAGE")
|
||||
elif [ "$MACHINE" == "aarch64" ]; then
|
||||
DOCKER_IMAGE=$(source "$TRTLLM_ROOT/jenkins/current_image_tags.properties" && echo $LLM_SBSA_DOCKER_IMAGE)
|
||||
DOCKER_IMAGE=$(source "$TRTLLM_ROOT/jenkins/current_image_tags.properties" && echo "$LLM_SBSA_DOCKER_IMAGE")
|
||||
else
|
||||
echo "Unsupported machine hardware name \"$MACHINE\""
|
||||
exit 1
|
||||
@ -31,7 +31,7 @@ if [ -z "${CONTAINER_IMAGE:-}" ]; then
|
||||
echo "CONTAINER_IMAGE was not set, using Docker image $DOCKER_IMAGE"
|
||||
|
||||
# Import to .sqsh file
|
||||
SQSH_FILE_NAME=$(echo "$DOCKER_IMAGE" |
|
||||
SQSH_FILE_NAME=$(printf '%s\n' "$DOCKER_IMAGE" |
|
||||
awk -F'#' '{print $2}' |
|
||||
awk -F':' '{gsub(/\//,"+",$1); print $1"+"$2".sqsh"}')
|
||||
CONTAINER_IMAGE="$TRTLLM_ROOT/enroot/$SQSH_FILE_NAME"
|
||||
@ -41,7 +41,7 @@ if [ -z "${CONTAINER_IMAGE:-}" ]; then
|
||||
fi
|
||||
fi
|
||||
|
||||
WORKDIR=$(realpath "$(pwd)")
|
||||
WORKDIR=$(realpath -- "$(pwd)")
|
||||
|
||||
set -x
|
||||
srun -N "$NODES" \
|
||||
@ -50,7 +50,7 @@ srun -N "$NODES" \
|
||||
--container-name "$CONTAINER_NAME" \
|
||||
--container-mounts "$CONTAINER_MOUNTS" \
|
||||
--container-workdir "$WORKDIR" \
|
||||
bash -c "cd \"\$1\" &&
|
||||
bash -c 'cd "$1" &&
|
||||
pip install -U packaging &&
|
||||
pip install -r requirements.txt --no-build-isolation &&
|
||||
pip install -e ." bash "$TRTLLM_ROOT"
|
||||
pip install -e .' bash "$TRTLLM_ROOT"
|
||||
|
||||
@ -3,20 +3,20 @@
|
||||
set -euo pipefail
|
||||
|
||||
CONTAINER_NAME=${CONTAINER_NAME:-layer_wise_benchmarks}
|
||||
TRTLLM_ROOT=$(realpath "$(dirname -- "$0")"/../..)
|
||||
CONTAINER_MOUNTS=$TRTLLM_ROOT:$TRTLLM_ROOT
|
||||
TRTLLM_ROOT=$(realpath -- "$(dirname -- "$0")"/../..)
|
||||
CONTAINER_MOUNTS="$TRTLLM_ROOT:$TRTLLM_ROOT"
|
||||
|
||||
if [ -z "${SLURM_JOB_ID:-}" ]; then
|
||||
echo "Please set SLURM_JOB_ID"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
WORKDIR=$(realpath "$(pwd)")
|
||||
WORKDIR=$(realpath -- "$(pwd)")
|
||||
|
||||
set -x
|
||||
srun --mpi=pmix \
|
||||
-N "$NODES" \
|
||||
--ntasks-per-node $(($NP / $NODES)) \
|
||||
--ntasks-per-node $((NP / NODES)) \
|
||||
--container-name "$CONTAINER_NAME" \
|
||||
--container-mounts "$CONTAINER_MOUNTS" \
|
||||
--container-workdir "$WORKDIR" \
|
||||
|
||||
@ -8,22 +8,23 @@ if [ -z "${SLURM_JOB_ID:-}" ]; then
|
||||
fi
|
||||
|
||||
prefix="pyxis_${SLURM_JOB_ID}_"
|
||||
matches=$(printf "%s\n" "$(srun -N 1 enroot list)" | grep "^${prefix}" || true)
|
||||
count=$(printf "%s\n" "$matches" | wc -l)
|
||||
matches=$(printf '%s\n' "$(srun -N 1 enroot list)" | grep "^${prefix}" || true)
|
||||
|
||||
if [ "$count" -eq 0 ]; then
|
||||
if [ -z "$matches" ]; then
|
||||
echo "Error: No container found" >&2
|
||||
exit 1
|
||||
else
|
||||
count=$(printf '%s\n' "$matches" | wc -l)
|
||||
fi
|
||||
|
||||
if [ "$count" -gt 1 ]; then
|
||||
echo "Error: Multiple containers found" >&2
|
||||
while IFS= read -r match; do
|
||||
echo "- ${match#$prefix}" >&2
|
||||
echo "- ${match#"$prefix"}" >&2
|
||||
done <<< "$matches"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
suffix=${matches#$prefix}
|
||||
suffix=${matches#"$prefix"}
|
||||
echo "Container name: $suffix" >&2
|
||||
echo "$suffix"
|
||||
printf '%s\n' "$suffix"
|
||||
|
||||