Mirror of https://github.com/NVIDIA/TensorRT-LLM.git

[None][chore] Clean up layer-wise benchmarks code (#11092)
Signed-off-by: Tailing Yuan <yuantailing@gmail.com>

Parent: ab7dd34bbe
Commit: 4345636b04
@ -1,10 +1,12 @@
|
||||
# Layer-wise Benchmarks
|
||||
|
||||
This tool profiles individual layers of LLM models to help understand the performance characteristics of each layer and compare layer-wise benchmarks with end-to-end profiling results.
|
||||
|
||||
## Generate profiles
|
||||
|
||||
### Run with OpenMPI
|
||||
|
||||
**Step 1:** Start a container using Docker, Enroot or others. Please refer to `../../jenkins/current_image_tags.properties` for the Docker image URI.
|
||||
**Step 1:** Start a container using Docker, Enroot, or other container runtimes. Please refer to `../../jenkins/current_image_tags.properties` for the Docker image URI.
|
||||
|
||||
**Step 2:** In the container, install `tensorrt_llm`:
|
||||
|
||||
@ -19,7 +21,7 @@ pip install -e ../..
|
||||
NP=4 ./mpi_launch.sh ./run.sh config_ctx.yaml
|
||||
NP=4 ./mpi_launch.sh ./run.sh config_gen.yaml
|
||||
|
||||
# Run with weights loaded. Requires local model directory
|
||||
# Run with weights loaded (requires a local model directory)
|
||||
NP=4 ./mpi_launch.sh ./run.sh config_ctx.yaml --model "$LLM_MODELS_ROOT/DeepSeek-R1/DeepSeek-R1-0528-FP4-v2" --load-format AUTO
|
||||
NP=4 ./mpi_launch.sh ./run.sh config_gen.yaml --model "$LLM_MODELS_ROOT/DeepSeek-R1/DeepSeek-R1-0528-FP4-v2" --load-format AUTO
|
||||
|
||||
@ -46,10 +48,10 @@ NP=4 ./mpi_launch.sh ./run.sh config_gen.yaml --batch-size 32 --seq-len-q 4
|
||||
NP=4 ./mpi_launch.sh ./run.sh config_ctx.yaml --layer-indices 5,6,7,8
|
||||
NP=4 ./mpi_launch.sh ./run.sh config_gen.yaml --layer-indices 5,6,7,8
|
||||
|
||||
# Scale DEP=16 to 4 GPUs: reduce the number of experts, uses MNNVL A2A if applicable
|
||||
# Scale DEP=16 to 4 GPUs: reduces the number of experts; uses MNNVL A2A if applicable
|
||||
NP=4 ./mpi_launch.sh ./run.sh config_gen.yaml --scaled-from 16 --moe-backend WIDEEP
|
||||
|
||||
# Scale TEP=16 to 4 GPUs: reduce the number of attention heads and experts
|
||||
# Scale TEP=16 to 4 GPUs: reduces the number of attention heads and experts
|
||||
NP=4 ./mpi_launch.sh ./run.sh config_gen.yaml --scaled-from 16 --no-enable-attention-dp
|
||||
|
||||
# Run Nemotron-3-Nano
|
||||
@ -64,12 +66,12 @@ NP=2 ./mpi_launch.sh ./run.sh config_gen.yaml --model Qwen/Qwen3-Next-80B-A3B-In
|
||||
NP=4 ./mpi_launch.sh -x TRTLLM_FORCE_ALLTOALL_METHOD=DeepEP ./run.sh config_ctx.yaml --moe-backend WIDEEP
|
||||
NP=4 ./mpi_launch.sh -x TRTLLM_FORCE_ALLTOALL_METHOD=DeepEP ./run.sh config_gen.yaml --moe-backend WIDEEP
|
||||
|
||||
# Run with imbalanced ranks: except for activating all experts, a% of the tokens are sent to the 1st rank
|
||||
# Note: if balance ratio is 0, ignore activating all experts
|
||||
# Run with imbalanced ranks: in addition to activating all experts, the specified ratio of tokens is sent to rank 0
|
||||
# Note: if balance ratio is 0, the "activate all experts" behavior is not applied
|
||||
NP=4 ./mpi_launch.sh ./run.sh config_ctx.yaml --balance-method ImbalancedRanks --balance-ratio 0.5
|
||||
NP=4 ./mpi_launch.sh ./run.sh config_gen.yaml --balance-method ImbalancedRanks --balance-ratio 0.5
|
||||
|
||||
# Run with imbalanced experts and balanced ranks: except for activating all experts, a% of the tokens are sent to the front experts on each rank
|
||||
# Run with imbalanced experts and balanced ranks: in addition to activating all experts, the specified ratio of tokens is sent to the front experts on each rank
|
||||
NP=4 ./mpi_launch.sh ./run.sh config_ctx.yaml --balance-method ImbalancedExperts --balance-ratio 0.5
|
||||
NP=4 ./mpi_launch.sh ./run.sh config_gen.yaml --balance-method ImbalancedExperts --balance-ratio 0.5
|
||||
```
|
||||
@ -77,8 +79,8 @@ NP=4 ./mpi_launch.sh ./run.sh config_gen.yaml --balance-method ImbalancedExperts
|
||||
### Run with Slurm
|
||||
|
||||
> Tips:
|
||||
> 1. If you have a running Slurm job, you can set environment variable `export SLURM_JOB_ID=aaa` and skip step 1.
|
||||
> 2. Further, if you have installed `tensorrt_llm` in the Slurm job, you can also skip step 2. Just run step 3 with `export CONTAINER_NAME=aaa` specified. If you don't know the container name, run `export CONTAINER_NAME=$(./slurm_query_container_name.sh)` to get it.
|
||||
> 1. If you have a running Slurm job, you can set the environment variable by running `export SLURM_JOB_ID=<job_id>` and skip Step 1.
|
||||
> 2. Further, if you have already installed `tensorrt_llm` in the Slurm job, you can also skip Step 2. Just run Step 3 with `export CONTAINER_NAME=<name>` specified. If you don't know the container name, run `export CONTAINER_NAME=$(./slurm_query_container_name.sh)` to get it.
|
||||
|
||||
**Step 1:** On the controller node, allocate one or more nodes, and export the `SLURM_JOB_ID`:
|
||||
|
||||
@ -86,7 +88,7 @@ NP=4 ./mpi_launch.sh ./run.sh config_gen.yaml --balance-method ImbalancedExperts
|
||||
export SLURM_JOB_ID=$(NODES=4 TIME=02:00:00 ./slurm_alloc.sh)
|
||||
```
|
||||
|
||||
Please fill the variables in `./slurm_alloc.sh`.
|
||||
Please set the variables in `./slurm_alloc.sh` before running.
|
||||
|
||||
**Step 2:** Start a container and install `tensorrt_llm`. Run the following command on the controller node:
|
||||
|
||||
@ -94,9 +96,9 @@ Please fill the variables in `./slurm_alloc.sh`.
|
||||
./slurm_init_containers.sh
|
||||
```
|
||||
|
||||
It uses the image recorded in `../../jenkins/current_image_tags.properties`. The image will be downloaded to `../../enroot/` for once.
|
||||
This script uses the image recorded in `../../jenkins/current_image_tags.properties`. The image will be downloaded to `../../enroot/` once.
|
||||
|
||||
> Tips: If you want to change the image, no need to reallocate Slurm jobs. Just start another container by running step 2 with `export CONTAINER_NAME=aaa`, and step 3 will run in the container specified by the `CONTAINER_NAME` env.
|
||||
> Tip: If you want to change the image, there is no need to reallocate Slurm jobs. Just start another container by running Step 2 with `export CONTAINER_NAME=<new_name>`, and Step 3 will run in the container specified by the `CONTAINER_NAME` environment variable.
|
||||
|
||||
**(Optional) Get an interactive shell**
|
||||
|
||||
@ -117,7 +119,7 @@ python3 scripts/build_wheel.py --cuda_architectures native --no-venv --skip_buil
|
||||
**Step 3:** Run benchmarks to generate profiles. Run the following command on the controller node, where `NODES` ≤ the number of allocated nodes:
|
||||
|
||||
```bash
|
||||
# Run DeepSeek-R1 NVFP4 with wide ep: uses MNNVL A2A if applicable
|
||||
# Run DeepSeek-R1 NVFP4 with wide EP; uses MNNVL A2A if applicable
|
||||
NODES=4 NP=16 ./slurm_launch.sh ./run.sh config_gen.yaml --moe-backend WIDEEP
|
||||
|
||||
# Run with TRTLLMGen
|
||||
@ -126,7 +128,7 @@ NODES=4 NP=16 ./slurm_launch.sh ./run.sh config_gen.yaml --moe-backend TRTLLM
|
||||
# Run with DeepEPLowLatency
|
||||
NODES=4 NP=16 TRTLLM_FORCE_ALLTOALL_METHOD=DeepEPLowLatency ./slurm_launch.sh ./run.sh config_gen.yaml --moe-backend WIDEEP
|
||||
|
||||
# You can run 4-GPU and 8-GPU tasks without reallocating the slurm job
|
||||
# You can run 4-GPU and 8-GPU tasks without reallocating the Slurm job
|
||||
NODES=1 NP=4 ./slurm_launch.sh ./run.sh config_ctx.yaml
|
||||
NODES=2 NP=8 ./slurm_launch.sh ./run.sh config_gen.yaml
|
||||
```
|
||||
@ -141,7 +143,7 @@ Supported list arguments:
|
||||
- `--seq-len-kv-cache` (or `seq_len_kv_cache` in YAML)
|
||||
- `--balance-ratio` (or `balance_ratio` in YAML)
|
||||
|
||||
Command line arguments are comma separated, for example, `--batch-size 1,2,4`. Configs in the YAML file are lists, for example, `batch_size: [1, 2, 4]`.
|
||||
Command-line arguments are comma-separated, for example, `--batch-size 1,2,4`. Values in the YAML file are lists, for example, `batch_size: [1, 2, 4]`.
|
||||
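As an illustration of the two notations, the sketch below parses a comma-separated command-line value and a YAML list into the same Python list. It borrows the `comma_separated_ints` idea from `parse_e2e.py` later in this commit; `run.py`'s actual argument handling may differ.

```python
# Illustrative only: a comma-separated CLI value and a YAML list express
# the same sweep. Requires PyYAML.
import argparse

import yaml


def comma_separated_ints(s: str) -> list[int]:
    return [int(x) for x in s.split(",")]


parser = argparse.ArgumentParser()
parser.add_argument("--batch-size", type=comma_separated_ints)
cli_args = parser.parse_args(["--batch-size", "1,2,4"])

yaml_cfg = yaml.safe_load("batch_size: [1, 2, 4]")
assert cli_args.batch_size == yaml_cfg["batch_size"] == [1, 2, 4]
```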
|
||||
Run with OpenMPI:
|
||||
|
||||
@ -166,18 +168,19 @@ python3 parse.py --profile-dir ./profiles --world-size 4 --rank 0
|
||||
python3 parse.py --world-size 4 --module MoE
|
||||
```
|
||||
|
||||
You will receive three reports, each containing kernel timing statistics grouped by module:
|
||||
You will receive four reports, each containing kernel timing statistics grouped by module:
|
||||
1. A printed report on stdout
|
||||
2. A CSV report at `profiles/report_np4_rank0.csv`
|
||||
3. An HTML report at `profiles/report_np4_rank0.html`
|
||||
4. A JSON report at `profiles/report_np4_rank0.json` (for correlation analysis)
|
||||
|
||||
## Performance alignment between end-to-end performance and layer-wise benchmarks
|
||||
|
||||
An overall example can be found in `sample_performance_alignment.sh`. Here is an abstract of the main steps.
|
||||
A complete example can be found in `sample_performance_alignment.sh`. Below is an overview of the main steps.
|
||||
|
||||
1. Run end-to-end serving in **COLLECT** mode, and capture nsys profiles. This step generates a calibration file.
|
||||
1. Run end-to-end serving in **COLLECT** mode and capture nsys profiles. This step generates a calibration file.
|
||||
|
||||
Please meet the following requirements.
|
||||
Requirements:
|
||||
|
||||
1. Add the following fields to `config.yaml`.
|
||||
|
||||
@ -187,13 +190,13 @@ An overall example can be found in `sample_performance_alignment.sh`. Here is an
|
||||
calibration_file_path: profiles/calibration_data.json
|
||||
```
|
||||
|
||||
2. Set `TLLM_PROFILE_START_STOP` to a range that can capture some iterations (typically tens of iterations) of GEN phase. Ensure every iteration has the same batch size. Please capture 5 more iterations at beginning, because the first 5 iterations are regarded as warm-ups and will be dropped by the parser by default.
|
||||
2. Set `TLLM_PROFILE_START_STOP` to a range that captures some iterations (typically tens of iterations) of the GEN phase. Ensure that every iteration has the same batch size. Capture 5 extra iterations at the beginning, because the first 5 iterations are treated as warm-ups and will be dropped by the parser by default.
|
||||
|
||||
3. Capture per-rank nsys profiles, and every rank should produce a separate file.
|
||||
3. Capture per-rank nsys profiles; each rank should produce a separate file.
|
||||
|
||||
You need to put `nsys profile` behind `mpirun` or `srun`. To minimize profile overhead and file size, there is no need to capture samples and GPU metrics.
|
||||
Place `nsys profile` after `mpirun` or `srun`. To minimize profiling overhead and file size, there is no need to capture samples or GPU metrics.
|
||||
|
||||
If you use `trtllm-serve` or `trtllm-bench`, please follow the following command order. If you use `examples/disaggregated/slurm/benchmark/submit.py`, setting `gen_profile_range` is enough.
|
||||
If you use `trtllm-serve` or `trtllm-bench`, use the following command order. If you use `examples/disaggregated/slurm/benchmark/submit.py`, setting `gen_profile_range` is sufficient.
|
||||
|
||||
```bash
|
||||
NP=$NP ./mpi_launch.sh middleware/mpi_env_from_ompi \
|
||||
@ -209,11 +212,11 @@ An overall example can be found in `sample_performance_alignment.sh`. Here is an
|
||||
--model ...
|
||||
```
|
||||
|
||||
4. To be more precise, set the same `TLLM_AUTOTUNER_CACHE_PATH` for all the steps. The autotuner cache file should be generated by Step 1, and be reused by Step 2 and Step 3.
|
||||
4. For more accurate results, set the same `TLLM_AUTOTUNER_CACHE_PATH` for all steps. The autotuner cache file should be generated in Step 1 and reused in Steps 2 and 3.
|
||||
|
||||
2. If the end-to-end serving uses CUDA Graphs, run Step 1 again in **MARK** mode without CUDA Graphs, and also capture nsys profiles.
|
||||
2. If the end-to-end serving uses CUDA Graphs, run Step 1 again in **MARK** mode without CUDA Graphs and capture nsys profiles.
|
||||
|
||||
The differences are as follows.
|
||||
The differences from Step 1 are as follows:
|
||||
|
||||
1. Add the following fields to `config.yaml`.
|
||||
|
||||
@ -241,18 +244,18 @@ An overall example can be found in `sample_performance_alignment.sh`. Here is an
|
||||
--replay-stop 67
|
||||
```
|
||||
|
||||
Here are explanations of every argument:
|
||||
Argument explanations:
|
||||
|
||||
| Argument/Parameter | Explanation |
|
||||
|-------------------|-------------|
|
||||
| ------------------ | ----------- |
|
||||
| `NP=4` | Should match the end-to-end run. |
|
||||
| `--load-format AUTO` | Instruct the benchmark to load model weights instead of initializing random weights. |
|
||||
| `--layer-indices 5,6,7` | A list of contiguous layers you want to calibrate. |
|
||||
| `--load-format AUTO` | Instructs the benchmark to load model weights instead of using random weights. |
|
||||
| `--layer-indices 5,6,7` | A list of contiguous layers to calibrate. |
|
||||
| `--batch-size 32` | Should match the end-to-end run. |
|
||||
| `--seq-len-q 1` | Should match (1+MTP) of the end-to-end run. |
|
||||
| `--seq-len-kv-cache 2090` | Estimation of the average context length for iterations you captured. The first 5 iterations should be excluded from the estimation, because they will be dropped by parser. |
|
||||
| `--replay-file-path` | The calibration file obtained by Step 1. |
|
||||
| `--replay-start` and `--replay-stop` | Should match the end-to-end `TLLM_PROFILE_START_STOP`. Do not replay the first 5 iterations, because they will be dropped by parser. |
|
||||
| `--seq-len-q 1` | Should match (1 + MTP) of the end-to-end run. |
|
||||
| `--seq-len-kv-cache 2090` | An estimate of the average context length for the captured iterations (a sketch of this estimate follows the table). The first 5 iterations should be excluded from this estimate because they will be dropped by the parser. |
|
||||
| `--replay-file-path` | The calibration file obtained from Step 1. |
|
||||
| `--replay-start` and `--replay-stop` | Should match the end-to-end `TLLM_PROFILE_START_STOP`. Do not replay the first 5 iterations because they will be dropped by the parser. |
|
||||
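As a rough way to derive the `--seq-len-kv-cache` value above: with a fixed input length, the KV length at iteration *i* is roughly `input_len + i * (1 + MTP)`, so the average over the captured non-warm-up iterations can be approximated as in this sketch. Treat it as an approximation under those assumptions (fixed input length, iteration counter roughly tracking decode steps), not an exact rule or a project utility.

```python
# Hypothetical estimate of --seq-len-kv-cache (not part of this repository).
# Assumes a fixed input length and that every request appends (1 + MTP)
# tokens per iteration, so the KV length grows linearly with the iteration.
def estimate_seq_len_kv_cache(input_len: int, profile_start: int, profile_stop: int,
                              mtp: int = 0, warmup_iters: int = 5) -> int:
    # The parser drops the first `warmup_iters` captured iterations,
    # so the average should start after them.
    first = profile_start + warmup_iters
    iters = range(first, profile_stop)
    avg_decoded = sum(iters) / len(iters) * (1 + mtp)
    return round(input_len + avg_decoded)


# Example: TLLM_PROFILE_START_STOP=30-70, input length 2048, MTP disabled.
print(estimate_seq_len_kv_cache(2048, 30, 70))
```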
|
||||
4. Parse end-to-end profiles with `parse_e2e.py`, and parse layer-wise benchmark profiles with `parse.py`.
|
||||
|
||||
@ -278,30 +281,30 @@ An overall example can be found in `sample_performance_alignment.sh`. Here is an
|
||||
-o profiles/correlation.html
|
||||
```
|
||||
|
||||
Please find `profiles/correlation.html` for the report.
|
||||
The report can be found at `profiles/correlation.html`.
|
||||
|
||||
Limitations:
|
||||
|
||||
1. Pipeline parallelism is not supported.
|
||||
2. MoE backends CUTLASS and WIDEEP are supported.
|
||||
3. Only tested with GEN phase and attention DP.
|
||||
2. Only the CUTLASS and WIDEEP MoE backends are supported.
|
||||
3. Only tested with the GEN phase and attention DP.
|
||||
|
||||
## Developer utilities
|
||||
|
||||
1. Less startup time when debug a model
|
||||
1. Reduce startup time when debugging a model
|
||||
1. Set autotuner cache or disable autotuner
|
||||
1. Set autotuner cache: add `TLLM_AUTOTUNER_CACHE_PATH=autotuner_cache/cache` environment variable. This is enabled at your own risk, and you may need to delete the cache if `NP` changes or the code changes
|
||||
2. Disable autotuner: add `--no-enable-autotuner` option
|
||||
2. Disable nsys profile: set `PROFILE=0` environment variable
|
||||
1. Set autotuner cache: set the `TLLM_AUTOTUNER_CACHE_PATH=autotuner_cache/cache` environment variable. Use this at your own risk; you may need to delete the cache if `NP` changes or the code changes
|
||||
2. Disable autotuner: add the `--no-enable-autotuner` option
|
||||
2. Disable nsys profiling: set the `PROFILE=0` environment variable
|
||||
2. Capture more information
|
||||
1. Enable GPU metrics: set `GPU_METRICS=1` environment variable
|
||||
2. Enable backtrace: set `BACKTRACE=1` environment variable
|
||||
1. Enable GPU metrics: set the `GPU_METRICS=1` environment variable
|
||||
2. Enable backtrace: set the `BACKTRACE=1` environment variable
|
||||
|
||||
## Trouble shooting
|
||||
## Troubleshooting
|
||||
|
||||
1. Error `fp8 blockscale gemm only support Hopper` on Blackwell.
|
||||
|
||||
The default MoE backend "CUTLASS" does not support FP8 weights. Please choose the same MoE backend as your end-to-end config. A typical choice is adding `--moe-backend DEEPGEMM` (or `TRTLLM`, `WIDEEP`) and `--moe-backend-for-prefill DEEPGEMM` (or `WIDEEP`) option.
|
||||
The default MoE backend "CUTLASS" does not support FP8 weights. Please choose the same MoE backend as your end-to-end config. A typical solution is to add the `--moe-backend DEEPGEMM` (or `TRTLLM`, `WIDEEP`) and `--moe-backend-for-prefill DEEPGEMM` (or `WIDEEP`) options.
|
||||
|
||||
2. Error `huggingface_hub.errors.HfHubHTTPError: 429 Client Error: Too Many Requests for url: https://huggingface.co/nvidia/DeepSeek-R1-0528-FP4-v2/resolve/main/config.json`.
|
||||
|
||||
|
||||
@ -5,10 +5,10 @@ set -euo pipefail
|
||||
# Clear slurm envs
|
||||
unset $(env | awk -F'=' '{print $1}' | (grep -E "SLURM_|SLURMD_|slurm_|MPI_|PMIX_" || true))
|
||||
|
||||
extra_args=
|
||||
extra_args=()
|
||||
if [ -v TLLM_AUTOTUNER_CACHE_PATH ]; then
|
||||
extra_args+="-x TLLM_AUTOTUNER_CACHE_PATH"
|
||||
extra_args+=(-x TLLM_AUTOTUNER_CACHE_PATH)
|
||||
fi
|
||||
|
||||
set -x
|
||||
mpirun --allow-run-as-root --np ${NP} $extra_args "$@"
|
||||
mpirun --allow-run-as-root --np $NP ${extra_args[@]+"${extra_args[@]}"} "$@"
|
||||
|
||||
@ -5,6 +5,7 @@ import json
|
||||
import re
|
||||
import sqlite3
|
||||
from pathlib import Path
|
||||
from typing import NamedTuple
|
||||
|
||||
import jinja2
|
||||
import numpy as np
|
||||
@ -16,6 +17,50 @@ from parser_utils import (
|
||||
warned_names,
|
||||
)
|
||||
|
||||
|
||||
class NvtxRange(NamedTuple):
|
||||
"""Represents an NVTX range with start/end times and text label."""
|
||||
|
||||
start: int
|
||||
end: int
|
||||
text: str
|
||||
|
||||
|
||||
class KernelRecord(NamedTuple):
|
||||
"""Represents a kernel record from the database query.
|
||||
|
||||
Used for sorting and grouping kernels by runtime and capture time.
|
||||
"""
|
||||
|
||||
problem_id: int
|
||||
run_id: int
|
||||
range_names: tuple[str, ...]
|
||||
kernel_start: int
|
||||
kernel_end: int
|
||||
demangled_name: int # String ID reference
|
||||
runtime_start: int
|
||||
capture_start: int
|
||||
|
||||
|
||||
class KernelTiming(NamedTuple):
|
||||
"""Represents a kernel's timing within a run.
|
||||
|
||||
Used after sorting and grouping for per-run analysis.
|
||||
"""
|
||||
|
||||
demangled_name: int # String ID reference
|
||||
kernel_start: int
|
||||
kernel_end: int
|
||||
range_names: tuple[str, ...]
|
||||
|
||||
|
||||
class CategoryTime(NamedTuple):
|
||||
"""Represents a category (hierarchical path) and its associated time."""
|
||||
|
||||
category: tuple[str, ...]
|
||||
time_ns: float
|
||||
|
||||
|
||||
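# These NamedTuples replace the bare tuples used previously so that the code
# below can sort and group records by field name instead of positional index.
# For example, kernel records are later ordered by (runtime_start, capture_start):
# kernels launched from the same CUDA graph share a runtime_start (the graph
# launch time), and capture_start breaks the tie in original capture order.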
# Parse cmdline
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--file-path", type=str)
|
||||
@ -71,19 +116,19 @@ query = """SELECT T1.start, T2.value AS text
|
||||
JOIN StringIds AS T2 ON T1.textId = T2.id
|
||||
WHERE eventType = ? AND T2.value LIKE ?"""
|
||||
df = pd.read_sql_query(query, conn, params=(event_id_NvtxPushPopRange, "layer_wise_benchmarks %"))
|
||||
problem_start = []
|
||||
problem_set = []
|
||||
problem_start_times: list[int] = []
|
||||
problem_set: list[dict] = []
|
||||
for start, text in df.itertuples(index=False):
|
||||
if text.startswith("layer_wise_benchmarks args {"):
|
||||
run_args = json.loads(text[len("layer_wise_benchmarks args") :])
|
||||
elif text.startswith("layer_wise_benchmarks problem_spec {"):
|
||||
problem_start.append(start)
|
||||
problem_start_times.append(start)
|
||||
problem_set.append(
|
||||
{
|
||||
"spec": json.loads(text[len("layer_wise_benchmarks problem_spec ") :]),
|
||||
"text": "",
|
||||
"runs": [],
|
||||
"runs_end": [],
|
||||
"run_starts": [],
|
||||
"run_ends": [],
|
||||
"ranges": [],
|
||||
"kernel_count_per_range": [],
|
||||
}
|
||||
@ -99,7 +144,7 @@ df = pd.read_sql_query(
|
||||
params=(event_id_NvtxPushPopRange, "[DG]%", nccl_domain_id),
|
||||
)
|
||||
for start, end, text in df.itertuples(index=False):
|
||||
problem_id = bisect.bisect(problem_start, start) - 1
|
||||
problem_id = bisect.bisect(problem_start_times, start) - 1
|
||||
if text.startswith("layer_wise_benchmarks "):
|
||||
if text != "layer_wise_benchmarks ignore":
|
||||
continue
|
||||
@ -107,10 +152,10 @@ for start, end, text in df.itertuples(index=False):
|
||||
assert problem_id != -1
|
||||
if re.match(r"b=\d+ s=\d+ ", text):
|
||||
problem_set[problem_id]["text"] = text
|
||||
problem_set[problem_id]["runs"].append(start)
|
||||
problem_set[problem_id]["runs_end"].append(end)
|
||||
problem_set[problem_id]["run_starts"].append(start)
|
||||
problem_set[problem_id]["run_ends"].append(end)
|
||||
else:
|
||||
problem_set[problem_id]["ranges"].append((start, end, text))
|
||||
problem_set[problem_id]["ranges"].append(NvtxRange(start, end, text))
|
||||
problem_set[problem_id]["kernel_count_per_range"].append(0)
|
||||
|
||||
query = """SELECT name FROM sqlite_master WHERE type = ?"""
|
||||
@ -127,16 +172,14 @@ if "CUPTI_ACTIVITY_KIND_MEMSET" in tables:
|
||||
SELECT T3.start, T3.end, -3 AS demangledName, T3.correlationId, T3.graphNodeId
|
||||
FROM CUPTI_ACTIVITY_KIND_MEMSET AS T3"""
|
||||
query = f"""SELECT unified.start, unified.end, unified.demangledName,
|
||||
R.start AS runtime_start, R.end AS runtime_end,
|
||||
R.start AS capture_start, R.end AS capture_end
|
||||
R.start AS runtime_start, R.start AS capture_start, R.end AS capture_end
|
||||
FROM ({unified_subquery}) AS unified
|
||||
JOIN CUPTI_ACTIVITY_KIND_RUNTIME AS R ON unified.correlationId = R.correlationId
|
||||
WHERE unified.graphNodeId IS NULL"""
|
||||
if "CUDA_GRAPH_NODE_EVENTS" in tables:
|
||||
query += f""" UNION ALL
|
||||
SELECT unified.start, unified.end, unified.demangledName,
|
||||
R.start AS runtime_start, R.end AS runtime_end,
|
||||
CGE2.start AS capture_start, CGE2.end AS capture_end
|
||||
R.start AS runtime_start, CGE2.start AS capture_start, CGE2.end AS capture_end
|
||||
FROM ({unified_subquery}) AS unified
|
||||
JOIN CUPTI_ACTIVITY_KIND_RUNTIME AS R ON unified.graphNodeId IS NOT NULL AND
|
||||
unified.correlationId = R.correlationId
|
||||
@ -144,44 +187,41 @@ if "CUDA_GRAPH_NODE_EVENTS" in tables:
|
||||
CGE1.originalGraphNodeId IS NOT NULL
|
||||
LEFT JOIN CUDA_GRAPH_NODE_EVENTS AS CGE2 ON CGE1.originalGraphNodeId = CGE2.graphNodeId"""
|
||||
df = pd.read_sql_query(query, conn)
|
||||
kernel_list = []
|
||||
kernel_records: list[KernelRecord] = []
|
||||
for (
|
||||
start,
|
||||
end,
|
||||
demangledName,
|
||||
kernel_start,
|
||||
kernel_end,
|
||||
demangled_name,
|
||||
runtime_start,
|
||||
runtime_end,
|
||||
capture_start,
|
||||
capture_end,
|
||||
) in df.itertuples(index=False):
|
||||
problem_id = bisect.bisect(problem_start, start) - 1
|
||||
problem_id = bisect.bisect(problem_start_times, kernel_start) - 1
|
||||
problem = problem_set[problem_id]
|
||||
run_id = bisect.bisect(problem["runs"], runtime_start) - 1
|
||||
if run_id == -1 or runtime_start >= problem["runs_end"][run_id]:
|
||||
run_id = bisect.bisect(problem["run_starts"], runtime_start) - 1
|
||||
if run_id == -1 or runtime_start >= problem["run_ends"][run_id]:
|
||||
continue
|
||||
ranges = [
|
||||
matching_range_indices = [
|
||||
i
|
||||
for i, (range_start, range_end, text) in enumerate(problem["ranges"])
|
||||
if capture_start >= range_start and capture_end <= range_end
|
||||
for i, nvtx_range in enumerate(problem["ranges"])
|
||||
if capture_start >= nvtx_range.start and capture_end <= nvtx_range.end
|
||||
]
|
||||
for range_id in ranges:
|
||||
problem["kernel_count_per_range"][range_id] += 1
|
||||
range_names = [problem["ranges"][i][2] for i in ranges]
|
||||
for range_idx in matching_range_indices:
|
||||
problem["kernel_count_per_range"][range_idx] += 1
|
||||
range_names = tuple(problem["ranges"][i].text for i in matching_range_indices)
|
||||
if (
|
||||
args.module is None or args.module in range_names
|
||||
) and "layer_wise_benchmarks ignore" not in range_names:
|
||||
kernel_list.append(
|
||||
(
|
||||
problem_id,
|
||||
run_id,
|
||||
range_names,
|
||||
start,
|
||||
end,
|
||||
demangledName,
|
||||
runtime_start,
|
||||
runtime_end,
|
||||
capture_start,
|
||||
capture_end,
|
||||
kernel_records.append(
|
||||
KernelRecord(
|
||||
problem_id=problem_id,
|
||||
run_id=run_id,
|
||||
range_names=range_names,
|
||||
kernel_start=kernel_start,
|
||||
kernel_end=kernel_end,
|
||||
demangled_name=demangled_name,
|
||||
runtime_start=runtime_start,
|
||||
capture_start=capture_start,
|
||||
)
|
||||
)
|
||||
|
||||
@ -195,12 +235,10 @@ conn.close()
|
||||
# Check ambiguous modules
|
||||
if args.module:
|
||||
for problem in problem_set:
|
||||
num_matches_per_run = [0] * (len(problem["runs"]) + 1)
|
||||
for (range_start, _, text), kernel_count in zip(
|
||||
problem["ranges"], problem["kernel_count_per_range"]
|
||||
):
|
||||
if text == args.module and kernel_count > 0:
|
||||
num_matches_per_run[bisect.bisect(problem["runs"], range_start)] += 1
|
||||
num_matches_per_run = [0] * (len(problem["run_starts"]) + 1)
|
||||
for nvtx_range, kernel_count in zip(problem["ranges"], problem["kernel_count_per_range"]):
|
||||
if nvtx_range.text == args.module and kernel_count > 0:
|
||||
num_matches_per_run[bisect.bisect(problem["run_starts"], nvtx_range.start)] += 1
|
||||
for run_id_plus_one, num_matches in enumerate(num_matches_per_run):
|
||||
if num_matches > 1:
|
||||
raise ValueError(
|
||||
@ -208,72 +246,70 @@ if args.module:
|
||||
f' in "{problem["text"]}"\'s {run_id_plus_one}-th run'
|
||||
)
|
||||
|
||||
kernel_list.sort(key=lambda t: (t[6], t[8]))
|
||||
kernels = [[[] for _ in problem["runs"]] for problem in problem_set]
|
||||
for (
|
||||
problem_id,
|
||||
run_id,
|
||||
ranges,
|
||||
start,
|
||||
end,
|
||||
demangledName,
|
||||
runtime_start,
|
||||
runtime_end,
|
||||
capture_start,
|
||||
capture_end,
|
||||
) in kernel_list:
|
||||
kernels[problem_id][run_id].append((demangledName, start, end, ranges))
|
||||
for problem_id in range(len(kernels)):
|
||||
required_seq = [demangledName for demangledName, _, _, _ in kernels[problem_id][0]]
|
||||
for run_id in range(len(kernels[problem_id])):
|
||||
seq = [demangledName for demangledName, _, _, _ in kernels[problem_id][run_id]]
|
||||
kernel_records.sort(key=lambda rec: (rec.runtime_start, rec.capture_start))
|
||||
kernels_per_problem: list[list[list[KernelTiming]]] = [
|
||||
[[] for _ in problem["run_starts"]] for problem in problem_set
|
||||
]
|
||||
for rec in kernel_records:
|
||||
kernels_per_problem[rec.problem_id][rec.run_id].append(
|
||||
KernelTiming(
|
||||
demangled_name=rec.demangled_name,
|
||||
kernel_start=rec.kernel_start,
|
||||
kernel_end=rec.kernel_end,
|
||||
range_names=rec.range_names,
|
||||
)
|
||||
)
|
||||
for problem_id, runs in enumerate(kernels_per_problem):
|
||||
required_seq = [kernel.demangled_name for kernel in runs[0]]
|
||||
for run_id, run in enumerate(runs):
|
||||
seq = [kernel.demangled_name for kernel in run]
|
||||
assert seq == required_seq
|
||||
|
||||
converted_seqs = []
|
||||
converted_seqs: list[list[CategoryTime]] = []
|
||||
warmup_times = run_args["warmup_times"] if args.warmup_times is None else args.warmup_times
|
||||
for runs in kernels:
|
||||
converted_seq = []
|
||||
for runs in kernels_per_problem:
|
||||
converted_seq: list[CategoryTime] = []
|
||||
# Kernel time
|
||||
for i, (demangledName, _, _, ranges) in enumerate(runs[0]):
|
||||
name = kernel_short_name(string_ids[demangledName])
|
||||
category = (*ranges, name)
|
||||
time_list = [run[i][2] - run[i][1] for run in runs]
|
||||
t = np.mean(time_list[warmup_times:]).tolist()
|
||||
converted_seq.append((category, t))
|
||||
for i, kernel in enumerate(runs[0]):
|
||||
name = kernel_short_name(string_ids[kernel.demangled_name])
|
||||
category = (*kernel.range_names, name)
|
||||
time_list = [run[i].kernel_end - run[i].kernel_start for run in runs]
|
||||
time_ns = np.mean(time_list[warmup_times:]).tolist()
|
||||
converted_seq.append(CategoryTime(category, time_ns))
|
||||
# Space and Overlap
|
||||
overlap_list = []
|
||||
space_list = []
|
||||
for run in runs:
|
||||
sorted_run = sorted(run, key=lambda op: op[1])
|
||||
last_end = sorted_run[0][1]
|
||||
sorted_run = sorted(run, key=lambda k: k.kernel_start)
|
||||
last_end = sorted_run[0].kernel_start
|
||||
overlap_time = 0
|
||||
space_time = 0
|
||||
for _, start, end, _ in sorted_run:
|
||||
if start > last_end:
|
||||
space_time += start - last_end
|
||||
for kernel in sorted_run:
|
||||
if kernel.kernel_start > last_end:
|
||||
space_time += kernel.kernel_start - last_end
|
||||
else:
|
||||
overlap_time += min(last_end, end) - start
|
||||
last_end = max(last_end, end)
|
||||
overlap_time += min(last_end, kernel.kernel_end) - kernel.kernel_start
|
||||
last_end = max(last_end, kernel.kernel_end)
|
||||
overlap_list.append(-overlap_time)
|
||||
space_list.append(space_time)
|
||||
converted_seq.append((("Overlap",), np.mean(overlap_list[warmup_times:]).tolist()))
|
||||
converted_seq.append((("Space",), np.mean(space_list[warmup_times:]).tolist()))
|
||||
converted_seq.append((("Total",), sum(t for _, t in converted_seq)))
|
||||
converted_seq.append(CategoryTime(("Overlap",), np.mean(overlap_list[warmup_times:]).tolist()))
|
||||
converted_seq.append(CategoryTime(("Space",), np.mean(space_list[warmup_times:]).tolist()))
|
||||
converted_seq.append(CategoryTime(("Total",), sum(ct.time_ns for ct in converted_seq)))
|
||||
converted_seqs.append(converted_seq)
|
||||
if args.error_on_unknown_kernel and warned_names:
|
||||
raise ValueError("Unknown kernel names encountered")
|
||||
|
||||
merged_title = []
|
||||
merged_title: list[tuple[str, ...]] = []
|
||||
for converted_seq in converted_seqs:
|
||||
title = [name for name, _ in converted_seq]
|
||||
title = [ct.category for ct in converted_seq]
|
||||
merged_title = shortest_common_supersequence(merged_title, title)
|
||||
|
||||
merged_data = [[0.0] * len(problem_set) for _ in merged_title]
|
||||
merged_data: list[list[float]] = [[0.0] * len(problem_set) for _ in merged_title]
|
||||
for problem_id, converted_seq in enumerate(converted_seqs):
|
||||
cur = 0
|
||||
for category, t in converted_seq:
|
||||
cur = merged_title.index(category, cur)
|
||||
merged_data[cur][problem_id] = t
|
||||
for ct in converted_seq:
|
||||
cur = merged_title.index(ct.category, cur)
|
||||
merged_data[cur][problem_id] = ct.time_ns
|
||||
cur += 1
|
||||
|
||||
print("Run args:")
|
||||
@ -282,14 +318,14 @@ print(run_args)
|
||||
print("Problem set:")
|
||||
for problem in problem_set:
|
||||
print(
|
||||
f'- "{problem["text"]}" {len(problem["runs"])} runs'
|
||||
f" Ranges: [{', '.join(text for _, end, text in problem['ranges'] if end <= problem['runs_end'][0])}]"
|
||||
f'- "{problem["text"]}" {len(problem["run_starts"])} runs'
|
||||
f" Ranges: [{', '.join(r.text for r in problem['ranges'] if r.end <= problem['run_ends'][0])}]"
|
||||
)
|
||||
|
||||
stack = []
|
||||
csv_data = [["", *[problem["text"] for problem in problem_set]]]
|
||||
js_data = []
|
||||
js_stack = [js_data]
|
||||
stack: list[str] = []
|
||||
csv_data: list[list[str]] = [["", *[problem["text"] for problem in problem_set]]]
|
||||
js_data: list[dict] = []
|
||||
js_stack: list[list[dict]] = [js_data]
|
||||
max_title_len = max((len(title) - 1) * 3 + len(title[-1][:40]) for title in merged_title)
|
||||
print("-" * (max_title_len + 1 + 6 * len(problem_set)))
|
||||
for title, time_data in zip(merged_title, merged_data):
|
||||
@ -330,8 +366,7 @@ with csv_file_path.open("w", newline="") as f:
|
||||
csv_writer = csv.writer(f, quoting=csv.QUOTE_MINIMAL)
|
||||
for row in csv_data:
|
||||
csv_writer.writerow(row)
|
||||
js_header_config = [{"name": problem["text"]} for problem in problem_set]
|
||||
js_header_config = []
|
||||
js_header_config: list[dict] = []
|
||||
for problem in problem_set:
|
||||
innermost_children = js_header_config
|
||||
for k, msg_prefix in [
|
||||
@ -353,35 +388,35 @@ for problem in problem_set:
|
||||
loader = jinja2.FileSystemLoader(Path(__file__).parent)
|
||||
template = jinja2.Environment(loader=loader).get_template("breakdown_template.html")
|
||||
with html_file_path.open("w") as f:
|
||||
configText = (
|
||||
config_text = (
|
||||
"Run:\n"
|
||||
+ json.dumps(run_args, indent=4)
|
||||
+ "\n\nParse:\n"
|
||||
+ json.dumps(args.__dict__, indent=4)
|
||||
)
|
||||
f.write(template.render(headerConfig=js_header_config, rawData=js_data, configText=configText))
|
||||
f.write(template.render(headerConfig=js_header_config, rawData=js_data, configText=config_text))
|
||||
|
||||
if args.query is not None:
|
||||
print("Query:")
|
||||
for query in args.query.split(","):
|
||||
query = query.strip()
|
||||
for query_str in args.query.split(","):
|
||||
query_str = query_str.strip()
|
||||
query_matched = [0.0] * len(problem_set)
|
||||
for title, time_data in zip(merged_title, merged_data):
|
||||
if query in ".".join(title):
|
||||
if query_str in ".".join(title):
|
||||
for i, x in enumerate(time_data):
|
||||
query_matched[i] += x
|
||||
print(
|
||||
query + " " * (max_title_len - len(query)),
|
||||
query_str + " " * (max_title_len - len(query_str)),
|
||||
*[f"{x / 1000:-6.1f}" for x in query_matched],
|
||||
)
|
||||
|
||||
correlation = []
|
||||
for problem, runs in zip(problem_set, kernels):
|
||||
timeline = []
|
||||
for i, (demangledName, _, _, _) in enumerate(runs[0]):
|
||||
name = string_ids[demangledName]
|
||||
duration_list = [run[i][2] - run[i][1] for run in runs]
|
||||
end_list = [run[i][2] - run[0][1] for run in runs]
|
||||
correlation: list[dict] = []
|
||||
for problem, runs in zip(problem_set, kernels_per_problem):
|
||||
timeline: list[dict] = []
|
||||
for i, kernel in enumerate(runs[0]):
|
||||
name = string_ids[kernel.demangled_name]
|
||||
duration_list = [run[i].kernel_end - run[i].kernel_start for run in runs]
|
||||
end_list = [run[i].kernel_end - run[0].kernel_start for run in runs]
|
||||
timeline.append(
|
||||
{
|
||||
"name": name,
|
||||
|
||||
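The parse.py changes above keep the existing "Overlap" and "Space" bookkeeping: within each run, kernels are swept in start order, idle gaps accumulate into Space and concurrent execution into Overlap (negated when appended, so the "Total" row approximates the wall-clock span of the run). A minimal standalone sketch of that sweep on toy intervals, not tied to an nsys export:

```python
# Standalone illustration of the Overlap/Space sweep used in parse.py,
# applied to toy (start, end) intervals instead of kernel records.
from typing import NamedTuple


class Interval(NamedTuple):
    start: int
    end: int


def overlap_and_space(kernels: list[Interval]) -> tuple[int, int]:
    run = sorted(kernels, key=lambda k: k.start)
    last_end = run[0].start
    overlap = space = 0
    for k in run:
        if k.start > last_end:
            space += k.start - last_end                # idle gap on the timeline
        else:
            overlap += min(last_end, k.end) - k.start  # concurrent execution
        last_end = max(last_end, k.end)
    return overlap, space


# Two overlapping kernels followed by a gap: 2 units of overlap, 8 of space.
print(overlap_and_space([Interval(0, 10), Interval(8, 12), Interval(20, 25)]))
```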
@ -4,6 +4,7 @@ import json
|
||||
import re
|
||||
import sqlite3
|
||||
from pathlib import Path
|
||||
from typing import NamedTuple
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
@ -15,7 +16,36 @@ from parser_utils import (
|
||||
)
|
||||
|
||||
|
||||
def comma_separated_ints(s):
|
||||
class IterInfo(NamedTuple):
|
||||
"""Represents an iteration's timing information."""
|
||||
|
||||
start: int
|
||||
end: int
|
||||
iter_id: int
|
||||
|
||||
|
||||
class LayerInfo(NamedTuple):
|
||||
"""Represents a layer's timing information within an iteration."""
|
||||
|
||||
start: int
|
||||
end: int
|
||||
layer_idx: int
|
||||
|
||||
|
||||
class KernelQueryResult(NamedTuple):
|
||||
"""Represents a kernel query result for e2e parsing.
|
||||
|
||||
Sorted by runtime_start for consistent ordering.
|
||||
"""
|
||||
|
||||
runtime_start: int
|
||||
graph_node_id: int | None
|
||||
kernel_start: int
|
||||
kernel_end: int
|
||||
demangled_name: int # String ID reference
|
||||
|
||||
|
||||
def comma_separated_ints(s: str) -> list[int]:
|
||||
return [int(x) for x in s.split(",")]
|
||||
|
||||
|
||||
@ -41,12 +71,12 @@ if args.graph_trace is not None and not args.graph_trace.endswith(".nsys-rep"):
|
||||
print(args)
|
||||
|
||||
|
||||
def is_gemm(name):
|
||||
def is_gemm(name: str) -> bool:
|
||||
return "nvjet" in name or "gemm" in name.lower()
|
||||
|
||||
|
||||
eager_nsys_rep_file_path = Path(args.eager_trace)
|
||||
# For CTX phase which does not use CUDA Graphs, analysis the eager trace instead.
|
||||
# For CTX phase which does not use CUDA Graphs, analyze the eager trace instead.
|
||||
# Here we do not change the identifier name "graph_*" for convenience.
|
||||
graph_nsys_rep_file_path = Path(args.graph_trace or args.eager_trace)
|
||||
eager_sqlite_file_path = eager_nsys_rep_file_path.parent / (
|
||||
@ -89,47 +119,47 @@ if target_gen_reqs is None:
|
||||
else:
|
||||
target_gen_reqs = 0
|
||||
print(f"{target_ctx_reqs=} {target_gen_reqs=}")
|
||||
eager_iters = []
|
||||
eager_iters: list[IterInfo] = []
|
||||
for start, end, text in df.itertuples(index=False):
|
||||
if m := re.match(r"^\[Executor\] _forward_step (\d+): (\d+) ctx reqs, (\d+) gen reqs", text):
|
||||
it = int(m.group(1))
|
||||
iter_id = int(m.group(1))
|
||||
ctx_reqs = int(m.group(2))
|
||||
gen_reqs = int(m.group(3))
|
||||
if ctx_reqs == target_ctx_reqs and gen_reqs == target_gen_reqs:
|
||||
eager_iters.append((start, end, it))
|
||||
eager_iters.append(IterInfo(start, end, iter_id))
|
||||
eager_iters = sorted(eager_iters)[args.warmup_times :]
|
||||
iter_list = [t[2] for t in eager_iters]
|
||||
print("Iters (eager)", *iter_list)
|
||||
per_iter_eager_layers = [[] for _ in iter_list]
|
||||
iter_id_list = [it.iter_id for it in eager_iters]
|
||||
print("Iters (eager)", *iter_id_list)
|
||||
per_iter_eager_layers: list[list[LayerInfo]] = [[] for _ in iter_id_list]
|
||||
for start, end, text in df.itertuples(index=False):
|
||||
if m := re.match(r"^layer_wise_benchmarks layer_idx (\d+)$", text):
|
||||
layer_idx = int(m.group(1))
|
||||
it_idx = bisect.bisect(eager_iters, (start,)) - 1
|
||||
if it_idx < 0 or end > eager_iters[it_idx][1]:
|
||||
iter_idx = bisect.bisect(eager_iters, (start,)) - 1
|
||||
if iter_idx < 0 or end > eager_iters[iter_idx].end:
|
||||
continue
|
||||
assert end <= eager_iters[it_idx][1], "Not belong to any iter"
|
||||
per_iter_eager_layers[it_idx].append((start, end, it_idx, layer_idx))
|
||||
layer_list = [t[3] for t in per_iter_eager_layers[0]]
|
||||
print("Layers (eager)", *layer_list)
|
||||
assert end <= eager_iters[iter_idx].end, "Not belong to any iter"
|
||||
per_iter_eager_layers[iter_idx].append(LayerInfo(start, end, layer_idx))
|
||||
layer_idx_list = [layer.layer_idx for layer in per_iter_eager_layers[0]]
|
||||
print("Layers (eager)", *layer_idx_list)
|
||||
for eager_layers in per_iter_eager_layers:
|
||||
assert [t[3] for t in eager_layers] == layer_list, "inconsistent layer idx"
|
||||
assert [layer.layer_idx for layer in eager_layers] == layer_idx_list, "inconsistent layer idx"
|
||||
df = pd.read_sql_query(query, graph_conn, params=(graph_event_id_NvtxPushPopRange,))
|
||||
graph_iters = []
|
||||
graph_iters: list[IterInfo] = []
|
||||
for start, end, text in df.itertuples(index=False):
|
||||
if m := re.match(r"^\[Executor\] _forward_step (\d+): (\d+) ctx reqs, (\d+) gen reqs", text):
|
||||
it = int(m.group(1))
|
||||
iter_id = int(m.group(1))
|
||||
ctx_reqs = int(m.group(2))
|
||||
gen_reqs = int(m.group(3))
|
||||
if ctx_reqs == target_ctx_reqs and gen_reqs == target_gen_reqs:
|
||||
graph_iters.append((start, end, it))
|
||||
graph_iters.append(IterInfo(start, end, iter_id))
|
||||
graph_iters = sorted(graph_iters)[args.warmup_times :]
|
||||
graph_iter_list = [t[2] for t in graph_iters]
|
||||
print("Iters (graph)", *graph_iter_list)
|
||||
if iter_list != graph_iter_list:
|
||||
graph_iter_id_list = [it.iter_id for it in graph_iters]
|
||||
print("Iters (graph)", *graph_iter_id_list)
|
||||
if iter_id_list != graph_iter_id_list:
|
||||
raise ValueError("The ID of iterations do not match")
|
||||
|
||||
|
||||
def query_kernels(conn, iters):
|
||||
def query_kernels(conn: sqlite3.Connection, iters: list[IterInfo]) -> list[list[KernelQueryResult]]:
|
||||
query = """SELECT name FROM sqlite_master WHERE type = ?"""
|
||||
df = pd.read_sql_query(query, conn, params=("table",))
|
||||
tables = df["name"].tolist()
|
||||
@ -148,16 +178,25 @@ def query_kernels(conn, iters):
|
||||
FROM ({unified_subquery}) AS unified
|
||||
JOIN CUPTI_ACTIVITY_KIND_RUNTIME AS R ON unified.correlationId = R.correlationId"""
|
||||
df = pd.read_sql_query(query, conn)
|
||||
per_iter_kernels = [[] for _ in iters]
|
||||
for start, end, graphNodeId, demangledName, runtime_start, runtime_end in df.itertuples(
|
||||
index=False
|
||||
):
|
||||
it_idx = bisect.bisect(iters, (runtime_start,)) - 1
|
||||
if it_idx < 0 or runtime_end > iters[it_idx][1]:
|
||||
per_iter_kernels: list[list[KernelQueryResult]] = [[] for _ in iters]
|
||||
for (
|
||||
kernel_start,
|
||||
kernel_end,
|
||||
graph_node_id,
|
||||
demangled_name,
|
||||
runtime_start,
|
||||
runtime_end,
|
||||
) in df.itertuples(index=False):
|
||||
iter_idx = bisect.bisect(iters, (runtime_start,)) - 1
|
||||
if iter_idx < 0 or runtime_end > iters[iter_idx].end:
|
||||
continue
|
||||
per_iter_kernels[it_idx].append((runtime_start, graphNodeId, start, end, demangledName))
|
||||
per_iter_kernels[iter_idx].append(
|
||||
KernelQueryResult(
|
||||
runtime_start, graph_node_id, kernel_start, kernel_end, demangled_name
|
||||
)
|
||||
)
|
||||
for kernels in per_iter_kernels:
|
||||
kernels.sort()
|
||||
kernels.sort(key=lambda k: (k.runtime_start, k.graph_node_id))
|
||||
return per_iter_kernels
|
||||
|
||||
|
||||
@ -166,12 +205,14 @@ graph_per_iter_kernels = query_kernels(graph_conn, graph_iters)
|
||||
print("#Kernels (eager)", *[len(kernels) for kernels in eager_per_iter_kernels])
|
||||
print("#Kernels (graph)", *[len(kernels) for kernels in graph_per_iter_kernels])
|
||||
for eager_kernels, graph_kernels in zip(eager_per_iter_kernels, graph_per_iter_kernels):
|
||||
assert all(a[4] == eager_per_iter_kernels[0][i][4] for i, a in enumerate(eager_kernels)), (
|
||||
"eager kernels change across iterations"
|
||||
)
|
||||
assert all(a[4] == graph_per_iter_kernels[0][i][4] for i, a in enumerate(graph_kernels)), (
|
||||
"graph kernels change across iterations"
|
||||
)
|
||||
assert all(
|
||||
kernel.demangled_name == eager_per_iter_kernels[0][i].demangled_name
|
||||
for i, kernel in enumerate(eager_kernels)
|
||||
), "eager kernels change across iterations"
|
||||
assert all(
|
||||
kernel.demangled_name == graph_per_iter_kernels[0][i].demangled_name
|
||||
for i, kernel in enumerate(graph_kernels)
|
||||
), "graph kernels change across iterations"
|
||||
|
||||
query = "SELECT * FROM StringIds"
|
||||
df = pd.read_sql_query(query, eager_conn)
|
||||
@ -184,26 +225,33 @@ graph_string_ids.update({-2: "Memcpy", -3: "Memset"})
|
||||
eager_conn.close()
|
||||
graph_conn.close()
|
||||
|
||||
eager_kernel_names = [eager_string_ids[kernel[4]] for kernel in eager_per_iter_kernels[0]]
|
||||
graph_kernel_names = [graph_string_ids[kernel[4]] for kernel in graph_per_iter_kernels[0]]
|
||||
eager_kernel_names = [
|
||||
eager_string_ids[kernel.demangled_name] for kernel in eager_per_iter_kernels[0]
|
||||
]
|
||||
graph_kernel_names = [
|
||||
graph_string_ids[kernel.demangled_name] for kernel in graph_per_iter_kernels[0]
|
||||
]
|
||||
super_kernel_names = shortest_common_supersequence(eager_kernel_names, graph_kernel_names)
|
||||
print(f"#Kernels (supersequence) {len(super_kernel_names)}")
|
||||
eager_per_layer_kernels = [[] for _ in layer_list]
|
||||
eager_per_layer_kernels: list[list[int]] = [[] for _ in layer_idx_list]
|
||||
for i, eager_kernel in enumerate(eager_per_iter_kernels[0]):
|
||||
eager_layers_idx = bisect.bisect(per_iter_eager_layers[0], (eager_kernel[0],)) - 1
|
||||
if eager_layers_idx < 0 or eager_kernel[0] > per_iter_eager_layers[0][eager_layers_idx][1]:
|
||||
eager_layer_idx = bisect.bisect(per_iter_eager_layers[0], (eager_kernel.runtime_start,)) - 1
|
||||
if (
|
||||
eager_layer_idx < 0
|
||||
or eager_kernel.runtime_start > per_iter_eager_layers[0][eager_layer_idx].end
|
||||
):
|
||||
continue
|
||||
eager_per_layer_kernels[eager_layers_idx].append(i)
|
||||
eager2super = []
|
||||
eager_per_layer_kernels[eager_layer_idx].append(i)
|
||||
eager2super: list[int] = []
|
||||
j = 0
|
||||
for i, eager_kernel_name in enumerate(eager_kernel_names):
|
||||
for eager_kernel_name in eager_kernel_names:
|
||||
while eager_kernel_name != super_kernel_names[j]:
|
||||
j += 1
|
||||
eager2super.append(j)
|
||||
j += 1
|
||||
super_per_layer_starts = [eager2super[a[0]] for a in eager_per_layer_kernels]
|
||||
super_per_layer_ends = [eager2super[a[-1]] for a in eager_per_layer_kernels]
|
||||
graph_per_layer_kernels = [[] for _ in layer_list]
|
||||
super_per_layer_starts = [eager2super[indices[0]] for indices in eager_per_layer_kernels]
|
||||
super_per_layer_ends = [eager2super[indices[-1]] for indices in eager_per_layer_kernels]
|
||||
graph_per_layer_kernels: list[list[int]] = [[] for _ in layer_idx_list]
|
||||
j = 0
|
||||
for i, graph_kernel_name in enumerate(graph_kernel_names):
|
||||
while graph_kernel_name != super_kernel_names[j]:
|
||||
@ -212,16 +260,16 @@ for i, graph_kernel_name in enumerate(graph_kernel_names):
|
||||
if layer_idx >= 0 and j <= super_per_layer_ends[layer_idx]:
|
||||
graph_per_layer_kernels[layer_idx].append(i)
|
||||
j += 1
|
||||
timeline = []
|
||||
timeline: list[dict] = []
|
||||
first_kernel_idx = min(graph_per_layer_kernels[layer_idx][0] for layer_idx in args.layer_indices)
|
||||
for layer_idx in args.layer_indices:
|
||||
for kernel_idx in graph_per_layer_kernels[layer_idx]:
|
||||
duration_list = []
|
||||
end_list = []
|
||||
for it_idx in range(len(graph_per_iter_kernels)):
|
||||
layer_start_time = graph_per_iter_kernels[it_idx][first_kernel_idx][2]
|
||||
kernel_start_time = graph_per_iter_kernels[it_idx][kernel_idx][2]
|
||||
kernel_end_time = graph_per_iter_kernels[it_idx][kernel_idx][3]
|
||||
duration_list: list[int] = []
|
||||
end_list: list[int] = []
|
||||
for iter_idx in range(len(graph_per_iter_kernels)):
|
||||
layer_start_time = graph_per_iter_kernels[iter_idx][first_kernel_idx].kernel_start
|
||||
kernel_start_time = graph_per_iter_kernels[iter_idx][kernel_idx].kernel_start
|
||||
kernel_end_time = graph_per_iter_kernels[iter_idx][kernel_idx].kernel_end
|
||||
duration_list.append(kernel_end_time - kernel_start_time)
|
||||
end_list.append(kernel_end_time - layer_start_time)
|
||||
timeline.append(
|
||||
@ -233,9 +281,11 @@ for layer_idx in args.layer_indices:
|
||||
)
|
||||
print(f"{'Kernel':40s} {'Duration':>8s} {'End':>8s}")
|
||||
print("-" * (40 + 1 + 8 + 1 + 8))
|
||||
for o in timeline:
|
||||
for entry in timeline:
|
||||
print(
|
||||
f"{kernel_short_name(o['name'])[:40]:40s} {o['duration'] / 1000.0:-8.1f} {o['end'] / 1000.0:-8.1f}"
|
||||
f"{kernel_short_name(entry['name'])[:40]:40s} "
|
||||
f"{entry['duration'] / 1000.0:-8.1f} "
|
||||
f"{entry['end'] / 1000.0:-8.1f}"
|
||||
)
|
||||
if args.error_on_unknown_kernel and warned_names:
|
||||
raise ValueError("Unknown kernel names encountered")
|
||||
|
||||
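parse_e2e.py aligns the eager and CUDA-graph kernel-name sequences with a `shortest_common_supersequence` helper imported from `parser_utils` (not shown in this diff). For reference, here is a minimal sketch of such a function built on the standard LCS dynamic program; the real helper may differ in details.

```python
# Minimal sketch of a shortest-common-supersequence helper; the actual
# implementation lives in parser_utils.py and may differ.
def shortest_common_supersequence(a: list, b: list) -> list:
    n, m = len(a), len(b)
    # dp[i][j] = length of the longest common subsequence of a[:i] and b[:j]
    dp = [[0] * (m + 1) for _ in range(n + 1)]
    for i in range(n):
        for j in range(m):
            dp[i + 1][j + 1] = dp[i][j] + 1 if a[i] == b[j] else max(dp[i][j + 1], dp[i + 1][j])
    # Backtrack, emitting shared elements once and unshared elements from both.
    out, i, j = [], n, m
    while i > 0 and j > 0:
        if a[i - 1] == b[j - 1]:
            out.append(a[i - 1])
            i -= 1
            j -= 1
        elif dp[i - 1][j] >= dp[i][j - 1]:
            out.append(a[i - 1])
            i -= 1
        else:
            out.append(b[j - 1])
            j -= 1
    out.extend(reversed(a[:i]))
    out.extend(reversed(b[:j]))
    return out[::-1]


print(shortest_common_supersequence(["gemm", "moe", "allreduce"], ["gemm", "copy", "allreduce"]))
# -> ['gemm', 'copy', 'moe', 'allreduce']: both inputs appear in order in the result.
```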
@ -14,35 +14,39 @@ if [ "$RANK" -eq 0 ]; then
|
||||
fi
|
||||
|
||||
PROFILE_DIR=${PROFILE_DIR:-profiles}
|
||||
mkdir -p ${PROFILE_DIR}
|
||||
mkdir -p -- "$PROFILE_DIR"
|
||||
|
||||
PROFILE=${PROFILE:-1}
|
||||
BACKTRACE=${BACKTRACE:-0}
|
||||
GPU_METRICS=${GPU_METRICS:-0}
|
||||
if [ "$PROFILE" -eq 1 ]; then
|
||||
PROFILE_CMD="nsys profile
|
||||
PROFILE_CMD=(
|
||||
nsys profile
|
||||
-t cuda,nvtx
|
||||
--cpuctxsw none --cuda-event-trace false
|
||||
--cuda-graph-trace node
|
||||
-c cudaProfilerApi --capture-range-end stop
|
||||
-o ${PROFILE_DIR}/report_np${WORLD_SIZE}_rank${RANK}.nsys-rep
|
||||
--force-overwrite true"
|
||||
-o "${PROFILE_DIR}/report_np${WORLD_SIZE}_rank${RANK}.nsys-rep"
|
||||
--force-overwrite true
|
||||
)
|
||||
if [ "$BACKTRACE" -eq 1 ]; then
|
||||
PROFILE_CMD+=" --python-backtrace=cuda --cudabacktrace all"
|
||||
PROFILE_CMD+=(--python-backtrace=cuda --cudabacktrace all)
|
||||
else
|
||||
PROFILE_CMD+=" -s none"
|
||||
PROFILE_CMD+=(-s none)
|
||||
fi
|
||||
if [ "$GPU_METRICS" -eq 1 ]; then
|
||||
PROFILE_CMD+=" --gpu-metrics-devices $LOCAL_RANK
|
||||
--gpu-metrics-frequency 10000"
|
||||
PROFILE_CMD+=(
|
||||
--gpu-metrics-devices $LOCAL_RANK
|
||||
--gpu-metrics-frequency 10000
|
||||
)
|
||||
fi
|
||||
else
|
||||
PROFILE_CMD=
|
||||
PROFILE_CMD=()
|
||||
fi
|
||||
|
||||
SCRIPT_PATH=$(realpath --relative-to="$(pwd)" "$(dirname -- "$0")"/run.py)
|
||||
SCRIPT_PATH=$(realpath --relative-to="$(pwd)" -- "$(dirname -- "$0")"/run.py)
|
||||
|
||||
set -x
|
||||
$PROFILE_CMD bash -o pipefail -c \
|
||||
"python3 -u \"\$1\" \"\${@:3}\" 2>&1 | tee \"\$2/report_np${WORLD_SIZE}_rank${RANK}.log\"" \
|
||||
${PROFILE_CMD[@]+"${PROFILE_CMD[@]}"} bash -o pipefail -c \
|
||||
'python3 -u "$1" "${@:3}" 2>&1 | tee "$2/report_np'"${WORLD_SIZE}"'_rank'"${RANK}"'.log"' \
|
||||
bash "$SCRIPT_PATH" "$PROFILE_DIR" "$@"
|
||||
|
||||
@ -12,14 +12,14 @@ export PROFILE_DIR="${PROFILE_DIR:-profiles}"
|
||||
export TLLM_AUTOTUNER_CACHE_PATH="$PROFILE_DIR/sample_performance_alignment_cache.json"
|
||||
|
||||
mkdir -p -- "$PROFILE_DIR"
|
||||
mkdir -p -- "$(dirname "$TLLM_AUTOTUNER_CACHE_PATH")"
|
||||
mkdir -p -- "$(dirname -- "$TLLM_AUTOTUNER_CACHE_PATH")"
|
||||
|
||||
python3 ../../benchmarks/cpp/prepare_dataset.py \
|
||||
--tokenizer "$MODEL" \
|
||||
--stdout \
|
||||
--random-seed 42 \
|
||||
token-norm-dist \
|
||||
--num-requests $((BATCH_SIZE*NP)) \
|
||||
--num-requests $((BATCH_SIZE * NP)) \
|
||||
--input-mean 2048 \
|
||||
--input-stdev 0 \
|
||||
--output-mean 256 \
|
||||
@ -61,8 +61,8 @@ trtllm-bench \
|
||||
--max_batch_size $BATCH_SIZE \
|
||||
--max_num_tokens 3072 \
|
||||
--disable_chunked_context \
|
||||
--num_requests $((BATCH_SIZE*NP)) \
|
||||
--concurrency $((BATCH_SIZE*NP)) \
|
||||
--num_requests $((BATCH_SIZE * NP)) \
|
||||
--concurrency $((BATCH_SIZE * NP)) \
|
||||
--config /tmp/config_collect.yaml
|
||||
|
||||
# Step 2
|
||||
@ -98,8 +98,8 @@ trtllm-bench \
|
||||
--max_batch_size $BATCH_SIZE \
|
||||
--max_num_tokens 3072 \
|
||||
--disable_chunked_context \
|
||||
--num_requests $((BATCH_SIZE*NP)) \
|
||||
--concurrency $((BATCH_SIZE*NP)) \
|
||||
--num_requests $((BATCH_SIZE * NP)) \
|
||||
--concurrency $((BATCH_SIZE * NP)) \
|
||||
--config /tmp/config_mark.yaml
|
||||
|
||||
# Step 3
|
||||
|
||||
@ -4,8 +4,8 @@ set -euo pipefail
|
||||
|
||||
# CONTAINER_IMAGE=
|
||||
CONTAINER_NAME=${CONTAINER_NAME:-layer_wise_benchmarks}
|
||||
TRTLLM_ROOT=$(realpath "$(dirname -- "$0")"/../..)
|
||||
CONTAINER_MOUNTS=$TRTLLM_ROOT:$TRTLLM_ROOT
|
||||
TRTLLM_ROOT=$(realpath -- "$(dirname -- "$0")"/../..)
|
||||
CONTAINER_MOUNTS="$TRTLLM_ROOT:$TRTLLM_ROOT"
|
||||
|
||||
if [ -z "${SLURM_JOB_ID:-}" ]; then
|
||||
echo "Please set SLURM_JOB_ID"
|
||||
@ -18,9 +18,9 @@ if [ -z "${CONTAINER_IMAGE:-}" ]; then
|
||||
# Read Docker image from current_image_tags.properties
|
||||
MACHINE="$(srun -N 1 uname -m)"
|
||||
if [ "$MACHINE" == "x86_64" ]; then
|
||||
DOCKER_IMAGE=$(source "$TRTLLM_ROOT/jenkins/current_image_tags.properties" && echo $LLM_DOCKER_IMAGE)
|
||||
DOCKER_IMAGE=$(source "$TRTLLM_ROOT/jenkins/current_image_tags.properties" && echo "$LLM_DOCKER_IMAGE")
|
||||
elif [ "$MACHINE" == "aarch64" ]; then
|
||||
DOCKER_IMAGE=$(source "$TRTLLM_ROOT/jenkins/current_image_tags.properties" && echo $LLM_SBSA_DOCKER_IMAGE)
|
||||
DOCKER_IMAGE=$(source "$TRTLLM_ROOT/jenkins/current_image_tags.properties" && echo "$LLM_SBSA_DOCKER_IMAGE")
|
||||
else
|
||||
echo "Unsupported machine hardware name \"$MACHINE\""
|
||||
exit 1
|
||||
@ -31,7 +31,7 @@ if [ -z "${CONTAINER_IMAGE:-}" ]; then
|
||||
echo "CONTAINER_IMAGE was not set, using Docker image $DOCKER_IMAGE"
|
||||
|
||||
# Import to .sqsh file
|
||||
SQSH_FILE_NAME=$(echo "$DOCKER_IMAGE" |
|
||||
SQSH_FILE_NAME=$(printf '%s\n' "$DOCKER_IMAGE" |
|
||||
awk -F'#' '{print $2}' |
|
||||
awk -F':' '{gsub(/\//,"+",$1); print $1"+"$2".sqsh"}')
|
||||
CONTAINER_IMAGE="$TRTLLM_ROOT/enroot/$SQSH_FILE_NAME"
|
||||
@ -41,7 +41,7 @@ if [ -z "${CONTAINER_IMAGE:-}" ]; then
|
||||
fi
|
||||
fi
|
||||
|
||||
WORKDIR=$(realpath "$(pwd)")
|
||||
WORKDIR=$(realpath -- "$(pwd)")
|
||||
|
||||
set -x
|
||||
srun -N "$NODES" \
|
||||
@ -50,7 +50,7 @@ srun -N "$NODES" \
|
||||
--container-name "$CONTAINER_NAME" \
|
||||
--container-mounts "$CONTAINER_MOUNTS" \
|
||||
--container-workdir "$WORKDIR" \
|
||||
bash -c "cd \"\$1\" &&
|
||||
bash -c 'cd "$1" &&
|
||||
pip install -U packaging &&
|
||||
pip install -r requirements.txt --no-build-isolation &&
|
||||
pip install -e ." bash "$TRTLLM_ROOT"
|
||||
pip install -e .' bash "$TRTLLM_ROOT"
|
||||
|
||||
@ -3,20 +3,20 @@
|
||||
set -euo pipefail
|
||||
|
||||
CONTAINER_NAME=${CONTAINER_NAME:-layer_wise_benchmarks}
|
||||
TRTLLM_ROOT=$(realpath "$(dirname -- "$0")"/../..)
|
||||
CONTAINER_MOUNTS=$TRTLLM_ROOT:$TRTLLM_ROOT
|
||||
TRTLLM_ROOT=$(realpath -- "$(dirname -- "$0")"/../..)
|
||||
CONTAINER_MOUNTS="$TRTLLM_ROOT:$TRTLLM_ROOT"
|
||||
|
||||
if [ -z "${SLURM_JOB_ID:-}" ]; then
|
||||
echo "Please set SLURM_JOB_ID"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
WORKDIR=$(realpath "$(pwd)")
|
||||
WORKDIR=$(realpath -- "$(pwd)")
|
||||
|
||||
set -x
|
||||
srun --mpi=pmix \
|
||||
-N "$NODES" \
|
||||
--ntasks-per-node $(($NP / $NODES)) \
|
||||
--ntasks-per-node $((NP / NODES)) \
|
||||
--container-name "$CONTAINER_NAME" \
|
||||
--container-mounts "$CONTAINER_MOUNTS" \
|
||||
--container-workdir "$WORKDIR" \
|
||||
|
||||
@ -8,22 +8,23 @@ if [ -z "${SLURM_JOB_ID:-}" ]; then
|
||||
fi
|
||||
|
||||
prefix="pyxis_${SLURM_JOB_ID}_"
|
||||
matches=$(printf "%s\n" "$(srun -N 1 enroot list)" | grep "^${prefix}" || true)
|
||||
count=$(printf "%s\n" "$matches" | wc -l)
|
||||
matches=$(printf '%s\n' "$(srun -N 1 enroot list)" | grep "^${prefix}" || true)
|
||||
|
||||
if [ "$count" -eq 0 ]; then
|
||||
if [ -z "$matches" ]; then
|
||||
echo "Error: No container found" >&2
|
||||
exit 1
|
||||
else
|
||||
count=$(printf '%s\n' "$matches" | wc -l)
|
||||
fi
|
||||
|
||||
if [ "$count" -gt 1 ]; then
|
||||
echo "Error: Multiple containers found" >&2
|
||||
while IFS= read -r match; do
|
||||
echo "- ${match#$prefix}" >&2
|
||||
echo "- ${match#"$prefix"}" >&2
|
||||
done <<< "$matches"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
suffix=${matches#$prefix}
|
||||
suffix=${matches#"$prefix"}
|
||||
echo "Container name: $suffix" >&2
|
||||
echo "$suffix"
|
||||
printf '%s\n' "$suffix"
|
||||
|
||||