diff --git a/examples/layer_wise_benchmarks/README.md b/examples/layer_wise_benchmarks/README.md
index 7bdc794a9b..21964ae3e6 100644
--- a/examples/layer_wise_benchmarks/README.md
+++ b/examples/layer_wise_benchmarks/README.md
@@ -1,10 +1,12 @@
 # Layer-wise Benchmarks

+This tool profiles individual layers of LLMs to help understand the performance characteristics of each layer and compare layer-wise benchmarks with end-to-end profiling results.
+
 ## Generate profiles

 ### Run with OpenMPI

-**Step 1:** Start a container using Docker, Enroot or others. Please refer to `../../jenkins/current_image_tags.properties` for the Docker image URI.
+**Step 1:** Start a container using Docker, Enroot, or other container runtimes. Please refer to `../../jenkins/current_image_tags.properties` for the Docker image URI.

 **Step 2:** In the container, install `tensorrt_llm`:

@@ -19,7 +21,7 @@ pip install -e ../..
 NP=4 ./mpi_launch.sh ./run.sh config_ctx.yaml
 NP=4 ./mpi_launch.sh ./run.sh config_gen.yaml

-# Run with weights loaded. Requires local model directory
+# Run with weights loaded (requires a local model directory)
 NP=4 ./mpi_launch.sh ./run.sh config_ctx.yaml --model "$LLM_MODELS_ROOT/DeepSeek-R1/DeepSeek-R1-0528-FP4-v2" --load-format AUTO
 NP=4 ./mpi_launch.sh ./run.sh config_gen.yaml --model "$LLM_MODELS_ROOT/DeepSeek-R1/DeepSeek-R1-0528-FP4-v2" --load-format AUTO

@@ -46,10 +48,10 @@ NP=4 ./mpi_launch.sh ./run.sh config_gen.yaml --batch-size 32 --seq-len-q 4
 NP=4 ./mpi_launch.sh ./run.sh config_ctx.yaml --layer-indices 5,6,7,8
 NP=4 ./mpi_launch.sh ./run.sh config_gen.yaml --layer-indices 5,6,7,8

-# Scale DEP=16 to 4 GPUs: reduce the number of experts, uses MNNVL A2A if applicable
+# Scale DEP=16 to 4 GPUs: reduces the number of experts; uses MNNVL A2A if applicable
 NP=4 ./mpi_launch.sh ./run.sh config_gen.yaml --scaled-from 16 --moe-backend WIDEEP

-# Scale TEP=16 to 4 GPUs: reduce the number of attention heads and experts
+# Scale TEP=16 to 4 GPUs: reduces the number of attention heads and experts
 NP=4 ./mpi_launch.sh ./run.sh config_gen.yaml --scaled-from 16 --no-enable-attention-dp

 # Run Nemotron-3-Nano
@@ -64,12 +66,12 @@ NP=2 ./mpi_launch.sh ./run.sh config_gen.yaml --model Qwen/Qwen3-Next-80B-A3B-In
 NP=4 ./mpi_launch.sh -x TRTLLM_FORCE_ALLTOALL_METHOD=DeepEP ./run.sh config_ctx.yaml --moe-backend WIDEEP
 NP=4 ./mpi_launch.sh -x TRTLLM_FORCE_ALLTOALL_METHOD=DeepEP ./run.sh config_gen.yaml --moe-backend WIDEEP

-# Run with imbalanced ranks: except for activating all experts, a% of the tokens are sent to the 1st rank
-# Note: if balance ratio is 0, ignore activating all experts
+# Run with imbalanced ranks: in addition to activating all experts, the specified ratio of tokens is sent to rank 0
+# Note: if balance ratio is 0, the "activate all experts" behavior is not applied
 NP=4 ./mpi_launch.sh ./run.sh config_ctx.yaml --balance-method ImbalancedRanks --balance-ratio 0.5
 NP=4 ./mpi_launch.sh ./run.sh config_gen.yaml --balance-method ImbalancedRanks --balance-ratio 0.5

-# Run with imbalanced experts and balanced ranks: except for activating all experts, a% of the tokens are sent to the front experts on each rank
+# Run with imbalanced experts and balanced ranks: in addition to activating all experts, the specified ratio of tokens is sent to the front experts on each rank
 NP=4 ./mpi_launch.sh ./run.sh config_ctx.yaml --balance-method ImbalancedExperts --balance-ratio 0.5
 NP=4 ./mpi_launch.sh ./run.sh config_gen.yaml --balance-method ImbalancedExperts --balance-ratio 0.5
 ```
@@ -77,8 +79,8 @@ NP=4 ./mpi_launch.sh ./run.sh config_gen.yaml --balance-method ImbalancedExperts
 ### Run with Slurm

 > Tips:
-> 1. If you have a running Slurm job, you can set environment variable `export SLURM_JOB_ID=aaa` and skip step 1.
-> 2. Further, if you have installed `tensorrt_llm` in the Slurm job, you can also skip step 2. Just run step 3 with `export CONTAINER_NAME=aaa` specified. If you don't know the container name, run `export CONTAINER_NAME=$(./slurm_query_container_name.sh)` to get it.
+> 1. If you have a running Slurm job, you can set the environment variable by running `export SLURM_JOB_ID=<job_id>` and skip Step 1.
+> 2. Further, if you have already installed `tensorrt_llm` in the Slurm job, you can also skip Step 2. Just run Step 3 with `export CONTAINER_NAME=<container_name>` specified. If you don't know the container name, run `export CONTAINER_NAME=$(./slurm_query_container_name.sh)` to get it.

 **Step 1:** On the controller node, allocate one or multiple nodes, and export the `SLURM_JOB_ID`:

@@ -86,7 +88,7 @@ NP=4 ./mpi_launch.sh ./run.sh config_gen.yaml --balance-method ImbalancedExperts
 export SLURM_JOB_ID=$(NODES=4 TIME=02:00:00 ./slurm_alloc.sh)
 ```

-Please fill the variables in `./slurm_alloc.sh`.
+Please set the variables in `./slurm_alloc.sh` before running.

 **Step 2:** Start a container and install `tensorrt_llm`. Run the following command on the controller node:

@@ -94,9 +96,9 @@ Please fill the variables in `./slurm_alloc.sh`.
 ./slurm_init_containers.sh
 ```

-It uses the image recorded in `../../jenkins/current_image_tags.properties`. The image will be downloaded to `../../enroot/` for once.
+This script uses the image recorded in `../../jenkins/current_image_tags.properties`. The image will be downloaded to `../../enroot/` once.

-> Tips: If you want to change the image, no need to reallocate Slurm jobs. Just start another container by running step 2 with `export CONTAINER_NAME=aaa`, and step 3 will run in the container specified by the `CONTAINER_NAME` env.
+> Tip: If you want to change the image, there is no need to reallocate Slurm jobs. Just start another container by running Step 2 with `export CONTAINER_NAME=<container_name>`, and Step 3 will run in the container specified by the `CONTAINER_NAME` environment variable.

 **(Optional) Get an interactive shell**

@@ -117,7 +119,7 @@ python3 scripts/build_wheel.py --cuda_architectures native --no-venv --skip_buil
 **Step 3:** Run benchmarks to generate profiles. Run the following command on the controller node, where `NODES` ≤ the number of allocated nodes:

 ```bash
-# Run DeepSeek-R1 NVFP4 with wide ep: uses MNNVL A2A if applicable
+# Run DeepSeek-R1 NVFP4 with wide EP; uses MNNVL A2A if applicable
 NODES=4 NP=16 ./slurm_launch.sh ./run.sh config_gen.yaml --moe-backend WIDEEP

 # Run with TRTLLMGen
@@ -126,7 +128,7 @@ NODES=4 NP=16 ./slurm_launch.sh ./run.sh config_gen.yaml --moe-backend TRTLLM
 # Run with DeepEPLowLatency
 NODES=4 NP=16 TRTLLM_FORCE_ALLTOALL_METHOD=DeepEPLowLatency ./slurm_launch.sh ./run.sh config_gen.yaml --moe-backend WIDEEP

-# You can run 4-GPU and 8-GPU tasks without reallocating the slurm job
+# You can run 4-GPU and 8-GPU tasks without reallocating the Slurm job
 NODES=1 NP=4 ./slurm_launch.sh ./run.sh config_ctx.yaml
 NODES=2 NP=8 ./slurm_launch.sh ./run.sh config_gen.yaml
 ```
@@ -141,7 +143,7 @@ Supported list arguments:
 - `--seq-len-kv-cache` (or `seq_len_kv_cache` in YAML)
 - `--balance-ratio` (or `balance_ratio` in YAML)

-Command line arguments are comma separated, for example, `--batch-size 1,2,4`.
Configs in the YAML file are lists, for example, `batch_size: [1, 2, 4]`. +Command-line arguments are comma-separated, for example, `--batch-size 1,2,4`. Values in the YAML file are lists, for example, `batch_size: [1, 2, 4]`. Run with OpenMPI: @@ -166,18 +168,19 @@ python3 parse.py --profile-dir ./profiles --world-size 4 --rank 0 python3 parse.py --world-size 4 --module MoE ``` -You will receive three reports, each containing kernel timing statistics grouped by module: +You will receive four reports, each containing kernel timing statistics grouped by module: 1. A printed report on stdout 2. A CSV report at `profiles/report_np4_rank0.csv` 3. An HTML report at `profiles/report_np4_rank0.html` +4. A JSON report at `profiles/report_np4_rank0.json` (for correlation analysis) ## Performance alignment between end-to-end performance and layer-wise benchmarks -An overall example can be found in `sample_performance_alignment.sh`. Here is an abstract of the main steps. +A complete example can be found in `sample_performance_alignment.sh`. Below is an overview of the main steps. -1. Run end-to-end serving in **COLLECT** mode, and capture nsys profiles. This step generates a calibration file. +1. Run end-to-end serving in **COLLECT** mode and capture nsys profiles. This step generates a calibration file. - Please meet the following requirements. + Requirements: 1. Add the following fields to `config.yaml`. @@ -187,13 +190,13 @@ An overall example can be found in `sample_performance_alignment.sh`. Here is an calibration_file_path: profiles/calibration_data.json ``` - 2. Set `TLLM_PROFILE_START_STOP` to a range that can capture some iterations (typically tens of iterations) of GEN phase. Ensure every iteration has the same batch size. Please capture 5 more iterations at beginning, because the first 5 iterations are regarded as warm-ups and will be dropped by the parser by default. + 2. Set `TLLM_PROFILE_START_STOP` to a range that captures some iterations (typically tens of iterations) of the GEN phase. Ensure that every iteration has the same batch size. Capture 5 extra iterations at the beginning, because the first 5 iterations are treated as warm-ups and will be dropped by the parser by default. - 3. Capture per-rank nsys profiles, and every rank should produce a separate file. + 3. Capture per-rank nsys profiles; each rank should produce a separate file. - You need to put `nsys profile` behind `mpirun` or `srun`. To minimize profile overhead and file size, there is no need to capture samples and GPU metrics. + Place `nsys profile` after `mpirun` or `srun`. To minimize profiling overhead and file size, there is no need to capture samples or GPU metrics. - If you use `trtllm-serve` or `trtllm-bench`, please follow the following command order. If you use `examples/disaggregated/slurm/benchmark/submit.py`, setting `gen_profile_range` is enough. + If you use `trtllm-serve` or `trtllm-bench`, use the following command order. If you use `examples/disaggregated/slurm/benchmark/submit.py`, setting `gen_profile_range` is sufficient. ```bash NP=$NP ./mpi_launch.sh middleware/mpi_env_from_ompi \ @@ -209,11 +212,11 @@ An overall example can be found in `sample_performance_alignment.sh`. Here is an --model ... ``` - 4. To be more precise, set the same `TLLM_AUTOTUNER_CACHE_PATH` for all the steps. The autotuner cache file should be generated by Step 1, and be reused by Step 2 and Step 3. + 4. For more accurate results, set the same `TLLM_AUTOTUNER_CACHE_PATH` for all steps. 
The autotuner cache file should be generated in Step 1 and reused in Steps 2 and 3. -2. If the end-to-end serving uses CUDA Graphs, run Step 1 again in **MARK** mode without CUDA Graphs, and also capture nsys profiles. +2. If the end-to-end serving uses CUDA Graphs, run Step 1 again in **MARK** mode without CUDA Graphs and capture nsys profiles. - The differences are as follows. + The differences from Step 1 are as follows: 1. Add the following fields to `config.yaml`. @@ -241,18 +244,18 @@ An overall example can be found in `sample_performance_alignment.sh`. Here is an --replay-stop 67 ``` - Here are explanations of every argument: + Argument explanations: | Argument/Parameter | Explanation | - |-------------------|-------------| + | ------------------ | ----------- | | `NP=4` | Should match the end-to-end run. | - | `--load-format AUTO` | Instruct the benchmark to load model weights instead of initializing random weights. | - | `--layer-indices 5,6,7` | A list of contiguous layers you want to calibrate. | + | `--load-format AUTO` | Instructs the benchmark to load model weights instead of using random weights. | + | `--layer-indices 5,6,7` | A list of contiguous layers to calibrate. | | `--batch-size 32` | Should match the end-to-end run. | - | `--seq-len-q 1` | Should match (1+MTP) of the end-to-end run. | - | `--seq-len-kv-cache 2090` | Estimation of the average context length for iterations you captured. The first 5 iterations should be excluded from the estimation, because they will be dropped by parser. | - | `--replay-file-path` | The calibration file obtained by Step 1. | - | `--replay-start` and `--replay-stop` | Should match the end-to-end `TLLM_PROFILE_START_STOP`. Do not replay the first 5 iterations, because they will be dropped by parser. | + | `--seq-len-q 1` | Should match (1 + MTP) of the end-to-end run. | + | `--seq-len-kv-cache 2090` | An estimate of the average context length for the captured iterations. The first 5 iterations should be excluded from this estimate because they will be dropped by the parser. | + | `--replay-file-path` | The calibration file obtained from Step 1. | + | `--replay-start` and `--replay-stop` | Should match the end-to-end `TLLM_PROFILE_START_STOP`. Do not replay the first 5 iterations because they will be dropped by the parser. | 4. Parse end-to-end profiles with `parse_e2e.py`, and parse layer-wise benchmarks profiles with `parse.py`. @@ -278,30 +281,30 @@ An overall example can be found in `sample_performance_alignment.sh`. Here is an -o profiles/correlation.html ``` - Please find `profiles/correlation.html` for the report. + The report can be found at `profiles/correlation.html`. Limitations: 1. Pipeline parallelism is not supported. -2. MoE backends CUTLASS and WIDEEP are supported. -3. Only tested with GEN phase and attention DP. +2. Only the CUTLASS and WIDEEP MoE backends are supported. +3. Only tested with the GEN phase and attention DP. ## Developer utilities -1. Less startup time when debug a model +1. Reduce startup time when debugging a model 1. Set autotuner cache or disable autotuner - 1. Set autotuner cache: add `TLLM_AUTOTUNER_CACHE_PATH=autotuner_cache/cache` environment variable. This is enabled at your own risk, and you may need to delete the cache if `NP` changes or the code changes - 2. Disable autotuner: add `--no-enable-autotuner` option - 2. Disable nsys profile: set `PROFILE=0` environment variable + 1. Set autotuner cache: set the `TLLM_AUTOTUNER_CACHE_PATH=autotuner_cache/cache` environment variable. 
Use this at your own risk; you may need to delete the cache if `NP` changes or the code changes + 2. Disable autotuner: add the `--no-enable-autotuner` option + 2. Disable nsys profiling: set the `PROFILE=0` environment variable 2. Capture more information - 1. Enable GPU metrics: set `GPU_METRICS=1` environment variable - 2. Enable backtrace: set `BACKTRACE=1` environment variable + 1. Enable GPU metrics: set the `GPU_METRICS=1` environment variable + 2. Enable backtrace: set the `BACKTRACE=1` environment variable -## Trouble shooting +## Troubleshooting 1. Error `fp8 blockscale gemm only support Hopper` on Blackwell. - The default MoE backend "CUTLASS" does not support FP8 weights. Please choose the same MoE backend as your end-to-end config. A typical choice is adding `--moe-backend DEEPGEMM` (or `TRTLLM`, `WIDEEP`) and `--moe-backend-for-prefill DEEPGEMM` (or `WIDEEP`) option. + The default MoE backend "CUTLASS" does not support FP8 weights. Please choose the same MoE backend as your end-to-end config. A typical solution is to add the `--moe-backend DEEPGEMM` (or `TRTLLM`, `WIDEEP`) and `--moe-backend-for-prefill DEEPGEMM` (or `WIDEEP`) options. 2. Error `huggingface_hub.errors.HfHubHTTPError: 429 Client Error: Too Many Requests for url: https://huggingface.co/nvidia/DeepSeek-R1-0528-FP4-v2/resolve/main/config.json`. diff --git a/examples/layer_wise_benchmarks/mpi_launch.sh b/examples/layer_wise_benchmarks/mpi_launch.sh index 44f6d28dcf..0df7d187d8 100755 --- a/examples/layer_wise_benchmarks/mpi_launch.sh +++ b/examples/layer_wise_benchmarks/mpi_launch.sh @@ -5,10 +5,10 @@ set -euo pipefail # Clear slurm envs unset $(env | awk -F'=' '{print $1}' | (grep -E "SLURM_|SLURMD_|slurm_|MPI_|PMIX_" || true)) -extra_args= +extra_args=() if [ -v TLLM_AUTOTUNER_CACHE_PATH ]; then - extra_args+="-x TLLM_AUTOTUNER_CACHE_PATH" + extra_args+=(-x TLLM_AUTOTUNER_CACHE_PATH) fi set -x -mpirun --allow-run-as-root --np ${NP} $extra_args "$@" +mpirun --allow-run-as-root --np $NP ${extra_args[@]+"${extra_args[@]}"} "$@" diff --git a/examples/layer_wise_benchmarks/parse.py b/examples/layer_wise_benchmarks/parse.py index 086d6fcd19..b52f656b76 100644 --- a/examples/layer_wise_benchmarks/parse.py +++ b/examples/layer_wise_benchmarks/parse.py @@ -5,6 +5,7 @@ import json import re import sqlite3 from pathlib import Path +from typing import NamedTuple import jinja2 import numpy as np @@ -16,6 +17,50 @@ from parser_utils import ( warned_names, ) + +class NvtxRange(NamedTuple): + """Represents an NVTX range with start/end times and text label.""" + + start: int + end: int + text: str + + +class KernelRecord(NamedTuple): + """Represents a kernel record from the database query. + + Used for sorting and grouping kernels by runtime and capture time. + """ + + problem_id: int + run_id: int + range_names: tuple[str, ...] + kernel_start: int + kernel_end: int + demangled_name: int # String ID reference + runtime_start: int + capture_start: int + + +class KernelTiming(NamedTuple): + """Represents a kernel's timing within a run. + + Used after sorting and grouping for per-run analysis. + """ + + demangled_name: int # String ID reference + kernel_start: int + kernel_end: int + range_names: tuple[str, ...] + + +class CategoryTime(NamedTuple): + """Represents a category (hierarchical path) and its associated time.""" + + category: tuple[str, ...] 
+ time_ns: float + + # Parse cmdline parser = argparse.ArgumentParser() parser.add_argument("--file-path", type=str) @@ -71,19 +116,19 @@ query = """SELECT T1.start, T2.value AS text JOIN StringIds AS T2 ON T1.textId = T2.id WHERE eventType = ? AND T2.value LIKE ?""" df = pd.read_sql_query(query, conn, params=(event_id_NvtxPushPopRange, "layer_wise_benchmarks %")) -problem_start = [] -problem_set = [] +problem_start_times: list[int] = [] +problem_set: list[dict] = [] for start, text in df.itertuples(index=False): if text.startswith("layer_wise_benchmarks args {"): run_args = json.loads(text[len("layer_wise_benchmarks args") :]) elif text.startswith("layer_wise_benchmarks problem_spec {"): - problem_start.append(start) + problem_start_times.append(start) problem_set.append( { "spec": json.loads(text[len("layer_wise_benchmarks problem_spec ") :]), "text": "", - "runs": [], - "runs_end": [], + "run_starts": [], + "run_ends": [], "ranges": [], "kernel_count_per_range": [], } @@ -99,7 +144,7 @@ df = pd.read_sql_query( params=(event_id_NvtxPushPopRange, "[DG]%", nccl_domain_id), ) for start, end, text in df.itertuples(index=False): - problem_id = bisect.bisect(problem_start, start) - 1 + problem_id = bisect.bisect(problem_start_times, start) - 1 if text.startswith("layer_wise_benchmarks "): if text != "layer_wise_benchmarks ignore": continue @@ -107,10 +152,10 @@ for start, end, text in df.itertuples(index=False): assert problem_id != -1 if re.match(r"b=\d+ s=\d+ ", text): problem_set[problem_id]["text"] = text - problem_set[problem_id]["runs"].append(start) - problem_set[problem_id]["runs_end"].append(end) + problem_set[problem_id]["run_starts"].append(start) + problem_set[problem_id]["run_ends"].append(end) else: - problem_set[problem_id]["ranges"].append((start, end, text)) + problem_set[problem_id]["ranges"].append(NvtxRange(start, end, text)) problem_set[problem_id]["kernel_count_per_range"].append(0) query = """SELECT name FROM sqlite_master WHERE type = ?""" @@ -127,16 +172,14 @@ if "CUPTI_ACTIVITY_KIND_MEMSET" in tables: SELECT T3.start, T3.end, -3 AS demangledName, T3.correlationId, T3.graphNodeId FROM CUPTI_ACTIVITY_KIND_MEMSET AS T3""" query = f"""SELECT unified.start, unified.end, unified.demangledName, - R.start AS runtime_start, R.end AS runtime_end, - R.start AS capture_start, R.end AS capture_end + R.start AS runtime_start, R.start AS capture_start, R.end AS capture_end FROM ({unified_subquery}) AS unified JOIN CUPTI_ACTIVITY_KIND_RUNTIME AS R ON unified.correlationId = R.correlationId WHERE unified.graphNodeId IS NULL""" if "CUDA_GRAPH_NODE_EVENTS" in tables: query += f""" UNION ALL SELECT unified.start, unified.end, unified.demangledName, - R.start AS runtime_start, R.end AS runtime_end, - CGE2.start AS capture_start, CGE2.end AS capture_end + R.start AS runtime_start, CGE2.start AS capture_start, CGE2.end AS capture_end FROM ({unified_subquery}) AS unified JOIN CUPTI_ACTIVITY_KIND_RUNTIME AS R ON unified.graphNodeId IS NOT NULL AND unified.correlationId = R.correlationId @@ -144,44 +187,41 @@ if "CUDA_GRAPH_NODE_EVENTS" in tables: CGE1.originalGraphNodeId IS NOT NULL LEFT JOIN CUDA_GRAPH_NODE_EVENTS AS CGE2 ON CGE1.originalGraphNodeId = CGE2.graphNodeId""" df = pd.read_sql_query(query, conn) -kernel_list = [] +kernel_records: list[KernelRecord] = [] for ( - start, - end, - demangledName, + kernel_start, + kernel_end, + demangled_name, runtime_start, - runtime_end, capture_start, capture_end, ) in df.itertuples(index=False): - problem_id = bisect.bisect(problem_start, start) - 
1 + problem_id = bisect.bisect(problem_start_times, kernel_start) - 1 problem = problem_set[problem_id] - run_id = bisect.bisect(problem["runs"], runtime_start) - 1 - if run_id == -1 or runtime_start >= problem["runs_end"][run_id]: + run_id = bisect.bisect(problem["run_starts"], runtime_start) - 1 + if run_id == -1 or runtime_start >= problem["run_ends"][run_id]: continue - ranges = [ + matching_range_indices = [ i - for i, (range_start, range_end, text) in enumerate(problem["ranges"]) - if capture_start >= range_start and capture_end <= range_end + for i, nvtx_range in enumerate(problem["ranges"]) + if capture_start >= nvtx_range.start and capture_end <= nvtx_range.end ] - for range_id in ranges: - problem["kernel_count_per_range"][range_id] += 1 - range_names = [problem["ranges"][i][2] for i in ranges] + for range_idx in matching_range_indices: + problem["kernel_count_per_range"][range_idx] += 1 + range_names = tuple(problem["ranges"][i].text for i in matching_range_indices) if ( args.module is None or args.module in range_names ) and "layer_wise_benchmarks ignore" not in range_names: - kernel_list.append( - ( - problem_id, - run_id, - range_names, - start, - end, - demangledName, - runtime_start, - runtime_end, - capture_start, - capture_end, + kernel_records.append( + KernelRecord( + problem_id=problem_id, + run_id=run_id, + range_names=range_names, + kernel_start=kernel_start, + kernel_end=kernel_end, + demangled_name=demangled_name, + runtime_start=runtime_start, + capture_start=capture_start, ) ) @@ -195,12 +235,10 @@ conn.close() # Check ambiguous modules if args.module: for problem in problem_set: - num_matches_per_run = [0] * (len(problem["runs"]) + 1) - for (range_start, _, text), kernel_count in zip( - problem["ranges"], problem["kernel_count_per_range"] - ): - if text == args.module and kernel_count > 0: - num_matches_per_run[bisect.bisect(problem["runs"], range_start)] += 1 + num_matches_per_run = [0] * (len(problem["run_starts"]) + 1) + for nvtx_range, kernel_count in zip(problem["ranges"], problem["kernel_count_per_range"]): + if nvtx_range.text == args.module and kernel_count > 0: + num_matches_per_run[bisect.bisect(problem["run_starts"], nvtx_range.start)] += 1 for run_id_plus_one, num_matches in enumerate(num_matches_per_run): if num_matches > 1: raise ValueError( @@ -208,72 +246,70 @@ if args.module: f' in "{problem["text"]}"\'s {run_id_plus_one}-th run' ) -kernel_list.sort(key=lambda t: (t[6], t[8])) -kernels = [[[] for _ in problem["runs"]] for problem in problem_set] -for ( - problem_id, - run_id, - ranges, - start, - end, - demangledName, - runtime_start, - runtime_end, - capture_start, - capture_end, -) in kernel_list: - kernels[problem_id][run_id].append((demangledName, start, end, ranges)) -for problem_id in range(len(kernels)): - required_seq = [demangledName for demangledName, _, _, _ in kernels[problem_id][0]] - for run_id in range(len(kernels[problem_id])): - seq = [demangledName for demangledName, _, _, _ in kernels[problem_id][run_id]] +kernel_records.sort(key=lambda rec: (rec.runtime_start, rec.capture_start)) +kernels_per_problem: list[list[list[KernelTiming]]] = [ + [[] for _ in problem["run_starts"]] for problem in problem_set +] +for rec in kernel_records: + kernels_per_problem[rec.problem_id][rec.run_id].append( + KernelTiming( + demangled_name=rec.demangled_name, + kernel_start=rec.kernel_start, + kernel_end=rec.kernel_end, + range_names=rec.range_names, + ) + ) +for problem_id, runs in enumerate(kernels_per_problem): + required_seq = 
[kernel.demangled_name for kernel in runs[0]] + for run_id, run in enumerate(runs): + seq = [kernel.demangled_name for kernel in run] assert seq == required_seq -converted_seqs = [] +converted_seqs: list[list[CategoryTime]] = [] warmup_times = run_args["warmup_times"] if args.warmup_times is None else args.warmup_times -for runs in kernels: - converted_seq = [] +for runs in kernels_per_problem: + converted_seq: list[CategoryTime] = [] # Kernel time - for i, (demangledName, _, _, ranges) in enumerate(runs[0]): - name = kernel_short_name(string_ids[demangledName]) - category = (*ranges, name) - time_list = [run[i][2] - run[i][1] for run in runs] - t = np.mean(time_list[warmup_times:]).tolist() - converted_seq.append((category, t)) + for i, kernel in enumerate(runs[0]): + name = kernel_short_name(string_ids[kernel.demangled_name]) + category = (*kernel.range_names, name) + time_list = [run[i].kernel_end - run[i].kernel_start for run in runs] + time_ns = np.mean(time_list[warmup_times:]).tolist() + converted_seq.append(CategoryTime(category, time_ns)) # Space and Overlap overlap_list = [] space_list = [] for run in runs: - sorted_run = sorted(run, key=lambda op: op[1]) - last_end = sorted_run[0][1] + sorted_run = sorted(run, key=lambda k: k.kernel_start) + last_end = sorted_run[0].kernel_start overlap_time = 0 space_time = 0 - for _, start, end, _ in sorted_run: - if start > last_end: - space_time += start - last_end + for kernel in sorted_run: + if kernel.kernel_start > last_end: + space_time += kernel.kernel_start - last_end else: - overlap_time += min(last_end, end) - start - last_end = max(last_end, end) + overlap_time += min(last_end, kernel.kernel_end) - kernel.kernel_start + last_end = max(last_end, kernel.kernel_end) overlap_list.append(-overlap_time) space_list.append(space_time) - converted_seq.append((("Overlap",), np.mean(overlap_list[warmup_times:]).tolist())) - converted_seq.append((("Space",), np.mean(space_list[warmup_times:]).tolist())) - converted_seq.append((("Total",), sum(t for _, t in converted_seq))) + converted_seq.append(CategoryTime(("Overlap",), np.mean(overlap_list[warmup_times:]).tolist())) + converted_seq.append(CategoryTime(("Space",), np.mean(space_list[warmup_times:]).tolist())) + converted_seq.append(CategoryTime(("Total",), sum(ct.time_ns for ct in converted_seq))) converted_seqs.append(converted_seq) if args.error_on_unknown_kernel and warned_names: raise ValueError("Unknown kernel names encountered") -merged_title = [] +merged_title: list[tuple[str, ...]] = [] for converted_seq in converted_seqs: - title = [name for name, _ in converted_seq] + title = [ct.category for ct in converted_seq] merged_title = shortest_common_supersequence(merged_title, title) -merged_data = [[0.0] * len(problem_set) for _ in merged_title] +merged_data: list[list[float]] = [[0.0] * len(problem_set) for _ in merged_title] for problem_id, converted_seq in enumerate(converted_seqs): cur = 0 - for category, t in converted_seq: - cur = merged_title.index(category, cur) - merged_data[cur][problem_id] = t + for ct in converted_seq: + cur = merged_title.index(ct.category, cur) + merged_data[cur][problem_id] = ct.time_ns cur += 1 print("Run args:") @@ -282,14 +318,14 @@ print(run_args) print("Problem set:") for problem in problem_set: print( - f'- "{problem["text"]}" {len(problem["runs"])} runs' - f" Ranges: [{', '.join(text for _, end, text in problem['ranges'] if end <= problem['runs_end'][0])}]" + f'- "{problem["text"]}" {len(problem["run_starts"])} runs' + f" Ranges: [{', '.join(r.text 
for r in problem['ranges'] if r.end <= problem['run_ends'][0])}]" ) -stack = [] -csv_data = [["", *[problem["text"] for problem in problem_set]]] -js_data = [] -js_stack = [js_data] +stack: list[str] = [] +csv_data: list[list[str]] = [["", *[problem["text"] for problem in problem_set]]] +js_data: list[dict] = [] +js_stack: list[list[dict]] = [js_data] max_title_len = max((len(title) - 1) * 3 + len(title[-1][:40]) for title in merged_title) print("-" * (max_title_len + 1 + 6 * len(problem_set))) for title, time_data in zip(merged_title, merged_data): @@ -330,8 +366,7 @@ with csv_file_path.open("w", newline="") as f: csv_writer = csv.writer(f, quoting=csv.QUOTE_MINIMAL) for row in csv_data: csv_writer.writerow(row) -js_header_config = [{"name": problem["text"]} for problem in problem_set] -js_header_config = [] +js_header_config: list[dict] = [] for problem in problem_set: innermost_children = js_header_config for k, msg_prefix in [ @@ -353,35 +388,35 @@ for problem in problem_set: loader = jinja2.FileSystemLoader(Path(__file__).parent) template = jinja2.Environment(loader=loader).get_template("breakdown_template.html") with html_file_path.open("w") as f: - configText = ( + config_text = ( "Run:\n" + json.dumps(run_args, indent=4) + "\n\nParse:\n" + json.dumps(args.__dict__, indent=4) ) - f.write(template.render(headerConfig=js_header_config, rawData=js_data, configText=configText)) + f.write(template.render(headerConfig=js_header_config, rawData=js_data, configText=config_text)) if args.query is not None: print("Query:") - for query in args.query.split(","): - query = query.strip() + for query_str in args.query.split(","): + query_str = query_str.strip() query_matched = [0.0] * len(problem_set) for title, time_data in zip(merged_title, merged_data): - if query in ".".join(title): + if query_str in ".".join(title): for i, x in enumerate(time_data): query_matched[i] += x print( - query + " " * (max_title_len - len(query)), + query_str + " " * (max_title_len - len(query_str)), *[f"{x / 1000:-6.1f}" for x in query_matched], ) -correlation = [] -for problem, runs in zip(problem_set, kernels): - timeline = [] - for i, (demangledName, _, _, _) in enumerate(runs[0]): - name = string_ids[demangledName] - duration_list = [run[i][2] - run[i][1] for run in runs] - end_list = [run[i][2] - run[0][1] for run in runs] +correlation: list[dict] = [] +for problem, runs in zip(problem_set, kernels_per_problem): + timeline: list[dict] = [] + for i, kernel in enumerate(runs[0]): + name = string_ids[kernel.demangled_name] + duration_list = [run[i].kernel_end - run[i].kernel_start for run in runs] + end_list = [run[i].kernel_end - run[0].kernel_start for run in runs] timeline.append( { "name": name, diff --git a/examples/layer_wise_benchmarks/parse_e2e.py b/examples/layer_wise_benchmarks/parse_e2e.py index d55d5aae7d..b4df91052a 100644 --- a/examples/layer_wise_benchmarks/parse_e2e.py +++ b/examples/layer_wise_benchmarks/parse_e2e.py @@ -4,6 +4,7 @@ import json import re import sqlite3 from pathlib import Path +from typing import NamedTuple import numpy as np import pandas as pd @@ -15,7 +16,36 @@ from parser_utils import ( ) -def comma_separated_ints(s): +class IterInfo(NamedTuple): + """Represents an iteration's timing information.""" + + start: int + end: int + iter_id: int + + +class LayerInfo(NamedTuple): + """Represents a layer's timing information within an iteration.""" + + start: int + end: int + layer_idx: int + + +class KernelQueryResult(NamedTuple): + """Represents a kernel query result for e2e 
parsing. + + Sorted by runtime_start for consistent ordering. + """ + + runtime_start: int + graph_node_id: int | None + kernel_start: int + kernel_end: int + demangled_name: int # String ID reference + + +def comma_separated_ints(s: str) -> list[int]: return [int(x) for x in s.split(",")] @@ -41,12 +71,12 @@ if args.graph_trace is not None and not args.graph_trace.endswith(".nsys-rep"): print(args) -def is_gemm(name): +def is_gemm(name: str) -> bool: return "nvjet" in name or "gemm" in name.lower() eager_nsys_rep_file_path = Path(args.eager_trace) -# For CTX phase which does not use CUDA Graphs, analysis the eager trace instead. +# For CTX phase which does not use CUDA Graphs, analyze the eager trace instead. # Here we do not change the identifier name "graph_*" for convenience. graph_nsys_rep_file_path = Path(args.graph_trace or args.eager_trace) eager_sqlite_file_path = eager_nsys_rep_file_path.parent / ( @@ -89,47 +119,47 @@ if target_gen_reqs is None: else: target_gen_reqs = 0 print(f"{target_ctx_reqs=} {target_gen_reqs=}") -eager_iters = [] +eager_iters: list[IterInfo] = [] for start, end, text in df.itertuples(index=False): if m := re.match(r"^\[Executor\] _forward_step (\d+): (\d+) ctx reqs, (\d+) gen reqs", text): - it = int(m.group(1)) + iter_id = int(m.group(1)) ctx_reqs = int(m.group(2)) gen_reqs = int(m.group(3)) if ctx_reqs == target_ctx_reqs and gen_reqs == target_gen_reqs: - eager_iters.append((start, end, it)) + eager_iters.append(IterInfo(start, end, iter_id)) eager_iters = sorted(eager_iters)[args.warmup_times :] -iter_list = [t[2] for t in eager_iters] -print("Iters (eager)", *iter_list) -per_iter_eager_layers = [[] for _ in iter_list] +iter_id_list = [it.iter_id for it in eager_iters] +print("Iters (eager)", *iter_id_list) +per_iter_eager_layers: list[list[LayerInfo]] = [[] for _ in iter_id_list] for start, end, text in df.itertuples(index=False): if m := re.match(r"^layer_wise_benchmarks layer_idx (\d+)$", text): layer_idx = int(m.group(1)) - it_idx = bisect.bisect(eager_iters, (start,)) - 1 - if it_idx < 0 or end > eager_iters[it_idx][1]: + iter_idx = bisect.bisect(eager_iters, (start,)) - 1 + if iter_idx < 0 or end > eager_iters[iter_idx].end: continue - assert end <= eager_iters[it_idx][1], "Not belong to any iter" - per_iter_eager_layers[it_idx].append((start, end, it_idx, layer_idx)) -layer_list = [t[3] for t in per_iter_eager_layers[0]] -print("Layers (eager)", *layer_list) + assert end <= eager_iters[iter_idx].end, "Not belong to any iter" + per_iter_eager_layers[iter_idx].append(LayerInfo(start, end, layer_idx)) +layer_idx_list = [layer.layer_idx for layer in per_iter_eager_layers[0]] +print("Layers (eager)", *layer_idx_list) for eager_layers in per_iter_eager_layers: - assert [t[3] for t in eager_layers] == layer_list, "inconsistent layer idx" + assert [layer.layer_idx for layer in eager_layers] == layer_idx_list, "inconsistent layer idx" df = pd.read_sql_query(query, graph_conn, params=(graph_event_id_NvtxPushPopRange,)) -graph_iters = [] +graph_iters: list[IterInfo] = [] for start, end, text in df.itertuples(index=False): if m := re.match(r"^\[Executor\] _forward_step (\d+): (\d+) ctx reqs, (\d+) gen reqs", text): - it = int(m.group(1)) + iter_id = int(m.group(1)) ctx_reqs = int(m.group(2)) gen_reqs = int(m.group(3)) if ctx_reqs == target_ctx_reqs and gen_reqs == target_gen_reqs: - graph_iters.append((start, end, it)) + graph_iters.append(IterInfo(start, end, iter_id)) graph_iters = sorted(graph_iters)[args.warmup_times :] -graph_iter_list = [t[2] for t in 
graph_iters] -print("Iters (graph)", *graph_iter_list) -if iter_list != graph_iter_list: +graph_iter_id_list = [it.iter_id for it in graph_iters] +print("Iters (graph)", *graph_iter_id_list) +if iter_id_list != graph_iter_id_list: raise ValueError("The ID of iterations do not match") -def query_kernels(conn, iters): +def query_kernels(conn: sqlite3.Connection, iters: list[IterInfo]) -> list[list[KernelQueryResult]]: query = """SELECT name FROM sqlite_master WHERE type = ?""" df = pd.read_sql_query(query, conn, params=("table",)) tables = df["name"].tolist() @@ -148,16 +178,25 @@ def query_kernels(conn, iters): FROM ({unified_subquery}) AS unified JOIN CUPTI_ACTIVITY_KIND_RUNTIME AS R ON unified.correlationId = R.correlationId""" df = pd.read_sql_query(query, conn) - per_iter_kernels = [[] for _ in iters] - for start, end, graphNodeId, demangledName, runtime_start, runtime_end in df.itertuples( - index=False - ): - it_idx = bisect.bisect(iters, (runtime_start,)) - 1 - if it_idx < 0 or runtime_end > iters[it_idx][1]: + per_iter_kernels: list[list[KernelQueryResult]] = [[] for _ in iters] + for ( + kernel_start, + kernel_end, + graph_node_id, + demangled_name, + runtime_start, + runtime_end, + ) in df.itertuples(index=False): + iter_idx = bisect.bisect(iters, (runtime_start,)) - 1 + if iter_idx < 0 or runtime_end > iters[iter_idx].end: continue - per_iter_kernels[it_idx].append((runtime_start, graphNodeId, start, end, demangledName)) + per_iter_kernels[iter_idx].append( + KernelQueryResult( + runtime_start, graph_node_id, kernel_start, kernel_end, demangled_name + ) + ) for kernels in per_iter_kernels: - kernels.sort() + kernels.sort(key=lambda k: (k.runtime_start, k.graph_node_id)) return per_iter_kernels @@ -166,12 +205,14 @@ graph_per_iter_kernels = query_kernels(graph_conn, graph_iters) print("#Kernels (eager)", *[len(kernels) for kernels in eager_per_iter_kernels]) print("#Kernels (graph)", *[len(kernels) for kernels in graph_per_iter_kernels]) for eager_kernels, graph_kernels in zip(eager_per_iter_kernels, graph_per_iter_kernels): - assert all(a[4] == eager_per_iter_kernels[0][i][4] for i, a in enumerate(eager_kernels)), ( - "eager kernels change across iterations" - ) - assert all(a[4] == graph_per_iter_kernels[0][i][4] for i, a in enumerate(graph_kernels)), ( - "graph kernels change across iterations" - ) + assert all( + kernel.demangled_name == eager_per_iter_kernels[0][i].demangled_name + for i, kernel in enumerate(eager_kernels) + ), "eager kernels change across iterations" + assert all( + kernel.demangled_name == graph_per_iter_kernels[0][i].demangled_name + for i, kernel in enumerate(graph_kernels) + ), "graph kernels change across iterations" query = "SELECT * FROM StringIds" df = pd.read_sql_query(query, eager_conn) @@ -184,26 +225,33 @@ graph_string_ids.update({-2: "Memcpy", -3: "Memset"}) eager_conn.close() graph_conn.close() -eager_kernel_names = [eager_string_ids[kernel[4]] for kernel in eager_per_iter_kernels[0]] -graph_kernel_names = [graph_string_ids[kernel[4]] for kernel in graph_per_iter_kernels[0]] +eager_kernel_names = [ + eager_string_ids[kernel.demangled_name] for kernel in eager_per_iter_kernels[0] +] +graph_kernel_names = [ + graph_string_ids[kernel.demangled_name] for kernel in graph_per_iter_kernels[0] +] super_kernel_names = shortest_common_supersequence(eager_kernel_names, graph_kernel_names) print(f"#Kernels (supersequence) {len(super_kernel_names)}") -eager_per_layer_kernels = [[] for _ in layer_list] +eager_per_layer_kernels: list[list[int]] = [[] for _ in 
layer_idx_list] for i, eager_kernel in enumerate(eager_per_iter_kernels[0]): - eager_layers_idx = bisect.bisect(per_iter_eager_layers[0], (eager_kernel[0],)) - 1 - if eager_layers_idx < 0 or eager_kernel[0] > per_iter_eager_layers[0][eager_layers_idx][1]: + eager_layer_idx = bisect.bisect(per_iter_eager_layers[0], (eager_kernel.runtime_start,)) - 1 + if ( + eager_layer_idx < 0 + or eager_kernel.runtime_start > per_iter_eager_layers[0][eager_layer_idx].end + ): continue - eager_per_layer_kernels[eager_layers_idx].append(i) -eager2super = [] + eager_per_layer_kernels[eager_layer_idx].append(i) +eager2super: list[int] = [] j = 0 -for i, eager_kernel_name in enumerate(eager_kernel_names): +for eager_kernel_name in eager_kernel_names: while eager_kernel_name != super_kernel_names[j]: j += 1 eager2super.append(j) j += 1 -super_per_layer_starts = [eager2super[a[0]] for a in eager_per_layer_kernels] -super_per_layer_ends = [eager2super[a[-1]] for a in eager_per_layer_kernels] -graph_per_layer_kernels = [[] for _ in layer_list] +super_per_layer_starts = [eager2super[indices[0]] for indices in eager_per_layer_kernels] +super_per_layer_ends = [eager2super[indices[-1]] for indices in eager_per_layer_kernels] +graph_per_layer_kernels: list[list[int]] = [[] for _ in layer_idx_list] j = 0 for i, graph_kernel_name in enumerate(graph_kernel_names): while graph_kernel_name != super_kernel_names[j]: @@ -212,16 +260,16 @@ for i, graph_kernel_name in enumerate(graph_kernel_names): if layer_idx >= 0 and j <= super_per_layer_ends[layer_idx]: graph_per_layer_kernels[layer_idx].append(i) j += 1 -timeline = [] +timeline: list[dict] = [] first_kernel_idx = min(graph_per_layer_kernels[layer_idx][0] for layer_idx in args.layer_indices) for layer_idx in args.layer_indices: for kernel_idx in graph_per_layer_kernels[layer_idx]: - duration_list = [] - end_list = [] - for it_idx in range(len(graph_per_iter_kernels)): - layer_start_time = graph_per_iter_kernels[it_idx][first_kernel_idx][2] - kernel_start_time = graph_per_iter_kernels[it_idx][kernel_idx][2] - kernel_end_time = graph_per_iter_kernels[it_idx][kernel_idx][3] + duration_list: list[int] = [] + end_list: list[int] = [] + for iter_idx in range(len(graph_per_iter_kernels)): + layer_start_time = graph_per_iter_kernels[iter_idx][first_kernel_idx].kernel_start + kernel_start_time = graph_per_iter_kernels[iter_idx][kernel_idx].kernel_start + kernel_end_time = graph_per_iter_kernels[iter_idx][kernel_idx].kernel_end duration_list.append(kernel_end_time - kernel_start_time) end_list.append(kernel_end_time - layer_start_time) timeline.append( @@ -233,9 +281,11 @@ for layer_idx in args.layer_indices: ) print(f"{'Kernel':40s} {'Duration':>8s} {'End':>8s}") print("-" * (40 + 1 + 8 + 1 + 8)) -for o in timeline: +for entry in timeline: print( - f"{kernel_short_name(o['name'])[:40]:40s} {o['duration'] / 1000.0:-8.1f} {o['end'] / 1000.0:-8.1f}" + f"{kernel_short_name(entry['name'])[:40]:40s} " + f"{entry['duration'] / 1000.0:-8.1f} " + f"{entry['end'] / 1000.0:-8.1f}" ) if args.error_on_unknown_kernel and warned_names: raise ValueError("Unknown kernel names encountered") diff --git a/examples/layer_wise_benchmarks/run.sh b/examples/layer_wise_benchmarks/run.sh index 4f20394d84..e94283d23a 100755 --- a/examples/layer_wise_benchmarks/run.sh +++ b/examples/layer_wise_benchmarks/run.sh @@ -14,35 +14,39 @@ if [ "$RANK" -eq 0 ]; then fi PROFILE_DIR=${PROFILE_DIR:-profiles} -mkdir -p ${PROFILE_DIR} +mkdir -p -- "$PROFILE_DIR" PROFILE=${PROFILE:-1} BACKTRACE=${BACKTRACE:-0} 
GPU_METRICS=${GPU_METRICS:-0} if [ "$PROFILE" -eq 1 ]; then - PROFILE_CMD="nsys profile + PROFILE_CMD=( + nsys profile -t cuda,nvtx --cpuctxsw none --cuda-event-trace false --cuda-graph-trace node -c cudaProfilerApi --capture-range-end stop - -o ${PROFILE_DIR}/report_np${WORLD_SIZE}_rank${RANK}.nsys-rep - --force-overwrite true" + -o "${PROFILE_DIR}/report_np${WORLD_SIZE}_rank${RANK}.nsys-rep" + --force-overwrite true + ) if [ "$BACKTRACE" -eq 1 ]; then - PROFILE_CMD+=" --python-backtrace=cuda --cudabacktrace all" + PROFILE_CMD+=(--python-backtrace=cuda --cudabacktrace all) else - PROFILE_CMD+=" -s none" + PROFILE_CMD+=(-s none) fi if [ "$GPU_METRICS" -eq 1 ]; then - PROFILE_CMD+=" --gpu-metrics-devices $LOCAL_RANK - --gpu-metrics-frequency 10000" + PROFILE_CMD+=( + --gpu-metrics-devices $LOCAL_RANK + --gpu-metrics-frequency 10000 + ) fi else - PROFILE_CMD= + PROFILE_CMD=() fi -SCRIPT_PATH=$(realpath --relative-to="$(pwd)" "$(dirname -- "$0")"/run.py) +SCRIPT_PATH=$(realpath --relative-to="$(pwd)" -- "$(dirname -- "$0")"/run.py) set -x -$PROFILE_CMD bash -o pipefail -c \ - "python3 -u \"\$1\" \"\${@:3}\" 2>&1 | tee \"\$2/report_np${WORLD_SIZE}_rank${RANK}.log\"" \ +${PROFILE_CMD[@]+"${PROFILE_CMD[@]}"} bash -o pipefail -c \ + 'python3 -u "$1" "${@:3}" 2>&1 | tee "$2/report_np'"${WORLD_SIZE}"'_rank'"${RANK}"'.log"' \ bash "$SCRIPT_PATH" "$PROFILE_DIR" "$@" diff --git a/examples/layer_wise_benchmarks/sample_performance_alignment.sh b/examples/layer_wise_benchmarks/sample_performance_alignment.sh index b39267c60a..a0ce1c1438 100755 --- a/examples/layer_wise_benchmarks/sample_performance_alignment.sh +++ b/examples/layer_wise_benchmarks/sample_performance_alignment.sh @@ -12,14 +12,14 @@ export PROFILE_DIR="${PROFILE_DIR:-profiles}" export TLLM_AUTOTUNER_CACHE_PATH="$PROFILE_DIR/sample_performance_alignment_cache.json" mkdir -p -- "$PROFILE_DIR" -mkdir -p -- "$(dirname "$TLLM_AUTOTUNER_CACHE_PATH")" +mkdir -p -- "$(dirname -- "$TLLM_AUTOTUNER_CACHE_PATH")" python3 ../../benchmarks/cpp/prepare_dataset.py \ --tokenizer "$MODEL" \ --stdout \ --random-seed 42 \ token-norm-dist \ - --num-requests $((BATCH_SIZE*NP)) \ + --num-requests $((BATCH_SIZE * NP)) \ --input-mean 2048 \ --input-stdev 0 \ --output-mean 256 \ @@ -61,8 +61,8 @@ trtllm-bench \ --max_batch_size $BATCH_SIZE \ --max_num_tokens 3072 \ --disable_chunked_context \ - --num_requests $((BATCH_SIZE*NP)) \ - --concurrency $((BATCH_SIZE*NP)) \ + --num_requests $((BATCH_SIZE * NP)) \ + --concurrency $((BATCH_SIZE * NP)) \ --config /tmp/config_collect.yaml # Step 2 @@ -98,8 +98,8 @@ trtllm-bench \ --max_batch_size $BATCH_SIZE \ --max_num_tokens 3072 \ --disable_chunked_context \ - --num_requests $((BATCH_SIZE*NP)) \ - --concurrency $((BATCH_SIZE*NP)) \ + --num_requests $((BATCH_SIZE * NP)) \ + --concurrency $((BATCH_SIZE * NP)) \ --config /tmp/config_mark.yaml # Step 3 diff --git a/examples/layer_wise_benchmarks/slurm_init_containers.sh b/examples/layer_wise_benchmarks/slurm_init_containers.sh index 83215561c7..0269cbe420 100755 --- a/examples/layer_wise_benchmarks/slurm_init_containers.sh +++ b/examples/layer_wise_benchmarks/slurm_init_containers.sh @@ -4,8 +4,8 @@ set -euo pipefail # CONTAINER_IMAGE= CONTAINER_NAME=${CONTAINER_NAME:-layer_wise_benchmarks} -TRTLLM_ROOT=$(realpath "$(dirname -- "$0")"/../..) -CONTAINER_MOUNTS=$TRTLLM_ROOT:$TRTLLM_ROOT +TRTLLM_ROOT=$(realpath -- "$(dirname -- "$0")"/../..) 
+CONTAINER_MOUNTS="$TRTLLM_ROOT:$TRTLLM_ROOT" if [ -z "${SLURM_JOB_ID:-}" ]; then echo "Please set SLURM_JOB_ID" @@ -18,9 +18,9 @@ if [ -z "${CONTAINER_IMAGE:-}" ]; then # Read Docker image from current_image_tags.properties MACHINE="$(srun -N 1 uname -m)" if [ "$MACHINE" == "x86_64" ]; then - DOCKER_IMAGE=$(source "$TRTLLM_ROOT/jenkins/current_image_tags.properties" && echo $LLM_DOCKER_IMAGE) + DOCKER_IMAGE=$(source "$TRTLLM_ROOT/jenkins/current_image_tags.properties" && echo "$LLM_DOCKER_IMAGE") elif [ "$MACHINE" == "aarch64" ]; then - DOCKER_IMAGE=$(source "$TRTLLM_ROOT/jenkins/current_image_tags.properties" && echo $LLM_SBSA_DOCKER_IMAGE) + DOCKER_IMAGE=$(source "$TRTLLM_ROOT/jenkins/current_image_tags.properties" && echo "$LLM_SBSA_DOCKER_IMAGE") else echo "Unsupported machine hardware name \"$MACHINE\"" exit 1 @@ -31,7 +31,7 @@ if [ -z "${CONTAINER_IMAGE:-}" ]; then echo "CONTAINER_IMAGE was not set, using Docker image $DOCKER_IMAGE" # Import to .sqsh file - SQSH_FILE_NAME=$(echo "$DOCKER_IMAGE" | + SQSH_FILE_NAME=$(printf '%s\n' "$DOCKER_IMAGE" | awk -F'#' '{print $2}' | awk -F':' '{gsub(/\//,"+",$1); print $1"+"$2".sqsh"}') CONTAINER_IMAGE="$TRTLLM_ROOT/enroot/$SQSH_FILE_NAME" @@ -41,7 +41,7 @@ if [ -z "${CONTAINER_IMAGE:-}" ]; then fi fi -WORKDIR=$(realpath "$(pwd)") +WORKDIR=$(realpath -- "$(pwd)") set -x srun -N "$NODES" \ @@ -50,7 +50,7 @@ srun -N "$NODES" \ --container-name "$CONTAINER_NAME" \ --container-mounts "$CONTAINER_MOUNTS" \ --container-workdir "$WORKDIR" \ -bash -c "cd \"\$1\" && +bash -c 'cd "$1" && pip install -U packaging && pip install -r requirements.txt --no-build-isolation && - pip install -e ." bash "$TRTLLM_ROOT" + pip install -e .' bash "$TRTLLM_ROOT" diff --git a/examples/layer_wise_benchmarks/slurm_launch.sh b/examples/layer_wise_benchmarks/slurm_launch.sh index 64857b2877..c091c0f571 100755 --- a/examples/layer_wise_benchmarks/slurm_launch.sh +++ b/examples/layer_wise_benchmarks/slurm_launch.sh @@ -3,20 +3,20 @@ set -euo pipefail CONTAINER_NAME=${CONTAINER_NAME:-layer_wise_benchmarks} -TRTLLM_ROOT=$(realpath "$(dirname -- "$0")"/../..) -CONTAINER_MOUNTS=$TRTLLM_ROOT:$TRTLLM_ROOT +TRTLLM_ROOT=$(realpath -- "$(dirname -- "$0")"/../..) 
+CONTAINER_MOUNTS="$TRTLLM_ROOT:$TRTLLM_ROOT" if [ -z "${SLURM_JOB_ID:-}" ]; then echo "Please set SLURM_JOB_ID" exit 1 fi -WORKDIR=$(realpath "$(pwd)") +WORKDIR=$(realpath -- "$(pwd)") set -x srun --mpi=pmix \ -N "$NODES" \ - --ntasks-per-node $(($NP / $NODES)) \ + --ntasks-per-node $((NP / NODES)) \ --container-name "$CONTAINER_NAME" \ --container-mounts "$CONTAINER_MOUNTS" \ --container-workdir "$WORKDIR" \ diff --git a/examples/layer_wise_benchmarks/slurm_query_container_name.sh b/examples/layer_wise_benchmarks/slurm_query_container_name.sh index 6d3192da0b..73778dd7d0 100755 --- a/examples/layer_wise_benchmarks/slurm_query_container_name.sh +++ b/examples/layer_wise_benchmarks/slurm_query_container_name.sh @@ -8,22 +8,23 @@ if [ -z "${SLURM_JOB_ID:-}" ]; then fi prefix="pyxis_${SLURM_JOB_ID}_" -matches=$(printf "%s\n" "$(srun -N 1 enroot list)" | grep "^${prefix}" || true) -count=$(printf "%s\n" "$matches" | wc -l) +matches=$(printf '%s\n' "$(srun -N 1 enroot list)" | grep "^${prefix}" || true) -if [ "$count" -eq 0 ]; then +if [ -z "$matches" ]; then echo "Error: No container found" >&2 exit 1 +else + count=$(printf '%s\n' "$matches" | wc -l) fi if [ "$count" -gt 1 ]; then echo "Error: Multiple containers found" >&2 while IFS= read -r match; do - echo "- ${match#$prefix}" >&2 + echo "- ${match#"$prefix"}" >&2 done <<< "$matches" exit 1 fi -suffix=${matches#$prefix} +suffix=${matches#"$prefix"} echo "Container name: $suffix" >&2 -echo "$suffix" +printf '%s\n' "$suffix"