TensorRT-LLMs/tests/scripts/allreduce_perf/allreduce_heuristic_code_gen.py
Ludwig Schneider 41ce14ab04
[None][feat] Enable NCCL_SYMMETRIC as default fallback for AllReduce (#9314)
Signed-off-by: Ludwig Schneider <lschneider@nvidia.com>
2025-12-07 09:43:26 -08:00

257 lines
8.8 KiB
Python

import os
import subprocess
import tempfile
from dataclasses import dataclass
from pathlib import Path
import numpy as np
import pandas as pd
from tensorrt_llm._utils import get_sm_version
@dataclass
class Constants:
    """Search-space bounds and name/enum mappings used to build the lookup table.

    Apart from ``oneshot_num_tokens_threshold`` (the only annotated attribute
    and therefore the only dataclass field), every name below is a plain
    class-level constant that the rest of this script reads as
    ``Constants.<name>``.
    """

    # Exponents bounding the problem sizes that are considered.
    num_tokens_bits = 15
    hidden_size_bits = 14

    # Inclusive upper bounds applied when filtering benchmark rows.
    max_num_tokens_considered = 1 << num_tokens_bits  # 32768
    max_hidden_size_considered = 1 << hidden_size_bits  # 16384

    # Inclusive lower bounds applied when filtering benchmark rows.
    oneshot_num_tokens_threshold: int = 1
    oneshot_hidden_size_threshold = 128

    # Axis values of the lookup table: powers of two 1..16384 for tokens and
    # 128..8192 for the hidden size.
    num_tokens_list = [1 << exponent for exponent in range(num_tokens_bits)]
    hidden_size_list = [
        1 << exponent for exponent in range(7, hidden_size_bits)
    ]

    # Fusion variants, in the order used for the table's fusion axis.
    fusion_op_list = [
        'NONE',
        'RESIDUAL_RMS_NORM',
        'RESIDUAL_RMS_NORM_QUANT_FP8',
        'RESIDUAL_RMS_NORM_QUANT_NVFP4',
    ]

    # Tensor-parallel sizes, in the order used for the table's TP axis.
    tp_size_list = [2, 4, 8]

    # Strategy name (as it appears in the benchmark data) -> integer written
    # into the generated table.
    # NOTE(review): presumably these mirror the C++ strategy enum - confirm.
    strategy_name_to_enum = {
        'NCCL': 0,
        'NCCL_SYMMETRIC': 8,
        'ONESHOT': 4,
        'TWOSHOT': 5,
    }
def find_best_strategy(df: pd.DataFrame):
    """Return the fastest strategy for every benchmarked configuration.

    Rows are grouped by ``(world_size, fusion, hidden_size, num_tokens)``;
    within each group the ``strategy`` of the row with the smallest
    ``time (us)`` is selected.

    Args:
        df: Benchmark results containing at least the columns ``world_size``,
            ``fusion``, ``hidden_size``, ``num_tokens``, ``strategy`` and
            ``time (us)``.

    Returns:
        The result of the per-group ``apply``: for non-empty input, a Series
        indexed by the four grouping keys whose values are strategy names.
    """
    group_keys = ['world_size', 'fusion', 'hidden_size', 'num_tokens']

    def _fastest_strategy(group: pd.DataFrame):
        # idxmin() gives the index label of the quickest row in this group.
        return group.loc[group['time (us)'].idxmin(), 'strategy']

    # Select the two columns the reducer needs so apply() never operates on
    # the grouping columns; doing so is deprecated since pandas 2.2 and only
    # produced a warning here, since the reducer never read those columns.
    return df.groupby(group_keys)[['time (us)',
                                   'strategy']].apply(_fastest_strategy)
def filter_df(df: pd.DataFrame):
    """Keep only the rows whose problem size lies inside the considered range.

    A row survives when ``num_tokens`` is within
    ``[oneshot_num_tokens_threshold, max_num_tokens_considered]`` and
    ``hidden_size`` is within
    ``[oneshot_hidden_size_threshold, max_hidden_size_considered]``; both
    bounds are inclusive.

    Args:
        df: Benchmark results with ``num_tokens`` and ``hidden_size`` columns.

    Returns:
        The selection of ``df`` restricted to the rows inside both ranges.
    """
    # Series.between is inclusive on both ends by default.
    tokens_in_range = df['num_tokens'].between(
        Constants.oneshot_num_tokens_threshold,
        Constants.max_num_tokens_considered)
    hidden_in_range = df['hidden_size'].between(
        Constants.oneshot_hidden_size_threshold,
        Constants.max_hidden_size_considered)
    return df[tokens_in_range & hidden_in_range]
def generate_heuristic_look_up_table(df: pd.DataFrame) -> np.ndarray:
    """Build the best-strategy lookup table from AllReduce benchmark data.

    Rows with strategy ``AUTO`` are dropped, the remaining rows are restricted
    to the considered size range, and the fastest strategy of every
    ``(world_size, fusion, hidden_size, num_tokens)`` combination is written
    into the table. Combinations that are not listed in ``Constants`` or whose
    winning strategy has no enum value are skipped and keep the default.

    Args:
        df: Benchmark results. The columns read here and by the helpers are
            ``world_size``, ``fusion``, ``hidden_size``, ``num_tokens``,
            ``strategy`` and ``time (us)``. ``None`` or an empty frame is
            tolerated.

    Returns:
        Integer array of shape ``(len(tp_size_list), len(fusion_op_list),
        len(hidden_size_list), len(num_tokens_list))`` holding strategy enum
        values. Every cell without usable data holds the ``NCCL_SYMMETRIC``
        value, so with no data at all the table is entirely the default.
    """
    # Start with the NCCL_SYMMETRIC fallback everywhere; measured winners
    # overwrite individual cells below.
    strategy_table = np.full(
        (len(Constants.tp_size_list), len(Constants.fusion_op_list),
         len(Constants.hidden_size_list), len(Constants.num_tokens_list)),
        Constants.strategy_name_to_enum['NCCL_SYMMETRIC'],
        dtype=int)

    if df is None or df.empty:
        print("DataFrame is empty or None")
        # Return a usable all-default table rather than a value of another
        # type that the code generator cannot consume.
        return strategy_table

    print(f"Input DataFrame shape: {df.shape}")
    print(f"Available strategies: {df['strategy'].unique()}")
    print(f"Available fusions: {df['fusion'].unique()}")
    print(f"Available tp_sizes: {sorted(df['world_size'].unique())}")

    # Filter out AUTO strategy as it's not a concrete implementation
    df_filtered = df[df['strategy'] != 'AUTO'].copy()
    print(f"After filtering AUTO strategy: {df_filtered.shape}")

    # Apply range filters
    df_filtered = filter_df(df_filtered)
    if df_filtered.empty:
        # Nothing left to rank; grouping an empty frame would not yield the
        # (keys, strategy) pairs the loop below expects.
        print("Filled 0 entries in the lookup table")
        return strategy_table

    # Find best strategy for each combination
    best_strategies = find_best_strategy(df_filtered)

    # Value -> axis position maps, built once instead of calling list.index()
    # four times per entry.
    tp_pos = {value: pos for pos, value in enumerate(Constants.tp_size_list)}
    fusion_pos = {
        value: pos
        for pos, value in enumerate(Constants.fusion_op_list)
    }
    hidden_pos = {
        value: pos
        for pos, value in enumerate(Constants.hidden_size_list)
    }
    tokens_pos = {
        value: pos
        for pos, value in enumerate(Constants.num_tokens_list)
    }

    filled_entries = 0
    for (tp_size, fusion, hidden_size,
         num_tokens), best_strategy in best_strategies.items():
        tp_idx = tp_pos.get(tp_size)
        fusion_idx = fusion_pos.get(fusion)
        hidden_size_idx = hidden_pos.get(hidden_size)
        num_tokens_idx = tokens_pos.get(num_tokens)
        strategy_value = Constants.strategy_name_to_enum.get(best_strategy)
        if (tp_idx is None or fusion_idx is None or hidden_size_idx is None
                or num_tokens_idx is None or strategy_value is None):
            # Skip entries that don't match our defined lists
            continue
        strategy_table[tp_idx, fusion_idx, hidden_size_idx,
                       num_tokens_idx] = strategy_value
        filled_entries += 1

    print(f"Filled {filled_entries} entries in the lookup table")
    return strategy_table
def generate_cpp_strategy_lut_code(
    strategy_table: np.ndarray,
    sm_version: int,
) -> str:
    """Render the 4-D strategy table as a C++ nested brace initialiser.

    Args:
        strategy_table: Array indexed as ``[tp][fusion][hidden][tokens]``.
        sm_version: Appended to the generated variable name
            (``AllReduceBestStrategyTableSM<sm_version>``).

    Returns:
        C++ source text: comment lines listing the axis values from
        ``Constants``, followed by the table definition with one line per
        hidden size holding all token values.
    """
    n_tp, n_fusion, n_hidden, n_tokens = strategy_table.shape

    # Collect fragments and join once at the end.
    chunks = [
        "// AllReduce lookup: [tp][fusion][hidden][tokens] = strategy\n",
        f"// TP:{Constants.tp_size_list}\n",
        f"// Fusion:{Constants.fusion_op_list}\n",
        f"// Hidden:{Constants.hidden_size_list}\n",
        f"// Tokens:{Constants.num_tokens_list}\n",
        f"inline AllReduceBestStrategyTableType AllReduceBestStrategyTableSM{sm_version} = {{\n",
    ]

    for tp_pos in range(n_tp):
        chunks.append(" {\n")
        chunks.append(f" // TP={Constants.tp_size_list[tp_pos]}\n")
        for fusion_pos in range(n_fusion):
            chunks.append(
                f" {{ // Fusion={Constants.fusion_op_list[fusion_pos]}\n")
            for hidden_pos in range(n_hidden):
                # All token values of one hidden size share a single line.
                row = ",".join(
                    str(strategy_table[tp_pos, fusion_pos, hidden_pos,
                                       token_pos])
                    for token_pos in range(n_tokens))
                # Every element but the last of its level gets a comma.
                row_sep = "," if hidden_pos < n_hidden - 1 else ""
                chunks.append(" {" + row + "}" + row_sep + "\n")
            fusion_sep = "," if fusion_pos < n_fusion - 1 else ""
            chunks.append(" }" + fusion_sep + "\n")
        tp_sep = "," if tp_pos < n_tp - 1 else ""
        chunks.append(" }" + tp_sep + "\n")

    chunks.append("};\n")
    return "".join(chunks)
def main():
    """Benchmark (if needed), build the lookup table and write it as C++.

    Command-line options:
        --data_dir: Directory with existing ``benchmark.tp<N>.sm<SM>.csv``
            files. When omitted, the benchmark is launched through ``mpirun``
            and its CSV output is stored in ``--save_csv_dir`` or, failing
            that, in a temporary directory.
        --sm_version: SM version used in file and variable names; defaults to
            ``get_sm_version()``.
        --save_csv_dir: Where to store freshly generated benchmark CSVs.
        --enable_auto: Forwarded to the benchmark script.

    The generated code is written to
    ``<data_dir>/gen_heuristic_code/generated_lookup_table.cpp`` and printed.

    Raises:
        AssertionError: If the loaded benchmark data is empty.
    """
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--data_dir", type=str, default=None)
    parser.add_argument("--sm_version", type=int, default=None)
    parser.add_argument("--save_csv_dir", type=str, default=None)
    parser.add_argument("--enable_auto", action="store_true", default=False)
    args = parser.parse_args()

    # NOTE(review): only TP=2 is benchmarked and loaded here although
    # Constants.tp_size_list also lists 4 and 8 - confirm this is intended.
    tp_size_list = [2]

    data_dir = args.data_dir
    sm_version = args.sm_version
    if sm_version is None:
        sm_version = get_sm_version()
    print(f"Using SM version: {sm_version}")

    if data_dir is None:
        if args.save_csv_dir is not None:
            data_dir = args.save_csv_dir
            os.makedirs(data_dir, exist_ok=True)
        else:
            # Keep the object referenced so the directory outlives the run.
            # NOTE(review): files written below are removed together with
            # this directory; pass --save_csv_dir to keep them.
            tmpdir = tempfile.TemporaryDirectory()
            data_dir = tmpdir.name

        # use mpi to run all_reduce.py to benchmark the performance if
        # data_dir is not provided
        script_path = os.path.join(
            os.path.dirname(os.path.abspath(__file__)),
            "../../microbenchmarks/all_reduce.py")
        for tp_size in tp_size_list:
            save_csv = f"{data_dir}/benchmark.tp{tp_size}.sm{sm_version}.csv"
            print("enable_auto", args.enable_auto)
            cmd = [
                "mpirun",
                "-n",
                str(tp_size),
                "python",
                script_path,
                "--explore_2d",
                "--save_csv",
                save_csv,
            ]
            if args.enable_auto:
                cmd.append("--enable_auto")
            completed = subprocess.run(
                cmd,
                env=os.environ,
            )
            if completed.returncode != 0:
                # Report the failure; the missing-file check below still
                # decides whether the run can continue.
                print(f"Benchmark command exited with code "
                      f"{completed.returncode}: {' '.join(cmd)}")

    # Load every per-TP CSV, then combine them in a single concat call.
    frames = []
    for tp_size in tp_size_list:
        data_file = f"{data_dir}/benchmark.tp{tp_size}.sm{sm_version}.csv"
        if not Path(data_file).exists():
            print(f"File {data_file} does not exist")
            return
        frames.append(pd.read_csv(Path(data_file)))

    # ignore_index gives the combined frame unique row labels, which the
    # label-based best-row lookup relies on.
    df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
    if df.empty:
        # Raised explicitly so the check also runs under ``python -O``.
        raise AssertionError("Benchmark data is empty")

    output_dir = f"{data_dir}/gen_heuristic_code"
    os.makedirs(output_dir, exist_ok=True)

    # Generate the C++ lookup table code
    strategy_table = generate_heuristic_look_up_table(df)
    cpp_code = generate_cpp_strategy_lut_code(strategy_table, sm_version)

    # Write the generated code to a file
    output_file = f"{output_dir}/generated_lookup_table.cpp"
    with open(output_file, 'w') as f:
        f.write(cpp_code)

    print(f"\nGenerated C++ lookup table, written to: {output_file}")
    # The complete code is printed, so the label no longer claims a
    # 20-line excerpt.
    print("\nGenerated code:")
    print(cpp_code)


# Run the generator only when this file is executed as a script.
if __name__ == "__main__":
    main()