# TensorRT-LLMs/tests/integration/defs/perf/disagg/utils/config_loader.py
"""YAML Configuration Loader with Default Metrics Support."""
import copy
import os
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, List, Optional
import yaml
from reporting.accuracy_validator import DatasetThreshold
from utils.common import EnvManager
from utils.logger import logger
@dataclass
class MetricsConfig:
"""Metrics configuration."""
log_file: str # Log file name
extractor_pattern: str # Regular expression
metric_names: List[str] # Metric names list
def merge(self, override: Optional[Dict]) -> "MetricsConfig":
"""Merge with override dict.
Args:
override: Dict with optional keys: log_file, extractor_pattern, metric_names
Returns:
New MetricsConfig with overridden values
"""
if not override:
return self
return MetricsConfig(
log_file=override.get("log_file", self.log_file),
extractor_pattern=override.get("extractor_pattern", self.extractor_pattern),
metric_names=override.get("metric_names", self.metric_names),
)
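# Illustrative merge behaviour (a minimal sketch; the file name and pattern below
# are made up for the example, not real defaults):
#
#   default = MetricsConfig(
#       log_file="bench.log",
#       extractor_pattern=r"TTFT:\s+([0-9.]+)",
#       metric_names=["SERVER_MEDIAN_TTFT"],
#   )
#   merged = default.merge({"log_file": "custom_bench.log"})
#   # merged.log_file == "custom_bench.log"; extractor_pattern and metric_names
#   # are kept from the default because they were not overridden.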
@dataclass
class AccuracyConfig:
"""Accuracy test configuration (supports multiple datasets)."""
datasets: List[DatasetThreshold] # List of dataset threshold configurations
def get_dataset_config(self, dataset_name: str) -> Optional[DatasetThreshold]:
"""Get configuration by dataset name.
Args:
dataset_name: Name of the dataset to look up
Returns:
DatasetThreshold config if found, None otherwise
"""
for ds in self.datasets:
if ds.dataset_name.lower() == dataset_name.lower():
return ds
return None
def get_all_dataset_names(self) -> List[str]:
"""Get all configured dataset names.
Returns:
List of dataset names
"""
return [ds.dataset_name for ds in self.datasets]
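# Example lookup (illustrative; assumes a DatasetThreshold built as in
# ConfigLoader._load_config_file below, with dataset_name="gsm8k"):
#
#   acc = AccuracyConfig(datasets=[gsm8k_threshold])
#   acc.get_dataset_config("GSM8K")   # -> gsm8k_threshold (case-insensitive match)
#   acc.get_dataset_config("mmlu")    # -> None (not configured)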
# ============================================================================
# Default Metrics configuration
# ============================================================================
# Accuracy test uses accuracy_eval.log (markdown table output from lm_eval)
# Note: Only log_file is used by AccuracyParser (accuracy_parser.py)
# The regex pattern is hardcoded in AccuracyParser._extract_accuracy_values()
_COMMON_ACCURACY_METRICS = MetricsConfig(
log_file="7_accuracy_eval.log",
extractor_pattern=r"\|([a-zA-Z0-9_-]+)\|.*?\|([\w-]+)\|.*?\|exact_match\|.*?\|([0-9.]+)\|",
metric_names=["flexible-extract", "strict-match"],
)
DEFAULT_METRICS_CONFIG = {
# Performance test default configuration
("disagg", "perf"): MetricsConfig(
log_file="6_bench.log",
extractor_pattern=r"""
^.*?Median\ TTFT\ \(ms\):\s+([0-9.]+).*?$\n
(?:.*\n)*?
^.*?Median\ E2EL\ \(ms\):\s+([0-9.]+).*?$\n
(?:.*\n)*?
^.*?Benchmark\ with\ concurrency\ (\d+)\ done
""",
metric_names=["SERVER_MEDIAN_TTFT", "SERVER_MEDIAN_E2EL"],
),
("wideep", "perf"): MetricsConfig(
log_file="6_bench.log",
extractor_pattern=r"""
^.*?Mean\ TTFT\ \(ms\):\s+([0-9.]+).*?$\n
(?:.*\n)*?
^.*?Median\ TTFT\ \(ms\):\s+([0-9.]+).*?$\n
(?:.*\n)*?
^.*?P99\ TTFT\ \(ms\):\s+([0-9.]+).*?$\n
(?:.*\n)*?
^.*?Mean\ TPOT\ \(ms\):\s+([0-9.]+).*?$\n
(?:.*\n)*?
^.*?Median\ TPOT\ \(ms\):\s+([0-9.]+).*?$\n
(?:.*\n)*?
^.*?P99\ TPOT\ \(ms\):\s+([0-9.]+).*?$\n
(?:.*\n)*?
^.*?Mean\ ITL\ \(ms\):\s+([0-9.]+).*?$\n
(?:.*\n)*?
^.*?Median\ ITL\ \(ms\):\s+([0-9.]+).*?$\n
(?:.*\n)*?
^.*?P99\ ITL\ \(ms\):\s+([0-9.]+).*?$\n
(?:.*\n)*?
^.*?Mean\ E2EL\ \(ms\):\s+([0-9.]+).*?$\n
(?:.*\n)*?
^.*?Median\ E2EL\ \(ms\):\s+([0-9.]+).*?$\n
(?:.*\n)*?
^.*?P99\ E2EL\ \(ms\):\s+([0-9.]+).*?$\n
(?:.*\n)*?
^.*?Benchmark\ with\ concurrency\ (\d+)\ done
""",
metric_names=[
"SERVER_MEAN_TTFT",
"SERVER_MEDIAN_TTFT", # Median TTFT (keep the same name as disagg)
"SERVER_P99_TTFT",
"SERVER_MEAN_TPOT",
"SERVER_MEDIAN_TPOT",
"SERVER_P99_TPOT",
"SERVER_MEAN_ITL",
"SERVER_MEDIAN_ITL",
"SERVER_P99_ITL",
"SERVER_MEAN_E2EL",
"SERVER_E2EL", # Median E2EL (note: disagg names this metric SERVER_MEDIAN_E2EL)
"SERVER_P99_E2EL",
],
),
# Accuracy test configuration
("disagg", "accuracy"): _COMMON_ACCURACY_METRICS,
("wideep", "accuracy"): _COMMON_ACCURACY_METRICS,
}
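# A config YAML can override parts of the defaults above through its metadata
# section; a minimal sketch (keys follow _get_metrics_config below, values are
# hypothetical):
#
#   metadata:
#     metrics:
#       log_file: "6_bench_custom.log"
#       metric_names: ["SERVER_MEDIAN_TTFT", "SERVER_MEDIAN_E2EL"]
#
# Only the keys present under metadata.metrics replace the defaults; any field
# that is omitted (e.g., extractor_pattern) keeps its default value via
# MetricsConfig.merge().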
@dataclass
class TestConfig:
"""Test configuration data class."""
config_path: str # YAML file path
temp_config_path: str # Temporary config file path (pre-calculated, not created yet)
test_id: str # Auto-generated test ID
test_type: str # disagg, wideep, etc.
model_name: str # Model name (read from metadata)
test_category: str # perf or accuracy
benchmark_type: str # 1k1k, 8k1k, etc. (generated from sequence)
config_data: dict # Full YAML content
metrics_config: MetricsConfig # Metrics configuration (default or overridden)
supported_gpus: List[str] # Supported GPU types list
accuracy_config: Optional[AccuracyConfig] = None # Accuracy configuration (for accuracy tests)
@property
def display_name(self) -> str:
"""Display name for pytest."""
return f"{self.test_type}/{self.test_category}/{Path(self.config_path).stem}"
class ConfigLoader:
"""Configuration loader with default metrics support."""
def __init__(self, base_dir: str = "test_configs"):
"""Initialize ConfigLoader.
Args:
base_dir: Base directory for test configs
"""
self.base_dir = Path(base_dir)
if not self.base_dir.exists():
raise FileNotFoundError(f"Config directory not found: {self.base_dir}")
def scan_configs(
self,
test_type: Optional[str] = None,
test_category: Optional[str] = None,
model_name: Optional[str] = None,
gpu_type: Optional[str] = None,
) -> List[TestConfig]:
"""Scan configuration files.
Directory structure: test_type/category/model_bench_config.yaml
Args:
test_type: Filter by test type (disagg, wideep, etc.)
test_category: Filter by category (perf, accuracy)
model_name: Filter by model name
gpu_type: Filter by GPU type (GB200, H100, etc.). If None, uses EnvManager.get_gpu_type()
Returns:
List of TestConfig objects (filtered by GPU support)
"""
# Get current GPU type from environment if not specified
if gpu_type is None:
gpu_type = EnvManager.get_gpu_type()
configs = []
if not self.base_dir.exists():
logger.warning(f"Config directory not found: {self.base_dir}")
return configs
# Traverse: test_type/category/config.yaml
for test_type_dir in self.base_dir.iterdir():
if not test_type_dir.is_dir() or test_type_dir.name == "templates":
continue
current_test_type = test_type_dir.name
# Filter by test_type
if test_type and current_test_type != test_type:
continue
# Traverse category (perf/accuracy)
for category_dir in test_type_dir.iterdir():
if not category_dir.is_dir():
continue
current_category = category_dir.name
# Filter by test_category
if test_category and current_category != test_category:
continue
# Load all YAML files in this category
for yaml_file in category_dir.glob("*.yaml"):
try:
config = self._load_config_file(
yaml_file, current_test_type, current_category
)
# Filter by model_name
if model_name and config.model_name != model_name:
continue
# Filter by GPU support
if gpu_type and gpu_type not in config.supported_gpus:
logger.info(
f"Skipping {yaml_file.name}: not supported on {gpu_type} "
f"(supported: {config.supported_gpus})"
)
continue
configs.append(config)
except Exception as e:
logger.warning(f"Failed to load {yaml_file}: {e}")
# Check if any configuration files are found
if len(configs) == 0:
# Build detailed error information
filter_info = []
if test_type:
filter_info.append(f"test_type='{test_type}'")
if test_category:
filter_info.append(f"test_category='{test_category}'")
if model_name:
filter_info.append(f"model_name='{model_name}'")
if gpu_type:
filter_info.append(f"gpu_type='{gpu_type}'")
filters = ", ".join(filter_info) if filter_info else "no filters"
raise RuntimeError(
f"No configuration files found in '{self.base_dir}' with {filters}. "
f"Please check:\n"
f" 1. Configuration files exist in the correct directory structure\n"
f" 2. YAML files contain valid 'metadata' section with required fields\n"
f" 3. GPU type '{gpu_type}' is in the 'supported_gpus' list\n"
f" 4. Filter parameters match existing configurations"
)
logger.success(f"Loaded {len(configs)} configurations for GPU type: {gpu_type}")
return configs
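# Typical usage (illustrative; the directory name and filter values are
# hypothetical):
#
#   loader = ConfigLoader(base_dir="test_configs")
#   configs = loader.scan_configs(test_type="disagg", test_category="perf",
#                                 gpu_type="GB300")
#   for cfg in configs:
#       print(cfg.test_id, cfg.metrics_config.log_file)
#
# Configs whose supported_gpus list does not include the requested GPU are
# skipped with an info log; if nothing matches the filters, a RuntimeError is
# raised.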
def _make_test_id(self, test_type: str, test_category: str, test_file_name: str) -> str:
"""Generate test ID from test type, category, and filename.
Since YAML filenames now contain all configuration info in a standardized format,
we simply combine test_type, test_category, and the filename.
Format: {test_type}_{test_category}_{test_file_name}
Example: disagg_perf_deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-NIXL
Args:
test_type: Test type (disagg, wideep, etc.)
test_category: Test category (perf, accuracy)
test_file_name: Test file name (without extension, contains all config info)
Returns:
Generated test ID string
"""
# Simplified: just combine test_type, test_category, and filename
# The filename already contains all necessary configuration details
test_id = f"{test_type}_{test_category}_{test_file_name}"
return test_id
def _load_config_file(self, yaml_path: Path, test_type: str, test_category: str) -> TestConfig:
"""Load single YAML config file."""
with open(yaml_path, "r") as f:
config_data = yaml.safe_load(f)
# Extract metadata from YAML file
metadata = config_data.get("metadata", {})
model_name = metadata.get("model_name", "unknown")
supported_gpus = metadata.get("supported_gpus", ["GB200", "GB300", "H100", "B200", "B300"])
# Override config with environment variables (in memory only, do not write back)
config_data = self._apply_env_overrides(config_data, model_name)
# Generate benchmark_type from sequence configuration
benchmark_type = self._generate_benchmark_type(config_data)
# Get metrics config (default or override)
metrics_config = self._get_metrics_config(test_type, test_category, config_data)
# Extract test file name (without extension)
test_file_name = yaml_path.stem # e.g., "deepseek-r1-fp4-0"
# Generate test ID using config data
test_id = self._make_test_id(test_type, test_category, test_file_name)
# Pre-calculate temporary config file path (not created yet, will be created in submit_job)
temp_filename = f"{yaml_path.stem}-temp.yaml"
temp_config_path = str(yaml_path.parent / temp_filename)
# Load accuracy configuration (only for accuracy tests)
accuracy_config = None
if test_category == "accuracy":
acc_meta = metadata.get("accuracy", {})
if acc_meta and "datasets" in acc_meta:
datasets = []
for ds_config in acc_meta["datasets"]:
# Parse optional hypothesis testing parameters
alpha = ds_config.get("alpha")
beta = ds_config.get("beta")
sigma = ds_config.get("sigma")
num_samples = ds_config.get("num_samples")
higher_is_better = ds_config.get("higher_is_better")
# Convert to appropriate types if present
if alpha is not None:
alpha = float(alpha)
if beta is not None:
beta = float(beta)
if sigma is not None:
sigma = float(sigma)
if num_samples is not None:
num_samples = int(num_samples)
if higher_is_better is not None:
higher_is_better = bool(higher_is_better)
datasets.append(
DatasetThreshold(
dataset_name=ds_config.get("name", "gsm8k"),
expected_value=float(ds_config.get("expected_value", 0.0)),
threshold_type=ds_config.get("threshold_type", "hypothesis_test"),
filter_type=ds_config.get("filter_type", "flexible-extract"),
alpha=alpha,
beta=beta,
sigma=sigma,
num_samples=num_samples,
higher_is_better=higher_is_better,
)
)
accuracy_config = AccuracyConfig(datasets=datasets)
logger.info(f"Loaded accuracy config with {len(datasets)} dataset(s)")
return TestConfig(
config_path=str(yaml_path),
temp_config_path=temp_config_path,
test_id=test_id,
test_type=test_type,
model_name=model_name,
test_category=test_category,
benchmark_type=benchmark_type,
config_data=config_data,
metrics_config=metrics_config,
supported_gpus=supported_gpus,
accuracy_config=accuracy_config,
)
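# Sketch of the metadata.accuracy layout this loader expects (keys taken from
# the parsing above; the numeric values are hypothetical, not recommended
# thresholds):
#
#   metadata:
#     accuracy:
#       datasets:
#         - name: gsm8k
#           expected_value: 0.95
#           threshold_type: hypothesis_test
#           filter_type: flexible-extract
#           alpha: 0.05
#           beta: 0.2
#           sigma: 0.01
#           num_samples: 1319
#           higher_is_better: true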
def _generate_benchmark_type(self, config_data: dict) -> str:
"""Generate benchmark type from sequence configuration.
Examples:
input=1024, output=1024 -> "1k1k"
input=8192, output=1024 -> "8k1k"
input=16384, output=2048 -> "16k2k"
Args:
config_data: Full YAML config data
Returns:
Benchmark type string (e.g., "1k1k", "8k1k")
"""
sequence = config_data.get("sequence", {})
input_length = sequence.get("input_length", 0)
output_length = sequence.get("output_length", 0)
# Convert to k notation
input_k = input_length // 1024
output_k = output_length // 1024
return f"{input_k}k{output_k}k"
def _get_metrics_config(
self, test_type: str, test_category: str, config_data: dict
) -> MetricsConfig:
"""Get metrics config: use default or merge with override.
Args:
test_type: Test type (e.g., 'disagg', 'wideep') used to select the default config
test_category: 'perf' or 'accuracy'
config_data: Full YAML config data
Returns:
MetricsConfig (default or merged with overrides)
"""
# Get default configuration
config_key = (test_type, test_category)
default_config = DEFAULT_METRICS_CONFIG.get(config_key)
if not default_config:
# No default configuration for this (test_type, test_category) pair: log and raise
logger.warning(f"No default metrics config for config_key: {config_key}")
raise ValueError(f"No default metrics config for config_key: {config_key}")
# Check if there are metrics overrides in YAML
# Metrics are defined in metadata section instead of benchmark
metadata_config = config_data.get("metadata", {})
metrics_override = metadata_config.get("metrics")
if metrics_override:
# There are metrics overrides, merge them
logger.debug("Using custom metrics config (overriding defaults)")
return default_config.merge(metrics_override)
else:
# No metrics overrides, use default
logger.debug(f"Using default metrics config for {test_category}")
return default_config
def _apply_env_overrides(self, config_data: dict, model_name: str) -> dict:
"""Apply environment variable overrides to configuration.
For each known (section, key) field path, the value is overwritten with the
corresponding environment-derived value whenever that section is present in the
config, so no placeholders are needed in the YAML. The original dict is not
modified; a deep copy is updated and returned.
Args:
config_data: Original configuration dict
model_name: Model name used to resolve model-specific values (e.g., container mount)
Returns:
Updated configuration dict with environment variables applied
"""
# Create a deep copy to avoid modifying original
config = copy.deepcopy(config_data)
# Field path mapping: (path, key) -> environment value getter
# Uses lazy evaluation to get values only when needed
field_mapping = {
("slurm", "partition"): lambda: EnvManager.get_slurm_partition(),
("slurm", "account"): lambda: EnvManager.get_slurm_account(),
("slurm", "job_name"): lambda: EnvManager.get_slurm_job_name(),
("environment", "container_mount"): lambda: EnvManager.get_container_mount(model_name),
("environment", "container_image"): lambda: EnvManager.get_container_image(),
("environment", "trtllm_repo"): lambda: EnvManager.get_repo_dir(),
("environment", "trtllm_wheel_path"): lambda: EnvManager.get_trtllm_wheel_path(),
("benchmark", "dataset_file"): lambda: self._get_dataset_file(config),
("environment", "work_dir"): lambda: EnvManager.get_script_dir(),
("environment", "model_path"): lambda: self._get_full_model_path(config),
("slurm", "script_file"): lambda: self._get_script_file(config),
("slurm", "set_segment"): lambda: EnvManager.get_slurm_set_segment(),
("slurm", "extra_args"): lambda: EnvManager.get_slurm_extra_args(),
}
# Apply overrides based on field paths
for (section, key), value_getter in field_mapping.items():
if section in config:
config[section][key] = value_getter()
return config
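# Illustrative behaviour (actual values depend on the caller's environment;
# the dict below is a made-up fragment):
#
#   original = {"slurm": {"partition": "", "account": ""}}
#   updated = loader._apply_env_overrides(original, "deepseek-r1-fp4")
#   # updated["slurm"]["partition"] now holds EnvManager.get_slurm_partition();
#   # `original` is untouched because the method works on a deep copy.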
def _get_full_model_path(self, config: dict) -> str:
"""Get full model path by combining MODEL_DIR with model directory name.
Priority:
1. metadata.model_dir_name (explicit model directory name)
2. Empty string (fallback)
"""
metadata = config.get("metadata", {})
model_dir_name = metadata.get("model_dir_name", "")
if model_dir_name:
return os.path.join(EnvManager.get_model_dir(), model_dir_name)
else:
return ""
def _get_dataset_file(self, config: dict) -> str:
"""Get dataset file by combining dataset directory with dataset file name.
Args:
config: Full YAML config data
"""
metadata = config.get("metadata", {})
dataset_file = metadata.get("dataset_file", "")
return os.path.join(EnvManager.get_dataset_dir(), dataset_file)
def _get_script_file(self, config: dict) -> str:
"""Get script file by combining scripts directory with script file name.
Args:
config: Full YAML config data
"""
metadata = config.get("metadata", {})
script_file = metadata.get("script_file", "disaggr_torch.slurm")
return os.path.join(EnvManager.get_script_dir(), script_file)
def _write_config_file(self, yaml_path: Path, config_data: dict) -> None:
"""Write updated configuration back to YAML file.
This is necessary because submit.py reads the YAML file to submit jobs.
The file needs to contain actual values, not empty placeholders.
Args:
yaml_path: Path to the YAML file
config_data: Updated configuration dict with environment values applied
"""
try:
with open(yaml_path, "w") as f:
yaml.dump(
config_data,
f,
default_flow_style=False,
sort_keys=False,
allow_unicode=True,
width=1000, # Prevent line wrapping
)
logger.success(f"Updated config written to: {yaml_path.name}")
except Exception as e:
logger.warning(f"Failed to write config file {yaml_path}: {e}")
# Don't fail the test if write fails, just log warning
def get_all_models(self) -> List[str]:
"""Get list of all unique model names."""
configs = self.scan_configs()
return sorted(set(config.model_name for config in configs))
def get_all_test_types(self) -> List[str]:
"""Get list of all test types."""
if not self.base_dir.exists():
return []
return sorted(
[d.name for d in self.base_dir.iterdir() if d.is_dir() and d.name != "templates"]
)
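# A minimal sketch of how the loaded configs might feed pytest parametrization
# (an assumption about the surrounding harness; the test name below is
# hypothetical):
#
#   import pytest
#
#   loader = ConfigLoader("test_configs")
#   configs = loader.scan_configs(test_category="perf")
#
#   @pytest.mark.parametrize("test_config", configs,
#                            ids=[c.display_name for c in configs])
#   def test_perf(test_config):
#       ...  # submit the job using test_config.config_data / temp_config_path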