"""Disaggregated Benchmark Executor."""

import os
import re
import shutil
import time
from typing import Any, Dict, List, Optional

import yaml
from reporting.report import LogParser, LogWriter, ResultSaver
from utils.common import (
    GPU_RESOURCE_CONFIG,
    SESSION_COLLECT_CMD_TYPE,
    EnvManager,
    extract_config_fields,
)
from utils.logger import logger

from execution.subprocess_utils import exec_cmd, exec_cmd_with_output

# ============================================================================
# SLURM Run Command Builder
# ============================================================================


class SlurmRunCommandBuilder:
    """SLURM Run Command Builder.

    Build srun commands for different GPU types and command types.
    Reuses GPU_RESOURCE_CONFIG for consistency with SlurmJobBuilder.
    """

    def build_srun_prefix(self, job_name: str) -> List[str]:
        """Build the srun command prefix based on GPU type."""
        gpu_type = EnvManager.get_gpu_type()

        # Reuse the same GPU_RESOURCE_CONFIG as SlurmJobBuilder
        gpu_config = GPU_RESOURCE_CONFIG.get(gpu_type)
        if not gpu_config:
            raise ValueError(
                f"GPU resource configuration not found for {gpu_type}. "
                f"Please add a configuration to GPU_RESOURCE_CONFIG."
            )

        # Common srun arguments
        srun_args = [
            "srun",
            "-l",
            "--container-name=sysinfo-get",
            f"--container-image={EnvManager.get_container_image()}",
            f"--container-mounts={EnvManager.get_container_mount()}",
        ]

        # Add the GPU-specific gres parameter (reuses the gres_gpu field);
        # only add --gres when gres_gpu is not None
        if gpu_config["gres_gpu"] is not None:
            srun_args.append(f"--gres=gpu:{gpu_config['gres_gpu']}")

        # Add common parameters
        srun_args.extend(
            [
                f"--partition={EnvManager.get_slurm_partition()}",
                f"--account={EnvManager.get_slurm_account()}",
                f"--job-name={job_name}",
                "--time=02:00:00",
                "--mpi=pmix",
                # Note: --overlap removed to ensure GPU allocation for session_collect,
                # which runs after all test jobs have completed
                "-N",
                "1",
                "-n",
                "1",
            ]
        )

        return srun_args

    def build_script_command(self, cmd_type: str) -> List[str]:
        """Build script command based on command type."""
        work_dir = EnvManager.get_work_dir()
        output_path = EnvManager.get_output_path()
        install_mode = EnvManager.get_install_mode()
        repo_dir = EnvManager.get_repo_dir()
        trtllm_wheel_path = EnvManager.get_trtllm_wheel_path()

        if cmd_type == SESSION_COLLECT_CMD_TYPE:
            if install_mode == "none":
                return [
                    "bash",
                    "-c",
                    f"cd {work_dir} && python3 {work_dir}/simple_collect.py {output_path}",
                ]
            elif install_mode == "wheel":
                # Install TensorRT-LLM wheel first, then run simple_collect.py
                # Note: Use --no-deps to avoid overwriting container's pre-installed packages (like torch)
                install_cmd = f"""
                cd {repo_dir}
                echo 'Step 1: Installing TensorRT-LLM wheel...'
                pip3 install {trtllm_wheel_path} || echo 'Wheel install failed, continuing...'
                echo 'Wheel installation completed'

                echo 'Step 2: Running simple_collect.py...'
                cd {work_dir}
                python3 {work_dir}/simple_collect.py {output_path}
                """
                return ["bash", "-c", install_cmd]
            elif install_mode == "source":
                install_cmd = f"""
                cd {repo_dir}
                pip3 install -e . || echo 'Source install failed, continuing...'

                echo 'Source installation completed'

                echo 'Step 3: Running simple_collect.py...'
                cd {work_dir}
                python3 {work_dir}/simple_collect.py {output_path}
                """
                return ["bash", "-c", install_cmd]
            else:
                raise ValueError(f"Invalid install mode: {install_mode}")
        else:
            # Future command types can be added here
            # elif cmd_type == "benchmark_collect":
            #     model_dir = EnvManager.get_model_dir()
            #     return [
            #         "bash", "-c",
            #         f"cd {work_dir} && python3 {work_dir}/benchmark_collect.py "
            #         f"--model-dir {model_dir} --output {output_path}"
            #     ]
            # elif cmd_type == "metrics_collect":
            #     return [
            #         "bash", "-c",
            #         f"cd {work_dir} && python3 {work_dir}/metrics_collect.py --config {work_dir}/config.yaml"
            #     ]
            raise ValueError(
                f"Unsupported command type: {cmd_type}. "
                f"Currently supported: {SESSION_COLLECT_CMD_TYPE}"
            )

    def run_job(
        self, cmd_type: str, job_name: str, log_file: Optional[str] = None
    ) -> Dict[str, Any]:
        """Execute an srun job.

        Args:
            cmd_type: Type of command to execute
            job_name: Name for the SLURM job
            log_file: Optional path to save command output

        Returns:
            Dict with status and message
        """
        try:
            # Build the complete command
            srun_prefix = self.build_srun_prefix(job_name)
            script_command = self.build_script_command(cmd_type)
            full_command = srun_prefix + script_command

            # Execute with an optional log file
            if log_file:
                logger.info(f"Saving output to: {log_file}")
                # Use Python file redirection to avoid shell quoting issues
                import subprocess

                with open(log_file, "w") as f:
                    result = subprocess.run(
                        full_command, stdout=f, stderr=subprocess.STDOUT, timeout=7200, text=True
                    )
                if result.returncode != 0:
                    raise subprocess.CalledProcessError(result.returncode, full_command)
                logger.success(f"Output saved to {log_file}")
                output = ""  # Output is in the file
            else:
                output = exec_cmd_with_output(full_command, timeout=7200)

            return {"status": True, "msg": "Job executed successfully", "output": output}
        except Exception as e:
            logger.error(f"Job execution failed: {e}")
            return {"status": False, "msg": str(e)}


def make_slurm_run_command():
    """Create a run-job function (maintains interface compatibility)."""
    builder = SlurmRunCommandBuilder()
    return builder.run_job


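# A minimal usage sketch (illustrative only): how the factory above is typically
# consumed. The job name and log path are made-up values, not ones used elsewhere
# in this module; SESSION_COLLECT_CMD_TYPE is the only command type currently
# supported by build_script_command().
#
#   run_session_collect = make_slurm_run_command()
#   outcome = run_session_collect(
#       SESSION_COLLECT_CMD_TYPE,
#       job_name="sysinfo-collect",            # hypothetical job name
#       log_file="/tmp/session_collect.log",   # hypothetical log path
#   )
#   if not outcome["status"]:
#       logger.error(outcome["msg"])

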
class JobManager:
    """Job manager class."""

    @staticmethod
    def submit_job(test_config) -> tuple:
        """Submit a job using submit.py with a YAML config.

        Args:
            test_config: TestConfig object containing configuration

        Returns:
            tuple: (success: bool, job_id: str)
        """
        logger.info("Submitting job using submit.py...")

        try:
            # Get the pre-calculated temporary config file path from test_config
            temp_config_path = test_config.temp_config_path

            # Write a temporary config file with environment variables replaced
            logger.info(f"Creating temporary config: {temp_config_path}")
            with open(temp_config_path, "w") as f:
                yaml.dump(
                    test_config.config_data,
                    f,
                    default_flow_style=False,
                    sort_keys=False,
                    allow_unicode=True,
                    width=1000,
                )
            logger.success(f"Temporary config created: {os.path.basename(temp_config_path)}")

            # Call submit.py with the temporary config file
            submit_script = os.path.join(EnvManager.get_script_dir(), "submit.py")
            cmd = ["python3", submit_script, "-c", temp_config_path]

            logger.info(f"Command: {' '.join(cmd)}")

            # Execute the submission
            output = exec_cmd_with_output(cmd, timeout=60)
            logger.info(f"Output: {output}")

            # Parse the job ID from the output
            if "Submitted batch job" in output:
                match = re.search(r"Submitted batch job (\d+)", output)
                if match:
                    job_id = match.group(1)
                    logger.success(f"Job submitted successfully: {job_id}")
                    return True, job_id

            logger.error("Unable to extract job ID from output")
            # Clean up the temporary file if submission failed
            if os.path.exists(temp_config_path):
                os.remove(temp_config_path)
            return False, ""

        except Exception as e:
            error_msg = str(e)
            # Extract stderr from CalledProcessError if available
            if hasattr(e, "stderr") and e.stderr:
                error_msg = e.stderr.decode() if isinstance(e.stderr, bytes) else e.stderr
            logger.error(f"Job submission exception: {error_msg}")
            # Clean up temporary file on exception (currently disabled)
            # temp_config_path = test_config.temp_config_path
            # if os.path.exists(temp_config_path):
            #     os.remove(temp_config_path)
            return False, error_msg

    @staticmethod
    def backup_logs(
        job_id: str,
        test_config,
        result_dir: str,
        is_passed: bool,
    ) -> Optional[str]:
        """Back up logs and config files to the test_id directory.

        Args:
            job_id: SLURM job ID
            test_config: TestConfig object
            result_dir: Result directory path
            is_passed: Whether the job passed

        Returns:
            backup_dir path if successful, None otherwise
        """
        if not os.path.exists(result_dir):
            logger.warning(f"Result directory does not exist yet: {result_dir}")
            return None

        # Replace colons with hyphens for safe directory naming
        dst_dir_name = test_config.test_id.replace(":", "-")
        # Add an ERROR suffix if the job failed
        if not is_passed:
            dst_dir_name = f"{dst_dir_name}_ERROR"
        backup_dir = os.path.join(os.path.dirname(result_dir), dst_dir_name)

        try:
            logger.info("Copying result directory to backup...")
            logger.info(f"Source: {result_dir}")
            logger.info(f"Destination: {backup_dir}")

            # Remove the old backup if it exists
            if os.path.exists(backup_dir):
                logger.warning("Backup directory already exists, removing old backup")
                shutil.rmtree(backup_dir)

            # Copy the result directory
            shutil.copytree(result_dir, backup_dir)
            logger.success(f"Backup created successfully: {backup_dir}")

            # Move (not copy) the temporary config file to the backup directory
            temp_config_path = test_config.temp_config_path
            if os.path.exists(temp_config_path):
                dest_path = os.path.join(backup_dir, os.path.basename(temp_config_path))
                shutil.move(temp_config_path, dest_path)
                logger.success(f"Temporary config moved to backup: {dest_path}")
            else:
                # Fallback: copy the original config if there is no temp file (backward compatibility)
                case_config_path = test_config.config_path
                if os.path.exists(case_config_path):
                    shutil.copy(case_config_path, backup_dir)
                    logger.success(f"Case config copied successfully: {case_config_path}")
                else:
                    logger.warning(f"Case config not found: {case_config_path}")

            return backup_dir

        except Exception as e:
            logger.warning(f"Failed to create backup copy: {e}")
            # Try to clean up the temporary file on backup failure
            temp_config_path = test_config.temp_config_path
            if os.path.exists(temp_config_path):
                try:
                    os.remove(temp_config_path)
                    logger.info(f"Cleaned up temp config after backup failure: {temp_config_path}")
                except Exception as cleanup_error:
                    logger.warning(f"Failed to cleanup temp config: {cleanup_error}")
            return None

    @staticmethod
    def cleanup_result_dir(result_dir: str) -> bool:
        """Clean up result directory.

        Args:
            result_dir: Result directory path

        Returns:
            True if successful, False otherwise
        """
        if os.path.exists(result_dir):
            try:
                shutil.rmtree(result_dir)
                logger.success(f"Result directory removed: {result_dir}")
                return True
            except Exception as e:
                logger.warning(f"Failed to remove result directory: {e}")
                return False
        return True

    @staticmethod
    def get_result_dir(test_config) -> str:
        """Get result directory.

        Args:
            test_config: TestConfig object

        Returns:
            Result directory path
        """
        config_data = test_config.config_data
        fields = extract_config_fields(config_data)

        # Extract fields for logging and result directory
        log_base = fields["log_base"]
        context_dir = fields["context_dir"]
        log_dir_name = log_base

        result_dir = os.path.join(EnvManager.get_script_dir(), log_dir_name, context_dir)
        return result_dir

    @staticmethod
    def check_result(
        job_id: str,
        test_config,
        timestamps: Optional[Dict[str, str]] = None,
        test_name: Optional[str] = None,
    ) -> Dict[str, Any]:
        """Check job execution result and generate a report.

        High-level method that automatically extracts parameters from TestConfig,
        parses logs, generates performance reports, and saves results to CSV.

        Note: backup_logs should be called separately by the caller (test_disagg.py).

        Args:
            job_id: SLURM job ID
            test_config: TestConfig object containing configuration
            timestamps: Optional timestamps dict for the test case
            test_name: Optional test name for reporting

        Returns:
            Dict with 'success' status and other result information
        """
        config_data = test_config.config_data
        # Get the result directory
        result_dir = JobManager.get_result_dir(test_config)
        logger.info(f"Result directory: {result_dir}")

        # Initialize a default result in case of an exception
        check_result = {"job_id": job_id, "status": "ERROR", "success": False}

        try:
            # Call the internal implementation method
            check_result = JobManager._check_job_result(
                job_id=job_id,
                test_category=test_config.test_category,  # Pass test category for routing
                benchmark_type=test_config.benchmark_type,
                config=config_data,
                metrics_config=test_config.metrics_config,
                accuracy_config=test_config.accuracy_config,  # Pass accuracy config
                model_name=test_config.model_name,
                result_dir=result_dir,
                timestamps=timestamps,
                test_name=test_name,
            )
        except Exception as e:
            logger.error(f"Exception during result checking: {e}")
            check_result["error"] = f"Exception during result checking: {str(e)}"

        # Clean up the result directory
        if EnvManager.get_debug_mode():
            logger.debug(f"Debug mode: Skipping result directory cleanup: {result_dir}")
        else:
            try:
                JobManager.cleanup_result_dir(result_dir)
            except Exception as e:
                logger.warning(f"Failed to cleanup result directory: {e}")

        return check_result

    @staticmethod
    def check_for_early_failure(job_id: str, test_config) -> tuple[bool, Optional[str]]:
        """Check logs for early failure indicators.

        Args:
            job_id: SLURM job ID
            test_config: TestConfig object

        Returns:
            tuple: (has_error, error_message)
        """
        # Key error patterns
        error_patterns = [
            (
                r"\[E\]\s+Traceback[\s\S]*?mpi4py\.MPI\.Comm\.allgather",
                "MPI communication error detected",
            ),
            (
                r"\[E\]\s+Traceback[\s\S]*?pickle data was truncated",
                "Pickle serialization error detected",
            ),
        ]

        try:
            # Check gen and ctx server logs if result_dir exists
            try:
                result_dir = JobManager.get_result_dir(test_config)
                if os.path.exists(result_dir):
                    # Find all output_gen_*.log and output_ctx_*.log files
                    for filename in os.listdir(result_dir):
                        is_gen_log = filename.startswith("output_gen_")
                        is_ctx_log = filename.startswith("output_ctx_")
                        if (is_gen_log or is_ctx_log) and filename.endswith(".log"):
                            log_path = os.path.join(result_dir, filename)
                            try:
                                with open(log_path, "r", encoding="utf-8", errors="ignore") as f:
                                    # Read last 100KB
                                    f.seek(0, 2)
                                    file_size = f.tell()
                                    f.seek(max(0, file_size - 102400), 0)
                                    recent_content = f.read()

                                for pattern, error_msg in error_patterns:
                                    if re.search(pattern, recent_content, re.MULTILINE):
                                        return True, f"{error_msg} in {filename}"
                            except Exception as e:
                                logger.warning(f"Failed to check {filename}: {e}")
            except Exception:
                # result_dir might not exist yet, that's OK
                pass

        except Exception as e:
            logger.warning(f"Error during early failure check: {e}")

        return False, None

    @staticmethod
    def check_job_exists(job_id: str) -> bool:
        """Check whether the job still exists in the SLURM queue.

        Returns:
            True if the job exists (running or pending), False if the job is gone
        """
        try:
            # Use squeue to check if the job exists in the queue
            squeue_output = exec_cmd_with_output(["squeue", "-j", job_id, "--noheader"], timeout=30)
            # If the output is not empty, the job exists
            return bool(squeue_output.strip())
        except Exception as e:
            # If the command fails, assume the job doesn't exist
            logger.debug(f"squeue check failed (job likely finished): {e}")
            return False

    @staticmethod
    def wait_for_completion(
        job_id: str, timeout: int = 3600, test_config=None, check_early_failure: bool = True
    ) -> None:
        """Wait for the job to finish (disappear from the queue).

        Simplified logic: just wait until the job no longer exists in the SLURM queue,
        regardless of its final status (COMPLETED, CANCELLED, FAILED, etc.).
        If a timeout or an early failure is detected, cancel the job.
        The actual success/failure is determined later by log file parsing.

        Args:
            job_id: SLURM job ID
            timeout: Maximum wait time in seconds
            test_config: TestConfig object (required for early failure detection)
            check_early_failure: Whether to check logs for early failures
        """
        start_time = time.time()
        check_interval = 180  # Check every 3 minutes
        failure_check_interval = 60  # Check for failures every 60 seconds
        last_failure_check = start_time

        # Wait for the job to appear in the system (initial delay)
        logger.info(f"Waiting for job {job_id} to start...")
        time.sleep(60)  # Initial wait for the job to be scheduled

        logger.info(f"Waiting for job {job_id} to finish...")

        while time.time() - start_time < timeout:
            # Simple check: does the job still exist?
            job_exists = JobManager.check_job_exists(job_id)

            if not job_exists:
                # The job has disappeared from the queue, so it is done (whatever the status was)
                logger.success(f"Job {job_id} finished (no longer in queue)")
                return

            # Check for early failures (only if test_config is provided)
            current_time = time.time()
            if (
                check_early_failure
                and test_config
                and current_time - last_failure_check >= failure_check_interval
            ):
                has_error, error_msg = JobManager.check_for_early_failure(job_id, test_config)
                if has_error:
                    logger.error(f"Early failure detected: {error_msg}")
                    logger.warning(f"Cancelling job {job_id} due to early failure")
                    JobManager.cancel_job(job_id)
                    # Wait briefly for the job to be cancelled, then return
                    time.sleep(10)
                    return
                last_failure_check = current_time

            time.sleep(check_interval)

        # Timeout: cancel the job
        logger.warning(f"Job {job_id} timeout after {timeout} seconds, cancelling...")
        JobManager.cancel_job(job_id)
        # Wait briefly for the job to be cancelled
        time.sleep(10)

    @staticmethod
    def cancel_job(job_id: str) -> bool:
        """Cancel job."""
        try:
            exec_cmd(["scancel", job_id], timeout=30)
            logger.warning(f"Job cancelled: {job_id}")
            return True
        except Exception as e:
            logger.error(f"Job cancellation failed: {e}")
            return False

    @staticmethod
    def _print_logs_to_console(job_id: str, result_dir: str) -> None:
        """Print the SLURM log and all .log/.yaml files in result_dir to the console.

        Args:
            job_id: SLURM job ID, used to locate the slurm log file
            result_dir: Result directory containing log and config files
        """
        # Print the slurm log to the console (check that it exists first)
        slurm_log_path = os.path.join(EnvManager.get_work_dir(), f"slurm-{job_id}.out")
        if os.path.exists(slurm_log_path):
            slurm_log_writer = LogWriter(EnvManager.get_work_dir())
            slurm_log_writer.print_to_console(f"slurm-{job_id}.out")
        else:
            logger.warning(f"SLURM log file not found: {slurm_log_path}")

        # Print all .log and .yaml files in result_dir (except output_server.log)
        if not os.path.exists(result_dir):
            logger.warning(f"Result directory not found: {result_dir}")
            return

        log_writer = LogWriter(result_dir)
        files_to_print = []
        for file in os.listdir(result_dir):
            if (file.endswith(".log") or file.endswith(".yaml")) and file != "output_server.log":
                files_to_print.append(file)

        # Sort files for a consistent output order
        files_to_print.sort()

        for file in files_to_print:
            file_path = os.path.join(result_dir, file)
            if os.path.exists(file_path):
                log_writer.print_to_console(file)
            else:
                logger.warning(f"Log file not found: {file}")

    @staticmethod
    def _check_accuracy_result(
        job_id: str,
        metrics_config,
        accuracy_config,
        result_dir: str,
    ) -> Dict[str, Any]:
        """Check an accuracy test result.

        Args:
            job_id: SLURM job ID
            metrics_config: MetricsConfig object
            accuracy_config: AccuracyConfig object
            result_dir: Result directory

        Returns:
            Dict with success status and accuracy details
        """
        # Initialize the base result
        result: Dict[str, Any] = {"job_id": job_id, "status": "UNKNOWN", "success": False}

        # Validate accuracy_config
        if not accuracy_config:
            result["error"] = "Accuracy config not found in test configuration"
            return result

        # Check that result_dir exists
        if not os.path.exists(result_dir):
            error_msg = f"Result directory not found: {result_dir}"
            logger.error(error_msg)
            result["error"] = error_msg
            return result

        # Check that the required log file exists (7_accuracy_eval.log)
        accuracy_log = os.path.join(result_dir, "7_accuracy_eval.log")
        if not os.path.exists(accuracy_log):
            error_msg = f"Accuracy evaluation log file not found: {accuracy_log}"
            logger.error(error_msg)
            result["error"] = error_msg
            return result

        # Import and use AccuracyParser
        from reporting.accuracy_parser import AccuracyParser

        accuracy_parser = AccuracyParser(metrics_config, accuracy_config, result_dir)
        validation_result = accuracy_parser.parse_and_validate()

        # Check whether parsing succeeded
        if not validation_result["success"]:
            result["error"] = validation_result.get("error", "Accuracy validation failed")
            return result

        # Log validation results
        logger.info("Accuracy Validation Results:")
        all_passed = validation_result["all_passed"]

        # Log results for each run (using dataclass attributes for type safety)
        for run_validation in validation_result.get("runs", []):
            run_name = run_validation.run_name
            run_passed = run_validation.all_passed
            status = "PASSED" if run_passed else "FAILED"

            logger.info(f"[{status}] {run_name}:")

            for ds_result in run_validation.results:
                status = "PASSED" if ds_result.passed else "FAILED"
                dataset_name = ds_result.dataset
                filter_type = ds_result.filter
                threshold_type = ds_result.threshold_type

                logger.info(f"  [{status}] {dataset_name} ({filter_type}) - {threshold_type}:")
                if ds_result.error:
                    logger.error(f"    Error: {ds_result.error}")
                else:
                    logger.info(f"    Expected: {ds_result.expected:.4f}")
                    logger.info(f"    Actual: {ds_result.actual:.4f}")
                    logger.info(f"    Threshold type: {ds_result.threshold_type}")
                    logger.info(f"    {ds_result.message}")

        # Set the result status
        if all_passed:
            logger.success("All accuracy tests PASSED (all runs)")
            result["success"] = True
            result["status"] = "PASSED"
        else:
            logger.failure("Some accuracy tests FAILED")
            result["success"] = False
            result["status"] = "FAILED"
            result["error"] = "Some accuracy tests FAILED"

        # Add detailed results
        result["all_passed"] = validation_result["all_passed"]
        result["accuracy_runs"] = validation_result["runs"]
        result["raw_accuracy"] = validation_result.get("raw_results", [])

        return result

    @staticmethod
    def _check_perf_result(
        job_id: str,
        benchmark_type: str,
        config: dict,
        metrics_config,
        model_name: str,
        result_dir: str,
        timestamps: Optional[Dict[str, str]] = None,
        test_name: Optional[str] = None,
    ) -> Dict[str, Any]:
        """Check performance test result.

        Args:
            job_id: SLURM job ID
            benchmark_type: Benchmark type (1k1k, 8k1k, etc.)
            config: Configuration dict (YAML data)
            metrics_config: MetricsConfig object
            model_name: Model name
            result_dir: Result directory
            timestamps: Optional timestamps dict
            test_name: Optional test name

        Returns:
            Dict with success status and performance details
        """
        result = {"job_id": job_id, "status": "UNKNOWN", "success": False}

        # Check if result_dir exists
        if not os.path.exists(result_dir):
            error_msg = f"Result directory not found: {result_dir}"
            logger.error(error_msg)
            result["error"] = error_msg
            return result

        # Check if required log file exists (6_bench.log)
        bench_log = os.path.join(result_dir, "6_bench.log")
        if not os.path.exists(bench_log):
            error_msg = f"Benchmark log file not found: {bench_log}"
            logger.error(error_msg)
            result["error"] = error_msg
            return result

        # Parse metrics and save to CSV
        log_parser = LogParser(benchmark_type, config, metrics_config, result_dir)
        parse_result = log_parser.parse(model_name, timestamps=timestamps, test_name=test_name)

        if not parse_result["status"]:
            result["error"] = "Failed to parse benchmark logs"
            return result

        # Check if df is None
        result_df = parse_result.get("df")
        if result_df is None:
            logger.error("Parse result contains None DataFrame")
            result["error"] = "Parse result contains None DataFrame"
            return result

        # Save results to CSV
        output_path = EnvManager.get_output_path()
        os.makedirs(output_path, exist_ok=True)

        output_csv = os.path.join(output_path, "perf_script_test_results.csv")
        result_saver = ResultSaver(output_csv)
        result_saver.append_a_df(result_df)

        result["success"] = True
        result["status"] = "SUCCESS"
        return result

    @staticmethod
    def _check_job_result(
        job_id: str,
        test_category: str,
        benchmark_type: str,
        config: dict,
        metrics_config,
        accuracy_config,
        model_name: str,
        result_dir: str,
        timestamps: Optional[Dict[str, str]] = None,
        test_name: Optional[str] = None,
    ) -> Dict[str, Any]:
        """Internal method: Check job result with category routing.

        This is a low-level method that requires manual parameter extraction.
        Use check_result() for a high-level interface with TestConfig.

        Args:
            job_id: SLURM job ID
            test_category: Test category ("perf" or "accuracy")
            benchmark_type: Benchmark type (1k1k, 8k1k, etc.)
            config: Configuration dict (YAML data)
            metrics_config: MetricsConfig object (default or custom)
            accuracy_config: AccuracyConfig object (required for accuracy tests)
            model_name: Model name
            result_dir: Result directory
            timestamps: Optional timestamps dict
            test_name: Optional test name

        Returns:
            Dict with success status and details
        """
        logger.info(f"Checking result directory: {result_dir}")

        # Print logs and config files to the console
        JobManager._print_logs_to_console(job_id, result_dir)

        # Route based on test_category
        if test_category == "accuracy":
            return JobManager._check_accuracy_result(
                job_id=job_id,
                metrics_config=metrics_config,
                accuracy_config=accuracy_config,
                result_dir=result_dir,
            )
        else:  # perf
            return JobManager._check_perf_result(
                job_id=job_id,
                benchmark_type=benchmark_type,
                config=config,
                metrics_config=metrics_config,
                model_name=model_name,
                result_dir=result_dir,
                timestamps=timestamps,
                test_name=test_name,
            )


# Create the module-level executor function
run_job = make_slurm_run_command()
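
# A rough end-to-end sketch (assumptions flagged): it presumes a TestConfig object
# exposing the attributes this module reads (temp_config_path, config_data, test_id,
# test_category, benchmark_type, metrics_config, accuracy_config, model_name, ...),
# and the timeout below is illustrative. The real driver (test_disagg.py) also decides
# when to call backup_logs and with which is_passed value.
#
#   submitted, job_id = JobManager.submit_job(test_config)
#   if submitted:
#       JobManager.wait_for_completion(job_id, timeout=7200, test_config=test_config)
#       outcome = JobManager.check_result(job_id, test_config, test_name="example-case")
#       logger.info(f"status={outcome['status']} success={outcome['success']}")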