# TensorRT-LLMs/tests/integration/defs/perf/disagg/execution/executor.py
"""Disaggregated Benchmark Executor."""
import os
import re
import shutil
import subprocess
import time
from typing import Any, Dict, List, Optional
import yaml
from reporting.report import LogParser, LogWriter, ResultSaver
from utils.common import (
GPU_RESOURCE_CONFIG,
SESSION_COLLECT_CMD_TYPE,
EnvManager,
extract_config_fields,
)
from utils.logger import logger
from execution.subprocess_utils import exec_cmd, exec_cmd_with_output
# ============================================================================
# SLURM Run Command Builder
# ============================================================================
class SlurmRunCommandBuilder:
"""SLURM Run Command Builder.
Build srun commands for different GPU types and command types.
Reuses GPU_RESOURCE_CONFIG for consistency with SlurmJobBuilder.
"""
def build_srun_prefix(self, job_name: str) -> List[str]:
"""Build srun command prefix based on GPU type."""
gpu_type = EnvManager.get_gpu_type()
# Reuse the same GPU_RESOURCE_CONFIG as SlurmJobBuilder
gpu_config = GPU_RESOURCE_CONFIG.get(gpu_type)
if not gpu_config:
raise ValueError(
f"GPU resource configuration not found for {gpu_type}. "
f"Please add configuration in GPU_RESOURCE_CONFIG."
)
# Common srun arguments
srun_args = [
"srun",
"-l",
"--container-name=sysinfo-get",
f"--container-image={EnvManager.get_container_image()}",
f"--container-mounts={EnvManager.get_container_mount()}",
]
# Add GPU-specific gres parameter (reuse gres_gpu field)
# If gres_gpu is not None, add --gres parameter
if gpu_config["gres_gpu"] is not None:
srun_args.append(f"--gres=gpu:{gpu_config['gres_gpu']}")
# Add common parameters
srun_args.extend(
[
f"--partition={EnvManager.get_slurm_partition()}",
f"--account={EnvManager.get_slurm_account()}",
f"--job-name={job_name}",
"--time=02:00:00",
"--mpi=pmix",
# Note: Removed --overlap to ensure GPU allocation for session_collect
# which runs after all test jobs have completed
"-N",
"1",
"-n",
"1",
]
)
return srun_args
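# Illustrative sketch of the prefix this method produces. The GPU type, the
# GPU_RESOURCE_CONFIG entry, and the environment-derived values shown below are
# hypothetical, not taken from the real configuration:
#
#     # e.g. GPU_RESOURCE_CONFIG == {"GB300": {"gres_gpu": 4}, ...}
#     builder = SlurmRunCommandBuilder()
#     prefix = builder.build_srun_prefix("sysinfo-collect")
#     # -> ["srun", "-l", "--container-name=sysinfo-get",
#     #     "--container-image=<image>", "--container-mounts=<mounts>",
#     #     "--gres=gpu:4", "--partition=<partition>", "--account=<account>",
#     #     "--job-name=sysinfo-collect", "--time=02:00:00", "--mpi=pmix",
#     #     "-N", "1", "-n", "1"]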
def build_script_command(self, cmd_type: str) -> List[str]:
"""Build script command based on command type."""
work_dir = EnvManager.get_work_dir()
output_path = EnvManager.get_output_path()
install_mode = EnvManager.get_install_mode()
repo_dir = EnvManager.get_repo_dir()
trtllm_wheel_path = EnvManager.get_trtllm_wheel_path()
if cmd_type == SESSION_COLLECT_CMD_TYPE:
if install_mode == "none":
return [
"bash",
"-c",
f"cd {work_dir} && python3 {work_dir}/simple_collect.py {output_path}",
]
elif install_mode == "wheel":
# Install TensorRT-LLM wheel first, then run simple_collect.py
# Note: Use --no-deps to avoid overwriting container's pre-installed packages (like torch)
install_cmd = f"""
cd {repo_dir}
echo 'Step 1: Installing TensorRT-LLM wheel...'
pip3 install --no-deps {trtllm_wheel_path} || echo 'Wheel install failed, continuing...'
echo 'Wheel installation completed'
echo 'Step 2: Running simple_collect.py...'
cd {work_dir}
python3 {work_dir}/simple_collect.py {output_path}
"""
return ["bash", "-c", install_cmd]
elif install_mode == "source":
install_cmd = f"""
cd {repo_dir}
echo 'Step 1: Installing TensorRT-LLM from source...'
pip3 install -e . || echo 'Source install failed, continuing...'
echo 'Source installation completed'
echo 'Step 2: Running simple_collect.py...'
cd {work_dir}
python3 {work_dir}/simple_collect.py {output_path}
"""
return ["bash", "-c", install_cmd]
else:
raise ValueError(f"Invalid install mode: {install_mode}")
else:
# Future command types can be added here
# elif cmd_type == "benchmark_collect":
# model_dir = EnvManager.get_model_dir()
# return [
# "bash", "-c",
# f"cd {work_dir} && python3 {work_dir}/benchmark_collect.py "
# f"--model-dir {model_dir} --output {output_path}"
# ]
# elif cmd_type == "metrics_collect":
# return [
# "bash", "-c",
# f"cd {work_dir} && python3 {work_dir}/metrics_collect.py --config {work_dir}/config.yaml"
# ]
raise ValueError(
f"Unsupported command type: {cmd_type}. "
f"Currently supported: {SESSION_COLLECT_CMD_TYPE}"
)
def run_job(self, cmd_type: str, job_name: str, log_file: Optional[str] = None) -> Dict[str, Any]:
"""Execute srun job.
Args:
cmd_type: Type of command to execute
job_name: Name for the SLURM job
log_file: Optional path to save command output
Returns:
Dict with status and message
"""
try:
# Build complete command
srun_prefix = self.build_srun_prefix(job_name)
script_command = self.build_script_command(cmd_type)
full_command = srun_prefix + script_command
# Execute with optional log file
if log_file:
logger.info(f"Saving output to: {log_file}")
# Use Python file redirection to avoid shell quoting issues
with open(log_file, "w") as f:
result = subprocess.run(
full_command, stdout=f, stderr=subprocess.STDOUT, timeout=7200, text=True
)
if result.returncode != 0:
raise subprocess.CalledProcessError(result.returncode, full_command)
logger.success(f"Output saved to {log_file}")
output = "" # Output is in file
else:
output = exec_cmd_with_output(full_command, timeout=7200)
return {"status": True, "msg": "Job executed successfully", "output": output}
except Exception as e:
logger.error(f"Job execution failed: {e}")
return {"status": False, "msg": str(e)}
def make_slurm_run_command():
"""Create run command function (maintain interface compatibility)."""
builder = SlurmRunCommandBuilder()
return builder.run_job
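# Usage sketch for the builder-backed run command (the job name and log path
# below are hypothetical):
#
#     run = make_slurm_run_command()
#     result = run(SESSION_COLLECT_CMD_TYPE, "session-collect",
#                  log_file="/tmp/session_collect.log")
#     if not result["status"]:
#         logger.error(f"Session collect failed: {result['msg']}")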
class JobManager:
"""Job manager class."""
@staticmethod
def submit_job(test_config) -> tuple:
"""Submit job using submit.py with YAML config.
Args:
test_config: TestConfig object containing configuration
Returns:
tuple: (success: bool, job_id: str) - job_id is the SLURM job ID on success;
on failure the second element is an empty string or an error message
"""
logger.info("Submitting job using submit.py...")
try:
# Get pre-calculated temporary config file path from test_config
temp_config_path = test_config.temp_config_path
# Write temporary config file with replaced environment variables
logger.info(f"Creating temporary config: {temp_config_path}")
with open(temp_config_path, "w") as f:
yaml.dump(
test_config.config_data,
f,
default_flow_style=False,
sort_keys=False,
allow_unicode=True,
width=1000,
)
logger.success(f"Temporary config created: {os.path.basename(temp_config_path)}")
# Call submit.py with the temporary config file
submit_script = os.path.join(EnvManager.get_script_dir(), "submit.py")
cmd = ["python3", submit_script, "-c", temp_config_path]
logger.info(f"Command: {' '.join(cmd)}")
# Execute submission
output = exec_cmd_with_output(cmd, timeout=60)
logger.info(f"Output: {output}")
# Parse job ID from output
if "Submitted batch job" in output:
match = re.search(r"Submitted batch job (\d+)", output)
if match:
job_id = match.group(1)
logger.success(f"Job submitted successfully: {job_id}")
return True, job_id
logger.error("Unable to extract job ID from output")
# Clean up temporary file if submission failed
if os.path.exists(temp_config_path):
os.remove(temp_config_path)
return False, ""
except Exception as e:
error_msg = str(e)
# Extract stderr from CalledProcessError if available
if hasattr(e, "stderr") and e.stderr:
error_msg = e.stderr.decode() if isinstance(e.stderr, bytes) else e.stderr
logger.error(f"Job submission exception: {error_msg}")
# Clean up temporary file on exception (mirrors the failure path above)
temp_config_path = test_config.temp_config_path
if os.path.exists(temp_config_path):
    os.remove(temp_config_path)
return False, error_msg
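# Sketch of the job-ID extraction performed above, applied to a hypothetical
# sbatch output line:
#
#     output = "Submitted batch job 1234567"
#     match = re.search(r"Submitted batch job (\d+)", output)
#     job_id = match.group(1) if match else ""   # -> "1234567"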
@staticmethod
def backup_logs(
job_id: str,
test_config,
result_dir: str,
is_passed: bool,
) -> Optional[str]:
"""Backup logs and config files to test_id directory.
Args:
job_id: SLURM job ID
test_config: TestConfig object
result_dir: Result directory path
is_passed: Whether the job passed
Returns:
backup_dir path if successful, None otherwise
"""
if not os.path.exists(result_dir):
logger.warning(f"Result directory does not exist yet: {result_dir}")
return None
# Replace colons with hyphens for safe directory naming
dst_dir_name = test_config.test_id.replace(":", "-")
# Add ERROR suffix if the job failed
if not is_passed:
dst_dir_name = f"{dst_dir_name}_ERROR"
backup_dir = os.path.join(os.path.dirname(result_dir), dst_dir_name)
try:
logger.info("Copying result directory to backup...")
logger.info(f"Source: {result_dir}")
logger.info(f"Destination: {backup_dir}")
# Remove old backup if it exists
if os.path.exists(backup_dir):
logger.warning("Backup directory already exists, removing old backup")
shutil.rmtree(backup_dir)
# Copy result directory
shutil.copytree(result_dir, backup_dir)
logger.success(f"Backup created successfully: {backup_dir}")
# Move temporary config file to backup directory (not copy)
temp_config_path = test_config.temp_config_path
if os.path.exists(temp_config_path):
dest_path = os.path.join(backup_dir, os.path.basename(temp_config_path))
shutil.move(temp_config_path, dest_path)
logger.success(f"Temporary config moved to backup: {dest_path}")
else:
# Fallback: copy original config if no temp file (backward compatibility)
case_config_path = test_config.config_path
if os.path.exists(case_config_path):
shutil.copy(case_config_path, backup_dir)
logger.success(f"Case config copied successfully: {case_config_path}")
else:
logger.warning(f"Case config not found: {case_config_path}")
return backup_dir
except Exception as e:
logger.warning(f"Failed to create backup copy: {e}")
# Try to clean up temporary file on backup failure
temp_config_path = test_config.temp_config_path
if os.path.exists(temp_config_path):
try:
os.remove(temp_config_path)
logger.info(f"Cleaned up temp config after backup failure: {temp_config_path}")
except Exception as cleanup_error:
logger.warning(f"Failed to cleanup temp config: {cleanup_error}")
return None
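# Sketch of the backup naming used above (the test_id and result_dir values
# below are hypothetical):
#
#     # A test_id of "disagg:gb300:1k1k" for a failed run becomes the directory
#     # "disagg-gb300-1k1k_ERROR", created next to result_dir:
#     dst_dir_name = "disagg:gb300:1k1k".replace(":", "-") + "_ERROR"
#     backup_dir = os.path.join(os.path.dirname("/logs/run_01"), dst_dir_name)
#     # -> "/logs/disagg-gb300-1k1k_ERROR"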
@staticmethod
def cleanup_result_dir(result_dir: str) -> bool:
"""Clean up result directory.
Args:
result_dir: Result directory path
Returns:
True if successful, False otherwise
"""
if os.path.exists(result_dir):
try:
shutil.rmtree(result_dir)
logger.success(f"Result directory removed: {result_dir}")
return True
except Exception as e:
logger.warning(f"Failed to remove result directory: {e}")
return False
return True
@staticmethod
def get_result_dir(test_config) -> str:
"""Get result directory.
Args:
test_config: TestConfig object
Returns:
Result directory path
"""
config_data = test_config.config_data
fields = extract_config_fields(config_data)
# Extract fields for logging and result directory
log_base = fields["log_base"]
context_dir = fields["context_dir"]
log_dir_name = log_base
result_dir = os.path.join(EnvManager.get_script_dir(), log_dir_name, context_dir)
return result_dir
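# Sketch of the path assembly above (the field values and script directory
# below are hypothetical):
#
#     fields = {"log_base": "logs", "context_dir": "ctx_tp4_gen_tp4"}
#     result_dir = os.path.join("/scripts", fields["log_base"], fields["context_dir"])
#     # -> "/scripts/logs/ctx_tp4_gen_tp4"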
@staticmethod
def check_result(
job_id: str,
test_config,
timestamps: Optional[Dict[str, str]] = None,
test_name: Optional[str] = None,
) -> Dict[str, Any]:
"""Check job execution result and generate report.
High-level method that automatically extracts parameters from TestConfig,
parses logs, generates performance reports, and saves results to CSV.
Note: backup_logs should be called separately by the caller (test_disagg.py).
Args:
job_id: SLURM job ID
test_config: TestConfig object containing configuration
timestamps: Optional timestamps dict for the test case
test_name: Optional test name for reporting
Returns:
Dict with 'success' status and other result information
"""
config_data = test_config.config_data
# Get result directory
result_dir = JobManager.get_result_dir(test_config)
logger.info(f"Result directory: {result_dir}")
# Initialize default result in case of exception
check_result = {"job_id": job_id, "status": "ERROR", "success": False}
try:
# Call the internal implementation method
check_result = JobManager._check_job_result(
job_id=job_id,
test_category=test_config.test_category, # Pass test category for routing
benchmark_type=test_config.benchmark_type,
config=config_data,
metrics_config=test_config.metrics_config,
accuracy_config=test_config.accuracy_config, # Pass accuracy config
model_name=test_config.model_name,
result_dir=result_dir,
timestamps=timestamps,
test_name=test_name,
)
except Exception as e:
logger.error(f"Exception during result checking: {e}")
check_result["error"] = f"Exception during result checking: {str(e)}"
# Clean up result directory
if EnvManager.get_debug_mode():
logger.debug(f"Debug mode: Skipping result directory cleanup: {result_dir}")
else:
try:
JobManager.cleanup_result_dir(result_dir)
except Exception as e:
logger.warning(f"Failed to cleanup result directory: {e}")
return check_result
@staticmethod
def check_for_early_failure(job_id: str, test_config) -> tuple[bool, Optional[str]]:
"""Check logs for early failure indicators.
Args:
job_id: SLURM job ID
test_config: TestConfig object
Returns:
tuple: (has_error, error_message)
"""
# Key error patterns
error_patterns = [
(
r"\[E\]\s+Traceback[\s\S]*?mpi4py\.MPI\.Comm\.allgather",
"MPI communication error detected",
),
(
r"\[E\]\s+Traceback[\s\S]*?pickle data was truncated",
"Pickle serialization error detected",
),
]
try:
# Check gen and ctx server logs if result_dir exists
try:
result_dir = JobManager.get_result_dir(test_config)
if os.path.exists(result_dir):
# Find all output_gen_*.log and output_ctx_*.log files
for filename in os.listdir(result_dir):
is_gen_log = filename.startswith("output_gen_")
is_ctx_log = filename.startswith("output_ctx_")
if (is_gen_log or is_ctx_log) and filename.endswith(".log"):
log_path = os.path.join(result_dir, filename)
try:
with open(log_path, "r", encoding="utf-8", errors="ignore") as f:
# Read last 100KB
f.seek(0, 2)
file_size = f.tell()
f.seek(max(0, file_size - 102400), 0)
recent_content = f.read()
for pattern, error_msg in error_patterns:
if re.search(pattern, recent_content, re.MULTILINE):
return True, f"{error_msg} in {filename}"
except Exception as e:
logger.warning(f"Failed to check {filename}: {e}")
except Exception:
# result_dir might not exist yet, that's OK
pass
except Exception as e:
logger.warning(f"Error during early failure check: {e}")
return False, None
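# Sketch of the tail-read-and-match approach used above, applied to a
# hypothetical log snippet:
#
#     snippet = "[E] Traceback (most recent call last):\n  ...\n  pickle data was truncated"
#     pattern = r"\[E\]\s+Traceback[\s\S]*?pickle data was truncated"
#     if re.search(pattern, snippet, re.MULTILINE):
#         print("Pickle serialization error detected")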
@staticmethod
def check_job_exists(job_id: str) -> bool:
"""Check if job still exists in SLURM queue.
Returns:
True if job exists (running or pending), False if job is gone
"""
try:
# Use squeue to check if job exists in queue
squeue_output = exec_cmd_with_output(["squeue", "-j", job_id, "--noheader"], timeout=30)
# If output is not empty, job exists
return bool(squeue_output.strip())
except Exception as e:
# If command fails, assume job doesn't exist
logger.debug(f"squeue check failed (job likely finished): {e}")
return False
@staticmethod
def wait_for_completion(
job_id: str, timeout: int = 3600, test_config=None, check_early_failure: bool = True
) -> None:
"""Wait for job to finish (disappear from queue).
Simplified logic: Just wait until job no longer exists in SLURM queue,
regardless of final status (COMPLETED, CANCELLED, FAILED, etc).
If timeout or early failure detected, cancel the job.
The actual success/failure will be determined by log file parsing.
Args:
job_id: SLURM job ID
timeout: Maximum wait time in seconds
test_config: TestConfig object (required for early failure detection)
check_early_failure: Whether to check logs for early failures
"""
start_time = time.time()
check_interval = 180 # Check every 3 minutes
failure_check_interval = 60 # Check for failures every 60 seconds
last_failure_check = start_time
# Wait for job to appear in system (initial delay)
logger.info(f"Waiting for job {job_id} to start...")
time.sleep(60) # Initial wait for job to be scheduled
logger.info(f"Waiting for job {job_id} to finish...")
while time.time() - start_time < timeout:
# Simple check: does job still exist?
job_exists = JobManager.check_job_exists(job_id)
if not job_exists:
# Job has disappeared from queue - it's done (whatever the status was)
logger.success(f"Job {job_id} finished (no longer in queue)")
return
# Check for early failures (only if test_config is provided)
current_time = time.time()
if (
check_early_failure
and test_config
and current_time - last_failure_check >= failure_check_interval
):
has_error, error_msg = JobManager.check_for_early_failure(job_id, test_config)
if has_error:
logger.error(f"Early failure detected: {error_msg}")
logger.warning(f"Cancelling job {job_id} due to early failure")
JobManager.cancel_job(job_id)
# Wait a bit for job to be cancelled, then return
time.sleep(10)
return
last_failure_check = current_time
time.sleep(check_interval)
# Timeout - cancel the job
logger.warning(f"Job {job_id} timeout after {timeout} seconds, cancelling...")
JobManager.cancel_job(job_id)
# Wait a bit for job to be cancelled
time.sleep(10)
@staticmethod
def cancel_job(job_id: str) -> bool:
"""Cancel job."""
try:
exec_cmd(["scancel", job_id], timeout=30)
logger.warning(f"Job cancelled: {job_id}")
return True
except Exception as e:
logger.error(f"Job cancellation failed: {e}")
return False
@staticmethod
def _print_logs_to_console(job_id: str, result_dir: str) -> None:
"""Print SLURM log and all .log/.yaml files in result_dir to console.
Args:
job_id: SLURM job ID for finding the slurm log file
result_dir: Result directory containing log and config files
"""
# Print the slurm log to console (check if exists first)
slurm_log_path = os.path.join(EnvManager.get_work_dir(), f"slurm-{job_id}.out")
if os.path.exists(slurm_log_path):
slurm_log_writer = LogWriter(EnvManager.get_work_dir())
slurm_log_writer.print_to_console(f"slurm-{job_id}.out")
else:
logger.warning(f"SLURM log file not found: {slurm_log_path}")
# Print all .log and .yaml files in result_dir (except output_server.log)
if not os.path.exists(result_dir):
logger.warning(f"Result directory not found: {result_dir}")
return
log_writer = LogWriter(result_dir)
files_to_print = []
for file in os.listdir(result_dir):
if (file.endswith(".log") or file.endswith(".yaml")) and file != "output_server.log":
files_to_print.append(file)
# Sort files for consistent output order
files_to_print.sort()
for file in files_to_print:
file_path = os.path.join(result_dir, file)
if os.path.exists(file_path):
log_writer.print_to_console(file)
else:
logger.warning(f"Log file not found: {file}")
@staticmethod
def _check_accuracy_result(
job_id: str,
metrics_config,
accuracy_config,
result_dir: str,
) -> Dict[str, Any]:
"""Check accuracy test result.
Args:
job_id: SLURM job ID
metrics_config: MetricsConfig object
accuracy_config: AccuracyConfig object
result_dir: Result directory
Returns:
Dict with success status and accuracy details
"""
# Initialize base result
result: Dict[str, Any] = {"job_id": job_id, "status": "UNKNOWN", "success": False}
# Validate accuracy_config
if not accuracy_config:
result["error"] = "Accuracy config not found in test configuration"
return result
# Check if result_dir exists
if not os.path.exists(result_dir):
error_msg = f"Result directory not found: {result_dir}"
logger.error(error_msg)
result["error"] = error_msg
return result
# Check if required log file exists (7_accuracy_eval.log)
accuracy_log = os.path.join(result_dir, "7_accuracy_eval.log")
if not os.path.exists(accuracy_log):
error_msg = f"Accuracy evaluation log file not found: {accuracy_log}"
logger.error(error_msg)
result["error"] = error_msg
return result
# Import and use AccuracyParser
from reporting.accuracy_parser import AccuracyParser
accuracy_parser = AccuracyParser(metrics_config, accuracy_config, result_dir)
validation_result = accuracy_parser.parse_and_validate()
# Check if parsing succeeded
if not validation_result["success"]:
result["error"] = validation_result.get("error", "Accuracy validation failed")
return result
# Log validation results
logger.info("Accuracy Validation Results:")
all_passed = validation_result["all_passed"]
# Log results for each run (using dataclass attributes for type safety)
for run_validation in validation_result.get("runs", []):
run_name = run_validation.run_name
run_passed = run_validation.all_passed
status = "PASSED" if run_passed else "FAILED"
logger.info(f"[{status}] {run_name}:")
for ds_result in run_validation.results:
status = "PASSED" if ds_result.passed else "FAILED"
dataset_name = ds_result.dataset
filter_type = ds_result.filter
threshold_type = ds_result.threshold_type
logger.info(f" [{status}] {dataset_name} ({filter_type}) - {threshold_type}:")
if ds_result.error:
logger.error(f" Error: {ds_result.error}")
else:
logger.info(f" Expected: {ds_result.expected:.4f}")
logger.info(f" Actual: {ds_result.actual:.4f}")
logger.info(f" Threshold type: {ds_result.threshold_type}")
logger.info(f" {ds_result.message}")
# Set result status
if all_passed:
logger.success("All accuracy tests PASSED (all runs)")
result["success"] = True
result["status"] = "PASSED"
else:
logger.failure("Some accuracy tests FAILED")
result["success"] = False
result["status"] = "FAILED"
result["error"] = "Some accuracy tests FAILED"
# Add detailed results
result["all_passed"] = validation_result["all_passed"]
result["accuracy_runs"] = validation_result["runs"]
result["raw_accuracy"] = validation_result.get("raw_results", [])
return result
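# Illustrative sketch of the result dict shape returned above (the values are
# hypothetical):
#
#     {
#         "job_id": "1234567",
#         "status": "PASSED",
#         "success": True,
#         "all_passed": True,
#         "accuracy_runs": [...],   # per-run validation objects from AccuracyParser
#         "raw_accuracy": [...],
#     }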
@staticmethod
def _check_perf_result(
job_id: str,
benchmark_type: str,
config: dict,
metrics_config,
model_name: str,
result_dir: str,
timestamps: Optional[Dict[str, str]] = None,
test_name: Optional[str] = None,
) -> Dict[str, Any]:
"""Check performance test result.
Args:
job_id: SLURM job ID
benchmark_type: Benchmark type (1k1k, 8k1k, etc.)
config: Configuration dict (YAML data)
metrics_config: MetricsConfig object
model_name: Model name
result_dir: Result directory
timestamps: Optional timestamps dict
test_name: Optional test name
Returns:
Dict with success status and performance details
"""
result = {"job_id": job_id, "status": "UNKNOWN", "success": False}
# Check if result_dir exists
if not os.path.exists(result_dir):
error_msg = f"Result directory not found: {result_dir}"
logger.error(error_msg)
result["error"] = error_msg
return result
# Check if required log file exists (6_bench.log)
bench_log = os.path.join(result_dir, "6_bench.log")
if not os.path.exists(bench_log):
error_msg = f"Benchmark log file not found: {bench_log}"
logger.error(error_msg)
result["error"] = error_msg
return result
# Parse metrics and save to CSV
log_parser = LogParser(benchmark_type, config, metrics_config, result_dir)
parse_result = log_parser.parse(model_name, timestamps=timestamps, test_name=test_name)
if not parse_result["status"]:
result["error"] = "Failed to parse benchmark logs"
return result
# Check if df is None
result_df = parse_result.get("df")
if result_df is None:
logger.error("Parse result contains None DataFrame")
result["error"] = "Parse result contains None DataFrame"
return result
# Save results to CSV
output_path = EnvManager.get_output_path()
os.makedirs(output_path, exist_ok=True)
output_csv = os.path.join(output_path, "perf_script_test_results.csv")
result_saver = ResultSaver(output_csv)
result_saver.append_a_df(result_df)
result["success"] = True
result["status"] = "SUCCESS"
return result
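# Sketch of the reporting flow above (the model name, benchmark type, and paths
# below are hypothetical):
#
#     log_parser = LogParser("1k1k", config, metrics_config, "/logs/ctx_tp4_gen_tp4")
#     parsed = log_parser.parse("my-model", timestamps=None, test_name=None)
#     if parsed["status"] and parsed.get("df") is not None:
#         ResultSaver("/output/perf_script_test_results.csv").append_a_df(parsed["df"])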
@staticmethod
def _check_job_result(
job_id: str,
test_category: str,
benchmark_type: str,
config: dict,
metrics_config,
accuracy_config,
model_name: str,
result_dir: str,
timestamps: Optional[Dict[str, str]] = None,
test_name: Optional[str] = None,
) -> Dict[str, Any]:
"""Internal method: Check job result with category routing.
This is a low-level method that requires manual parameter extraction.
Use check_result() for a high-level interface with TestConfig.
Args:
job_id: SLURM job ID
test_category: Test category ("perf" or "accuracy")
benchmark_type: Benchmark type (1k1k, 8k1k, etc.)
config: Configuration dict (YAML data)
metrics_config: MetricsConfig object (default or custom)
accuracy_config: AccuracyConfig object (required for accuracy tests)
model_name: Model name
result_dir: Result directory
timestamps: Optional timestamps dict
test_name: Optional test name
Returns:
Dict with success status and details
"""
logger.info(f"Checking result directory: {result_dir}")
# Print logs and config files to console
JobManager._print_logs_to_console(job_id, result_dir)
# Route based on test_category
if test_category == "accuracy":
return JobManager._check_accuracy_result(
job_id=job_id,
metrics_config=metrics_config,
accuracy_config=accuracy_config,
result_dir=result_dir,
)
else: # perf
return JobManager._check_perf_result(
job_id=job_id,
benchmark_type=benchmark_type,
config=config,
metrics_config=metrics_config,
model_name=model_name,
result_dir=result_dir,
timestamps=timestamps,
test_name=test_name,
)
# Module-level executor function (keeps the original run_job interface)
run_job = make_slurm_run_command()
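# End-to-end usage sketch of the JobManager flow as driven by a caller such as
# test_disagg.py (test_config and the timeout value are hypothetical):
#
#     ok, job_id = JobManager.submit_job(test_config)
#     if ok:
#         JobManager.wait_for_completion(job_id, timeout=7200, test_config=test_config)
#         result = JobManager.check_result(job_id, test_config)
#         # The caller (test_disagg.py) is also responsible for calling backup_logs().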