[None][chore] Fix slurm job name (#11265)

Signed-off-by: yingguo-trt <244492186+yingguo-trt@users.noreply.github.com>
Signed-off-by: Wangshanshan <30051912+dominicshanshan@users.noreply.github.com>
This commit is contained in:
yingguo-trt 2026-02-05 14:07:24 +08:00 committed by Yanchao Lu
parent 17e6062690
commit 59b6bee7e6
2 changed files with 30 additions and 25 deletions

View File

@ -219,6 +219,7 @@ class JobManager:
# Write temporary config file with replaced environment variables
logger.info(f"Creating temporary config: {temp_config_path}")
with open(temp_config_path, "w") as f:
yaml.dump(
test_config.config_data,

View File

@ -2,49 +2,42 @@
import os
# GPU resource configuration - centralized config for all GPU-specific parameters.
#
# Each entry maps a GPU type name to a dict with these fields:
#   slurm_extra_args       - extra SLURM arguments ("" when none are required)
#   set_segment            - per-GPU boolean flag
#                            (NOTE(review): presumably toggles SLURM segment
#                            handling; confirm against the code that reads it)
#   lock_freq_graphics_mhz - GPU graphics clock lock frequency in MHz
#   lock_freq_memory_mhz   - GPU memory clock lock frequency in MHz
#
# TODO: set lock frequencies for the entries where they are still None.
GPU_RESOURCE_CONFIG = {
    "GB200": {  # OCI GB200
        "slurm_extra_args": "--gres=gpu:4",
        "set_segment": True,
        "lock_freq_graphics_mhz": 2062,
        "lock_freq_memory_mhz": 3996,
    },
    "GB200_LYRIS": {  # Lyris GB200
        "slurm_extra_args": "",
        "set_segment": True,
        "lock_freq_graphics_mhz": None,
        "lock_freq_memory_mhz": None,
    },
    "GB300": {  # Lyris GB300
        "slurm_extra_args": "",
        "set_segment": True,
        "lock_freq_graphics_mhz": None,
        "lock_freq_memory_mhz": None,
    },
    "H100": {
        "slurm_extra_args": "",
        "set_segment": False,
        "lock_freq_graphics_mhz": None,
        "lock_freq_memory_mhz": None,
    },
    "B200": {  # OCI B200
        "slurm_extra_args": "--gres=gpu:4",
        "set_segment": False,
        "lock_freq_graphics_mhz": None,
        "lock_freq_memory_mhz": None,
    },
    "B300": {  # OCI B300
        "slurm_extra_args": "--gres=gpu:4",
        "set_segment": False,
        "lock_freq_graphics_mhz": None,
        "lock_freq_memory_mhz": None,
    },
}
@ -67,7 +60,18 @@ class EnvManager:
@staticmethod
def get_slurm_job_name() -> str:
    """Return the SLURM job name, formatted as ``{SLURM_ACCOUNT}-{base}``.

    ``base`` is read from the ``SLURM_JOB_BASE_NAME`` environment variable
    and defaults to ``unified.benchmark``.

    Example: ``myaccount-unified.benchmark``

    Returns:
        ``"{account}-{base}"`` when the account is set and is not a
        placeholder; otherwise just ``base``.
    """
    account = EnvManager.get_slurm_account()
    base = os.getenv("SLURM_JOB_BASE_NAME", "unified.benchmark")
    # Only use the account as a prefix if it is set and not a placeholder;
    # a value starting with "<" is treated as an unfilled placeholder.
    if account and not account.startswith("<"):
        return f"{account}-{base}"
    return base
@staticmethod
def get_slurm_set_segment() -> bool: