From 59b6bee7e600b37fe32d84411fde29b03a68b8cb Mon Sep 17 00:00:00 2001
From: yingguo-trt <244492186+yingguo-trt@users.noreply.github.com>
Date: Thu, 5 Feb 2026 14:07:24 +0800
Subject: [PATCH] [None][chore] Fix slurm job name (#11265)

Signed-off-by: yingguo-trt <244492186+yingguo-trt@users.noreply.github.com>
Signed-off-by: Wangshanshan <30051912+dominicshanshan@users.noreply.github.com>
---
Reviewer notes (this area sits below the "---" separator and is not part of
the commit message):

* executor.py: whitespace only. One blank line is added before the
  `with open(temp_config_path, "w")` block.
* common.py, GPU_RESOURCE_CONFIG: comments are merged, moved onto the key
  lines, or dropped. Every key and every value in the dict is unchanged.
* common.py, EnvManager.get_slurm_job_name(): the function no longer reads
  SLURM_JOB_NAME. It now returns "{account}-{base}", where `account` comes
  from EnvManager.get_slurm_account() and `base` from SLURM_JOB_BASE_NAME.
  The default base is "unified.benchmark" (the old default job name was
  "unified-benchmark"). When `account` is empty or starts with "<", the
  prefix is omitted and only `base` is returned.
* NOTE(review): EnvManager.get_slurm_account() is not shown in this patch;
  presumably it returns a str (possibly empty or a "<...>" placeholder).
  Confirm against the target tree.
* NOTE(review): leading indentation of the hunk lines below was restored by
  convention. Run `git apply --check` before applying.

 .../defs/perf/disagg/execution/executor.py    |  1 +
 .../defs/perf/disagg/utils/common.py          | 54 ++++++++++---------
 2 files changed, 30 insertions(+), 25 deletions(-)

diff --git a/tests/integration/defs/perf/disagg/execution/executor.py b/tests/integration/defs/perf/disagg/execution/executor.py
index b5e2d9a83f..68ac6166c7 100644
--- a/tests/integration/defs/perf/disagg/execution/executor.py
+++ b/tests/integration/defs/perf/disagg/execution/executor.py
@@ -219,6 +219,7 @@ class JobManager:
 
             # Write temporary config file with replaced environment variables
             logger.info(f"Creating temporary config: {temp_config_path}")
+
             with open(temp_config_path, "w") as f:
                 yaml.dump(
                     test_config.config_data,
diff --git a/tests/integration/defs/perf/disagg/utils/common.py b/tests/integration/defs/perf/disagg/utils/common.py
index ec09e225d2..f119102052 100644
--- a/tests/integration/defs/perf/disagg/utils/common.py
+++ b/tests/integration/defs/perf/disagg/utils/common.py
@@ -2,49 +2,42 @@
 
 import os
 
-# GPU resource configuration
-# Centralized configuration for all GPU-specific parameters
+# GPU resource configuration - centralized config for all GPU-specific parameters
 GPU_RESOURCE_CONFIG = {
-    # OCI GB200
-    "GB200": {
-        "slurm_extra_args": "--gres=gpu:4",  # SLURM extra arguments (empty string if not required)
+    "GB200": {  # OCI GB200
+        "slurm_extra_args": "--gres=gpu:4",
         "set_segment": True,
-        "lock_freq_graphics_mhz": 2062,  # GPU graphics clock lock frequency (MHz)
-        "lock_freq_memory_mhz": 3996,  # GPU memory clock lock frequency (MHz)
+        "lock_freq_graphics_mhz": 2062,
+        "lock_freq_memory_mhz": 3996,
     },
-    # Lyris GB200
-    "GB200_LYRIS": {
-        "slurm_extra_args": "",  # GB300 does not require extra args
+    "GB200_LYRIS": {  # Lyris GB200
+        "slurm_extra_args": "",
         "set_segment": True,
-        "lock_freq_graphics_mhz": None,  # TODO: Set GB300 lock frequency
+        "lock_freq_graphics_mhz": None,
         "lock_freq_memory_mhz": None,
     },
-    # Lyris GB300
-    "GB300": {
-        "slurm_extra_args": "",  # GB300 does not require extra args
+    "GB300": {  # Lyris GB300
+        "slurm_extra_args": "",
         "set_segment": True,
-        "lock_freq_graphics_mhz": None,  # TODO: Set GB300 lock frequency
+        "lock_freq_graphics_mhz": None,
         "lock_freq_memory_mhz": None,
     },
-    # H100
     "H100": {
-        "slurm_extra_args": "",  # H100 does not require extra args
+        "slurm_extra_args": "",
         "set_segment": False,
-        "lock_freq_graphics_mhz": None,  # TODO: Set H100 lock frequency
+        "lock_freq_graphics_mhz": None,
         "lock_freq_memory_mhz": None,
     },
-    # B200
-    "B200": {
+    "B200": {  # OCI B200
         "slurm_extra_args": "--gres=gpu:4",
         "set_segment": False,
-        "lock_freq_graphics_mhz": None,  # TODO: Set B200 lock frequency
+        "lock_freq_graphics_mhz": None,
         "lock_freq_memory_mhz": None,
     },
-    # B300
-    "B300": {
+    "B300": {  # OCI B300
         "slurm_extra_args": "--gres=gpu:4",
         "set_segment": False,
-        "lock_freq_graphics_mhz": None,  # TODO: Set B300 lock frequency
+        "lock_freq_graphics_mhz": None,
         "lock_freq_memory_mhz": None,
     },
 }
@@ -67,7 +60,18 @@ class EnvManager:
 
     @staticmethod
     def get_slurm_job_name() -> str:
-        return os.getenv("SLURM_JOB_NAME", "unified-benchmark")
+        """Get SLURM job name: {SLURM_ACCOUNT}-{base}.
+
+        Example: myaccount-unified.benchmark
+        Customize base via SLURM_JOB_BASE_NAME env var (default: unified.benchmark)
+        """
+        account = EnvManager.get_slurm_account()
+        base = os.getenv("SLURM_JOB_BASE_NAME", "unified.benchmark")
+
+        # Only use account as prefix if it's set and not a placeholder
+        if account and not account.startswith("<"):
+            return f"{account}-{base}"
+        return base
 
     @staticmethod
     def get_slurm_set_segment() -> bool: