mirror of
https://github.com/NVIDIA/TensorRT-LLM.git
synced 2026-02-18 16:55:08 +08:00
[None][chore] Fix slurm job name (#11265)
Signed-off-by: yingguo-trt <244492186+yingguo-trt@users.noreply.github.com> Signed-off-by: Wangshanshan <30051912+dominicshanshan@users.noreply.github.com>
This commit is contained in:
parent
17e6062690
commit
59b6bee7e6
@ -219,6 +219,7 @@ class JobManager:
|
||||
|
||||
# Write temporary config file with replaced environment variables
|
||||
logger.info(f"Creating temporary config: {temp_config_path}")
|
||||
|
||||
with open(temp_config_path, "w") as f:
|
||||
yaml.dump(
|
||||
test_config.config_data,
|
||||
|
||||
@ -2,49 +2,42 @@
|
||||
|
||||
import os
|
||||
|
||||
# GPU resource configuration
|
||||
# Centralized configuration for all GPU-specific parameters
|
||||
# GPU resource configuration - centralized config for all GPU-specific parameters
|
||||
GPU_RESOURCE_CONFIG = {
|
||||
# OCI GB200
|
||||
"GB200": {
|
||||
"slurm_extra_args": "--gres=gpu:4", # SLURM extra arguments (empty string if not required)
|
||||
"GB200": { # OCI GB200
|
||||
"slurm_extra_args": "--gres=gpu:4",
|
||||
"set_segment": True,
|
||||
"lock_freq_graphics_mhz": 2062, # GPU graphics clock lock frequency (MHz)
|
||||
"lock_freq_memory_mhz": 3996, # GPU memory clock lock frequency (MHz)
|
||||
"lock_freq_graphics_mhz": 2062,
|
||||
"lock_freq_memory_mhz": 3996,
|
||||
},
|
||||
# Lyris GB200
|
||||
"GB200_LYRIS": {
|
||||
"slurm_extra_args": "", # GB300 does not require extra args
|
||||
"GB200_LYRIS": { # Lyris GB200
|
||||
"slurm_extra_args": "",
|
||||
"set_segment": True,
|
||||
"lock_freq_graphics_mhz": None, # TODO: Set GB300 lock frequency
|
||||
"lock_freq_graphics_mhz": None,
|
||||
"lock_freq_memory_mhz": None,
|
||||
},
|
||||
# Lyris GB300
|
||||
"GB300": {
|
||||
"slurm_extra_args": "", # GB300 does not require extra args
|
||||
"GB300": { # Lyris GB300
|
||||
"slurm_extra_args": "",
|
||||
"set_segment": True,
|
||||
"lock_freq_graphics_mhz": None, # TODO: Set GB300 lock frequency
|
||||
"lock_freq_graphics_mhz": None,
|
||||
"lock_freq_memory_mhz": None,
|
||||
},
|
||||
# H100
|
||||
"H100": {
|
||||
"slurm_extra_args": "", # H100 does not require extra args
|
||||
"slurm_extra_args": "",
|
||||
"set_segment": False,
|
||||
"lock_freq_graphics_mhz": None, # TODO: Set H100 lock frequency
|
||||
"lock_freq_graphics_mhz": None,
|
||||
"lock_freq_memory_mhz": None,
|
||||
},
|
||||
# B200
|
||||
"B200": {
|
||||
"B200": { # OCI B200
|
||||
"slurm_extra_args": "--gres=gpu:4",
|
||||
"set_segment": False,
|
||||
"lock_freq_graphics_mhz": None, # TODO: Set B200 lock frequency
|
||||
"lock_freq_graphics_mhz": None,
|
||||
"lock_freq_memory_mhz": None,
|
||||
},
|
||||
# B300
|
||||
"B300": {
|
||||
"B300": { # OCI B300
|
||||
"slurm_extra_args": "--gres=gpu:4",
|
||||
"set_segment": False,
|
||||
"lock_freq_graphics_mhz": None, # TODO: Set B300 lock frequency
|
||||
"lock_freq_graphics_mhz": None,
|
||||
"lock_freq_memory_mhz": None,
|
||||
},
|
||||
}
|
||||
@ -67,7 +60,18 @@ class EnvManager:
|
||||
|
||||
@staticmethod
|
||||
def get_slurm_job_name() -> str:
|
||||
return os.getenv("SLURM_JOB_NAME", "unified-benchmark")
|
||||
"""Get SLURM job name: {SLURM_ACCOUNT}-{base}.
|
||||
|
||||
Example: myaccount-unified.benchmark
|
||||
Customize base via SLURM_JOB_BASE_NAME env var (default: unified.benchmark)
|
||||
"""
|
||||
account = EnvManager.get_slurm_account()
|
||||
base = os.getenv("SLURM_JOB_BASE_NAME", "unified.benchmark")
|
||||
|
||||
# Only use account as prefix if it's set and not a placeholder
|
||||
if account and not account.startswith("<"):
|
||||
return f"{account}-{base}"
|
||||
return base
|
||||
|
||||
@staticmethod
|
||||
def get_slurm_set_segment() -> bool:
|
||||
|
||||
Loading…
Reference in New Issue
Block a user