[None][feat] support multi_acc and Lyris GB200 test (#11024)

Signed-off-by: yingguo-trt <244492186+yingguo-trt@users.noreply.github.com>
This commit is contained in:
yingguo-trt 2026-01-28 19:01:48 +08:00 committed by GitHub
parent 29647d9446
commit e70a55bd94
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
62 changed files with 183 additions and 216 deletions

View File

@ -23,14 +23,19 @@ echo "=========================================="
echo "Output path: $OUTPUT_PATH"
echo ""
# Show pytest PID if available (for debugging)
# Terminate pytest process if still running
if [ -f "$PID_FILE" ]; then
PYTEST_PID=$(cat "$PID_FILE" | tr -d '\n')
echo "Pytest PID: $PYTEST_PID"
# Check if pytest is still running
# Check if pytest is still running and kill it
if kill -0 "$PYTEST_PID" 2>/dev/null; then
echo "Status: Still running"
echo "Status: Still running - terminating..."
if kill -9 "$PYTEST_PID" 2>/dev/null; then
echo " [OK] Process killed"
else
echo " [WARN] Failed to kill process (may already be gone)"
fi
else
echo "Status: Already terminated"
fi

View File

@ -620,15 +620,8 @@ class JobManager:
result["error"] = error_msg
return result
# Check if required log file exists (7_accuracy_eval.log)
accuracy_log = os.path.join(result_dir, "7_accuracy_eval.log")
if not os.path.exists(accuracy_log):
error_msg = f"Accuracy evaluation log file not found: {accuracy_log}"
logger.error(error_msg)
result["error"] = error_msg
return result
# Import and use AccuracyParser
# Note: AccuracyParser handles log file checking with glob pattern support
from reporting.accuracy_parser import AccuracyParser
accuracy_parser = AccuracyParser(metrics_config, accuracy_config, result_dir)

View File

@ -1,5 +1,6 @@
"""Accuracy test result parser."""
import glob
import os
import re
from typing import Dict, List
@ -28,40 +29,77 @@ class AccuracyParser:
self.result_dir = result_dir
def parse_and_validate(self) -> AccuracyValidationResult:
"""Parse accuracy_eval.log and validate all configured datasets for all runs.
"""Parse accuracy_eval.log(s) and validate all configured datasets for all runs.
Supports multiple runs (e.g., pre-benchmark and post-benchmark).
Supports wildcard patterns in log_file (e.g., "7_accuracy_eval_*.log").
All runs must pass for the validation to succeed.
Returns:
AccuracyValidationResult with validation results for all runs
"""
log_file = os.path.join(self.result_dir, self.metrics_config.log_file)
log_pattern = self.metrics_config.log_file
if not os.path.exists(log_file):
# Check if pattern contains wildcards
if "*" in log_pattern or "?" in log_pattern or "[" in log_pattern:
# Use glob to match multiple files
log_files = sorted(glob.glob(os.path.join(self.result_dir, log_pattern)))
if not log_files:
return {
"success": False,
"all_passed": False,
"runs": [],
"raw_results": [],
"error": f"No log files found matching pattern: {log_pattern}",
}
logger.info(f"Found {len(log_files)} log file(s) matching pattern '{log_pattern}'")
else:
# Single file (backward compatible)
log_file = os.path.join(self.result_dir, log_pattern)
if not os.path.exists(log_file):
return {
"success": False,
"all_passed": False,
"runs": [],
"raw_results": [],
"error": f"Log file not found: {log_file}",
}
log_files = [log_file]
# Read and merge all log files
combined_log_content = ""
failed_files = []
for log_file in log_files:
try:
with open(log_file, "r", encoding="utf-8", errors="replace") as f:
content = f.read()
combined_log_content += content
if not content.endswith("\n"):
combined_log_content += "\n" # Ensure separation between files
logger.info(f"Successfully read log file: {os.path.basename(log_file)}")
except Exception as e:
failed_files.append((log_file, str(e)))
logger.warning(f"Failed to read {log_file}: {e}")
if not combined_log_content:
error_msg = "No valid log content found."
if failed_files:
error_msg += f" Failed files: {failed_files}"
return {
"success": False,
"all_passed": False,
"runs": [],
"raw_results": [],
"error": f"Log file not found: {log_file}",
}
# Read log file
try:
with open(log_file, "r", encoding="utf-8", errors="replace") as f:
log_content = f.read()
except Exception as e:
return {
"success": False,
"all_passed": False,
"runs": [],
"raw_results": [],
"error": f"Failed to read log file: {e}",
"error": error_msg,
}
# Extract accuracy values for all runs
all_runs_results = self._extract_accuracy_values(log_content)
all_runs_results = self._extract_accuracy_values(combined_log_content)
if not all_runs_results:
return {

View File

@ -135,7 +135,7 @@ DATASET_DEFAULTS = {
"higher_is_better": True,
},
# Alias for gpqa_diamond (same task, different naming convention)
"gpqa_diamond_cot_zeroshot": {
"gpqa_diamond_local": {
"alpha": 0.05,
"beta": 0.2,
"sigma": 50,

View File

@ -42,9 +42,6 @@ profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
model: local-completions
tasks: gsm8k
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
tensor_parallel_size: 16

View File

@ -42,9 +42,6 @@ profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
model: local-completions
tasks: gsm8k
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
tensor_parallel_size: 16

View File

@ -42,9 +42,6 @@ profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
model: local-completions
tasks: gsm8k
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
tensor_parallel_size: 32

View File

@ -42,9 +42,6 @@ profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
model: local-completions
tasks: gsm8k
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
tensor_parallel_size: 32

View File

@ -42,9 +42,6 @@ profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
model: local-completions
tasks: gsm8k
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
tensor_parallel_size: 8

View File

@ -42,9 +42,6 @@ profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
model: local-completions
tasks: gsm8k
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
tensor_parallel_size: 8

View File

@ -42,9 +42,6 @@ profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
model: local-completions
tasks: gsm8k
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
tensor_parallel_size: 16

View File

@ -42,9 +42,6 @@ profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
model: local-completions
tasks: gsm8k
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
tensor_parallel_size: 16

View File

@ -43,9 +43,6 @@ profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
model: local-completions
tasks: gsm8k
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
tensor_parallel_size: 4

View File

@ -43,9 +43,6 @@ profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
model: local-completions
tasks: gsm8k
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
tensor_parallel_size: 4

View File

@ -43,9 +43,6 @@ profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
model: local-completions
tasks: gsm8k
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
tensor_parallel_size: 32

View File

@ -42,9 +42,6 @@ profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
model: local-completions
tasks: gsm8k
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
tensor_parallel_size: 32

View File

@ -43,9 +43,6 @@ profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
model: local-completions
tasks: gsm8k
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
tensor_parallel_size: 8

View File

@ -42,9 +42,6 @@ profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
model: local-completions
tasks: gsm8k
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
tensor_parallel_size: 8

View File

@ -42,9 +42,6 @@ profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
model: local-completions
tasks: gsm8k
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
tensor_parallel_size: 8

View File

@ -42,9 +42,6 @@ profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
model: local-completions
tasks: gsm8k
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
tensor_parallel_size: 8

View File

@ -42,9 +42,6 @@ profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
model: local-completions
tasks: gsm8k
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
tensor_parallel_size: 8

View File

@ -42,9 +42,6 @@ profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
model: local-completions
tasks: gsm8k
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
tensor_parallel_size: 16

View File

@ -42,9 +42,6 @@ profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
model: local-completions
tasks: gsm8k
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
tensor_parallel_size: 16

View File

@ -43,9 +43,6 @@ profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
model: local-completions
tasks: gsm8k
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
tensor_parallel_size: 32

View File

@ -42,9 +42,6 @@ profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
model: local-completions
tasks: gsm8k
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
tensor_parallel_size: 8

View File

@ -42,9 +42,6 @@ profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
model: local-completions
tasks: gsm8k
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
tensor_parallel_size: 8

View File

@ -42,9 +42,6 @@ profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
model: local-completions
tasks: gsm8k
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
tensor_parallel_size: 8

View File

@ -42,9 +42,6 @@ profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
model: local-completions
tasks: gsm8k
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
tensor_parallel_size: 8

View File

@ -42,9 +42,6 @@ profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
model: local-completions
tasks: gsm8k
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
tensor_parallel_size: 16

View File

@ -42,9 +42,6 @@ profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
model: local-completions
tasks: gsm8k
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
tensor_parallel_size: 16

View File

@ -42,9 +42,6 @@ profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
model: local-completions
tasks: gsm8k
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
tensor_parallel_size: 32

View File

@ -42,9 +42,6 @@ profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
model: local-completions
tasks: gsm8k
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
tensor_parallel_size: 32

View File

@ -50,9 +50,14 @@ profiling:
nsys_on: false
accuracy:
enable_accuracy_test: true
model: local-completions
tasks: gsm8k
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
env_var:
HF_HOME: <hf_home_path>
tasks:
gsm8k:
model: "local-completions"
model_args_extra: "num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=7200,max_gen_toks=16384"
extra_kwargs:
trust_remote_code: true
worker_config:
gen:
enable_layerwise_nvtx_marker: true

View File

@ -47,9 +47,14 @@ profiling:
nsys_on: false
accuracy:
enable_accuracy_test: true
model: local-completions
tasks: gsm8k
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
env_var:
HF_HOME: <hf_home_path>
tasks:
gsm8k:
model: "local-completions"
model_args_extra: "num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=7200,max_gen_toks=16384"
extra_kwargs:
trust_remote_code: true
worker_config:
gen:
enable_layerwise_nvtx_marker: true

View File

@ -10,7 +10,7 @@ metadata:
dataset_file: disagg_datasets/kimi-k2-1024-1024-20000-ratio-1_for_serve.json
accuracy:
datasets:
- dataset_name: gpqa_diamond_cot_zeroshot
- dataset_name: gpqa_diamond_local
expected_value: 0.65
threshold_type: hypothesis_test
filter_type: strict-match
@ -29,7 +29,7 @@ benchmark:
multi_round: 8
benchmark_ratio: 1.0
streaming: true
concurrency_list: '16384'
concurrency_list: '8192'
input_length: 1024
output_length: 1024
dataset_file: <dataset_file>

View File

@ -43,9 +43,6 @@ profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
model: local-completions
tasks: gsm8k
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
enable_layerwise_nvtx_marker: true

View File

@ -43,9 +43,6 @@ profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
model: local-completions
tasks: gsm8k
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
enable_layerwise_nvtx_marker: true

View File

@ -43,9 +43,6 @@ profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
model: local-completions
tasks: gsm8k
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
enable_layerwise_nvtx_marker: true

View File

@ -43,9 +43,6 @@ profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
model: local-completions
tasks: gsm8k
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
enable_layerwise_nvtx_marker: true

View File

@ -43,9 +43,6 @@ profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
model: local-completions
tasks: gsm8k
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
enable_layerwise_nvtx_marker: true

View File

@ -43,9 +43,6 @@ profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
model: local-completions
tasks: gsm8k
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
enable_layerwise_nvtx_marker: true

View File

@ -43,9 +43,6 @@ profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
model: local-completions
tasks: gsm8k
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
enable_layerwise_nvtx_marker: true

View File

@ -43,9 +43,6 @@ profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
model: local-completions
tasks: gsm8k
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
enable_layerwise_nvtx_marker: true

View File

@ -43,9 +43,6 @@ profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
model: local-completions
tasks: gsm8k
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
enable_layerwise_nvtx_marker: true

View File

@ -43,9 +43,6 @@ profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
model: local-completions
tasks: gsm8k
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
enable_layerwise_nvtx_marker: true

View File

@ -45,9 +45,6 @@ profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
model: local-completions
tasks: gsm8k
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
enable_layerwise_nvtx_marker: true

View File

@ -43,9 +43,6 @@ profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
model: local-completions
tasks: gsm8k
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
enable_layerwise_nvtx_marker: true

View File

@ -43,9 +43,6 @@ profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
model: local-completions
tasks: gsm8k
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
enable_layerwise_nvtx_marker: true

View File

@ -43,9 +43,6 @@ profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
model: local-completions
tasks: gsm8k
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
enable_layerwise_nvtx_marker: true

View File

@ -43,9 +43,6 @@ profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
model: local-completions
tasks: gsm8k
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
enable_layerwise_nvtx_marker: true

View File

@ -43,9 +43,6 @@ profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
model: local-completions
tasks: gsm8k
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
enable_layerwise_nvtx_marker: true

View File

@ -13,7 +13,7 @@ slurm:
script_file: disaggr_torch.slurm
partition: <partition>
account: <account>
job_time: 02:00:00
job_time: 03:00:00
job_name: unified-benchmark
extra_args: "--gres=gpu:4"
numa_bind: true
@ -44,9 +44,6 @@ profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false # Set to true to enable accuracy evaluation
model: local-completions
tasks: gsm8k
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
tensor_parallel_size: 32

View File

@ -13,7 +13,7 @@ slurm:
script_file: disaggr_torch.slurm
partition: <partition>
account: <account>
job_time: 02:00:00
job_time: 04:00:00
job_name: unified-benchmark
extra_args: "--gres=gpu:4"
numa_bind: true
@ -44,9 +44,6 @@ profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
model: local-completions
tasks: gsm8k
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
tensor_parallel_size: 16

View File

@ -14,7 +14,7 @@ slurm:
script_file: disaggr_torch.slurm
partition: <partition>
account: <account>
job_time: 02:00:00
job_time: 04:00:00
job_name: unified-benchmark
extra_args: "--gres=gpu:4"
numa_bind: true
@ -45,9 +45,6 @@ profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
model: local-completions
tasks: gsm8k
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
tensor_parallel_size: 48

View File

@ -13,7 +13,7 @@ slurm:
script_file: disaggr_torch.slurm
partition: <partition>
account: <account>
job_time: 02:00:00
job_time: 04:00:00
job_name: disaggr-test
extra_args: "--gres=gpu:4"
numa_bind: true
@ -44,9 +44,6 @@ profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
model: local-completions
tasks: gsm8k
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
tensor_parallel_size: 32

View File

@ -13,7 +13,7 @@ slurm:
script_file: disaggr_torch.slurm
partition: <partition>
account: <account>
job_time: 02:00:00
job_time: 04:00:00
job_name: unified-benchmark
extra_args: "--gres=gpu:4"
numa_bind: true
@ -44,9 +44,6 @@ profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
model: local-completions
tasks: gsm8k
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
tensor_parallel_size: 16

View File

@ -13,7 +13,7 @@ slurm:
script_file: disaggr_torch.slurm
partition: <partition>
account: <account>
job_time: 02:00:00
job_time: 04:00:00
job_name: unified-benchmark
extra_args: "--gres=gpu:4"
numa_bind: true
@ -44,9 +44,6 @@ profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
model: local-completions
tasks: gsm8k
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
tensor_parallel_size: 32

View File

@ -41,9 +41,6 @@ profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
model: local-completions
tasks: gsm8k
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
enable_layerwise_nvtx_marker: true

View File

@ -41,9 +41,6 @@ profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
model: local-completions
tasks: gsm8k
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
enable_layerwise_nvtx_marker: true

View File

@ -112,8 +112,8 @@ class TestDisaggBenchmark:
)
assert job_id, f"Failed to submit job for {test_config.test_id}\n{error_msg}"
# Wait for completion (timeout: 10 hours = 36000 seconds)
JobManager.wait_for_completion(job_id, 36000, test_config, check_early_failure=True)
# Wait for completion (timeout: 15 hours = 54000 seconds)
JobManager.wait_for_completion(job_id, 54000, test_config, check_early_failure=True)
# End tracking test case
test_tracker.end_test_case()
@ -188,8 +188,8 @@ class TestDisaggBenchmark:
# Validate submission result
assert job_id, f"Failed to get job_id for {test_config.test_id}"
# Wait for completion (timeout: 10 hours = 36000 seconds)
JobManager.wait_for_completion(job_id, 36000, test_config, check_early_failure=True)
# Wait for completion (timeout: 15 hours = 54000 seconds)
JobManager.wait_for_completion(job_id, 54000, test_config, check_early_failure=True)
# End tracking test case
test_tracker.end_test_case()
@ -272,8 +272,8 @@ class TestDisaggBenchmark:
)
assert job_id, f"Failed to submit job for {test_config.test_id}\n{error_msg}"
# Wait for completion (timeout: 10 hours = 36000 seconds)
JobManager.wait_for_completion(job_id, 36000, test_config, check_early_failure=True)
# Wait for completion (timeout: 15 hours = 54000 seconds)
JobManager.wait_for_completion(job_id, 54000, test_config, check_early_failure=True)
# End tracking test case
test_tracker.end_test_case()

View File

@ -12,7 +12,14 @@ GPU_RESOURCE_CONFIG = {
"lock_freq_graphics_mhz": 2062, # GPU graphics clock lock frequency (MHz)
"lock_freq_memory_mhz": 3996, # GPU memory clock lock frequency (MHz)
},
# OCI GB300
# Lyris GB200
"GB200_LYRIS": {
"slurm_extra_args": "", # GB200 does not require extra args
"set_segment": True,
"lock_freq_graphics_mhz": None, # TODO: Set GB200 lock frequency
"lock_freq_memory_mhz": None,
},
# Lyris GB300
"GB300": {
"slurm_extra_args": "", # GB300 does not require extra args
"set_segment": True,
@ -121,6 +128,10 @@ class EnvManager:
def get_dataset_dir() -> str:
return os.getenv("DATASET_DIR", "<Your dataset directory>")
@staticmethod
def get_hf_home_dir() -> str:
return os.getenv("HF_HOME_DIR", "<Your HF home directory>")
@staticmethod
def get_output_path() -> str:
output_path = os.getenv(

View File

@ -87,10 +87,10 @@ class AccuracyConfig:
# ============================================================================
# Accuracy test uses accuracy_eval.log (markdown table output from lm_eval)
# Note: Only log_file is used by AccuracyParser (accuracy_parser.py)
# The regex pattern is hardcoded in AccuracyParser._extract_accuracy_values()
# Note: submit.py generates separate log files for each task (e.g., 7_accuracy_eval_{task}.log)
# Use glob pattern to automatically match all accuracy log files
_COMMON_ACCURACY_METRICS = MetricsConfig(
log_file="7_accuracy_eval.log",
log_file="7_accuracy_eval_*.log",
extractor_pattern=r"\|([a-zA-Z0-9_-]+)\|.*?\|([\w-]+)\|.*?\|exact_match\|.*?\|([0-9.]+)\|",
metric_names=["flexible-extract", "strict-match"],
)
@ -148,7 +148,7 @@ DEFAULT_METRICS_CONFIG = {
"SERVER_MEDIAN_ITL",
"SERVER_P99_ITL",
"SERVER_MEAN_E2EL",
"SERVER_E2EL", # Median E2EL (keep the same name as disagg)
"SERVER_MEDIAN_E2EL", # Median E2EL (keep the same name as disagg)
"SERVER_P99_E2EL",
],
),
@ -230,6 +230,9 @@ class ConfigLoader:
if gpu_type is None:
gpu_type = EnvManager.get_gpu_type()
# GB200_LYRIS is also in the GB200 family
if gpu_type.startswith("GB200_"):
gpu_type = "GB200"
configs = []
if not self.base_dir.exists():
@ -406,7 +409,7 @@ class ConfigLoader:
if "metrics" in acc_meta:
metrics_override = acc_meta["metrics"]
custom_metrics = MetricsConfig(
log_file=metrics_override.get("log_file", "7_accuracy_eval.log"),
log_file=metrics_override.get("log_file", "7_accuracy_eval_*.log"),
extractor_pattern=metrics_override.get(
"extractor_pattern",
r"\|([a-zA-Z0-9_-]+)\|.*?\|([\w-]+)\|.*?\|exact_match\|.*?\|([0-9.]+)\|",
@ -517,9 +520,10 @@ class ConfigLoader:
("slurm", "job_name"): lambda: EnvManager.get_slurm_job_name(),
("environment", "container_mount"): lambda: EnvManager.get_container_mount(model_name),
("environment", "container_image"): lambda: EnvManager.get_container_image(),
("environment", "trtllm_repo"): lambda: EnvManager.get_repo_dir(),
("environment", "trtllm_repo"): lambda: self._get_repo_dir(),
("environment", "trtllm_wheel_path"): lambda: EnvManager.get_trtllm_wheel_path(),
("benchmark", "dataset_file"): lambda: self._get_dataset_file(config),
("accuracy", "env_var", "HF_HOME"): lambda: EnvManager.get_hf_home_dir(),
("environment", "work_dir"): lambda: EnvManager.get_script_dir(),
("environment", "model_path"): lambda: self._get_full_model_path(config),
("slurm", "script_file"): lambda: self._get_script_file(config),
@ -528,11 +532,67 @@ class ConfigLoader:
}
# Apply overrides based on field paths
for (section, key), value_getter in field_mapping.items():
if section in config:
config[section][key] = value_getter()
for path, value_getter in field_mapping.items():
self._set_nested_value(config, path, value_getter())
# Apply dynamic overrides for accuracy.tasks (task names are dynamic)
self._apply_accuracy_tasks_overrides(config)
return config
def _set_nested_value(self, config: dict, path: tuple, value: any) -> None:
"""Set value at nested path in config.
Supports arbitrary nesting depth using tuple paths.
Creates missing intermediate levels automatically.
Args:
config: Configuration dictionary
path: Tuple of keys representing the path (e.g., ("a", "b", "c"))
value: Value to set
Example:
_set_nested_value(config, ("accuracy", "env_var", "HF_HOME"), "/path")
# Sets config["accuracy"]["env_var"]["HF_HOME"] = "/path"
"""
current = config
# Traverse/create path, except for the last key
for key in path[:-1]:
if key not in current:
current[key] = {}
current = current[key]
# Set the final value
current[path[-1]] = value
def _apply_accuracy_tasks_overrides(self, config: dict) -> None:
"""Apply environment overrides for accuracy.tasks configuration.
Handles dynamic task names (e.g., gsm8k, gpqa_diamond_local).
Replaces placeholders in custom_config paths.
Args:
config: Configuration dictionary
"""
if "accuracy" not in config or "tasks" not in config["accuracy"]:
return
repo_dir = EnvManager.get_repo_dir()
# Iterate through all tasks (task names are dynamic)
for task_name, task_config in config["accuracy"]["tasks"].items():
if not isinstance(task_config, dict):
continue
# Replace <repo_path> in custom_config
if "extra_kwargs" in task_config and "custom_config" in task_config["extra_kwargs"]:
custom_config_path = task_config["extra_kwargs"]["custom_config"]
if "<repo_path>" in custom_config_path:
task_config["extra_kwargs"]["custom_config"] = custom_config_path.replace(
"<repo_path>", repo_dir
)
def _get_full_model_path(self, config: dict) -> str:
"""Get full model path by combining MODEL_DIR with model directory name.
@ -548,6 +608,12 @@ class ConfigLoader:
else:
return ""
def _get_repo_dir(self):
    """Resolve the TRT-LLM repo directory; empty unless installing from source."""
    # wheel/none install modes do not need a repo checkout, so blank it out.
    if EnvManager.get_install_mode() != "source":
        return ""
    return EnvManager.get_repo_dir()
def _get_dataset_file(self, config: dict) -> str:
"""Get dataset file by combining dataset directory with dataset file name.