diff --git a/.gitignore b/.gitignore
index c2acd1cd92..c588d39d9b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -72,6 +72,7 @@ ad-test-workspace/
 */tllm_debug/**
 *.patch
 !cpp/tensorrt_llm/deep_ep/*.patch
+examples/disaggregated/slurm/benchmark/logs/
 
 # Generated files
 cpp/include/tensorrt_llm/executor/version.h
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 33a4dca2af..debf0bd855 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1396,7 +1396,7 @@ repos:
         exclude: |
           (?x)^(.*cubin.cpp | .*cubin.h)$
       - id: check-yaml
-        args: [--allow-multiple-documents]
+        args: [--allow-multiple-documents, --unsafe]
        exclude: ".*/gitlab/.*.yml"
       - id: trailing-whitespace
        exclude: '\.(patch|md)$'
diff --git a/examples/disaggregated/slurm/benchmark/accuracy_eval.sh b/examples/disaggregated/slurm/benchmark/accuracy_eval.sh
deleted file mode 100644
index 028b5344a9..0000000000
--- a/examples/disaggregated/slurm/benchmark/accuracy_eval.sh
+++ /dev/null
@@ -1,34 +0,0 @@
-#!/bin/bash
-set -euo pipefail
-
-# Parse arguments
-full_logdir=${1}
-accuracy_model=${2}
-accuracy_tasks=${3}
-model_path=${4}
-model_args_extra=${5}
-output_dir=${6}
-hostname=${7}
-port=${8}
-
-echo "Starting accuracy evaluation..."
-echo "Log directory: ${full_logdir}"
-
-echo "Hostname: ${hostname}, Port: ${port}"
-base_url="http://${hostname}:${port}/v1/completions"
-echo "Using base_url: ${base_url}"
-
-# Install lm_eval and run evaluation
-echo "Installing lm_eval[api] and running evaluation..."
-pip install lm_eval[api]==0.4.8
-
-echo "Running lm_eval with tasks: ${accuracy_tasks}..."
-
-mkdir -p ${output_dir}
-lm_eval --model ${accuracy_model} \
-    --tasks ${accuracy_tasks} \
-    --model_args model=${model_path},base_url=${base_url},${model_args_extra} \
-    --output_path ${output_dir} --log_samples \
-    --trust_remote_code
-
-echo "Accuracy evaluation completed successfully"
diff --git a/examples/disaggregated/slurm/benchmark/disaggr_torch.slurm b/examples/disaggregated/slurm/benchmark/disaggr_torch.slurm
index 597fd51911..1938db569f 100644
--- a/examples/disaggregated/slurm/benchmark/disaggr_torch.slurm
+++ b/examples/disaggregated/slurm/benchmark/disaggr_torch.slurm
@@ -86,7 +86,7 @@ if [ -n "${trtllm_wheel_path}" ]; then
     if ! srun --container-name=${container_name} \
         --container-mounts=${container_mount} --no-container-mount-home \
         --mpi=pmix --overlap -N $SLURM_NNODES --ntasks-per-node=1 \
-        bash -c "pip install ${trtllm_wheel_path}" \
+        bash -c "pip install ${trtllm_wheel_path}[devel]" \
         &> ${full_logdir}/2_install.log; then
         cleanup_on_failure "TensorRT-LLM wheel installation failed. Check ${full_logdir}/2_install.log for details"
     fi
@@ -117,7 +117,7 @@ elif [ -d "${trtllm_repo}" ]; then
     if ! srun --container-name=${container_name} \
         --container-mounts=${container_mount} --no-container-mount-home \
         --mpi=pmix --overlap -N $SLURM_NNODES --ntasks-per-node=1 \
-        bash -c "cd ${trtllm_repo} && pip install -e ." \
+        bash -c "cd ${trtllm_repo} && pip install -e .[devel]" \
         &> ${full_logdir}/2_install.log; then
         cleanup_on_failure "TensorRT-LLM installation failed. Check ${full_logdir}/2_install.log for details"
     fi
diff --git a/examples/disaggregated/slurm/benchmark/submit.py b/examples/disaggregated/slurm/benchmark/submit.py
index a12e675134..d7e9f0a621 100644
--- a/examples/disaggregated/slurm/benchmark/submit.py
+++ b/examples/disaggregated/slurm/benchmark/submit.py
@@ -138,6 +138,36 @@ def convert_allocations_to_server_config(allocations, server_port=8333):
     return server_config
 
 
+def convert_envs_to_str(env_vars: Dict[str, str]) -> str:
+    return ','.join([f"{key}='{value}'" for key, value in env_vars.items()])
+
+
+def replace_env_in_file(log_dir, file_path, env_var):
+    with open(file_path, 'r', encoding='utf-8') as f:
+        config_content = f.read()
+
+    for env_name, env_value in env_var.items():
+        config_content = config_content.replace(env_name, env_value)
+
+    tmp_dir = os.path.join(log_dir, "lm_eval_configs")
+    os.makedirs(tmp_dir, exist_ok=True)
+    tmp_file = os.path.join(tmp_dir, os.path.basename(file_path))
+
+    # Write modified config to temp file
+    with open(tmp_file, 'w', encoding='utf-8') as f:
+        f.write(config_content)
+
+    # Check if there is a custom utils.py in the same directory
+    # Needed for GPQA task
+    custom_utils_path = os.path.join(os.path.dirname(file_path), 'utils.py')
+    if os.path.exists(custom_utils_path):
+        # copy utils.py to temp directory
+        shutil.copy(custom_utils_path, tmp_dir)
+
+    # Return temp directory
+    return tmp_dir
+
+
 def submit_job(config, log_dir, dry_run):
     # Extract configurations
     slurm_config = config['slurm']
@@ -208,24 +238,24 @@ def submit_job(config, log_dir, dry_run):
     gen_batch_size = worker_config['gen']['max_batch_size']
     gen_enable_attention_dp = worker_config['gen']['enable_attention_dp']
 
+    # Get eplb num_slots for gen worker
+    load_balancer_config = worker_config['gen'].get('moe_config', {}).get(
+        'load_balancer', {})
+    if isinstance(load_balancer_config, str):
+        with open(load_balancer_config, 'r') as f:
+            load_balancer_config = yaml.safe_load(f)
+    eplb_num_slots = load_balancer_config.get('num_slots', 0)
+
+    # Get mtp_size from gen config's speculative_config
+    mtp_size = worker_config['gen'].get('speculative_config',
+                                        {}).get('num_nextn_predict_layers', 0)
+
+    # Create base log directory path
     if log_dir is None:
-        # Create base log directory path
-        date_prefix = datetime.now().strftime("%Y%m%d")
-        log_base = os.path.join(env_config['work_dir'],
-                                f"{date_prefix}/{isl}-{osl}")
+        log_base = os.path.join(env_config['work_dir'], "logs")
 
-        # Get eplb num_slots for gen worker
-        load_balancer_config = worker_config['gen'].get('moe_config', {}).get(
-            'load_balancer', {})
-        if isinstance(load_balancer_config, str):
-            with open(load_balancer_config, 'r') as f:
-                load_balancer_config = yaml.safe_load(f)
-        eplb_num_slots = load_balancer_config.get('num_slots', 0)
-
-        # Get mtp_size from gen config's speculative_config
-        mtp_size = worker_config['gen'].get('speculative_config',
-                                            {}).get('num_nextn_predict_layers',
-                                                    0)
+        date_prefix = datetime.now().strftime("%Y%m%d-%H%M%S")
+        log_base = os.path.join(log_base, f"{date_prefix}/{isl}-{osl}")
 
     # Determine directory suffix based on attention_dp
     if gen_enable_attention_dp:
@@ -340,27 +370,59 @@ def submit_job(config, log_dir, dry_run):
         f"--container-mounts={env_config['container_mount']}",
         f"--mpi=pmix --overlap -N 1 -n 1",
     ]
-    if benchmark_config['use_nv_sa_benchmark']:
-        benchmark_cmd = [
-            f"bash {env_config['work_dir']}/run_benchmark_nv_sa.sh",
-            f"'{env_config['model_path']}' {isl} {osl} {benchmark_config['benchmark_ratio']} {benchmark_config['multi_round']} {gen_num} '{benchmark_config['concurrency_list']}' {benchmark_config['streaming']} '{log_dir}' {disagg_server_hostname} {disagg_server_port}",
-            f"&> {log_dir}/6_bench.log"
+
+    # Append benchmark commands
+    if benchmark_config.get('enable_benchmark', True):
+        env_var = config['benchmark'].get('env_var', {})
+        benchmark_prefix = client_slurm_prefix + [
+            f"--export \"{convert_envs_to_str(env_var)}\""
         ]
-        client_cmds.append(" ".join(client_slurm_prefix + benchmark_cmd))
-    else:
-        benchmark_cmd = [
-            f"bash {env_config['work_dir']}/run_benchmark.sh",
-            f"'{env_config['model_path']}' '{benchmark_config['dataset_file']}' {benchmark_config['multi_round']} {gen_num} '{benchmark_config['concurrency_list']}' {benchmark_config['streaming']} '{log_dir}' {disagg_server_hostname} {disagg_server_port}",
-            f"&> {log_dir}/6_bench.log"
-        ]
-        client_cmds.append(" ".join(client_slurm_prefix + benchmark_cmd))
+        if benchmark_config['use_nv_sa_benchmark']:
+            benchmark_cmd = [
+                f"bash {env_config['work_dir']}/run_benchmark_nv_sa.sh",
+                f"'{env_config['model_path']}' {isl} {osl} {benchmark_config['benchmark_ratio']} {benchmark_config['multi_round']} {gen_num} '{benchmark_config['concurrency_list']}' {benchmark_config['streaming']} '{log_dir}' {disagg_server_hostname} {disagg_server_port}",
+                f"&> {log_dir}/6_bench.log"
+            ]
+            client_cmds.append(" ".join(benchmark_prefix + benchmark_cmd))
+        else:
+            benchmark_cmd = [
+                f"bash {env_config['work_dir']}/run_benchmark.sh",
+                f"'{env_config['model_path']}' '{benchmark_config['dataset_file']}' {benchmark_config['multi_round']} {gen_num} '{benchmark_config['concurrency_list']}' {benchmark_config['streaming']} '{log_dir}' {disagg_server_hostname} {disagg_server_port}",
+                f"&> {log_dir}/6_bench.log"
+            ]
+            client_cmds.append(" ".join(benchmark_prefix + benchmark_cmd))
+
+    # Append accuracy test commands
     if config['accuracy']['enable_accuracy_test']:
-        accuracy_cmd = [
-            f"bash {env_config['work_dir']}/accuracy_eval.sh",
-            f"'{log_dir}' '{config['accuracy']['model']}' '{config['accuracy']['tasks']}' '{env_config['model_path']}' '{config['accuracy']['model_args_extra']}' '{log_dir}/accuracy_eval' {disagg_server_hostname} {disagg_server_port}",
-            f"&> {log_dir}/7_accuracy_eval.log"
+        env_var = config['accuracy'].get('env_var', {})
+        accuracy_prefix = client_slurm_prefix + [
+            f"--export \"{convert_envs_to_str(env_var)}\""
         ]
-        client_cmds.append(" ".join(client_slurm_prefix + accuracy_cmd))
+        for task in config['accuracy']['tasks']:
+            extra_kwargs = config['accuracy']['tasks'][task].get(
+                'extra_kwargs', {})
+            extra_kwargs_str = ""
+            for key, value in extra_kwargs.items():
+                if isinstance(value, bool):
+                    if value:
+                        extra_kwargs_str += f" --{key}"
+                elif key == "custom_config":
+                    extra_kwargs_str += f" --include_path={replace_env_in_file(log_dir, value, env_var)}"
+                else:
+                    extra_kwargs_str += f" --{key}='{value}'"
+            end_point_map = {
+                'local-completions': 'v1/completions',
+                'local-chat-completions': 'v1/chat/completions',
+            }
+            model = config['accuracy']['tasks'][task]['model']
+            accuracy_cmd = [
+                'lm_eval', '--model', model, '--tasks', task, '--model_args',
+                f"model={env_config['model_path']},base_url=http://{disagg_server_hostname}:{disagg_server_port}/{end_point_map[model]},{config['accuracy']['tasks'][task]['model_args_extra']}",
+                '--log_samples', '--output_path',
+                f'{log_dir}/accuracy_eval_{task}', extra_kwargs_str,
+                f"&> {log_dir}/7_accuracy_eval_{task}.log"
+            ]
+            client_cmds.append(" ".join(accuracy_prefix + accuracy_cmd))
 
     with open(os.path.join(log_dir, "client_cmds.sh"), "w") as f:
         f.write("\n".join(client_cmds) + "\n")
diff --git a/tests/integration/defs/perf/disagg/test_configs/wideep/accuracy/kimi-k2-thinking-fp4_1k1k_ctx3_gen1_dep32_bs1024_eplb384_mtp0_ccb-NIXL.yaml b/tests/integration/defs/perf/disagg/test_configs/wideep/accuracy/kimi-k2-thinking-fp4_1k1k_ctx3_gen1_dep32_bs1024_eplb384_mtp0_ccb-NIXL.yaml
index d68362abf0..c237dc5f0f 100644
--- a/tests/integration/defs/perf/disagg/test_configs/wideep/accuracy/kimi-k2-thinking-fp4_1k1k_ctx3_gen1_dep32_bs1024_eplb384_mtp0_ccb-NIXL.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/wideep/accuracy/kimi-k2-thinking-fp4_1k1k_ctx3_gen1_dep32_bs1024_eplb384_mtp0_ccb-NIXL.yaml
@@ -10,10 +10,10 @@ metadata:
   dataset_file: disagg_datasets/kimi-k2-1024-1024-20000-ratio-1_for_serve.json
   accuracy:
     datasets:
-    - dataset_name: gsm8k
-      expected_value: 0.9454
+    - dataset_name: gpqa_diamond_cot_zeroshot
+      expected_value: 0.65
       threshold_type: hypothesis_test
-      filter_type: flexible-extract
+      filter_type: strict-match
 slurm:
   script_file: disaggr_torch.slurm
   partition:
@@ -23,7 +23,8 @@ slurm:
   extra_args: "--gres=gpu:4"
   numa_bind: true
 benchmark:
-  mode: gen_only
+  enable_benchmark: false
+  mode: e2e
   use_nv_sa_benchmark: false
   multi_round: 8
   benchmark_ratio: 1.0
@@ -47,9 +48,16 @@ profiling:
   nsys_on: false
 accuracy:
   enable_accuracy_test: true
-  model: local-completions
-  tasks: gsm8k
-  model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
+  env_var:
+    HF_HOME:
+  tasks:
+    gpqa_diamond_local:
+      model: "local-chat-completions"
+      model_args_extra: "num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=7200,max_gen_toks=16384"
+      extra_kwargs:
+        apply_chat_template: true
+        trust_remote_code: true
+        custom_config: /tests/integration/lm_eval_configs/gpqa_diamond_local.yaml
 worker_config:
   gen:
     enable_layerwise_nvtx_marker: true
@@ -58,30 +66,16 @@ worker_config:
     enable_attention_dp: true
     enable_lm_head_tp_in_adp: false
     pipeline_parallel_size: 1
-    max_batch_size: 1024
-    max_num_tokens: 1024
-    max_seq_len: 5120
+    max_batch_size: 512
+    max_num_tokens: 512
+    max_seq_len: 16384
     cuda_graph_config:
       enable_padding: true
-      batch_sizes:
-      - 1
-      - 2
-      - 4
-      - 8
-      - 16
-      - 32
-      - 64
-      - 128
-      - 256
-      - 512
-      - 768
-      - 1024
-      - 2048
-      - 1024
+      max_batch_size: 512
     print_iter_log: true
     kv_cache_config:
       enable_block_reuse: false
-      free_gpu_memory_fraction: 0.8
+      free_gpu_memory_fraction: 0.85
       dtype: fp8
     moe_config:
       backend: WIDEEP
@@ -97,9 +91,9 @@ worker_config:
     trust_remote_code: true
   ctx:
     enable_layerwise_nvtx_marker: true
-    max_batch_size: 8
+    max_batch_size: 32
     max_num_tokens: 8448
-    max_seq_len: 5120
+    max_seq_len: 8448
     tensor_parallel_size: 4
     moe_expert_parallel_size: 4
     enable_attention_dp: true
diff --git a/tests/integration/lm_eval_configs/gpqa_diamond_local.yaml b/tests/integration/lm_eval_configs/gpqa_diamond_local.yaml
new file mode 100644
index 0000000000..a1b3a16264
--- /dev/null
+++ b/tests/integration/lm_eval_configs/gpqa_diamond_local.yaml
@@ -0,0 +1,33 @@
+# Modified from tensorrt_llm/evaluate/lm_eval_tasks/gpqa/cot_zeroshot_aa/gpqa_diamond_cot_zeroshot_aa.yaml
+task: gpqa_diamond_local
+dataset_path: HF_HOME/datasets/Idavidrein___gpqa
+tag: gpqa
+output_type: generate_until
+process_docs: !function utils.process_gpqa_docs
+training_split: train
+# Because huggingface dataset only has train split
+validation_split: train
+test_split: null
+doc_to_text: "Answer the following multiple choice question. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of ABCD. Think step by step before answering.\n{{Question}}\nA) {{choice1}}\nB) {{choice2}}\nC) {{choice3}}\nD) {{choice4}}"
+doc_to_target: answer
+filter_list:
+  - name: "strict-match"
+    filter:
+      - function: "regex"
+        regex_pattern: '(?i)Answer[ \t]*:[ \t]*([A-D])'
+        group_select: 0
+      - function: "take_first"
+generation_kwargs:
+  until:
+    - ""
+  do_sample: false
+  temperature: 0.0
+num_fewshot: 0
+metric_list:
+  - metric: exact_match
+    aggregation: mean
+    higher_is_better: true
+    ignore_case: true
+    ignore_punctuation: true
+metadata:
+  version: 1.0
diff --git a/tests/integration/lm_eval_configs/utils.py b/tests/integration/lm_eval_configs/utils.py
new file mode 100644
index 0000000000..8c5b65e379
--- /dev/null
+++ b/tests/integration/lm_eval_configs/utils.py
@@ -0,0 +1,34 @@
+import random
+
+import datasets
+
+
+def preprocess(text):
+    if text is None:
+        return " "
+    return text.strip()
+
+
+def process_gpqa_docs(dataset: datasets.Dataset) -> datasets.Dataset:
+    def _process_doc(doc):
+        choices = [
+            preprocess(doc["Incorrect Answer 1"]),
+            preprocess(doc["Incorrect Answer 2"]),
+            preprocess(doc["Incorrect Answer 3"]),
+            preprocess(doc["Correct Answer"]),
+        ]
+
+        random.shuffle(choices)
+        correct_answer_index = choices.index(preprocess(doc["Correct Answer"]))
+
+        out_doc = {
+            "choice1": choices[0],
+            "choice2": choices[1],
+            "choice3": choices[2],
+            "choice4": choices[3],
+            "choices": [choices[0], choices[1], choices[2], choices[3]],
+            "answer": f"{chr(65 + correct_answer_index)}",
+        }
+        return out_doc
+
+    return dataset.map(_process_doc)