diff --git a/benchmarks/cpp/utils/prepare_real_data.py b/benchmarks/cpp/utils/prepare_real_data.py
index e15ef9267e..4441c57ee4 100644
--- a/benchmarks/cpp/utils/prepare_real_data.py
+++ b/benchmarks/cpp/utils/prepare_real_data.py
@@ -2,10 +2,11 @@ import logging
 import random
 import re
 import tempfile
+from pathlib import Path
 from typing import Optional
 
 import click
-from datasets import load_dataset
+import datasets
 from PIL import Image
 from pydantic import BaseModel, model_validator
 from utils.utils import (get_norm_dist_lengths, multimodal_dataset_dump,
@@ -29,7 +30,7 @@ def validate_output_len_dist(ctx, param, value):
 class DatasetConfig(BaseModel):
     """Dataset configurations."""
     """Name of the dataset on HuggingFace."""
-    name: str
+    name: Optional[str] = None
     """Config name of the dataset if existing."""
     config_name: Optional[str] = None
     """Split of the dataset. Typical values: train, validation, test. Setting to None will include all splits."""
@@ -44,6 +45,8 @@ class DatasetConfig(BaseModel):
     prompt: Optional[str] = None
     """The dataset dictionary key used to derive the output sequence length. Set to None if the dataset does not have a key for output."""
     output_key: Optional[str]
+    """The local path to the dataset to be loaded when using a local cache."""
+    local_path: Optional[str] = None
 
     @model_validator(mode='after')
     def check_prompt(self) -> 'DatasetConfig':
@@ -54,19 +57,40 @@ class DatasetConfig(BaseModel):
             raise AssertionError("Either --prompt-key or --prompt must be set.")
         return self
 
+    @model_validator(mode='after')
+    def check_name_and_local_path(self) -> 'DatasetConfig':
+        if self.name and self.local_path:
+            raise AssertionError(
+                "--dataset-name and --dataset-local-path cannot be set at the same time."
+            )
+        if (not self.name) and (not self.local_path):
+            raise AssertionError(
+                "Either --dataset-name or --dataset-local-path must be set.")
+        return self
+
     @property
     def query(self):
         """Generate the query for HuggingFace `datasets.load_dataset()`"""
+        first_arg = self.local_path if self.local_path else self.name
+
         if self.config_name:
-            return [self.name, self.config_name]
+            return [first_arg, self.config_name]
         else:
-            return [self.name]
+            return [first_arg]
+
+    @property
+    def display_name(self) -> str:
+        """Returns a human-readable identifier for error messages."""
+        # model_validator ensures exactly one of name or local_path is set
+        if self.name is not None:
+            return self.name
+        return self.local_path
 
     def get_prompt(self, req):
         """Get the prompt sentence from the given request."""
         if self.prompt_key:
             assert self.prompt_key in req, (
-                f"Dataset {self.name} does not have key '{self.prompt_key}'. "
+                f"Dataset {self.display_name} does not have key '{self.prompt_key}'. "
                 "Please set --prompt-key to one of the available keys: "
                 f"{req.keys()}")
             return req[self.prompt_key]
@@ -76,7 +100,7 @@ class DatasetConfig(BaseModel):
     def get_input(self, req):
         """Get the input sentence from the given request."""
         assert self.input_key in req, (
-            f"Dataset {self.name} does not have key '{self.input_key}'. "
+            f"Dataset {self.display_name} does not have key '{self.input_key}'. "
             "Please set --input-key to one of the available keys: "
             f"{req.keys()}")
         return req[self.input_key]
@@ -86,7 +110,7 @@ class DatasetConfig(BaseModel):
         image_keys = [self.image_key
                       ] + [f"{self.image_key}_{i}" for i in range(1, 8)]
         assert any(key in req for key in image_keys), (
-            f"Dataset {self.name} does not have key '{self.image_key}'. "
+            f"Dataset {self.display_name} does not have key '{self.image_key}'. "
             "Please set --dataset-image-key to one of the available keys: "
             f"{req.keys()}")
         images = []
@@ -101,16 +125,47 @@ class DatasetConfig(BaseModel):
             raise RuntimeError(
                 "--output-key is not set. Please either:\n"
                 "1. Define output length through --output-len-dist.\n"
-                f"2. If the dataset {self.name} has key for golden output and "
+                f"2. If the dataset {self.display_name} has key for golden output and "
                 "you wish to set output length to the length of the golden "
                 "output, set --output-key.")
         assert self.output_key in req, (
-            f"Dataset {self.name} does not have key '{self.output_key}'. "
+            f"Dataset {self.display_name} does not have key '{self.output_key}'. "
            "Please set --output-key to one of the available keys: "
             f"{req.keys()}")
         return req[self.output_key]
 
 
+def _create_dataset_load_error(e: ValueError) -> ValueError:
+    """Create a more informative ValueError from a dataset loading error.
+
+    Args:
+        e: The original ValueError from datasets.load_dataset().
+    Returns:
+        A new ValueError with additional context.
+    """
+    error_msg = str(e)
+    if "Config" in error_msg:
+        error_msg += "\n Please add the config name to the dataset config yaml."
+    elif "split" in error_msg:
+        error_msg += "\n Please specify supported split in the dataset config yaml."
+    return ValueError(error_msg)
+
+
+def load_dataset(dataset_config: DatasetConfig):
+    """Load dataset from local path or HuggingFace.
+    Args:
+        dataset_config: A `DatasetConfig` object that defines the dataset to load.
+    Returns:
+        Dataset iterator.
+    Raises:
+        ValueError: When dataset loading fails due to incorrect dataset config setting.
+    """
+    if dataset_config.local_path:
+        return load_dataset_from_local(dataset_config)
+    else:
+        return load_dataset_from_hf(dataset_config)
+
+
 def load_dataset_from_hf(dataset_config: DatasetConfig):
     """Load dataset from HuggingFace.
 
@@ -121,55 +176,117 @@ def load_dataset_from_hf(dataset_config: DatasetConfig):
     Raises:
         ValueError: When dataset loading fails due to incorrect dataset config setting.
     """
+    logging.debug(
+        f"Loading dataset from HF: query={dataset_config.query}, split={dataset_config.split}"
+    )
+
     try:
         dataset = iter(
-            load_dataset(*dataset_config.query,
-                         split=dataset_config.split,
-                         streaming=True,
-                         trust_remote_code=True))
+            datasets.load_dataset(*dataset_config.query,
+                                  split=dataset_config.split,
+                                  streaming=True,
+                                  trust_remote_code=True))
     except ValueError as e:
-        if "Config" in e:
-            e += "\n Please add the config name to the dataset config yaml."
-        elif "split" in e:
-            e += "\n Please specify supported split in the dataset config yaml."
-        raise ValueError(e)
+        raise _create_dataset_load_error(e)
+
+    logging.debug("Finished loading HF dataset")
 
     return dataset
 
 
+def load_dataset_from_local(dataset_config: DatasetConfig):
+    """Load dataset from local path.
+
+    Args:
+        dataset_config: A `DatasetConfig` object that defines the dataset to load.
+    Returns:
+        Dataset iterator.
+    Raises:
+        FileNotFoundError: When local dataset path does not exist.
+        ValueError: When dataset loading fails due to incorrect dataset config setting.
+    """
+
+    local_path = Path(dataset_config.local_path)
+
+    if not local_path.exists():
+        raise FileNotFoundError(
+            f"Local dataset path {local_path} does not exist.")
+
+    logging.debug(
+        f"Loading dataset from local path: path={local_path}, query={dataset_config.query}, split={dataset_config.split}"
+    )
+
+    # If it's a directory we can use the normal loader, otherwise custom loader
+    # depends on the file extension
+    if local_path.is_dir():
+        try:
+            dataset = datasets.load_dataset(*dataset_config.query,
+                                            split=dataset_config.split,
+                                            trust_remote_code=True)
+        except ValueError as e:
+            raise _create_dataset_load_error(e)
+    else:
+        format_map = {
+            ".json": "json",
+            ".jsonl": "json",
+            ".csv": "csv",
+            ".parquet": "parquet",
+        }
+
+        file_extension = local_path.suffix
+        dataset_type = format_map.get(file_extension)
+
+        if dataset_type is None:
+            raise ValueError(f"Unsupported file extension: {file_extension}")
+
+        try:
+            dataset = datasets.load_dataset(dataset_type,
+                                            data_files=str(local_path),
+                                            split=dataset_config.split)
+        except ValueError as e:
+            raise _create_dataset_load_error(e)
+
+    logging.debug("Finished loading local dataset")
+
+    return iter(dataset)
+
+
 @click.command()
-@click.option("--dataset-name",
-              required=True,
-              type=str,
-              help=f"Dataset name in HuggingFace.")
+@click.option("--dataset-name", type=str, help="Dataset name in HuggingFace.")
 @click.option("--dataset-config-name",
               type=str,
               default=None,
-              help=f"Dataset config name in HuggingFace (if exists).")
+              help="Dataset config name in HuggingFace (if exists).")
 @click.option("--dataset-split",
               type=str,
               required=True,
-              help=f"Split of the dataset to use.")
+              help="Split of the dataset to use.")
 @click.option("--dataset-input-key",
               type=str,
-              help=f"The dataset dictionary key for input.")
+              help="The dataset dictionary key for input.")
 @click.option("--dataset-image-key",
               type=str,
               default="image",
-              help=f"The dataset dictionary key for images.")
+              help="The dataset dictionary key for images.")
 @click.option("--dataset-prompt-key",
               type=str,
               default=None,
-              help=f"The dataset dictionary key for prompt (if exists).")
+              help="The dataset dictionary key for prompt (if exists).")
+@click.option(
+    "--dataset-local-path",
+    type=str,
+    default=None,
+    help=
+    "The local path to the dataset to be loaded when using an offline cache.")
 @click.option(
     "--dataset-prompt",
     type=str,
     default=None,
-    help=f"The prompt string when there is no prompt key for the dataset.")
+    help="The prompt string when there is no prompt key for the dataset.")
 @click.option("--dataset-output-key",
               type=str,
               default=None,
-              help=f"The dataset dictionary key for output (if exists).")
+              help="The dataset dictionary key for output (if exists).")
 @click.option(
     "--num-requests",
     type=int,
@@ -208,7 +325,7 @@ def dataset(root_args, **kwargs):
     modality = None
     multimodal_texts = []
     multimodal_image_paths = []
-    for req in load_dataset_from_hf(dataset_config):
+    for req in load_dataset(dataset_config):
         if any(key in req for key in ['image', 'image_1', 'video']):
             # multimodal input
             if 'video' in req and req['video'] is not None:
diff --git a/tests/integration/defs/cpp/test_e2e.py b/tests/integration/defs/cpp/test_e2e.py
index 5a90df6e08..ce583671cf 100644
--- a/tests/integration/defs/cpp/test_e2e.py
+++ b/tests/integration/defs/cpp/test_e2e.py
@@ -2,12 +2,81 @@ import copy
 import logging as _logger
 import os as _os
 import pathlib as _pl
-from typing import List
+from dataclasses import dataclass
+from typing import List, Optional
 
 import defs.cpp.cpp_common as _cpp
 import pytest
 
 
+@dataclass(frozen=True)
+class DatasetConfig:
+    """Configuration for a benchmark dataset."""
+    name: str
+    local_path: str
+    split: str
+    input_key: str
+    output_key: str
+    max_input_len: str
+    num_requests: str
+    config_name: Optional[str] = None
+    prompt: Optional[str] = None
+    prompt_key: Optional[str] = None
+
+    @property
+    def token_file(self) -> str:
+        return "prepared_" + self.name.replace('/', '_')
+
+    def get_dataset_args(self) -> dict[str, str]:
+        """Build the dataset args dict for prepare_dataset.py."""
+        args = {
+            '--dataset-local-path': self.local_path,
+            '--dataset-split': self.split,
+            '--dataset-input-key': self.input_key,
+            '--dataset-output-key': self.output_key,
+        }
+        if self.config_name:
+            args['--dataset-config-name'] = self.config_name
+        if self.prompt:
+            args['--dataset-prompt'] = self.prompt
+        if self.prompt_key:
+            args['--dataset-prompt-key'] = self.prompt_key
+        return args
+
+
+def get_benchmark_dataset_configs(model_cache: str) -> List[DatasetConfig]:
+    """Define dataset configurations for benchmark tests.
+
+    To add a new dataset, add a new DatasetConfig entry to this list.
+    """
+    datasets_dir = _pl.Path(model_cache) / "datasets"
+
+    return [
+        DatasetConfig(
+            name="ccdv/cnn_dailymail",
+            local_path=str(datasets_dir / "ccdv" / "cnn_dailymail"),
+            config_name="3.0.0",
+            split="validation",
+            input_key="article",
+            prompt="Summarize the following article:",
+            output_key="highlights",
+            max_input_len="256",
+            num_requests="50",
+        ),
+        DatasetConfig(
+            name="Open-Orca/1million-gpt-4",
+            local_path=str(datasets_dir / "Open-Orca" / "1million-gpt-4" /
+                           "1M-GPT4-Augmented.parquet"),
+            split="train",
+            input_key="question",
+            prompt_key="system_prompt",
+            output_key="response",
+            max_input_len="20",
+            num_requests="10",
+        ),
+    ]
+
+
 def run_single_gpu_tests(build_dir: _pl.Path,
                          test_list: List[str],
                          run_fp8=False,
@@ -93,27 +162,6 @@ def run_benchmarks(
         )
         return NotImplementedError
 
-    prompt_datasets_args = [{
-        '--dataset-name': "cnn_dailymail",
-        '--dataset-config-name': "3.0.0",
-        '--dataset-split': "validation",
-        '--dataset-input-key': "article",
-        '--dataset-prompt': "Summarize the following article:",
-        '--dataset-output-key': "highlights"
-    }, {
-        '--dataset-name': "Open-Orca/1million-gpt-4",
-        '--dataset-split': "train",
-        '--dataset-input-key': "question",
-        '--dataset-prompt-key': "system_prompt",
-        '--dataset-output-key': "response"
-    }]
-    token_files = [
-        "prepared_" + s['--dataset-name'].replace('/', '_')
-        for s in prompt_datasets_args
-    ]
-    max_input_lens = ["256", "20"]
-    num_reqs = ["50", "10"]
-
     if model_name == "gpt":
         model_engine_path = model_engine_dir / "fp16_plugin_packed_paged" / "tp1-pp1-cp1-gpu"
 
@@ -127,27 +175,25 @@ def run_benchmarks(
     # model_engine_path = model_engine_dir / model_spec_obj.get_model_path(
     # ) / "tp1-pp1-cp1-gpu"
 
-    for prompt_ds_args, tokens_f, len, num_req in zip(prompt_datasets_args,
-                                                      token_files,
-                                                      max_input_lens, num_reqs):
-
+    for config in get_benchmark_dataset_configs(model_cache):
         benchmark_src_dir = _pl.Path("benchmarks") / "cpp"
         data_dir = resources_dir / "data"
         prepare_dataset = [
             python_exe,
             str(benchmark_src_dir / "prepare_dataset.py"), "--tokenizer",
             str(tokenizer_dir), "--output",
-            str(data_dir / tokens_f), "dataset", "--max-input-len", len,
-            "--num-requests", num_req
+            str(data_dir / config.token_file), "dataset", "--max-input-len",
+            config.max_input_len, "--num-requests", config.num_requests
         ]
-        for k, v in prompt_ds_args.items():
+        for k, v in config.get_dataset_args().items():
             prepare_dataset += [k, v]
-
-        # https://nvbugs/4658787
-        # WAR before the prepare dataset can use offline cached dataset
+
+        # Use environment variable to force HuggingFace to use offline cached dataset
+        offline_env = {**_os.environ, 'HF_DATASETS_OFFLINE': '1'}
         _cpp.run_command(prepare_dataset,
                          cwd=root_dir,
                          timeout=300,
-                         env={'HF_DATASETS_OFFLINE': '0'})
+                         env=offline_env)
 
         for batching_type in batching_types:
             for api_type in api_types:
@@ -157,7 +203,7 @@
                     str(model_engine_path), "--type",
                     str(batching_type), "--api",
                     str(api_type), "--dataset",
-                    str(data_dir / tokens_f)
+                    str(data_dir / config.token_file)
                 ]
                 if model_name == "enc_dec":
                     benchmark += [
@@ -175,12 +221,13 @@
                                  cwd=root_dir,
                                  timeout=600)
 
-        if "IFB" in batching_type and "executor" in api_types:
+        if "IFB" in batching_types and "executor" in api_types:
            # executor streaming test
            benchmark = [
                str(benchmark_exe_dir / "gptManagerBenchmark"), "--engine_dir",
                str(model_engine_path), "--type", "IFB", "--dataset",
-                str(data_dir / tokens_f), "--api", "executor", "--streaming"
+                str(data_dir / config.token_file), "--api", "executor",
+                "--streaming"
            ]
            if model_name == "enc_dec":
                benchmark += [
@@ -263,7 +310,6 @@ def test_model(build_google_tests, model, prepare_model, run_model_tests,
     run_model_tests(model, run_fp8)
 
 
-@pytest.mark.skip(reason="https://nvbugs/5601670")
 @pytest.mark.parametrize("build_google_tests", ["80", "86", "89", "90"],
                          indirect=True)
 @pytest.mark.parametrize("model", ["bart", "gpt", "t5"])
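
Reviewer note (not part of the patch): a minimal sketch of the single-file branch that the new load_dataset_from_local() takes. The file path, split, and printed output are assumptions for illustration; the suffix-to-builder mapping and the datasets.load_dataset() call mirror the ones added in the diff above.

    from pathlib import Path

    import datasets

    # Hypothetical local dataset file; .json/.jsonl/.csv/.parquet resolve the same way.
    local_path = Path("./data/1M-GPT4-Augmented.parquet")

    # Same suffix -> builder mapping as format_map in load_dataset_from_local().
    format_map = {".json": "json", ".jsonl": "json", ".csv": "csv", ".parquet": "parquet"}
    dataset_type = format_map.get(local_path.suffix)
    assert dataset_type is not None, f"Unsupported file extension: {local_path.suffix}"

    # Same call the new code path issues for a single file; it needs no Hub access,
    # so it keeps working when the test sets HF_DATASETS_OFFLINE=1.
    dataset = datasets.load_dataset(dataset_type,
                                    data_files=str(local_path),
                                    split="train")
    print(next(iter(dataset)))

For a directory (for example the cnn_dailymail copy under the model cache), the directory branch instead forwards *dataset_config.query to datasets.load_dataset(), so a locally cached dataset loads the same way without network access.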