[https://nvbugs/5630196] [fix] Prevent flaky failures in C++ test_e2e.py by using local cached datasets for benchmarking (#10638)

Signed-off-by: Dom Brown <3886319+DomBrown@users.noreply.github.com>
Dom Brown 2026-01-15 02:39:55 +00:00 committed by GitHub
parent 73d1840c12
commit 94c7b69048
GPG Key ID: B5690EEEBB952194
2 changed files with 229 additions and 66 deletions
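The fix teaches benchmarks/cpp/prepare_dataset.py to read benchmark datasets from a pre-downloaded local copy (via the new --dataset-local-path option) instead of fetching them from HuggingFace during the test, and the C++ e2e benchmark harness now runs it with HF_DATASETS_OFFLINE=1. A minimal sketch of the resulting invocation for the cnn_dailymail configuration; the cache root and tokenizer directory below are hypothetical examples, not paths from this commit:

# Hedged sketch only: the /scratch paths are hypothetical stand-ins for the
# real model_cache and tokenizer locations used by the test harness.
import os
import subprocess

cache_root = "/scratch/model_cache/datasets"          # hypothetical cache location
prepare_cmd = [
    "python3", "benchmarks/cpp/prepare_dataset.py",
    "--tokenizer", "/scratch/models/gpt2",            # hypothetical tokenizer dir
    "--output", "prepared_ccdv_cnn_dailymail",
    "dataset",
    "--max-input-len", "256", "--num-requests", "50",
    "--dataset-local-path", f"{cache_root}/ccdv/cnn_dailymail",
    "--dataset-config-name", "3.0.0",
    "--dataset-split", "validation",
    "--dataset-input-key", "article",
    "--dataset-prompt", "Summarize the following article:",
    "--dataset-output-key", "highlights",
]
# HF_DATASETS_OFFLINE=1 keeps the datasets library from touching the network.
subprocess.run(prepare_cmd, env={**os.environ, "HF_DATASETS_OFFLINE": "1"}, check=True)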


@@ -2,10 +2,11 @@ import logging
import random
import re
import tempfile
from pathlib import Path
from typing import Optional
import click
from datasets import load_dataset
import datasets
from PIL import Image
from pydantic import BaseModel, model_validator
from utils.utils import (get_norm_dist_lengths, multimodal_dataset_dump,
@@ -29,7 +30,7 @@ def validate_output_len_dist(ctx, param, value):
class DatasetConfig(BaseModel):
"""Dataset configurations."""
"""Name of the dataset on HuggingFace."""
name: str
name: Optional[str] = None
"""Config name of the dataset if existing."""
config_name: Optional[str] = None
"""Split of the dataset. Typical values: train, validation, test. Setting to None will include all splits."""
@@ -44,6 +45,8 @@ class DatasetConfig(BaseModel):
prompt: Optional[str] = None
"""The dataset dictionary key used to derive the output sequence length. Set to None if the dataset does not have a key for output."""
output_key: Optional[str]
"""The local path to the dataset to be loaded when using a local cache."""
local_path: Optional[str] = None
@model_validator(mode='after')
def check_prompt(self) -> 'DatasetConfig':
@@ -54,19 +57,40 @@ class DatasetConfig(BaseModel):
raise AssertionError("Either --prompt-key or --prompt must be set.")
return self
@model_validator(mode='after')
def check_name_and_local_path(self) -> 'DatasetConfig':
if self.name and self.local_path:
raise AssertionError(
"--dataset-name and --dataset-local-path cannot be set at the same time."
)
if (not self.name) and (not self.local_path):
raise AssertionError(
"Either --dataset-name or --dataset-local-path must be set.")
return self
@property
def query(self):
"""Generate the query for HuggingFace `datasets.load_dataset()`"""
first_arg = self.local_path if self.local_path else self.name
if self.config_name:
return [self.name, self.config_name]
return [first_arg, self.config_name]
else:
return [self.name]
return [first_arg]
@property
def display_name(self) -> str:
"""Returns a human-readable identifier for error messages."""
# model_validator ensures exactly one of name or local_path is set
if self.name is not None:
return self.name
return self.local_path
def get_prompt(self, req):
"""Get the prompt sentence from the given request."""
if self.prompt_key:
assert self.prompt_key in req, (
f"Dataset {self.name} does not have key '{self.prompt_key}'. "
f"Dataset {self.display_name} does not have key '{self.prompt_key}'. "
"Please set --prompt-key to one of the available keys: "
f"{req.keys()}")
return req[self.prompt_key]
@@ -76,7 +100,7 @@ class DatasetConfig(BaseModel):
def get_input(self, req):
"""Get the input sentence from the given request."""
assert self.input_key in req, (
f"Dataset {self.name} does not have key '{self.input_key}'. "
f"Dataset {self.display_name} does not have key '{self.input_key}'. "
"Please set --input-key to one of the available keys: "
f"{req.keys()}")
return req[self.input_key]
@@ -86,7 +110,7 @@ class DatasetConfig(BaseModel):
image_keys = [self.image_key
] + [f"{self.image_key}_{i}" for i in range(1, 8)]
assert any(key in req for key in image_keys), (
f"Dataset {self.name} does not have key '{self.image_key}'. "
f"Dataset {self.display_name} does not have key '{self.image_key}'. "
"Please set --dataset-image-key to one of the available keys: "
f"{req.keys()}")
images = []
@@ -101,16 +125,47 @@ class DatasetConfig(BaseModel):
raise RuntimeError(
"--output-key is not set. Please either:\n"
"1. Define output length through --output-len-dist.\n"
f"2. If the dataset {self.name} has key for golden output and "
f"2. If the dataset {self.display_name} has key for golden output and "
"you wish to set output length to the length of the golden "
"output, set --output-key.")
assert self.output_key in req, (
f"Dataset {self.name} does not have key '{self.output_key}'. "
f"Dataset {self.display_name} does not have key '{self.output_key}'. "
"Please set --output-key to one of the available keys: "
f"{req.keys()}")
return req[self.output_key]
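The updated query property forwards whichever of name or local_path is set as the first positional argument to datasets.load_dataset(). A standalone restatement of that dispatch; build_query is a hypothetical helper used only for illustration, not part of this commit:

# Hedged sketch: build_query is a hypothetical stand-in for DatasetConfig.query.
from typing import List, Optional

def build_query(name: Optional[str], local_path: Optional[str],
                config_name: Optional[str]) -> List[str]:
    # The new model validator guarantees exactly one of name / local_path is set.
    first_arg = local_path if local_path else name
    return [first_arg, config_name] if config_name else [first_arg]

assert build_query("ccdv/cnn_dailymail", None, "3.0.0") == \
    ["ccdv/cnn_dailymail", "3.0.0"]
assert build_query(None, "/cache/ccdv/cnn_dailymail", "3.0.0") == \
    ["/cache/ccdv/cnn_dailymail", "3.0.0"]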
def _create_dataset_load_error(e: ValueError) -> ValueError:
"""Create a more informative ValueError from a dataset loading error.
Args:
e: The original ValueError from datasets.load_dataset().
Returns:
A new ValueError with additional context.
"""
error_msg = str(e)
if "Config" in error_msg:
error_msg += "\n Please add the config name to the dataset config yaml."
elif "split" in error_msg:
error_msg += "\n Please specify supported split in the dataset config yaml."
return ValueError(error_msg)
def load_dataset(dataset_config: DatasetConfig):
"""Load dataset from local path or HuggingFace.
Args:
dataset_config: A `DatasetConfig` object that defines the dataset to load.
Returns:
Dataset iterator.
Raises:
ValueError: When dataset loading fails due to incorrect dataset config setting.
"""
if dataset_config.local_path:
return load_dataset_from_local(dataset_config)
else:
return load_dataset_from_hf(dataset_config)
def load_dataset_from_hf(dataset_config: DatasetConfig):
"""Load dataset from HuggingFace.
@@ -121,55 +176,117 @@ def load_dataset_from_hf(dataset_config: DatasetConfig):
Raises:
ValueError: When dataset loading fails due to incorrect dataset config setting.
"""
logging.debug(
f"Loading dataset from HF: query={dataset_config.query}, split={dataset_config.split}"
)
try:
dataset = iter(
load_dataset(*dataset_config.query,
split=dataset_config.split,
streaming=True,
trust_remote_code=True))
datasets.load_dataset(*dataset_config.query,
split=dataset_config.split,
streaming=True,
trust_remote_code=True))
except ValueError as e:
if "Config" in e:
e += "\n Please add the config name to the dataset config yaml."
elif "split" in e:
e += "\n Please specify supported split in the dataset config yaml."
raise ValueError(e)
raise _create_dataset_load_error(e)
logging.debug("Finished loading HF dataset")
return dataset
def load_dataset_from_local(dataset_config: DatasetConfig):
"""Load dataset from local path.
Args:
dataset_config: A `DatasetConfig` object that defines the dataset to load.
Returns:
Dataset iterator.
Raises:
FileNotFoundError: When local dataset path does not exist.
ValueError: When dataset loading fails due to incorrect dataset config setting.
"""
local_path = Path(dataset_config.local_path)
if not local_path.exists():
raise FileNotFoundError(
f"Local dataset path {local_path} does not exist.")
logging.debug(
f"Loading dataset from local path: path={local_path}, query={dataset_config.query}, split={dataset_config.split}"
)
# If it's a directory we can use the normal loader; otherwise the custom
# loader depends on the file extension
if local_path.is_dir():
try:
dataset = datasets.load_dataset(*dataset_config.query,
split=dataset_config.split,
trust_remote_code=True)
except ValueError as e:
raise _create_dataset_load_error(e)
else:
format_map = {
".json": "json",
".jsonl": "json",
".csv": "csv",
".parquet": "parquet",
}
file_extension = local_path.suffix
dataset_type = format_map.get(file_extension)
if dataset_type is None:
raise ValueError(f"Unsupported file extension: {file_extension}")
try:
dataset = datasets.load_dataset(dataset_type,
data_files=str(local_path),
split=dataset_config.split)
except ValueError as e:
raise _create_dataset_load_error(e)
logging.debug("Finished loading local dataset")
return iter(dataset)
@click.command()
@click.option("--dataset-name",
required=True,
type=str,
help=f"Dataset name in HuggingFace.")
@click.option("--dataset-name", type=str, help="Dataset name in HuggingFace.")
@click.option("--dataset-config-name",
type=str,
default=None,
help=f"Dataset config name in HuggingFace (if exists).")
help="Dataset config name in HuggingFace (if exists).")
@click.option("--dataset-split",
type=str,
required=True,
help=f"Split of the dataset to use.")
help="Split of the dataset to use.")
@click.option("--dataset-input-key",
type=str,
help=f"The dataset dictionary key for input.")
help="The dataset dictionary key for input.")
@click.option("--dataset-image-key",
type=str,
default="image",
help=f"The dataset dictionary key for images.")
help="The dataset dictionary key for images.")
@click.option("--dataset-prompt-key",
type=str,
default=None,
help=f"The dataset dictionary key for prompt (if exists).")
help="The dataset dictionary key for prompt (if exists).")
@click.option(
"--dataset-local-path",
type=str,
default=None,
help=
"The local path to the dataset to be loaded when using an offline cache.")
@click.option(
"--dataset-prompt",
type=str,
default=None,
help=f"The prompt string when there is no prompt key for the dataset.")
help="The prompt string when there is no prompt key for the dataset.")
@click.option("--dataset-output-key",
type=str,
default=None,
help=f"The dataset dictionary key for output (if exists).")
help="The dataset dictionary key for output (if exists).")
@click.option(
"--num-requests",
type=int,
@@ -208,7 +325,7 @@ def dataset(root_args, **kwargs):
modality = None
multimodal_texts = []
multimodal_image_paths = []
for req in load_dataset_from_hf(dataset_config):
for req in load_dataset(dataset_config):
if any(key in req for key in ['image', 'image_1', 'video']):
# multimodal input
if 'video' in req and req['video'] is not None:

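For a single cached file, load_dataset_from_local() maps the file extension to a datasets builder and loads it without any network access. A self-contained sketch of that path; the parquet location below is a hypothetical example:

# Hedged sketch: what load_dataset_from_local() boils down to for a single
# cached file. The parquet path is a hypothetical example location.
from pathlib import Path
import datasets

local_path = Path("/scratch/datasets/Open-Orca/1M-GPT4-Augmented.parquet")  # hypothetical
format_map = {".json": "json", ".jsonl": "json", ".csv": "csv", ".parquet": "parquet"}
builder = format_map.get(local_path.suffix)
if builder is None:
    raise ValueError(f"Unsupported file extension: {local_path.suffix}")
# No network access is needed: the builder reads the file directly.
dataset = datasets.load_dataset(builder, data_files=str(local_path), split="train")
for req in iter(dataset):
    print(list(req.keys()))   # e.g. question / system_prompt / response for this dataset
    break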

@@ -2,12 +2,81 @@ import copy
import logging as _logger
import os as _os
import pathlib as _pl
from typing import List
from dataclasses import dataclass
from typing import List, Optional
import defs.cpp.cpp_common as _cpp
import pytest
@dataclass(frozen=True)
class DatasetConfig:
"""Configuration for a benchmark dataset."""
name: str
local_path: str
split: str
input_key: str
output_key: str
max_input_len: str
num_requests: str
config_name: Optional[str] = None
prompt: Optional[str] = None
prompt_key: Optional[str] = None
@property
def token_file(self) -> str:
return "prepared_" + self.name.replace('/', '_')
def get_dataset_args(self) -> dict[str, str]:
"""Build the dataset args dict for prepare_dataset.py."""
args = {
'--dataset-local-path': self.local_path,
'--dataset-split': self.split,
'--dataset-input-key': self.input_key,
'--dataset-output-key': self.output_key,
}
if self.config_name:
args['--dataset-config-name'] = self.config_name
if self.prompt:
args['--dataset-prompt'] = self.prompt
if self.prompt_key:
args['--dataset-prompt-key'] = self.prompt_key
return args
def get_benchmark_dataset_configs(model_cache: str) -> List[DatasetConfig]:
"""Define dataset configurations for benchmark tests.
To add a new dataset, add a new DatasetConfig entry to this list.
"""
datasets_dir = _pl.Path(model_cache) / "datasets"
return [
DatasetConfig(
name="ccdv/cnn_dailymail",
local_path=str(datasets_dir / "ccdv" / "cnn_dailymail"),
config_name="3.0.0",
split="validation",
input_key="article",
prompt="Summarize the following article:",
output_key="highlights",
max_input_len="256",
num_requests="50",
),
DatasetConfig(
name="Open-Orca/1million-gpt-4",
local_path=str(datasets_dir / "Open-Orca" / "1million-gpt-4" /
"1M-GPT4-Augmented.parquet"),
split="train",
input_key="question",
prompt_key="system_prompt",
output_key="response",
max_input_len="20",
num_requests="10",
),
]
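The DatasetConfig dataclass above replaces the parallel prompt_datasets_args / token_files / max_input_lens / num_reqs lists that run_benchmarks() previously zipped together (removed further down). A short sketch of how one entry expands into prepare_dataset.py arguments, assuming the DatasetConfig dataclass defined above is in scope; the cache path is a hypothetical example:

# Hedged sketch: expanding one DatasetConfig the way run_benchmarks() does.
config = DatasetConfig(
    name="Open-Orca/1million-gpt-4",
    local_path="/scratch/model_cache/datasets/Open-Orca/1million-gpt-4/"
               "1M-GPT4-Augmented.parquet",   # hypothetical cache path
    split="train",
    input_key="question",
    prompt_key="system_prompt",
    output_key="response",
    max_input_len="20",
    num_requests="10",
)
print(config.token_file)   # -> prepared_Open-Orca_1million-gpt-4
dataset_flags = []
for k, v in config.get_dataset_args().items():
    dataset_flags += [k, v]
# dataset_flags now holds the --dataset-* options appended to the prepare_dataset command.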
def run_single_gpu_tests(build_dir: _pl.Path,
test_list: List[str],
run_fp8=False,
@@ -93,27 +162,6 @@ def run_benchmarks(
)
return NotImplementedError
prompt_datasets_args = [{
'--dataset-name': "cnn_dailymail",
'--dataset-config-name': "3.0.0",
'--dataset-split': "validation",
'--dataset-input-key': "article",
'--dataset-prompt': "Summarize the following article:",
'--dataset-output-key': "highlights"
}, {
'--dataset-name': "Open-Orca/1million-gpt-4",
'--dataset-split': "train",
'--dataset-input-key': "question",
'--dataset-prompt-key': "system_prompt",
'--dataset-output-key': "response"
}]
token_files = [
"prepared_" + s['--dataset-name'].replace('/', '_')
for s in prompt_datasets_args
]
max_input_lens = ["256", "20"]
num_reqs = ["50", "10"]
if model_name == "gpt":
model_engine_path = model_engine_dir / "fp16_plugin_packed_paged" / "tp1-pp1-cp1-gpu"
@@ -127,27 +175,25 @@ def run_benchmarks(
# model_engine_path = model_engine_dir / model_spec_obj.get_model_path(
# ) / "tp1-pp1-cp1-gpu"
for prompt_ds_args, tokens_f, len, num_req in zip(prompt_datasets_args,
token_files,
max_input_lens, num_reqs):
for config in get_benchmark_dataset_configs(model_cache):
benchmark_src_dir = _pl.Path("benchmarks") / "cpp"
data_dir = resources_dir / "data"
prepare_dataset = [
python_exe,
str(benchmark_src_dir / "prepare_dataset.py"), "--tokenizer",
str(tokenizer_dir), "--output",
str(data_dir / tokens_f), "dataset", "--max-input-len", len,
"--num-requests", num_req
str(data_dir / config.token_file), "dataset", "--max-input-len",
config.max_input_len, "--num-requests", config.num_requests
]
for k, v in prompt_ds_args.items():
for k, v in config.get_dataset_args().items():
prepare_dataset += [k, v]
# https://nvbugs/4658787
# WAR before the prepare dataset can use offline cached dataset
# Use environment variable to force HuggingFace to use offline cached dataset
offline_env = {**_os.environ, 'HF_DATASETS_OFFLINE': '1'}
_cpp.run_command(prepare_dataset,
cwd=root_dir,
timeout=300,
env={'HF_DATASETS_OFFLINE': '0'})
env=offline_env)
for batching_type in batching_types:
for api_type in api_types:
@@ -157,7 +203,7 @@ def run_benchmarks(
str(model_engine_path), "--type",
str(batching_type), "--api",
str(api_type), "--dataset",
str(data_dir / tokens_f)
str(data_dir / config.token_file)
]
if model_name == "enc_dec":
benchmark += [
@@ -175,12 +221,13 @@ def run_benchmarks(
cwd=root_dir,
timeout=600)
if "IFB" in batching_type and "executor" in api_types:
if "IFB" in batching_types and "executor" in api_types:
# executor streaming test
benchmark = [
str(benchmark_exe_dir / "gptManagerBenchmark"), "--engine_dir",
str(model_engine_path), "--type", "IFB", "--dataset",
str(data_dir / tokens_f), "--api", "executor", "--streaming"
str(data_dir / config.token_file), "--api", "executor",
"--streaming"
]
if model_name == "enc_dec":
benchmark += [
@@ -263,7 +310,6 @@ def test_model(build_google_tests, model, prepare_model, run_model_tests,
run_model_tests(model, run_fp8)
@pytest.mark.skip(reason="https://nvbugs/5601670")
@pytest.mark.parametrize("build_google_tests", ["80", "86", "89", "90"],
indirect=True)
@pytest.mark.parametrize("model", ["bart", "gpt", "t5"])