[https://nvbugs/5630196] [fix] Prevent flaky failures in C++ test_e2e.py by using local cached datasets for benchmarking (#10638)

Signed-off-by: Dom Brown <3886319+DomBrown@users.noreply.github.com>
Dom Brown 2026-01-15 02:39:55 +00:00 committed by GitHub
parent 73d1840c12
commit 94c7b69048
GPG Key ID: B5690EEEBB952194
2 changed files with 229 additions and 66 deletions
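The fix teaches benchmarks/cpp/prepare_dataset.py to read benchmark datasets from a pre-downloaded local copy (via the new --dataset-local-path option) instead of fetching them from HuggingFace during the test, and the C++ e2e benchmark harness now runs it with HF_DATASETS_OFFLINE=1. A minimal sketch of the resulting invocation for the cnn_dailymail configuration; the cache root and tokenizer directory below are hypothetical examples, not paths from this commit:

# Hedged sketch only: the /scratch paths are hypothetical stand-ins for the
# real model_cache and tokenizer locations used by the test harness.
import os
import subprocess

cache_root = "/scratch/model_cache/datasets"          # hypothetical cache location
prepare_cmd = [
    "python3", "benchmarks/cpp/prepare_dataset.py",
    "--tokenizer", "/scratch/models/gpt2",            # hypothetical tokenizer dir
    "--output", "prepared_ccdv_cnn_dailymail",
    "dataset",
    "--max-input-len", "256", "--num-requests", "50",
    "--dataset-local-path", f"{cache_root}/ccdv/cnn_dailymail",
    "--dataset-config-name", "3.0.0",
    "--dataset-split", "validation",
    "--dataset-input-key", "article",
    "--dataset-prompt", "Summarize the following article:",
    "--dataset-output-key", "highlights",
]
# HF_DATASETS_OFFLINE=1 keeps the datasets library from touching the network.
subprocess.run(prepare_cmd, env={**os.environ, "HF_DATASETS_OFFLINE": "1"}, check=True)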


@@ -2,10 +2,11 @@ import logging
import random
import re
import tempfile
from pathlib import Path
from typing import Optional
import click
from datasets import load_dataset
import datasets
from PIL import Image
from pydantic import BaseModel, model_validator
from utils.utils import (get_norm_dist_lengths, multimodal_dataset_dump,
@@ -29,7 +30,7 @@ def validate_output_len_dist(ctx, param, value):
class DatasetConfig(BaseModel):
"""Dataset configurations."""
"""Name of the dataset on HuggingFace."""
name: str
name: Optional[str] = None
"""Config name of the dataset if existing."""
config_name: Optional[str] = None
"""Split of the dataset. Typical values: train, validation, test. Setting to None will include all splits."""
@@ -44,6 +45,8 @@ class DatasetConfig(BaseModel):
prompt: Optional[str] = None
"""The dataset dictionary key used to derive the output sequence length. Set to None if the dataset does not have a key for output."""
output_key: Optional[str]
"""The local path to the dataset to be loaded when using a local cache."""
local_path: Optional[str] = None
@model_validator(mode='after')
def check_prompt(self) -> 'DatasetConfig':
@@ -54,19 +57,40 @@ class DatasetConfig(BaseModel):
raise AssertionError("Either --prompt-key or --prompt must be set.")
return self
@model_validator(mode='after')
def check_name_and_local_path(self) -> 'DatasetConfig':
if self.name and self.local_path:
raise AssertionError(
"--dataset-name and --dataset-local-path cannot be set at the same time."
)
if (not self.name) and (not self.local_path):
raise AssertionError(
"Either --dataset-name or --dataset-local-path must be set.")
return self
@property
def query(self):
"""Generate the query for HuggingFace `datasets.load_dataset()`"""
first_arg = self.local_path if self.local_path else self.name
if self.config_name:
return [self.name, self.config_name]
return [first_arg, self.config_name]
else:
return [self.name]
return [first_arg]
@property
def display_name(self) -> str:
"""Returns a human-readable identifier for error messages."""
# model_validator ensures exactly one of name or local_path is set
if self.name is not None:
return self.name
return self.local_path
def get_prompt(self, req):
"""Get the prompt sentence from the given request."""
if self.prompt_key:
assert self.prompt_key in req, (
f"Dataset {self.name} does not have key '{self.prompt_key}'. "
f"Dataset {self.display_name} does not have key '{self.prompt_key}'. "
"Please set --prompt-key to one of the available keys: "
f"{req.keys()}")
return req[self.prompt_key]
@@ -76,7 +100,7 @@ class DatasetConfig(BaseModel):
def get_input(self, req):
"""Get the input sentence from the given request."""
assert self.input_key in req, (
f"Dataset {self.name} does not have key '{self.input_key}'. "
f"Dataset {self.display_name} does not have key '{self.input_key}'. "
"Please set --input-key to one of the available keys: "
f"{req.keys()}")
return req[self.input_key]
@@ -86,7 +110,7 @@ class DatasetConfig(BaseModel):
image_keys = [self.image_key
] + [f"{self.image_key}_{i}" for i in range(1, 8)]
assert any(key in req for key in image_keys), (
f"Dataset {self.name} does not have key '{self.image_key}'. "
f"Dataset {self.display_name} does not have key '{self.image_key}'. "
"Please set --dataset-image-key to one of the available keys: "
f"{req.keys()}")
images = []
@@ -101,16 +125,47 @@ class DatasetConfig(BaseModel):
raise RuntimeError(
"--output-key is not set. Please either:\n"
"1. Define output length through --output-len-dist.\n"
f"2. If the dataset {self.name} has key for golden output and "
f"2. If the dataset {self.display_name} has key for golden output and "
"you wish to set output length to the length of the golden "
"output, set --output-key.")
assert self.output_key in req, (
f"Dataset {self.name} does not have key '{self.output_key}'. "
f"Dataset {self.display_name} does not have key '{self.output_key}'. "
"Please set --output-key to one of the available keys: "
f"{req.keys()}")
return req[self.output_key]
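The updated query property forwards whichever of name or local_path is set as the first positional argument to datasets.load_dataset(). A standalone restatement of that dispatch; build_query is a hypothetical helper used only for illustration, not part of this commit:

# Hedged sketch: build_query is a hypothetical stand-in for DatasetConfig.query.
from typing import List, Optional

def build_query(name: Optional[str], local_path: Optional[str],
                config_name: Optional[str]) -> List[str]:
    # The new model validator guarantees exactly one of name / local_path is set.
    first_arg = local_path if local_path else name
    return [first_arg, config_name] if config_name else [first_arg]

assert build_query("ccdv/cnn_dailymail", None, "3.0.0") == \
    ["ccdv/cnn_dailymail", "3.0.0"]
assert build_query(None, "/cache/ccdv/cnn_dailymail", "3.0.0") == \
    ["/cache/ccdv/cnn_dailymail", "3.0.0"]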
def _create_dataset_load_error(e: ValueError) -> ValueError:
"""Create a more informative ValueError from a dataset loading error.
Args:
e: The original ValueError from datasets.load_dataset().
Returns:
A new ValueError with additional context.
"""
error_msg = str(e)
if "Config" in error_msg:
error_msg += "\n Please add the config name to the dataset config yaml."
elif "split" in error_msg:
error_msg += "\n Please specify supported split in the dataset config yaml."
return ValueError(error_msg)
def load_dataset(dataset_config: DatasetConfig):
"""Load dataset from local path or HuggingFace.
Args:
dataset_config: A `DatasetConfig` object that defines the dataset to load.
Returns:
Dataset iterator.
Raises:
ValueError: When dataset loading fails due to incorrect dataset config setting.
"""
if dataset_config.local_path:
return load_dataset_from_local(dataset_config)
else:
return load_dataset_from_hf(dataset_config)
def load_dataset_from_hf(dataset_config: DatasetConfig):
"""Load dataset from HuggingFace.
@@ -121,55 +176,117 @@ def load_dataset_from_hf(dataset_config: DatasetConfig):
Raises:
ValueError: When dataset loading fails due to incorrect dataset config setting.
"""
logging.debug(
f"Loading dataset from HF: query={dataset_config.query}, split={dataset_config.split}"
)
try:
dataset = iter(
load_dataset(*dataset_config.query,
split=dataset_config.split,
streaming=True,
trust_remote_code=True))
datasets.load_dataset(*dataset_config.query,
split=dataset_config.split,
streaming=True,
trust_remote_code=True))
except ValueError as e:
if "Config" in e:
e += "\n Please add the config name to the dataset config yaml."
elif "split" in e:
e += "\n Please specify supported split in the dataset config yaml."
raise ValueError(e)
raise _create_dataset_load_error(e)
logging.debug("Finished loading HF dataset")
return dataset
def load_dataset_from_local(dataset_config: DatasetConfig):
"""Load dataset from local path.
Args:
dataset_config: A `DatasetConfig` object that defines the dataset to load.
Returns:
Dataset iterator.
Raises:
FileNotFoundError: When local dataset path does not exist.
ValueError: When dataset loading fails due to incorrect dataset config setting.
"""
local_path = Path(dataset_config.local_path)
if not local_path.exists():
raise FileNotFoundError(
f"Local dataset path {local_path} does not exist.")
logging.debug(
f"Loading dataset from local path: path={local_path}, query={dataset_config.query}, split={dataset_config.split}"
)
# If it's a directory we can use the normal loader; otherwise the custom
# loader depends on the file extension
if local_path.is_dir():
try:
dataset = datasets.load_dataset(*dataset_config.query,
split=dataset_config.split,
trust_remote_code=True)
except ValueError as e:
raise _create_dataset_load_error(e)
else:
format_map = {
".json": "json",
".jsonl": "json",
".csv": "csv",
".parquet": "parquet",
}
file_extension = local_path.suffix
dataset_type = format_map.get(file_extension)
if dataset_type is None:
raise ValueError(f"Unsupported file extension: {file_extension}")
try:
dataset = datasets.load_dataset(dataset_type,
data_files=str(local_path),
split=dataset_config.split)
except ValueError as e:
raise _create_dataset_load_error(e)
logging.debug("Finished loading local dataset")
return iter(dataset)
@click.command()
@click.option("--dataset-name",
required=True,
type=str,
help=f"Dataset name in HuggingFace.")
@click.option("--dataset-name", type=str, help="Dataset name in HuggingFace.")
@click.option("--dataset-config-name",
type=str,
default=None,
help=f"Dataset config name in HuggingFace (if exists).")
help="Dataset config name in HuggingFace (if exists).")
@click.option("--dataset-split",
type=str,
required=True,
help=f"Split of the dataset to use.")
help="Split of the dataset to use.")
@click.option("--dataset-input-key",
type=str,
help=f"The dataset dictionary key for input.")
help="The dataset dictionary key for input.")
@click.option("--dataset-image-key",
type=str,
default="image",
help=f"The dataset dictionary key for images.")
help="The dataset dictionary key for images.")
@click.option("--dataset-prompt-key",
type=str,
default=None,
help=f"The dataset dictionary key for prompt (if exists).")
help="The dataset dictionary key for prompt (if exists).")
@click.option(
"--dataset-local-path",
type=str,
default=None,
help=
"The local path to the dataset to be loaded when using an offline cache.")
@click.option(
"--dataset-prompt",
type=str,
default=None,
help=f"The prompt string when there is no prompt key for the dataset.")
help="The prompt string when there is no prompt key for the dataset.")
@click.option("--dataset-output-key",
type=str,
default=None,
help=f"The dataset dictionary key for output (if exists).")
help="The dataset dictionary key for output (if exists).")
@click.option(
"--num-requests",
type=int,
@@ -208,7 +325,7 @@ def dataset(root_args, **kwargs):
modality = None
multimodal_texts = []
multimodal_image_paths = []
for req in load_dataset_from_hf(dataset_config):
for req in load_dataset(dataset_config):
if any(key in req for key in ['image', 'image_1', 'video']):
# multimodal input
if 'video' in req and req['video'] is not None:

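For a single cached file, load_dataset_from_local() maps the file extension to a datasets builder and loads it without any network access. A self-contained sketch of that path; the parquet location below is a hypothetical example:

# Hedged sketch: what load_dataset_from_local() boils down to for a single
# cached file. The parquet path is a hypothetical example location.
from pathlib import Path
import datasets

local_path = Path("/scratch/datasets/Open-Orca/1M-GPT4-Augmented.parquet")  # hypothetical
format_map = {".json": "json", ".jsonl": "json", ".csv": "csv", ".parquet": "parquet"}
builder = format_map.get(local_path.suffix)
if builder is None:
    raise ValueError(f"Unsupported file extension: {local_path.suffix}")
# No network access is needed: the builder reads the file directly.
dataset = datasets.load_dataset(builder, data_files=str(local_path), split="train")
for req in iter(dataset):
    print(list(req.keys()))   # e.g. question / system_prompt / response for this dataset
    break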

@@ -2,12 +2,81 @@ import copy
import logging as _logger
import os as _os
import pathlib as _pl
from typing import List
from dataclasses import dataclass
from typing import List, Optional
import defs.cpp.cpp_common as _cpp
import pytest
@dataclass(frozen=True)
class DatasetConfig:
"""Configuration for a benchmark dataset."""
name: str
local_path: str
split: str
input_key: str
output_key: str
max_input_len: str
num_requests: str
config_name: Optional[str] = None
prompt: Optional[str] = None
prompt_key: Optional[str] = None
@property
def token_file(self) -> str:
return "prepared_" + self.name.replace('/', '_')
def get_dataset_args(self) -> dict[str, str]:
"""Build the dataset args dict for prepare_dataset.py."""
args = {
'--dataset-local-path': self.local_path,
'--dataset-split': self.split,
'--dataset-input-key': self.input_key,
'--dataset-output-key': self.output_key,
}
if self.config_name:
args['--dataset-config-name'] = self.config_name
if self.prompt:
args['--dataset-prompt'] = self.prompt
if self.prompt_key:
args['--dataset-prompt-key'] = self.prompt_key
return args
def get_benchmark_dataset_configs(model_cache: str) -> List[DatasetConfig]:
"""Define dataset configurations for benchmark tests.
To add a new dataset, add a new DatasetConfig entry to this list.
"""
datasets_dir = _pl.Path(model_cache) / "datasets"
return [
DatasetConfig(
name="ccdv/cnn_dailymail",
local_path=str(datasets_dir / "ccdv" / "cnn_dailymail"),
config_name="3.0.0",
split="validation",
input_key="article",
prompt="Summarize the following article:",
output_key="highlights",
max_input_len="256",
num_requests="50",
),
DatasetConfig(
name="Open-Orca/1million-gpt-4",
local_path=str(datasets_dir / "Open-Orca" / "1million-gpt-4" /
"1M-GPT4-Augmented.parquet"),
split="train",
input_key="question",
prompt_key="system_prompt",
output_key="response",
max_input_len="20",
num_requests="10",
),
]
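The DatasetConfig dataclass above replaces the parallel prompt_datasets_args / token_files / max_input_lens / num_reqs lists that run_benchmarks() previously zipped together (removed further down). A short sketch of how one entry expands into prepare_dataset.py arguments, assuming the DatasetConfig dataclass defined above is in scope; the cache path is a hypothetical example:

# Hedged sketch: expanding one DatasetConfig the way run_benchmarks() does.
config = DatasetConfig(
    name="Open-Orca/1million-gpt-4",
    local_path="/scratch/model_cache/datasets/Open-Orca/1million-gpt-4/"
               "1M-GPT4-Augmented.parquet",   # hypothetical cache path
    split="train",
    input_key="question",
    prompt_key="system_prompt",
    output_key="response",
    max_input_len="20",
    num_requests="10",
)
print(config.token_file)   # -> prepared_Open-Orca_1million-gpt-4
dataset_flags = []
for k, v in config.get_dataset_args().items():
    dataset_flags += [k, v]
# dataset_flags now holds the --dataset-* options appended to the prepare_dataset command.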
def run_single_gpu_tests(build_dir: _pl.Path,
test_list: List[str],
run_fp8=False,
@@ -93,27 +162,6 @@ def run_benchmarks(
)
return NotImplementedError
prompt_datasets_args = [{
'--dataset-name': "cnn_dailymail",
'--dataset-config-name': "3.0.0",
'--dataset-split': "validation",
'--dataset-input-key': "article",
'--dataset-prompt': "Summarize the following article:",
'--dataset-output-key': "highlights"
}, {
'--dataset-name': "Open-Orca/1million-gpt-4",
'--dataset-split': "train",
'--dataset-input-key': "question",
'--dataset-prompt-key': "system_prompt",
'--dataset-output-key': "response"
}]
token_files = [
"prepared_" + s['--dataset-name'].replace('/', '_')
for s in prompt_datasets_args
]
max_input_lens = ["256", "20"]
num_reqs = ["50", "10"]
if model_name == "gpt":
model_engine_path = model_engine_dir / "fp16_plugin_packed_paged" / "tp1-pp1-cp1-gpu"
@@ -127,27 +175,25 @@ def run_benchmarks(
# model_engine_path = model_engine_dir / model_spec_obj.get_model_path(
# ) / "tp1-pp1-cp1-gpu"
for prompt_ds_args, tokens_f, len, num_req in zip(prompt_datasets_args,
token_files,
max_input_lens, num_reqs):
for config in get_benchmark_dataset_configs(model_cache):
benchmark_src_dir = _pl.Path("benchmarks") / "cpp"
data_dir = resources_dir / "data"
prepare_dataset = [
python_exe,
str(benchmark_src_dir / "prepare_dataset.py"), "--tokenizer",
str(tokenizer_dir), "--output",
str(data_dir / tokens_f), "dataset", "--max-input-len", len,
"--num-requests", num_req
str(data_dir / config.token_file), "dataset", "--max-input-len",
config.max_input_len, "--num-requests", config.num_requests
]
for k, v in prompt_ds_args.items():
for k, v in config.get_dataset_args().items():
prepare_dataset += [k, v]
# https://nvbugs/4658787
# WAR before the prepare dataset can use offline cached dataset
# Use environment variable to force HuggingFace to use offline cached dataset
offline_env = {**_os.environ, 'HF_DATASETS_OFFLINE': '1'}
_cpp.run_command(prepare_dataset,
cwd=root_dir,
timeout=300,
env={'HF_DATASETS_OFFLINE': '0'})
env=offline_env)
for batching_type in batching_types:
for api_type in api_types:
@@ -157,7 +203,7 @@ def run_benchmarks(
str(model_engine_path), "--type",
str(batching_type), "--api",
str(api_type), "--dataset",
str(data_dir / tokens_f)
str(data_dir / config.token_file)
]
if model_name == "enc_dec":
benchmark += [
@@ -175,12 +221,13 @@ def run_benchmarks(
cwd=root_dir,
timeout=600)
if "IFB" in batching_type and "executor" in api_types:
if "IFB" in batching_types and "executor" in api_types:
# executor streaming test
benchmark = [
str(benchmark_exe_dir / "gptManagerBenchmark"), "--engine_dir",
str(model_engine_path), "--type", "IFB", "--dataset",
str(data_dir / tokens_f), "--api", "executor", "--streaming"
str(data_dir / config.token_file), "--api", "executor",
"--streaming"
]
if model_name == "enc_dec":
benchmark += [
@@ -263,7 +310,6 @@ def test_model(build_google_tests, model, prepare_model, run_model_tests,
run_model_tests(model, run_fp8)
@pytest.mark.skip(reason="https://nvbugs/5601670")
@pytest.mark.parametrize("build_google_tests", ["80", "86", "89", "90"],
indirect=True)
@pytest.mark.parametrize("model", ["bart", "gpt", "t5"])