diff --git a/benchmarks/cpp/utils/prepare_real_data.py b/benchmarks/cpp/utils/prepare_real_data.py index 617f255b4f..e15ef9267e 100644 --- a/benchmarks/cpp/utils/prepare_real_data.py +++ b/benchmarks/cpp/utils/prepare_real_data.py @@ -1,12 +1,16 @@ import logging import random import re +import tempfile from typing import Optional import click from datasets import load_dataset +from PIL import Image from pydantic import BaseModel, model_validator -from utils.utils import dataset_dump, get_norm_dist_lengths, print_dataset +from utils.utils import (get_norm_dist_lengths, multimodal_dataset_dump, + print_multimodal_dataset, print_text_dataset, + text_dataset_dump) def validate_output_len_dist(ctx, param, value): @@ -31,8 +35,10 @@ class DatasetConfig(BaseModel): """Split of the dataset. Typical values: train, validation, test. Setting to None will include all splits.""" split: Optional[str] """The dataset dictionary used for the input sentence.""" - input_key: str + input_key: Optional[str] = None """The dataset dictionary key used for the prompt of the input sentence. Must not be set when prompt is set.""" + image_key: Optional[str] = None + """The dataset dictionary key used for the images.""" prompt_key: Optional[str] = None """The prompt sentence to be added to the input sentence. Must not be set when prompt_key is set.""" prompt: Optional[str] = None @@ -75,6 +81,20 @@ class DatasetConfig(BaseModel): f"{req.keys()}") return req[self.input_key] + def get_images(self, req): + """Get the images from the given request.""" + image_keys = [self.image_key + ] + [f"{self.image_key}_{i}" for i in range(1, 8)] + assert any(key in req for key in image_keys), ( + f"Dataset {self.name} does not have key '{self.image_key}'. " + "Please set --dataset-image-key to one of the available keys: " + f"{req.keys()}") + images = [] + for key in image_keys: + if key in req and req[key] is not None: + images.append(req[key]) + return images + def get_output(self, req): """Get the output sentence from the given request.""" if self.output_key is None: @@ -105,7 +125,8 @@ def load_dataset_from_hf(dataset_config: DatasetConfig): dataset = iter( load_dataset(*dataset_config.query, split=dataset_config.split, - streaming=True)) + streaming=True, + trust_remote_code=True)) except ValueError as e: if "Config" in e: e += "\n Please add the config name to the dataset config yaml." 
@@ -130,9 +151,12 @@ def load_dataset_from_hf(dataset_config: DatasetConfig): required=True, help=f"Split of the dataset to use.") @click.option("--dataset-input-key", - required=True, type=str, help=f"The dataset dictionary key for input.") +@click.option("--dataset-image-key", + type=str, + default="image", + help=f"The dataset dictionary key for images.") @click.option("--dataset-prompt-key", type=str, default=None, @@ -181,21 +205,54 @@ def dataset(root_args, **kwargs): output_lens = [] task_ids = [] req_cnt = 0 + modality = None + multimodal_texts = [] + multimodal_image_paths = [] for req in load_dataset_from_hf(dataset_config): - # input - prompt = dataset_config.get_prompt( - req) + ' ' + dataset_config.get_input(req) - logging.debug(f"Input sequence: {prompt}") - line = root_args.tokenizer.encode(prompt) - if kwargs['max_input_len'] and len(line) > kwargs['max_input_len']: - continue - input_ids.append(line) - input_lens.append(len(line)) + if any(key in req for key in ['image', 'image_1', 'video']): + # multimodal input + if 'video' in req and req['video'] is not None: + raise NotImplementedError("Video inputs are not supported yet.") + assert kwargs['output_len_dist'] is not None, ( + "Output length distribution must be set for multimodal requests." + ) + modality = 'image' + text = dataset_config.get_prompt(req) + images = dataset_config.get_images(req) + image_paths = [] + for image in images: + if image is not None: + if isinstance(image, str): + image_paths.append(image) + elif isinstance(image, Image.Image): + with tempfile.NamedTemporaryFile( + suffix=".jpg", delete=False) as tmp_file: + logging.debug(f"Saving image to {tmp_file.name}") + image = image.convert("RGB") + image.save(tmp_file, "JPEG") + filepath = tmp_file.name + image_paths.append(filepath) + else: + raise ValueError(f"Unsupported image type: {type(image)}") + multimodal_texts.append(text) + multimodal_image_paths.append(image_paths) + else: + # text input + prompt = dataset_config.get_prompt( + req) + ' ' + dataset_config.get_input(req) + logging.debug(f"Input sequence: {prompt}") + line = root_args.tokenizer.encode(prompt) + if kwargs['max_input_len'] and len(line) > kwargs['max_input_len']: + continue + input_ids.append(line) + input_lens.append(len(line)) - # output if fetch from golden - if kwargs['output_len_dist'] is None: - output_lens.append( - len(root_args.tokenizer.encode(dataset_config.get_output(req)))) + # output if fetch from golden + if kwargs['output_len_dist'] is None: + output_lens.append( + len( + root_args.tokenizer.encode( + dataset_config.get_output(req)))) # lora task id task_id = root_args.task_id @@ -208,30 +265,53 @@ def dataset(root_args, **kwargs): if kwargs['num_requests'] and req_cnt >= kwargs['num_requests']: break - if kwargs['num_requests'] and len(input_ids) < kwargs['num_requests']: + if kwargs['num_requests'] and (len(input_ids) if modality is None else len( + multimodal_texts)) < kwargs['num_requests']: logging.warning( - "Number of requests is smaller than the num-requests user set.") + f"Number of requests={len(input_ids) if modality is None else len(multimodal_texts)} is" + f" smaller than the num-requests user set={kwargs['num_requests']}." 
+ ) # output if randomized if kwargs['output_len_dist'] is not None: osl_mean, osl_stdev = kwargs['output_len_dist'] - output_lens = get_norm_dist_lengths(osl_mean, osl_stdev, len(input_ids), - root_args.random_seed) - + output_lens = get_norm_dist_lengths( + osl_mean, osl_stdev, + len(input_ids) if modality is None else len(multimodal_texts), + root_args.random_seed) logging.debug(f"Input lengths: {[len(i) for i in input_ids]}") logging.debug(f"Output lengths: {output_lens}") + if modality is not None: + logging.debug(f"Modality: {modality}") - if not root_args.std_out: - dataset_dump( - input_lens, input_ids, output_lens, task_ids, { - "workload_type": "dataset", - "tokenizer": root_args.tokenizer.__class__.__name__, - "num_requests": len(input_ids), - "max_input_len": max(input_lens), - "max_output_len": max(output_lens) - }, root_args.output) + if modality is not None: + if not root_args.std_out: + multimodal_dataset_dump( + multimodal_texts, multimodal_image_paths, output_lens, task_ids, + { + "workload_type": "dataset", + "tokenizer": root_args.tokenizer.__class__.__name__, + "num_requests": len(task_ids), + "max_output_len": max(output_lens) + }, root_args.output) + else: + print_multimodal_dataset( + multimodal_texts, + multimodal_image_paths, + output_lens, + ) else: - print_dataset( - input_ids, - output_lens, - ) + if not root_args.std_out: + text_dataset_dump( + input_lens, input_ids, output_lens, task_ids, { + "workload_type": "dataset", + "tokenizer": root_args.tokenizer.__class__.__name__, + "num_requests": len(input_ids), + "max_input_len": max(input_lens), + "max_output_len": max(output_lens) + }, root_args.output) + else: + print_text_dataset( + input_ids, + output_lens, + ) diff --git a/benchmarks/cpp/utils/prepare_synthetic_data.py b/benchmarks/cpp/utils/prepare_synthetic_data.py index 4af9c0c126..721b3649a2 100644 --- a/benchmarks/cpp/utils/prepare_synthetic_data.py +++ b/benchmarks/cpp/utils/prepare_synthetic_data.py @@ -1,8 +1,9 @@ import random import click -from utils.utils import (dataset_dump, gen_random_tokens, get_norm_dist_lengths, - get_unif_dist_lengths, print_dataset) +from utils.utils import (gen_random_tokens, get_norm_dist_lengths, + get_unif_dist_lengths, print_text_dataset, + text_dataset_dump) @click.command() @@ -57,7 +58,7 @@ def token_norm_dist(root_args, **kwargs): task_ids = [random.randint(min_id, max_id) for _ in range(num_reqs)] if not root_args.std_out: - dataset_dump( + text_dataset_dump( input_lens, input_ids, output_lens, task_ids, { "workload_type": "token-norm-dist", "input_mean": kwargs['input_mean'], @@ -70,7 +71,7 @@ def token_norm_dist(root_args, **kwargs): "max_output_len": max_output_len }, root_args.output) else: - print_dataset( + print_text_dataset( input_ids, output_lens, ) @@ -127,7 +128,7 @@ def token_unif_dist(root_args, **kwargs): task_ids = [random.randint(min_id, max_id) for _ in range(num_reqs)] if not root_args.std_out: - dataset_dump( + text_dataset_dump( input_lens, input_ids, output_lens, task_ids, { "workload_type": "token-unif-dist", "input_min": kwargs['input_min'], @@ -140,7 +141,7 @@ def token_unif_dist(root_args, **kwargs): "max_output_len": max_output_len }, root_args.output) else: - print_dataset( + print_text_dataset( input_ids, output_lens, ) diff --git a/benchmarks/cpp/utils/utils.py b/benchmarks/cpp/utils/utils.py index 9735315ba9..f0fcf4403d 100644 --- a/benchmarks/cpp/utils/utils.py +++ b/benchmarks/cpp/utils/utils.py @@ -2,22 +2,29 @@ import json import math import os import random -from typing import 
List +from typing import List, Union import numpy as np from pydantic import BaseModel -class Sample(BaseModel): +class TextSample(BaseModel): input_len: int input_ids: List[int] output_len: int task_id: int +class MultimodalSample(BaseModel): + task_id: int + prompt: str + media_paths: List[str] + output_len: int + + class Workload(BaseModel): metadata: dict - samples: List[Sample] = [] + samples: List[Union[TextSample, MultimodalSample]] = [] def __init__(self, **kwargs) -> None: super().__init__(**kwargs) @@ -33,22 +40,37 @@ class Workload(BaseModel): self.metadata.setdefault('workload_name', workload_name) -def dataset_dump(input_lens, input_ids, output_lens, task_ids, metadata, - output_file): +def text_dataset_dump(input_lens, input_ids, output_lens, task_ids, metadata, + output_file): samples = [] for i in range(len(input_ids)): samples.append( - Sample(input_len=input_lens[i], - input_ids=input_ids[i], - output_len=output_lens[i], - task_id=task_ids[i])) + TextSample(input_len=input_lens[i], + input_ids=input_ids[i], + output_len=output_lens[i], + task_id=task_ids[i])) workload = Workload(metadata=metadata, samples=samples) os.makedirs(os.path.dirname(output_file), exist_ok=True) with open(output_file, 'w') as f: json.dump(workload.model_dump(), f) -def print_dataset(input_ids, output_lens): +def multimodal_dataset_dump(multimodal_texts, multimodal_image_paths, + output_lens, task_ids, metadata, output_file): + samples = [] + for i in range(len(multimodal_texts)): + samples.append( + MultimodalSample(task_id=task_ids[i], + prompt=multimodal_texts[i], + media_paths=multimodal_image_paths[i], + output_len=output_lens[i])) + workload = Workload(metadata=metadata, samples=samples) + os.makedirs(os.path.dirname(output_file), exist_ok=True) + with open(output_file, 'w') as f: + json.dump(workload.model_dump(), f) + + +def print_text_dataset(input_ids, output_lens): for i, input_tokens in enumerate(input_ids): d = { "task_id": i, @@ -58,6 +80,19 @@ def print_dataset(input_ids, output_lens): print(json.dumps(d, separators=(',', ':'), ensure_ascii=False)) +def print_multimodal_dataset(multimodal_texts, multimodal_image_paths, + output_lens): + for i, (text, image_paths) in enumerate( + zip(multimodal_texts, multimodal_image_paths)): + d = { + "task_id": i, + "prompt": text, + "media_paths": image_paths, + "output_tokens": output_lens[i] + } + print(json.dumps(d, separators=(',', ':'), ensure_ascii=False)) + + def get_list_of_delays(delay_dist, mean_time_bet_reqs, num_reqs, random_seed): if delay_dist == "constant": delays = [mean_time_bet_reqs] * num_reqs diff --git a/docs/source/performance/perf-benchmarking.md b/docs/source/performance/perf-benchmarking.md index b2c0b26237..0e525c12c3 100644 --- a/docs/source/performance/perf-benchmarking.md +++ b/docs/source/performance/perf-benchmarking.md @@ -475,6 +475,115 @@ Total Latency (ms): 18563.6825 ``` +#### Running multi-modal models in the PyTorch Workflow + +To benchmark multi-modal models with the PyTorch workflow, you can follow a similar approach to the one above. + +First, prepare the dataset: +``` +python ./benchmarks/cpp/prepare_dataset.py \ + --tokenizer Qwen/Qwen2-VL-2B-Instruct \ + --stdout \ + dataset \ + --dataset-name lmms-lab/MMMU \ + --dataset-split test \ + --dataset-image-key image \ + --dataset-prompt-key question \ + --num-requests 10 \ + --output-len-dist 128,5 > mm_data.jsonl +``` +It will download the media files to the `/tmp` directory and prepare the dataset with their paths. 
Note that the `prompt` fields contain raw text rather than tokenized ids, because +the `prompt` and the media (image/video) are processed together by the model's multimodal preprocessor. + +Sample dataset for multimodal: +``` +{"task_id":0,"prompt":"Brahma Industries sells vinyl replacement windows to home improvement retailers nationwide. The national sales manager believes that if they invest an additional $25,000 in advertising, they would increase sales volume by 10,000 units. What is the total contribution margin?","media_paths":["/tmp/tmp9so41y3r.jpg"],"output_tokens":126} +{"task_id":1,"prompt":"Let us compute for the missing amounts under work in process inventory, what is the cost of goods manufactured? ","media_paths":["/tmp/tmpowsrb_f4.jpg"],"output_tokens":119} +{"task_id":2,"prompt":"Tsuji is reviewing the price of a 3-month Japanese yen/U.S. dollar currency futures contract, using the currency and interest rate data shown below. Because the 3-month Japanese interest rate has just increased to .50%, Itsuji recognizes that an arbitrage opportunity exists nd decides to borrow $1 million U.S. dollars to purchase Japanese yen. Calculate the yen arbitrage profit from Itsuji's strategy, using the following data: ","media_paths":["/tmp/tmpxhdvasex.jpg"],"output_tokens":126} +... +``` + +Run the benchmark: +``` +trtllm-bench --model Qwen/Qwen2-VL-2B-Instruct \ + throughput \ + --dataset mm_data.jsonl \ + --backend pytorch \ + --num_requests 10 \ + --max_batch_size 4 \ + --modality image +``` + + +Sample output: +``` +=========================================================== += REQUEST DETAILS +=========================================================== +Number of requests: 10 +Number of concurrent requests: 5.3019 +Average Input Length (tokens): 411.6000 +Average Output Length (tokens): 128.7000 +=========================================================== += WORLD + RUNTIME INFORMATION +=========================================================== +TP Size: 1 +PP Size: 1 +EP Size: None +Max Runtime Batch Size: 4 +Max Runtime Tokens: 12288 +Scheduling Policy: GUARANTEED_NO_EVICT +KV Memory Percentage: 90.00% +Issue Rate (req/sec): 1.4117E+17 + +=========================================================== += PERFORMANCE OVERVIEW +=========================================================== +Request Throughput (req/sec): 1.4439 +Total Output Throughput (tokens/sec): 185.8351 +Per User Output Throughput (tokens/sec/user): 38.1959 +Per GPU Output Throughput (tokens/sec/gpu): 185.8351 +Total Token Throughput (tokens/sec): 780.1607 +Total Latency (ms): 6925.4963 +Average request latency (ms): 3671.8441 + +-- Request Latency Breakdown (ms) ----------------------- + +[Latency] P50 : 3936.3022 +[Latency] P90 : 5514.4701 +[Latency] P95 : 5514.4701 +[Latency] P99 : 5514.4701 +[Latency] MINIMUM: 2397.1047 +[Latency] MAXIMUM: 5514.4701 +[Latency] AVERAGE: 3671.8441 + +=========================================================== += DATASET DETAILS +=========================================================== +Dataset Path: /workspaces/tensorrt_llm/mm_data.jsonl +Number of Sequences: 10 + +-- Percentiles statistics --------------------------------- + + Input Output Seq. 
Length +----------------------------------------------------------- +MIN: 167.0000 119.0000 300.0000 +MAX: 1059.0000 137.0000 1178.0000 +AVG: 411.6000 128.7000 540.3000 +P50: 299.0000 128.0000 427.0000 +P90: 1059.0000 137.0000 1178.0000 +P95: 1059.0000 137.0000 1178.0000 +P99: 1059.0000 137.0000 1178.0000 +=========================================================== +``` + +**Notes and Limitations**: +- Only image datasets are supported for now. +- `--output-len-dist` is a required argument for multimodal datasets. +- Tokenizer is unused during the prepare step but it is still a required argument. +- Since the images are converted to tokens when the model is run, `trtllm-bench` uses a default large value for the maximum input sequence length when setting up the execution settings. + You can also modify the behavior by specifying a different value with the flag `--max_input_len` that suits your use-case. + #### Quantization in the PyTorch Flow In order to run a quantized run with `trtllm-bench` utilizing the PyTorch flow, you will need to use a pre-quantized diff --git a/examples/bert/utils.py b/examples/bert/utils.py index 566101bfa8..105fd00c0b 100644 --- a/examples/bert/utils.py +++ b/examples/bert/utils.py @@ -17,7 +17,7 @@ def prepare_text_inputs(model_name, batch_size=8): f"HF_DATASETS_OFFLINE inside function: {datasets.config.HF_DATASETS_OFFLINE}" ) if model_name == "BertForQuestionAnswering" or model_name == "RobertaForQuestionAnswering": - squad_dataset = load_dataset("squad_v2") + squad_dataset = load_dataset("squad_v2", trust_remote_code=True) val_dataset = squad_dataset["validation"] samples = val_dataset.select(range(batch_size)) @@ -27,7 +27,8 @@ def prepare_text_inputs(model_name, batch_size=8): } return qa_real_test_inputs elif model_name == "BertForSequenceClassification" or model_name == "RobertaForSequenceClassification": - yelp_dataset = load_dataset("fancyzhx/yelp_polarity") + yelp_dataset = load_dataset("fancyzhx/yelp_polarity", + trust_remote_code=True) val_dataset = yelp_dataset["test"] samples = val_dataset.select(range(batch_size)) diff --git a/examples/commandr/requirements.txt b/examples/commandr/requirements.txt index 22619c82a4..4e095dac67 100644 --- a/examples/commandr/requirements.txt +++ b/examples/commandr/requirements.txt @@ -1,5 +1,5 @@ -c ../constraints.txt tensorrt_llm>=0.0.0.dev0 -datasets==2.14.6 +datasets==3.1.0 evaluate rouge_score diff --git a/examples/draft_target_model/requirements.txt b/examples/draft_target_model/requirements.txt index 85068b0352..423ba2d4b0 100644 --- a/examples/draft_target_model/requirements.txt +++ b/examples/draft_target_model/requirements.txt @@ -1,6 +1,6 @@ -c ../constraints.txt tensorrt_llm>=0.0.0.dev0 -datasets~=2.14.5 +datasets==3.1.0 rouge_score sentencepiece>=0.1.99 evaluate diff --git a/examples/eagle/requirements.txt b/examples/eagle/requirements.txt index b3afd79a39..32bd8ce040 100644 --- a/examples/eagle/requirements.txt +++ b/examples/eagle/requirements.txt @@ -1,6 +1,6 @@ -c ../constraints.txt tensorrt_llm>=0.0.0.dev0 -datasets~=2.14.5 +datasets==3.1.0 rouge_score SentencePiece~=0.1.99 evaluate diff --git a/examples/gemma/requirements.txt b/examples/gemma/requirements.txt index 509be84961..4e1079c1e0 100644 --- a/examples/gemma/requirements.txt +++ b/examples/gemma/requirements.txt @@ -11,4 +11,4 @@ sentencepiece>=0.1.99 h5py~=3.12.1 rouge_score nltk -datasets==2.14.6 +datasets==3.1.0 diff --git a/examples/glm-4-9b/requirements.txt b/examples/glm-4-9b/requirements.txt index a3dfa6d35a..210f487888 100644 --- 
a/examples/glm-4-9b/requirements.txt +++ b/examples/glm-4-9b/requirements.txt @@ -1,6 +1,6 @@ -c ../constraints.txt tensorrt_llm>=0.0.0.dev0 -datasets~=2.14.5 +datasets==3.1.0 evaluate protobuf rouge_score diff --git a/examples/gpt/requirements.txt b/examples/gpt/requirements.txt index 35d78a450b..d464f84b03 100644 --- a/examples/gpt/requirements.txt +++ b/examples/gpt/requirements.txt @@ -1,6 +1,6 @@ -c ../constraints.txt tensorrt_llm>=0.0.0.dev0 -datasets~=2.14.5 +datasets==3.1.0 evaluate rouge_score SentencePiece>=0.1.99 diff --git a/examples/llama/requirements.txt b/examples/llama/requirements.txt index 2b909128f9..07d3dbfbd4 100644 --- a/examples/llama/requirements.txt +++ b/examples/llama/requirements.txt @@ -1,7 +1,7 @@ -c ../constraints.txt tensorrt_llm>=0.0.0.dev0 transformers>=4.43.0 -datasets==2.14.6 +datasets==3.1.0 evaluate rouge_score sentencepiece>=0.1.99 diff --git a/examples/llama/summarize_long.py b/examples/llama/summarize_long.py index c6189a662e..01b23f10ed 100644 --- a/examples/llama/summarize_long.py +++ b/examples/llama/summarize_long.py @@ -312,7 +312,8 @@ def main(args): tokenizer.pad_token = tokenizer.eos_token dataset_openweb = load_dataset("stas/openwebtext-10k", - cache_dir=args.dataset_path) + cache_dir=args.dataset_path, + trust_remote_code=True) long_texts = get_long_texts(dataset_openweb) # generator # get datapoints diff --git a/examples/lookahead/requirements.txt b/examples/lookahead/requirements.txt index 85068b0352..423ba2d4b0 100644 --- a/examples/lookahead/requirements.txt +++ b/examples/lookahead/requirements.txt @@ -1,6 +1,6 @@ -c ../constraints.txt tensorrt_llm>=0.0.0.dev0 -datasets~=2.14.5 +datasets==3.1.0 rouge_score sentencepiece>=0.1.99 evaluate diff --git a/examples/mamba/requirements.txt b/examples/mamba/requirements.txt index 4c69a932a1..d308333d45 100644 --- a/examples/mamba/requirements.txt +++ b/examples/mamba/requirements.txt @@ -1,7 +1,7 @@ -c ../constraints.txt tensorrt_llm>=0.0.0.dev0 transformers>=4.39.0 -datasets~=2.14.5 +datasets==3.1.0 evaluate rouge_score sentencepiece diff --git a/examples/medusa/requirements.txt b/examples/medusa/requirements.txt index 85068b0352..423ba2d4b0 100644 --- a/examples/medusa/requirements.txt +++ b/examples/medusa/requirements.txt @@ -1,6 +1,6 @@ -c ../constraints.txt tensorrt_llm>=0.0.0.dev0 -datasets~=2.14.5 +datasets==3.1.0 rouge_score sentencepiece>=0.1.99 evaluate diff --git a/examples/models/contrib/baichuan/requirements.txt b/examples/models/contrib/baichuan/requirements.txt index cdf475ba24..d234cb9d00 100644 --- a/examples/models/contrib/baichuan/requirements.txt +++ b/examples/models/contrib/baichuan/requirements.txt @@ -1,6 +1,6 @@ -c ../../../constraints.txt tensorrt_llm>=0.0.0.dev0 -datasets~=2.15.0 +datasets==3.1.0 evaluate rouge_score sentencepiece>=0.1.99 diff --git a/examples/models/contrib/bloom/requirements.txt b/examples/models/contrib/bloom/requirements.txt index d38eb00c4b..88232baef8 100644 --- a/examples/models/contrib/bloom/requirements.txt +++ b/examples/models/contrib/bloom/requirements.txt @@ -1,6 +1,6 @@ -c ../../../constraints.txt tensorrt_llm>=0.0.0.dev0 -datasets~=2.14.5 +datasets==3.1.0 evaluate rouge_score sentencepiece>=0.1.99 diff --git a/examples/models/contrib/chatglm-6b/requirements.txt b/examples/models/contrib/chatglm-6b/requirements.txt index 7254805798..cdc65bf2bb 100644 --- a/examples/models/contrib/chatglm-6b/requirements.txt +++ b/examples/models/contrib/chatglm-6b/requirements.txt @@ -1,6 +1,6 @@ -c ../../../constraints.txt tensorrt_llm>=0.0.0.dev0 
-datasets~=2.14.5 +datasets==3.1.0 evaluate protobuf rouge_score diff --git a/examples/models/contrib/chatglm2-6b/requirements.txt b/examples/models/contrib/chatglm2-6b/requirements.txt index 7254805798..cdc65bf2bb 100644 --- a/examples/models/contrib/chatglm2-6b/requirements.txt +++ b/examples/models/contrib/chatglm2-6b/requirements.txt @@ -1,6 +1,6 @@ -c ../../../constraints.txt tensorrt_llm>=0.0.0.dev0 -datasets~=2.14.5 +datasets==3.1.0 evaluate protobuf rouge_score diff --git a/examples/models/contrib/chatglm3-6b-32k/requirements.txt b/examples/models/contrib/chatglm3-6b-32k/requirements.txt index 7254805798..cdc65bf2bb 100644 --- a/examples/models/contrib/chatglm3-6b-32k/requirements.txt +++ b/examples/models/contrib/chatglm3-6b-32k/requirements.txt @@ -1,6 +1,6 @@ -c ../../../constraints.txt tensorrt_llm>=0.0.0.dev0 -datasets~=2.14.5 +datasets==3.1.0 evaluate protobuf rouge_score diff --git a/examples/models/contrib/dbrx/requirements.txt b/examples/models/contrib/dbrx/requirements.txt index 1bc3616a7f..0b8f8e5e0f 100644 --- a/examples/models/contrib/dbrx/requirements.txt +++ b/examples/models/contrib/dbrx/requirements.txt @@ -1,6 +1,6 @@ -c ../../../constraints.txt tensorrt_llm>=0.0.0.dev0 -datasets~=2.14.5 +datasets==3.1.0 evaluate rouge_score tiktoken==0.6.0 diff --git a/examples/models/contrib/deepseek_v1/requirements.txt b/examples/models/contrib/deepseek_v1/requirements.txt index 204a16d3ef..2f8713d865 100644 --- a/examples/models/contrib/deepseek_v1/requirements.txt +++ b/examples/models/contrib/deepseek_v1/requirements.txt @@ -1,5 +1,5 @@ -c ../../../constraints.txt tensorrt_llm>=0.0.0.dev0 -datasets~=2.14.6 +datasets==3.1.0 evaluate rouge_score diff --git a/examples/models/contrib/deepseek_v2/requirements.txt b/examples/models/contrib/deepseek_v2/requirements.txt index 6f83e50eb6..6be3961465 100644 --- a/examples/models/contrib/deepseek_v2/requirements.txt +++ b/examples/models/contrib/deepseek_v2/requirements.txt @@ -1,3 +1,3 @@ -datasets~=2.14.6 +datasets==3.1.0 evaluate~=0.4.1 rouge_score~=0.1.2 diff --git a/examples/models/contrib/falcon/requirements.txt b/examples/models/contrib/falcon/requirements.txt index 25e0cb29d0..c26675edee 100644 --- a/examples/models/contrib/falcon/requirements.txt +++ b/examples/models/contrib/falcon/requirements.txt @@ -1,7 +1,7 @@ -c ../../../constraints.txt tensorrt_llm>=0.0.0.dev0 transformers>=4.31.0 -datasets~=2.14.5 +datasets==3.1.0 evaluate rouge_score sentencepiece>=0.1.99 diff --git a/examples/models/contrib/gptj/requirements.txt b/examples/models/contrib/gptj/requirements.txt index 25274fb7fd..2f8713d865 100644 --- a/examples/models/contrib/gptj/requirements.txt +++ b/examples/models/contrib/gptj/requirements.txt @@ -1,5 +1,5 @@ -c ../../../constraints.txt tensorrt_llm>=0.0.0.dev0 -datasets~=2.14.5 +datasets==3.1.0 evaluate rouge_score diff --git a/examples/models/contrib/gptneox/requirements.txt b/examples/models/contrib/gptneox/requirements.txt index ca6bbc7231..cc77e27f78 100644 --- a/examples/models/contrib/gptneox/requirements.txt +++ b/examples/models/contrib/gptneox/requirements.txt @@ -1,5 +1,5 @@ -c ../../../constraints.txt tensorrt_llm>=0.0.0.dev0 -datasets~=2.14.5 +datasets==3.1.0 rouge_score evaluate diff --git a/examples/models/contrib/grok/requirements.txt b/examples/models/contrib/grok/requirements.txt index c84cde73cb..3ab319c0ef 100644 --- a/examples/models/contrib/grok/requirements.txt +++ b/examples/models/contrib/grok/requirements.txt @@ -1,7 +1,7 @@ -f 
https://storage.googleapis.com/jax-releases/jax_cuda_releases.html -c ../../../constraints.txt tensorrt_llm>=0.0.0.dev0 -datasets==2.14.6 +datasets==3.1.0 evaluate rouge_score sentencepiece==0.2.0 diff --git a/examples/models/contrib/internlm/requirements.txt b/examples/models/contrib/internlm/requirements.txt index 24292885f5..d9354a133c 100644 --- a/examples/models/contrib/internlm/requirements.txt +++ b/examples/models/contrib/internlm/requirements.txt @@ -1,6 +1,6 @@ -c ../../../constraints.txt tensorrt_llm>=0.0.0.dev0 -datasets==2.14.5 +datasets==3.1.0 rouge_score sentencepiece>=0.1.99 evaluate diff --git a/examples/models/contrib/jais/requirements.txt b/examples/models/contrib/jais/requirements.txt index 6097c25004..592e01e5ba 100644 --- a/examples/models/contrib/jais/requirements.txt +++ b/examples/models/contrib/jais/requirements.txt @@ -1,6 +1,6 @@ -c ../../../constraints.txt tensorrt_llm>=0.0.0.dev0 -datasets~=2.14.5 +datasets==3.1.0 evaluate rouge_score SentencePiece>=0.1.99 diff --git a/examples/models/contrib/mpt/requirements.txt b/examples/models/contrib/mpt/requirements.txt index 25274fb7fd..2f8713d865 100644 --- a/examples/models/contrib/mpt/requirements.txt +++ b/examples/models/contrib/mpt/requirements.txt @@ -1,5 +1,5 @@ -c ../../../constraints.txt tensorrt_llm>=0.0.0.dev0 -datasets~=2.14.5 +datasets==3.1.0 evaluate rouge_score diff --git a/examples/models/contrib/opt/requirements.txt b/examples/models/contrib/opt/requirements.txt index 25274fb7fd..2f8713d865 100644 --- a/examples/models/contrib/opt/requirements.txt +++ b/examples/models/contrib/opt/requirements.txt @@ -1,5 +1,5 @@ -c ../../../constraints.txt tensorrt_llm>=0.0.0.dev0 -datasets~=2.14.5 +datasets==3.1.0 evaluate rouge_score diff --git a/examples/models/contrib/skywork/requirements.txt b/examples/models/contrib/skywork/requirements.txt index 10274faa61..88232baef8 100644 --- a/examples/models/contrib/skywork/requirements.txt +++ b/examples/models/contrib/skywork/requirements.txt @@ -1,6 +1,6 @@ -c ../../../constraints.txt tensorrt_llm>=0.0.0.dev0 -datasets~=2.16.1 +datasets==3.1.0 evaluate rouge_score sentencepiece>=0.1.99 diff --git a/examples/models/contrib/smaug/requirements.txt b/examples/models/contrib/smaug/requirements.txt index 0e92afc46d..88232baef8 100644 --- a/examples/models/contrib/smaug/requirements.txt +++ b/examples/models/contrib/smaug/requirements.txt @@ -1,6 +1,6 @@ -c ../../../constraints.txt tensorrt_llm>=0.0.0.dev0 -datasets==2.14.6 +datasets==3.1.0 evaluate rouge_score sentencepiece>=0.1.99 diff --git a/examples/multimodal/eval.py b/examples/multimodal/eval.py index b91f9ff073..90c3a02837 100644 --- a/examples/multimodal/eval.py +++ b/examples/multimodal/eval.py @@ -108,6 +108,7 @@ def load_dataset(args) -> datasets.Dataset: 'timeout': aiohttp.ClientTimeout(total=3600) } }, + trust_remote_code=True, ) return dataset diff --git a/examples/nemotron/requirements.txt b/examples/nemotron/requirements.txt index c4f3de3511..074757de75 100644 --- a/examples/nemotron/requirements.txt +++ b/examples/nemotron/requirements.txt @@ -2,6 +2,6 @@ tensorrt_llm>=0.0.0.dev0 nemo-toolkit[all]==2.0.0rc1 megatron-core @ git+https://github.com/NVIDIA/Megatron-LM@core_r0.8.0 -datasets~=2.14.5 +datasets==3.1.0 evaluate rouge_score diff --git a/examples/nemotron_nas/calibration_utils.py b/examples/nemotron_nas/calibration_utils.py index 42b4382fa3..26c12820a6 100644 --- a/examples/nemotron_nas/calibration_utils.py +++ b/examples/nemotron_nas/calibration_utils.py @@ -20,7 +20,7 @@ def 
create_trtllm_magpie_calibration_dataset(output_dir: str, calib_size: int = 512) -> None: from datasets import load_dataset - dataset = load_dataset(DATASET, split="train") + dataset = load_dataset(DATASET, split="train", trust_remote_code=True) def transform(conversation): value = '\n'.join(turn['value'] diff --git a/examples/phi/requirements.txt b/examples/phi/requirements.txt index b2778023da..2621b03509 100644 --- a/examples/phi/requirements.txt +++ b/examples/phi/requirements.txt @@ -1,6 +1,6 @@ -c ../constraints.txt tensorrt_llm>=0.0.0.dev0 -datasets~=2.14.5 +datasets==3.1.0 evaluate rouge_score einops~=0.7.0 diff --git a/examples/prompt_lookup/requirements.txt b/examples/prompt_lookup/requirements.txt index 3fe0660a14..a1f618a758 100644 --- a/examples/prompt_lookup/requirements.txt +++ b/examples/prompt_lookup/requirements.txt @@ -1,6 +1,6 @@ -c ../constraints.txt tensorrt_llm>=0.0.0.dev0 -datasets~=2.14.5 +datasets==3.1.0 rouge_score sentencepiece~=0.1.99 evaluate diff --git a/examples/pytorch/quickstart_multimodal.py b/examples/pytorch/quickstart_multimodal.py index 26f93bc516..ca40b532b0 100644 --- a/examples/pytorch/quickstart_multimodal.py +++ b/examples/pytorch/quickstart_multimodal.py @@ -1,11 +1,12 @@ import argparse import json import os +from typing import Any, Dict, List from quickstart_advanced import add_llm_args, setup_llm -from transformers import AutoProcessor -from tensorrt_llm.inputs import load_image, load_video +from tensorrt_llm.inputs import (INPUT_FORMATTER_MAP, default_image_loader, + default_video_loader) example_images = [ "https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/seashore.png", @@ -27,92 +28,32 @@ example_video_prompts = [ ] -def prepare_vila(args, inputs): +def prepare_multimodal_inputs(model_dir: str, + model_type: str, + modality: str, + prompts: List[str], + media: List[str], + image_data_format: str = "pt", + num_frames: int = 8) -> List[Dict[str, Any]]: - def add_media_token(prompt, multi_modal_data): - mm_tokens = "" - if "image" in multi_modal_data: - for _ in multi_modal_data["image"]: - mm_tokens += "" - elif "video" in multi_modal_data: - for _ in multi_modal_data["video"]: - mm_tokens += "" - return mm_tokens + prompt + inputs = [] + if modality == "image": + inputs = default_image_loader(prompts, media, image_data_format) + elif modality == "video": + inputs = default_video_loader(prompts, media, image_data_format, + num_frames) + else: + raise ValueError(f"Unsupported modality: {modality}") + + inputs = INPUT_FORMATTER_MAP[model_type](model_dir, inputs) - for input in inputs: - input["prompt"] = add_media_token(input["prompt"], - input["multi_modal_data"]) return inputs -def prepare_llava_next(args, inputs): - processor = AutoProcessor.from_pretrained(args.model_dir) - - # Single-image inference chat template. For multi-image template, - # see https://huggingface.co/docs/transformers/en/model_doc/llava_next#multi-image-inference. 
- def apply_template(prompt, multimodal_data): - conversation = [ - { - "role": "user", - "content": [ - { - "type": "text", - "text": prompt - }, - { - "type": "image" - }, - ], - }, - ] - return processor.apply_chat_template( - conversation, - add_generation_prompt=True, - ) - - for input in inputs: - input["prompt"] = apply_template(input["prompt"], - input["multi_modal_data"]) - return inputs - - -def prepare_qwen2_vl(args, inputs): - processor = AutoProcessor.from_pretrained(args.model_dir) - - def apply_template(prompt, multimodal_data): - content = [{ - "type": media_type - } for media_type, items in multimodal_data.items() - for _ in items] + [{ - "type": "text", - "text": prompt - }] - - conversation = [{"role": "user", "content": content}] - return processor.apply_chat_template( - conversation, - tokenize=False, - add_generation_prompt=True, - ) - - for input in inputs: - input["prompt"] = apply_template(input["prompt"], - input["multi_modal_data"]) - return inputs - - -MODEL_TYPE_MAP = { - "llava_llama": prepare_vila, - "llava_next": prepare_llava_next, - "qwen2_vl": prepare_qwen2_vl, - "qwen2_5_vl": prepare_qwen2_vl, -} - - def add_multimodal_args(parser): parser.add_argument("--model_type", type=str, - choices=MODEL_TYPE_MAP.keys(), + choices=INPUT_FORMATTER_MAP.keys(), help="Model type.") parser.add_argument("--modality", type=str, @@ -150,50 +91,16 @@ def main(): llm, sampling_params = setup_llm(args) image_format = "pt" # ["pt", "pil"] - if args.modality == "image": - prompts = args.prompt if args.prompt else example_image_prompts - images = args.media if args.media else example_images - if len(images) > len(prompts) and len(prompts) == 1: - # 1 prompt + N media - images = [images] - inputs = [{ - "prompt": prompt, - "multi_modal_data": { - "image": [ - load_image(i, format=image_format, device="cuda") - for i in image - ] if isinstance(image, list) else - [load_image(image, format=image_format, device="cuda")] - } - } for prompt, image in zip(prompts, images)] - elif args.modality == "video": - prompts = args.prompt if args.prompt else example_video_prompts - videos = args.media if args.media else example_videos - if len(videos) > len(prompts) and len(prompts) == 1: - # 1 prompt + N media - videos = [videos] - inputs = [{ - "prompt": prompt, - "multi_modal_data": { - "video": [ - load_video( - i, args.num_frames, format=image_format, device="cuda") - for i in video - ] if isinstance(video, list) else [ - load_video(video, - args.num_frames, - format=image_format, - device="cuda") - ] - } - } for prompt, video in zip(prompts, videos)] + if args.model_type is not None: + model_type = args.model_type else: - raise ValueError(f"Unsupported modality: {args.modality}") + model_type = json.load( + open(os.path.join(llm._hf_model_dir, 'config.json')))['model_type'] + assert model_type in INPUT_FORMATTER_MAP, f"Unsupported model_type: {model_type}" - model_type = json.load(open(os.path.join(llm._hf_model_dir, - 'config.json')))['model_type'] - assert model_type in MODEL_TYPE_MAP, f"Unsupported model_type: {model_type}" - inputs = MODEL_TYPE_MAP[model_type](args, inputs) + inputs = prepare_multimodal_inputs(args.model_dir, model_type, + args.modality, args.prompt, args.media, + image_format, args.num_frames) outputs = llm.generate(inputs, sampling_params) diff --git a/examples/quantization/requirements.txt b/examples/quantization/requirements.txt index d25462fb9f..f14563d3a4 100644 --- a/examples/quantization/requirements.txt +++ b/examples/quantization/requirements.txt @@ -1,6 +1,6 
@@ -c ../constraints.txt tensorrt_llm>=0.0.0.dev0 -datasets>=2.14.4 +datasets==3.1.0 nemo-toolkit[all]==2.0.0rc1 rouge_score transformers_stream_generator==0.0.4 diff --git a/examples/qwen/requirements.txt b/examples/qwen/requirements.txt index 997d073f09..e53acc9577 100644 --- a/examples/qwen/requirements.txt +++ b/examples/qwen/requirements.txt @@ -1,6 +1,6 @@ -c ../constraints.txt tensorrt_llm>=0.0.0.dev0 -datasets~=2.16.0 +datasets==3.1.0 evaluate rouge_score transformers>=4.40.1 diff --git a/examples/qwen2audio/requirements.txt b/examples/qwen2audio/requirements.txt index 25ca280871..fbfbb970a6 100644 --- a/examples/qwen2audio/requirements.txt +++ b/examples/qwen2audio/requirements.txt @@ -1,6 +1,6 @@ -c ../constraints.txt tensorrt_llm>=0.0.dev0 -datasets~=2.16.0 +datasets==3.1.0 evaluate rouge_score transformers>=4.45.0 diff --git a/examples/qwenvl/requirements.txt b/examples/qwenvl/requirements.txt index 620ed8071c..8d19b00769 100644 --- a/examples/qwenvl/requirements.txt +++ b/examples/qwenvl/requirements.txt @@ -1,6 +1,6 @@ -c ../constraints.txt tensorrt_llm>=0.0.0.dev0 -datasets~=2.16.0 +datasets==3.1.0 evaluate rouge_score transformers-stream-generator diff --git a/examples/recurrentgemma/requirements.txt b/examples/recurrentgemma/requirements.txt index 4146f76f66..1cc58c7636 100644 --- a/examples/recurrentgemma/requirements.txt +++ b/examples/recurrentgemma/requirements.txt @@ -5,7 +5,7 @@ flax>=0.8.2 jax~=0.4.23 orbax-checkpoint==0.5.7 transformers>=4.40.0 -datasets~=2.14.5 +datasets==3.1.0 evaluate rouge_score sentencepiece diff --git a/examples/redrafter/requirements.txt b/examples/redrafter/requirements.txt index 85068b0352..423ba2d4b0 100644 --- a/examples/redrafter/requirements.txt +++ b/examples/redrafter/requirements.txt @@ -1,6 +1,6 @@ -c ../constraints.txt tensorrt_llm>=0.0.0.dev0 -datasets~=2.14.5 +datasets==3.1.0 rouge_score sentencepiece>=0.1.99 evaluate diff --git a/examples/summarize.py b/examples/summarize.py index d7d370d8e6..a584e74fea 100644 --- a/examples/summarize.py +++ b/examples/summarize.py @@ -110,7 +110,8 @@ def main(args): dataset = load_dataset(dataset_name, dataset_revision, cache_dir=args.dataset_cache_dir, - split=dataset_split) + split=dataset_split, + trust_remote_code=True) dataset = dataset.shuffle(args.random_seed) max_batch_size = args.batch_size diff --git a/examples/whisper/requirements.txt b/examples/whisper/requirements.txt index a9eefa968e..2af7da3fec 100644 --- a/examples/whisper/requirements.txt +++ b/examples/whisper/requirements.txt @@ -1,7 +1,7 @@ -c ../constraints.txt tensorrt_llm>=0.0.0.dev0 tiktoken -datasets +datasets==3.1.0 kaldialign openai-whisper librosa diff --git a/examples/whisper/run.py b/examples/whisper/run.py index 17bed30bbe..2e714c1d95 100755 --- a/examples/whisper/run.py +++ b/examples/whisper/run.py @@ -564,7 +564,8 @@ if __name__ == '__main__': normalizer = EnglishTextNormalizer() dataset = load_dataset(args.dataset, args.dataset_name, - split=args.dataset_split) + split=args.dataset_split, + trust_remote_code=True) if args.enable_warmup: results, total_duration = decode_dataset( model, diff --git a/requirements-dev.txt b/requirements-dev.txt index ab59dbbe36..ae71fcf14d 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,5 +1,4 @@ -r requirements.txt -datasets==2.19.2 einops graphviz mypy diff --git a/requirements.txt b/requirements.txt index 632314d0d2..b352fdd54c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -31,6 +31,8 @@ pydantic>=2.9.1 pillow==10.3.0 wheel<=0.45.1 optimum +# 
evaluate needs datasets>=2.0.0 which triggers datasets>3.1.0 which is not stable: https://github.com/huggingface/datasets/issues/7467 +datasets==3.1.0 evaluate mpmath>=1.3.0 click diff --git a/tensorrt_llm/_torch/models/modeling_qwen2vl.py b/tensorrt_llm/_torch/models/modeling_qwen2vl.py index ff4159b41f..e9b876cb8e 100644 --- a/tensorrt_llm/_torch/models/modeling_qwen2vl.py +++ b/tensorrt_llm/_torch/models/modeling_qwen2vl.py @@ -437,10 +437,7 @@ class Qwen2VLModelBase(PreTrainedModel): inputs_embeds=input_embeds, return_context_logits=return_context_logits, mrope_config=mrope_config) - logger.debug( - f"output_ids: {(output_prob if output_prob.dim() == 2 else output_prob.unsqueeze(0)).argmax(dim=1).tolist()}" - ) - logger.info(f'output shape: {output_prob.shape}') + logger.debug(f'output shape: {output_prob.shape}') return output_prob diff --git a/tensorrt_llm/bench/benchmark/throughput.py b/tensorrt_llm/bench/benchmark/throughput.py index 83e60eccdb..6c1d348f53 100755 --- a/tensorrt_llm/bench/benchmark/throughput.py +++ b/tensorrt_llm/bench/benchmark/throughput.py @@ -10,6 +10,7 @@ from click_option_group import (MutuallyExclusiveOptionGroup, OptionGroup, from tensorrt_llm.bench.benchmark.utils.asynchronous import async_benchmark from tensorrt_llm.bench.benchmark.utils.processes import IterationWriter +from tensorrt_llm.bench.build.build import get_model_config # isort: off from tensorrt_llm.bench.benchmark.utils.general import ( @@ -21,7 +22,8 @@ from tensorrt_llm.bench.dataclasses.configuration import RuntimeConfig from tensorrt_llm.bench.dataclasses.general import BenchmarkEnvironment from tensorrt_llm.bench.dataclasses.reporting import ReportUtility from tensorrt_llm.bench.utils.data import (create_dataset_from_stream, - initialize_tokenizer) + initialize_tokenizer, + update_metadata_for_multimodal) from tensorrt_llm.llmapi import LLM, CapacitySchedulerPolicy from tensorrt_llm.logger import logger from tensorrt_llm.sampling_params import SamplingParams @@ -92,12 +94,26 @@ from tensorrt_llm.sampling_params import SamplingParams required=False, help="Pass in a dataset file for parsing instead of stdin.", ) +@optgroup.option( + "--modality", + type=click.Choice(["image", "video"]), + default=None, + help="Modality of the multimodal requests.", +) +@optgroup.option( + "--max_input_len", + type=int, + default=4096, + help= + "Maximum input sequence length to use for multimodal models. This is used only when --modality " + "is specified since the actual number of vision tokens is unknown before the model is run.", +) @optgroup.option( "--num_requests", type=int, default=0, help= - "Number of requests to cap benchmark run at. If not specified or set to 0, it will be the" + "Number of requests to cap benchmark run at. 
If not specified or set to 0, it will be the " "length of dataset.", ) @optgroup.option( @@ -194,6 +210,9 @@ def throughput_command( engine_dir: Path = params.pop("engine_dir") concurrency: int = params.pop("concurrency") backend: str = params.get("backend") + modality: str = params.pop("modality") + max_input_len: int = params.pop("max_input_len") + model_type = get_model_config(model, checkpoint_path).model_type # Reporting options report_json: Path = params.pop("report_json") @@ -209,15 +228,24 @@ def throughput_command( # Dataset Loading and Preparation with open(dataset_path, "r") as dataset: metadata, requests = create_dataset_from_stream( - tokenizer, dataset, num_requests=num_requests) + tokenizer, + dataset, + num_requests=num_requests, + model_dir=checkpoint_path, + model_type=model_type, + modality=modality, + max_input_seq_len_for_multimodal=max_input_len) metadata.dataset_path = dataset_path params["target_input_len"] = params.get( "target_input_len") or metadata.avg_isl params["target_output_len"] = params.get( "target_output_len") or metadata.avg_osl - # Log dataset info - logger.info(metadata.get_summary_for_print()) + if modality is None: + # Log dataset info + # NOTE: This table is only accurate for non-multimodal models. + # The accurate table for multimodal models will be logged after the benchmark is done. + logger.info(metadata.get_summary_for_print()) # Engine configuration parsing if backend and backend.lower() in ["pytorch", "autodeploy"]: @@ -294,8 +322,12 @@ def throughput_command( warmup_dataset = generate_warmup_dataset(requests, warmup) logger.info("Running warmup.") asyncio.run( - async_benchmark(llm, sampling_params, warmup_dataset, False, - concurrency)) + async_benchmark(llm, + sampling_params, + warmup_dataset, + False, + concurrency, + modality=modality)) # WAR: IterationResult is a singleton tied to the executor. # Since the benchmark calls asyncio.run() multiple times (e.g., during warmup), # we must reset it to ensure it attaches to the correct event loop. @@ -304,10 +336,19 @@ def throughput_command( with iteration_writer.capture(): statistics = asyncio.run( - async_benchmark(llm, sampling_params, requests, streaming, - concurrency, iteration_writer.full_address)) + async_benchmark(llm, + sampling_params, + requests, + streaming, + concurrency, + iteration_writer.full_address, + modality=modality)) logger.info(f"Benchmark done. 
Reporting results...") + if modality is not None: + # For multimodal models, we need to update the metadata with the correct input lengths + metadata = update_metadata_for_multimodal(metadata, statistics) + report_utility = ReportUtility(statistics, metadata, runtime_config, logger, kwargs, streaming) if report_json: diff --git a/tensorrt_llm/bench/benchmark/utils/asynchronous.py b/tensorrt_llm/bench/benchmark/utils/asynchronous.py index 08ba1f2d70..d61241f5ee 100644 --- a/tensorrt_llm/bench/benchmark/utils/asynchronous.py +++ b/tensorrt_llm/bench/benchmark/utils/asynchronous.py @@ -23,7 +23,8 @@ class LlmManager: llm: LLM, outbox: asyncio.Queue[PerfItemTuple], streaming: bool, - concurrency: int = -1) -> None: + concurrency: int = -1, + modality: Optional[str] = None) -> None: self.llm = llm self._inbox: asyncio.Queue[Tuple[InferenceRequest, SamplingParams]] = asyncio.Queue() @@ -38,6 +39,7 @@ class LlmManager: concurrency) if concurrency > 0 else None self.streaming = streaming self.request_seen = asyncio.Event() + self.modality = modality async def process_request(self, request: InferenceRequest, sampling_params: SamplingParams): @@ -50,7 +52,7 @@ class LlmManager: time_on_first_token = None # Schedule the request in the LLM API (asynchronously) output: RequestOutput = self.llm.generate_async( - request.input_ids, + request.input_ids if self.modality is None else request.prompt, sampling_params=sampling_params, streaming=self.streaming) if self.streaming: @@ -70,7 +72,7 @@ class LlmManager: start_timestamp=request_start_timestamp, end_timestamp=response_end_timestamp, request_id=response.request_id, - num_input_tokens=len(request.input_ids), + num_input_tokens=len(output.prompt_token_ids), response_is_final=response.finished, error=False, tokens=tokens, @@ -201,6 +203,7 @@ async def async_benchmark( streaming: bool, concurrency: int = -1, iteration_log_addr: str = None, + modality: Optional[str] = None, ) -> StatsKeeper: outbox = asyncio.Queue() statistics = StatsKeeper() @@ -208,7 +211,11 @@ async def async_benchmark( try: logger.info("Starting benchmarking async task.") - backend = LlmManager(llm, outbox, streaming, concurrency=concurrency) + backend = LlmManager(llm, + outbox, + streaming, + concurrency=concurrency, + modality=modality) backend.run(iteration_addr=iteration_log_addr) enqueue_task = asyncio.create_task( diff --git a/tensorrt_llm/bench/build/dataclasses.py b/tensorrt_llm/bench/build/dataclasses.py index aa37fa0242..ae51bdeb36 100755 --- a/tensorrt_llm/bench/build/dataclasses.py +++ b/tensorrt_llm/bench/build/dataclasses.py @@ -116,6 +116,7 @@ class ModelConfig(BaseModel): setting calculation. 
""" name: str + model_type: str param_count: int num_hidden_layers: int = Field(validation_alias=AliasChoices( "num_hidden_layers", diff --git a/tensorrt_llm/bench/dataclasses/general.py b/tensorrt_llm/bench/dataclasses/general.py index 5c5fdf95c7..1a8277a415 100644 --- a/tensorrt_llm/bench/dataclasses/general.py +++ b/tensorrt_llm/bench/dataclasses/general.py @@ -1,7 +1,7 @@ from __future__ import annotations from pathlib import Path -from typing import List, Optional +from typing import Any, List, Optional, Union from pydantic import (AliasChoices, BaseModel, Field, computed_field, model_validator) @@ -17,7 +17,7 @@ class BenchmarkEnvironment(BaseModel): class InferenceRequest(BaseModel): task_id: int - prompt: Optional[str] = None + prompt: Optional[Union[str, Any]] = None output_tokens: int input_ids: Optional[List[int]] = Field( alias=AliasChoices("input_ids", "logits")) diff --git a/tensorrt_llm/bench/utils/data.py b/tensorrt_llm/bench/utils/data.py index 64a94ac668..5b9d29d4e2 100644 --- a/tensorrt_llm/bench/utils/data.py +++ b/tensorrt_llm/bench/utils/data.py @@ -1,12 +1,36 @@ import json from functools import partial -from typing import List, TextIO, Tuple +from typing import Any, Dict, List, TextIO, Tuple from transformers import AutoTokenizer, PreTrainedTokenizer from tensorrt_llm.bench.dataclasses.general import (DatasetMetadata, InferenceRequest) from tensorrt_llm.bench.dataclasses.statistics import PercentileStats +from tensorrt_llm.inputs import (INPUT_FORMATTER_MAP, default_image_loader, + default_video_loader) + + +def prepare_multimodal_inputs(model_dir: str, + model_type: str, + modality: str, + prompts: List[str], + media: List[str], + image_data_format: str = "pt", + num_frames: int = 8) -> List[Dict[str, Any]]: + + inputs = [] + if modality == "image": + inputs = default_image_loader(prompts, media, image_data_format) + elif modality == "video": + inputs = default_video_loader(prompts, media, image_data_format, + num_frames) + else: + raise ValueError(f"Unsupported modality: {modality}") + + inputs = INPUT_FORMATTER_MAP[model_type](model_dir, inputs) + + return inputs def initialize_tokenizer(model_name: str) -> PreTrainedTokenizer: @@ -36,6 +60,10 @@ def create_dataset_from_stream( max_input_length: int = 0, max_output_length: int = 0, num_requests: int = 0, + model_dir: str = None, + model_type: str = None, + modality: str = None, + max_input_seq_len_for_multimodal: int = 4096, ) -> Tuple[DatasetMetadata, List[InferenceRequest]]: """Generate metadata and a list of requests to drive benchmarking. @@ -83,13 +111,30 @@ def create_dataset_from_stream( # Each line should be a complete JSON dictionary with no indentation # or newline characters. data = json.loads(line) - logits = data.get("input_ids", data.get("logits", None)) - prompt = data.get("prompt", None) + if modality is not None: + # Multimodal data + assert modality in [ + "image", "video" + ], f"Modality must be one of ['image', 'video'] but got {modality}." + + prompt = data.get("prompt") # cannot be None + media_paths = data.get("media_paths", None) + inputs = prepare_multimodal_inputs( + model_dir, + model_type, + modality, + prompts=[prompt], + media=media_paths) # list of dicts + logits = None # cannot tokenize multi-modal data, handled by preprocessor + prompt = inputs[0] + else: + logits = data.get("input_ids", data.get("logits", None)) + prompt = data.get("prompt", None) + # If the request comes in with logits, just use the provided. + # Otherwise we need to tokenize it. 
+ logits = tokenize(prompt)["input_ids"] if logits is None else logits task_id = data["task_id"] osl = data["output_tokens"] - # If the request comes in with logits, just use the provided. - # Otherwise we need to tokenize it. - logits = tokenize(prompt)["input_ids"] if logits is None else logits request = InferenceRequest( task_id=task_id, @@ -97,9 +142,14 @@ def create_dataset_from_stream( output_tokens=output_limiter(osl), input_ids=logits, ) - all_isl.append(len(logits)) all_osl.append(osl) - all_seq_len.append(len(logits) + osl) + if modality is not None: + cur_isl = max_input_seq_len_for_multimodal # NOTE: actual sequence length is unknown until the model is run + all_isl.append(cur_isl) + all_seq_len.append(cur_isl + osl) + else: + all_isl.append(len(logits)) + all_seq_len.append(len(logits) + osl) dataset.append(request) isl_stats = PercentileStats.from_iterable(all_isl) @@ -115,3 +165,31 @@ def create_dataset_from_stream( ) return metadata, dataset + + +def update_metadata_for_multimodal(metadata, statistics) -> DatasetMetadata: + """Update the metadata from benchmark statistics. Only used for multimodal models. + + Args: + metadata (DatasetMetadata): The metadata to update. + statistics (StatsKeeper): The statistics to update the metadata with. + + Returns: + DatasetMetadata: The updated metadata. + """ + all_isl = [] + all_osl = [] + all_seq_len = [] + for request in statistics.requests.values(): + all_isl.append(request.num_input_tokens) + all_osl.append(request.num_total_output_tokens) + all_seq_len.append(request.num_input_tokens + + request.num_total_output_tokens) + isl_stats = PercentileStats.from_iterable(all_isl) + osl_stats = PercentileStats.from_iterable(all_osl) + seq_len_stats = PercentileStats.from_iterable(all_seq_len) + metadata.isl_stats = isl_stats + metadata.osl_stats = osl_stats + metadata.seq_len_stats = seq_len_stats + + return metadata diff --git a/tensorrt_llm/evaluate/cnn_dailymail.py b/tensorrt_llm/evaluate/cnn_dailymail.py index 494ba4da71..cb54358b92 100644 --- a/tensorrt_llm/evaluate/cnn_dailymail.py +++ b/tensorrt_llm/evaluate/cnn_dailymail.py @@ -36,7 +36,10 @@ class CnnDailymail(Evaluator): system_prompt: Optional[str] = None): super().__init__(apply_chat_template=apply_chat_template, system_prompt=system_prompt) - self.data = datasets.load_dataset(dataset_path, "3.0.0", split="test") + self.data = datasets.load_dataset(dataset_path, + "3.0.0", + split="test", + trust_remote_code=True) self.data = self.data.shuffle(random_seed) if num_samples is None: self.num_samples = self.data.num_rows diff --git a/tensorrt_llm/inputs/__init__.py b/tensorrt_llm/inputs/__init__.py index fd95bfd43f..fd9887ddb1 100644 --- a/tensorrt_llm/inputs/__init__.py +++ b/tensorrt_llm/inputs/__init__.py @@ -1,10 +1,15 @@ from .data import PromptInputs, TextPrompt, TokensPrompt, prompt_inputs from .registry import (ExtraProcessedInputs, InputProcessor, create_input_processor, register_input_processor) -from .utils import load_image, load_video +from .utils import (INPUT_FORMATTER_MAP, default_image_loader, + default_video_loader, format_llava_next_input, + format_qwen2_vl_input, format_vila_input, load_image, + load_video) __all__ = [ "PromptInputs", "prompt_inputs", "TextPrompt", "TokensPrompt", "InputProcessor", "create_input_processor", "register_input_processor", - "ExtraProcessedInputs", "load_image", "load_video" + "ExtraProcessedInputs", "load_image", "load_video", "INPUT_FORMATTER_MAP", + "default_image_loader", "default_video_loader", "format_vila_input", + 
"format_llava_next_input", "format_qwen2_vl_input" ] diff --git a/tensorrt_llm/inputs/utils.py b/tensorrt_llm/inputs/utils.py index a79b49fcc4..19f8a0d174 100644 --- a/tensorrt_llm/inputs/utils.py +++ b/tensorrt_llm/inputs/utils.py @@ -6,6 +6,7 @@ import requests import torch from PIL import Image from torchvision.transforms import ToTensor +from transformers import AutoProcessor def load_image(image: str, @@ -67,3 +68,158 @@ def load_video( device=device) if format == "pt" else frames[index] for index in indices if index in frames ] + + +""" +VLM input preparation. +""" + + +def format_vila_input(model_dir, inputs): + """ + This function formats the input for the VILA/NVILA VL model. + + Arguments: + model_dir: The directory of the model to load any preprocessor. + inputs: The list of inputs to format. + + Returns: + A list of dictionaries where "prompt" data is modified to a TextPrompt that combines text prompt and multimodal data. + """ + + def add_media_token(prompt, multi_modal_data): + mm_tokens = "" + if "image" in multi_modal_data: + for _ in multi_modal_data["image"]: + mm_tokens += "" + elif "video" in multi_modal_data: + for _ in multi_modal_data["video"]: + mm_tokens += "" + return mm_tokens + prompt + + for input in inputs: + input["prompt"] = add_media_token(input["prompt"], + input["multi_modal_data"]) + return inputs + + +def format_llava_next_input(model_dir, inputs): + """ + This function formats the input for the Llava Next VL model. + + Arguments: + model_dir: The directory of the model to load any preprocessor. + inputs: The list of inputs to format. + + Returns: + A list of dictionaries where "prompt" data is modified to a TextPrompt that combines text prompt and multimodal data. + """ + processor = AutoProcessor.from_pretrained(model_dir) + + # Single-image inference chat template. For multi-image template, + # see https://huggingface.co/docs/transformers/en/model_doc/llava_next#multi-image-inference. + def apply_template(prompt, multimodal_data): + conversation = [ + { + "role": "user", + "content": [ + { + "type": "text", + "text": prompt + }, + { + "type": "image" + }, + ], + }, + ] + return processor.apply_chat_template( + conversation, + add_generation_prompt=True, + ) + + for input in inputs: + input["prompt"] = apply_template(input["prompt"], + input["multi_modal_data"]) + return inputs + + +def format_qwen2_vl_input(model_dir, inputs): + """ + This function formats the input for the Qwen2/Qwen2.5 VL model. + + Arguments: + model_dir: The directory of the model to load any preprocessor. + inputs: The list of inputs to format. + + Returns: + A list of dictionaries where "prompt" data is modified to a TextPrompt that combines text prompt and multimodal data. 
+ """ + processor = AutoProcessor.from_pretrained(model_dir) + + def apply_template(prompt, multimodal_data): + content = [{ + "type": media_type + } for media_type, items in multimodal_data.items() + for _ in items] + [{ + "type": "text", + "text": prompt + }] + + conversation = [{"role": "user", "content": content}] + # print(conversation) + return processor.apply_chat_template( + conversation, + tokenize=False, + add_generation_prompt=True, + ) + + for input in inputs: + input["prompt"] = apply_template(input["prompt"], + input["multi_modal_data"]) + return inputs + + +def default_image_loader(prompts, images, image_data_format="pt"): + if len(images) > len(prompts) and len(prompts) == 1: + # 1 prompt + N media + images = [images] + inputs = [{ + "prompt": prompt, + "multi_modal_data": { + "image": [ + load_image(i, format=image_data_format, device="cuda") + for i in image + ] if isinstance(image, list) else + [load_image(image, format=image_data_format, device="cuda")] + } + } for prompt, image in zip(prompts, images)] + return inputs + + +def default_video_loader(prompts, videos, image_data_format="pt", num_frames=8): + if len(videos) > len(prompts) and len(prompts) == 1: + # 1 prompt + N media + videos = [videos] + inputs = [{ + "prompt": prompt, + "multi_modal_data": { + "video": [ + load_video( + i, num_frames, format=image_data_format, device="cuda") + for i in video + ] if isinstance(video, list) else [ + load_video( + video, num_frames, format=image_data_format, device="cuda") + ] + } + } for prompt, video in zip(prompts, videos)] + return inputs + + +INPUT_FORMATTER_MAP = { + "llava_llama": format_vila_input, + "llava_next": format_llava_next_input, + "qwen2_vl": format_qwen2_vl_input, + "qwen2_5_vl": format_qwen2_vl_input, +} diff --git a/tensorrt_llm/models/convert_utils.py b/tensorrt_llm/models/convert_utils.py index 310fd0ab35..4367fe085a 100644 --- a/tensorrt_llm/models/convert_utils.py +++ b/tensorrt_llm/models/convert_utils.py @@ -306,6 +306,7 @@ def load_calib_dataset(dataset_name_or_dir: str, dataset = load_dataset(dataset_name_or_dir, name=config_name, split=split, + trust_remote_code=trust_remote_code, **kwargs) return dataset[key] diff --git a/tensorrt_llm/quantization/quantize_by_modelopt.py b/tensorrt_llm/quantization/quantize_by_modelopt.py index 7b8e590fd4..01524fc0ff 100755 --- a/tensorrt_llm/quantization/quantize_by_modelopt.py +++ b/tensorrt_llm/quantization/quantize_by_modelopt.py @@ -384,20 +384,26 @@ def get_calib_dataloader(dataset_name_or_dir="cnn_dailymail", dataset = load_dataset( "json", data_files="https://the-eye.eu/public/AI/pile/val.jsonl.zst", - split="train") + split="train", + trust_remote_code=True) dataset = dataset["text"][:calib_size] elif "scienceqa" in dataset_name_or_dir.lower( ) or "science_qa" in dataset_name_or_dir.lower(): if os.path.isdir(dataset_name_or_dir): - dataset = load_dataset(dataset_name_or_dir, split="train") + dataset = load_dataset(dataset_name_or_dir, + split="train", + trust_remote_code=True) else: - dataset = load_dataset("derek-thomas/ScienceQA", split="train") + dataset = load_dataset("derek-thomas/ScienceQA", + split="train", + trust_remote_code=True) dataset = dataset.select(range(calib_size)) elif "cnn_dailymail" in dataset_name_or_dir: dataset = load_dataset( dataset_name_or_dir, name="3.0.0", split="train", + trust_remote_code=True, ) dataset = dataset["article"][:calib_size] elif os.path.isdir(dataset_name_or_dir): @@ -405,7 +411,9 @@ def get_calib_dataloader(dataset_name_or_dir="cnn_dailymail", f"Recognized 
local dataset repo {dataset_name_or_dir} for calibration; " "assuming the calibration data are in the train split and text column." ) - dataset = load_dataset(dataset_name_or_dir, split="train") + dataset = load_dataset(dataset_name_or_dir, + split="train", + trust_remote_code=True) dataset = dataset["text"][:calib_size] else: raise NotImplementedError( @@ -993,22 +1001,29 @@ def get_nemo_calib_dataloader(dataset_name_or_dir="cnn_dailymail", dataset = load_dataset( "json", data_files="https://the-eye.eu/public/AI/pile/val.jsonl.zst", - split="train") + split="train", + trust_remote_code=True) text_column = "text" elif "wikitext" in dataset_name_or_dir: dataset = load_dataset(dataset_name_or_dir, "wikitext-103-v1", - split="train") + split="train", + trust_remote_code=True) text_column = "text" elif "cnn_dailymail" in dataset_name_or_dir: - dataset = load_dataset(dataset_name_or_dir, name="3.0.0", split="train") + dataset = load_dataset(dataset_name_or_dir, + name="3.0.0", + split="train", + trust_remote_code=True) text_column = "article" elif os.path.isdir(dataset_name_or_dir): logger.info( f"Recognized local dataset repo {dataset_name_or_dir} for calibration; " "assuming the calibration data are in the train split and text column." ) - dataset = load_dataset(dataset_name_or_dir, split="train") + dataset = load_dataset(dataset_name_or_dir, + split="train", + trust_remote_code=True) text_column = "text" else: raise NotImplementedError( diff --git a/tests/unittest/llmapi/test_llm.py b/tests/unittest/llmapi/test_llm.py index 7fe3e5e1c9..f74a6e0074 100644 --- a/tests/unittest/llmapi/test_llm.py +++ b/tests/unittest/llmapi/test_llm.py @@ -366,8 +366,11 @@ def test_tokenizer_decode_incrementally(tokenizer_dir: str, threshold: float): num_samples = 100 cnn_dailymail = datasets.load_dataset(cnn_dailymail_path, name='3.0.0', - split='train') - alpaca_chinese = datasets.load_dataset(alpaca_chinese_path, split='train') + split='train', + trust_remote_code=True) + alpaca_chinese = datasets.load_dataset(alpaca_chinese_path, + split='train', + trust_remote_code=True) dataset = cnn_dailymail['article'][:num_samples // 2] + alpaca_chinese[ 'output_zh'][:num_samples // 2]