diff --git a/benchmarks/cpp/utils/prepare_real_data.py b/benchmarks/cpp/utils/prepare_real_data.py index 617f255b4f..e15ef9267e 100644 --- a/benchmarks/cpp/utils/prepare_real_data.py +++ b/benchmarks/cpp/utils/prepare_real_data.py @@ -1,12 +1,16 @@ import logging import random import re +import tempfile from typing import Optional import click from datasets import load_dataset +from PIL import Image from pydantic import BaseModel, model_validator -from utils.utils import dataset_dump, get_norm_dist_lengths, print_dataset +from utils.utils import (get_norm_dist_lengths, multimodal_dataset_dump, + print_multimodal_dataset, print_text_dataset, + text_dataset_dump) def validate_output_len_dist(ctx, param, value): @@ -31,8 +35,10 @@ class DatasetConfig(BaseModel): """Split of the dataset. Typical values: train, validation, test. Setting to None will include all splits.""" split: Optional[str] """The dataset dictionary used for the input sentence.""" - input_key: str + input_key: Optional[str] = None """The dataset dictionary key used for the prompt of the input sentence. Must not be set when prompt is set.""" + image_key: Optional[str] = None + """The dataset dictionary key used for the images.""" prompt_key: Optional[str] = None """The prompt sentence to be added to the input sentence. Must not be set when prompt_key is set.""" prompt: Optional[str] = None @@ -75,6 +81,20 @@ class DatasetConfig(BaseModel): f"{req.keys()}") return req[self.input_key] + def get_images(self, req): + """Get the images from the given request.""" + image_keys = [self.image_key + ] + [f"{self.image_key}_{i}" for i in range(1, 8)] + assert any(key in req for key in image_keys), ( + f"Dataset {self.name} does not have key '{self.image_key}'. " + "Please set --dataset-image-key to one of the available keys: " + f"{req.keys()}") + images = [] + for key in image_keys: + if key in req and req[key] is not None: + images.append(req[key]) + return images + def get_output(self, req): """Get the output sentence from the given request.""" if self.output_key is None: @@ -105,7 +125,8 @@ def load_dataset_from_hf(dataset_config: DatasetConfig): dataset = iter( load_dataset(*dataset_config.query, split=dataset_config.split, - streaming=True)) + streaming=True, + trust_remote_code=True)) except ValueError as e: if "Config" in e: e += "\n Please add the config name to the dataset config yaml." 
@@ -130,9 +151,12 @@ def load_dataset_from_hf(dataset_config: DatasetConfig): required=True, help=f"Split of the dataset to use.") @click.option("--dataset-input-key", - required=True, type=str, help=f"The dataset dictionary key for input.") +@click.option("--dataset-image-key", + type=str, + default="image", + help=f"The dataset dictionary key for images.") @click.option("--dataset-prompt-key", type=str, default=None, @@ -181,21 +205,54 @@ def dataset(root_args, **kwargs): output_lens = [] task_ids = [] req_cnt = 0 + modality = None + multimodal_texts = [] + multimodal_image_paths = [] for req in load_dataset_from_hf(dataset_config): - # input - prompt = dataset_config.get_prompt( - req) + ' ' + dataset_config.get_input(req) - logging.debug(f"Input sequence: {prompt}") - line = root_args.tokenizer.encode(prompt) - if kwargs['max_input_len'] and len(line) > kwargs['max_input_len']: - continue - input_ids.append(line) - input_lens.append(len(line)) + if any(key in req for key in ['image', 'image_1', 'video']): + # multimodal input + if 'video' in req and req['video'] is not None: + raise NotImplementedError("Video inputs are not supported yet.") + assert kwargs['output_len_dist'] is not None, ( + "Output length distribution must be set for multimodal requests." + ) + modality = 'image' + text = dataset_config.get_prompt(req) + images = dataset_config.get_images(req) + image_paths = [] + for image in images: + if image is not None: + if isinstance(image, str): + image_paths.append(image) + elif isinstance(image, Image.Image): + with tempfile.NamedTemporaryFile( + suffix=".jpg", delete=False) as tmp_file: + logging.debug(f"Saving image to {tmp_file.name}") + image = image.convert("RGB") + image.save(tmp_file, "JPEG") + filepath = tmp_file.name + image_paths.append(filepath) + else: + raise ValueError(f"Unsupported image type: {type(image)}") + multimodal_texts.append(text) + multimodal_image_paths.append(image_paths) + else: + # text input + prompt = dataset_config.get_prompt( + req) + ' ' + dataset_config.get_input(req) + logging.debug(f"Input sequence: {prompt}") + line = root_args.tokenizer.encode(prompt) + if kwargs['max_input_len'] and len(line) > kwargs['max_input_len']: + continue + input_ids.append(line) + input_lens.append(len(line)) - # output if fetch from golden - if kwargs['output_len_dist'] is None: - output_lens.append( - len(root_args.tokenizer.encode(dataset_config.get_output(req)))) + # output if fetch from golden + if kwargs['output_len_dist'] is None: + output_lens.append( + len( + root_args.tokenizer.encode( + dataset_config.get_output(req)))) # lora task id task_id = root_args.task_id @@ -208,30 +265,53 @@ def dataset(root_args, **kwargs): if kwargs['num_requests'] and req_cnt >= kwargs['num_requests']: break - if kwargs['num_requests'] and len(input_ids) < kwargs['num_requests']: + if kwargs['num_requests'] and (len(input_ids) if modality is None else len( + multimodal_texts)) < kwargs['num_requests']: logging.warning( - "Number of requests is smaller than the num-requests user set.") + f"Number of requests={len(input_ids) if modality is None else len(multimodal_texts)} is" + f" smaller than the num-requests user set={kwargs['num_requests']}." 
+ ) # output if randomized if kwargs['output_len_dist'] is not None: osl_mean, osl_stdev = kwargs['output_len_dist'] - output_lens = get_norm_dist_lengths(osl_mean, osl_stdev, len(input_ids), - root_args.random_seed) - + output_lens = get_norm_dist_lengths( + osl_mean, osl_stdev, + len(input_ids) if modality is None else len(multimodal_texts), + root_args.random_seed) logging.debug(f"Input lengths: {[len(i) for i in input_ids]}") logging.debug(f"Output lengths: {output_lens}") + if modality is not None: + logging.debug(f"Modality: {modality}") - if not root_args.std_out: - dataset_dump( - input_lens, input_ids, output_lens, task_ids, { - "workload_type": "dataset", - "tokenizer": root_args.tokenizer.__class__.__name__, - "num_requests": len(input_ids), - "max_input_len": max(input_lens), - "max_output_len": max(output_lens) - }, root_args.output) + if modality is not None: + if not root_args.std_out: + multimodal_dataset_dump( + multimodal_texts, multimodal_image_paths, output_lens, task_ids, + { + "workload_type": "dataset", + "tokenizer": root_args.tokenizer.__class__.__name__, + "num_requests": len(task_ids), + "max_output_len": max(output_lens) + }, root_args.output) + else: + print_multimodal_dataset( + multimodal_texts, + multimodal_image_paths, + output_lens, + ) else: - print_dataset( - input_ids, - output_lens, - ) + if not root_args.std_out: + text_dataset_dump( + input_lens, input_ids, output_lens, task_ids, { + "workload_type": "dataset", + "tokenizer": root_args.tokenizer.__class__.__name__, + "num_requests": len(input_ids), + "max_input_len": max(input_lens), + "max_output_len": max(output_lens) + }, root_args.output) + else: + print_text_dataset( + input_ids, + output_lens, + ) diff --git a/benchmarks/cpp/utils/prepare_synthetic_data.py b/benchmarks/cpp/utils/prepare_synthetic_data.py index 4af9c0c126..721b3649a2 100644 --- a/benchmarks/cpp/utils/prepare_synthetic_data.py +++ b/benchmarks/cpp/utils/prepare_synthetic_data.py @@ -1,8 +1,9 @@ import random import click -from utils.utils import (dataset_dump, gen_random_tokens, get_norm_dist_lengths, - get_unif_dist_lengths, print_dataset) +from utils.utils import (gen_random_tokens, get_norm_dist_lengths, + get_unif_dist_lengths, print_text_dataset, + text_dataset_dump) @click.command() @@ -57,7 +58,7 @@ def token_norm_dist(root_args, **kwargs): task_ids = [random.randint(min_id, max_id) for _ in range(num_reqs)] if not root_args.std_out: - dataset_dump( + text_dataset_dump( input_lens, input_ids, output_lens, task_ids, { "workload_type": "token-norm-dist", "input_mean": kwargs['input_mean'], @@ -70,7 +71,7 @@ def token_norm_dist(root_args, **kwargs): "max_output_len": max_output_len }, root_args.output) else: - print_dataset( + print_text_dataset( input_ids, output_lens, ) @@ -127,7 +128,7 @@ def token_unif_dist(root_args, **kwargs): task_ids = [random.randint(min_id, max_id) for _ in range(num_reqs)] if not root_args.std_out: - dataset_dump( + text_dataset_dump( input_lens, input_ids, output_lens, task_ids, { "workload_type": "token-unif-dist", "input_min": kwargs['input_min'], @@ -140,7 +141,7 @@ def token_unif_dist(root_args, **kwargs): "max_output_len": max_output_len }, root_args.output) else: - print_dataset( + print_text_dataset( input_ids, output_lens, ) diff --git a/benchmarks/cpp/utils/utils.py b/benchmarks/cpp/utils/utils.py index 9735315ba9..f0fcf4403d 100644 --- a/benchmarks/cpp/utils/utils.py +++ b/benchmarks/cpp/utils/utils.py @@ -2,22 +2,29 @@ import json import math import os import random -from typing import 
List +from typing import List, Union import numpy as np from pydantic import BaseModel -class Sample(BaseModel): +class TextSample(BaseModel): input_len: int input_ids: List[int] output_len: int task_id: int +class MultimodalSample(BaseModel): + task_id: int + prompt: str + media_paths: List[str] + output_len: int + + class Workload(BaseModel): metadata: dict - samples: List[Sample] = [] + samples: List[Union[TextSample, MultimodalSample]] = [] def __init__(self, **kwargs) -> None: super().__init__(**kwargs) @@ -33,22 +40,37 @@ class Workload(BaseModel): self.metadata.setdefault('workload_name', workload_name) -def dataset_dump(input_lens, input_ids, output_lens, task_ids, metadata, - output_file): +def text_dataset_dump(input_lens, input_ids, output_lens, task_ids, metadata, + output_file): samples = [] for i in range(len(input_ids)): samples.append( - Sample(input_len=input_lens[i], - input_ids=input_ids[i], - output_len=output_lens[i], - task_id=task_ids[i])) + TextSample(input_len=input_lens[i], + input_ids=input_ids[i], + output_len=output_lens[i], + task_id=task_ids[i])) workload = Workload(metadata=metadata, samples=samples) os.makedirs(os.path.dirname(output_file), exist_ok=True) with open(output_file, 'w') as f: json.dump(workload.model_dump(), f) -def print_dataset(input_ids, output_lens): +def multimodal_dataset_dump(multimodal_texts, multimodal_image_paths, + output_lens, task_ids, metadata, output_file): + samples = [] + for i in range(len(multimodal_texts)): + samples.append( + MultimodalSample(task_id=task_ids[i], + prompt=multimodal_texts[i], + media_paths=multimodal_image_paths[i], + output_len=output_lens[i])) + workload = Workload(metadata=metadata, samples=samples) + os.makedirs(os.path.dirname(output_file), exist_ok=True) + with open(output_file, 'w') as f: + json.dump(workload.model_dump(), f) + + +def print_text_dataset(input_ids, output_lens): for i, input_tokens in enumerate(input_ids): d = { "task_id": i, @@ -58,6 +80,19 @@ def print_dataset(input_ids, output_lens): print(json.dumps(d, separators=(',', ':'), ensure_ascii=False)) +def print_multimodal_dataset(multimodal_texts, multimodal_image_paths, + output_lens): + for i, (text, image_paths) in enumerate( + zip(multimodal_texts, multimodal_image_paths)): + d = { + "task_id": i, + "prompt": text, + "media_paths": image_paths, + "output_tokens": output_lens[i] + } + print(json.dumps(d, separators=(',', ':'), ensure_ascii=False)) + + def get_list_of_delays(delay_dist, mean_time_bet_reqs, num_reqs, random_seed): if delay_dist == "constant": delays = [mean_time_bet_reqs] * num_reqs diff --git a/docs/source/performance/perf-benchmarking.md b/docs/source/performance/perf-benchmarking.md index b2c0b26237..0e525c12c3 100644 --- a/docs/source/performance/perf-benchmarking.md +++ b/docs/source/performance/perf-benchmarking.md @@ -475,6 +475,115 @@ Total Latency (ms): 18563.6825 ``` +#### Running multi-modal models in the PyTorch Workflow + +To benchmark multi-modal models with the PyTorch workflow, you can follow a similar approach to the one above. + +First, prepare the dataset: +``` +python ./benchmarks/cpp/prepare_dataset.py \ + --tokenizer Qwen/Qwen2-VL-2B-Instruct \ + --stdout \ + dataset \ + --dataset-name lmms-lab/MMMU \ + --dataset-split test \ + --dataset-image-key image \ + --dataset-prompt-key question \ + --num-requests 10 \ + --output-len-dist 128,5 > mm_data.jsonl +``` +It will download the media files to the `/tmp` directory and prepare the dataset with their paths. 
Note that the `prompt` fields contain raw text rather than tokenized ids, because +the `prompt` and the media (image/video) are processed together by the model's multimodal preprocessor. + +Sample dataset for multimodal: +``` +{"task_id":0,"prompt":"Brahma Industries sells vinyl replacement windows to home improvement retailers nationwide. The national sales manager believes that if they invest an additional $25,000 in advertising, they would increase sales volume by 10,000 units. What is the total contribution margin?","media_paths":["/tmp/tmp9so41y3r.jpg"],"output_tokens":126} +{"task_id":1,"prompt":"Let us compute for the missing amounts under work in process inventory, what is the cost of goods manufactured? ","media_paths":["/tmp/tmpowsrb_f4.jpg"],"output_tokens":119} +{"task_id":2,"prompt":"Tsuji is reviewing the price of a 3-month Japanese yen/U.S. dollar currency futures contract, using the currency and interest rate data shown below. Because the 3-month Japanese interest rate has just increased to .50%, Itsuji recognizes that an arbitrage opportunity exists nd decides to borrow $1 million U.S. dollars to purchase Japanese yen. Calculate the yen arbitrage profit from Itsuji's strategy, using the following data: ","media_paths":["/tmp/tmpxhdvasex.jpg"],"output_tokens":126} +... +``` + +Run the benchmark: +``` +trtllm-bench --model Qwen/Qwen2-VL-2B-Instruct \ + throughput \ + --dataset mm_data.jsonl \ + --backend pytorch \ + --num_requests 10 \ + --max_batch_size 4 \ + --modality image +``` + + +Sample output: +``` +=========================================================== += REQUEST DETAILS +=========================================================== +Number of requests: 10 +Number of concurrent requests: 5.3019 +Average Input Length (tokens): 411.6000 +Average Output Length (tokens): 128.7000 +=========================================================== += WORLD + RUNTIME INFORMATION +=========================================================== +TP Size: 1 +PP Size: 1 +EP Size: None +Max Runtime Batch Size: 4 +Max Runtime Tokens: 12288 +Scheduling Policy: GUARANTEED_NO_EVICT +KV Memory Percentage: 90.00% +Issue Rate (req/sec): 1.4117E+17 + +=========================================================== += PERFORMANCE OVERVIEW +=========================================================== +Request Throughput (req/sec): 1.4439 +Total Output Throughput (tokens/sec): 185.8351 +Per User Output Throughput (tokens/sec/user): 38.1959 +Per GPU Output Throughput (tokens/sec/gpu): 185.8351 +Total Token Throughput (tokens/sec): 780.1607 +Total Latency (ms): 6925.4963 +Average request latency (ms): 3671.8441 + +-- Request Latency Breakdown (ms) ----------------------- + +[Latency] P50 : 3936.3022 +[Latency] P90 : 5514.4701 +[Latency] P95 : 5514.4701 +[Latency] P99 : 5514.4701 +[Latency] MINIMUM: 2397.1047 +[Latency] MAXIMUM: 5514.4701 +[Latency] AVERAGE: 3671.8441 + +=========================================================== += DATASET DETAILS +=========================================================== +Dataset Path: /workspaces/tensorrt_llm/mm_data.jsonl +Number of Sequences: 10 + +-- Percentiles statistics --------------------------------- + + Input Output Seq. 
Length +----------------------------------------------------------- +MIN: 167.0000 119.0000 300.0000 +MAX: 1059.0000 137.0000 1178.0000 +AVG: 411.6000 128.7000 540.3000 +P50: 299.0000 128.0000 427.0000 +P90: 1059.0000 137.0000 1178.0000 +P95: 1059.0000 137.0000 1178.0000 +P99: 1059.0000 137.0000 1178.0000 +=========================================================== +``` + +**Notes and Limitations**: +- Only image datasets are supported for now. +- `--output-len-dist` is a required argument for multimodal datasets. +- Tokenizer is unused during the prepare step but it is still a required argument. +- Since the images are converted to tokens when the model is run, `trtllm-bench` uses a default large value for the maximum input sequence length when setting up the execution settings. + You can also modify the behavior by specifying a different value with the flag `--max_input_len` that suits your use-case. + #### Quantization in the PyTorch Flow In order to run a quantized run with `trtllm-bench` utilizing the PyTorch flow, you will need to use a pre-quantized diff --git a/examples/bert/utils.py b/examples/bert/utils.py index 566101bfa8..105fd00c0b 100644 --- a/examples/bert/utils.py +++ b/examples/bert/utils.py @@ -17,7 +17,7 @@ def prepare_text_inputs(model_name, batch_size=8): f"HF_DATASETS_OFFLINE inside function: {datasets.config.HF_DATASETS_OFFLINE}" ) if model_name == "BertForQuestionAnswering" or model_name == "RobertaForQuestionAnswering": - squad_dataset = load_dataset("squad_v2") + squad_dataset = load_dataset("squad_v2", trust_remote_code=True) val_dataset = squad_dataset["validation"] samples = val_dataset.select(range(batch_size)) @@ -27,7 +27,8 @@ def prepare_text_inputs(model_name, batch_size=8): } return qa_real_test_inputs elif model_name == "BertForSequenceClassification" or model_name == "RobertaForSequenceClassification": - yelp_dataset = load_dataset("fancyzhx/yelp_polarity") + yelp_dataset = load_dataset("fancyzhx/yelp_polarity", + trust_remote_code=True) val_dataset = yelp_dataset["test"] samples = val_dataset.select(range(batch_size)) diff --git a/examples/commandr/requirements.txt b/examples/commandr/requirements.txt index 22619c82a4..4e095dac67 100644 --- a/examples/commandr/requirements.txt +++ b/examples/commandr/requirements.txt @@ -1,5 +1,5 @@ -c ../constraints.txt tensorrt_llm>=0.0.0.dev0 -datasets==2.14.6 +datasets==3.1.0 evaluate rouge_score diff --git a/examples/draft_target_model/requirements.txt b/examples/draft_target_model/requirements.txt index 85068b0352..423ba2d4b0 100644 --- a/examples/draft_target_model/requirements.txt +++ b/examples/draft_target_model/requirements.txt @@ -1,6 +1,6 @@ -c ../constraints.txt tensorrt_llm>=0.0.0.dev0 -datasets~=2.14.5 +datasets==3.1.0 rouge_score sentencepiece>=0.1.99 evaluate diff --git a/examples/eagle/requirements.txt b/examples/eagle/requirements.txt index b3afd79a39..32bd8ce040 100644 --- a/examples/eagle/requirements.txt +++ b/examples/eagle/requirements.txt @@ -1,6 +1,6 @@ -c ../constraints.txt tensorrt_llm>=0.0.0.dev0 -datasets~=2.14.5 +datasets==3.1.0 rouge_score SentencePiece~=0.1.99 evaluate diff --git a/examples/gemma/requirements.txt b/examples/gemma/requirements.txt index 509be84961..4e1079c1e0 100644 --- a/examples/gemma/requirements.txt +++ b/examples/gemma/requirements.txt @@ -11,4 +11,4 @@ sentencepiece>=0.1.99 h5py~=3.12.1 rouge_score nltk -datasets==2.14.6 +datasets==3.1.0 diff --git a/examples/glm-4-9b/requirements.txt b/examples/glm-4-9b/requirements.txt index a3dfa6d35a..210f487888 100644 --- 
a/examples/glm-4-9b/requirements.txt +++ b/examples/glm-4-9b/requirements.txt @@ -1,6 +1,6 @@ -c ../constraints.txt tensorrt_llm>=0.0.0.dev0 -datasets~=2.14.5 +datasets==3.1.0 evaluate protobuf rouge_score diff --git a/examples/gpt/requirements.txt b/examples/gpt/requirements.txt index 35d78a450b..d464f84b03 100644 --- a/examples/gpt/requirements.txt +++ b/examples/gpt/requirements.txt @@ -1,6 +1,6 @@ -c ../constraints.txt tensorrt_llm>=0.0.0.dev0 -datasets~=2.14.5 +datasets==3.1.0 evaluate rouge_score SentencePiece>=0.1.99 diff --git a/examples/llama/requirements.txt b/examples/llama/requirements.txt index 2b909128f9..07d3dbfbd4 100644 --- a/examples/llama/requirements.txt +++ b/examples/llama/requirements.txt @@ -1,7 +1,7 @@ -c ../constraints.txt tensorrt_llm>=0.0.0.dev0 transformers>=4.43.0 -datasets==2.14.6 +datasets==3.1.0 evaluate rouge_score sentencepiece>=0.1.99 diff --git a/examples/llama/summarize_long.py b/examples/llama/summarize_long.py index c6189a662e..01b23f10ed 100644 --- a/examples/llama/summarize_long.py +++ b/examples/llama/summarize_long.py @@ -312,7 +312,8 @@ def main(args): tokenizer.pad_token = tokenizer.eos_token dataset_openweb = load_dataset("stas/openwebtext-10k", - cache_dir=args.dataset_path) + cache_dir=args.dataset_path, + trust_remote_code=True) long_texts = get_long_texts(dataset_openweb) # generator # get datapoints diff --git a/examples/lookahead/requirements.txt b/examples/lookahead/requirements.txt index 85068b0352..423ba2d4b0 100644 --- a/examples/lookahead/requirements.txt +++ b/examples/lookahead/requirements.txt @@ -1,6 +1,6 @@ -c ../constraints.txt tensorrt_llm>=0.0.0.dev0 -datasets~=2.14.5 +datasets==3.1.0 rouge_score sentencepiece>=0.1.99 evaluate diff --git a/examples/mamba/requirements.txt b/examples/mamba/requirements.txt index 4c69a932a1..d308333d45 100644 --- a/examples/mamba/requirements.txt +++ b/examples/mamba/requirements.txt @@ -1,7 +1,7 @@ -c ../constraints.txt tensorrt_llm>=0.0.0.dev0 transformers>=4.39.0 -datasets~=2.14.5 +datasets==3.1.0 evaluate rouge_score sentencepiece diff --git a/examples/medusa/requirements.txt b/examples/medusa/requirements.txt index 85068b0352..423ba2d4b0 100644 --- a/examples/medusa/requirements.txt +++ b/examples/medusa/requirements.txt @@ -1,6 +1,6 @@ -c ../constraints.txt tensorrt_llm>=0.0.0.dev0 -datasets~=2.14.5 +datasets==3.1.0 rouge_score sentencepiece>=0.1.99 evaluate diff --git a/examples/models/contrib/baichuan/requirements.txt b/examples/models/contrib/baichuan/requirements.txt index cdf475ba24..d234cb9d00 100644 --- a/examples/models/contrib/baichuan/requirements.txt +++ b/examples/models/contrib/baichuan/requirements.txt @@ -1,6 +1,6 @@ -c ../../../constraints.txt tensorrt_llm>=0.0.0.dev0 -datasets~=2.15.0 +datasets==3.1.0 evaluate rouge_score sentencepiece>=0.1.99 diff --git a/examples/models/contrib/bloom/requirements.txt b/examples/models/contrib/bloom/requirements.txt index d38eb00c4b..88232baef8 100644 --- a/examples/models/contrib/bloom/requirements.txt +++ b/examples/models/contrib/bloom/requirements.txt @@ -1,6 +1,6 @@ -c ../../../constraints.txt tensorrt_llm>=0.0.0.dev0 -datasets~=2.14.5 +datasets==3.1.0 evaluate rouge_score sentencepiece>=0.1.99 diff --git a/examples/models/contrib/chatglm-6b/requirements.txt b/examples/models/contrib/chatglm-6b/requirements.txt index 7254805798..cdc65bf2bb 100644 --- a/examples/models/contrib/chatglm-6b/requirements.txt +++ b/examples/models/contrib/chatglm-6b/requirements.txt @@ -1,6 +1,6 @@ -c ../../../constraints.txt tensorrt_llm>=0.0.0.dev0 
-datasets~=2.14.5 +datasets==3.1.0 evaluate protobuf rouge_score diff --git a/examples/models/contrib/chatglm2-6b/requirements.txt b/examples/models/contrib/chatglm2-6b/requirements.txt index 7254805798..cdc65bf2bb 100644 --- a/examples/models/contrib/chatglm2-6b/requirements.txt +++ b/examples/models/contrib/chatglm2-6b/requirements.txt @@ -1,6 +1,6 @@ -c ../../../constraints.txt tensorrt_llm>=0.0.0.dev0 -datasets~=2.14.5 +datasets==3.1.0 evaluate protobuf rouge_score diff --git a/examples/models/contrib/chatglm3-6b-32k/requirements.txt b/examples/models/contrib/chatglm3-6b-32k/requirements.txt index 7254805798..cdc65bf2bb 100644 --- a/examples/models/contrib/chatglm3-6b-32k/requirements.txt +++ b/examples/models/contrib/chatglm3-6b-32k/requirements.txt @@ -1,6 +1,6 @@ -c ../../../constraints.txt tensorrt_llm>=0.0.0.dev0 -datasets~=2.14.5 +datasets==3.1.0 evaluate protobuf rouge_score diff --git a/examples/models/contrib/dbrx/requirements.txt b/examples/models/contrib/dbrx/requirements.txt index 1bc3616a7f..0b8f8e5e0f 100644 --- a/examples/models/contrib/dbrx/requirements.txt +++ b/examples/models/contrib/dbrx/requirements.txt @@ -1,6 +1,6 @@ -c ../../../constraints.txt tensorrt_llm>=0.0.0.dev0 -datasets~=2.14.5 +datasets==3.1.0 evaluate rouge_score tiktoken==0.6.0 diff --git a/examples/models/contrib/deepseek_v1/requirements.txt b/examples/models/contrib/deepseek_v1/requirements.txt index 204a16d3ef..2f8713d865 100644 --- a/examples/models/contrib/deepseek_v1/requirements.txt +++ b/examples/models/contrib/deepseek_v1/requirements.txt @@ -1,5 +1,5 @@ -c ../../../constraints.txt tensorrt_llm>=0.0.0.dev0 -datasets~=2.14.6 +datasets==3.1.0 evaluate rouge_score diff --git a/examples/models/contrib/deepseek_v2/requirements.txt b/examples/models/contrib/deepseek_v2/requirements.txt index 6f83e50eb6..6be3961465 100644 --- a/examples/models/contrib/deepseek_v2/requirements.txt +++ b/examples/models/contrib/deepseek_v2/requirements.txt @@ -1,3 +1,3 @@ -datasets~=2.14.6 +datasets==3.1.0 evaluate~=0.4.1 rouge_score~=0.1.2 diff --git a/examples/models/contrib/falcon/requirements.txt b/examples/models/contrib/falcon/requirements.txt index 25e0cb29d0..c26675edee 100644 --- a/examples/models/contrib/falcon/requirements.txt +++ b/examples/models/contrib/falcon/requirements.txt @@ -1,7 +1,7 @@ -c ../../../constraints.txt tensorrt_llm>=0.0.0.dev0 transformers>=4.31.0 -datasets~=2.14.5 +datasets==3.1.0 evaluate rouge_score sentencepiece>=0.1.99 diff --git a/examples/models/contrib/gptj/requirements.txt b/examples/models/contrib/gptj/requirements.txt index 25274fb7fd..2f8713d865 100644 --- a/examples/models/contrib/gptj/requirements.txt +++ b/examples/models/contrib/gptj/requirements.txt @@ -1,5 +1,5 @@ -c ../../../constraints.txt tensorrt_llm>=0.0.0.dev0 -datasets~=2.14.5 +datasets==3.1.0 evaluate rouge_score diff --git a/examples/models/contrib/gptneox/requirements.txt b/examples/models/contrib/gptneox/requirements.txt index ca6bbc7231..cc77e27f78 100644 --- a/examples/models/contrib/gptneox/requirements.txt +++ b/examples/models/contrib/gptneox/requirements.txt @@ -1,5 +1,5 @@ -c ../../../constraints.txt tensorrt_llm>=0.0.0.dev0 -datasets~=2.14.5 +datasets==3.1.0 rouge_score evaluate diff --git a/examples/models/contrib/grok/requirements.txt b/examples/models/contrib/grok/requirements.txt index c84cde73cb..3ab319c0ef 100644 --- a/examples/models/contrib/grok/requirements.txt +++ b/examples/models/contrib/grok/requirements.txt @@ -1,7 +1,7 @@ -f 
https://storage.googleapis.com/jax-releases/jax_cuda_releases.html -c ../../../constraints.txt tensorrt_llm>=0.0.0.dev0 -datasets==2.14.6 +datasets==3.1.0 evaluate rouge_score sentencepiece==0.2.0 diff --git a/examples/models/contrib/internlm/requirements.txt b/examples/models/contrib/internlm/requirements.txt index 24292885f5..d9354a133c 100644 --- a/examples/models/contrib/internlm/requirements.txt +++ b/examples/models/contrib/internlm/requirements.txt @@ -1,6 +1,6 @@ -c ../../../constraints.txt tensorrt_llm>=0.0.0.dev0 -datasets==2.14.5 +datasets==3.1.0 rouge_score sentencepiece>=0.1.99 evaluate diff --git a/examples/models/contrib/jais/requirements.txt b/examples/models/contrib/jais/requirements.txt index 6097c25004..592e01e5ba 100644 --- a/examples/models/contrib/jais/requirements.txt +++ b/examples/models/contrib/jais/requirements.txt @@ -1,6 +1,6 @@ -c ../../../constraints.txt tensorrt_llm>=0.0.0.dev0 -datasets~=2.14.5 +datasets==3.1.0 evaluate rouge_score SentencePiece>=0.1.99 diff --git a/examples/models/contrib/mpt/requirements.txt b/examples/models/contrib/mpt/requirements.txt index 25274fb7fd..2f8713d865 100644 --- a/examples/models/contrib/mpt/requirements.txt +++ b/examples/models/contrib/mpt/requirements.txt @@ -1,5 +1,5 @@ -c ../../../constraints.txt tensorrt_llm>=0.0.0.dev0 -datasets~=2.14.5 +datasets==3.1.0 evaluate rouge_score diff --git a/examples/models/contrib/opt/requirements.txt b/examples/models/contrib/opt/requirements.txt index 25274fb7fd..2f8713d865 100644 --- a/examples/models/contrib/opt/requirements.txt +++ b/examples/models/contrib/opt/requirements.txt @@ -1,5 +1,5 @@ -c ../../../constraints.txt tensorrt_llm>=0.0.0.dev0 -datasets~=2.14.5 +datasets==3.1.0 evaluate rouge_score diff --git a/examples/models/contrib/skywork/requirements.txt b/examples/models/contrib/skywork/requirements.txt index 10274faa61..88232baef8 100644 --- a/examples/models/contrib/skywork/requirements.txt +++ b/examples/models/contrib/skywork/requirements.txt @@ -1,6 +1,6 @@ -c ../../../constraints.txt tensorrt_llm>=0.0.0.dev0 -datasets~=2.16.1 +datasets==3.1.0 evaluate rouge_score sentencepiece>=0.1.99 diff --git a/examples/models/contrib/smaug/requirements.txt b/examples/models/contrib/smaug/requirements.txt index 0e92afc46d..88232baef8 100644 --- a/examples/models/contrib/smaug/requirements.txt +++ b/examples/models/contrib/smaug/requirements.txt @@ -1,6 +1,6 @@ -c ../../../constraints.txt tensorrt_llm>=0.0.0.dev0 -datasets==2.14.6 +datasets==3.1.0 evaluate rouge_score sentencepiece>=0.1.99 diff --git a/examples/multimodal/eval.py b/examples/multimodal/eval.py index b91f9ff073..90c3a02837 100644 --- a/examples/multimodal/eval.py +++ b/examples/multimodal/eval.py @@ -108,6 +108,7 @@ def load_dataset(args) -> datasets.Dataset: 'timeout': aiohttp.ClientTimeout(total=3600) } }, + trust_remote_code=True, ) return dataset diff --git a/examples/nemotron/requirements.txt b/examples/nemotron/requirements.txt index c4f3de3511..074757de75 100644 --- a/examples/nemotron/requirements.txt +++ b/examples/nemotron/requirements.txt @@ -2,6 +2,6 @@ tensorrt_llm>=0.0.0.dev0 nemo-toolkit[all]==2.0.0rc1 megatron-core @ git+https://github.com/NVIDIA/Megatron-LM@core_r0.8.0 -datasets~=2.14.5 +datasets==3.1.0 evaluate rouge_score diff --git a/examples/nemotron_nas/calibration_utils.py b/examples/nemotron_nas/calibration_utils.py index 42b4382fa3..26c12820a6 100644 --- a/examples/nemotron_nas/calibration_utils.py +++ b/examples/nemotron_nas/calibration_utils.py @@ -20,7 +20,7 @@ def 
create_trtllm_magpie_calibration_dataset(output_dir: str, calib_size: int = 512) -> None: from datasets import load_dataset - dataset = load_dataset(DATASET, split="train") + dataset = load_dataset(DATASET, split="train", trust_remote_code=True) def transform(conversation): value = '\n'.join(turn['value'] diff --git a/examples/phi/requirements.txt b/examples/phi/requirements.txt index b2778023da..2621b03509 100644 --- a/examples/phi/requirements.txt +++ b/examples/phi/requirements.txt @@ -1,6 +1,6 @@ -c ../constraints.txt tensorrt_llm>=0.0.0.dev0 -datasets~=2.14.5 +datasets==3.1.0 evaluate rouge_score einops~=0.7.0 diff --git a/examples/prompt_lookup/requirements.txt b/examples/prompt_lookup/requirements.txt index 3fe0660a14..a1f618a758 100644 --- a/examples/prompt_lookup/requirements.txt +++ b/examples/prompt_lookup/requirements.txt @@ -1,6 +1,6 @@ -c ../constraints.txt tensorrt_llm>=0.0.0.dev0 -datasets~=2.14.5 +datasets==3.1.0 rouge_score sentencepiece~=0.1.99 evaluate diff --git a/examples/pytorch/quickstart_multimodal.py b/examples/pytorch/quickstart_multimodal.py index 26f93bc516..ca40b532b0 100644 --- a/examples/pytorch/quickstart_multimodal.py +++ b/examples/pytorch/quickstart_multimodal.py @@ -1,11 +1,12 @@ import argparse import json import os +from typing import Any, Dict, List from quickstart_advanced import add_llm_args, setup_llm -from transformers import AutoProcessor -from tensorrt_llm.inputs import load_image, load_video +from tensorrt_llm.inputs import (INPUT_FORMATTER_MAP, default_image_loader, + default_video_loader) example_images = [ "https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/seashore.png", @@ -27,92 +28,32 @@ example_video_prompts = [ ] -def prepare_vila(args, inputs): +def prepare_multimodal_inputs(model_dir: str, + model_type: str, + modality: str, + prompts: List[str], + media: List[str], + image_data_format: str = "pt", + num_frames: int = 8) -> List[Dict[str, Any]]: - def add_media_token(prompt, multi_modal_data): - mm_tokens = "" - if "image" in multi_modal_data: - for _ in multi_modal_data["image"]: - mm_tokens += "" - elif "video" in multi_modal_data: - for _ in multi_modal_data["video"]: - mm_tokens += "" - return mm_tokens + prompt + inputs = [] + if modality == "image": + inputs = default_image_loader(prompts, media, image_data_format) + elif modality == "video": + inputs = default_video_loader(prompts, media, image_data_format, + num_frames) + else: + raise ValueError(f"Unsupported modality: {modality}") + + inputs = INPUT_FORMATTER_MAP[model_type](model_dir, inputs) - for input in inputs: - input["prompt"] = add_media_token(input["prompt"], - input["multi_modal_data"]) return inputs -def prepare_llava_next(args, inputs): - processor = AutoProcessor.from_pretrained(args.model_dir) - - # Single-image inference chat template. For multi-image template, - # see https://huggingface.co/docs/transformers/en/model_doc/llava_next#multi-image-inference. 
- def apply_template(prompt, multimodal_data): - conversation = [ - { - "role": "user", - "content": [ - { - "type": "text", - "text": prompt - }, - { - "type": "image" - }, - ], - }, - ] - return processor.apply_chat_template( - conversation, - add_generation_prompt=True, - ) - - for input in inputs: - input["prompt"] = apply_template(input["prompt"], - input["multi_modal_data"]) - return inputs - - -def prepare_qwen2_vl(args, inputs): - processor = AutoProcessor.from_pretrained(args.model_dir) - - def apply_template(prompt, multimodal_data): - content = [{ - "type": media_type - } for media_type, items in multimodal_data.items() - for _ in items] + [{ - "type": "text", - "text": prompt - }] - - conversation = [{"role": "user", "content": content}] - return processor.apply_chat_template( - conversation, - tokenize=False, - add_generation_prompt=True, - ) - - for input in inputs: - input["prompt"] = apply_template(input["prompt"], - input["multi_modal_data"]) - return inputs - - -MODEL_TYPE_MAP = { - "llava_llama": prepare_vila, - "llava_next": prepare_llava_next, - "qwen2_vl": prepare_qwen2_vl, - "qwen2_5_vl": prepare_qwen2_vl, -} - - def add_multimodal_args(parser): parser.add_argument("--model_type", type=str, - choices=MODEL_TYPE_MAP.keys(), + choices=INPUT_FORMATTER_MAP.keys(), help="Model type.") parser.add_argument("--modality", type=str, @@ -150,50 +91,16 @@ def main(): llm, sampling_params = setup_llm(args) image_format = "pt" # ["pt", "pil"] - if args.modality == "image": - prompts = args.prompt if args.prompt else example_image_prompts - images = args.media if args.media else example_images - if len(images) > len(prompts) and len(prompts) == 1: - # 1 prompt + N media - images = [images] - inputs = [{ - "prompt": prompt, - "multi_modal_data": { - "image": [ - load_image(i, format=image_format, device="cuda") - for i in image - ] if isinstance(image, list) else - [load_image(image, format=image_format, device="cuda")] - } - } for prompt, image in zip(prompts, images)] - elif args.modality == "video": - prompts = args.prompt if args.prompt else example_video_prompts - videos = args.media if args.media else example_videos - if len(videos) > len(prompts) and len(prompts) == 1: - # 1 prompt + N media - videos = [videos] - inputs = [{ - "prompt": prompt, - "multi_modal_data": { - "video": [ - load_video( - i, args.num_frames, format=image_format, device="cuda") - for i in video - ] if isinstance(video, list) else [ - load_video(video, - args.num_frames, - format=image_format, - device="cuda") - ] - } - } for prompt, video in zip(prompts, videos)] + if args.model_type is not None: + model_type = args.model_type else: - raise ValueError(f"Unsupported modality: {args.modality}") + model_type = json.load( + open(os.path.join(llm._hf_model_dir, 'config.json')))['model_type'] + assert model_type in INPUT_FORMATTER_MAP, f"Unsupported model_type: {model_type}" - model_type = json.load(open(os.path.join(llm._hf_model_dir, - 'config.json')))['model_type'] - assert model_type in MODEL_TYPE_MAP, f"Unsupported model_type: {model_type}" - inputs = MODEL_TYPE_MAP[model_type](args, inputs) + inputs = prepare_multimodal_inputs(args.model_dir, model_type, + args.modality, args.prompt, args.media, + image_format, args.num_frames) outputs = llm.generate(inputs, sampling_params) diff --git a/examples/quantization/requirements.txt b/examples/quantization/requirements.txt index d25462fb9f..f14563d3a4 100644 --- a/examples/quantization/requirements.txt +++ b/examples/quantization/requirements.txt @@ -1,6 +1,6 
@@ -c ../constraints.txt tensorrt_llm>=0.0.0.dev0 -datasets>=2.14.4 +datasets==3.1.0 nemo-toolkit[all]==2.0.0rc1 rouge_score transformers_stream_generator==0.0.4 diff --git a/examples/qwen/requirements.txt b/examples/qwen/requirements.txt index 997d073f09..e53acc9577 100644 --- a/examples/qwen/requirements.txt +++ b/examples/qwen/requirements.txt @@ -1,6 +1,6 @@ -c ../constraints.txt tensorrt_llm>=0.0.0.dev0 -datasets~=2.16.0 +datasets==3.1.0 evaluate rouge_score transformers>=4.40.1 diff --git a/examples/qwen2audio/requirements.txt b/examples/qwen2audio/requirements.txt index 25ca280871..fbfbb970a6 100644 --- a/examples/qwen2audio/requirements.txt +++ b/examples/qwen2audio/requirements.txt @@ -1,6 +1,6 @@ -c ../constraints.txt tensorrt_llm>=0.0.dev0 -datasets~=2.16.0 +datasets==3.1.0 evaluate rouge_score transformers>=4.45.0 diff --git a/examples/qwenvl/requirements.txt b/examples/qwenvl/requirements.txt index 620ed8071c..8d19b00769 100644 --- a/examples/qwenvl/requirements.txt +++ b/examples/qwenvl/requirements.txt @@ -1,6 +1,6 @@ -c ../constraints.txt tensorrt_llm>=0.0.0.dev0 -datasets~=2.16.0 +datasets==3.1.0 evaluate rouge_score transformers-stream-generator diff --git a/examples/recurrentgemma/requirements.txt b/examples/recurrentgemma/requirements.txt index 4146f76f66..1cc58c7636 100644 --- a/examples/recurrentgemma/requirements.txt +++ b/examples/recurrentgemma/requirements.txt @@ -5,7 +5,7 @@ flax>=0.8.2 jax~=0.4.23 orbax-checkpoint==0.5.7 transformers>=4.40.0 -datasets~=2.14.5 +datasets==3.1.0 evaluate rouge_score sentencepiece diff --git a/examples/redrafter/requirements.txt b/examples/redrafter/requirements.txt index 85068b0352..423ba2d4b0 100644 --- a/examples/redrafter/requirements.txt +++ b/examples/redrafter/requirements.txt @@ -1,6 +1,6 @@ -c ../constraints.txt tensorrt_llm>=0.0.0.dev0 -datasets~=2.14.5 +datasets==3.1.0 rouge_score sentencepiece>=0.1.99 evaluate diff --git a/examples/summarize.py b/examples/summarize.py index d7d370d8e6..a584e74fea 100644 --- a/examples/summarize.py +++ b/examples/summarize.py @@ -110,7 +110,8 @@ def main(args): dataset = load_dataset(dataset_name, dataset_revision, cache_dir=args.dataset_cache_dir, - split=dataset_split) + split=dataset_split, + trust_remote_code=True) dataset = dataset.shuffle(args.random_seed) max_batch_size = args.batch_size diff --git a/examples/whisper/requirements.txt b/examples/whisper/requirements.txt index a9eefa968e..2af7da3fec 100644 --- a/examples/whisper/requirements.txt +++ b/examples/whisper/requirements.txt @@ -1,7 +1,7 @@ -c ../constraints.txt tensorrt_llm>=0.0.0.dev0 tiktoken -datasets +datasets==3.1.0 kaldialign openai-whisper librosa diff --git a/examples/whisper/run.py b/examples/whisper/run.py index 17bed30bbe..2e714c1d95 100755 --- a/examples/whisper/run.py +++ b/examples/whisper/run.py @@ -564,7 +564,8 @@ if __name__ == '__main__': normalizer = EnglishTextNormalizer() dataset = load_dataset(args.dataset, args.dataset_name, - split=args.dataset_split) + split=args.dataset_split, + trust_remote_code=True) if args.enable_warmup: results, total_duration = decode_dataset( model, diff --git a/requirements-dev.txt b/requirements-dev.txt index ab59dbbe36..ae71fcf14d 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,5 +1,4 @@ -r requirements.txt -datasets==2.19.2 einops graphviz mypy diff --git a/requirements.txt b/requirements.txt index 632314d0d2..b352fdd54c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -31,6 +31,8 @@ pydantic>=2.9.1 pillow==10.3.0 wheel<=0.45.1 optimum +# 
evaluate needs datasets>=2.0.0 which triggers datasets>3.1.0 which is not stable: https://github.com/huggingface/datasets/issues/7467 +datasets==3.1.0 evaluate mpmath>=1.3.0 click diff --git a/tensorrt_llm/_torch/models/modeling_qwen2vl.py b/tensorrt_llm/_torch/models/modeling_qwen2vl.py index ff4159b41f..e9b876cb8e 100644 --- a/tensorrt_llm/_torch/models/modeling_qwen2vl.py +++ b/tensorrt_llm/_torch/models/modeling_qwen2vl.py @@ -437,10 +437,7 @@ class Qwen2VLModelBase(PreTrainedModel): inputs_embeds=input_embeds, return_context_logits=return_context_logits, mrope_config=mrope_config) - logger.debug( - f"output_ids: {(output_prob if output_prob.dim() == 2 else output_prob.unsqueeze(0)).argmax(dim=1).tolist()}" - ) - logger.info(f'output shape: {output_prob.shape}') + logger.debug(f'output shape: {output_prob.shape}') return output_prob diff --git a/tensorrt_llm/bench/benchmark/throughput.py b/tensorrt_llm/bench/benchmark/throughput.py index 83e60eccdb..6c1d348f53 100755 --- a/tensorrt_llm/bench/benchmark/throughput.py +++ b/tensorrt_llm/bench/benchmark/throughput.py @@ -10,6 +10,7 @@ from click_option_group import (MutuallyExclusiveOptionGroup, OptionGroup, from tensorrt_llm.bench.benchmark.utils.asynchronous import async_benchmark from tensorrt_llm.bench.benchmark.utils.processes import IterationWriter +from tensorrt_llm.bench.build.build import get_model_config # isort: off from tensorrt_llm.bench.benchmark.utils.general import ( @@ -21,7 +22,8 @@ from tensorrt_llm.bench.dataclasses.configuration import RuntimeConfig from tensorrt_llm.bench.dataclasses.general import BenchmarkEnvironment from tensorrt_llm.bench.dataclasses.reporting import ReportUtility from tensorrt_llm.bench.utils.data import (create_dataset_from_stream, - initialize_tokenizer) + initialize_tokenizer, + update_metadata_for_multimodal) from tensorrt_llm.llmapi import LLM, CapacitySchedulerPolicy from tensorrt_llm.logger import logger from tensorrt_llm.sampling_params import SamplingParams @@ -92,12 +94,26 @@ from tensorrt_llm.sampling_params import SamplingParams required=False, help="Pass in a dataset file for parsing instead of stdin.", ) +@optgroup.option( + "--modality", + type=click.Choice(["image", "video"]), + default=None, + help="Modality of the multimodal requests.", +) +@optgroup.option( + "--max_input_len", + type=int, + default=4096, + help= + "Maximum input sequence length to use for multimodal models. This is used only when --modality " + "is specified since the actual number of vision tokens is unknown before the model is run.", +) @optgroup.option( "--num_requests", type=int, default=0, help= - "Number of requests to cap benchmark run at. If not specified or set to 0, it will be the" + "Number of requests to cap benchmark run at. 
If not specified or set to 0, it will be the " "length of dataset.", ) @optgroup.option( @@ -194,6 +210,9 @@ def throughput_command( engine_dir: Path = params.pop("engine_dir") concurrency: int = params.pop("concurrency") backend: str = params.get("backend") + modality: str = params.pop("modality") + max_input_len: int = params.pop("max_input_len") + model_type = get_model_config(model, checkpoint_path).model_type # Reporting options report_json: Path = params.pop("report_json") @@ -209,15 +228,24 @@ def throughput_command( # Dataset Loading and Preparation with open(dataset_path, "r") as dataset: metadata, requests = create_dataset_from_stream( - tokenizer, dataset, num_requests=num_requests) + tokenizer, + dataset, + num_requests=num_requests, + model_dir=checkpoint_path, + model_type=model_type, + modality=modality, + max_input_seq_len_for_multimodal=max_input_len) metadata.dataset_path = dataset_path params["target_input_len"] = params.get( "target_input_len") or metadata.avg_isl params["target_output_len"] = params.get( "target_output_len") or metadata.avg_osl - # Log dataset info - logger.info(metadata.get_summary_for_print()) + if modality is None: + # Log dataset info + # NOTE: This table is only accurate for non-multimodal models. + # The accurate table for multimodal models will be logged after the benchmark is done. + logger.info(metadata.get_summary_for_print()) # Engine configuration parsing if backend and backend.lower() in ["pytorch", "autodeploy"]: @@ -294,8 +322,12 @@ def throughput_command( warmup_dataset = generate_warmup_dataset(requests, warmup) logger.info("Running warmup.") asyncio.run( - async_benchmark(llm, sampling_params, warmup_dataset, False, - concurrency)) + async_benchmark(llm, + sampling_params, + warmup_dataset, + False, + concurrency, + modality=modality)) # WAR: IterationResult is a singleton tied to the executor. # Since the benchmark calls asyncio.run() multiple times (e.g., during warmup), # we must reset it to ensure it attaches to the correct event loop. @@ -304,10 +336,19 @@ def throughput_command( with iteration_writer.capture(): statistics = asyncio.run( - async_benchmark(llm, sampling_params, requests, streaming, - concurrency, iteration_writer.full_address)) + async_benchmark(llm, + sampling_params, + requests, + streaming, + concurrency, + iteration_writer.full_address, + modality=modality)) logger.info(f"Benchmark done. 
Reporting results...") + if modality is not None: + # For multimodal models, we need to update the metadata with the correct input lengths + metadata = update_metadata_for_multimodal(metadata, statistics) + report_utility = ReportUtility(statistics, metadata, runtime_config, logger, kwargs, streaming) if report_json: diff --git a/tensorrt_llm/bench/benchmark/utils/asynchronous.py b/tensorrt_llm/bench/benchmark/utils/asynchronous.py index 08ba1f2d70..d61241f5ee 100644 --- a/tensorrt_llm/bench/benchmark/utils/asynchronous.py +++ b/tensorrt_llm/bench/benchmark/utils/asynchronous.py @@ -23,7 +23,8 @@ class LlmManager: llm: LLM, outbox: asyncio.Queue[PerfItemTuple], streaming: bool, - concurrency: int = -1) -> None: + concurrency: int = -1, + modality: Optional[str] = None) -> None: self.llm = llm self._inbox: asyncio.Queue[Tuple[InferenceRequest, SamplingParams]] = asyncio.Queue() @@ -38,6 +39,7 @@ class LlmManager: concurrency) if concurrency > 0 else None self.streaming = streaming self.request_seen = asyncio.Event() + self.modality = modality async def process_request(self, request: InferenceRequest, sampling_params: SamplingParams): @@ -50,7 +52,7 @@ class LlmManager: time_on_first_token = None # Schedule the request in the LLM API (asynchronously) output: RequestOutput = self.llm.generate_async( - request.input_ids, + request.input_ids if self.modality is None else request.prompt, sampling_params=sampling_params, streaming=self.streaming) if self.streaming: @@ -70,7 +72,7 @@ class LlmManager: start_timestamp=request_start_timestamp, end_timestamp=response_end_timestamp, request_id=response.request_id, - num_input_tokens=len(request.input_ids), + num_input_tokens=len(output.prompt_token_ids), response_is_final=response.finished, error=False, tokens=tokens, @@ -201,6 +203,7 @@ async def async_benchmark( streaming: bool, concurrency: int = -1, iteration_log_addr: str = None, + modality: Optional[str] = None, ) -> StatsKeeper: outbox = asyncio.Queue() statistics = StatsKeeper() @@ -208,7 +211,11 @@ async def async_benchmark( try: logger.info("Starting benchmarking async task.") - backend = LlmManager(llm, outbox, streaming, concurrency=concurrency) + backend = LlmManager(llm, + outbox, + streaming, + concurrency=concurrency, + modality=modality) backend.run(iteration_addr=iteration_log_addr) enqueue_task = asyncio.create_task( diff --git a/tensorrt_llm/bench/build/dataclasses.py b/tensorrt_llm/bench/build/dataclasses.py index aa37fa0242..ae51bdeb36 100755 --- a/tensorrt_llm/bench/build/dataclasses.py +++ b/tensorrt_llm/bench/build/dataclasses.py @@ -116,6 +116,7 @@ class ModelConfig(BaseModel): setting calculation. 
""" name: str + model_type: str param_count: int num_hidden_layers: int = Field(validation_alias=AliasChoices( "num_hidden_layers", diff --git a/tensorrt_llm/bench/dataclasses/general.py b/tensorrt_llm/bench/dataclasses/general.py index 5c5fdf95c7..1a8277a415 100644 --- a/tensorrt_llm/bench/dataclasses/general.py +++ b/tensorrt_llm/bench/dataclasses/general.py @@ -1,7 +1,7 @@ from __future__ import annotations from pathlib import Path -from typing import List, Optional +from typing import Any, List, Optional, Union from pydantic import (AliasChoices, BaseModel, Field, computed_field, model_validator) @@ -17,7 +17,7 @@ class BenchmarkEnvironment(BaseModel): class InferenceRequest(BaseModel): task_id: int - prompt: Optional[str] = None + prompt: Optional[Union[str, Any]] = None output_tokens: int input_ids: Optional[List[int]] = Field( alias=AliasChoices("input_ids", "logits")) diff --git a/tensorrt_llm/bench/utils/data.py b/tensorrt_llm/bench/utils/data.py index 64a94ac668..5b9d29d4e2 100644 --- a/tensorrt_llm/bench/utils/data.py +++ b/tensorrt_llm/bench/utils/data.py @@ -1,12 +1,36 @@ import json from functools import partial -from typing import List, TextIO, Tuple +from typing import Any, Dict, List, TextIO, Tuple from transformers import AutoTokenizer, PreTrainedTokenizer from tensorrt_llm.bench.dataclasses.general import (DatasetMetadata, InferenceRequest) from tensorrt_llm.bench.dataclasses.statistics import PercentileStats +from tensorrt_llm.inputs import (INPUT_FORMATTER_MAP, default_image_loader, + default_video_loader) + + +def prepare_multimodal_inputs(model_dir: str, + model_type: str, + modality: str, + prompts: List[str], + media: List[str], + image_data_format: str = "pt", + num_frames: int = 8) -> List[Dict[str, Any]]: + + inputs = [] + if modality == "image": + inputs = default_image_loader(prompts, media, image_data_format) + elif modality == "video": + inputs = default_video_loader(prompts, media, image_data_format, + num_frames) + else: + raise ValueError(f"Unsupported modality: {modality}") + + inputs = INPUT_FORMATTER_MAP[model_type](model_dir, inputs) + + return inputs def initialize_tokenizer(model_name: str) -> PreTrainedTokenizer: @@ -36,6 +60,10 @@ def create_dataset_from_stream( max_input_length: int = 0, max_output_length: int = 0, num_requests: int = 0, + model_dir: str = None, + model_type: str = None, + modality: str = None, + max_input_seq_len_for_multimodal: int = 4096, ) -> Tuple[DatasetMetadata, List[InferenceRequest]]: """Generate metadata and a list of requests to drive benchmarking. @@ -83,13 +111,30 @@ def create_dataset_from_stream( # Each line should be a complete JSON dictionary with no indentation # or newline characters. data = json.loads(line) - logits = data.get("input_ids", data.get("logits", None)) - prompt = data.get("prompt", None) + if modality is not None: + # Multimodal data + assert modality in [ + "image", "video" + ], f"Modality must be one of ['image', 'video'] but got {modality}." + + prompt = data.get("prompt") # cannot be None + media_paths = data.get("media_paths", None) + inputs = prepare_multimodal_inputs( + model_dir, + model_type, + modality, + prompts=[prompt], + media=media_paths) # list of dicts + logits = None # cannot tokenize multi-modal data, handled by preprocessor + prompt = inputs[0] + else: + logits = data.get("input_ids", data.get("logits", None)) + prompt = data.get("prompt", None) + # If the request comes in with logits, just use the provided. + # Otherwise we need to tokenize it. 
+ logits = tokenize(prompt)["input_ids"] if logits is None else logits task_id = data["task_id"] osl = data["output_tokens"] - # If the request comes in with logits, just use the provided. - # Otherwise we need to tokenize it. - logits = tokenize(prompt)["input_ids"] if logits is None else logits request = InferenceRequest( task_id=task_id, @@ -97,9 +142,14 @@ def create_dataset_from_stream( output_tokens=output_limiter(osl), input_ids=logits, ) - all_isl.append(len(logits)) all_osl.append(osl) - all_seq_len.append(len(logits) + osl) + if modality is not None: + cur_isl = max_input_seq_len_for_multimodal # NOTE: actual sequence length is unknown until the model is run + all_isl.append(cur_isl) + all_seq_len.append(cur_isl + osl) + else: + all_isl.append(len(logits)) + all_seq_len.append(len(logits) + osl) dataset.append(request) isl_stats = PercentileStats.from_iterable(all_isl) @@ -115,3 +165,31 @@ def create_dataset_from_stream( ) return metadata, dataset + + +def update_metadata_for_multimodal(metadata, statistics) -> DatasetMetadata: + """Update the metadata from benchmark statistics. Only used for multimodal models. + + Args: + metadata (DatasetMetadata): The metadata to update. + statistics (StatsKeeper): The statistics to update the metadata with. + + Returns: + DatasetMetadata: The updated metadata. + """ + all_isl = [] + all_osl = [] + all_seq_len = [] + for request in statistics.requests.values(): + all_isl.append(request.num_input_tokens) + all_osl.append(request.num_total_output_tokens) + all_seq_len.append(request.num_input_tokens + + request.num_total_output_tokens) + isl_stats = PercentileStats.from_iterable(all_isl) + osl_stats = PercentileStats.from_iterable(all_osl) + seq_len_stats = PercentileStats.from_iterable(all_seq_len) + metadata.isl_stats = isl_stats + metadata.osl_stats = osl_stats + metadata.seq_len_stats = seq_len_stats + + return metadata diff --git a/tensorrt_llm/evaluate/cnn_dailymail.py b/tensorrt_llm/evaluate/cnn_dailymail.py index 494ba4da71..cb54358b92 100644 --- a/tensorrt_llm/evaluate/cnn_dailymail.py +++ b/tensorrt_llm/evaluate/cnn_dailymail.py @@ -36,7 +36,10 @@ class CnnDailymail(Evaluator): system_prompt: Optional[str] = None): super().__init__(apply_chat_template=apply_chat_template, system_prompt=system_prompt) - self.data = datasets.load_dataset(dataset_path, "3.0.0", split="test") + self.data = datasets.load_dataset(dataset_path, + "3.0.0", + split="test", + trust_remote_code=True) self.data = self.data.shuffle(random_seed) if num_samples is None: self.num_samples = self.data.num_rows diff --git a/tensorrt_llm/inputs/__init__.py b/tensorrt_llm/inputs/__init__.py index fd95bfd43f..fd9887ddb1 100644 --- a/tensorrt_llm/inputs/__init__.py +++ b/tensorrt_llm/inputs/__init__.py @@ -1,10 +1,15 @@ from .data import PromptInputs, TextPrompt, TokensPrompt, prompt_inputs from .registry import (ExtraProcessedInputs, InputProcessor, create_input_processor, register_input_processor) -from .utils import load_image, load_video +from .utils import (INPUT_FORMATTER_MAP, default_image_loader, + default_video_loader, format_llava_next_input, + format_qwen2_vl_input, format_vila_input, load_image, + load_video) __all__ = [ "PromptInputs", "prompt_inputs", "TextPrompt", "TokensPrompt", "InputProcessor", "create_input_processor", "register_input_processor", - "ExtraProcessedInputs", "load_image", "load_video" + "ExtraProcessedInputs", "load_image", "load_video", "INPUT_FORMATTER_MAP", + "default_image_loader", "default_video_loader", "format_vila_input", + 
"format_llava_next_input", "format_qwen2_vl_input" ] diff --git a/tensorrt_llm/inputs/utils.py b/tensorrt_llm/inputs/utils.py index a79b49fcc4..19f8a0d174 100644 --- a/tensorrt_llm/inputs/utils.py +++ b/tensorrt_llm/inputs/utils.py @@ -6,6 +6,7 @@ import requests import torch from PIL import Image from torchvision.transforms import ToTensor +from transformers import AutoProcessor def load_image(image: str, @@ -67,3 +68,158 @@ def load_video( device=device) if format == "pt" else frames[index] for index in indices if index in frames ] + + +""" +VLM input preparation. +""" + + +def format_vila_input(model_dir, inputs): + """ + This function formats the input for the VILA/NVILA VL model. + + Arguments: + model_dir: The directory of the model to load any preprocessor. + inputs: The list of inputs to format. + + Returns: + A list of dictionaries where "prompt" data is modified to a TextPrompt that combines text prompt and multimodal data. + """ + + def add_media_token(prompt, multi_modal_data): + mm_tokens = "" + if "image" in multi_modal_data: + for _ in multi_modal_data["image"]: + mm_tokens += "" + elif "video" in multi_modal_data: + for _ in multi_modal_data["video"]: + mm_tokens += "" + return mm_tokens + prompt + + for input in inputs: + input["prompt"] = add_media_token(input["prompt"], + input["multi_modal_data"]) + return inputs + + +def format_llava_next_input(model_dir, inputs): + """ + This function formats the input for the Llava Next VL model. + + Arguments: + model_dir: The directory of the model to load any preprocessor. + inputs: The list of inputs to format. + + Returns: + A list of dictionaries where "prompt" data is modified to a TextPrompt that combines text prompt and multimodal data. + """ + processor = AutoProcessor.from_pretrained(model_dir) + + # Single-image inference chat template. For multi-image template, + # see https://huggingface.co/docs/transformers/en/model_doc/llava_next#multi-image-inference. + def apply_template(prompt, multimodal_data): + conversation = [ + { + "role": "user", + "content": [ + { + "type": "text", + "text": prompt + }, + { + "type": "image" + }, + ], + }, + ] + return processor.apply_chat_template( + conversation, + add_generation_prompt=True, + ) + + for input in inputs: + input["prompt"] = apply_template(input["prompt"], + input["multi_modal_data"]) + return inputs + + +def format_qwen2_vl_input(model_dir, inputs): + """ + This function formats the input for the Qwen2/Qwen2.5 VL model. + + Arguments: + model_dir: The directory of the model to load any preprocessor. + inputs: The list of inputs to format. + + Returns: + A list of dictionaries where "prompt" data is modified to a TextPrompt that combines text prompt and multimodal data. 
+ """ + processor = AutoProcessor.from_pretrained(model_dir) + + def apply_template(prompt, multimodal_data): + content = [{ + "type": media_type + } for media_type, items in multimodal_data.items() + for _ in items] + [{ + "type": "text", + "text": prompt + }] + + conversation = [{"role": "user", "content": content}] + # print(conversation) + return processor.apply_chat_template( + conversation, + tokenize=False, + add_generation_prompt=True, + ) + + for input in inputs: + input["prompt"] = apply_template(input["prompt"], + input["multi_modal_data"]) + return inputs + + +def default_image_loader(prompts, images, image_data_format="pt"): + if len(images) > len(prompts) and len(prompts) == 1: + # 1 prompt + N media + images = [images] + inputs = [{ + "prompt": prompt, + "multi_modal_data": { + "image": [ + load_image(i, format=image_data_format, device="cuda") + for i in image + ] if isinstance(image, list) else + [load_image(image, format=image_data_format, device="cuda")] + } + } for prompt, image in zip(prompts, images)] + return inputs + + +def default_video_loader(prompts, videos, image_data_format="pt", num_frames=8): + if len(videos) > len(prompts) and len(prompts) == 1: + # 1 prompt + N media + videos = [videos] + inputs = [{ + "prompt": prompt, + "multi_modal_data": { + "video": [ + load_video( + i, num_frames, format=image_data_format, device="cuda") + for i in video + ] if isinstance(video, list) else [ + load_video( + video, num_frames, format=image_data_format, device="cuda") + ] + } + } for prompt, video in zip(prompts, videos)] + return inputs + + +INPUT_FORMATTER_MAP = { + "llava_llama": format_vila_input, + "llava_next": format_llava_next_input, + "qwen2_vl": format_qwen2_vl_input, + "qwen2_5_vl": format_qwen2_vl_input, +} diff --git a/tensorrt_llm/models/convert_utils.py b/tensorrt_llm/models/convert_utils.py index 310fd0ab35..4367fe085a 100644 --- a/tensorrt_llm/models/convert_utils.py +++ b/tensorrt_llm/models/convert_utils.py @@ -306,6 +306,7 @@ def load_calib_dataset(dataset_name_or_dir: str, dataset = load_dataset(dataset_name_or_dir, name=config_name, split=split, + trust_remote_code=trust_remote_code, **kwargs) return dataset[key] diff --git a/tensorrt_llm/quantization/quantize_by_modelopt.py b/tensorrt_llm/quantization/quantize_by_modelopt.py index 7b8e590fd4..01524fc0ff 100755 --- a/tensorrt_llm/quantization/quantize_by_modelopt.py +++ b/tensorrt_llm/quantization/quantize_by_modelopt.py @@ -384,20 +384,26 @@ def get_calib_dataloader(dataset_name_or_dir="cnn_dailymail", dataset = load_dataset( "json", data_files="https://the-eye.eu/public/AI/pile/val.jsonl.zst", - split="train") + split="train", + trust_remote_code=True) dataset = dataset["text"][:calib_size] elif "scienceqa" in dataset_name_or_dir.lower( ) or "science_qa" in dataset_name_or_dir.lower(): if os.path.isdir(dataset_name_or_dir): - dataset = load_dataset(dataset_name_or_dir, split="train") + dataset = load_dataset(dataset_name_or_dir, + split="train", + trust_remote_code=True) else: - dataset = load_dataset("derek-thomas/ScienceQA", split="train") + dataset = load_dataset("derek-thomas/ScienceQA", + split="train", + trust_remote_code=True) dataset = dataset.select(range(calib_size)) elif "cnn_dailymail" in dataset_name_or_dir: dataset = load_dataset( dataset_name_or_dir, name="3.0.0", split="train", + trust_remote_code=True, ) dataset = dataset["article"][:calib_size] elif os.path.isdir(dataset_name_or_dir): @@ -405,7 +411,9 @@ def get_calib_dataloader(dataset_name_or_dir="cnn_dailymail", f"Recognized 
local dataset repo {dataset_name_or_dir} for calibration; " "assuming the calibration data are in the train split and text column." ) - dataset = load_dataset(dataset_name_or_dir, split="train") + dataset = load_dataset(dataset_name_or_dir, + split="train", + trust_remote_code=True) dataset = dataset["text"][:calib_size] else: raise NotImplementedError( @@ -993,22 +1001,29 @@ def get_nemo_calib_dataloader(dataset_name_or_dir="cnn_dailymail", dataset = load_dataset( "json", data_files="https://the-eye.eu/public/AI/pile/val.jsonl.zst", - split="train") + split="train", + trust_remote_code=True) text_column = "text" elif "wikitext" in dataset_name_or_dir: dataset = load_dataset(dataset_name_or_dir, "wikitext-103-v1", - split="train") + split="train", + trust_remote_code=True) text_column = "text" elif "cnn_dailymail" in dataset_name_or_dir: - dataset = load_dataset(dataset_name_or_dir, name="3.0.0", split="train") + dataset = load_dataset(dataset_name_or_dir, + name="3.0.0", + split="train", + trust_remote_code=True) text_column = "article" elif os.path.isdir(dataset_name_or_dir): logger.info( f"Recognized local dataset repo {dataset_name_or_dir} for calibration; " "assuming the calibration data are in the train split and text column." ) - dataset = load_dataset(dataset_name_or_dir, split="train") + dataset = load_dataset(dataset_name_or_dir, + split="train", + trust_remote_code=True) text_column = "text" else: raise NotImplementedError( diff --git a/tests/unittest/llmapi/test_llm.py b/tests/unittest/llmapi/test_llm.py index 7fe3e5e1c9..f74a6e0074 100644 --- a/tests/unittest/llmapi/test_llm.py +++ b/tests/unittest/llmapi/test_llm.py @@ -366,8 +366,11 @@ def test_tokenizer_decode_incrementally(tokenizer_dir: str, threshold: float): num_samples = 100 cnn_dailymail = datasets.load_dataset(cnn_dailymail_path, name='3.0.0', - split='train') - alpaca_chinese = datasets.load_dataset(alpaca_chinese_path, split='train') + split='train', + trust_remote_code=True) + alpaca_chinese = datasets.load_dataset(alpaca_chinese_path, + split='train', + trust_remote_code=True) dataset = cnn_dailymail['article'][:num_samples // 2] + alpaca_chinese[ 'output_zh'][:num_samples // 2]