TensorRT-LLMs/examples/nemotron_nas/calibration_utils.py
rakib-hasan ff3b741045
feat: adding multimodal (only image for now) support in trtllm-bench (#3490)
* feat: adding multimodal (only image for now) support in trtllm-bench

Signed-off-by: Rakib Hasan <rhasan@nvidia.com>

* fix: add `trust_remote_code=True` in load_dataset() calls to maintain the v2.19.2 behavior

Signed-off-by: Rakib Hasan <rhasan@nvidia.com>

* re-adding prompt_token_ids and using that for prompt_len

Signed-off-by: Rakib Hasan <rhasan@nvidia.com>

* updating the datasets version in examples as well

Signed-off-by: Rakib Hasan <rhasan@nvidia.com>

* api changes are not needed

Signed-off-by: Rakib Hasan <rhasan@nvidia.com>

* moving datasets requirement and removing a missed api change

Signed-off-by: Rakib Hasan <rhasan@nvidia.com>

* addressing review comments

Signed-off-by: Rakib Hasan <rhasan@nvidia.com>

* refactoring the quickstart example

Signed-off-by: Rakib Hasan <rhasan@nvidia.com>

---------

Signed-off-by: Rakib Hasan <rhasan@nvidia.com>
2025-04-18 07:06:16 +08:00

40 lines
1.5 KiB
Python

# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Dataset identifier handed to `datasets.load_dataset` as the source of the
# calibration text (presumably a Hugging Face Hub repository id -- confirm).
DATASET = "Magpie-Align/Magpie-Pro-MT-300K-v0.1"
def create_trtllm_magpie_calibration_dataset(output_dir: str,
                                             calib_size: int = 512) -> None:
    """Write a plain-text calibration set derived from ``DATASET``.

    Loads the ``train`` split of ``DATASET``, keeps its first ``calib_size``
    rows, flattens every row into one newline-joined string stored in a
    single ``"text"`` column, and saves the result as ``data.parquet`` inside
    ``output_dir``.

    Args:
        output_dir: Directory that receives ``data.parquet``. It is created
            (parents included) when it does not exist yet. An empty string
            resolves to the current working directory.
        calib_size: Number of leading rows of the split to keep.
    """
    # Local imports keep the module importable without these being loaded
    # until the function is actually called.
    import os

    from datasets import load_dataset

    dataset = load_dataset(DATASET, split="train", trust_remote_code=True)

    def transform(conversation):
        # Join the 'value' field of every entry under 'conversations' into
        # one newline-separated text sample.
        value = '\n'.join(turn['value']
                          for turn in conversation['conversations'])
        return {"text": value}

    # Drop every original column so only the generated "text" column remains.
    dataset = dataset.select(range(calib_size)).map(
        transform, remove_columns=dataset.column_names)

    # Make sure the target directory exists; otherwise the write below fails
    # on a fresh output location.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # https://github.com/huggingface/datasets/issues/6703#issuecomment-1974766332
    dataset.to_parquet(os.path.join(output_dir, "data.parquet"))
if __name__ == "__main__":
    import sys

    # Script entry point: the first command-line argument is the directory
    # that will receive the generated calibration parquet file.
    if len(sys.argv) < 2:
        # Exit with a readable usage hint instead of a bare IndexError.
        sys.exit(f"Usage: python {sys.argv[0]} <output_dir>")
    output_dir = sys.argv[1]
    create_trtllm_magpie_calibration_dataset(output_dir)