Mirror of https://github.com/NVIDIA/TensorRT-LLM.git, synced 2026-01-13 22:18:36 +08:00
[None][chore] update torch_dtype -> dtype in 'transformers' (#8263)
Signed-off-by: ixlmar <206748156+ixlmar@users.noreply.github.com>
parent 616d1df7a0
commit 93a4b7f1b6
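For reference, the mechanical change applied throughout the hunks below is a keyword rename in Hugging Face Transformers loading calls. A minimal sketch, assuming transformers >= 4.56 (the floor the requirement files in this diff are bumped to) and using a placeholder model name that is not part of this repository:

import torch
from transformers import AutoModelForCausalLM

# Old spelling, superseded in recent transformers releases:
# model = AutoModelForCausalLM.from_pretrained("gpt2", torch_dtype=torch.float16)

# New spelling, as used throughout this commit:
model = AutoModelForCausalLM.from_pretrained("gpt2", dtype=torch.float16)

Everything else about the calls (device_map, trust_remote_code, and so on) is left untouched; only the dtype keyword changes.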
@@ -350,7 +350,7 @@ def main():
 hf_model = LlamaForCausalLM if args.model_type != "mixtral" else MixtralForCausalLM
 model = hf_model.from_pretrained(
     args.model_dir,
-    torch_dtype='auto',
+    dtype='auto',
     device_map='auto' if not args.load_model_on_cpu else 'cpu',
     trust_remote_code=True)
@@ -447,7 +447,7 @@ def main():
 model = auto_model_cls.from_pretrained(
     args.hf_model_dir,
     trust_remote_code=True,
-    torch_dtype=DTYPE_STR_MAPPING[args.hf_data_type],
+    dtype=DTYPE_STR_MAPPING[args.hf_data_type],
     device_map="auto" if args.hf_device_map_auto else None,
 )
 if not args.hf_device_map_auto:
@@ -898,7 +898,7 @@ def main():
 if not convert_from_ckpt:
     logger.info(f'Convert by using model')
     hf_bloom = BloomForCausalLM.from_pretrained(args.model_dir,
-        torch_dtype="auto",
+        dtype="auto",
         device_map="auto",
         trust_remote_code=True)
 else:
@@ -405,13 +405,13 @@ def main():
 
 if args.model_type == "llava":
     hf_llava = LlavaForConditionalGeneration.from_pretrained(
-        args.model_dir, torch_dtype="auto")
+        args.model_dir, dtype="auto")
     model = hf_llava.language_model
 else:
     model = AutoModelForCausalLM.from_pretrained(
         args.model_dir,
         device_map='auto' if not args.load_model_on_cpu else 'cpu',
-        torch_dtype='auto' if not args.smoothquant else torch.float16,
+        dtype='auto' if not args.smoothquant else torch.float16,
         trust_remote_code=True,
     )
 if args.smoothquant is not None or args.int8_kv_cache:
@@ -605,7 +605,7 @@ if __name__ == '__main__':
 hf_model = AutoModelForCausalLM.from_pretrained(model_dir,
     trust_remote_code=True,
     device_map="auto",
-    torch_dtype=getattr(
+    dtype=getattr(
         torch, args.dtype),
     config=hf_config)
 return hf_model
@@ -129,10 +129,7 @@ def convert_and_save_hf(args: argparse.Namespace):
 import transformers
 if not args.load_by_shard and quant_config.quant_mode.has_any_quant():
     hf_model = transformers.FalconForCausalLM.from_pretrained(
-        model_dir,
-        trust_remote_code=True,
-        torch_dtype='auto',
-        device_map='auto')
+        model_dir, trust_remote_code=True, dtype='auto', device_map='auto')
 else:
     # Initialize huggingface local cache.
     # Huggingface copies the external configuration source (`configuration_falcon.py` here) into its local cache at
@@ -142,7 +139,7 @@ def convert_and_save_hf(args: argparse.Namespace):
 # Preload the config once to initialize local cache, so subsequent multithread loading won't fail.
 _ = transformers.FalconConfig.from_pretrained(model_dir,
     trust_remote_code=True,
-    torch_dtype='auto',
+    dtype='auto',
     device_map='auto')
 
 def convert_and_save_rank(args, rank: int):
@@ -1,6 +1,6 @@
 -c ../../../constraints.txt
 tensorrt_llm>=0.0.0.dev0
-transformers>=4.31.0
+transformers>=4.56.0
 datasets==3.1.0
 evaluate
 rouge_score
@@ -87,7 +87,7 @@ def convert_and_save_hf(args):
 quant_config = args_to_quant_config(args)
 
 hf_model = AutoModelForCausalLM.from_pretrained(model_dir,
-    torch_dtype='auto',
+    dtype='auto',
     trust_remote_code=True)
 
 def convert_and_save_rank(args, rank):
@@ -639,7 +639,7 @@ if __name__ == '__main__':
 
 hf_config = AutoConfig.from_pretrained(args.model_dir)
 hf_model = AutoModelForCausalLM.from_pretrained(args.model_dir,
-    torch_dtype="auto")
+    dtype="auto")
 
 config = {
     'architecture': hf_config.architectures[0],
@@ -1,2 +1,2 @@
-transformers>=4.47.1
+transformers>=4.56.0
 diffusers>=0.32.2
@@ -841,8 +841,7 @@ if __name__ == '__main__':
 
 hf_model = MptForCausalLM.from_pretrained(args.model_dir,
     device_map="auto",
-    torch_dtype=getattr(
-        torch, args.dtype))
+    dtype=getattr(torch, args.dtype))
 
 act_range = {}
 mpt_qkv_para = {}
@@ -278,10 +278,10 @@ if __name__ == '__main__':
 
 if args.model_type == 'opt':
     hf_model = AutoModelForCausalLM.from_pretrained(args.model_dir,
-        torch_dtype="auto")
+        dtype="auto")
 elif args.model_type == 'blip2':
     hf_model = Blip2ForConditionalGeneration.from_pretrained(
-        args.model_dir, torch_dtype="auto").language_model
+        args.model_dir, dtype="auto").language_model
 
 hf_config = hf_model.config
 if hf_config.hidden_size != hf_config.word_embed_proj_dim:
@@ -388,7 +388,7 @@ if __name__ == "__main__":
 if tensorrt_llm.mpi_rank() == 0:
     hf_model = AutoModelForSeq2SeqLM.from_pretrained(
         args.model_name,  # TODO: use model path instead
-        # torch_dtype=torch.float16 if '16' in dtype else torch.float32, # TODO: use matched torch dtype
+        # dtype=torch.float16 if '16' in dtype else torch.float32, # TODO: use matched torch dtype
     ).to('cuda').eval()  # TODO: create config model path instead
     assert type(hf_model) in (
         T5ForConditionalGeneration, BartForConditionalGeneration,
@@ -508,7 +508,7 @@ if __name__ == '__main__':
 
 hf_model = AutoModelForCausalLM.from_pretrained(args.model_dir,
     trust_remote_code=True,
-    torch_dtype="auto")
+    dtype="auto")
 weights = convert_from_hf(
     hf_model,
     hf_config,
@@ -1,6 +1,6 @@
 -c ../../../constraints.txt
 tensorrt_llm>=0.0.0.dev0
-transformers>=4.43.0
+transformers>=4.56.0
 datasets==3.1.0
 evaluate
 rouge_score
@@ -356,7 +356,7 @@ def main(args):
 profiler.start('load HF model')
 hf_model = AutoModelForCausalLM.from_pretrained(
     args.hf_model_location,
-    torch_dtype=torch.float16,
+    dtype=torch.float16,
     use_flash_attention_2=True)
 profiler.stop('load HF model')
 tensorrt_llm.logger.info(
@@ -1,6 +1,6 @@
 -c ../../../constraints.txt
 tensorrt_llm>=0.0.0.dev0
-transformers>=4.39.0
+transformers>=4.56.0
 datasets==3.1.0
 evaluate
 rouge_score
@@ -1,4 +1,4 @@
 -c ../../../constraints.txt
 tensorrt_llm>=0.0.0.dev0
-transformers==4.54.0
+transformers==4.56.0
 accelerate==0.25.0
@@ -120,7 +120,7 @@ def load_hf_model(args):
 model_class = getattr(__import__('transformers'),
     SUPPORTED_MODEL_TYPES[args.model_type])
 hf_model = model_class.from_pretrained(args.hf_model_dir,
-    torch_dtype=torch.float16,
+    dtype=torch.float16,
     device_map="cuda:0",
     trust_remote_code=True)
 profiler.stop('load HF model')
@@ -1 +1 @@
-transformers==4.45.2
+transformers==4.56.0
@@ -1,4 +1,4 @@
 git+https://github.com/LLaVA-VL/LLaVA-NeXT.git
-transformers>=4.44.2
+transformers>=4.56.0
 einops
 av
@@ -1,2 +1,2 @@
 git+https://github.com/bfshi/scaling_on_scales.git
-transformers==4.36.2
+transformers==4.56.0
@@ -3,7 +3,7 @@ tensorrt_llm>=0.0.0.dev0
 datasets==3.1.0
 evaluate
 rouge_score
-transformers>=4.40.1
+transformers>=4.56.0
 transformers-stream-generator
 sentencepiece>=0.1.99
 tiktoken
@@ -66,7 +66,7 @@ class ONNX_TRT:
 model = AutoModelForCausalLM.from_pretrained(
     pretrained_model_path,
     device_map="cuda",
-    torch_dtype=torch_dtype,
+    dtype=torch_dtype,
     fp16=True,
     trust_remote_code=True,
 ).eval()
@@ -155,7 +155,7 @@ class HfParser:
 hf_model = AutoModelForCausalLM.from_pretrained(
     checkpoint_path,
     device_map="auto",
-    torch_dtype="auto",
+    dtype="auto",
 )
 model_params = dict(hf_model.named_parameters())
 return model_params
@@ -4,7 +4,7 @@ git+https://github.com/google-deepmind/recurrentgemma.git@8a32e365
 flax>=0.8.2
 jax~=0.4.23
 orbax-checkpoint==0.5.7
-transformers>=4.40.0
+transformers>=4.56.0
 datasets==3.1.0
 evaluate
 rouge_score
@@ -421,7 +421,7 @@ def main():
 # TODO: When ReDrafter is added to Transformers
 # hf_drafter_model = AutoModel.from_pretrained(
 #     args.drafter_model_dir,
-#     torch_dtype="auto",
+#     dtype="auto",
 # )
 ckpt_file = Path(args.drafter_model_dir, "model.safetensors")
 if not Path.exists(ckpt_file):
@@ -712,7 +712,7 @@ def main(args):
 model = auto_model_cls.from_pretrained(
     args.hf_model_dir,
     trust_remote_code=True,
-    torch_dtype=str_dtype_to_torch(args.hf_data_type),
+    dtype=str_dtype_to_torch(args.hf_data_type),
     device_map='auto' if args.hf_device_map_auto else None)
 try:
     model.to_bettertransformer()
@@ -37,12 +37,12 @@ class VisionTower(nn.Module):
 
 if "clip" in self.name:
     self.vision_tower = AutoModel.from_pretrained(
-        model_name_or_path, torch_dtype=config.model_dtype)
+        model_name_or_path, dtype=config.model_dtype)
 elif "siglip" in self.name:
     self.vision_tower = AutoModel.from_pretrained(
         model_name_or_path,
         attn_implementation="flash_attention_2",
-        torch_dtype="auto")
+        dtype="auto")
 else:
     raise ValueError(f"Unsupported vision tower: {self.name}")
 
@@ -481,7 +481,7 @@ def quantize(hf_model_dir: str,
 hf_model = AutoModelForCausalLM.from_pretrained(
     hf_model_dir,
     device_map='auto' if device != 'cpu' else 'cpu',
-    torch_dtype='auto'
+    dtype='auto'
     if not config.quantization._use_plugin_sq else torch.float16,
     trust_remote_code=trust_remote_code)
 tokenizer = AutoTokenizer.from_pretrained(
@@ -186,7 +186,7 @@ class BaichuanForCausalLM(DecoderModelForCausalLM):
 hf_model = transformers.AutoModelForCausalLM.from_pretrained(
     hf_model_or_dir,
     trust_remote_code=trust_remote_code,
-    torch_dtype='auto')
+    dtype='auto')
 hf_config_or_dir = hf_model_or_dir
 
 config = BaichuanConfig.from_hugging_face(hf_config_or_dir,
@@ -691,7 +691,7 @@ def quantize(hf_model_dir: str,
 hf_model = AutoModel.from_pretrained(
     hf_model_dir,
     trust_remote_code=trust_remote_code,
-    torch_dtype='auto' if config.chatglm_version != 'glm' else getattr(
+    dtype='auto' if config.chatglm_version != 'glm' else getattr(
         torch, config.dtype),
     device_map=device_map)
 
@@ -300,7 +300,7 @@ class ChatGLMForCausalLM(DecoderModelForCausalLM):
 hf_model = AutoModel.from_pretrained(
     hf_model_or_dir,
     trust_remote_code=trust_remote_code,
-    torch_dtype='auto' if config.chatglm_version != 'glm' else getattr(
+    dtype='auto' if config.chatglm_version != 'glm' else getattr(
         torch, config.dtype),
     device_map=device_map)
 weights = load_weights_from_hf_model(hf_model, config)
@@ -25,7 +25,7 @@ from ..._utils import pad_vocab_size, release_gc
 def load_hf_deepseek(model_dir):
     model = AutoModelForCausalLM.from_pretrained(model_dir,
         device_map='auto',
-        torch_dtype='auto',
+        dtype='auto',
         trust_remote_code=True)
     return model
 
@@ -168,7 +168,7 @@ def load_hf_deepseek(model_dir, load_model_on_cpu=False):
     model = AutoModelForCausalLM.from_pretrained(model_dir,
         config=hf_config,
         device_map='cpu',
-        torch_dtype='auto',
+        dtype='auto',
         trust_remote_code=True)
 else:
     # Deepseek-v2 236B parameters with FP16 dtype need at least 472G GPU memory
@@ -197,7 +197,7 @@ def load_hf_deepseek(model_dir, load_model_on_cpu=False):
     config=hf_config,
     device_map=device_map,
     max_memory=max_memory,
-    torch_dtype='auto',
+    dtype='auto',
     trust_remote_code=True)
 
 return model
@@ -265,7 +265,7 @@ class FalconForCausalLM(DecoderModelForCausalLM):
     weights = load_weights_from_hf_by_shard(hf_model_dir, config)
 else:
     hf_model = transformers.AutoModelForCausalLM.from_pretrained(
-        hf_model_dir, torch_dtype='auto')
+        hf_model_dir, dtype='auto')
     weights = load_weights_from_hf_model(hf_model, config)
 
 model = cls(config)
@@ -285,7 +285,7 @@ class HfParser:
 hf_model = AutoModelForCausalLM.from_pretrained(
     checkpoint_path,
     device_map="cpu" if load_model_on_cpu else "auto",
-    torch_dtype='auto',
+    dtype='auto',
     trust_remote_code=True,
 )
 model_params = dict(hf_model.named_parameters())
@@ -302,7 +302,7 @@ class GemmaForCausalLM(DecoderModelForCausalLM):
 hf_gemma = transformers.AutoModelForCausalLM.from_pretrained(
     hf_model_dir,
     device_map="cpu" if load_model_on_cpu else "auto",
-    torch_dtype='auto',
+    dtype='auto',
 )
 weights = load_gemma_weights_from_hf_model(hf_gemma, trt_llm_config)
 del hf_gemma
@@ -878,7 +878,7 @@ def quantize(hf_model_dir: str,
 hf_model = AutoModelForCausalLM.from_pretrained(
     hf_model_dir,
     device_map='auto' if device != 'cpu' else 'cpu',
-    torch_dtype='auto' if not use_smooth_quant else torch.float16,
+    dtype='auto' if not use_smooth_quant else torch.float16,
     trust_remote_code=trust_remote_code)
 
 os.environ["TOKENIZERS_PARALLELISM"] = os.environ.get(
@@ -916,7 +916,7 @@ def load_hf_gpt(model_dir: str, load_model_on_cpu: bool = False):
 hf_model = AutoModelForCausalLM.from_pretrained(
     model_dir,
     device_map='auto' if not load_model_on_cpu else 'cpu',
-    torch_dtype='auto',
+    dtype='auto',
     trust_remote_code=True,
 )
 return hf_model
@@ -194,9 +194,7 @@ class GPTJForCausalLM(DecoderModelForCausalLM):
 trust_remote_code = kwargs.pop('trust_remote_code', True)
 
 hf_model = transformers.AutoModelForCausalLM.from_pretrained(
-    hf_model_dir,
-    torch_dtype='auto',
-    trust_remote_code=trust_remote_code)
+    hf_model_dir, dtype='auto', trust_remote_code=trust_remote_code)
 weights = load_weights_from_hf_model(hf_model, config)
 
 model = GPTJForCausalLM(config)
@@ -480,7 +480,7 @@ def load_hf_llama(model_dir: str, load_model_on_cpu: bool = False):
 model = model_cls.from_pretrained(
     model_dir,
     device_map='auto' if not load_model_on_cpu else 'cpu',
-    torch_dtype='auto',
+    dtype='auto',
     trust_remote_code=True,
 )
 if hf_config.model_type in ["llava", "llava_next"]:
@@ -1129,7 +1129,7 @@ def quantize(hf_model_dir: str,
 hf_model = AutoModelForCausalLM.from_pretrained(
     hf_model_dir,
     device_map='auto' if device != 'cpu' else 'cpu',
-    torch_dtype='auto' if not use_smooth_quant else torch.float16,
+    dtype='auto' if not use_smooth_quant else torch.float16,
     trust_remote_code=trust_remote_code)
 
 os.environ["TOKENIZERS_PARALLELISM"] = os.environ.get(
@@ -458,7 +458,7 @@ class MambaForCausalLM(PretrainedModel):
 
 if not os.path.exists(hf_model_dir):
     hf_model = AutoModelForCausalLM.from_pretrained(
-        hf_model_dir, torch_dtype="auto", trust_remote_code=True)
+        hf_model_dir, dtype="auto", trust_remote_code=True)
 
 assert isinstance(hf_model, transformers.PreTrainedModel)
 weights = convert_hf_mamba(hf_model, dtype)
@@ -224,7 +224,7 @@ class MedusaForCausalLm(PretrainedModel):
 else:
     hf_model = AutoModelForCausalLM.from_pretrained(
         hf_model_dir,
-        torch_dtype="auto",
+        dtype="auto",
         trust_remote_code=trust_remote_code)
 
 assert isinstance(hf_model, transformers.PreTrainedModel)
@@ -674,7 +674,7 @@ class DeciLMForCausalLM(DecoderModelForCausalLM):
 hf_model = transformers.AutoModelForCausalLM.from_pretrained(
     hf_model_or_dir,
     device_map='auto' if not load_model_on_cpu else 'cpu',
-    torch_dtype=dtype,
+    dtype=dtype,
     trust_remote_code=trust_remote_code,
 )
 weights = load_weights_from_hf_model(hf_model, config)
@@ -203,9 +203,7 @@ class PhiForCausalLM(DecoderModelForCausalLM):
 trust_remote_code = kwargs.pop('trust_remote_code', True)
 
 hf_model = AutoModelForCausalLM.from_pretrained(
-    hf_model_dir,
-    torch_dtype="auto",
-    trust_remote_code=trust_remote_code)
+    hf_model_dir, dtype="auto", trust_remote_code=trust_remote_code)
 
 assert isinstance(hf_model, transformers.PreTrainedModel)
 
@@ -302,9 +302,7 @@ class Phi3ForCausalLM(DecoderModelForCausalLM):
 trust_remote_code = kwargs.pop('trust_remote_code', True)
 
 hf_model = AutoModelForCausalLM.from_pretrained(
-    hf_model_dir,
-    torch_dtype="auto",
-    trust_remote_code=trust_remote_code)
+    hf_model_dir, dtype="auto", trust_remote_code=trust_remote_code)
 
 assert isinstance(hf_model, transformers.PreTrainedModel)
 
@@ -467,7 +467,7 @@ def load_hf_qwen(model_dir: str, load_model_on_cpu: bool = False):
 model = model_cls.from_pretrained(
     model_dir,
     device_map='auto' if not load_model_on_cpu else 'cpu',
-    torch_dtype='auto',
+    dtype='auto',
     trust_remote_code=True)
 return model
 
@@ -996,7 +996,7 @@ def quantize(hf_model_dir: str,
 hf_model = model_cls.from_pretrained(
     hf_model_dir,
     device_map='auto',
-    torch_dtype='auto' if not use_smooth_quant else torch.float16,
+    dtype='auto' if not use_smooth_quant else torch.float16,
     trust_remote_code=True).half()
 
 os.environ["TOKENIZERS_PARALLELISM"] = os.environ.get(
@@ -285,7 +285,7 @@ def _get_llava_qwen_model(model_dir, dtype, device):
 if "hf" in model_dir:
     from transformers import LlavaOnevisionForConditionalGeneration
     model = LlavaOnevisionForConditionalGeneration.from_pretrained(
-        model_dir, torch_dtype=dtype, device_map=device)
+        model_dir, dtype=dtype, device_map=device)
     model = model.language_model
 else:
     from llava.model.builder import load_pretrained_model
@@ -328,20 +328,20 @@ def get_model(ckpt_path: str,
     from transformers import AutoModelForSeq2SeqLM
     model = AutoModelForSeq2SeqLM.from_pretrained(ckpt_path,
         device_map="cuda",
-        torch_dtype=torch_dtype,
+        dtype=torch_dtype,
         trust_remote_code=True)
 elif model_type_is_enc_dec(hf_config.model_type):
     from transformers import AutoModelForSeq2SeqLM
     model = AutoModelForSeq2SeqLM.from_pretrained(ckpt_path,
         device_map=device,
-        torch_dtype=torch_dtype,
+        dtype=torch_dtype,
         trust_remote_code=True)
     model = EncDecModelWrapper(hf_model=model)
 else:
     model = model_cls.from_pretrained(
         ckpt_path,
         device_map=device_map if device != "cpu" else "cpu",
-        torch_dtype="auto",
+        dtype="auto",
         trust_remote_code=True)
     if hf_config.model_type in ["llava", "internvl_chat"]:
         model = model.language_model
@@ -692,11 +692,10 @@ class MultimodalModelRunner:
 
 # Phi-4-multimodal uses pytorch engine due to issues with creating TRT engine.
 if self.model_type == "phi-4-multimodal":
-    model = AutoModelForCausalLM.from_pretrained(
-        self.args.hf_model_dir,
-        torch_dtype=torch.float16,
-        trust_remote_code=True,
-        device_map='cpu')
+    model = AutoModelForCausalLM.from_pretrained(self.args.hf_model_dir,
+        dtype=torch.float16,
+        trust_remote_code=True,
+        device_map='cpu')
     self.vision_model = model.model.embed_tokens_extend.image_embed.to(
         self.device).eval()
     self.image_newlines = {}
@@ -707,11 +706,10 @@ class MultimodalModelRunner:
     return
 
 if self.model_type == "phi-3-vision":
-    model = AutoModelForCausalLM.from_pretrained(
-        self.args.hf_model_dir,
-        torch_dtype=torch.float16,
-        trust_remote_code=True,
-        device_map='cpu')
+    model = AutoModelForCausalLM.from_pretrained(self.args.hf_model_dir,
+        dtype=torch.float16,
+        trust_remote_code=True,
+        device_map='cpu')
     self.vision_model = model.model.vision_embed_tokens.to(
         self.device).eval()
 
@@ -765,7 +763,7 @@ class MultimodalModelRunner:
 def init_audio_encoder(self):
     assert self.model_type == "phi-4-multimodal"
     model = AutoModelForCausalLM.from_pretrained(self.args.hf_model_dir,
-        torch_dtype=torch.float16,
+        dtype=torch.float16,
         trust_remote_code=True,
         device_map='cpu')
     self.audio_model = model.model.embed_tokens_extend.audio_embed.to(
@@ -859,7 +857,7 @@ class MultimodalModelRunner:
 
 from transformers import CLIPImageProcessor
 processor = CLIPImageProcessor.from_pretrained(
-    "openai/clip-vit-large-patch14", torch_dtype=torch.bfloat16)
+    "openai/clip-vit-large-patch14", dtype=torch.bfloat16)
 frames = processor.preprocess(frames,
     return_tensors="pt")['pixel_values']
 # make dtype consistent with vision encoder
@@ -377,8 +377,8 @@ def build_blip2_engine(args):
         return_dict=True)
     return self.projector(qformer_output.last_hidden_state)
 
-model = Blip2ForConditionalGeneration.from_pretrained(
-    args.model_path, torch_dtype=torch.float16)
+model = Blip2ForConditionalGeneration.from_pretrained(args.model_path,
+    dtype=torch.float16)
 
 blip2_llm = ""
 if model.language_model.config.architectures[
@@ -449,8 +449,8 @@ def build_pix2struct_engine(args):
         img_features = self.encoder.layernorm(img_features[0])
         return img_features
 
-model = Pix2StructForConditionalGeneration.from_pretrained(
-    args.model_path, torch_dtype=dtype)
+model = Pix2StructForConditionalGeneration.from_pretrained(args.model_path,
+    dtype=dtype)
 
 wrapper = pix2structVisionWrapper(model.encoder.to(args.device))
 # input shape: batch size, number of patches, hidden dimension
@@ -501,7 +501,7 @@ def build_llava_engine(args):
 # Need to setup at hf_config._attn_implementation after transformers >= 4.46
 hf_config._attn_implementation = "eager"
 model = LlavaForConditionalGeneration.from_pretrained(
-    args.model_path, torch_dtype=torch.float16, config=hf_config)
+    args.model_path, dtype=torch.float16, config=hf_config)
 wrapper = LlavaVisionWrapper(
     model.vision_tower.to(args.device),
     model.multi_modal_projector.to(args.device),
@@ -530,7 +530,7 @@ def build_llava_engine(args):
 hf_config = AutoConfig.from_pretrained(args.model_path)
 hf_config.vision_config._attn_implementation = "eager"
 model = LlavaNextForConditionalGeneration.from_pretrained(
-    args.model_path, torch_dtype=torch.float16, config=hf_config)
+    args.model_path, dtype=torch.float16, config=hf_config)
 wrapper = LlavaNextVisionWrapper(
     model.vision_tower.vision_model.to(args.device),
     model.multi_modal_projector.to(args.device),
@@ -585,7 +585,7 @@ def build_llava_engine(args):
         return image_features  # (sigma(bs, patches_i), 729, c)
 
 model = LlavaOnevisionForConditionalGeneration.from_pretrained(
-    args.model_path, torch_dtype=torch.float16)
+    args.model_path, dtype=torch.float16)
 wrapper = LlavaOnevisionVisionWrapper(
     model.vision_tower.vision_model.to(args.device),
     model.multi_modal_projector.to(args.device), model.config)
@@ -675,7 +675,7 @@ def build_nougat_engine(args):
         return self.encoder(image).last_hidden_state
 
 model = VisionEncoderDecoderModel.from_pretrained(args.model_path,
-    torch_dtype=torch.float16)
+    dtype=torch.float16)
 swin_encoder = model.get_encoder().to(args.device)
 wrapper = SwinEncoderWrapper(swin_encoder)
 
@@ -710,7 +710,7 @@ def build_cogvlm_engine(args):
         return self.encoder(image)
 
 cogvlm = AutoModelForCausalLM.from_pretrained(args.model_path,
-    torch_dtype=dtype,
+    dtype=dtype,
     trust_remote_code=True)
 vit_encoder = cogvlm.model.vision.to(args.device).eval()
 
@@ -742,7 +742,7 @@ def build_fuyu_engine(args):
         return self.linear(patches).flatten(0, 1)
 
 model = FuyuForCausalLM.from_pretrained(args.model_path,
-    torch_dtype=torch.float16)
+    dtype=torch.float16)
 
 vision_encoder = model.vision_embed_tokens
 wrapper = FuyuEncoderWrapper(vision_encoder).to(args.device)
@@ -803,7 +803,7 @@ def build_neva_engine(args):
 if os.path.isdir(joined_path):
     vision_path = joined_path
 encoder = AutoModel.from_pretrained(vision_path,
-    torch_dtype=torch.bfloat16,
+    dtype=torch.bfloat16,
     trust_remote_code=True)
 vision_encoder = encoder.vision_model
 hf_config = encoder.config
@@ -884,7 +884,7 @@ def build_video_neva_engine(args):
         return vision_x
 
 encoder = AutoModel.from_pretrained(vision_config["from_pretrained"],
-    torch_dtype=torch.bfloat16,
+    dtype=torch.bfloat16,
     trust_remote_code=True,
     attn_implementation="eager")
 vision_encoder = encoder.vision_model
@@ -951,7 +951,7 @@ def build_kosmos_engine(args):
         return img_features
 
 model = AutoModelForVision2Seq.from_pretrained(args.model_path,
-    torch_dtype=torch.float16)
+    dtype=torch.float16)
 wrapper = VisionEncoderWrapper(
     model.vision_model.to(args.device),
     model.image_to_text_projection.to(args.device))
@@ -1001,7 +1001,7 @@ def build_phi_engine(args):
         1, pixel_values.shape[0], -1, self.vision_model.image_dim_out)
 
 model = AutoModelForCausalLM.from_pretrained(args.model_path,
-    torch_dtype=torch.float16,
+    dtype=torch.float16,
     trust_remote_code=True)
 vision_model = model.model.vision_embed_tokens
 
@@ -1103,7 +1103,7 @@ def build_phi4mm_engine(args):
         return torch.cat((speech_out, vision_out), dim=-1)
 
 model = AutoModelForCausalLM.from_pretrained(args.model_path,
-    torch_dtype='auto',
+    dtype='auto',
     trust_remote_code=True)
 
 vision_model = model.model.embed_tokens_extend.image_embed
@@ -1188,7 +1188,7 @@ def build_mllama_engine(args):
 # conflict with limitation of other multimodal models.
 from transformers import MllamaForConditionalGeneration
 model = MllamaForConditionalGeneration.from_pretrained(args.model_path,
-    torch_dtype='auto',
+    dtype='auto',
     device_map='auto')
 
 # Check if the model structure is updated to transformers >= 4.52.0
@@ -1279,7 +1279,7 @@ def build_internvl_engine(args):
         return vit_embeds_mlp
 
 model = AutoModelForCausalLM.from_pretrained(args.model_path,
-    torch_dtype=torch.float16,
+    dtype=torch.float16,
     trust_remote_code=True,
     use_flash_attn=False).to(
         args.device)
@@ -1345,7 +1345,7 @@ def build_qwen2_vl_engine(args):
 
 model = Qwen2VLForConditionalGeneration.from_pretrained(
     args.model_path,
-    torch_dtype=torch.float32,
+    dtype=torch.float32,
     device_map="cpu",
     attn_implementation="eager")
 hf_config = AutoConfig.from_pretrained(args.model_path)
@@ -1502,7 +1502,7 @@ def build_qwen2_vl_engine(args):
 super().__init__()
 self.visual = Qwen2VisionTransformerPretrainedModelOpt._from_config(
     model.config.vision_config,
-    torch_dtype=torch.float32,
+    dtype=torch.float32,
 )
 self.visual.load_state_dict(model.visual.state_dict())
 
@@ -1544,7 +1544,7 @@ def build_qwen2_audio_engine(args):
 from transformers import Qwen2AudioForConditionalGeneration
 
 model = Qwen2AudioForConditionalGeneration.from_pretrained(
-    args.model_path, torch_dtype=torch.float16)
+    args.model_path, dtype=torch.float16)
 
 # dummy audio features, dtype is float32
 audio = torch.randn(1,
@@ -1710,7 +1710,7 @@ def build_pixtral_engine(args):
         return out
 
 model = Mistral3ForConditionalGeneration.from_pretrained(args.model_path,
-    torch_dtype="auto")
+    dtype="auto")
 vision_tower = model.vision_tower
 mm_projector = model.multi_modal_projector
 
@@ -750,7 +750,7 @@ def generate_dummy_loras(
 try:
     model = AutoModelForCausalLM.from_pretrained(
         hf_model_dir,
-        torch_dtype=torch.float16,
+        dtype=torch.float16,
         device_map=None,  # Load everything to CPU first
         trust_remote_code=True,
         low_cpu_mem_usage=False,
@@ -762,7 +762,7 @@ def generate_dummy_loras(
     )
     model = AutoModelForCausalLM.from_pretrained(
         hf_model_dir,
-        torch_dtype=torch.float16,
+        dtype=torch.float16,
         device_map="auto",
         trust_remote_code=True,
     )
@@ -35,7 +35,7 @@ def make_pixtral_vision_config():
 pretrained_config=transformers.PixtralVisionConfig(
     hidden_size=1024,
     num_attention_heads=16,
-    torch_dtype=torch.bfloat16,
+    dtype=torch.bfloat16,
     hidden_act="silu",
 ),
 )
@@ -356,7 +356,7 @@ class TestLoraAttentionPytorchFlowVsTRT(unittest.TestCase):
     rms_norm_eps=1e-5,
     vocab_size=32000,
     num_key_value_heads=self.head_num,
-    torch_dtype=self.torch_dtype)
+    dtype=self.torch_dtype)
 
 mapping = Mapping(world_size=1, tp_size=1, rank=0)
 kv_cache_config = KvCacheConfig(max_tokens=max_seq_len)
@@ -546,7 +546,7 @@ def test_bielik_11b_v2_2_instruct_multi_lora() -> None:
 print("Creating dummy LoRAs...")
 
 model = AutoModelForCausalLM.from_pretrained(model_dir,
-    torch_dtype=torch.bfloat16,
+    dtype=torch.bfloat16,
     device_map="auto")
 hf_modules = ["q_proj", "k_proj", "v_proj"]
 peft_lora_config = PeftLoraConfig(r=8,
@@ -599,7 +599,7 @@ def test_gemma3_1b_instruct_multi_lora() -> None:
 print("Creating dummy LoRAs...")
 
 model = AutoModelForCausalLM.from_pretrained(model_dir,
-    torch_dtype=torch.bfloat16,
+    dtype=torch.bfloat16,
     device_map="auto")
 hf_modules = ["q_proj", "k_proj", "v_proj"]
 peft_lora_config = PeftLoraConfig(r=8,
@@ -222,7 +222,7 @@ def create_gpt_attention_network(attention_type='gpt2_attention',
     embd_pdrop=0,
     attn_pdrop=0,
     hidden_act='gelu',
-    torch_dtype=dtype,
+    dtype=dtype,
 )
 attention = GPT2Attention(configuration).cuda().eval()
 
@@ -125,7 +125,7 @@ class TestFunctional(unittest.TestCase):
     num_attention_heads=num_heads,
     vocab_size=30522,
     hidden_act='gelu',
-    torch_dtype=dtype,
+    dtype=dtype,
 )
 attention = AttentionCls(configuration).cuda().eval()
 
@@ -883,7 +883,7 @@ class TestFunctional(unittest.TestCase):
     embd_pdrop=0,
     attn_pdrop=0,
     hidden_act='gelu',
-    torch_dtype=dtype,
+    dtype=dtype,
     attn_implementation='eager')
 
 if attention_type in ['gptj_attention', 'llama_attention']:
@@ -575,7 +575,7 @@ class TestFunctional(unittest.TestCase):
     embd_pdrop=0,
     attn_pdrop=0,
     hidden_act='gelu',
-    torch_dtype=dtype,
+    dtype=dtype,
     attn_implementation='eager')
 if attention_type == 'llama_attention':
     configuration.num_key_value_heads = num_kv_heads
@@ -56,7 +56,7 @@ class TestGPT(unittest.TestCase):
     activation_function=hidden_act,
     n_layer=n_layer,
     max_length=max_length,
-    torch_dtype=dtype,
+    dtype=dtype,
 )
 gpt_config.n_kv_head = gpt_config.n_head
 hf_gpt = GPT2LMHeadModel(gpt_config).cuda().eval()
@@ -506,7 +506,7 @@ class TestLLaMA(unittest.TestCase):
         "model": "cpu",
         "lm_head": "cpu"
     },  # Load to CPU memory
-    torch_dtype="auto")
+    dtype="auto")
 assert hf_llama.config.torch_dtype == torch.float16
 kv_dtype = trt.float16 if hf_llama.config.torch_dtype == torch.float16 else trt.float32
 max_context_length = 128  # for loader tests this value does not matter
@@ -177,7 +177,7 @@ class TestMamba(unittest.TestCase):
 
 # get hf mamba
 hf_mamba = AutoModelForCausalLM.from_config(
-    hf_config, torch_dtype=str_dtype_to_torch(dtype)).cuda().eval()
+    hf_config, dtype=str_dtype_to_torch(dtype)).cuda().eval()
 
 # inputs
 if remove_padding:
@@ -373,7 +373,7 @@ class TestMamba(unittest.TestCase):
 
 # get hf mamba
 hf_mamba = AutoModelForCausalLM.from_pretrained(
-    hf_path, device_map='cpu', torch_dtype=str_dtype_to_torch(dtype))
+    hf_path, device_map='cpu', dtype=str_dtype_to_torch(dtype))
 
 # get tensort llm mamba
 hf_config = MambaConfig.from_pretrained(hf_path)
@@ -514,7 +514,7 @@ class TestMistralAndArctic(unittest.TestCase):
         "model": "cpu",
         "lm_head": "cpu"
     },  # Load to CPU memory
-    torch_dtype="auto")
+    dtype="auto")
 assert hf_mistral.config.torch_dtype == torch.float16
 kv_dtype = trt.float16 if hf_mistral.config.torch_dtype == torch.float16 else trt.float32
 max_context_length = 128  # for loader tests this value does not matter
@@ -386,7 +386,7 @@ class TestNemotronNas(unittest.TestCase):
 hf_model = transformers.AutoModelForCausalLM.from_pretrained(
     hf_model_dir,
     trust_remote_code=True,
-    torch_dtype=tensorrt_llm._utils.str_dtype_to_torch(params.dtype),
+    dtype=tensorrt_llm._utils.str_dtype_to_torch(params.dtype),
 ).cuda()
 runtime, config = self._from_hf_model(hf_model, params)
 self.allclose(
@@ -720,8 +720,7 @@ class TestNemotronNas(unittest.TestCase):
 from_pretrained(
     hf_model_dir,
     trust_remote_code=True,
-    torch_dtype=tensorrt_llm._utils.str_dtype_to_torch(params.dtype
-    ),
+    dtype=tensorrt_llm._utils.str_dtype_to_torch(params.dtype),
 ).cuda(),
 atol=
 0.92,  # We've observed that on a real checkpoint with the current code, fp8 MMLU is on par with BF16, and this is the observed threshold, though it may seem high.
@@ -747,8 +746,7 @@ class TestNemotronNas(unittest.TestCase):
 from_pretrained(
     hf_model_dir,
     trust_remote_code=True,
-    torch_dtype=tensorrt_llm._utils.str_dtype_to_torch(params.dtype
-    ),
+    dtype=tensorrt_llm._utils.str_dtype_to_torch(params.dtype),
     device_map="auto",
 ),
 )
@@ -934,7 +932,7 @@ class TestNemotronNas(unittest.TestCase):
 dtype = tensorrt_llm._utils.str_dtype_to_torch(dtype)
 
 hf_model = transformers.AutoModelForCausalLM.from_pretrained(
-    hf_model_dir, trust_remote_code=True, torch_dtype=dtype).cuda()
+    hf_model_dir, trust_remote_code=True, dtype=dtype).cuda()
 
 batch_size = 1
 max_seq_len = 30
@@ -934,7 +934,7 @@ class TestSmoothQuant(unittest.TestCase):
     embd_pdrop=0,
     attn_pdrop=0,
     hidden_act='gelu',
-    torch_dtype=dtype,
+    dtype=dtype,
 )
 n_positions = configuration.n_positions
 