[None][chore] update torch_dtype -> dtype in 'transformers' (#8263)

Signed-off-by: ixlmar <206748156+ixlmar@users.noreply.github.com>
mpikulski authored 2025-10-15 10:09:30 +02:00, committed by GitHub
parent 616d1df7a0
commit 93a4b7f1b6
GPG Key ID: B5690EEEBB952194 (no known key found for this signature in database)
64 changed files with 110 additions and 124 deletions
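
The change is mechanical: every transformers loader or config call that passed the 'torch_dtype' keyword now passes 'dtype', and the requirements files below bump transformers to 4.56.0 in step with the rename. For downstream code that still has to run against older transformers releases, a minimal compatibility sketch follows; it is not part of this commit, and the helper name 'dtype_kwarg' is hypothetical.

# Minimal sketch, assuming transformers >= 4.56 prefers the 'dtype' keyword
# while older releases expect 'torch_dtype'. Illustrative only, not from this commit.
from packaging import version

import transformers
from transformers import AutoModelForCausalLM

_USE_DTYPE = version.parse(transformers.__version__) >= version.parse("4.56.0")


def dtype_kwarg(value):
    # Return the keyword dict understood by the installed transformers version.
    return {"dtype": value} if _USE_DTYPE else {"torch_dtype": value}


# Usage mirroring the calls touched below ("gpt2" is a placeholder model id):
model = AutoModelForCausalLM.from_pretrained(
    "gpt2",
    trust_remote_code=True,
    **dtype_kwarg("auto"),
)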

View File

@@ -350,7 +350,7 @@ def main():
hf_model = LlamaForCausalLM if args.model_type != "mixtral" else MixtralForCausalLM
model = hf_model.from_pretrained(
args.model_dir,
torch_dtype='auto',
dtype='auto',
device_map='auto' if not args.load_model_on_cpu else 'cpu',
trust_remote_code=True)

View File

@@ -447,7 +447,7 @@ def main():
model = auto_model_cls.from_pretrained(
args.hf_model_dir,
trust_remote_code=True,
torch_dtype=DTYPE_STR_MAPPING[args.hf_data_type],
dtype=DTYPE_STR_MAPPING[args.hf_data_type],
device_map="auto" if args.hf_device_map_auto else None,
)
if not args.hf_device_map_auto:

View File

@@ -898,7 +898,7 @@ def main():
if not convert_from_ckpt:
logger.info(f'Convert by using model')
hf_bloom = BloomForCausalLM.from_pretrained(args.model_dir,
torch_dtype="auto",
dtype="auto",
device_map="auto",
trust_remote_code=True)
else:

View File

@@ -405,13 +405,13 @@ def main():
if args.model_type == "llava":
hf_llava = LlavaForConditionalGeneration.from_pretrained(
args.model_dir, torch_dtype="auto")
args.model_dir, dtype="auto")
model = hf_llava.language_model
else:
model = AutoModelForCausalLM.from_pretrained(
args.model_dir,
device_map='auto' if not args.load_model_on_cpu else 'cpu',
torch_dtype='auto' if not args.smoothquant else torch.float16,
dtype='auto' if not args.smoothquant else torch.float16,
trust_remote_code=True,
)
if args.smoothquant is not None or args.int8_kv_cache:

View File

@@ -605,7 +605,7 @@ if __name__ == '__main__':
hf_model = AutoModelForCausalLM.from_pretrained(model_dir,
trust_remote_code=True,
device_map="auto",
torch_dtype=getattr(
dtype=getattr(
torch, args.dtype),
config=hf_config)
return hf_model

View File

@@ -129,10 +129,7 @@ def convert_and_save_hf(args: argparse.Namespace):
import transformers
if not args.load_by_shard and quant_config.quant_mode.has_any_quant():
hf_model = transformers.FalconForCausalLM.from_pretrained(
model_dir,
trust_remote_code=True,
torch_dtype='auto',
device_map='auto')
model_dir, trust_remote_code=True, dtype='auto', device_map='auto')
else:
# Initialize huggingface local cache.
# Huggingface copies the external configuration source (`configuration_falcon.py` here) into its local cache at
@@ -142,7 +139,7 @@ def convert_and_save_hf(args: argparse.Namespace):
# Preload the config once to initialize local cache, so subsequent multithread loading won't fail.
_ = transformers.FalconConfig.from_pretrained(model_dir,
trust_remote_code=True,
torch_dtype='auto',
dtype='auto',
device_map='auto')
def convert_and_save_rank(args, rank: int):

View File

@@ -1,6 +1,6 @@
-c ../../../constraints.txt
tensorrt_llm>=0.0.0.dev0
transformers>=4.31.0
transformers>=4.56.0
datasets==3.1.0
evaluate
rouge_score

View File

@@ -87,7 +87,7 @@ def convert_and_save_hf(args):
quant_config = args_to_quant_config(args)
hf_model = AutoModelForCausalLM.from_pretrained(model_dir,
torch_dtype='auto',
dtype='auto',
trust_remote_code=True)
def convert_and_save_rank(args, rank):

View File

@@ -639,7 +639,7 @@ if __name__ == '__main__':
hf_config = AutoConfig.from_pretrained(args.model_dir)
hf_model = AutoModelForCausalLM.from_pretrained(args.model_dir,
torch_dtype="auto")
dtype="auto")
config = {
'architecture': hf_config.architectures[0],

View File

@@ -1,2 +1,2 @@
transformers>=4.47.1
transformers>=4.56.0
diffusers>=0.32.2

View File

@@ -841,8 +841,7 @@ if __name__ == '__main__':
hf_model = MptForCausalLM.from_pretrained(args.model_dir,
device_map="auto",
torch_dtype=getattr(
torch, args.dtype))
dtype=getattr(torch, args.dtype))
act_range = {}
mpt_qkv_para = {}

View File

@@ -278,10 +278,10 @@ if __name__ == '__main__':
if args.model_type == 'opt':
hf_model = AutoModelForCausalLM.from_pretrained(args.model_dir,
torch_dtype="auto")
dtype="auto")
elif args.model_type == 'blip2':
hf_model = Blip2ForConditionalGeneration.from_pretrained(
args.model_dir, torch_dtype="auto").language_model
args.model_dir, dtype="auto").language_model
hf_config = hf_model.config
if hf_config.hidden_size != hf_config.word_embed_proj_dim:

View File

@@ -388,7 +388,7 @@ if __name__ == "__main__":
if tensorrt_llm.mpi_rank() == 0:
hf_model = AutoModelForSeq2SeqLM.from_pretrained(
args.model_name, # TODO: use model path instead
# torch_dtype=torch.float16 if '16' in dtype else torch.float32, # TODO: use matched torch dtype
# dtype=torch.float16 if '16' in dtype else torch.float32, # TODO: use matched torch dtype
).to('cuda').eval() # TODO: create config model path instead
assert type(hf_model) in (
T5ForConditionalGeneration, BartForConditionalGeneration,

View File

@@ -508,7 +508,7 @@ if __name__ == '__main__':
hf_model = AutoModelForCausalLM.from_pretrained(args.model_dir,
trust_remote_code=True,
torch_dtype="auto")
dtype="auto")
weights = convert_from_hf(
hf_model,
hf_config,

View File

@@ -1,6 +1,6 @@
-c ../../../constraints.txt
tensorrt_llm>=0.0.0.dev0
transformers>=4.43.0
transformers>=4.56.0
datasets==3.1.0
evaluate
rouge_score

View File

@@ -356,7 +356,7 @@ def main(args):
profiler.start('load HF model')
hf_model = AutoModelForCausalLM.from_pretrained(
args.hf_model_location,
torch_dtype=torch.float16,
dtype=torch.float16,
use_flash_attention_2=True)
profiler.stop('load HF model')
tensorrt_llm.logger.info(

View File

@@ -1,6 +1,6 @@
-c ../../../constraints.txt
tensorrt_llm>=0.0.0.dev0
transformers>=4.39.0
transformers>=4.56.0
datasets==3.1.0
evaluate
rouge_score

View File

@@ -1,4 +1,4 @@
-c ../../../constraints.txt
tensorrt_llm>=0.0.0.dev0
transformers==4.54.0
transformers==4.56.0
accelerate==0.25.0

View File

@@ -120,7 +120,7 @@ def load_hf_model(args):
model_class = getattr(__import__('transformers'),
SUPPORTED_MODEL_TYPES[args.model_type])
hf_model = model_class.from_pretrained(args.hf_model_dir,
torch_dtype=torch.float16,
dtype=torch.float16,
device_map="cuda:0",
trust_remote_code=True)
profiler.stop('load HF model')

View File

@@ -1 +1 @@
transformers==4.45.2
transformers==4.56.0

View File

@@ -1,4 +1,4 @@
git+https://github.com/LLaVA-VL/LLaVA-NeXT.git
transformers>=4.44.2
transformers>=4.56.0
einops
av

View File

@@ -1,2 +1,2 @@
git+https://github.com/bfshi/scaling_on_scales.git
transformers==4.36.2
transformers==4.56.0

View File

@@ -3,7 +3,7 @@ tensorrt_llm>=0.0.0.dev0
datasets==3.1.0
evaluate
rouge_score
transformers>=4.40.1
transformers>=4.56.0
transformers-stream-generator
sentencepiece>=0.1.99
tiktoken

View File

@@ -66,7 +66,7 @@ class ONNX_TRT:
model = AutoModelForCausalLM.from_pretrained(
pretrained_model_path,
device_map="cuda",
torch_dtype=torch_dtype,
dtype=torch_dtype,
fp16=True,
trust_remote_code=True,
).eval()

View File

@@ -155,7 +155,7 @@ class HfParser:
hf_model = AutoModelForCausalLM.from_pretrained(
checkpoint_path,
device_map="auto",
torch_dtype="auto",
dtype="auto",
)
model_params = dict(hf_model.named_parameters())
return model_params

View File

@@ -4,7 +4,7 @@ git+https://github.com/google-deepmind/recurrentgemma.git@8a32e365
flax>=0.8.2
jax~=0.4.23
orbax-checkpoint==0.5.7
transformers>=4.40.0
transformers>=4.56.0
datasets==3.1.0
evaluate
rouge_score

View File

@@ -421,7 +421,7 @@ def main():
# TODO: When ReDrafter is added to Transformers
# hf_drafter_model = AutoModel.from_pretrained(
# args.drafter_model_dir,
# torch_dtype="auto",
# dtype="auto",
# )
ckpt_file = Path(args.drafter_model_dir, "model.safetensors")
if not Path.exists(ckpt_file):

View File

@@ -712,7 +712,7 @@ def main(args):
model = auto_model_cls.from_pretrained(
args.hf_model_dir,
trust_remote_code=True,
torch_dtype=str_dtype_to_torch(args.hf_data_type),
dtype=str_dtype_to_torch(args.hf_data_type),
device_map='auto' if args.hf_device_map_auto else None)
try:
model.to_bettertransformer()

View File

@@ -37,12 +37,12 @@ class VisionTower(nn.Module):
if "clip" in self.name:
self.vision_tower = AutoModel.from_pretrained(
model_name_or_path, torch_dtype=config.model_dtype)
model_name_or_path, dtype=config.model_dtype)
elif "siglip" in self.name:
self.vision_tower = AutoModel.from_pretrained(
model_name_or_path,
attn_implementation="flash_attention_2",
torch_dtype="auto")
dtype="auto")
else:
raise ValueError(f"Unsupported vision tower: {self.name}")

View File

@@ -481,7 +481,7 @@ def quantize(hf_model_dir: str,
hf_model = AutoModelForCausalLM.from_pretrained(
hf_model_dir,
device_map='auto' if device != 'cpu' else 'cpu',
torch_dtype='auto'
dtype='auto'
if not config.quantization._use_plugin_sq else torch.float16,
trust_remote_code=trust_remote_code)
tokenizer = AutoTokenizer.from_pretrained(

View File

@@ -186,7 +186,7 @@ class BaichuanForCausalLM(DecoderModelForCausalLM):
hf_model = transformers.AutoModelForCausalLM.from_pretrained(
hf_model_or_dir,
trust_remote_code=trust_remote_code,
torch_dtype='auto')
dtype='auto')
hf_config_or_dir = hf_model_or_dir
config = BaichuanConfig.from_hugging_face(hf_config_or_dir,

View File

@@ -691,7 +691,7 @@ def quantize(hf_model_dir: str,
hf_model = AutoModel.from_pretrained(
hf_model_dir,
trust_remote_code=trust_remote_code,
torch_dtype='auto' if config.chatglm_version != 'glm' else getattr(
dtype='auto' if config.chatglm_version != 'glm' else getattr(
torch, config.dtype),
device_map=device_map)

View File

@@ -300,7 +300,7 @@ class ChatGLMForCausalLM(DecoderModelForCausalLM):
hf_model = AutoModel.from_pretrained(
hf_model_or_dir,
trust_remote_code=trust_remote_code,
torch_dtype='auto' if config.chatglm_version != 'glm' else getattr(
dtype='auto' if config.chatglm_version != 'glm' else getattr(
torch, config.dtype),
device_map=device_map)
weights = load_weights_from_hf_model(hf_model, config)

View File

@@ -25,7 +25,7 @@ from ..._utils import pad_vocab_size, release_gc
def load_hf_deepseek(model_dir):
model = AutoModelForCausalLM.from_pretrained(model_dir,
device_map='auto',
torch_dtype='auto',
dtype='auto',
trust_remote_code=True)
return model

View File

@@ -168,7 +168,7 @@ def load_hf_deepseek(model_dir, load_model_on_cpu=False):
model = AutoModelForCausalLM.from_pretrained(model_dir,
config=hf_config,
device_map='cpu',
torch_dtype='auto',
dtype='auto',
trust_remote_code=True)
else:
# Deepseek-v2 236B parameters with FP16 dtype need at least 472G GPU memory
@@ -197,7 +197,7 @@ def load_hf_deepseek(model_dir, load_model_on_cpu=False):
config=hf_config,
device_map=device_map,
max_memory=max_memory,
torch_dtype='auto',
dtype='auto',
trust_remote_code=True)
return model

View File

@@ -265,7 +265,7 @@ class FalconForCausalLM(DecoderModelForCausalLM):
weights = load_weights_from_hf_by_shard(hf_model_dir, config)
else:
hf_model = transformers.AutoModelForCausalLM.from_pretrained(
hf_model_dir, torch_dtype='auto')
hf_model_dir, dtype='auto')
weights = load_weights_from_hf_model(hf_model, config)
model = cls(config)

View File

@@ -285,7 +285,7 @@ class HfParser:
hf_model = AutoModelForCausalLM.from_pretrained(
checkpoint_path,
device_map="cpu" if load_model_on_cpu else "auto",
torch_dtype='auto',
dtype='auto',
trust_remote_code=True,
)
model_params = dict(hf_model.named_parameters())

View File

@@ -302,7 +302,7 @@ class GemmaForCausalLM(DecoderModelForCausalLM):
hf_gemma = transformers.AutoModelForCausalLM.from_pretrained(
hf_model_dir,
device_map="cpu" if load_model_on_cpu else "auto",
torch_dtype='auto',
dtype='auto',
)
weights = load_gemma_weights_from_hf_model(hf_gemma, trt_llm_config)
del hf_gemma

View File

@@ -878,7 +878,7 @@ def quantize(hf_model_dir: str,
hf_model = AutoModelForCausalLM.from_pretrained(
hf_model_dir,
device_map='auto' if device != 'cpu' else 'cpu',
torch_dtype='auto' if not use_smooth_quant else torch.float16,
dtype='auto' if not use_smooth_quant else torch.float16,
trust_remote_code=trust_remote_code)
os.environ["TOKENIZERS_PARALLELISM"] = os.environ.get(
@@ -916,7 +916,7 @@ def load_hf_gpt(model_dir: str, load_model_on_cpu: bool = False):
hf_model = AutoModelForCausalLM.from_pretrained(
model_dir,
device_map='auto' if not load_model_on_cpu else 'cpu',
torch_dtype='auto',
dtype='auto',
trust_remote_code=True,
)
return hf_model

View File

@@ -194,9 +194,7 @@ class GPTJForCausalLM(DecoderModelForCausalLM):
trust_remote_code = kwargs.pop('trust_remote_code', True)
hf_model = transformers.AutoModelForCausalLM.from_pretrained(
hf_model_dir,
torch_dtype='auto',
trust_remote_code=trust_remote_code)
hf_model_dir, dtype='auto', trust_remote_code=trust_remote_code)
weights = load_weights_from_hf_model(hf_model, config)
model = GPTJForCausalLM(config)

View File

@@ -480,7 +480,7 @@ def load_hf_llama(model_dir: str, load_model_on_cpu: bool = False):
model = model_cls.from_pretrained(
model_dir,
device_map='auto' if not load_model_on_cpu else 'cpu',
torch_dtype='auto',
dtype='auto',
trust_remote_code=True,
)
if hf_config.model_type in ["llava", "llava_next"]:
@@ -1129,7 +1129,7 @@ def quantize(hf_model_dir: str,
hf_model = AutoModelForCausalLM.from_pretrained(
hf_model_dir,
device_map='auto' if device != 'cpu' else 'cpu',
torch_dtype='auto' if not use_smooth_quant else torch.float16,
dtype='auto' if not use_smooth_quant else torch.float16,
trust_remote_code=trust_remote_code)
os.environ["TOKENIZERS_PARALLELISM"] = os.environ.get(

View File

@@ -458,7 +458,7 @@ class MambaForCausalLM(PretrainedModel):
if not os.path.exists(hf_model_dir):
hf_model = AutoModelForCausalLM.from_pretrained(
hf_model_dir, torch_dtype="auto", trust_remote_code=True)
hf_model_dir, dtype="auto", trust_remote_code=True)
assert isinstance(hf_model, transformers.PreTrainedModel)
weights = convert_hf_mamba(hf_model, dtype)

View File

@@ -224,7 +224,7 @@ class MedusaForCausalLm(PretrainedModel):
else:
hf_model = AutoModelForCausalLM.from_pretrained(
hf_model_dir,
torch_dtype="auto",
dtype="auto",
trust_remote_code=trust_remote_code)
assert isinstance(hf_model, transformers.PreTrainedModel)

View File

@@ -674,7 +674,7 @@ class DeciLMForCausalLM(DecoderModelForCausalLM):
hf_model = transformers.AutoModelForCausalLM.from_pretrained(
hf_model_or_dir,
device_map='auto' if not load_model_on_cpu else 'cpu',
torch_dtype=dtype,
dtype=dtype,
trust_remote_code=trust_remote_code,
)
weights = load_weights_from_hf_model(hf_model, config)

View File

@@ -203,9 +203,7 @@ class PhiForCausalLM(DecoderModelForCausalLM):
trust_remote_code = kwargs.pop('trust_remote_code', True)
hf_model = AutoModelForCausalLM.from_pretrained(
hf_model_dir,
torch_dtype="auto",
trust_remote_code=trust_remote_code)
hf_model_dir, dtype="auto", trust_remote_code=trust_remote_code)
assert isinstance(hf_model, transformers.PreTrainedModel)

View File

@@ -302,9 +302,7 @@ class Phi3ForCausalLM(DecoderModelForCausalLM):
trust_remote_code = kwargs.pop('trust_remote_code', True)
hf_model = AutoModelForCausalLM.from_pretrained(
hf_model_dir,
torch_dtype="auto",
trust_remote_code=trust_remote_code)
hf_model_dir, dtype="auto", trust_remote_code=trust_remote_code)
assert isinstance(hf_model, transformers.PreTrainedModel)

View File

@@ -467,7 +467,7 @@ def load_hf_qwen(model_dir: str, load_model_on_cpu: bool = False):
model = model_cls.from_pretrained(
model_dir,
device_map='auto' if not load_model_on_cpu else 'cpu',
torch_dtype='auto',
dtype='auto',
trust_remote_code=True)
return model
@@ -996,7 +996,7 @@ def quantize(hf_model_dir: str,
hf_model = model_cls.from_pretrained(
hf_model_dir,
device_map='auto',
torch_dtype='auto' if not use_smooth_quant else torch.float16,
dtype='auto' if not use_smooth_quant else torch.float16,
trust_remote_code=True).half()
os.environ["TOKENIZERS_PARALLELISM"] = os.environ.get(

View File

@@ -285,7 +285,7 @@ def _get_llava_qwen_model(model_dir, dtype, device):
if "hf" in model_dir:
from transformers import LlavaOnevisionForConditionalGeneration
model = LlavaOnevisionForConditionalGeneration.from_pretrained(
model_dir, torch_dtype=dtype, device_map=device)
model_dir, dtype=dtype, device_map=device)
model = model.language_model
else:
from llava.model.builder import load_pretrained_model
@@ -328,20 +328,20 @@ def get_model(ckpt_path: str,
from transformers import AutoModelForSeq2SeqLM
model = AutoModelForSeq2SeqLM.from_pretrained(ckpt_path,
device_map="cuda",
torch_dtype=torch_dtype,
dtype=torch_dtype,
trust_remote_code=True)
elif model_type_is_enc_dec(hf_config.model_type):
from transformers import AutoModelForSeq2SeqLM
model = AutoModelForSeq2SeqLM.from_pretrained(ckpt_path,
device_map=device,
torch_dtype=torch_dtype,
dtype=torch_dtype,
trust_remote_code=True)
model = EncDecModelWrapper(hf_model=model)
else:
model = model_cls.from_pretrained(
ckpt_path,
device_map=device_map if device != "cpu" else "cpu",
torch_dtype="auto",
dtype="auto",
trust_remote_code=True)
if hf_config.model_type in ["llava", "internvl_chat"]:
model = model.language_model

View File

@@ -692,11 +692,10 @@ class MultimodalModelRunner:
# Phi-4-multimodal uses pytorch engine due to issues with creating TRT engine.
if self.model_type == "phi-4-multimodal":
model = AutoModelForCausalLM.from_pretrained(
self.args.hf_model_dir,
torch_dtype=torch.float16,
trust_remote_code=True,
device_map='cpu')
model = AutoModelForCausalLM.from_pretrained(self.args.hf_model_dir,
dtype=torch.float16,
trust_remote_code=True,
device_map='cpu')
self.vision_model = model.model.embed_tokens_extend.image_embed.to(
self.device).eval()
self.image_newlines = {}
@@ -707,11 +706,10 @@ class MultimodalModelRunner:
return
if self.model_type == "phi-3-vision":
model = AutoModelForCausalLM.from_pretrained(
self.args.hf_model_dir,
torch_dtype=torch.float16,
trust_remote_code=True,
device_map='cpu')
model = AutoModelForCausalLM.from_pretrained(self.args.hf_model_dir,
dtype=torch.float16,
trust_remote_code=True,
device_map='cpu')
self.vision_model = model.model.vision_embed_tokens.to(
self.device).eval()
@@ -765,7 +763,7 @@ class MultimodalModelRunner:
def init_audio_encoder(self):
assert self.model_type == "phi-4-multimodal"
model = AutoModelForCausalLM.from_pretrained(self.args.hf_model_dir,
torch_dtype=torch.float16,
dtype=torch.float16,
trust_remote_code=True,
device_map='cpu')
self.audio_model = model.model.embed_tokens_extend.audio_embed.to(
@@ -859,7 +857,7 @@ class MultimodalModelRunner:
from transformers import CLIPImageProcessor
processor = CLIPImageProcessor.from_pretrained(
"openai/clip-vit-large-patch14", torch_dtype=torch.bfloat16)
"openai/clip-vit-large-patch14", dtype=torch.bfloat16)
frames = processor.preprocess(frames,
return_tensors="pt")['pixel_values']
# make dtype consistent with vision encoder

View File

@@ -377,8 +377,8 @@ def build_blip2_engine(args):
return_dict=True)
return self.projector(qformer_output.last_hidden_state)
model = Blip2ForConditionalGeneration.from_pretrained(
args.model_path, torch_dtype=torch.float16)
model = Blip2ForConditionalGeneration.from_pretrained(args.model_path,
dtype=torch.float16)
blip2_llm = ""
if model.language_model.config.architectures[
@@ -449,8 +449,8 @@ def build_pix2struct_engine(args):
img_features = self.encoder.layernorm(img_features[0])
return img_features
model = Pix2StructForConditionalGeneration.from_pretrained(
args.model_path, torch_dtype=dtype)
model = Pix2StructForConditionalGeneration.from_pretrained(args.model_path,
dtype=dtype)
wrapper = pix2structVisionWrapper(model.encoder.to(args.device))
# input shape: batch size, number of patches, hidden dimension
@@ -501,7 +501,7 @@ def build_llava_engine(args):
# Need to setup at hf_config._attn_implementation after transformers >= 4.46
hf_config._attn_implementation = "eager"
model = LlavaForConditionalGeneration.from_pretrained(
args.model_path, torch_dtype=torch.float16, config=hf_config)
args.model_path, dtype=torch.float16, config=hf_config)
wrapper = LlavaVisionWrapper(
model.vision_tower.to(args.device),
model.multi_modal_projector.to(args.device),
@@ -530,7 +530,7 @@ def build_llava_engine(args):
hf_config = AutoConfig.from_pretrained(args.model_path)
hf_config.vision_config._attn_implementation = "eager"
model = LlavaNextForConditionalGeneration.from_pretrained(
args.model_path, torch_dtype=torch.float16, config=hf_config)
args.model_path, dtype=torch.float16, config=hf_config)
wrapper = LlavaNextVisionWrapper(
model.vision_tower.vision_model.to(args.device),
model.multi_modal_projector.to(args.device),
@@ -585,7 +585,7 @@ def build_llava_engine(args):
return image_features # (sigma(bs, patches_i), 729, c)
model = LlavaOnevisionForConditionalGeneration.from_pretrained(
args.model_path, torch_dtype=torch.float16)
args.model_path, dtype=torch.float16)
wrapper = LlavaOnevisionVisionWrapper(
model.vision_tower.vision_model.to(args.device),
model.multi_modal_projector.to(args.device), model.config)
@@ -675,7 +675,7 @@ def build_nougat_engine(args):
return self.encoder(image).last_hidden_state
model = VisionEncoderDecoderModel.from_pretrained(args.model_path,
torch_dtype=torch.float16)
dtype=torch.float16)
swin_encoder = model.get_encoder().to(args.device)
wrapper = SwinEncoderWrapper(swin_encoder)
@@ -710,7 +710,7 @@ def build_cogvlm_engine(args):
return self.encoder(image)
cogvlm = AutoModelForCausalLM.from_pretrained(args.model_path,
torch_dtype=dtype,
dtype=dtype,
trust_remote_code=True)
vit_encoder = cogvlm.model.vision.to(args.device).eval()
@@ -742,7 +742,7 @@ def build_fuyu_engine(args):
return self.linear(patches).flatten(0, 1)
model = FuyuForCausalLM.from_pretrained(args.model_path,
torch_dtype=torch.float16)
dtype=torch.float16)
vision_encoder = model.vision_embed_tokens
wrapper = FuyuEncoderWrapper(vision_encoder).to(args.device)
@@ -803,7 +803,7 @@ def build_neva_engine(args):
if os.path.isdir(joined_path):
vision_path = joined_path
encoder = AutoModel.from_pretrained(vision_path,
torch_dtype=torch.bfloat16,
dtype=torch.bfloat16,
trust_remote_code=True)
vision_encoder = encoder.vision_model
hf_config = encoder.config
@@ -884,7 +884,7 @@ def build_video_neva_engine(args):
return vision_x
encoder = AutoModel.from_pretrained(vision_config["from_pretrained"],
torch_dtype=torch.bfloat16,
dtype=torch.bfloat16,
trust_remote_code=True,
attn_implementation="eager")
vision_encoder = encoder.vision_model
@@ -951,7 +951,7 @@ def build_kosmos_engine(args):
return img_features
model = AutoModelForVision2Seq.from_pretrained(args.model_path,
torch_dtype=torch.float16)
dtype=torch.float16)
wrapper = VisionEncoderWrapper(
model.vision_model.to(args.device),
model.image_to_text_projection.to(args.device))
@@ -1001,7 +1001,7 @@ def build_phi_engine(args):
1, pixel_values.shape[0], -1, self.vision_model.image_dim_out)
model = AutoModelForCausalLM.from_pretrained(args.model_path,
torch_dtype=torch.float16,
dtype=torch.float16,
trust_remote_code=True)
vision_model = model.model.vision_embed_tokens
@@ -1103,7 +1103,7 @@ def build_phi4mm_engine(args):
return torch.cat((speech_out, vision_out), dim=-1)
model = AutoModelForCausalLM.from_pretrained(args.model_path,
torch_dtype='auto',
dtype='auto',
trust_remote_code=True)
vision_model = model.model.embed_tokens_extend.image_embed
@@ -1188,7 +1188,7 @@ def build_mllama_engine(args):
# conflict with limitation of other multimodal models.
from transformers import MllamaForConditionalGeneration
model = MllamaForConditionalGeneration.from_pretrained(args.model_path,
torch_dtype='auto',
dtype='auto',
device_map='auto')
# Check if the model structure is updated to transformers >= 4.52.0
@@ -1279,7 +1279,7 @@ def build_internvl_engine(args):
return vit_embeds_mlp
model = AutoModelForCausalLM.from_pretrained(args.model_path,
torch_dtype=torch.float16,
dtype=torch.float16,
trust_remote_code=True,
use_flash_attn=False).to(
args.device)
@@ -1345,7 +1345,7 @@ def build_qwen2_vl_engine(args):
model = Qwen2VLForConditionalGeneration.from_pretrained(
args.model_path,
torch_dtype=torch.float32,
dtype=torch.float32,
device_map="cpu",
attn_implementation="eager")
hf_config = AutoConfig.from_pretrained(args.model_path)
@@ -1502,7 +1502,7 @@ def build_qwen2_vl_engine(args):
super().__init__()
self.visual = Qwen2VisionTransformerPretrainedModelOpt._from_config(
model.config.vision_config,
torch_dtype=torch.float32,
dtype=torch.float32,
)
self.visual.load_state_dict(model.visual.state_dict())
@@ -1544,7 +1544,7 @@ def build_qwen2_audio_engine(args):
from transformers import Qwen2AudioForConditionalGeneration
model = Qwen2AudioForConditionalGeneration.from_pretrained(
args.model_path, torch_dtype=torch.float16)
args.model_path, dtype=torch.float16)
# dummy audio features, dtype is float32
audio = torch.randn(1,
@@ -1710,7 +1710,7 @@ def build_pixtral_engine(args):
return out
model = Mistral3ForConditionalGeneration.from_pretrained(args.model_path,
torch_dtype="auto")
dtype="auto")
vision_tower = model.vision_tower
mm_projector = model.multi_modal_projector

View File

@@ -750,7 +750,7 @@ def generate_dummy_loras(
try:
model = AutoModelForCausalLM.from_pretrained(
hf_model_dir,
torch_dtype=torch.float16,
dtype=torch.float16,
device_map=None, # Load everything to CPU first
trust_remote_code=True,
low_cpu_mem_usage=False,
@@ -762,7 +762,7 @@ def generate_dummy_loras(
)
model = AutoModelForCausalLM.from_pretrained(
hf_model_dir,
torch_dtype=torch.float16,
dtype=torch.float16,
device_map="auto",
trust_remote_code=True,
)

View File

@@ -35,7 +35,7 @@ def make_pixtral_vision_config():
pretrained_config=transformers.PixtralVisionConfig(
hidden_size=1024,
num_attention_heads=16,
torch_dtype=torch.bfloat16,
dtype=torch.bfloat16,
hidden_act="silu",
),
)

View File

@@ -356,7 +356,7 @@ class TestLoraAttentionPytorchFlowVsTRT(unittest.TestCase):
rms_norm_eps=1e-5,
vocab_size=32000,
num_key_value_heads=self.head_num,
torch_dtype=self.torch_dtype)
dtype=self.torch_dtype)
mapping = Mapping(world_size=1, tp_size=1, rank=0)
kv_cache_config = KvCacheConfig(max_tokens=max_seq_len)

View File

@@ -546,7 +546,7 @@ def test_bielik_11b_v2_2_instruct_multi_lora() -> None:
print("Creating dummy LoRAs...")
model = AutoModelForCausalLM.from_pretrained(model_dir,
torch_dtype=torch.bfloat16,
dtype=torch.bfloat16,
device_map="auto")
hf_modules = ["q_proj", "k_proj", "v_proj"]
peft_lora_config = PeftLoraConfig(r=8,
@@ -599,7 +599,7 @@ def test_gemma3_1b_instruct_multi_lora() -> None:
print("Creating dummy LoRAs...")
model = AutoModelForCausalLM.from_pretrained(model_dir,
torch_dtype=torch.bfloat16,
dtype=torch.bfloat16,
device_map="auto")
hf_modules = ["q_proj", "k_proj", "v_proj"]
peft_lora_config = PeftLoraConfig(r=8,

View File

@@ -222,7 +222,7 @@ def create_gpt_attention_network(attention_type='gpt2_attention',
embd_pdrop=0,
attn_pdrop=0,
hidden_act='gelu',
torch_dtype=dtype,
dtype=dtype,
)
attention = GPT2Attention(configuration).cuda().eval()

View File

@@ -125,7 +125,7 @@ class TestFunctional(unittest.TestCase):
num_attention_heads=num_heads,
vocab_size=30522,
hidden_act='gelu',
torch_dtype=dtype,
dtype=dtype,
)
attention = AttentionCls(configuration).cuda().eval()

View File

@@ -883,7 +883,7 @@ class TestFunctional(unittest.TestCase):
embd_pdrop=0,
attn_pdrop=0,
hidden_act='gelu',
torch_dtype=dtype,
dtype=dtype,
attn_implementation='eager')
if attention_type in ['gptj_attention', 'llama_attention']:

View File

@@ -575,7 +575,7 @@ class TestFunctional(unittest.TestCase):
embd_pdrop=0,
attn_pdrop=0,
hidden_act='gelu',
torch_dtype=dtype,
dtype=dtype,
attn_implementation='eager')
if attention_type == 'llama_attention':
configuration.num_key_value_heads = num_kv_heads

View File

@@ -56,7 +56,7 @@ class TestGPT(unittest.TestCase):
activation_function=hidden_act,
n_layer=n_layer,
max_length=max_length,
torch_dtype=dtype,
dtype=dtype,
)
gpt_config.n_kv_head = gpt_config.n_head
hf_gpt = GPT2LMHeadModel(gpt_config).cuda().eval()

View File

@@ -506,7 +506,7 @@ class TestLLaMA(unittest.TestCase):
"model": "cpu",
"lm_head": "cpu"
}, # Load to CPU memory
torch_dtype="auto")
dtype="auto")
assert hf_llama.config.torch_dtype == torch.float16
kv_dtype = trt.float16 if hf_llama.config.torch_dtype == torch.float16 else trt.float32
max_context_length = 128 # for loader tests this value does not matter

View File

@@ -177,7 +177,7 @@ class TestMamba(unittest.TestCase):
# get hf mamba
hf_mamba = AutoModelForCausalLM.from_config(
hf_config, torch_dtype=str_dtype_to_torch(dtype)).cuda().eval()
hf_config, dtype=str_dtype_to_torch(dtype)).cuda().eval()
# inputs
if remove_padding:
@@ -373,7 +373,7 @@ class TestMamba(unittest.TestCase):
# get hf mamba
hf_mamba = AutoModelForCausalLM.from_pretrained(
hf_path, device_map='cpu', torch_dtype=str_dtype_to_torch(dtype))
hf_path, device_map='cpu', dtype=str_dtype_to_torch(dtype))
# get tensort llm mamba
hf_config = MambaConfig.from_pretrained(hf_path)

View File

@@ -514,7 +514,7 @@ class TestMistralAndArctic(unittest.TestCase):
"model": "cpu",
"lm_head": "cpu"
}, # Load to CPU memory
torch_dtype="auto")
dtype="auto")
assert hf_mistral.config.torch_dtype == torch.float16
kv_dtype = trt.float16 if hf_mistral.config.torch_dtype == torch.float16 else trt.float32
max_context_length = 128 # for loader tests this value does not matter

View File

@@ -386,7 +386,7 @@ class TestNemotronNas(unittest.TestCase):
hf_model = transformers.AutoModelForCausalLM.from_pretrained(
hf_model_dir,
trust_remote_code=True,
torch_dtype=tensorrt_llm._utils.str_dtype_to_torch(params.dtype),
dtype=tensorrt_llm._utils.str_dtype_to_torch(params.dtype),
).cuda()
runtime, config = self._from_hf_model(hf_model, params)
self.allclose(
@@ -720,8 +720,7 @@ class TestNemotronNas(unittest.TestCase):
from_pretrained(
hf_model_dir,
trust_remote_code=True,
torch_dtype=tensorrt_llm._utils.str_dtype_to_torch(params.dtype
),
dtype=tensorrt_llm._utils.str_dtype_to_torch(params.dtype),
).cuda(),
atol=
0.92, # We've observed that on a real checkpoint with the current code, fp8 MMLU is on par with BF16, and this is the observed threshold, though it may seem high.
@@ -747,8 +746,7 @@ class TestNemotronNas(unittest.TestCase):
from_pretrained(
hf_model_dir,
trust_remote_code=True,
torch_dtype=tensorrt_llm._utils.str_dtype_to_torch(params.dtype
),
dtype=tensorrt_llm._utils.str_dtype_to_torch(params.dtype),
device_map="auto",
),
)
@@ -934,7 +932,7 @@ class TestNemotronNas(unittest.TestCase):
dtype = tensorrt_llm._utils.str_dtype_to_torch(dtype)
hf_model = transformers.AutoModelForCausalLM.from_pretrained(
hf_model_dir, trust_remote_code=True, torch_dtype=dtype).cuda()
hf_model_dir, trust_remote_code=True, dtype=dtype).cuda()
batch_size = 1
max_seq_len = 30

View File

@@ -934,7 +934,7 @@ class TestSmoothQuant(unittest.TestCase):
embd_pdrop=0,
attn_pdrop=0,
hidden_act='gelu',
torch_dtype=dtype,
dtype=dtype,
)
n_positions = configuration.n_positions