Mirror of https://github.com/NVIDIA/TensorRT-LLM.git, synced 2026-01-13 22:18:36 +08:00
[None][chore] update torch_dtype -> dtype in 'transformers' (#8263)
Signed-off-by: ixlmar <206748156+ixlmar@users.noreply.github.com>
parent 616d1df7a0
commit 93a4b7f1b6
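For reference, the mechanical change applied throughout the hunks below is a keyword rename in Hugging Face Transformers loading calls. A minimal sketch, assuming transformers >= 4.56 (the floor the requirement files in this diff are bumped to) and using a placeholder model name that is not part of this repository:

import torch
from transformers import AutoModelForCausalLM

# Old spelling, superseded in recent transformers releases:
# model = AutoModelForCausalLM.from_pretrained("gpt2", torch_dtype=torch.float16)

# New spelling, as used throughout this commit:
model = AutoModelForCausalLM.from_pretrained("gpt2", dtype=torch.float16)

Everything else about the calls (device_map, trust_remote_code, and so on) is left untouched; only the dtype keyword changes.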
@@ -350,7 +350,7 @@ def main():
 hf_model = LlamaForCausalLM if args.model_type != "mixtral" else MixtralForCausalLM
 model = hf_model.from_pretrained(
     args.model_dir,
-    torch_dtype='auto',
+    dtype='auto',
     device_map='auto' if not args.load_model_on_cpu else 'cpu',
     trust_remote_code=True)
@@ -447,7 +447,7 @@ def main():
 model = auto_model_cls.from_pretrained(
     args.hf_model_dir,
     trust_remote_code=True,
-    torch_dtype=DTYPE_STR_MAPPING[args.hf_data_type],
+    dtype=DTYPE_STR_MAPPING[args.hf_data_type],
     device_map="auto" if args.hf_device_map_auto else None,
 )
 if not args.hf_device_map_auto:
@@ -898,7 +898,7 @@ def main():
 if not convert_from_ckpt:
     logger.info(f'Convert by using model')
     hf_bloom = BloomForCausalLM.from_pretrained(args.model_dir,
-        torch_dtype="auto",
+        dtype="auto",
         device_map="auto",
         trust_remote_code=True)
 else:
@@ -405,13 +405,13 @@ def main():
 
 if args.model_type == "llava":
     hf_llava = LlavaForConditionalGeneration.from_pretrained(
-        args.model_dir, torch_dtype="auto")
+        args.model_dir, dtype="auto")
     model = hf_llava.language_model
 else:
     model = AutoModelForCausalLM.from_pretrained(
         args.model_dir,
         device_map='auto' if not args.load_model_on_cpu else 'cpu',
-        torch_dtype='auto' if not args.smoothquant else torch.float16,
+        dtype='auto' if not args.smoothquant else torch.float16,
         trust_remote_code=True,
     )
 if args.smoothquant is not None or args.int8_kv_cache:
@@ -605,7 +605,7 @@ if __name__ == '__main__':
 hf_model = AutoModelForCausalLM.from_pretrained(model_dir,
     trust_remote_code=True,
     device_map="auto",
-    torch_dtype=getattr(
+    dtype=getattr(
         torch, args.dtype),
     config=hf_config)
 return hf_model
@@ -129,10 +129,7 @@ def convert_and_save_hf(args: argparse.Namespace):
 import transformers
 if not args.load_by_shard and quant_config.quant_mode.has_any_quant():
     hf_model = transformers.FalconForCausalLM.from_pretrained(
-        model_dir,
-        trust_remote_code=True,
-        torch_dtype='auto',
-        device_map='auto')
+        model_dir, trust_remote_code=True, dtype='auto', device_map='auto')
 else:
     # Initialize huggingface local cache.
     # Huggingface copies the external configuration source (`configuration_falcon.py` here) into its local cache at
@@ -142,7 +139,7 @@ def convert_and_save_hf(args: argparse.Namespace):
 # Preload the config once to initialize local cache, so subsequent multithread loading won't fail.
 _ = transformers.FalconConfig.from_pretrained(model_dir,
     trust_remote_code=True,
-    torch_dtype='auto',
+    dtype='auto',
     device_map='auto')
 
 def convert_and_save_rank(args, rank: int):
@@ -1,6 +1,6 @@
 -c ../../../constraints.txt
 tensorrt_llm>=0.0.0.dev0
-transformers>=4.31.0
+transformers>=4.56.0
 datasets==3.1.0
 evaluate
 rouge_score
@@ -87,7 +87,7 @@ def convert_and_save_hf(args):
 quant_config = args_to_quant_config(args)
 
 hf_model = AutoModelForCausalLM.from_pretrained(model_dir,
-    torch_dtype='auto',
+    dtype='auto',
     trust_remote_code=True)
 
 def convert_and_save_rank(args, rank):
@@ -639,7 +639,7 @@ if __name__ == '__main__':
 
 hf_config = AutoConfig.from_pretrained(args.model_dir)
 hf_model = AutoModelForCausalLM.from_pretrained(args.model_dir,
-    torch_dtype="auto")
+    dtype="auto")
 
 config = {
     'architecture': hf_config.architectures[0],
@@ -1,2 +1,2 @@
-transformers>=4.47.1
+transformers>=4.56.0
 diffusers>=0.32.2
@@ -841,8 +841,7 @@ if __name__ == '__main__':
 
 hf_model = MptForCausalLM.from_pretrained(args.model_dir,
     device_map="auto",
-    torch_dtype=getattr(
-        torch, args.dtype))
+    dtype=getattr(torch, args.dtype))
 
 act_range = {}
 mpt_qkv_para = {}
@@ -278,10 +278,10 @@ if __name__ == '__main__':
 
 if args.model_type == 'opt':
     hf_model = AutoModelForCausalLM.from_pretrained(args.model_dir,
-        torch_dtype="auto")
+        dtype="auto")
 elif args.model_type == 'blip2':
     hf_model = Blip2ForConditionalGeneration.from_pretrained(
-        args.model_dir, torch_dtype="auto").language_model
+        args.model_dir, dtype="auto").language_model
 
 hf_config = hf_model.config
 if hf_config.hidden_size != hf_config.word_embed_proj_dim:
@@ -388,7 +388,7 @@ if __name__ == "__main__":
 if tensorrt_llm.mpi_rank() == 0:
     hf_model = AutoModelForSeq2SeqLM.from_pretrained(
         args.model_name,  # TODO: use model path instead
-        # torch_dtype=torch.float16 if '16' in dtype else torch.float32, # TODO: use matched torch dtype
+        # dtype=torch.float16 if '16' in dtype else torch.float32, # TODO: use matched torch dtype
     ).to('cuda').eval()  # TODO: create config model path instead
     assert type(hf_model) in (
         T5ForConditionalGeneration, BartForConditionalGeneration,
@@ -508,7 +508,7 @@ if __name__ == '__main__':
 
 hf_model = AutoModelForCausalLM.from_pretrained(args.model_dir,
     trust_remote_code=True,
-    torch_dtype="auto")
+    dtype="auto")
 weights = convert_from_hf(
     hf_model,
     hf_config,
@@ -1,6 +1,6 @@
 -c ../../../constraints.txt
 tensorrt_llm>=0.0.0.dev0
-transformers>=4.43.0
+transformers>=4.56.0
 datasets==3.1.0
 evaluate
 rouge_score
@@ -356,7 +356,7 @@ def main(args):
 profiler.start('load HF model')
 hf_model = AutoModelForCausalLM.from_pretrained(
     args.hf_model_location,
-    torch_dtype=torch.float16,
+    dtype=torch.float16,
     use_flash_attention_2=True)
 profiler.stop('load HF model')
 tensorrt_llm.logger.info(
@@ -1,6 +1,6 @@
 -c ../../../constraints.txt
 tensorrt_llm>=0.0.0.dev0
-transformers>=4.39.0
+transformers>=4.56.0
 datasets==3.1.0
 evaluate
 rouge_score
@@ -1,4 +1,4 @@
 -c ../../../constraints.txt
 tensorrt_llm>=0.0.0.dev0
-transformers==4.54.0
+transformers==4.56.0
 accelerate==0.25.0
@@ -120,7 +120,7 @@ def load_hf_model(args):
 model_class = getattr(__import__('transformers'),
     SUPPORTED_MODEL_TYPES[args.model_type])
 hf_model = model_class.from_pretrained(args.hf_model_dir,
-    torch_dtype=torch.float16,
+    dtype=torch.float16,
     device_map="cuda:0",
     trust_remote_code=True)
 profiler.stop('load HF model')
@@ -1 +1 @@
-transformers==4.45.2
+transformers==4.56.0
@@ -1,4 +1,4 @@
 git+https://github.com/LLaVA-VL/LLaVA-NeXT.git
-transformers>=4.44.2
+transformers>=4.56.0
 einops
 av
@@ -1,2 +1,2 @@
 git+https://github.com/bfshi/scaling_on_scales.git
-transformers==4.36.2
+transformers==4.56.0
@@ -3,7 +3,7 @@ tensorrt_llm>=0.0.0.dev0
 datasets==3.1.0
 evaluate
 rouge_score
-transformers>=4.40.1
+transformers>=4.56.0
 transformers-stream-generator
 sentencepiece>=0.1.99
 tiktoken
@@ -66,7 +66,7 @@ class ONNX_TRT:
 model = AutoModelForCausalLM.from_pretrained(
     pretrained_model_path,
     device_map="cuda",
-    torch_dtype=torch_dtype,
+    dtype=torch_dtype,
     fp16=True,
     trust_remote_code=True,
 ).eval()
@@ -155,7 +155,7 @@ class HfParser:
 hf_model = AutoModelForCausalLM.from_pretrained(
     checkpoint_path,
     device_map="auto",
-    torch_dtype="auto",
+    dtype="auto",
 )
 model_params = dict(hf_model.named_parameters())
 return model_params
@@ -4,7 +4,7 @@ git+https://github.com/google-deepmind/recurrentgemma.git@8a32e365
 flax>=0.8.2
 jax~=0.4.23
 orbax-checkpoint==0.5.7
-transformers>=4.40.0
+transformers>=4.56.0
 datasets==3.1.0
 evaluate
 rouge_score
@@ -421,7 +421,7 @@ def main():
 # TODO: When ReDrafter is added to Transformers
 # hf_drafter_model = AutoModel.from_pretrained(
 #     args.drafter_model_dir,
-#     torch_dtype="auto",
+#     dtype="auto",
 # )
 ckpt_file = Path(args.drafter_model_dir, "model.safetensors")
 if not Path.exists(ckpt_file):
@@ -712,7 +712,7 @@ def main(args):
 model = auto_model_cls.from_pretrained(
     args.hf_model_dir,
     trust_remote_code=True,
-    torch_dtype=str_dtype_to_torch(args.hf_data_type),
+    dtype=str_dtype_to_torch(args.hf_data_type),
     device_map='auto' if args.hf_device_map_auto else None)
 try:
     model.to_bettertransformer()
@@ -37,12 +37,12 @@ class VisionTower(nn.Module):
 
 if "clip" in self.name:
     self.vision_tower = AutoModel.from_pretrained(
-        model_name_or_path, torch_dtype=config.model_dtype)
+        model_name_or_path, dtype=config.model_dtype)
 elif "siglip" in self.name:
     self.vision_tower = AutoModel.from_pretrained(
         model_name_or_path,
         attn_implementation="flash_attention_2",
-        torch_dtype="auto")
+        dtype="auto")
 else:
     raise ValueError(f"Unsupported vision tower: {self.name}")
 
@@ -481,7 +481,7 @@ def quantize(hf_model_dir: str,
 hf_model = AutoModelForCausalLM.from_pretrained(
     hf_model_dir,
     device_map='auto' if device != 'cpu' else 'cpu',
-    torch_dtype='auto'
+    dtype='auto'
     if not config.quantization._use_plugin_sq else torch.float16,
     trust_remote_code=trust_remote_code)
 tokenizer = AutoTokenizer.from_pretrained(
@@ -186,7 +186,7 @@ class BaichuanForCausalLM(DecoderModelForCausalLM):
 hf_model = transformers.AutoModelForCausalLM.from_pretrained(
     hf_model_or_dir,
     trust_remote_code=trust_remote_code,
-    torch_dtype='auto')
+    dtype='auto')
 hf_config_or_dir = hf_model_or_dir
 
 config = BaichuanConfig.from_hugging_face(hf_config_or_dir,
@@ -691,7 +691,7 @@ def quantize(hf_model_dir: str,
 hf_model = AutoModel.from_pretrained(
     hf_model_dir,
     trust_remote_code=trust_remote_code,
-    torch_dtype='auto' if config.chatglm_version != 'glm' else getattr(
+    dtype='auto' if config.chatglm_version != 'glm' else getattr(
         torch, config.dtype),
     device_map=device_map)
 
@@ -300,7 +300,7 @@ class ChatGLMForCausalLM(DecoderModelForCausalLM):
 hf_model = AutoModel.from_pretrained(
     hf_model_or_dir,
     trust_remote_code=trust_remote_code,
-    torch_dtype='auto' if config.chatglm_version != 'glm' else getattr(
+    dtype='auto' if config.chatglm_version != 'glm' else getattr(
         torch, config.dtype),
     device_map=device_map)
 weights = load_weights_from_hf_model(hf_model, config)
@@ -25,7 +25,7 @@ from ..._utils import pad_vocab_size, release_gc
 def load_hf_deepseek(model_dir):
     model = AutoModelForCausalLM.from_pretrained(model_dir,
         device_map='auto',
-        torch_dtype='auto',
+        dtype='auto',
         trust_remote_code=True)
     return model
 
@@ -168,7 +168,7 @@ def load_hf_deepseek(model_dir, load_model_on_cpu=False):
     model = AutoModelForCausalLM.from_pretrained(model_dir,
         config=hf_config,
         device_map='cpu',
-        torch_dtype='auto',
+        dtype='auto',
         trust_remote_code=True)
 else:
     # Deepseek-v2 236B parameters with FP16 dtype need at least 472G GPU memory
@@ -197,7 +197,7 @@ def load_hf_deepseek(model_dir, load_model_on_cpu=False):
     config=hf_config,
     device_map=device_map,
     max_memory=max_memory,
-    torch_dtype='auto',
+    dtype='auto',
     trust_remote_code=True)
 
 return model
@@ -265,7 +265,7 @@ class FalconForCausalLM(DecoderModelForCausalLM):
     weights = load_weights_from_hf_by_shard(hf_model_dir, config)
 else:
     hf_model = transformers.AutoModelForCausalLM.from_pretrained(
-        hf_model_dir, torch_dtype='auto')
+        hf_model_dir, dtype='auto')
     weights = load_weights_from_hf_model(hf_model, config)
 
 model = cls(config)
@@ -285,7 +285,7 @@ class HfParser:
 hf_model = AutoModelForCausalLM.from_pretrained(
     checkpoint_path,
     device_map="cpu" if load_model_on_cpu else "auto",
-    torch_dtype='auto',
+    dtype='auto',
     trust_remote_code=True,
 )
 model_params = dict(hf_model.named_parameters())
@@ -302,7 +302,7 @@ class GemmaForCausalLM(DecoderModelForCausalLM):
 hf_gemma = transformers.AutoModelForCausalLM.from_pretrained(
     hf_model_dir,
     device_map="cpu" if load_model_on_cpu else "auto",
-    torch_dtype='auto',
+    dtype='auto',
 )
 weights = load_gemma_weights_from_hf_model(hf_gemma, trt_llm_config)
 del hf_gemma
@@ -878,7 +878,7 @@ def quantize(hf_model_dir: str,
 hf_model = AutoModelForCausalLM.from_pretrained(
     hf_model_dir,
     device_map='auto' if device != 'cpu' else 'cpu',
-    torch_dtype='auto' if not use_smooth_quant else torch.float16,
+    dtype='auto' if not use_smooth_quant else torch.float16,
     trust_remote_code=trust_remote_code)
 
 os.environ["TOKENIZERS_PARALLELISM"] = os.environ.get(
@@ -916,7 +916,7 @@ def load_hf_gpt(model_dir: str, load_model_on_cpu: bool = False):
 hf_model = AutoModelForCausalLM.from_pretrained(
     model_dir,
     device_map='auto' if not load_model_on_cpu else 'cpu',
-    torch_dtype='auto',
+    dtype='auto',
     trust_remote_code=True,
 )
 return hf_model
@@ -194,9 +194,7 @@ class GPTJForCausalLM(DecoderModelForCausalLM):
 trust_remote_code = kwargs.pop('trust_remote_code', True)
 
 hf_model = transformers.AutoModelForCausalLM.from_pretrained(
-    hf_model_dir,
-    torch_dtype='auto',
-    trust_remote_code=trust_remote_code)
+    hf_model_dir, dtype='auto', trust_remote_code=trust_remote_code)
 weights = load_weights_from_hf_model(hf_model, config)
 
 model = GPTJForCausalLM(config)
@@ -480,7 +480,7 @@ def load_hf_llama(model_dir: str, load_model_on_cpu: bool = False):
 model = model_cls.from_pretrained(
     model_dir,
     device_map='auto' if not load_model_on_cpu else 'cpu',
-    torch_dtype='auto',
+    dtype='auto',
     trust_remote_code=True,
 )
 if hf_config.model_type in ["llava", "llava_next"]:
@@ -1129,7 +1129,7 @@ def quantize(hf_model_dir: str,
 hf_model = AutoModelForCausalLM.from_pretrained(
     hf_model_dir,
     device_map='auto' if device != 'cpu' else 'cpu',
-    torch_dtype='auto' if not use_smooth_quant else torch.float16,
+    dtype='auto' if not use_smooth_quant else torch.float16,
     trust_remote_code=trust_remote_code)
 
 os.environ["TOKENIZERS_PARALLELISM"] = os.environ.get(
@@ -458,7 +458,7 @@ class MambaForCausalLM(PretrainedModel):
 
 if not os.path.exists(hf_model_dir):
     hf_model = AutoModelForCausalLM.from_pretrained(
-        hf_model_dir, torch_dtype="auto", trust_remote_code=True)
+        hf_model_dir, dtype="auto", trust_remote_code=True)
 
 assert isinstance(hf_model, transformers.PreTrainedModel)
 weights = convert_hf_mamba(hf_model, dtype)
@@ -224,7 +224,7 @@ class MedusaForCausalLm(PretrainedModel):
 else:
     hf_model = AutoModelForCausalLM.from_pretrained(
         hf_model_dir,
-        torch_dtype="auto",
+        dtype="auto",
         trust_remote_code=trust_remote_code)
 
 assert isinstance(hf_model, transformers.PreTrainedModel)
@@ -674,7 +674,7 @@ class DeciLMForCausalLM(DecoderModelForCausalLM):
 hf_model = transformers.AutoModelForCausalLM.from_pretrained(
     hf_model_or_dir,
     device_map='auto' if not load_model_on_cpu else 'cpu',
-    torch_dtype=dtype,
+    dtype=dtype,
     trust_remote_code=trust_remote_code,
 )
 weights = load_weights_from_hf_model(hf_model, config)
@@ -203,9 +203,7 @@ class PhiForCausalLM(DecoderModelForCausalLM):
 trust_remote_code = kwargs.pop('trust_remote_code', True)
 
 hf_model = AutoModelForCausalLM.from_pretrained(
-    hf_model_dir,
-    torch_dtype="auto",
-    trust_remote_code=trust_remote_code)
+    hf_model_dir, dtype="auto", trust_remote_code=trust_remote_code)
 
 assert isinstance(hf_model, transformers.PreTrainedModel)
 
@@ -302,9 +302,7 @@ class Phi3ForCausalLM(DecoderModelForCausalLM):
 trust_remote_code = kwargs.pop('trust_remote_code', True)
 
 hf_model = AutoModelForCausalLM.from_pretrained(
-    hf_model_dir,
-    torch_dtype="auto",
-    trust_remote_code=trust_remote_code)
+    hf_model_dir, dtype="auto", trust_remote_code=trust_remote_code)
 
 assert isinstance(hf_model, transformers.PreTrainedModel)
 
@@ -467,7 +467,7 @@ def load_hf_qwen(model_dir: str, load_model_on_cpu: bool = False):
 model = model_cls.from_pretrained(
     model_dir,
     device_map='auto' if not load_model_on_cpu else 'cpu',
-    torch_dtype='auto',
+    dtype='auto',
     trust_remote_code=True)
 return model
 
@@ -996,7 +996,7 @@ def quantize(hf_model_dir: str,
 hf_model = model_cls.from_pretrained(
     hf_model_dir,
     device_map='auto',
-    torch_dtype='auto' if not use_smooth_quant else torch.float16,
+    dtype='auto' if not use_smooth_quant else torch.float16,
     trust_remote_code=True).half()
 
 os.environ["TOKENIZERS_PARALLELISM"] = os.environ.get(
@@ -285,7 +285,7 @@ def _get_llava_qwen_model(model_dir, dtype, device):
 if "hf" in model_dir:
     from transformers import LlavaOnevisionForConditionalGeneration
     model = LlavaOnevisionForConditionalGeneration.from_pretrained(
-        model_dir, torch_dtype=dtype, device_map=device)
+        model_dir, dtype=dtype, device_map=device)
     model = model.language_model
 else:
     from llava.model.builder import load_pretrained_model
@@ -328,20 +328,20 @@ def get_model(ckpt_path: str,
     from transformers import AutoModelForSeq2SeqLM
     model = AutoModelForSeq2SeqLM.from_pretrained(ckpt_path,
         device_map="cuda",
-        torch_dtype=torch_dtype,
+        dtype=torch_dtype,
         trust_remote_code=True)
 elif model_type_is_enc_dec(hf_config.model_type):
     from transformers import AutoModelForSeq2SeqLM
     model = AutoModelForSeq2SeqLM.from_pretrained(ckpt_path,
         device_map=device,
-        torch_dtype=torch_dtype,
+        dtype=torch_dtype,
         trust_remote_code=True)
     model = EncDecModelWrapper(hf_model=model)
 else:
     model = model_cls.from_pretrained(
         ckpt_path,
         device_map=device_map if device != "cpu" else "cpu",
-        torch_dtype="auto",
+        dtype="auto",
         trust_remote_code=True)
     if hf_config.model_type in ["llava", "internvl_chat"]:
         model = model.language_model
@@ -692,11 +692,10 @@ class MultimodalModelRunner:
 
 # Phi-4-multimodal uses pytorch engine due to issues with creating TRT engine.
 if self.model_type == "phi-4-multimodal":
-    model = AutoModelForCausalLM.from_pretrained(
-        self.args.hf_model_dir,
-        torch_dtype=torch.float16,
-        trust_remote_code=True,
-        device_map='cpu')
+    model = AutoModelForCausalLM.from_pretrained(self.args.hf_model_dir,
+        dtype=torch.float16,
+        trust_remote_code=True,
+        device_map='cpu')
     self.vision_model = model.model.embed_tokens_extend.image_embed.to(
         self.device).eval()
     self.image_newlines = {}
@@ -707,11 +706,10 @@ class MultimodalModelRunner:
     return
 
 if self.model_type == "phi-3-vision":
-    model = AutoModelForCausalLM.from_pretrained(
-        self.args.hf_model_dir,
-        torch_dtype=torch.float16,
-        trust_remote_code=True,
-        device_map='cpu')
+    model = AutoModelForCausalLM.from_pretrained(self.args.hf_model_dir,
+        dtype=torch.float16,
+        trust_remote_code=True,
+        device_map='cpu')
     self.vision_model = model.model.vision_embed_tokens.to(
         self.device).eval()
 
@@ -765,7 +763,7 @@ class MultimodalModelRunner:
 def init_audio_encoder(self):
     assert self.model_type == "phi-4-multimodal"
     model = AutoModelForCausalLM.from_pretrained(self.args.hf_model_dir,
-        torch_dtype=torch.float16,
+        dtype=torch.float16,
         trust_remote_code=True,
         device_map='cpu')
     self.audio_model = model.model.embed_tokens_extend.audio_embed.to(
@@ -859,7 +857,7 @@ class MultimodalModelRunner:
 
 from transformers import CLIPImageProcessor
 processor = CLIPImageProcessor.from_pretrained(
-    "openai/clip-vit-large-patch14", torch_dtype=torch.bfloat16)
+    "openai/clip-vit-large-patch14", dtype=torch.bfloat16)
 frames = processor.preprocess(frames,
     return_tensors="pt")['pixel_values']
 # make dtype consistent with vision encoder
@@ -377,8 +377,8 @@ def build_blip2_engine(args):
         return_dict=True)
     return self.projector(qformer_output.last_hidden_state)
 
-model = Blip2ForConditionalGeneration.from_pretrained(
-    args.model_path, torch_dtype=torch.float16)
+model = Blip2ForConditionalGeneration.from_pretrained(args.model_path,
+    dtype=torch.float16)
 
 blip2_llm = ""
 if model.language_model.config.architectures[
@@ -449,8 +449,8 @@ def build_pix2struct_engine(args):
         img_features = self.encoder.layernorm(img_features[0])
         return img_features
 
-model = Pix2StructForConditionalGeneration.from_pretrained(
-    args.model_path, torch_dtype=dtype)
+model = Pix2StructForConditionalGeneration.from_pretrained(args.model_path,
+    dtype=dtype)
 
 wrapper = pix2structVisionWrapper(model.encoder.to(args.device))
 # input shape: batch size, number of patches, hidden dimension
@@ -501,7 +501,7 @@ def build_llava_engine(args):
 # Need to setup at hf_config._attn_implementation after transformers >= 4.46
 hf_config._attn_implementation = "eager"
 model = LlavaForConditionalGeneration.from_pretrained(
-    args.model_path, torch_dtype=torch.float16, config=hf_config)
+    args.model_path, dtype=torch.float16, config=hf_config)
 wrapper = LlavaVisionWrapper(
     model.vision_tower.to(args.device),
     model.multi_modal_projector.to(args.device),
@@ -530,7 +530,7 @@ def build_llava_engine(args):
 hf_config = AutoConfig.from_pretrained(args.model_path)
 hf_config.vision_config._attn_implementation = "eager"
 model = LlavaNextForConditionalGeneration.from_pretrained(
-    args.model_path, torch_dtype=torch.float16, config=hf_config)
+    args.model_path, dtype=torch.float16, config=hf_config)
 wrapper = LlavaNextVisionWrapper(
     model.vision_tower.vision_model.to(args.device),
     model.multi_modal_projector.to(args.device),
@@ -585,7 +585,7 @@ def build_llava_engine(args):
         return image_features  # (sigma(bs, patches_i), 729, c)
 
 model = LlavaOnevisionForConditionalGeneration.from_pretrained(
-    args.model_path, torch_dtype=torch.float16)
+    args.model_path, dtype=torch.float16)
 wrapper = LlavaOnevisionVisionWrapper(
     model.vision_tower.vision_model.to(args.device),
     model.multi_modal_projector.to(args.device), model.config)
@@ -675,7 +675,7 @@ def build_nougat_engine(args):
         return self.encoder(image).last_hidden_state
 
 model = VisionEncoderDecoderModel.from_pretrained(args.model_path,
-    torch_dtype=torch.float16)
+    dtype=torch.float16)
 swin_encoder = model.get_encoder().to(args.device)
 wrapper = SwinEncoderWrapper(swin_encoder)
 
@@ -710,7 +710,7 @@ def build_cogvlm_engine(args):
         return self.encoder(image)
 
 cogvlm = AutoModelForCausalLM.from_pretrained(args.model_path,
-    torch_dtype=dtype,
+    dtype=dtype,
     trust_remote_code=True)
 vit_encoder = cogvlm.model.vision.to(args.device).eval()
 
@@ -742,7 +742,7 @@ def build_fuyu_engine(args):
         return self.linear(patches).flatten(0, 1)
 
 model = FuyuForCausalLM.from_pretrained(args.model_path,
-    torch_dtype=torch.float16)
+    dtype=torch.float16)
 
 vision_encoder = model.vision_embed_tokens
 wrapper = FuyuEncoderWrapper(vision_encoder).to(args.device)
@@ -803,7 +803,7 @@ def build_neva_engine(args):
 if os.path.isdir(joined_path):
     vision_path = joined_path
 encoder = AutoModel.from_pretrained(vision_path,
-    torch_dtype=torch.bfloat16,
+    dtype=torch.bfloat16,
     trust_remote_code=True)
 vision_encoder = encoder.vision_model
 hf_config = encoder.config
@@ -884,7 +884,7 @@ def build_video_neva_engine(args):
         return vision_x
 
 encoder = AutoModel.from_pretrained(vision_config["from_pretrained"],
-    torch_dtype=torch.bfloat16,
+    dtype=torch.bfloat16,
     trust_remote_code=True,
     attn_implementation="eager")
 vision_encoder = encoder.vision_model
@@ -951,7 +951,7 @@ def build_kosmos_engine(args):
         return img_features
 
 model = AutoModelForVision2Seq.from_pretrained(args.model_path,
-    torch_dtype=torch.float16)
+    dtype=torch.float16)
 wrapper = VisionEncoderWrapper(
     model.vision_model.to(args.device),
     model.image_to_text_projection.to(args.device))
@@ -1001,7 +1001,7 @@ def build_phi_engine(args):
         1, pixel_values.shape[0], -1, self.vision_model.image_dim_out)
 
 model = AutoModelForCausalLM.from_pretrained(args.model_path,
-    torch_dtype=torch.float16,
+    dtype=torch.float16,
     trust_remote_code=True)
 vision_model = model.model.vision_embed_tokens
 
@@ -1103,7 +1103,7 @@ def build_phi4mm_engine(args):
         return torch.cat((speech_out, vision_out), dim=-1)
 
 model = AutoModelForCausalLM.from_pretrained(args.model_path,
-    torch_dtype='auto',
+    dtype='auto',
     trust_remote_code=True)
 
 vision_model = model.model.embed_tokens_extend.image_embed
@@ -1188,7 +1188,7 @@ def build_mllama_engine(args):
 # conflict with limitation of other multimodal models.
 from transformers import MllamaForConditionalGeneration
 model = MllamaForConditionalGeneration.from_pretrained(args.model_path,
-    torch_dtype='auto',
+    dtype='auto',
     device_map='auto')
 
 # Check if the model structure is updated to transformers >= 4.52.0
@@ -1279,7 +1279,7 @@ def build_internvl_engine(args):
         return vit_embeds_mlp
 
 model = AutoModelForCausalLM.from_pretrained(args.model_path,
-    torch_dtype=torch.float16,
+    dtype=torch.float16,
     trust_remote_code=True,
     use_flash_attn=False).to(
         args.device)
@@ -1345,7 +1345,7 @@ def build_qwen2_vl_engine(args):
 
 model = Qwen2VLForConditionalGeneration.from_pretrained(
     args.model_path,
-    torch_dtype=torch.float32,
+    dtype=torch.float32,
     device_map="cpu",
     attn_implementation="eager")
 hf_config = AutoConfig.from_pretrained(args.model_path)
@@ -1502,7 +1502,7 @@ def build_qwen2_vl_engine(args):
 super().__init__()
 self.visual = Qwen2VisionTransformerPretrainedModelOpt._from_config(
     model.config.vision_config,
-    torch_dtype=torch.float32,
+    dtype=torch.float32,
 )
 self.visual.load_state_dict(model.visual.state_dict())
 
@@ -1544,7 +1544,7 @@ def build_qwen2_audio_engine(args):
 from transformers import Qwen2AudioForConditionalGeneration
 
 model = Qwen2AudioForConditionalGeneration.from_pretrained(
-    args.model_path, torch_dtype=torch.float16)
+    args.model_path, dtype=torch.float16)
 
 # dummy audio features, dtype is float32
 audio = torch.randn(1,
@@ -1710,7 +1710,7 @@ def build_pixtral_engine(args):
         return out
 
 model = Mistral3ForConditionalGeneration.from_pretrained(args.model_path,
-    torch_dtype="auto")
+    dtype="auto")
 vision_tower = model.vision_tower
 mm_projector = model.multi_modal_projector
 
@@ -750,7 +750,7 @@ def generate_dummy_loras(
 try:
     model = AutoModelForCausalLM.from_pretrained(
         hf_model_dir,
-        torch_dtype=torch.float16,
+        dtype=torch.float16,
         device_map=None,  # Load everything to CPU first
         trust_remote_code=True,
         low_cpu_mem_usage=False,
@@ -762,7 +762,7 @@ def generate_dummy_loras(
     )
     model = AutoModelForCausalLM.from_pretrained(
         hf_model_dir,
-        torch_dtype=torch.float16,
+        dtype=torch.float16,
         device_map="auto",
         trust_remote_code=True,
     )
@@ -35,7 +35,7 @@ def make_pixtral_vision_config():
 pretrained_config=transformers.PixtralVisionConfig(
     hidden_size=1024,
     num_attention_heads=16,
-    torch_dtype=torch.bfloat16,
+    dtype=torch.bfloat16,
     hidden_act="silu",
 ),
 )
@@ -356,7 +356,7 @@ class TestLoraAttentionPytorchFlowVsTRT(unittest.TestCase):
     rms_norm_eps=1e-5,
     vocab_size=32000,
     num_key_value_heads=self.head_num,
-    torch_dtype=self.torch_dtype)
+    dtype=self.torch_dtype)
 
 mapping = Mapping(world_size=1, tp_size=1, rank=0)
 kv_cache_config = KvCacheConfig(max_tokens=max_seq_len)
@@ -546,7 +546,7 @@ def test_bielik_11b_v2_2_instruct_multi_lora() -> None:
 print("Creating dummy LoRAs...")
 
 model = AutoModelForCausalLM.from_pretrained(model_dir,
-    torch_dtype=torch.bfloat16,
+    dtype=torch.bfloat16,
     device_map="auto")
 hf_modules = ["q_proj", "k_proj", "v_proj"]
 peft_lora_config = PeftLoraConfig(r=8,
@@ -599,7 +599,7 @@ def test_gemma3_1b_instruct_multi_lora() -> None:
 print("Creating dummy LoRAs...")
 
 model = AutoModelForCausalLM.from_pretrained(model_dir,
-    torch_dtype=torch.bfloat16,
+    dtype=torch.bfloat16,
     device_map="auto")
 hf_modules = ["q_proj", "k_proj", "v_proj"]
 peft_lora_config = PeftLoraConfig(r=8,
@@ -222,7 +222,7 @@ def create_gpt_attention_network(attention_type='gpt2_attention',
     embd_pdrop=0,
     attn_pdrop=0,
     hidden_act='gelu',
-    torch_dtype=dtype,
+    dtype=dtype,
 )
 attention = GPT2Attention(configuration).cuda().eval()
 
@@ -125,7 +125,7 @@ class TestFunctional(unittest.TestCase):
     num_attention_heads=num_heads,
     vocab_size=30522,
     hidden_act='gelu',
-    torch_dtype=dtype,
+    dtype=dtype,
 )
 attention = AttentionCls(configuration).cuda().eval()
 
@@ -883,7 +883,7 @@ class TestFunctional(unittest.TestCase):
     embd_pdrop=0,
     attn_pdrop=0,
     hidden_act='gelu',
-    torch_dtype=dtype,
+    dtype=dtype,
     attn_implementation='eager')
 
 if attention_type in ['gptj_attention', 'llama_attention']:
@@ -575,7 +575,7 @@ class TestFunctional(unittest.TestCase):
     embd_pdrop=0,
     attn_pdrop=0,
     hidden_act='gelu',
-    torch_dtype=dtype,
+    dtype=dtype,
     attn_implementation='eager')
 if attention_type == 'llama_attention':
     configuration.num_key_value_heads = num_kv_heads
@@ -56,7 +56,7 @@ class TestGPT(unittest.TestCase):
     activation_function=hidden_act,
     n_layer=n_layer,
     max_length=max_length,
-    torch_dtype=dtype,
+    dtype=dtype,
 )
 gpt_config.n_kv_head = gpt_config.n_head
 hf_gpt = GPT2LMHeadModel(gpt_config).cuda().eval()
@@ -506,7 +506,7 @@ class TestLLaMA(unittest.TestCase):
         "model": "cpu",
         "lm_head": "cpu"
     },  # Load to CPU memory
-    torch_dtype="auto")
+    dtype="auto")
 assert hf_llama.config.torch_dtype == torch.float16
 kv_dtype = trt.float16 if hf_llama.config.torch_dtype == torch.float16 else trt.float32
 max_context_length = 128  # for loader tests this value does not matter
@@ -177,7 +177,7 @@ class TestMamba(unittest.TestCase):
 
 # get hf mamba
 hf_mamba = AutoModelForCausalLM.from_config(
-    hf_config, torch_dtype=str_dtype_to_torch(dtype)).cuda().eval()
+    hf_config, dtype=str_dtype_to_torch(dtype)).cuda().eval()
 
 # inputs
 if remove_padding:
@@ -373,7 +373,7 @@ class TestMamba(unittest.TestCase):
 
 # get hf mamba
 hf_mamba = AutoModelForCausalLM.from_pretrained(
-    hf_path, device_map='cpu', torch_dtype=str_dtype_to_torch(dtype))
+    hf_path, device_map='cpu', dtype=str_dtype_to_torch(dtype))
 
 # get tensort llm mamba
 hf_config = MambaConfig.from_pretrained(hf_path)
@@ -514,7 +514,7 @@ class TestMistralAndArctic(unittest.TestCase):
         "model": "cpu",
         "lm_head": "cpu"
     },  # Load to CPU memory
-    torch_dtype="auto")
+    dtype="auto")
 assert hf_mistral.config.torch_dtype == torch.float16
 kv_dtype = trt.float16 if hf_mistral.config.torch_dtype == torch.float16 else trt.float32
 max_context_length = 128  # for loader tests this value does not matter
@@ -386,7 +386,7 @@ class TestNemotronNas(unittest.TestCase):
 hf_model = transformers.AutoModelForCausalLM.from_pretrained(
     hf_model_dir,
     trust_remote_code=True,
-    torch_dtype=tensorrt_llm._utils.str_dtype_to_torch(params.dtype),
+    dtype=tensorrt_llm._utils.str_dtype_to_torch(params.dtype),
 ).cuda()
 runtime, config = self._from_hf_model(hf_model, params)
 self.allclose(
@@ -720,8 +720,7 @@ class TestNemotronNas(unittest.TestCase):
 from_pretrained(
     hf_model_dir,
     trust_remote_code=True,
-    torch_dtype=tensorrt_llm._utils.str_dtype_to_torch(params.dtype
-    ),
+    dtype=tensorrt_llm._utils.str_dtype_to_torch(params.dtype),
 ).cuda(),
 atol=
 0.92,  # We've observed that on a real checkpoint with the current code, fp8 MMLU is on par with BF16, and this is the observed threshold, though it may seem high.
@@ -747,8 +746,7 @@ class TestNemotronNas(unittest.TestCase):
 from_pretrained(
     hf_model_dir,
     trust_remote_code=True,
-    torch_dtype=tensorrt_llm._utils.str_dtype_to_torch(params.dtype
-    ),
+    dtype=tensorrt_llm._utils.str_dtype_to_torch(params.dtype),
     device_map="auto",
 ),
 )
@@ -934,7 +932,7 @@ class TestNemotronNas(unittest.TestCase):
 dtype = tensorrt_llm._utils.str_dtype_to_torch(dtype)
 
 hf_model = transformers.AutoModelForCausalLM.from_pretrained(
-    hf_model_dir, trust_remote_code=True, torch_dtype=dtype).cuda()
+    hf_model_dir, trust_remote_code=True, dtype=dtype).cuda()
 
 batch_size = 1
 max_seq_len = 30
@@ -934,7 +934,7 @@ class TestSmoothQuant(unittest.TestCase):
     embd_pdrop=0,
     attn_pdrop=0,
     hidden_act='gelu',
-    torch_dtype=dtype,
+    dtype=dtype,
 )
 n_positions = configuration.n_positions
 