import json
import os
from pathlib import Path

import numpy as np
import torch

from .._utils import (_str_to_np_dict, fromfile, numpy_to_torch,
                      str_dtype_to_torch)


class LoraConfig(object):

    def __init__(self,
                 hf_lora_dir: str = None,
                 adapter_config: dict = {},
                 tokenizer_config: dict = {},
                 lora_target_modules: list = [],
                 is_valid: bool = False,
                 lm_head_weight=None,
                 embedding_weight=None,
                 hf_modules_to_trtllm_modules: dict = {}):
        self.hf_lora_dir = hf_lora_dir
        self.adapter_config = adapter_config
        self.tokenizer_config = tokenizer_config
        self.hf_lora_target_modules = lora_target_modules
        self.lora_target_modules = [
            hf_modules_to_trtllm_modules[m] for m in lora_target_modules
        ]
        self.is_valid = is_valid
        self.lm_head_weight = lm_head_weight
        self.embedding_weight = embedding_weight
        self.vocab_size, self.hidden_size = (
            self.lm_head_weight.shape
            if self.lm_head_weight is not None else (0, 0))

    @classmethod
    def from_hf(cls, hf_lora_dir, hf_modules_to_trtllm_modules):
        lora_target_modules = []
        adapter_config = None
        tokenizer_config = None
        is_valid = True

        if os.path.exists(f"{hf_lora_dir}/adapter_config.json"):
            with open(f"{hf_lora_dir}/adapter_config.json") as f:
                adapter_config = json.load(f)
            lora_target_modules = adapter_config["target_modules"]
        else:
            is_valid = False

        if os.path.exists(f"{hf_lora_dir}/tokenizer_config.json"):
            with open(f"{hf_lora_dir}/tokenizer_config.json") as f:
                tokenizer_config = json.load(f)
        else:
            is_valid = False

        lm_head_weight = None
        embedding_weight = None
        if os.path.exists(f"{hf_lora_dir}/adapter_model.bin"):
            lora_weight = torch.load(f"{hf_lora_dir}/adapter_model.bin")

            if adapter_config is not None and adapter_config[
                    "modules_to_save"] is not None:
                if "lm_head" in adapter_config["modules_to_save"]:
                    lm_head_weight = lora_weight[
                        "base_model.model.lm_head.weight"]
                if "embed_tokens" in adapter_config["modules_to_save"]:
                    embedding_weight = lora_weight[
                        "base_model.model.model.embed_tokens.weight"]

        return cls(hf_lora_dir, adapter_config, tokenizer_config,
                   lora_target_modules, is_valid, lm_head_weight,
                   embedding_weight, hf_modules_to_trtllm_modules)


class LoraManager(object):

    def __init__(self):
        self._lora_uid_to_key = {}
        '''
        _lora_uid_to_low_ranks: dict[str -> List[dict[str -> int]]]
        {
            uid: [
                {lora_module: int},  # layer_0_rank,
                {lora_module: int},  # layer_1_rank,
                ...
            ]
        }

        _lora_weights_pointers_list:
        [
            {
                uid: {
                    lora_module_1: [t_in, t_out]
                    lora_module_2: [t_in, t_out]
                }
            },  # layer_0
            {},  # layer_1
            ...
        ]
        '''
        self._lora_uid_to_low_ranks = {}
        self._lora_weights = []
        self._lora_weights_pointers_list = []

    def load_from_ckpt(self, model_dir, model_config, runtime_mapping,
                       ckpt_source):
        if ckpt_source == "hf":
            self.load_from_hf(model_dir, model_config, runtime_mapping)
        elif ckpt_source == "nemo":
            self.load_from_nemo(model_dir, model_config, runtime_mapping)
        else:
            assert False, f"LoraManager does not support source {ckpt_source}"

    def load_from_nemo(self, model_dir, model_config, runtime_mapping):
        '''
        Load LoRA modules; this could be moved to the client side.
        '''
        self._model_config = model_config
        model_dir = Path(model_dir)

        with open(model_dir / "lora_weights.json", 'r') as f:
            config = json.load(f)
        lora_config = config['lora_config']
        for key in lora_config['lora_kqv_adapter']:
            self._lora_uid_to_key[lora_config['lora_kqv_adapter'][key]
                                  ['key']] = key

        lora_target_modules = model_config.lora_target_modules
        dtype = model_config.dtype

        for layer_idx in range(model_config.num_layers):
            self._lora_weights_pointers_list.append({})

            for uid, key in self._lora_uid_to_key.items():
                self._lora_weights_pointers_list[layer_idx].update({uid: {}})

                low_rank = int(lora_config['lora_kqv_adapter'][key]['low_rank'])

                for lora_module in lora_target_modules:
                    if uid not in self._lora_uid_to_low_ranks:
                        self._lora_uid_to_low_ranks.update(
                            {uid: [{} for _ in range(model_config.num_layers)]})
                    self._lora_uid_to_low_ranks[uid][layer_idx][
                        lora_module] = low_rank

                    prefix = f"model.model.language_model.encoder.layers.{layer_idx}.self_attention.adapter_layer.lora_kqv_adapter.{key}"
                    t_in = numpy_to_torch(
                        np.ascontiguousarray(
                            fromfile(
                                model_dir, f'{prefix}.linear_in.weight.bin',
                                [model_config.hidden_size, low_rank],
                                _str_to_np_dict['bfloat16']).transpose(
                                    1, 0))).cuda()  # t_in: [low_rank, hidden_size]
                    t_out = numpy_to_torch(
                        np.ascontiguousarray(
                            fromfile(
                                model_dir, f'{prefix}.linear_out.weight.bin',
                                [low_rank, model_config.hidden_size * 3],
                                _str_to_np_dict['bfloat16']).transpose(
                                    1, 0))).cuda()  # t_out: [hidden_size * 3, low_rank]

                    t_in = t_in.float().to(str_dtype_to_torch(dtype))
                    t_out = t_out.float().to(str_dtype_to_torch(dtype))

                    self._lora_weights_pointers_list[layer_idx][uid].update({
                        lora_module: [
                            t_in.contiguous().data_ptr(),
                            t_out.contiguous().data_ptr()
                        ]
                    })

                    self._lora_weights.append(t_in)
                    self._lora_weights.append(t_out)

            # uid "-1" means "no LoRA applied": register rank 0 for this layer.
            if "-1" not in self._lora_uid_to_low_ranks:
                self._lora_uid_to_low_ranks.update(
                    {"-1": [{} for _ in range(model_config.num_layers)]})
            self._lora_uid_to_low_ranks["-1"][layer_idx][lora_module] = 0

    def load_from_hf(self, model_dir, model_config, runtime_mapping):
        '''
        LoRA config of https://huggingface.co/hfl/chinese-alpaca-2-lora-7b:
        {
            "base_model_name_or_path": "/Llama-2-7b-hf",
            "bias": "none",
            "enable_lora": null,
            "fan_in_fan_out": false,
            "inference_mode": true,
            "lora_alpha": 128.0,
            "lora_dropout": 0.05,
            "merge_weights": false,
            "modules_to_save": [
                "embed_tokens",
                "lm_head"
            ],
            "peft_type": "LORA",
            "r": 64,
            "target_modules": [
                "q_proj",
                "v_proj",
                "k_proj",
                "o_proj",
                "gate_proj",
                "down_proj",
                "up_proj"
            ],
            "task_type": "CAUSAL_LM"
        }

        keys in adapter_model.bin:
            base_model.model.model.layers.0.self_attn.q_proj.lora_A.weight torch.Size([64, 4096])
            base_model.model.model.layers.0.self_attn.q_proj.lora_B.weight torch.Size([4096, 64])
            base_model.model.model.layers.0.self_attn.k_proj.lora_A.weight torch.Size([64, 4096])
            base_model.model.model.layers.0.self_attn.k_proj.lora_B.weight torch.Size([4096, 64])
            base_model.model.model.layers.0.self_attn.v_proj.lora_A.weight torch.Size([64, 4096])
            base_model.model.model.layers.0.self_attn.v_proj.lora_B.weight torch.Size([4096, 64])
            base_model.model.model.layers.0.self_attn.o_proj.lora_A.weight torch.Size([64, 4096])
            base_model.model.model.layers.0.self_attn.o_proj.lora_B.weight torch.Size([4096, 64])
            base_model.model.model.layers.0.mlp.gate_proj.lora_A.weight torch.Size([64, 4096])
            base_model.model.model.layers.0.mlp.gate_proj.lora_B.weight torch.Size([11008, 64])
            base_model.model.model.layers.0.mlp.up_proj.lora_A.weight torch.Size([64, 4096])
            base_model.model.model.layers.0.mlp.up_proj.lora_B.weight torch.Size([11008, 64])
            base_model.model.model.layers.0.mlp.down_proj.lora_A.weight torch.Size([64, 11008])
            base_model.model.model.layers.0.mlp.down_proj.lora_B.weight torch.Size([4096, 64])
            ...
        '''
        tp_size = runtime_mapping.tp_size
        tp_rank = runtime_mapping.tp_rank

        lora_model = torch.load(f"{model_dir}/adapter_model.bin")
        with open(f"{model_dir}/adapter_config.json", 'r') as f:
            hf_config = json.load(f)

        lora_target_modules = model_config.lora_target_modules
        dtype = model_config.dtype

        ranks = [0, hf_config["r"]]
        uids = ["-1", "0"]
        # TODO: the uids/ranks should come from a LoRA config
        for uid, rank in zip(uids, ranks):
            self._lora_uid_to_low_ranks[uid] = []

            for layer_idx in range(model_config.num_layers):
                self._lora_weights_pointers_list.append({})
                self._lora_weights_pointers_list[layer_idx].update({uid: {}})

                self._lora_uid_to_low_ranks[uid].append({})

                prefix = "base_model.model.model.layers"
                for lora_module in lora_target_modules:
                    if uid == "-1":
                        self._lora_uid_to_low_ranks[uid][layer_idx][
                            lora_module] = 0
                        continue

                    if lora_module == "attn_q" or lora_module == "attn_k" or lora_module == "attn_v":
                        name = f"{prefix}.{layer_idx}.{lora_module.replace('attn_', 'self_attn.')}_proj"
                        # not split
                        t_in = lora_model[f"{name}.lora_A.weight"]
                        # split by column
                        t_out = lora_model[f"{name}.lora_B.weight"]
                        assert t_out.shape[0] % tp_size == 0
                        t_out = torch.split(t_out,
                                            t_out.shape[0] // tp_size,
                                            dim=0)[tp_rank].contiguous()

                    elif lora_module == "attn_dense":
                        # split by row
                        t_in = lora_model[
                            f"{prefix}.{layer_idx}.self_attn.o_proj.lora_A.weight"]
                        assert t_in.shape[1] % tp_size == 0
                        t_in = torch.split(t_in,
                                           t_in.shape[1] // tp_size,
                                           dim=1)[tp_rank].contiguous()
                        # not split
                        t_out = lora_model[
                            f"{prefix}.{layer_idx}.self_attn.o_proj.lora_B.weight"]

                    elif lora_module == "mlp_h_to_4h":
                        # not split
                        t_in = lora_model[
                            f"{prefix}.{layer_idx}.mlp.gate_proj.lora_A.weight"]
                        # split by column
                        t_out = lora_model[
                            f"{prefix}.{layer_idx}.mlp.gate_proj.lora_B.weight"]
                        assert t_out.shape[0] % tp_size == 0
                        t_out = torch.split(t_out,
                                            t_out.shape[0] // tp_size,
                                            dim=0)[tp_rank].contiguous()

                    elif lora_module == "mlp_gate":
                        # not split
                        t_in = lora_model[
                            f"{prefix}.{layer_idx}.mlp.up_proj.lora_A.weight"]
                        # split by column
                        t_out = lora_model[
                            f"{prefix}.{layer_idx}.mlp.up_proj.lora_B.weight"]
                        assert t_out.shape[0] % tp_size == 0
                        t_out = torch.split(t_out,
                                            t_out.shape[0] // tp_size,
                                            dim=0)[tp_rank].contiguous()

                    elif lora_module == "mlp_4h_to_h":
                        # split by row
                        t_in = lora_model[
                            f"{prefix}.{layer_idx}.mlp.down_proj.lora_A.weight"]
                        assert t_in.shape[1] % tp_size == 0
                        t_in = torch.split(t_in,
                                           t_in.shape[1] // tp_size,
                                           dim=1)[tp_rank].contiguous()
                        # not split
                        t_out = lora_model[
                            f"{prefix}.{layer_idx}.mlp.down_proj.lora_B.weight"]

                    t_in = t_in.cuda().contiguous()
                    t_out = t_out.cuda().contiguous()
                    t_in = t_in.float().to(str_dtype_to_torch(dtype))
                    t_out = t_out.float().to(str_dtype_to_torch(dtype))

                    self._lora_weights_pointers_list[layer_idx][uid].update(
                        {lora_module: [t_in.data_ptr(),
                         t_out.data_ptr()]})

                    assert t_in.shape[0] == int(hf_config["r"])
                    self._lora_uid_to_low_ranks[uid][layer_idx][
                        lora_module] = int(hf_config["r"])

                    # prevent torch from freeing these buffers
                    self._lora_weights.append(t_in)
                    self._lora_weights.append(t_out)

        del lora_model

    def uid_to_key(self, uid: str):
        assert isinstance(uid, str)
        return self._lora_uid_to_key[uid]

    def uid_to_low_ranks(self, uid: str):
        assert isinstance(uid, str)
        return self._lora_uid_to_low_ranks[uid]

    @property
    def lora_weights(self):
        return self._lora_weights

    @property
    def lora_weights_pointers_list(self):
        return self._lora_weights_pointers_list
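

# ---------------------------------------------------------------------------
# Usage sketch (illustrative only): how a runtime could drive LoraManager for
# a HuggingFace PEFT LoRA checkpoint. The directory path, the SimpleNamespace
# stand-ins for model_config / runtime_mapping, and the field values below are
# assumptions made for this example, not part of this module; a real
# model_config comes from the built engine, and loading requires a CUDA
# device because the adapter weights are moved to the GPU.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    from types import SimpleNamespace

    # Hypothetical engine description; only the fields read by load_from_hf
    # are populated here.
    model_config = SimpleNamespace(
        num_layers=32,
        dtype="float16",
        lora_target_modules=["attn_q", "attn_k", "attn_v"],
    )
    # Single-GPU mapping: no tensor-parallel splitting of the LoRA weights.
    runtime_mapping = SimpleNamespace(tp_size=1, tp_rank=0)

    manager = LoraManager()
    # "/path/to/hf_lora_dir" is a placeholder for a directory containing
    # adapter_config.json and adapter_model.bin.
    manager.load_from_ckpt("/path/to/hf_lora_dir",
                           model_config,
                           runtime_mapping,
                           ckpt_source="hf")

    # uid "0" is the adapter loaded above; uid "-1" means "no LoRA" (rank 0).
    print(manager.uid_to_low_ranks("0")[0])  # {lora_module: rank} for layer 0
    print(manager.lora_weights_pointers_list[0]["0"])  # {lora_module: [in_ptr, out_ptr]}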