Merge pull request #45 from 0-yy-0/main

Model_Architecture_Discussions 新增 MiniCPM
2026-06-06 00:04:42 +00:00 · 2024-08-01 16:10:23 +08:00
parent 6f214b20d5 0a52a81a03
commit d323d5cf26
12 changed files with 297006 additions and 0 deletions
@@ -0,0 +1,726 @@
+import math
+import warnings
+from typing import List, Optional, Tuple, Union, Dict
+from collections import OrderedDict
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+from torch.nn import CrossEntropyLoss
+import re
+from dataclasses import dataclass
+
+
+import logging
+from configuration_minicpm import MiniCPMConfig  # 直接导入
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class BaseModelOutputWithPast(OrderedDict):
+    # Output container built by MiniCPMModel.forward when a dict-style result is requested.
+    # NOTE(review): the dataclass-generated __init__ only assigns attributes; no keys are
+    # inserted into the underlying OrderedDict, so read the fields by attribute.
+
+    # Hidden states produced after the final normalisation layer.
+    last_hidden_state: torch.FloatTensor = None
+    # KV cache handed back by the decoder layers (None when caching is off).
+    past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
+    # Per-layer hidden states, collected only when output_hidden_states is set.
+    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
+    # Per-layer attention weights, collected only when output_attentions is set.
+    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
+    
+@dataclass
+class CausalLMOutputWithPast(OrderedDict):
+    # Output container built by MiniCPMForCausalLM.forward when a dict-style result is requested.
+    # NOTE(review): the dataclass-generated __init__ only assigns attributes; no keys are
+    # inserted into the underlying OrderedDict, so read the fields by attribute.
+
+    # Cross-entropy loss; stays None when no labels were supplied.
+    loss: Optional[torch.FloatTensor] = None
+    # Vocabulary logits produced by the LM head.
+    logits: torch.FloatTensor = None
+    # KV cache forwarded from the backbone output.
+    past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
+    # Per-layer hidden states forwarded from the backbone output.
+    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
+    # Per-layer attention weights forwarded from the backbone output.
+    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
+
+
+class MiniCPMRotaryEmbedding(nn.Module):
+    """Rotary position embedding (RoPE) lookup table with a lazily grown cache.
+
+    Args:
+        dim: Size of the feature dimension that gets rotated.
+        max_position_embeddings: Number of positions precomputed at construction.
+        base: Base of the geometric progression of rotation frequencies.
+        device: Device on which the inverse frequencies are created.
+    """
+
+    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
+        super().__init__()
+
+        self.dim = dim
+        self.max_position_embeddings = max_position_embeddings
+        self.base = base
+        # inv_freq[i] = base ** (-2i / dim). Registered as a non-persistent buffer so
+        # it follows module.to(...) but is not written to the state dict.
+        inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+
+        # Precompute the cos/sin tables for the configured maximum length.
+        self._set_cos_sin_cache(
+            seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.float32
+        )
+
+    def _set_cos_sin_cache(self, seq_len, device, dtype):
+        """(Re)build the cached cos/sin tables for positions ``0 .. seq_len - 1``."""
+        self.max_seq_len_cached = seq_len
+        t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
+        # Move inv_freq next to ``t`` so a rebuild requested for another device
+        # does not fail on a device mismatch.
+        freqs = torch.outer(t, self.inv_freq.to(device))
+
+        # Duplicate the angles so the table spans the full ``dim`` features.
+        emb = torch.cat((freqs, freqs), dim=-1)
+
+        self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
+        self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
+
+    def forward(self, x, seq_len=None):
+        """Return the ``(cos, sin)`` tables for the first ``seq_len`` positions.
+
+        Args:
+            x: Tensor laid out as ``[bs, num_attention_heads, seq_len, head_size]``;
+                only its dtype, device and (when ``seq_len`` is omitted) its
+                second-to-last dimension are used.
+            seq_len: Number of positions required. Defaults to ``x.shape[-2]``.
+
+        Returns:
+            Tuple ``(cos, sin)``, each of shape ``[seq_len, dim]`` in ``x.dtype``.
+        """
+        if seq_len is None:
+            # The declared default used to crash on ``None > int``.
+            seq_len = x.shape[-2]
+
+        # Grow the cache when a longer sequence than any seen so far is requested.
+        if seq_len > self.max_seq_len_cached:
+            self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)
+
+        return (
+            self.cos_cached[:seq_len].to(dtype=x.dtype),
+            self.sin_cached[:seq_len].to(dtype=x.dtype),
+        )
+        
+def rotate_half(x):
+    """Swap the two halves of the last dimension, negating the second half.
+
+    For ``x = [a, b]`` (split at ``x.shape[-1] // 2``) the result is ``[-b, a]``.
+    """
+    midpoint = x.shape[-1] // 2
+    front, back = x[..., :midpoint], x[..., midpoint:]
+    return torch.cat((back.neg(), front), dim=-1)
+
+
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
+    """Rotate query and key tensors with the RoPE tables selected by ``position_ids``.
+
+    The rotation is evaluated in float32 and both results are cast back to the
+    dtype of ``k``.
+
+    Args:
+        q: Query tensor.
+        k: Key tensor; its dtype is the dtype of both returned tensors.
+        cos: Cosine table indexed by position.
+        sin: Sine table indexed by position.
+        position_ids: Indices used to gather rows from ``cos`` and ``sin``.
+        unsqueeze_dim: Dimension inserted into the gathered tables so they
+            broadcast against ``q`` and ``k``.
+
+    Returns:
+        Tuple ``(q_embed, k_embed)``.
+    """
+    # Remember the dtype to restore after the float32 computation.
+    target_dtype = k.dtype
+
+    # Gather the per-position rows and add the broadcast dimension.
+    cos_sel = cos[position_ids].unsqueeze(unsqueeze_dim)
+    sin_sel = sin[position_ids].unsqueeze(unsqueeze_dim)
+
+    def _rotate(tensor):
+        # Upcast, apply x * cos + rotate_half(x) * sin, then downcast.
+        tensor_fp32 = tensor.to(dtype=torch.float32, device=tensor.device)
+        rotated = (tensor_fp32 * cos_sel) + (rotate_half(tensor_fp32) * sin_sel)
+        return rotated.to(dtype=target_dtype)
+
+    return _rotate(q), _rotate(k)
+
+
+def create_causal_mask(input_shape, dtype, device, past_length=0):
+    """Build an additive causal mask of shape ``[batch, 1, query, past + query]``.
+
+    Entries above the diagonal of the query block hold ``torch.finfo(dtype).min``;
+    every other entry, including the whole ``past_length`` prefix, is zero.
+
+    Args:
+        input_shape: ``(batch_size, query_length)``.
+        dtype: Floating dtype of the mask.
+        device: Device on which the mask is created.
+        past_length: Number of cached positions prepended as fully visible columns.
+    """
+    batch_size, query_length = input_shape
+    lowest = torch.finfo(dtype).min
+
+    # Strictly upper-triangular block: a token may not attend to later tokens.
+    blocked = torch.full((query_length, query_length), lowest, dtype=dtype, device=device)
+    mask = torch.triu(blocked, diagonal=1)
+
+    # Cached positions are visible to every query row.
+    if past_length > 0:
+        visible_past = torch.zeros(query_length, past_length, dtype=dtype, device=device)
+        mask = torch.cat([visible_past, mask], dim=-1)
+
+    total_length = query_length + past_length
+    return mask[None, None, :, :].expand(batch_size, 1, query_length, total_length)
+
+def expand_attention_mask(mask, dtype, target_length=None):
+    """Turn a ``[batch, source]`` keep-mask into an additive ``[batch, 1, target, source]`` mask.
+
+    Positions where ``mask`` is 1 become 0; positions where it is 0 become
+    ``torch.finfo(dtype).min``.
+
+    Args:
+        mask: 2-D mask with 1 for tokens to keep and 0 for tokens to hide.
+        dtype: Floating dtype of the returned mask.
+        target_length: Number of query rows; defaults to the source length.
+    """
+    batch_size, source_length = mask.shape
+    if target_length is None:
+        target_length = source_length
+
+    # Broadcast the per-token mask over the query dimension.
+    keep = mask[:, None, None, :].expand(batch_size, 1, target_length, source_length).to(dtype)
+    # Flip it: 1 now marks the positions that must be hidden.
+    hide = 1.0 - keep
+    return hide.masked_fill(hide.bool(), torch.finfo(dtype).min)
+
+def prepare_4d_causal_attention_mask(
+    attention_mask: Optional[torch.Tensor],
+    query_length: int,
+    past_length: int,
+    dtype: torch.dtype,
+    device: Union[torch.device, "str"] = "cpu",
+):
+    """Combine a 2-D padding mask with a causal mask into one additive 4-D mask.
+
+    Args:
+        attention_mask: ``[batch, past_length + query_length]`` mask with 1 for
+            real tokens and 0 for padding, or ``None`` for "no padding".
+        query_length: Number of new tokens in this forward pass.
+        past_length: Number of positions already held in the KV cache.
+        dtype: Floating dtype of the returned mask.
+        device: Device on which the causal part is created.
+
+    Returns:
+        An additive mask of shape ``[batch, 1, query_length, source_length]``.
+        When ``attention_mask`` is ``None`` the batch dimension is 1 (the batch
+        size is unknown here), and ``None`` is returned for a single-token
+        query, which needs no masking at all.
+
+    Raises:
+        ValueError: If ``attention_mask`` is given but is not 2-D.
+    """
+    lowest = torch.finfo(dtype).min
+
+    if attention_mask is None:
+        # Previously this path crashed with UnboundLocalError.
+        if query_length <= 1:
+            return None
+        return create_causal_mask((1, query_length), dtype, device, past_length)
+
+    if attention_mask.dim() != 2:
+        raise ValueError(
+            f"attention_mask must be 2-D [batch, sequence], but has {attention_mask.dim()} dimensions"
+        )
+
+    batch_size = attention_mask.shape[0]
+    expanded_mask = expand_attention_mask(attention_mask, dtype, query_length)
+
+    # A single query token can see every cached position, so only padding matters.
+    if query_length <= 1:
+        return expanded_mask
+
+    # Hide padded positions on top of the causal pattern. The combined mask used
+    # to be overwritten by the padding-only mask, which dropped causality.
+    causal_mask = create_causal_mask((batch_size, query_length), dtype, device, past_length)
+    return causal_mask.masked_fill(expanded_mask.bool(), lowest)
+
+class MiniCPMAttention(nn.Module):
+    """Multi-head self-attention with rotary position embeddings (RoPE).
+
+    The KV cache is a pair ``(keys, values)`` of Python lists shared by all
+    layers; entry ``layer_idx`` of each list holds this layer's cached tensor of
+    shape ``[bsz, num_key_value_heads, cached_len, head_dim]``. The lists are
+    updated in place during ``forward``.
+    """
+
+    def __init__(self, config: MiniCPMConfig, layer_idx: Optional[int] = None):
+        super().__init__()
+        self.config = config
+        self.layer_idx = layer_idx
+        if layer_idx is None:
+            # The original called ``layer_idx.warn_once`` on None, which raised
+            # AttributeError instead of warning.
+            logger.warning(
+                f"Instantiating {self.__class__.__name__} without passing `layer_idx` is not recommended and will "
+                "lead to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` "
+                "when creating this class."
+            )
+
+        self.attention_dropout = config.attention_dropout  # e.g. 0.0
+        self.hidden_size = config.hidden_size  # e.g. 2304
+        self.num_heads = config.num_attention_heads  # e.g. 36
+        self.head_dim = self.hidden_size // self.num_heads  # e.g. 64
+        self.num_key_value_heads = config.num_key_value_heads  # e.g. 36
+        self.num_key_value_groups = self.num_heads // self.num_key_value_heads  # e.g. 1
+        self.max_position_embeddings = config.max_position_embeddings  # e.g. 2048
+        self.rope_theta = config.rope_theta  # e.g. 10000.0
+        self.is_causal = True
+
+        if (self.head_dim * self.num_heads) != self.hidden_size:
+            raise ValueError(
+                f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
+                f" and `num_heads`: {self.num_heads})."
+            )
+
+        self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias)
+        self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
+        self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
+        self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=config.attention_bias)
+        self._init_rope()
+
+    def _init_rope(self):
+        """Create the rotary embedding table for one attention head."""
+        self.rotary_emb = MiniCPMRotaryEmbedding(
+            self.head_dim,
+            max_position_embeddings=self.max_position_embeddings,
+            base=self.rope_theta,
+        )
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_value:  Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
+        output_attentions: bool = False,
+        use_cache: bool = False,
+        **kwargs,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        """Run self-attention over ``hidden_states``.
+
+        Args:
+            hidden_states: Input of shape ``[bsz, q_len, hidden_size]``.
+            attention_mask: Optional additive mask of shape
+                ``[bsz, 1, q_len, kv_seq_len]``.
+            position_ids: Positions of the ``q_len`` new tokens. Defaults to the
+                range that directly follows the cached positions.
+            past_key_value: Optional ``(keys, values)`` pair of per-layer lists;
+                updated in place with this layer's new keys and values.
+            output_attentions: Whether to return the attention probabilities.
+            use_cache: Accepted for interface compatibility; the cache is
+                updated whenever ``past_key_value`` is not ``None``.
+
+        Returns:
+            ``(attn_output, attn_weights or None, past_key_value)``.
+
+        Raises:
+            ValueError: If the attention scores, the mask or the attention
+                output do not have the expected shape.
+        """
+        bsz, q_len, _ = hidden_states.size()
+
+        # Project to queries, keys and values.
+        query_states = self.q_proj(hidden_states)
+        key_states = self.k_proj(hidden_states)
+        value_states = self.v_proj(hidden_states)
+
+        # Split into heads: [bsz, heads, q_len, head_dim].
+        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+
+        # Number of positions already cached for this layer. The sequence axis of
+        # the cached tensor is -2; the original read axis 0, the batch size.
+        past_len = 0
+        if (
+            past_key_value is not None
+            and len(past_key_value) > 0
+            and len(past_key_value[0]) > self.layer_idx
+            and past_key_value[0][self.layer_idx].dim() > 1
+        ):
+            past_len = past_key_value[0][self.layer_idx].shape[-2]
+        kv_seq_len = q_len + past_len
+
+        if position_ids is None:
+            # New tokens directly follow the cached ones.
+            position_ids = torch.arange(
+                past_len, kv_seq_len, dtype=torch.long, device=hidden_states.device
+            ).unsqueeze(0)
+
+        # The tensor passed here only supplies dtype and device for the tables.
+        cos, sin = self.rotary_emb(value_states.to(torch.float32), seq_len=kv_seq_len)
+
+        # Apply RoPE to the new queries and keys only; cached keys are already rotated.
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
+
+        if past_key_value is not None:
+            if len(past_key_value[0]) <= self.layer_idx:
+                # First pass through this layer: start its cache.
+                past_key_value[0].append(key_states)
+                past_key_value[1].append(value_states)
+            else:
+                # Extend the existing cache along the sequence axis.
+                past_key_value[0][self.layer_idx] = torch.cat([past_key_value[0][self.layer_idx], key_states], dim=-2)
+                past_key_value[1][self.layer_idx] = torch.cat([past_key_value[1][self.layer_idx], value_states], dim=-2)
+
+            key_states, value_states = past_key_value[0][self.layer_idx], past_key_value[1][self.layer_idx]
+
+        # Grouped-query attention: every KV head serves ``num_key_value_groups``
+        # query heads. The cache keeps the compact, un-repeated tensors.
+        if self.num_key_value_groups > 1:
+            key_states = key_states.repeat_interleave(self.num_key_value_groups, dim=1)
+            value_states = value_states.repeat_interleave(self.num_key_value_groups, dim=1)
+
+        attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
+
+        if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
+            raise ValueError(
+                f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
+                f" {attn_weights.size()}"
+            )
+
+        if attention_mask is not None:
+            if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
+                raise ValueError(
+                    f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
+                )
+            attn_weights = attn_weights + attention_mask
+
+        # Softmax in float32 for numerical stability, then back to the working dtype.
+        attn_weights = F.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
+        attn_weights = F.dropout(attn_weights, p=self.attention_dropout, training=self.training)
+        attn_output = torch.matmul(attn_weights, value_states)
+
+        if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
+            raise ValueError(
+                f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
+                f" {attn_output.size()}"
+            )
+
+        # Merge the heads back: [bsz, q_len, hidden_size].
+        attn_output = attn_output.transpose(1, 2).contiguous()
+        attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
+        attn_output = self.o_proj(attn_output)
+
+        if not output_attentions:
+            attn_weights = None
+
+        return attn_output, attn_weights, past_key_value
+
+class MiniCPMRMSNorm(nn.Module):
+    """Root-mean-square layer normalisation with a learnable per-feature scale."""
+
+    def __init__(self, hidden_size, eps=1e-6):
+        super().__init__()
+        # Added to the mean square before the reciprocal square root.
+        self.variance_epsilon = eps
+        # Per-feature gain, initialised to one.
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+
+    def forward(self, hidden_states):
+        """Scale ``hidden_states`` by the inverse RMS of its last dimension, then by ``weight``."""
+        input_dtype = hidden_states.dtype
+        # The mean of squares is accumulated in float32.
+        mean_square = hidden_states.to(torch.float32).pow(2).mean(dim=-1, keepdim=True)
+        inv_rms = torch.rsqrt(mean_square + self.variance_epsilon)
+        # Normalise, return to the input dtype, then apply the learned gain.
+        normalised = (hidden_states * inv_rms).to(input_dtype)
+        return normalised * self.weight
+    
+class MiniCPMMLP(nn.Module):
+    """Gated feed-forward block: ``down_proj(SiLU(gate_proj(x)) * up_proj(x))``."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.intermediate_size = config.intermediate_size
+        # All three projections are bias-free.
+        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
+        self.act_fn = nn.SiLU()
+
+    def forward(self, x):
+        """Apply the gated projection and map back to ``hidden_size``."""
+        gate = self.act_fn(self.gate_proj(x))
+        gated = gate * self.up_proj(x)
+        return self.down_proj(gated)
+    
+class MiniCPMPreTrainedModel(nn.Module):
+    """Common base of the MiniCPM modules: stores the config and the weight initialiser.
+
+    The configuration is taken from the first positional argument or from the
+    ``config`` keyword argument.
+
+    Raises:
+        TypeError: If no configuration is supplied.
+    """
+
+    def __init__(self, *args, **kwargs):
+        # Set up nn.Module before assigning attributes on the instance.
+        super().__init__()
+
+        if args:
+            self.config = args[0]
+        elif "config" in kwargs:
+            self.config = kwargs["config"]
+        else:
+            raise TypeError(f"{self.__class__.__name__} requires a `config` argument")
+
+    @property
+    def device(self):
+        """Device of the first parameter, or CPU when the module has no parameters."""
+        first_param = next(self.parameters(), None)
+        return first_param.device if first_param is not None else torch.device("cpu")
+
+    def _init_weights(self, module):
+        """Initialise one ``nn.Linear`` or ``nn.Embedding``; other modules are left as is.
+
+        Weights are drawn from ``N(0, config.initializer_range)``; linear biases
+        and the embedding row at ``padding_idx`` are zeroed.
+        """
+        std = self.config.initializer_range
+        if isinstance(module, nn.Linear):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+
+
+class MiniCPMDecoderLayer(nn.Module):
+    """One pre-norm decoder block: self-attention then MLP, each with a scaled residual.
+
+    Both sub-layer outputs are multiplied by
+    ``scale_depth / sqrt(num_hidden_layers)`` before being added to the residual.
+    """
+
+    def __init__(self, config: MiniCPMConfig, layer_idx: int):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        self.self_attn = MiniCPMAttention(config=config, layer_idx=layer_idx)
+
+        self.mlp = MiniCPMMLP(config)
+        self.input_layernorm = MiniCPMRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = MiniCPMRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+        self.scale_depth = config.scale_depth
+        self.num_hidden_layers = config.num_hidden_layers
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_value: Optional[Tuple[torch.Tensor]] = None,
+        output_attentions: Optional[bool] = False,
+        use_cache: Optional[bool] = False,
+        **kwargs,
+    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
+        """Run the block and return ``(hidden_states[, attn_weights][, present_key_value])``.
+
+        The attention weights are included only when ``output_attentions`` is
+        truthy and the cache only when ``use_cache`` is truthy, in that order.
+        """
+        # Shared multiplier for both residual branches.
+        branch_scale = self.scale_depth / math.sqrt(self.num_hidden_layers)
+
+        # Attention sub-layer (pre-norm).
+        attn_input = self.input_layernorm(hidden_states)
+        attn_output, self_attn_weights, present_key_value = self.self_attn(
+            hidden_states=attn_input,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_value=past_key_value,
+            output_attentions=output_attentions,
+            use_cache=use_cache,
+            **kwargs,
+        )
+        after_attention = hidden_states + attn_output * branch_scale
+
+        # Feed-forward sub-layer (pre-norm).
+        mlp_output = self.mlp(self.post_attention_layernorm(after_attention))
+        block_output = after_attention + mlp_output * branch_scale
+
+        results = [block_output]
+        if output_attentions:
+            results.append(self_attn_weights)
+        if use_cache:
+            results.append(present_key_value)
+        return tuple(results)
+
+    
+class MiniCPMModel(MiniCPMPreTrainedModel):
+    """Decoder-only backbone: token embedding, a stack of decoder layers and a final RMSNorm.
+
+    ``_init_weights`` is inherited from ``MiniCPMPreTrainedModel``; it is not
+    applied automatically at construction.
+    """
+
+    def __init__(self, config: MiniCPMConfig):
+        super().__init__(config)
+
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+
+        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
+        self.layers = nn.ModuleList(
+            [MiniCPMDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
+        )
+
+        self.norm = MiniCPMRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+        self.gradient_checkpointing = False
+
+    def get_input_embeddings(self):
+        """Return the token embedding module."""
+        return self.embed_tokens
+
+    def set_input_embeddings(self, value):
+        """Replace the token embedding module."""
+        self.embed_tokens = value
+
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, BaseModelOutputWithPast]:
+        """Embed the input and run it through every decoder layer.
+
+        Args:
+            input_ids: Token ids of shape ``[batch, seq]``; exclusive with ``inputs_embeds``.
+            attention_mask: Optional ``[batch, past + seq]`` mask with 1 for real
+                tokens and 0 for padding. Defaults to all ones.
+            position_ids: Optional positions of the new tokens; defaults to the
+                range that follows the cached length.
+            past_key_values: Optional ``(keys, values)`` pair of per-layer lists,
+                updated in place by the attention layers.
+            inputs_embeds: Pre-computed embeddings, used as-is instead of ``input_ids``.
+            use_cache: Whether to return the cache; defaults to ``config.use_cache``.
+            output_attentions: Whether to collect attention weights.
+            output_hidden_states: Whether to collect per-layer hidden states.
+            return_dict: Return a ``BaseModelOutputWithPast`` instead of a tuple.
+
+        Returns:
+            ``BaseModelOutputWithPast``, or a tuple of its non-``None`` fields.
+
+        Raises:
+            ValueError: If both or neither of ``input_ids`` / ``inputs_embeds`` are given.
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+        elif input_ids is not None:
+            batch_size, seq_length = input_ids.shape[:2]
+        elif inputs_embeds is not None:
+            batch_size, seq_length = inputs_embeds.shape[:2]
+        else:
+            raise ValueError("You have to specify either input_ids or inputs_embeds")
+
+        # The attention layers use a supplied cache regardless of ``use_cache``,
+        # so its length must always be reflected in positions and masks.
+        past_key_values_length = 0
+        if (
+            past_key_values is not None
+            and len(past_key_values) > 0
+            and len(past_key_values[0]) > 0
+            and past_key_values[0][0].dim() > 2
+        ):
+            past_key_values_length = past_key_values[0][0].shape[-2]
+
+        if position_ids is None:
+            device = input_ids.device if input_ids is not None else inputs_embeds.device
+            position_ids = torch.arange(
+                past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device
+            )
+            position_ids = position_ids.unsqueeze(0)
+
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_tokens(input_ids) * self.config.scale_emb
+
+        if attention_mask is None:
+            # No padding information: treat every cached and new position as a real token.
+            attention_mask = torch.ones(
+                (batch_size, seq_length + past_key_values_length),
+                dtype=torch.long,
+                device=inputs_embeds.device,
+            )
+
+        attention_mask = prepare_4d_causal_attention_mask(
+            attention_mask, seq_length, past_key_values_length, inputs_embeds.dtype, inputs_embeds.device
+        )
+
+        hidden_states = inputs_embeds
+
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attns = () if output_attentions else None
+        next_cache = None
+
+        for decoder_layer in self.layers:
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
+
+            layer_outputs = decoder_layer(
+                hidden_states,
+                attention_mask=attention_mask,
+                position_ids=position_ids,
+                past_key_value=past_key_values,
+                output_attentions=output_attentions,
+                use_cache=use_cache,
+            )
+
+            hidden_states = layer_outputs[0]
+
+            if use_cache:
+                # The cache follows the attention weights when both are returned.
+                next_cache = layer_outputs[2 if output_attentions else 1]
+
+            if output_attentions:
+                all_self_attns += (layer_outputs[1],)
+
+        # Final normalisation.
+        hidden_states = self.norm(hidden_states)
+
+        # Also record the output of the last layer.
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)
+
+        if not return_dict:
+            return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
+        return BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=next_cache,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attns,
+        )
+            
+class MiniCPMForCausalLM(MiniCPMPreTrainedModel):
+    """MiniCPM backbone with a language-modelling head, plus minimal chat/generation helpers."""
+
+    _tied_weights_keys = ["lm_head.weight"]
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.model = MiniCPMModel(config)
+        self.vocab_size = config.vocab_size
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+    def get_input_embeddings(self):
+        """Return the backbone's token embedding module."""
+        return self.model.embed_tokens
+
+    def set_input_embeddings(self, value):
+        """Replace the backbone's token embedding module."""
+        self.model.embed_tokens = value
+
+    def get_output_embeddings(self):
+        """Return the LM head."""
+        return self.lm_head
+
+    def set_output_embeddings(self, new_embeddings):
+        """Replace the LM head."""
+        self.lm_head = new_embeddings
+
+    def set_decoder(self, decoder):
+        """Replace the backbone."""
+        self.model = decoder
+
+    def get_decoder(self):
+        """Return the backbone."""
+        return self.model
+
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, CausalLMOutputWithPast]:
+        """Compute next-token logits and, when ``labels`` are given, the LM loss.
+
+        Returns:
+            ``CausalLMOutputWithPast`` when ``return_dict`` resolves to true,
+            otherwise the tuple ``([loss,] logits, *non-None backbone extras)``.
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # Always ask the backbone for the structured output; reading
+        # ``.last_hidden_state`` from its tuple form raised AttributeError.
+        outputs = self.model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=True,
+        )
+
+        # Scale the final hidden states before projecting onto the vocabulary.
+        hidden_states = outputs.last_hidden_state
+        logits = self.lm_head(hidden_states / (self.config.hidden_size / self.config.dim_model_base))
+        logits = logits.float()
+
+        loss = None
+        if labels is not None:
+            # Shift so that position t predicts token t + 1.
+            shift_logits = logits[..., :-1, :].contiguous()
+            shift_labels = labels[..., 1:].contiguous()
+            # Flatten the tokens for the cross-entropy loss.
+            loss_fct = CrossEntropyLoss()
+            shift_logits = shift_logits.view(-1, self.config.vocab_size)
+            shift_labels = shift_labels.view(-1)
+            shift_labels = shift_labels.to(shift_logits.device)
+            loss = loss_fct(shift_logits, shift_labels)
+
+        if not return_dict:
+            extras = tuple(
+                v for v in (outputs.past_key_values, outputs.hidden_states, outputs.attentions) if v is not None
+            )
+            output = (logits,) + extras
+            return (loss,) + output if loss is not None else output
+
+        return CausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+    def prepare_inputs_for_generation(
+            self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
+        ):
+        """Build the keyword arguments of ``forward`` for one decoding step.
+
+        When the cache already holds tokens, ``input_ids`` is cut down to the
+        tokens not yet processed. Position ids are derived from
+        ``attention_mask`` unless passed in ``kwargs``.
+        """
+        has_cached_tokens = (
+            past_key_values is not None
+            and len(past_key_values) > 0
+            and len(past_key_values[0]) > 0
+            and past_key_values[0][0].dim() > 2
+        )
+        if has_cached_tokens:
+            past_length = past_key_values[0][0].shape[2]
+            # Keep only the tokens that are not covered by the cache.
+            if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]:
+                input_ids = input_ids[:, -(attention_mask.shape[1] - past_length):]
+            elif past_length < input_ids.shape[1]:
+                input_ids = input_ids[:, past_length:]
+
+        # Positions count real tokens only; padded slots get a dummy position.
+        position_ids = kwargs.get("position_ids", None)
+        if attention_mask is not None and position_ids is None:
+            position_ids = attention_mask.long().cumsum(-1) - 1
+            position_ids.masked_fill_(attention_mask == 0, 1)
+            if past_key_values:
+                position_ids = position_ids[:, -input_ids.shape[1]:]
+
+        # Embeddings can only stand in for ids when no cache object is passed.
+        if inputs_embeds is not None and past_key_values is None:
+            model_inputs = {"inputs_embeds": inputs_embeds}
+        else:
+            model_inputs = {"input_ids": input_ids}
+
+        model_inputs.update(
+            {
+                "position_ids": position_ids,
+                "past_key_values": past_key_values,
+                "use_cache": kwargs.get("use_cache"),
+                "attention_mask": attention_mask,
+            }
+        )
+        return model_inputs
+
+    @torch.inference_mode()
+    def chat(self, tokenizer, query: str, history: List[Dict] = None, role: str = "user",
+             max_length: int = 4096, num_beams=1, do_sample=True, top_p=0.8, temperature=0.3, logits_processor=None,
+             **kwargs):
+        """Append ``query`` to ``history``, generate a reply and return ``(response, history)``.
+
+        ``history`` is extended in place with the query and the reply.
+        ``max_length`` caps prompt plus reply and becomes ``max_new_tokens``
+        unless that is passed explicitly in ``kwargs``. ``num_beams``, ``top_p``
+        and ``logits_processor`` are accepted for signature compatibility but
+        ignored, because ``generate`` in this class does not implement them.
+        Extra ``kwargs`` are forwarded to ``generate``.
+        """
+        if history is None:
+            history = []
+
+        history.append({"role": role, "content": query})
+        history_str = tokenizer.apply_chat_template(history, tokenize=False, add_generation_prompt=False)
+
+        # nn.Module has no ``device`` attribute; take it from the parameters.
+        device = next(self.parameters()).device
+        inputs = tokenizer(history_str, return_tensors='pt').to(device)
+        prompt_length = inputs["input_ids"].shape[1]
+
+        # ``generate`` needs the tokenizer for the pad/eos ids.
+        gen_kwargs = {"temperature": temperature, "do_sample": do_sample, "tokenizer": tokenizer, **kwargs}
+        gen_kwargs.setdefault("max_new_tokens", max(max_length - prompt_length, 1))
+
+        outputs = self.generate(**inputs, **gen_kwargs)
+
+        # Keep only the newly generated tokens; drop a trailing EOS if present.
+        new_tokens = outputs[0, prompt_length:].tolist()
+        if new_tokens and new_tokens[-1] == tokenizer.eos_token_id:
+            new_tokens = new_tokens[:-1]
+        response = tokenizer.decode(new_tokens)
+
+        # Cut the reply at the first role marker the model may have emitted.
+        pattern = re.compile(r".*?(?=<AI>|<用户>)", re.DOTALL)
+        matches = pattern.findall(response)
+        if len(matches) > 0:
+            response = matches[0]
+        history.append({"role": "assistant", "content": response})
+        return response, history
+
+    @torch.no_grad()
+    def generate(self, input_ids, max_new_tokens=1024, temperature=1.0, top_k=None, use_cache=False, past_key_values=None, tokenizer=None, do_sample=False, **model_kwargs):
+        """Autoregressively extend ``input_ids`` by up to ``max_new_tokens`` tokens.
+
+        Args:
+            input_ids: Prompt token ids of shape ``[batch, seq]``.
+            max_new_tokens: Maximum number of tokens to append.
+            temperature: Softmax temperature, applied only when ``do_sample`` is true.
+            top_k: If set, sample only among the ``top_k`` most likely tokens.
+            use_cache: Whether to reuse keys/values between steps.
+            past_key_values: Optional existing ``(keys, values)`` cache; passing
+                one implies ``use_cache=True``.
+            tokenizer: Supplies ``eos_token_id`` and ``pad_token_id``. Required.
+            do_sample: Sample from the distribution instead of taking the argmax.
+            **model_kwargs: Extra inputs such as ``attention_mask``; unknown keys
+                are ignored by ``prepare_inputs_for_generation``.
+
+        Returns:
+            Tensor ``[batch, seq + generated]``. Sequences that finished early
+            are filled with the pad token.
+
+        Raises:
+            ValueError: If ``tokenizer`` is missing, or ``temperature`` is not
+                positive while sampling.
+        """
+        if tokenizer is None:
+            raise ValueError("`tokenizer` is required by generate() to obtain the eos/pad token ids")
+        if do_sample and temperature <= 0:
+            raise ValueError("`temperature` must be positive when `do_sample` is True")
+
+        # A caller-supplied cache used to be dropped silently; honour it.
+        use_cache = use_cache or past_key_values is not None
+        if use_cache and past_key_values is None:
+            # Fresh cache: one list of keys and one list of values.
+            past_key_values = ([], [])
+        if past_key_values is not None:
+            model_kwargs["past_key_values"] = past_key_values
+        model_kwargs["use_cache"] = use_cache
+
+        eos_token_id = tokenizer.eos_token_id
+        pad_token_id = tokenizer.pad_token_id
+        if pad_token_id is None:
+            # Tokenizers without a pad token: reuse EOS for the filler.
+            pad_token_id = eos_token_id if eos_token_id is not None else 0
+
+        batch_size = input_ids.size(0)
+        finished = torch.zeros(batch_size, dtype=torch.bool, device=input_ids.device)
+
+        for _ in range(max_new_tokens):
+            model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)
+
+            # Only the logits of the last position decide the next token.
+            logits = self(**model_inputs, return_dict=True).logits[:, -1, :]
+
+            if top_k is not None:
+                # Clamp so that an oversized top_k does not make topk() fail.
+                k = min(top_k, logits.size(-1))
+                kth_best = torch.topk(logits, k)[0][..., -1, None]
+                logits = logits.masked_fill(logits < kth_best, -float('Inf'))
+
+            if do_sample:
+                probs = F.softmax(logits / temperature, dim=-1)
+                next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1)
+            else:
+                next_tokens = torch.argmax(logits, dim=-1)
+
+            # Sequences that already ended keep receiving the pad token.
+            next_tokens = next_tokens.masked_fill(finished, pad_token_id)
+            input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1)
+
+            if "attention_mask" in model_kwargs:
+                # The new token is always a real, visible token.
+                attention_mask = model_kwargs["attention_mask"]
+                model_kwargs["attention_mask"] = torch.cat(
+                    [attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1
+                )
+
+            if eos_token_id is not None:
+                finished |= next_tokens == eos_token_id
+
+            # Stop as soon as every sequence has produced EOS.
+            if finished.all():
+                break
+
+        return input_ids
@@ -0,0 +1,308 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "用我们搭建的模型尝试读取官方权重并预测"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/home/jeeves/.local/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+      "  from .autonotebook import tqdm as notebook_tqdm\n"
+     ]
+    }
+   ],
+   "source": [
+    "import json\n",
+    "import torch\n",
+    "from transformers import AutoTokenizer, AutoModelForCausalLM\n",
+    "from configuration_minicpm import MiniCPMConfig\n",
+    "from MiniCPM import MiniCPMForCausalLM\n",
+    "import logging\n",
+    "import gc\n",
+    "\n",
+    "# 配置日志\n",
+    "logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "加载模型 config"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "config_json = json.load(open(\"/data/workspace/llms-from-scratch-cn/Model_Architecture_Discussions/MiniCPM/config.json\"))\n",
+    "config = MiniCPMConfig(**config_json)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "按照 config 初始化模型，并查看模型结构"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2024-07-25 15:57:28,490 - INFO - 初始化模型\n",
+      "2024-07-25 15:57:50,064 - INFO - 模型：\n",
+      ": MiniCPMForCausalLM(\n",
+      "  (model): MiniCPMModel(\n",
+      "    (embed_tokens): Embedding(122753, 2304)\n",
+      "    (layers): ModuleList(\n",
+      "      (0-39): 40 x MiniCPMDecoderLayer(\n",
+      "        (self_attn): MiniCPMAttention(\n",
+      "          (q_proj): Linear(in_features=2304, out_features=2304, bias=False)\n",
+      "          (k_proj): Linear(in_features=2304, out_features=2304, bias=False)\n",
+      "          (v_proj): Linear(in_features=2304, out_features=2304, bias=False)\n",
+      "          (o_proj): Linear(in_features=2304, out_features=2304, bias=False)\n",
+      "          (rotary_emb): MiniCPMRotaryEmbedding()\n",
+      "        )\n",
+      "        (mlp): MiniCPMMLP(\n",
+      "          (gate_proj): Linear(in_features=2304, out_features=5760, bias=False)\n",
+      "          (up_proj): Linear(in_features=2304, out_features=5760, bias=False)\n",
+      "          (down_proj): Linear(in_features=5760, out_features=2304, bias=False)\n",
+      "          (act_fn): SiLU()\n",
+      "        )\n",
+      "        (input_layernorm): MiniCPMRMSNorm()\n",
+      "        (post_attention_layernorm): MiniCPMRMSNorm()\n",
+      "      )\n",
+      "    )\n",
+      "    (norm): MiniCPMRMSNorm()\n",
+      "  )\n",
+      "  (lm_head): Linear(in_features=2304, out_features=122753, bias=False)\n",
+      ")\n"
+     ]
+    }
+   ],
+   "source": [
+    "\n",
+    "try:\n",
+    "    logging.info(\"初始化模型\")\n",
+    "    model = MiniCPMForCausalLM(config=config).to('cuda')\n",
+    "    logging.info(\"模型：\\n: %s\", model)\n",
+    "except Exception as e:\n",
+    "    logging.error(f\"初始化模型时发生错误: {e}\")\n",
+    "    raise"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "读取模型权重"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2024-07-25 15:57:50,086 - INFO - 加载模型权重\n",
+      "2024-07-25 15:57:52,515 - INFO - 加载模型权重完成。\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "缺失的参数名: ['lm_head.weight']\n"
+     ]
+    }
+   ],
+   "source": [
+    "\n",
+    "path = \"/data/model/OpenBMB/MiniCPM-2B-dpo-bf16\"\n",
+    "\n",
+    "try:\n",
+    "    logging.info(\"加载模型权重\")\n",
+    "    params = torch.load(\n",
+    "        f=path + \"/pytorch_model.bin\",\n",
+    "        map_location=torch.device('cuda'),\n",
+    "        weights_only=True,  # 设置为True表示仅加载模型的权重。这通常用于加载预训练权重进行微调或预测，而不需要完整的模型结构\n",
+    "        mmap=True  # 使用内存映射方式加载模型文件，这可以提高加载大型模型文件的效率，特别是在有限的内存资源下\n",
+    "    )\n",
+    "    # 打印出模型参数和params中不一致的参数名\n",
+    "    missing_keys, unexpected_keys = model.load_state_dict(params, strict=False)\n",
+    "    # 打印缺失的参数名\n",
+    "    if missing_keys:\n",
+    "        print(\"缺失的参数名:\", missing_keys)\n",
+    "\n",
+    "    # 打印多余的参数名\n",
+    "    if unexpected_keys:\n",
+    "        print(\"多余的参数名:\", unexpected_keys)\n",
+    "    # modelV1 = AutoModelForCausalLM.from_pretrained(path, torch_dtype=torch.bfloat16, device_map='cuda', trust_remote_code=True)\n",
+    "    # 手动实现 tie embedding 即输入输出共享一个 Embedding\n",
+    "    model.get_output_embeddings().weight = model.get_input_embeddings().weight\n",
+    "    del params\n",
+    "    gc.collect()\n",
+    "    logging.info(\"加载模型权重完成。\")\n",
+    "except Exception as e:\n",
+    "    logging.error(f\"加载模型权重时发生错误: {e}\")\n",
+    "    raise"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "MiniCPM 采用了 tie-Embedding 的方式，即词嵌入层和输出层共享参数。这种方式可以减少模型的参数量，提高模型的训练效率。所以需要有获取和设置输入输出词嵌入层的方法。\n",
+    "我们可以看到在加载权重时缺失 `lm_head.weight` 的参数，这里我们通过手动设置 `model.get_output_embeddings().weight = model.get_input_embeddings().weight` 来共享参数。"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "使用默认的 tokenizer 分词"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2024-07-25 16:01:12,360 - INFO - 初始化分词器\n",
+      "2024-07-25 16:01:12,557 - INFO - 生成文本\n"
+     ]
+    }
+   ],
+   "source": [
+    "logging.info(\"初始化分词器\")\n",
+    "tokenizer = AutoTokenizer.from_pretrained(\"/data/model/OpenBMB/MiniCPM-2B-dpo-bf16/\")\n",
+    "\n",
+    "logging.info(\"生成文本\")\n",
+    "input_texts = [\"北京最高的山是哪座山?\", \"山东省最长的山是哪座山?\" ]\n",
+    "\n",
+    "tokenizer.pad_token_id=tokenizer.eos_token_id\n",
+    "\n",
+    "inputs = tokenizer(input_texts, padding=True, return_tensors=\"pt\").to('cuda')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "可以看出 MiniCPM 采用 tokenizer 为 `LlamaTokenizerFast`, 词表大小为 122753 个 token。"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "LlamaTokenizerFast(name_or_path='/data/model/OpenBMB/MiniCPM-2B-dpo-bf16/', vocab_size=122753, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '</s>'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={\n",
+      "\t0: AddedToken(\"<unk>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
+      "\t1: AddedToken(\"<s>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
+      "\t2: AddedToken(\"</s>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
+      "}\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(tokenizer)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "我们让模型输出结果看看"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2024-07-25 16:01:36,687 - INFO - 生成结果: 北京最高的山是哪座山?\n",
+      " 北京最高的山是香山。香山位于北京市海淀区，距离北京市中心约25公里，海拔572米。香山是北京市内最高峰\n",
+      "2024-07-25 16:01:36,687 - INFO - 生成结果: 山东省最长的山是哪座山?\n",
+      " 目前，山东省最长的山是泰山。泰山，位于山东省中部，是五岳之一，也是中国著名的山脉之一。泰山是中国著名的山脉之一\n"
+     ]
+    }
+   ],
+   "source": [
+    "generate_input = {\n",
+    "    \"input_ids\": inputs.input_ids,\n",
+    "    \"attention_mask\": inputs.attention_mask,\n",
+    "    \"max_new_tokens\": 32,\n",
+    "    \"temperature\": 1,\n",
+    "    \"tokenizer\": tokenizer,\n",
+    "}\n",
+    "model.eval()\n",
+    "outputs = model.generate(**generate_input)\n",
+    "for output in outputs:\n",
+    "    result = tokenizer.decode(output, skip_special_tokens=True)\n",
+    "    logging.info(f\"生成结果: {result}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "可以看出输出的结果还可以"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
@@ -0,0 +1,151 @@
+---
+language:
+- en
+- zh
+tags:
+- MiniCPM
+- ModelBest
+- THUNLP
+---
+
+
+<div align="center">
+<h1>
+  MiniCPM
+</h1>
+</div>
+
+<p align="center">
+<a href="https://shengdinghu.notion.site/MiniCPM-c805a17c5c8046398914e47f0542095a?pvs=4" target="_blank">MiniCPM 技术报告</a><a href="https://shengdinghu.notion.site/MiniCPM-Unveiling-the-Potential-of-End-side-Large-Language-Models-d4d3a8c426424654a4e80e42a711cb20?pvs=4" target="_blank"> Technical Report</a> |
+<a href="https://github.com/OpenBMB/OmniLMM/" target="_blank">OmniLMM 多模态模型 Multi-modal Model</a> |
+<a href="https://luca.cn/" target="_blank">CPM-C 千亿模型试用 ~100B Model Trial </a> 
+</p>
+
+MiniCPM 是面壁与清华大学自然语言处理实验室共同开源的系列端侧语言大模型，主体语言模型 MiniCPM-2B 仅有 24亿（2.4B）的非词嵌入参数量。
+- 经过 SFT 后，MiniCPM 在公开综合性评测集上，MiniCPM 与 Mistral-7B相近（中文、数学、代码能力更优），整体性能超越 Llama2-13B、MPT-30B、Falcon-40B 等模型。
+- 经过 DPO 后，MiniCPM 在当前最接近用户体感的评测集 MTBench上，MiniCPM-2B 也超越了 Llama2-70B-Chat、Vicuna-33B、Mistral-7B-Instruct-v0.1、Zephyr-7B-alpha 等众多代表性开源大模型。
+- 以 MiniCPM-2B 为基础构建端侧多模态大模型 MiniCPM-V，整体性能在同规模模型中实现最佳，超越基于 Phi-2 构建的现有多模态大模型，在部分评测集上达到与 9.6B Qwen-VL-Chat 相当甚至更好的性能。
+- 经过 Int4 量化后，MiniCPM 可在手机上进行部署推理，流式输出速度略高于人类说话速度。MiniCPM-V 也首次跑通了多模态大模型在手机上的部署。
+- 一张1080/2080可高效参数微调，一张3090/4090可全参数微调，一台机器可持续训练 MiniCPM，二次开发成本较低。
+
+我们将完全开源MiniCPM-2B的模型参数供学术研究和有限商用，以及训练过程中的所有Checkpoint和大部分非专有数据供模型机理研究。
+
+- 基于MiniCPM-2B的指令微调与人类偏好对齐版本**MiniCPM-2B-SFT/DPO。**
+- 基于MiniCPM-2B的多模态模型**MiniCPM-V**，能力超越基于Phi-2的同参数级别多模态模型**。**
+- MiniCPM-2B-SFT/DPO的Int4量化版**MiniCPM-2B-SFT/DPO-Int4。**
+- 基于MLC-LLM、LLMFarm开发的MiniCPM手机端程序，**文本及多模态模型均可在手机端进行推理。**
+
+
+MiniCPM is an End-Side LLM developed by ModelBest Inc. and TsinghuaNLP, with only 2.4B parameters excluding embeddings.
+
+- MiniCPM has very close performance compared with Mistral-7B on open-sourced general benchmarks with better ability on Chinese, Mathematics and Coding after SFT. The overall performance exceeds Llama2-13B, MPT-30B, Falcon-40B, etc.
+- After DPO, MiniCPM outperforms Llama2-70B-Chat, Vicuna-33B, Mistral-7B-Instruct-v0.1, Zephyr-7B-alpha, etc. on MTBench.
+- MiniCPM-V, based on MiniCPM-2B, achieves the best overall performance among multimodal models of the same scale, surpassing existing multimodal large models built on Phi-2 and achieving performance comparable to or even better than 9.6B Qwen-VL-Chat on some tasks.
+- After Int4 quantization, MiniCPM can be deployed and run inference on smartphones, with a streaming output speed slightly higher than human speaking speed. MiniCPM-V is also the first multimodal model to be deployed on smartphones.
+- The cost of developing based on MiniCPM is low. Parameter efficient finetuning can be conducted with a single 1080/2080 GPU and full parameter finetuning can be conducted with a 3090/4090 GPU.
+
+We release all model parameters for research and limited commercial use. We also release all the checkpoints during training and most public training data for research on model mechanism.
+
+- SFT and DPO version based on MiniCPM-2B and human preference: **MiniCPM-2B-SFT/DPO**
+- The multi-modal model **MiniCPM-V** based on MiniCPM-2B, which outperforms multi-modal models of similar size built on Phi-2
+- The INT4 quantized version **MiniCPM-2B-SFT/DPO-Int4** based on MiniCPM-2B-SFT/DPO
+- Mobile phone application based on MLC-LLM and LLMFarm. Both language models and multimodal models can conduct inference on smartphones.
+
+### 评测结果 Evaluation Results
+
+  详细的评测结果位于[github仓库](https://github.com/OpenBMB/MiniCPM?tab=readme-ov-file#%E8%AF%84%E6%B5%8B%E7%BB%93%E6%9E%9C)
+
+  Detailed evaluation results are in [github repo](https://github.com/OpenBMB/MiniCPM/blob/main/README-en.md#evaluation-results)
+
+  注意：我们发现使用Huggingface生成质量略差于vLLM，因此推荐使用vLLM进行测试。我们正在排查原因。
+
+  Notice: We discovered that the quality of Huggingface generation is slightly lower than vLLM, thus benchmarking using vLLM is recommended. 
+  We are investigating the cause now.
+
+### 局限性 Limitations
+
+- 受限于模型规模，模型可能出现幻觉性问题。其中由于DPO模型生成的回复内容更长，更容易出现幻觉。我们也将持续进行MiniCPM模型的迭代改进；
+- 为了保证在学术研究用途上模型的通用性，我们未对模型进行任何身份认同训练。同时由于我们用ShareGPT开源语料作为部分训练数据，模型可能会输出类似GPT系列模型的身份认同信息；
+- 受限于模型规模，模型的输出受到提示词（prompt）的影响较大，可能多次尝试产生不一致的结果；
+- 受限于模型容量，模型的知识记忆较不准确，后续我们将结合RAG方法来增强模型的知识记忆能力。
+
+- Due to limitations in model size, the model may experience hallucinatory issues. As DPO models tend to generate longer responses, hallucinations are more likely to occur. We will also continue to iterate and improve the MiniCPM model.
+- To ensure the universality of the model for academic research purposes, we did not conduct any identity training on the model. Meanwhile, as we use ShareGPT open-source corpus as part of the training data, the model may output identity information similar to the GPT series models.
+- Due to the limitation of model size, the output of the model is greatly influenced by prompt words, which may result in inconsistent results from multiple attempts.
+- Due to limited model capacity, the model's knowledge memory is not accurate. In the future, we will combine the RAG method to enhance the model's knowledge memory ability.
+
+## 模型下载 Download
+ 
+  | HuggingFace | ModelScope | WiseModel |
+  |-------------|------------|-----------|
+  |[sft-bf16](https://huggingface.co/openbmb/MiniCPM-2B-sft-bf16)|[sft-bf16](https://modelscope.cn/models/OpenBMB/miniCPM-bf16)|[sft-bf16](https://wisemodel.cn/models/OpenBMB/miniCPM-bf16)
+  |[sft-fp32](https://huggingface.co/openbmb/MiniCPM-2B-sft-fp32)|[sft-fp32](https://modelscope.cn/models/OpenBMB/MiniCPM-2B-sft-fp32)|[sft-fp32](https://wisemodel.cn/models/OpenBMB/miniCPM-dpo-fp32)
+  |[dpo-bf16](https://huggingface.co/openbmb/MiniCPM-2B-dpo-bf16)|[dpo-bf16](https://modelscope.cn/models/OpenBMB/MiniCPM-2B-dpo-bf16/summary)|[dpo-bf16](https://wisemodel.cn/models/OpenBMB/MiniCPM-2B-dpo-bf16)
+  |[dpo-fp16](https://huggingface.co/openbmb/MiniCPM-2B-dpo-fp16)|[dpo-fp16](https://modelscope.cn/models/OpenBMB/MiniCPM-2B-dpo-fp16/)|[dpo-fp16](https://wisemodel.cn/models/OpenBMB/MiniCPM-2B-dpo-fp16)
+  |[dpo-fp32](https://huggingface.co/openbmb/MiniCPM-2B-dpo-fp32)|[dpo-fp32](https://modelscope.cn/models/OpenBMB/MiniCPM-2B-dpo-fp32)|[dpo-fp32](https://wisemodel.cn/models/OpenBMB/miniCPM-dpo-fp32)
+
+## 模型使用 Usage
+
+* 安装`transformers>=4.36.0`以及`accelerate`后，运行以下代码
+* 注意：需要在`from_pretrained`中明确指明模型的数据类型，否则会引起较大计算误差
+* Run the following code after installing `transformers>=4.36.0` and `accelerate`
+* Warning: It is necessary to specify the data type of the model clearly in 'from_pretrained', otherwise large calculation errors will be caused 
+```python
+from modelscope import AutoModelForCausalLM, AutoTokenizer
+import torch
+torch.manual_seed(0)
+
+path = 'OpenBMB/MiniCPM-2B-dpo-bf16'
+tokenizer = AutoTokenizer.from_pretrained(path)
+model = AutoModelForCausalLM.from_pretrained(path, torch_dtype=torch.bfloat16, device_map='cuda', trust_remote_code=True)
+
+responds, history = model.chat(tokenizer, "山东省最高的山是哪座山, 它比黄山高还是矮？差距多少？", temperature=0.8, top_p=0.8)
+print(responds)
+```
+
+* 期望输出 Expected Output
+```shell
+山东省最高的山是泰山，海拔1545米。
+
+相对于黄山（海拔1864米），泰山海拔较低，相差约319米。
+```
+
+## 开源协议 LICENSE
+
+#### 模型协议 Model LICENSE
+
+* 本仓库中代码依照 [Apache-2.0](https://github.com/OpenBMB/MiniCPM/blob/main/LICENSE) 协议开源
+* MiniCPM 模型权重的使用则需要遵循 [“通用模型许可协议-来源说明-宣传限制-商业授权”](https://github.com/OpenBMB/General-Model-License/blob/main/%E9%80%9A%E7%94%A8%E6%A8%A1%E5%9E%8B%E8%AE%B8%E5%8F%AF%E5%8D%8F%E8%AE%AE-%E6%9D%A5%E6%BA%90%E8%AF%B4%E6%98%8E-%E5%AE%A3%E4%BC%A0%E9%99%90%E5%88%B6-%E5%95%86%E4%B8%9A%E6%8E%88%E6%9D%83.md)。
+* MiniCPM 模型权重对学术研究完全开放。
+* 如需将模型用于商业用途，请联系cpm@modelbest.cn来获取书面授权，在登记后亦允许免费商业使用。
+
+* This repository is released under the [Apache-2.0](https://github.com/OpenBMB/MiniCPM/blob/main/LICENSE) License. 
+* The usage of MiniCPM model weights must strictly follow [the General Model License (GML)](https://github.com/OpenBMB/General-Model-License/blob/main/%E9%80%9A%E7%94%A8%E6%A8%A1%E5%9E%8B%E8%AE%B8%E5%8F%AF%E5%8D%8F%E8%AE%AE-%E6%9D%A5%E6%BA%90%E8%AF%B4%E6%98%8E-%E5%AE%A3%E4%BC%A0%E9%99%90%E5%88%B6-%E5%95%86%E4%B8%9A%E6%8E%88%E6%9D%83.md).
+* The models and weights of MiniCPM are completely free for academic research.
+* If you intend to utilize the model for commercial purposes, please reach out to cpm@modelbest.cn to obtain the certificate of authorization.
+
+#### 声明 Statement
+
+* 作为一个语言模型，MiniCPM 通过学习大量的文本来生成内容，但它无法理解、表达个人观点或价值判断，它所输出的任何内容都不代表模型开发者的观点和立场。
+* 因此用户在使用 MiniCPM 生成的内容时，应自行负责对其进行评估和验证。
+* 如果由于使用 MiniCPM 开源模型而导致的任何问题，包括但不限于数据安全问题、公共舆论风险，或模型被误导、滥用、传播或不当利用所带来的任何风险和问题，我们将不承担任何责任。
+
+* As a language model, MiniCPM generates content by learning from a vast amount of text. 
+* However, it does not possess the ability to comprehend or express personal opinions or value judgments. 
+* Any content generated by MiniCPM does not represent the viewpoints or positions of the model developers. 
+* Therefore, when using content generated by MiniCPM, users should take full responsibility for evaluating and verifying it on their own.
+
+<p id="8"></p>
+
+## 工作引用 Citation
+
+* 如果觉得MiniCPM有助于您的工作，请考虑引用下列[技术报告](https://shengdinghu.notion.site/MiniCPM-c805a17c5c8046398914e47f0542095a?pvs=4)
+* Please cite our [technical report](https://shengdinghu.notion.site/MiniCPM-Unveiling-the-Potential-of-End-side-Large-Language-Models-d4d3a8c426424654a4e80e42a711cb20?pvs=4) if you find our work valuable.
+
+```
+@inproceedings{minicpm2024,
+ title={MiniCPM：Unveiling the Potential of End-side Large Language Models},
+ booktitle={OpenBMB Blog},
+ year={2024}
+}
+```
@@ -0,0 +1,28 @@
+{
+    "_name_or_path": "openbmb/CPM-2B",
+    "architectures": [
+        "MiniCPMForCausalLM"
+    ],
+    "do_sample": false,
+    "temperature": 1,
+    "bos_token_id": 1,
+    "eos_token_id": 2,
+    "hidden_act": "silu",
+    "hidden_size": 2304,
+    "initializer_range": 0.1,
+    "intermediate_size": 5760,
+    "max_position_embeddings": 2048,
+    "num_attention_heads": 36,
+    "num_hidden_layers": 40,
+    "num_key_value_heads": 36,
+    "rms_norm_eps": 1e-05,
+    "rope_scaling": null,
+    "torch_dtype": "bfloat16",
+    "transformers_version": "4.36.0",
+    "use_cache": true,
+    "vocab_size": 122753,
+    "scale_emb": 12,
+    "dim_model_base": 256,
+    "scale_depth": 1.4,
+    "_attn_implementation": "eager"
+}
@@ -0,0 +1,75 @@
+import logging
+import json
+
+logger = logging.getLogger(__name__)
+
+class MiniCPMConfig():
+
+    model_type = "minicpm"
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    def __init__(
+        self,
+        vocab_size=32000,
+        hidden_size=4096,
+        intermediate_size=11008,
+        num_hidden_layers=32,
+        num_attention_heads=32,
+        num_key_value_heads=None,
+        hidden_act="silu",
+        max_position_embeddings=2048,
+        initializer_range=0.02,
+        rms_norm_eps=1e-6,
+        use_cache=False,
+        pad_token_id=None,
+        bos_token_id=1,
+        eos_token_id=2,
+        pretraining_tp=1,
+        tie_word_embeddings=True,
+        rope_theta=10000.0,
+        rope_scaling=None,
+        attention_bias=False,
+        attention_dropout=0.0,
+        scale_emb=1,
+        dim_model_base=1,
+        scale_depth=1,
+        output_attentions=False,
+        output_hidden_states=False,
+        return_dict=True,
+        use_return_dict=True,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.pretraining_tp = pretraining_tp
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        self.attention_bias = attention_bias
+        self.attention_dropout = attention_dropout
+        self.scale_emb = scale_emb
+        self.dim_model_base = dim_model_base
+        self.scale_depth = scale_depth
+        self.pad_token_id=pad_token_id
+        self.bos_token_id=bos_token_id
+        self.eos_token_id=eos_token_id
+        self.tie_word_embeddings=tie_word_embeddings
+        self.output_attentions=output_attentions
+        self.output_hidden_states=output_hidden_states
+        self.return_dict=return_dict
+        self.use_return_dict=use_return_dict
+        
+    def to_json_string(self) -> str:
+        config_dict = self.__dict__
+        return json.dumps(config_dict, indent=2, sort_keys=True) + "\n"
+
+    def __repr__(self):
+        return f"{self.__class__.__name__} {self.to_json_string()}"
@@ -0,0 +1,7 @@
+{
+    "do_sample": true,
+    "top_p": 0.8,
+    "temperature": 0.8,
+    "bos_token_id": 1,
+    "eos_token_id": 2
+}
@@ -0,0 +1,35 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bin.* filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zstandard filter=lfs diff=lfs merge=lfs -text
+*.tfevents* filter=lfs diff=lfs merge=lfs -text
+*.db* filter=lfs diff=lfs merge=lfs -text
+*.ark* filter=lfs diff=lfs merge=lfs -text
+**/*ckpt*data* filter=lfs diff=lfs merge=lfs -text
+**/*ckpt*.meta filter=lfs diff=lfs merge=lfs -text
+**/*ckpt*.index filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+pytorch_model.bin filter=lfs diff=lfs merge=lfs -text
@@ -0,0 +1,23 @@
+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
@@ -0,0 +1,42 @@
+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<s>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "</s>",
+  "legacy": true,
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": null,
+  "sp_model_kwargs": {},
+  "spaces_between_special_tokens": false,
+  "tokenizer_class": "LlamaTokenizer",
+  "unk_token": "<unk>",
+  "use_default_system_prompt": false,
+  "chat_template": "{% for message in messages %}{% if message['role'] == 'user' %}{{'<用户>' + message['content'].strip() + '<AI>'}}{% else %}{{message['content'].strip()}}{% endif %}{% endfor %}"
+}