Update some explanatory comments in the code

2026-06-06 00:04:50 +00:00 · 2024-09-20 17:04:16 +08:00
parent b4170e3766
commit ee218402cd
13 changed files with 19932 additions and 338 deletions
@@ -1,58 +1,58 @@
 from transformers import PretrainedConfig
 from typing import List

-# 定义 LMConfig 类，继承自 PretrainedConfig
+
 class LMConfig(PretrainedConfig):
-    model_type = "minimind"  # 设置模型类型为 "minimind"
+    model_type = "minimind"

    def __init__(
            self,
-            dim: int = 512,  # 模型维度，默认为 512
-            n_layers: int = 8,  # Transformer 层数，默认为 8
-            n_heads: int = 16,  # 注意力头数，默认为 16
-            n_kv_heads: int = 8,  # KV 头数，默认为 8
-            vocab_size: int = 6400,  # 词汇表大小，默认为 6400
-            hidden_dim: int = None,  # 隐藏层维度，默认为 None
-            multiple_of: int = 64,  # 隐藏层维度的倍数，默认为 64
-            norm_eps: float = 1e-5,  # 归一化层的 epsilon 值，默认为 1e-5
-            max_seq_len: int = 512,  # 最大序列长度，默认为 512
-            dropout: float = 0.0,  # Dropout 概率，默认为 0.0
-            flash_attn: bool = True,  # 是否使用 Flash Attention，默认为 True
+            dim: int = 768,
+            n_layers: int = 16,
+            n_heads: int = 16,
+            n_kv_heads: int = 8,
+            vocab_size: int = 6400,
+            hidden_dim: int = None,
+            multiple_of: int = 64,
+            norm_eps: float = 1e-5,
+            max_seq_len: int = 512,
+            dropout: float = 0.0,
+            flash_attn: bool = True,
            ####################################################
-            # 以下是 MOE（Mixture of Experts）的特定配置
-            # 当 use_moe 为 False 时，以下配置无效
+            # MoE (Mixture of Experts) specific configuration.
+            # When use_moe is False, the settings below have no effect.
            ####################################################
-            use_moe: bool = False,  # 是否使用 MOE，默认为 False
-            num_experts_per_tok=2,  # 每个 token 选择的专家数量，默认为 2
-            n_routed_experts=4,  # 总的专家数量，默认为 4
-            n_shared_experts: bool = True,  # 是否使用共享专家，默认为 True
-            scoring_func='softmax',  # 评分函数，默认为 'softmax'
-            aux_loss_alpha=0.01,  # 辅助损失的 alpha 参数，默认为 0.01
-            seq_aux=True,  # 是否在序列级别上计算辅助损失，默认为 True
-            norm_topk_prob=True,  # 是否标准化 top-k 概率，默认为 True
+            use_moe: bool = False,
+            num_experts_per_tok=2,
+            n_routed_experts=4,
+            n_shared_experts: bool = True,
+            scoring_func='softmax',
+            aux_loss_alpha=0.01,
+            seq_aux=True,
+            norm_topk_prob=True,
            **kwargs,
    ):
-        self.dim = dim  # 设置模型维度
-        self.n_layers = n_layers  # 设置 Transformer 层数
-        self.n_heads = n_heads  # 设置注意力头数
-        self.n_kv_heads = n_kv_heads  # 设置 KV 头数
-        self.vocab_size = vocab_size  # 设置词汇表大小
-        self.hidden_dim = hidden_dim  # 设置隐藏层维度
-        self.multiple_of = multiple_of  # 设置隐藏层维度的倍数
-        self.norm_eps = norm_eps  # 设置归一化层的 epsilon 值
-        self.max_seq_len = max_seq_len  # 设置最大序列长度
-        self.dropout = dropout  # 设置 Dropout 概率
-        self.flash_attn = flash_attn  # 设置是否使用 Flash Attention
+        self.dim = dim
+        self.n_layers = n_layers
+        self.n_heads = n_heads
+        self.n_kv_heads = n_kv_heads
+        self.vocab_size = vocab_size
+        self.hidden_dim = hidden_dim
+        self.multiple_of = multiple_of
+        self.norm_eps = norm_eps
+        self.max_seq_len = max_seq_len
+        self.dropout = dropout
+        self.flash_attn = flash_attn
        ####################################################
-        # 以下是 MOE（Mixture of Experts）的特定配置
-        # 当 use_moe 为 False 时，以下配置无效
+        # MoE (Mixture of Experts) specific configuration.
+        # When use_moe is False, the settings below have no effect.
        ####################################################
-        self.use_moe = use_moe  # 设置是否使用 MOE
-        self.num_experts_per_tok = num_experts_per_tok  # 设置每个 token 选择的专家数量
-        self.n_routed_experts = n_routed_experts  # 设置总的专家数量
-        self.n_shared_experts = n_shared_experts  # 设置是否使用共享专家
-        self.scoring_func = scoring_func  # 设置评分函数
-        self.aux_loss_alpha = aux_loss_alpha  # 设置辅助损失的 alpha 参数
-        self.seq_aux = seq_aux  # 设置是否在序列级别上计算辅助损失
-        self.norm_topk_prob = norm_topk_prob  # 设置是否标准化 top-k 概率
-        super().__init__(**kwargs)  # 调用父类 PretrainedConfig 的初始化方法
+        self.use_moe = use_moe
+        self.num_experts_per_tok = num_experts_per_tok  # 每个token选择的专家数量
+        self.n_routed_experts = n_routed_experts  # 总的专家数量
+        self.n_shared_experts = n_shared_experts  # 共享专家
+        self.scoring_func = scoring_func  # 评分函数，默认为'softmax'
+        self.aux_loss_alpha = aux_loss_alpha  # 辅助损失的alpha参数
+        self.seq_aux = seq_aux  # 是否在序列级别上计算辅助损失
+        self.norm_topk_prob = norm_topk_prob  # 是否标准化top-k概率
+        super().__init__(**kwargs)
@@ -9,79 +9,79 @@ import torch
 from sklearn.model_selection import train_test_split
 import os

-os.environ["TOKENIZERS_PARALLELISM"] = "false"  # 禁用 tokenizer 的并行处理
+os.environ["TOKENIZERS_PARALLELISM"] = "false"
+

-# 定义 PretrainDataset 类，继承自 Dataset
 class PretrainDataset(Dataset):
    def __init__(self, data_path_lst, max_length=512, memmap=False):
        super().__init__()
-        # 如果使用内存映射（memmap）
+        #
        if memmap:
            with open(data_path_lst[0], 'r') as f:
-                nbytes = f.seek(0, 2)  # 获取文件总字节数
-                flen = f.tell() // np.dtype('uint16').itemsize  # 计算文件长度
-            self.data = np.memmap(data_path_lst[0], dtype=np.dtype('uint16'), shape=(flen // max_length, max_length))  # 使用内存映射加载数据
+                nbytes = f.seek(0, 2)
+                flen = f.tell() // np.dtype('uint16').itemsize
+            self.data = np.memmap(data_path_lst[0], dtype=np.dtype('uint16'), shape=(flen // max_length, max_length))
        else:
            data_lst = []
            for data_path in data_path_lst:
                with open(data_path, 'rb') as f:
-                    data = np.fromfile(f, dtype=np.uint16)  # 从文件中读取数据
+                    data = np.fromfile(f, dtype=np.uint16)
                    data_lst.append(data)
-            data = np.concatenate(data_lst)  # 合并所有数据
-            data = data[:max_length * int(len(data) / max_length)]  # 截取数据
-            # np.random.shuffle(data)  # 打乱数据（注释掉了）
-            self.data = data.reshape(-1, max_length)  # 将数据重塑为 (样本数, 最大长度) 的形状
-        # 打印数据形状
+            data = np.concatenate(data_lst)
+            data = data[:max_length * int(len(data) / max_length)]
+            # np.random.shuffle(data)
+            self.data = data.reshape(-1, max_length)
+        #
        print("memmap:{} train data.shape:{}".format(memmap, self.data.shape))
        print("downloading finished.....")

    def __len__(self):
-        return self.data.shape[0]  # 返回数据集的长度
+        return self.data.shape[0]

    def __getitem__(self, index: int):
-        # 获取指定索引的样本
+        #
        sample = self.data[index]
-        X = np.array(sample[:-1]).astype(np.int64)  # 输入数据（去掉最后一个 token）
-        Y = np.array(sample[1:]).astype(np.int64)  # 目标数据（去掉第一个 token）
+        X = np.array(sample[:-1]).astype(np.int64)
+        Y = np.array(sample[1:]).astype(np.int64)
+
+        return torch.from_numpy(X), torch.from_numpy(Y)

-        return torch.from_numpy(X), torch.from_numpy(Y)  # 返回 PyTorch 张量

-# 定义 SFTDataset 类，继承自 Dataset
 class SFTDataset(Dataset):
    def __init__(self, df, tokenizer, max_length=1024, prompt_max_len=512, answer_max_len=256):
        super().__init__()
-        self.df = df  # 数据框
-        self.max_length = max_length  # 最大序列长度
-        self.prompt_max_len = prompt_max_len  # 提示的最大长度
-        self.answer_max_len = answer_max_len  # 回答的最大长度
+        self.df = df
+        self.max_length = max_length
+        self.prompt_max_len = prompt_max_len
+        self.answer_max_len = answer_max_len
        #
-        self.tokenizer = tokenizer  # 分词器
-        self.padding = 0  # 填充 token ID
-        self.bos_id = self.tokenizer('<s>assistant').data['input_ids']  # 开始 token ID
+        self.tokenizer = tokenizer
+        self.padding = 0  # self.tokenizer.special_tokens['<pad>']
+        self.bos_id = self.tokenizer('<s>assistant').data['input_ids']

    def __len__(self):
-        return self.df.shape[0]  # 返回数据集的长度
+        return self.df.shape[0]

    def find_sublist_index(self, main_list, sub_list) -> int:
        last_index = -1
        for i in range(len(main_list) - len(sub_list) + 1):
            if main_list[i:i + len(sub_list)] == sub_list:
                last_index = i
-        return last_index  # 查找子列表在主列表中的最后一个索引
+        return last_index

    def safe_eval(self, s):
        try:
            res = eval(s)
        except Exception as e:
            return []
-        return res  # 安全地执行 eval 函数
+        return res

    def __getitem__(self, index: int):
-        # 获取指定索引的样本
+        #
        sample = self.df.iloc[index]
-        history = self.safe_eval(sample['history'])  # 获取历史对话
-        q = str(sample['q'])  # 获取问题
-        a = str(sample['a'])  # 获取回答
+        history = self.safe_eval(sample['history'])
+        q = str(sample['q'])
+        a = str(sample['a'])

        messages = []
        for history_message in history:
@@ -102,29 +102,29 @@ class SFTDataset(Dataset):
            messages,
            tokenize=False,
            add_generation_prompt=True
-        )  # 生成新的提示
-        input_id = self.tokenizer(new_prompt).data['input_ids'][:self.max_length]  # 分词并截取
+        )
+        input_id = self.tokenizer(new_prompt).data['input_ids'][:self.max_length]

        # 实际长度
        question_length = self.find_sublist_index(input_id, self.bos_id) + len(self.bos_id)
        # 没满最大长度的剩余部分
        padding_len = self.max_length - len(input_id)
-        input_id = input_id + [self.padding] * padding_len  # 填充到最大长度
+        input_id = input_id + [self.padding] * padding_len
        mask_len = len(input_id) - question_length - padding_len
        # 0表示不计算损失
        loss_mask = [0] * question_length + [1] * (mask_len) + [0] * padding_len

        input_id = np.array(input_id)
-        X = np.array(input_id[:-1]).astype(np.int64)  # 输入数据（去掉最后一个 token）
-        Y = np.array(input_id[1:]).astype(np.int64)  # 目标数据（去掉第一个 token）
-        loss_mask = np.array(loss_mask[1:]).astype(np.int64)  # 损失掩码
+        X = np.array(input_id[:-1]).astype(np.int64)
+        Y = np.array(input_id[1:]).astype(np.int64)
+        loss_mask = np.array(loss_mask[1:]).astype(np.int64)

        X_tensor = torch.from_numpy(X)
        Y_tensor = torch.from_numpy(Y)
        loss_mask_tensor = torch.from_numpy(loss_mask)

-        return X_tensor, Y_tensor, loss_mask_tensor  # 返回 PyTorch 张量
+        return X_tensor, Y_tensor, loss_mask_tensor
+

-# 主函数
 if __name__ == "__main__":
-    pass
+    pass
@@ -10,29 +10,29 @@ from torch import nn
 from transformers import PreTrainedModel
 from transformers.modeling_outputs import CausalLMOutputWithPast

-# 定义 RMSNorm 类，实现一种归一化方法，类似于 LayerNorm，但计算方式不同
+
 class RMSNorm(torch.nn.Module):
    def __init__(self, dim: int, eps: float):
        super().__init__()
-        self.eps = eps  # 设置 epsilon，防止除零错误
-        self.weight = nn.Parameter(torch.ones(dim))  # 初始化权重参数
+        self.eps = eps
+        self.weight = nn.Parameter(torch.ones(dim))

    def _norm(self, x):
-        return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)  # 计算 RMSNorm
+        return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)

    def forward(self, x):
-        output = self._norm(x.float()).type_as(x)  # 应用 RMSNorm
-        return output * self.weight  # 乘以权重参数
+        output = self._norm(x.float()).type_as(x)
+        return output * self.weight
+

-# 定义 precompute_pos_cis 函数，用于预计算位置编码的复数形式
 def precompute_pos_cis(dim: int, end: int, theta: float = 10000.0):
-    freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim))  # 计算频率
-    t = torch.arange(end, device=freqs.device)  # 生成时间序列
-    freqs = torch.outer(t, freqs).float()  # 计算外积
-    pos_cis = torch.polar(torch.ones_like(freqs), freqs)  # 计算复数形式的位置编码
+    freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim))
+    t = torch.arange(end, device=freqs.device)  # type: ignore
+    freqs = torch.outer(t, freqs).float()  # type: ignore
+    pos_cis = torch.polar(torch.ones_like(freqs), freqs)  # complex64
    return pos_cis

-# 定义 apply_rotary_emb 函数，用于应用旋转位置编码
+
 def apply_rotary_emb(xq, xk, pos_cis):
    def unite_shape(pos_cis, x):
        ndim = x.ndim
@@ -41,14 +41,14 @@ def apply_rotary_emb(xq, xk, pos_cis):
        shape = [d if i == 1 or i == ndim - 1 else 1 for i, d in enumerate(x.shape)]
        return pos_cis.view(*shape)

-    xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))  # 将 xq 转换为复数形式
-    xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))  # 将 xk 转换为复数形式
-    pos_cis = unite_shape(pos_cis, xq_)  # 调整 pos_cis 的形状
-    xq_out = torch.view_as_real(xq_ * pos_cis).flatten(3)  # 应用旋转位置编码
-    xk_out = torch.view_as_real(xk_ * pos_cis).flatten(3)  # 应用旋转位置编码
-    return xq_out.type_as(xq), xk_out.type_as(xk)  # 返回结果
+    xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))
+    xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))
+    pos_cis = unite_shape(pos_cis, xq_)
+    xq_out = torch.view_as_real(xq_ * pos_cis).flatten(3)
+    xk_out = torch.view_as_real(xk_ * pos_cis).flatten(3)
+    return xq_out.type_as(xq), xk_out.type_as(xk)
+

-# 定义 repeat_kv 函数，用于重复 KV 头的值
 def repeat_kv(x: torch.Tensor, n_rep: int) -> torch.Tensor:
    """torch.repeat_interleave(x, dim=2, repeats=n_rep)"""
    bs, slen, n_kv_heads, head_dim = x.shape
@@ -60,130 +60,130 @@ def repeat_kv(x: torch.Tensor, n_rep: int) -> torch.Tensor:
        .reshape(bs, slen, n_kv_heads * n_rep, head_dim)
    )

-# 定义 Attention 类，实现自注意力机制
+
 class Attention(nn.Module):
    def __init__(self, args: LMConfig):
        super().__init__()
-        self.n_kv_heads = args.n_heads if args.n_kv_heads is None else args.n_kv_heads  # 设置 KV 头的数量
-        assert args.n_heads % self.n_kv_heads == 0  # 确保 KV 头的数量是总头数的因数
-        self.n_local_heads = args.n_heads  # 设置本地头的数量
-        self.n_local_kv_heads = self.n_kv_heads  # 设置本地 KV 头的数量
-        self.n_rep = self.n_local_heads // self.n_local_kv_heads  # 计算重复次数
-        self.head_dim = args.dim // args.n_heads  # 计算每个头的维度
-        self.wq = nn.Linear(args.dim, args.n_heads * self.head_dim, bias=False)  # 初始化 Q 矩阵
-        self.wk = nn.Linear(args.dim, self.n_kv_heads * self.head_dim, bias=False)  # 初始化 K 矩阵
-        self.wv = nn.Linear(args.dim, self.n_kv_heads * self.head_dim, bias=False)  # 初始化 V 矩阵
-        self.wo = nn.Linear(args.n_heads * self.head_dim, args.dim, bias=False)  # 初始化输出矩阵
-        self.k_cache, self.v_cache = None, None  # 初始化 KV 缓存
-        self.attn_dropout = nn.Dropout(args.dropout)  # 初始化注意力 dropout
-        self.resid_dropout = nn.Dropout(args.dropout)  # 初始化残差 dropout
-        self.dropout = args.dropout  # 设置 dropout 概率
-        self.flash = hasattr(torch.nn.functional, 'scaled_dot_product_attention') and args.flash_attn  # 判断是否使用 Flash Attention
+        self.n_kv_heads = args.n_heads if args.n_kv_heads is None else args.n_kv_heads
+        assert args.n_heads % self.n_kv_heads == 0
+        self.n_local_heads = args.n_heads
+        self.n_local_kv_heads = self.n_kv_heads
+        self.n_rep = self.n_local_heads // self.n_local_kv_heads
+        self.head_dim = args.dim // args.n_heads
+        self.wq = nn.Linear(args.dim, args.n_heads * self.head_dim, bias=False)
+        self.wk = nn.Linear(args.dim, self.n_kv_heads * self.head_dim, bias=False)
+        self.wv = nn.Linear(args.dim, self.n_kv_heads * self.head_dim, bias=False)
+        self.wo = nn.Linear(args.n_heads * self.head_dim, args.dim, bias=False)
+        self.k_cache, self.v_cache = None, None
+        self.attn_dropout = nn.Dropout(args.dropout)
+        self.resid_dropout = nn.Dropout(args.dropout)
+        self.dropout = args.dropout
+        self.flash = hasattr(torch.nn.functional, 'scaled_dot_product_attention') and args.flash_attn

        if not self.flash:
            # print("WARNING: using slow attention. Flash Attention requires PyTorch >= 2.0")
-            mask = torch.full((1, 1, args.max_seq_len, args.max_seq_len), float("-inf"))  # 初始化掩码
-            mask = torch.triu(mask, diagonal=1)  # 生成上三角掩码
-            self.register_buffer("mask", mask)  # 注册掩码
+            mask = torch.full((1, 1, args.max_seq_len, args.max_seq_len), float("-inf"))
+            mask = torch.triu(mask, diagonal=1)
+            self.register_buffer("mask", mask)

    def forward(self, x: torch.Tensor, pos_cis: torch.Tensor, use_kv_cache=False):
        bsz, seqlen, _ = x.shape
-        if use_kv_cache and self.eval():  # 如果使用 KV 缓存且在评估模式下
+        if use_kv_cache and self.eval():
            if self.k_cache is None or self.k_cache.shape[1] != x.shape[1] - 1:
-                xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)  # 计算 Q, K, V
+                xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)
            else:
-                token = x[:, -1:, :]  # 获取最后一个 token
-                xq = torch.cat((torch.zeros_like(x[:, :-1, :]), self.wq(token)), dim=1)  # 更新 Q
-                xk = torch.cat((self.k_cache, self.wk(token)), dim=1)  # 更新 K
-                xv = torch.cat((self.v_cache, self.wv(token)), dim=1)  # 更新 V
+                token = x[:, -1:, :]
+                xq = torch.cat((torch.zeros_like(x[:, :-1, :]), self.wq(token)), dim=1)
+                xk = torch.cat((self.k_cache, self.wk(token)), dim=1)
+                xv = torch.cat((self.v_cache, self.wv(token)), dim=1)

-            self.k_cache, self.v_cache = xk, xv  # 更新 KV 缓存
+            self.k_cache, self.v_cache = xk, xv
        else:
-            xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)  # 计算 Q, K, V
+            xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)

-        xq = xq.view(bsz, seqlen, self.n_local_heads, self.head_dim)  # 调整 Q 的形状
-        xk = xk.view(bsz, seqlen, self.n_local_kv_heads, self.head_dim)  # 调整 K 的形状
-        xv = xv.view(bsz, seqlen, self.n_local_kv_heads, self.head_dim)  # 调整 V 的形状
+        xq = xq.view(bsz, seqlen, self.n_local_heads, self.head_dim)
+        xk = xk.view(bsz, seqlen, self.n_local_kv_heads, self.head_dim)
+        xv = xv.view(bsz, seqlen, self.n_local_kv_heads, self.head_dim)

-        xq, xk = apply_rotary_emb(xq, xk, pos_cis)  # 应用旋转位置编码
+        xq, xk = apply_rotary_emb(xq, xk, pos_cis)

-        xk = repeat_kv(xk, self.n_rep)  # 重复 K 的值
-        xv = repeat_kv(xv, self.n_rep)  # 重复 V 的值
+        xk = repeat_kv(xk, self.n_rep)  # (bs, seqlen, n_local_heads, head_dim)
+        xv = repeat_kv(xv, self.n_rep)  # (bs, seqlen, n_local_heads, head_dim)

-        xq = xq.transpose(1, 2)  # 调整 Q 的形状
-        xk = xk.transpose(1, 2)  # 调整 K 的形状
-        xv = xv.transpose(1, 2)  # 调整 V 的形状
+        xq = xq.transpose(1, 2)
+        xk = xk.transpose(1, 2)
+        xv = xv.transpose(1, 2)

        if self.flash:
            output = torch.nn.functional.scaled_dot_product_attention(xq, xk, xv, attn_mask=None,
                                                                      dropout_p=self.dropout if self.training else 0.0,
-                                                                      is_causal=True)  # 使用 Flash Attention
+                                                                      is_causal=True)
        else:
-            scores = torch.matmul(xq, xk.transpose(2, 3)) / math.sqrt(self.head_dim)  # 计算注意力分数
+            scores = torch.matmul(xq, xk.transpose(2, 3)) / math.sqrt(self.head_dim)
            assert hasattr(self, 'mask')
-            scores = scores + self.mask[:, :, :seqlen, :seqlen]  # 应用掩码
-            scores = F.softmax(scores.float(), dim=-1).type_as(xq)  # 计算 softmax
-            scores = self.attn_dropout(scores)  # 应用注意力 dropout
-            output = torch.matmul(scores, xv)  # 计算输出
+            scores = scores + self.mask[:, :, :seqlen, :seqlen]  # (bs, n_local_heads, seqlen, cache_len + seqlen)
+            scores = F.softmax(scores.float(), dim=-1).type_as(xq)
+            scores = self.attn_dropout(scores)
+            output = torch.matmul(scores, xv)  # (bs, n_local_heads, seqlen, head_dim)

-        output = output.transpose(1, 2).contiguous().view(bsz, seqlen, -1)  # 调整输出的形状
+        output = output.transpose(1, 2).contiguous().view(bsz, seqlen, -1)
+
+        output = self.wo(output)
+        output = self.resid_dropout(output)
+        return output

-        output = self.wo(output)  # 应用输出矩阵
-        output = self.resid_dropout(output)  # 应用残差 dropout
-        return output  # 返回输出

-# 定义 FeedForward 类，实现前馈神经网络
 class FeedForward(nn.Module):
    def __init__(self, dim: int, hidden_dim: int, multiple_of: int, dropout: float):
        super().__init__()
        if hidden_dim is None:
-            hidden_dim = 4 * dim  # 设置隐藏层维度
-            hidden_dim = int(2 * hidden_dim / 3)  # 调整隐藏层维度
-            hidden_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)  # 调整隐藏层维度
-        self.w1 = nn.Linear(dim, hidden_dim, bias=False)  # 初始化第一层线性变换
-        self.w2 = nn.Linear(hidden_dim, dim, bias=False)  # 初始化第二层线性变换
-        self.w3 = nn.Linear(dim, hidden_dim, bias=False)  # 初始化第三层线性变换
-        self.dropout = nn.Dropout(dropout)  # 初始化 dropout
+            hidden_dim = 4 * dim
+            hidden_dim = int(2 * hidden_dim / 3)
+            hidden_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)
+        self.w1 = nn.Linear(dim, hidden_dim, bias=False)
+        self.w2 = nn.Linear(hidden_dim, dim, bias=False)
+        self.w3 = nn.Linear(dim, hidden_dim, bias=False)
+        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
-        return self.dropout(self.w2(F.silu(self.w1(x)) * self.w3(x)))  # 前向传播
+        return self.dropout(self.w2(F.silu(self.w1(x)) * self.w3(x)))
+

-# 定义 MoEGate 类，实现专家混合（MoE）的门控机制
 class MoEGate(nn.Module):
    def __init__(self, config: LMConfig):
        super().__init__()
        self.config = config
-        self.top_k = config.num_experts_per_tok  # 设置每个 token 选择的专家数量
-        self.n_routed_experts = config.n_routed_experts  # 设置路由专家的数量
+        self.top_k = config.num_experts_per_tok
+        self.n_routed_experts = config.n_routed_experts

-        self.scoring_func = config.scoring_func  # 设置评分函数
-        self.alpha = config.aux_loss_alpha  # 设置辅助损失的权重
-        self.seq_aux = config.seq_aux  # 设置序列辅助损失
+        self.scoring_func = config.scoring_func
+        self.alpha = config.aux_loss_alpha
+        self.seq_aux = config.seq_aux

-        self.norm_topk_prob = config.norm_topk_prob  # 设置是否归一化 top-k 概率
-        self.gating_dim = config.dim  # 设置门控维度
-        self.weight = nn.Parameter(torch.empty((self.n_routed_experts, self.gating_dim)))  # 初始化权重参数
-        self.reset_parameters()  # 重置参数
+        self.norm_topk_prob = config.norm_topk_prob
+        self.gating_dim = config.dim
+        self.weight = nn.Parameter(torch.empty((self.n_routed_experts, self.gating_dim)))
+        self.reset_parameters()

    def reset_parameters(self) -> None:
        import torch.nn.init as init
-        init.kaiming_uniform_(self.weight, a=math.sqrt(5))  # 使用 Kaiming 初始化权重
+        init.kaiming_uniform_(self.weight, a=math.sqrt(5))

    def forward(self, hidden_states):
        bsz, seq_len, h = hidden_states.shape

-        hidden_states = hidden_states.view(-1, h)  # 调整隐藏状态的形状
-        logits = F.linear(hidden_states, self.weight, None)  # 计算 logits
+        hidden_states = hidden_states.view(-1, h)
+        logits = F.linear(hidden_states, self.weight, None)
        if self.scoring_func == 'softmax':
-            scores = logits.softmax(dim=-1)  # 计算 softmax 评分
+            scores = logits.softmax(dim=-1)
        else:
            raise NotImplementedError(f'insupportable scoring function for MoE gating: {self.scoring_func}')

-        topk_weight, topk_idx = torch.topk(scores, k=self.top_k, dim=-1, sorted=False)  # 选择 top-k 专家
+        topk_weight, topk_idx = torch.topk(scores, k=self.top_k, dim=-1, sorted=False)

        if self.top_k > 1 and self.norm_topk_prob:
-            denominator = topk_weight.sum(dim=-1, keepdim=True) + 1e-20  # 计算归一化分母
-            topk_weight = topk_weight / denominator  # 归一化 top-k 概率
+            denominator = topk_weight.sum(dim=-1, keepdim=True) + 1e-20
+            topk_weight = topk_weight / denominator

        if self.training and self.alpha > 0.0:
            scores_for_aux = scores
@@ -204,9 +204,9 @@ class MoEGate(nn.Module):
                aux_loss = (Pi * fi).sum() * self.alpha
        else:
            aux_loss = None
-        return topk_idx, topk_weight, aux_loss  # 返回 top-k 专家索引、权重和辅助损失
+        return topk_idx, topk_weight, aux_loss
+

-# 定义 MOEFeedForward 类，实现专家混合（MoE）的前馈神经网络
 class MOEFeedForward(nn.Module):
    def __init__(self, config: LMConfig):
        super().__init__()
@@ -219,16 +219,16 @@ class MOEFeedForward(nn.Module):
                dropout=config.dropout,
            )
            for _ in range(config.n_routed_experts)
-        ])  # 初始化专家列表
+        ])

-        self.gate = MoEGate(config)  # 初始化门控机制
+        self.gate = MoEGate(config)
        if config.n_shared_experts is not None:
            self.shared_experts = FeedForward(
                dim=config.dim,
                hidden_dim=config.hidden_dim,
                multiple_of=config.multiple_of,
                dropout=config.dropout,
-            )  # 初始化共享专家
+            )

    def forward(self, x):
        identity = x
@@ -281,46 +281,35 @@ class MOEFeedForward(nn.Module):

        return expert_cache

-# 定义 TransformerBlock 类，实现 Transformer 的一个块，包括自注意力和前馈神经网络
+
 class TransformerBlock(nn.Module):
    def __init__(self, layer_id: int, args: LMConfig):
        super().__init__()
        self.n_heads = args.n_heads
        self.dim = args.dim
        self.head_dim = args.dim // args.n_heads
-        self.attention = Attention(args)  # 初始化自注意力机制
+        self.attention = Attention(args)

        self.layer_id = layer_id
-        self.attention_norm = RMSNorm(args.dim, eps=args.norm_eps)  # 初始化注意力归一化
-        self.ffn_norm = RMSNorm(args.dim, eps=args.norm_eps)  # 初始化前馈神经网络归一化
+        self.attention_norm = RMSNorm(args.dim, eps=args.norm_eps)
+        self.ffn_norm = RMSNorm(args.dim, eps=args.norm_eps)

        if args.use_moe:
-            self.feed_forward = MOEFeedForward(args)  # 初始化专家混合前馈神经网络
+            self.feed_forward = MOEFeedForward(args)
        else:
            self.feed_forward = FeedForward(
                dim=args.dim,
                hidden_dim=args.hidden_dim,
                multiple_of=args.multiple_of,
                dropout=args.dropout,
-            )  # 初始化前馈神经网络
+            )

    def forward(self, x, pos_cis, use_kv_cache=False):
-        h = x + self.attention(self.attention_norm(x), pos_cis, use_kv_cache)  # 计算自注意力
-        out = h + self.feed_forward(self.ffn_norm(h))  # 计算前馈神经网络
-        return out  # 返回输出
+        h = x + self.attention(self.attention_norm(x), pos_cis, use_kv_cache)
+        out = h + self.feed_forward(self.ffn_norm(h))
+        return out

-# 定义 Transformer 类，实现整个 Transformer 模型
-class Transformer(PreTrainedModel):
-    config_class = LMConfig
-    last_loss: Optional[torch.Tensor]

-    def __init__(self, params: LMConfig = None):
-        super().__init__(params)
-        if not params:
-            params = LMConfig()
-        self.params = params
-        self.vocab_size = params.vocab_size
-        self.n_layers = params.n_layers
 class Transformer(PreTrainedModel):
    config_class = LMConfig
    last_loss: Optional[torch.Tensor]
@@ -333,99 +322,99 @@ class Transformer(PreTrainedModel):
        self.vocab_size = params.vocab_size
        self.n_layers = params.n_layers

-        self.tok_embeddings = nn.Embedding(params.vocab_size, params.dim)  # 初始化词嵌入层
-        self.dropout = nn.Dropout(params.dropout)  # 初始化 dropout 层
-        self.layers = torch.nn.ModuleList()  # 初始化 Transformer 块列表
+        self.tok_embeddings = nn.Embedding(params.vocab_size, params.dim)
+        self.dropout = nn.Dropout(params.dropout)
+        self.layers = torch.nn.ModuleList()
        for layer_id in range(self.n_layers):
-            self.layers.append(TransformerBlock(layer_id, params))  # 添加 Transformer 块
-        self.norm = RMSNorm(params.dim, eps=params.norm_eps)  # 初始化归一化层
-        self.output = nn.Linear(params.dim, params.vocab_size, bias=False)  # 初始化输出层
-        self.tok_embeddings.weight = self.output.weight  # 共享词嵌入和输出层的权重
-        pos_cis = precompute_pos_cis(self.params.dim // self.params.n_heads, self.params.max_seq_len)  # 预计算位置编码
-        self.register_buffer("pos_cis", pos_cis, persistent=False)  # 注册位置编码缓冲区
+            self.layers.append(TransformerBlock(layer_id, params))
+        self.norm = RMSNorm(params.dim, eps=params.norm_eps)
+        self.output = nn.Linear(params.dim, params.vocab_size, bias=False)
+        self.tok_embeddings.weight = self.output.weight
+        pos_cis = precompute_pos_cis(self.params.dim // self.params.n_heads, self.params.max_seq_len)
+        self.register_buffer("pos_cis", pos_cis, persistent=False)

-        self.apply(self._init_weights)  # 初始化模型权重
+        self.apply(self._init_weights)

        for pn, p in self.named_parameters():
            if pn.endswith('w3.weight') or pn.endswith('wo.weight'):
-                torch.nn.init.normal_(p, mean=0.0, std=0.02 / math.sqrt(2 * params.n_layers))  # 对特定权重进行初始化
+                torch.nn.init.normal_(p, mean=0.0, std=0.02 / math.sqrt(2 * params.n_layers))

-        self.last_loss = None  # 初始化最后一个损失
-        self.OUT = CausalLMOutputWithPast()  # 初始化输出对象
+        self.last_loss = None
+        self.OUT = CausalLMOutputWithPast()

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
-            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)  # 初始化线性层的权重
+            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
-                torch.nn.init.zeros_(module.bias)  # 初始化线性层的偏置
+                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
-            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)  # 初始化嵌入层的权重
+            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, tokens: Optional[torch.Tensor] = None, targets: Optional[torch.Tensor] = None,
                use_kv_cache=False, **keyargs):
        if 'input_ids' in keyargs:
-            tokens = keyargs['input_ids']  # 如果传入了 input_ids，则使用 input_ids
+            tokens = keyargs['input_ids']
        if 'attention_mask' in keyargs:
-            targets = keyargs['attention_mask']  # 如果传入了 attention_mask，则使用 attention_mask
+            targets = keyargs['attention_mask']

-        _bsz, seqlen = tokens.shape  # 获取批量大小和序列长度
-        h = self.tok_embeddings(tokens)  # 获取词嵌入
-        h = self.dropout(h)  # 应用 dropout
-        pos_cis = self.pos_cis[:seqlen]  # 获取对应序列长度的位置编码
+        _bsz, seqlen = tokens.shape
+        h = self.tok_embeddings(tokens)
+        h = self.dropout(h)
+        pos_cis = self.pos_cis[:seqlen]
        for idx, layer in enumerate(self.layers):
-            h = layer(h, pos_cis, use_kv_cache)  # 逐层应用 Transformer 块
+            h = layer(h, pos_cis, use_kv_cache)

-        h = self.norm(h)  # 应用归一化
+        h = self.norm(h)

        if targets is not None:
-            logits = self.output(h)  # 计算 logits
-            self.last_loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1)  # 计算交叉熵损失
+            logits = self.output(h)
+            self.last_loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1)
        else:
-            logits = self.output(h[:, [-1], :])  # 计算最后一个 token 的 logits
-            self.last_loss = None  # 没有目标时，损失为 None
+            logits = self.output(h[:, [-1], :])
+            self.last_loss = None

-        self.OUT.__setitem__('logits', logits)  # 设置输出对象的 logits
-        self.OUT.__setitem__('last_loss', self.last_loss)  # 设置输出对象的 last_loss
+        self.OUT.__setitem__('logits', logits)
+        self.OUT.__setitem__('last_loss', self.last_loss)

-        return self.OUT  # 返回输出对象
+        return self.OUT

-    @torch.inference_mode()  # 推理模式
+    @torch.inference_mode()
    def generate(self, idx, eos, max_new_tokens, temperature=0.7, top_k=None, stream=True, repetition_penalty=1.,
                 use_kv_cache=True):
-        index = idx.shape[1]  # 获取当前序列长度
-        while idx.shape[1] < max_new_tokens - 1:  # 当生成的 token 数量小于最大数量时
-            inference_res = self(idx, use_kv_cache=use_kv_cache)  # 进行前向传播
-            logits = inference_res.logits  # 获取 logits
-            logits = logits[:, -1, :]  # 获取最后一个 token 的 logits
+        index = idx.shape[1]
+        while idx.shape[1] < max_new_tokens - 1:
+            inference_res = self(idx, use_kv_cache=use_kv_cache)
+            logits = inference_res.logits
+            logits = logits[:, -1, :]

-            for token in set(idx.tolist()[0]):  # 对重复 token 进行惩罚
+            for token in set(idx.tolist()[0]):
                logits[:, token] /= repetition_penalty

-            if temperature == 0.0:  # 如果温度为 0，直接选择概率最高的 token
+            if temperature == 0.0:
                _, idx_next = torch.topk(logits, k=1, dim=-1)
            else:
-                logits = logits / temperature  # 调整 logits
-                if top_k is not None:  # 如果设置了 top-k 采样
+                logits = logits / temperature
+                if top_k is not None:
                    v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
-                    logits[logits < v[:, [-1]]] = -float('Inf')  # 将小于 top-k 的 logits 设为负无穷
+                    logits[logits < v[:, [-1]]] = -float('Inf')

-                probs = F.softmax(logits, dim=-1)  # 计算概率
-                idx_next = torch.multinomial(probs, num_samples=1, generator=None)  # 采样下一个 token
+                probs = F.softmax(logits, dim=-1)
+                idx_next = torch.multinomial(probs, num_samples=1, generator=None)

-            if idx_next == eos:  # 如果生成的 token 是结束符，停止生成
+            if idx_next == eos:
                break

-            idx = torch.cat((idx, idx_next), dim=1)  # 将生成的 token 添加到序列中
-            if stream:  # 如果需要流式输出
-                yield idx[:, index:]  # 返回生成的 token
+            idx = torch.cat((idx, idx_next), dim=1)
+            if stream:
+                yield idx[:, index:]

-        if not stream:  # 如果不需要流式输出
-            yield idx[:, index:]  # 返回生成的 token
+        if not stream:
+            yield idx[:, index:]

-    @torch.inference_mode()  # 推理模式
+    @torch.inference_mode()
    def eval_answer(self, idx):
-        idx_cond = idx if idx.size(1) <= self.params.max_seq_len else idx[:, -self.params.max_seq_len:]  # 截取序列
-        inference_res = self(idx_cond)  # 进行前向传播
-        logits = inference_res.logits  # 获取 logits
-        logits = logits[:, -1, :]  # 获取最后一个 token 的 logits
-        return logits  # 返回 logits
+        idx_cond = idx if idx.size(1) <= self.params.max_seq_len else idx[:, -self.params.max_seq_len:]
+        inference_res = self(idx_cond)
+        logits = inference_res.logits
+        logits = logits[:, -1, :]
+        return logits