fix message

commit ace24ba8ce
parent d941972597
@@ -756,7 +756,7 @@
 "        # Compute the output of the current layer\n",
 "        layer_output = layer(x)\n",
 "        # Check whether the shortcut connection can be applied\n",
-"        if self.use_shortcut and x.shape == layer_output.shape:\n",
+"        if self.use_shortcut and x.size() == layer_output.size():\n",
 "            x = x + layer_output\n",
 "        else:\n",
 "            x = layer_output\n",
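For reference, the `.shape` → `.size()` change in this hunk is purely stylistic: in PyTorch both return the same `torch.Size` object. Below is a minimal standalone sketch (not part of the notebook) of the guarded shortcut pattern the hunk touches:

```python
import torch
import torch.nn as nn

x = torch.randn(2, 3)
# Tensor.size() and Tensor.shape return the same torch.Size object.
assert x.size() == x.shape

# Guarded shortcut (residual) connection, shown for a single layer whose
# output shape matches its input shape.
layer = nn.Linear(3, 3)
layer_output = layer(x)
use_shortcut = True
if use_shortcut and x.size() == layer_output.size():
    x = x + layer_output   # add the layer output onto the input
else:
    x = layer_output       # shapes differ: fall back to the plain output
```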
@@ -768,7 +768,7 @@
 "    output = model(x)\n",
 "    target = torch.tensor([[0.]])\n",
 "\n",
-"    # Calculate the loss based on how close the target and output values are\n",
+"    # Calculate the loss from the gap between the output and the label\n",
 "    loss = nn.MSELoss()\n",
 "    loss = loss(output, target)\n",
 "    \n",
@@ -777,7 +777,7 @@
 "\n",
 "    for name, param in model.named_parameters():\n",
 "        if 'weight' in name:\n",
-"            # Print the mean of the absolute values of the weight gradients\n",
+"            # Print the mean absolute gradient of the weights\n",
 "            print(f\"{name} has gradient mean of {param.grad.abs().mean().item()}\")"
 ]
 },
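The two hunks above edit the notebook's gradient-inspection cell, which runs a forward/backward pass with an MSE loss and prints the mean absolute gradient per weight matrix. A self-contained sketch of the same check, using a hypothetical two-layer model in place of the notebook's deep network:

```python
import torch
import torch.nn as nn

# Hypothetical stand-in model, just to exercise the gradient-printing loop.
model = nn.Sequential(nn.Linear(3, 3), nn.GELU(), nn.Linear(3, 1))

x = torch.rand(1, 3)
output = model(x)
target = torch.tensor([[0.]])

# Calculate the loss from the gap between the output and the label.
loss = nn.MSELoss()(output, target)
loss.backward()

for name, param in model.named_parameters():
    if 'weight' in name:
        # Mean absolute gradient per weight matrix; vanishing gradients show
        # up as values shrinking toward zero in the earlier layers.
        print(f"{name} has gradient mean of {param.grad.abs().mean().item()}")
```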
@@ -862,14 +862,22 @@
 "- Next, we will apply shortcut connections when implementing the Transformer block."
 ]
 },
+{
+"cell_type": "markdown",
+"id": "fd8a2072",
+"metadata": {},
+"source": [
+"## 4.5 Connecting attention and linear layers in a transformer block"
+]
+},
 {
 "cell_type": "markdown",
 "id": "bc571b76",
 "metadata": {},
 "source": [
-"- This section brings the previous concepts together to build the so-called Transformer block.\n",
+"- This section brings the previous concepts together to build the transformer block.\n",
 "- The Transformer block combines the causal multi-head attention module from the previous chapter with the linear layers, i.e., the feed-forward network we implemented in an earlier section\n",
-"- In addition, the Transformer block also uses dropout and shortcut connections."
+"- In addition, the transformer block also uses dropout and shortcut connections."
 ]
 },
 {
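The new markdown cell introduces Section 4.5. As a rough illustration of the structure it describes (attention plus a feed-forward network, each wrapped with dropout and a shortcut connection), here is a hedged sketch; it substitutes `nn.MultiheadAttention` and a small GELU MLP for the notebook's own causal multi-head attention and feed-forward modules:

```python
import torch
import torch.nn as nn

class TransformerBlockSketch(nn.Module):
    """Illustrative only: stand-in attention and feed-forward sub-layers,
    each followed by dropout and a shortcut connection."""
    def __init__(self, emb_dim=768, num_heads=12, drop_rate=0.1):
        super().__init__()
        self.norm1 = nn.LayerNorm(emb_dim)
        self.att = nn.MultiheadAttention(emb_dim, num_heads, batch_first=True)
        self.norm2 = nn.LayerNorm(emb_dim)
        self.ff = nn.Sequential(
            nn.Linear(emb_dim, 4 * emb_dim), nn.GELU(), nn.Linear(4 * emb_dim, emb_dim)
        )
        self.drop = nn.Dropout(drop_rate)

    def forward(self, x):
        # Attention sub-layer with a shortcut connection
        shortcut = x
        x = self.norm1(x)
        x, _ = self.att(x, x, x, need_weights=False)
        x = self.drop(x) + shortcut

        # Feed-forward sub-layer with a second shortcut connection
        shortcut = x
        x = self.ff(self.norm2(x))
        x = self.drop(x) + shortcut
        return x

block = TransformerBlockSketch()
print(block(torch.rand(2, 4, 768)).shape)  # torch.Size([2, 4, 768])
```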
@@ -1051,7 +1059,7 @@
 "        batch_size, seq_len = in_idx.shape\n",
 "        tok_embeds = self.tok_emb(in_idx)\n",
 "        pos_embeds = self.pos_emb(torch.arange(seq_len, device=in_idx.device))\n",
-"        x = tok_embeds + pos_embeds  # shape [batch_size, num_tokens, emb_size]\n",
+"        x = tok_embeds + pos_embeds  # Shape [batch_size, num_tokens, emb_size]\n",
 "        x = self.trf_blocks(x)\n",
 "        x = self.final_norm(x)\n",
 "        logits = self.out_head(x)\n",
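The comment being edited sits on the line where token and positional embeddings are summed. A small standalone example (toy dimensions, not the notebook's GPT-2 configuration) showing how broadcasting yields the `[batch_size, num_tokens, emb_size]` shape:

```python
import torch
import torch.nn as nn

vocab_size, emb_dim, context_length = 100, 16, 8   # toy sizes for illustration
tok_emb = nn.Embedding(vocab_size, emb_dim)
pos_emb = nn.Embedding(context_length, emb_dim)

in_idx = torch.randint(0, vocab_size, (2, 5))       # [batch_size, num_tokens]
batch_size, seq_len = in_idx.shape

tok_embeds = tok_emb(in_idx)                        # [2, 5, 16]
pos_embeds = pos_emb(torch.arange(seq_len))         # [5, 16]

# Broadcasting adds the same positional embeddings to every sequence in the batch.
x = tok_embeds + pos_embeds                         # Shape [batch_size, num_tokens, emb_size]
print(x.shape)                                      # torch.Size([2, 5, 16])
```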
@@ -1110,7 +1118,7 @@
 "metadata": {},
 "source": [
 "- We will train this model in the next chapter.\n",
-"- However, a quick note about its size: we mentioned earlier that it is a model with 124M parameters; we can double-check this number as follows:"
+"- A quick note about the model size: we mentioned earlier that it is a model with 124M parameters; this number can be checked as follows:"
 ]
 },
 {
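The usual way to verify such a parameter count in PyTorch is to sum `numel()` over `model.parameters()`; a sketch follows, with a small stand-in module rather than the notebook's GPT model. Applied to the full GPT-2 "small" configuration, the raw sum comes out well above 124M, which the weight-tying note in the next cell accounts for:

```python
import torch.nn as nn

# Stand-in module (not the notebook's GPT model) to demonstrate the check.
model = nn.Sequential(nn.Embedding(50257, 768), nn.Linear(768, 50257, bias=False))

total_params = sum(p.numel() for p in model.parameters())
print(f"Total number of parameters: {total_params:,}")  # 77,194,752 for this stand-in
```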
@@ -1141,7 +1149,7 @@
 "- In the original GPT-2 paper, the researchers used weight tying, which means that they reused the token embedding layer (tok_emb) as the output layer, i.e., they set `self.out_head.weight = self.tok_emb.weight`\n",
 "- The token embedding layer projects the 50,257-dimensional one-hot encodings of the input tokens into a 768-dimensional embedding representation\n",
 "- The output layer projects the 768-dimensional embeddings back into a 50,257-dimensional representation so that we can convert them back into words (more on this in the next section)\n",
-"- Therefore, the embedding layer and the output layer have the same number of weight parameters, based on the shapes of their weight matrices: see the next chapter"
+"- Therefore, the embedding layer and the output layer have the same number of weight parameters, as we can see from the shapes of their weight matrices"
 ]
 },
 {
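A standalone sketch of the weight-tying idea described in this cell (toy code, not the notebook's implementation), showing that the two weight matrices have the same shape and can therefore share storage:

```python
import torch.nn as nn

vocab_size, emb_dim = 50257, 768
tok_emb = nn.Embedding(vocab_size, emb_dim)
out_head = nn.Linear(emb_dim, vocab_size, bias=False)

# Both layers hold a weight matrix of shape [vocab_size, emb_dim].
print(tok_emb.weight.shape)   # torch.Size([50257, 768])
print(out_head.weight.shape)  # torch.Size([50257, 768])

# Reusing the embedding matrix as the output projection drops
# vocab_size * emb_dim (about 38.6M) parameters from the total count.
out_head.weight = tok_emb.weight
print(out_head.weight is tok_emb.weight)  # True
```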