From ace24ba8ceeddb1197f16da08bc51aa2d01e0bca Mon Sep 17 00:00:00 2001
From: yy <310484121@qq.com>
Date: Thu, 29 Feb 2024 10:12:59 +0800
Subject: [PATCH] fix message

---
 ch04/01_main-chapter-code/ch04.ipynb | 24 ++++++++++++++++--------
 1 file changed, 16 insertions(+), 8 deletions(-)

diff --git a/ch04/01_main-chapter-code/ch04.ipynb b/ch04/01_main-chapter-code/ch04.ipynb
index dc6280c..3f69b6c 100644
--- a/ch04/01_main-chapter-code/ch04.ipynb
+++ b/ch04/01_main-chapter-code/ch04.ipynb
@@ -756,7 +756,7 @@
  " # 计算当前层的输出\n",
  " layer_output = layer(x)\n",
  " # 检查是否可以使用shortcut\n",
- " if self.use_shortcut and x.shape == layer_output.shape:\n",
+ " if self.use_shortcut and x.size() == layer_output.size():\n",
  " x = x + layer_output\n",
  " else:\n",
  " x = layer_output\n",
@@ -768,7 +768,7 @@
  " output = model(x)\n",
  " target = torch.tensor([[0.]])\n",
  "\n",
- " # 通过目标值和输出值的接近程度来计算损失\n",
+ " # 根据输出和标签差距来计算损失\n",
  " loss = nn.MSELoss()\n",
  " loss = loss(output, target)\n",
  " \n",
@@ -777,7 +777,7 @@
  "\n",
  " for name, param in model.named_parameters():\n",
  " if 'weight' in name:\n",
- " # 打印权重的梯度绝对值的平均值\n",
+ " # 打印权重的平均绝对梯度\n",
  " print(f\"{name} has gradient mean of {param.grad.abs().mean().item()}\")"
  ]
  },
@@ -862,14 +862,22 @@
  "- 接下来，我们将在实现Transformer块时应用shortcut连接。"
  ]
  },
+ {
+ "cell_type": "markdown",
+ "id": "fd8a2072",
+ "metadata": {},
+ "source": [
+ "## 4.5 在transformer块中连接注意力层和线性层"
+ ]
+ },
  {
  "cell_type": "markdown",
  "id": "bc571b76",
  "metadata": {},
  "source": [
- "- 本节将前述概念融合，搭建所谓的Transformer块。\n",
+ "- 本节将前述概念融合，搭建transformer块。\n",
  "- Transformer块将前一章的因果多头注意力模块与线性层结合起来，即之前章节中我们实现的前馈神经网络\n",
- "- 此外，Transformer块还使用了Dropout和shortcut连接。"
+ "- 此外，transformer块还使用了Dropout和shortcut连接。"
  ]
  },
  {
@@ -1051,7 +1059,7 @@
  " batch_size, seq_len = in_idx.shape\n",
  " tok_embeds = self.tok_emb(in_idx)\n",
  " pos_embeds = self.pos_emb(torch.arange(seq_len, device=in_idx.device))\n",
- " x = tok_embeds + pos_embeds # shape [batch_size, num_tokens, emb_size]\n",
+ " x = tok_embeds + pos_embeds # Shape [batch_size, num_tokens, emb_size]\n",
  " x = self.trf_blocks(x)\n",
  " x = self.final_norm(x)\n",
  " logits = self.out_head(x)\n",
@@ -1110,7 +1118,7 @@
  "metadata": {},
  "source": [
  "- 我们将在下一章对这个模型进行训练。\n",
- "- 然而，关于其大小有一个快速说明：我们之前提到它是一个拥有124M参数的模型；我们可以按照以下方式再次核对这个数字："
+ "- 这里对模型大小做一个快速说明：我们之前提到它是一个拥有124M参数的模型；可以按照以下方式核对这个数字："
  ]
  },
  {
@@ -1141,7 +1149,7 @@
  "- 在原始的GPT-2论文中，研究人员使用了权重绑定，这意味着他们将token嵌入层(tok_emb)重复用作输出层，即设置`self.out_head.weight = self.tok_emb.weight`\n",
  "- token嵌入层将50,257维输入token的one-hot编码投影到768维的embedding表示中\n",
  "- 输出层将768维的embedding投影回到50,257维的表示中，以便我们可以将其转换回单词(更多关于此的信息请参见下一节)\n",
- "- 因此，embedding层和输出层有相同数量的权重参数，根据它们的权重矩阵形状：参见下一章"
+ "- 因此，embedding层和输出层有相同数量的权重参数，正如我们根据其权重矩阵的形状所看到的那样"
  ]
  },
  {
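
Editor's note on the first three hunks: they touch the shortcut-connection demo and its gradient printout, and the first hunk swaps `x.shape == layer_output.shape` for `x.size() == layer_output.size()`; in PyTorch both return the same `torch.Size`, so the two checks are equivalent. The sketch below is not the notebook's exact cell — the class name `ShortcutDemo`, the layer sizes, and the seed are illustrative assumptions — it only reproduces the pattern the hunks quote.

import torch
import torch.nn as nn

class ShortcutDemo(nn.Module):
    def __init__(self, layer_sizes, use_shortcut):
        super().__init__()
        self.use_shortcut = use_shortcut
        self.layers = nn.ModuleList([
            nn.Sequential(nn.Linear(layer_sizes[i], layer_sizes[i + 1]), nn.GELU())
            for i in range(len(layer_sizes) - 1)
        ])

    def forward(self, x):
        for layer in self.layers:
            # Compute the output of the current layer
            layer_output = layer(x)
            # Add the shortcut only when input and output shapes match;
            # x.size() and x.shape both return a torch.Size, so either comparison works
            if self.use_shortcut and x.size() == layer_output.size():
                x = x + layer_output
            else:
                x = layer_output
        return x

torch.manual_seed(123)
model = ShortcutDemo([3, 3, 3, 3, 3, 1], use_shortcut=True)
x = torch.tensor([[1., 0., -1.]])

output = model(x)
target = torch.tensor([[0.]])

# Loss is computed from the gap between the model output and the target
loss = nn.MSELoss()(output, target)
loss.backward()

for name, param in model.named_parameters():
    if "weight" in name:
        # Print the mean absolute gradient of each weight matrix
        print(f"{name} has gradient mean of {param.grad.abs().mean().item()}")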
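
Editor's note on the last hunk: the reworded bullet claims that the token-embedding layer and the output head hold the same number of weight parameters. A minimal check, assuming the 50,257-token vocabulary and 768-dimensional embeddings quoted in the cell (the variable names here are illustrative, not the notebook's):

import torch.nn as nn

vocab_size, emb_dim = 50257, 768

tok_emb = nn.Embedding(vocab_size, emb_dim)            # token embedding layer
out_head = nn.Linear(emb_dim, vocab_size, bias=False)  # projection back to the vocabulary

print(tok_emb.weight.shape)   # torch.Size([50257, 768])
print(out_head.weight.shape)  # torch.Size([50257, 768]) -> same number of weight parameters

# Weight tying, as in the quoted text (self.out_head.weight = self.tok_emb.weight),
# simply reuses the embedding matrix as the output weight:
out_head.weight = tok_emb.weight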