diff --git a/Translated_Book/ch05/4.66 b/Translated_Book/ch05/4.66 new file mode 100644 index 0000000..9cd4d06 --- /dev/null +++ b/Translated_Book/ch05/4.66 @@ -0,0 +1,42 @@ +Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple +Requirement already satisfied: tensorflow in c:\users\pr04ark\.conda\envs\cell\lib\site-packages (2.16.1) +Requirement already satisfied: tqdm in c:\users\pr04ark\.conda\envs\cell\lib\site-packages (4.65.0) +Requirement already satisfied: tensorflow-intel==2.16.1 in c:\users\pr04ark\.conda\envs\cell\lib\site-packages (from tensorflow) (2.16.1) +Requirement already satisfied: absl-py>=1.0.0 in c:\users\pr04ark\.conda\envs\cell\lib\site-packages (from tensorflow-intel==2.16.1->tensorflow) (2.0.0) +Requirement already satisfied: astunparse>=1.6.0 in c:\users\pr04ark\.conda\envs\cell\lib\site-packages (from tensorflow-intel==2.16.1->tensorflow) (1.6.3) +Requirement already satisfied: flatbuffers>=23.5.26 in c:\users\pr04ark\.conda\envs\cell\lib\site-packages (from tensorflow-intel==2.16.1->tensorflow) (24.3.25) +Requirement already satisfied: gast!=0.5.0,!=0.5.1,!=0.5.2,>=0.2.1 in c:\users\pr04ark\.conda\envs\cell\lib\site-packages (from tensorflow-intel==2.16.1->tensorflow) (0.5.4) +Requirement already satisfied: google-pasta>=0.1.1 in c:\users\pr04ark\.conda\envs\cell\lib\site-packages (from tensorflow-intel==2.16.1->tensorflow) (0.2.0) +Requirement already satisfied: h5py>=3.10.0 in c:\users\pr04ark\.conda\envs\cell\lib\site-packages (from tensorflow-intel==2.16.1->tensorflow) (3.11.0) +Requirement already satisfied: libclang>=13.0.0 in c:\users\pr04ark\.conda\envs\cell\lib\site-packages (from tensorflow-intel==2.16.1->tensorflow) (18.1.1) +Requirement already satisfied: ml-dtypes~=0.3.1 in c:\users\pr04ark\.conda\envs\cell\lib\site-packages (from tensorflow-intel==2.16.1->tensorflow) (0.3.2) +Requirement already satisfied: opt-einsum>=2.3.2 in c:\users\pr04ark\.conda\envs\cell\lib\site-packages (from 
tensorflow-intel==2.16.1->tensorflow) (3.3.0) +Requirement already satisfied: packaging in c:\users\pr04ark\appdata\roaming\python\python310\site-packages (from tensorflow-intel==2.16.1->tensorflow) (24.0) +Requirement already satisfied: protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.20.3 in c:\users\pr04ark\.conda\envs\cell\lib\site-packages (from tensorflow-intel==2.16.1->tensorflow) (4.23.4) +Requirement already satisfied: requests<3,>=2.21.0 in c:\users\pr04ark\.conda\envs\cell\lib\site-packages (from tensorflow-intel==2.16.1->tensorflow) (2.31.0) +Requirement already satisfied: setuptools in c:\users\pr04ark\.conda\envs\cell\lib\site-packages (from tensorflow-intel==2.16.1->tensorflow) (68.0.0) +Requirement already satisfied: six>=1.12.0 in c:\users\pr04ark\.conda\envs\cell\lib\site-packages (from tensorflow-intel==2.16.1->tensorflow) (1.16.0) +Requirement already satisfied: termcolor>=1.1.0 in c:\users\pr04ark\.conda\envs\cell\lib\site-packages (from tensorflow-intel==2.16.1->tensorflow) (2.4.0) +Requirement already satisfied: typing-extensions>=3.6.6 in c:\users\pr04ark\.conda\envs\cell\lib\site-packages (from tensorflow-intel==2.16.1->tensorflow) (4.7.1) +Requirement already satisfied: wrapt>=1.11.0 in c:\users\pr04ark\.conda\envs\cell\lib\site-packages (from tensorflow-intel==2.16.1->tensorflow) (1.16.0) +Requirement already satisfied: grpcio<2.0,>=1.24.3 in c:\users\pr04ark\.conda\envs\cell\lib\site-packages (from tensorflow-intel==2.16.1->tensorflow) (1.60.0) +Requirement already satisfied: tensorboard<2.17,>=2.16 in c:\users\pr04ark\.conda\envs\cell\lib\site-packages (from tensorflow-intel==2.16.1->tensorflow) (2.16.2) +Requirement already satisfied: keras>=3.0.0 in c:\users\pr04ark\.conda\envs\cell\lib\site-packages (from tensorflow-intel==2.16.1->tensorflow) (3.3.3) +Requirement already satisfied: tensorflow-io-gcs-filesystem>=0.23.1 in c:\users\pr04ark\.conda\envs\cell\lib\site-packages (from 
tensorflow-intel==2.16.1->tensorflow) (0.31.0) +Requirement already satisfied: numpy<2.0.0,>=1.23.5 in c:\users\pr04ark\.conda\envs\cell\lib\site-packages (from tensorflow-intel==2.16.1->tensorflow) (1.23.5) +Requirement already satisfied: colorama in c:\users\pr04ark\appdata\roaming\python\python310\site-packages (from tqdm) (0.4.6) +Requirement already satisfied: wheel<1.0,>=0.23.0 in c:\users\pr04ark\.conda\envs\cell\lib\site-packages (from astunparse>=1.6.0->tensorflow-intel==2.16.1->tensorflow) (0.41.2) +Requirement already satisfied: rich in c:\users\pr04ark\.conda\envs\cell\lib\site-packages (from keras>=3.0.0->tensorflow-intel==2.16.1->tensorflow) (13.7.1) +Requirement already satisfied: namex in c:\users\pr04ark\.conda\envs\cell\lib\site-packages (from keras>=3.0.0->tensorflow-intel==2.16.1->tensorflow) (0.0.8) +Requirement already satisfied: optree in c:\users\pr04ark\.conda\envs\cell\lib\site-packages (from keras>=3.0.0->tensorflow-intel==2.16.1->tensorflow) (0.11.0) +Requirement already satisfied: charset-normalizer<4,>=2 in c:\users\pr04ark\.conda\envs\cell\lib\site-packages (from requests<3,>=2.21.0->tensorflow-intel==2.16.1->tensorflow) (2.0.4) +Requirement already satisfied: idna<4,>=2.5 in c:\users\pr04ark\.conda\envs\cell\lib\site-packages (from requests<3,>=2.21.0->tensorflow-intel==2.16.1->tensorflow) (3.4) +Requirement already satisfied: urllib3<3,>=1.21.1 in c:\users\pr04ark\.conda\envs\cell\lib\site-packages (from requests<3,>=2.21.0->tensorflow-intel==2.16.1->tensorflow) (1.26.18) +Requirement already satisfied: certifi>=2017.4.17 in c:\users\pr04ark\.conda\envs\cell\lib\site-packages (from requests<3,>=2.21.0->tensorflow-intel==2.16.1->tensorflow) (2024.2.2) +Requirement already satisfied: markdown>=2.6.8 in c:\users\pr04ark\.conda\envs\cell\lib\site-packages (from tensorboard<2.17,>=2.16->tensorflow-intel==2.16.1->tensorflow) (3.5.1) +Requirement already satisfied: tensorboard-data-server<0.8.0,>=0.7.0 in 
c:\users\pr04ark\.conda\envs\cell\lib\site-packages (from tensorboard<2.17,>=2.16->tensorflow-intel==2.16.1->tensorflow) (0.7.2) +Requirement already satisfied: werkzeug>=1.0.1 in c:\users\pr04ark\.conda\envs\cell\lib\site-packages (from tensorboard<2.17,>=2.16->tensorflow-intel==2.16.1->tensorflow) (3.0.1) +Requirement already satisfied: MarkupSafe>=2.1.1 in c:\users\pr04ark\.conda\envs\cell\lib\site-packages (from werkzeug>=1.0.1->tensorboard<2.17,>=2.16->tensorflow-intel==2.16.1->tensorflow) (2.1.1) +Requirement already satisfied: markdown-it-py>=2.2.0 in c:\users\pr04ark\.conda\envs\cell\lib\site-packages (from rich->keras>=3.0.0->tensorflow-intel==2.16.1->tensorflow) (3.0.0) +Requirement already satisfied: pygments<3.0.0,>=2.13.0 in c:\users\pr04ark\.conda\envs\cell\lib\site-packages (from rich->keras>=3.0.0->tensorflow-intel==2.16.1->tensorflow) (2.15.1) +Requirement already satisfied: mdurl~=0.1 in c:\users\pr04ark\.conda\envs\cell\lib\site-packages (from markdown-it-py>=2.2.0->rich->keras>=3.0.0->tensorflow-intel==2.16.1->tensorflow) (0.1.2) diff --git a/Translated_Book/ch05/5.4 在 PyTorch 中加载和保存模型权重.ipynb b/Translated_Book/ch05/5.4 在 PyTorch 中加载和保存模型权重.ipynb new file mode 100644 index 0000000..fc4aaf8 --- /dev/null +++ b/Translated_Book/ch05/5.4 在 PyTorch 中加载和保存模型权重.ipynb @@ -0,0 +1,291 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "780ec141", + "metadata": {}, + "source": [ + "# 5.4 在 PyTorch 中加载和保存模型权重" + ] + }, + { + "cell_type": "markdown", + "id": "5de22098", + "metadata": {}, + "source": [ + "我们讨论了如何从数值上评估训练进度以及如何从零开始预训练一个大型语言模型(LLM)。\n", + "尽管大语言模型和数据集都相对较小,但此练习表明预训练 大语言模型的计算成本很高。\n", + "因此,能够保存大语言模型非常重要,这样我们就不必每次想要在新会话中使用它时都重新运行训练。" + ] + }, + { + "cell_type": "markdown", + "id": "0c5ff69a", + "metadata": {}, + "source": [ + "如图 5.16 中的章节概述所示,我们将在本节中介绍如何保存和加载预训练模型。\n", + "然后,在接下来的部分中,我们将从 OpenAI 加载一个功能更强大的预训练 GPT 模型到我们的 GPTModel 实例中。" + ] + }, + { + "cell_type": "markdown", + "id": "94b4fa2d", + "metadata": {}, + "source": [ + "**图 5.16 
在训练和检查模型之后,保存模型通常很有帮助的,这样我们以后可以使用或继续训练它,这是本节的主题,然后我们将在本章的最后一节中从 OpenAI 加载预训练的模型权重**" + ] + }, + { + "cell_type": "markdown", + "id": "8a921a88", + "metadata": {}, + "source": [ + "![fig5.16](https://github.com/datawhalechina/llms-from-scratch-cn/blob/main/Translated_Book/img/fig-5-16.jpg?raw=true)" + ] + }, + { + "cell_type": "markdown", + "id": "41081dbb", + "metadata": {}, + "source": [ + "幸运的是,保存一个PyTorch 模型相对简单。\n", + "这里推荐的方法是使用 torch.save 函数保存模型的所谓状态字典 state_dict,这是一个将每个层映射到其参数的字典, 代码如下所示:" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "31d1411d", + "metadata": {}, + "outputs": [ + { + "ename": "ImportError", + "evalue": "cannot import name 'GPTModel' from 'transformers' (C:\\Users\\Pr04ArK\\.conda\\envs\\cell\\lib\\site-packages\\transformers\\__init__.py)", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mImportError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[1;32mIn[8], line 3\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mtorch\u001b[39;00m\n\u001b[0;32m 2\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mtorch\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mnn\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mnn\u001b[39;00m\n\u001b[1;32m----> 3\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtransformers\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m GPTModel\n\u001b[0;32m 6\u001b[0m \u001b[38;5;28;01mclass\u001b[39;00m \u001b[38;5;21;01mFeedForward\u001b[39;00m(nn\u001b[38;5;241m.\u001b[39mModule):\n\u001b[0;32m 7\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__init__\u001b[39m(\u001b[38;5;28mself\u001b[39m, cfg):\n", + "\u001b[1;31mImportError\u001b[0m: cannot import name 'GPTModel' from 'transformers' 
(C:\\Users\\Pr04ArK\\.conda\\envs\\cell\\lib\\site-packages\\transformers\\__init__.py)" + ] + } + ], + "source": [ + "import torch\n", + "import torch.nn as nn\n", + "from transformers import GPTModel\n", + "\n", + "\n", + "class FeedForward(nn.Module):\n", + " def __init__(self, cfg):\n", + " super().__init__()\n", + " self.linear1 = nn.Linear(cfg[\"emb_dim\"], cfg[\"emb_dim\"] * 4)\n", + " self.relu = nn.ReLU()\n", + " self.linear2 = nn.Linear(cfg[\"emb_dim\"] * 4, cfg[\"emb_dim\"])\n", + " self.dropout = nn.Dropout(cfg[\"drop_rate\"])\n", + "\n", + " def forward(self, x):\n", + " x = self.relu(self.linear1(x))\n", + " x = self.dropout(x)\n", + " x = self.linear2(x)\n", + " return x\n", + " \n", + "class MultiHeadAttention(nn.Module):\n", + " def __init__(self, d_in, d_out,\n", + " context_length, dropout, num_heads, qkv_bias=False):\n", + " super().__init__()\n", + " assert d_out % num_heads == 0, \"d_out must be divisible by num_heads\"\n", + " self.d_out = d_out\n", + " self.num_heads = num_heads\n", + " self.head_dim = d_out // num_heads #A\n", + " self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)\n", + " self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)\n", + " self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)\n", + " self.out_proj = nn.Linear(d_out, d_out) #B\n", + " self.dropout = nn.Dropout(dropout)\n", + " self.register_buffer(\n", + " 'mask',\n", + " torch.triu(torch.ones(context_length, context_length), diagonal=1)\n", + " )\n", + " def forward(self, x):\n", + " b, num_tokens, d_in = x.shape\n", + " keys = self.W_key(x) #C\n", + " queries = self.W_query(x) #C\n", + " values = self.W_value(x) #C\n", + " \n", + " keys = keys.view(b, num_tokens, self.num_heads, self.head_dim) #D\n", + " values = values.view(b, num_tokens, self.num_heads, self.head_dim) #D\n", + " queries = queries.view(b, num_tokens, self.num_heads, self.head_dim)#D\n", + "\n", + " keys = keys.transpose(1, 2) #E\n", + " queries = queries.transpose(1, 2) #E\n", + " values = 
values.transpose(1, 2) #E\n", + "\n", + " attn_scores = queries @ keys.transpose(2, 3) #F\n", + " mask_bool = self.mask.bool()[:num_tokens, :num_tokens] #G\n", + "\n", + " attn_scores.masked_fill_(mask_bool, -torch.inf) #H\n", + "\n", + " attn_weights = torch.softmax(\n", + " attn_scores / keys.shape[-1]**0.5, dim=-1)\n", + " attn_weights = self.dropout(attn_weights)\n", + "\n", + " context_vec = (attn_weights @ values).transpose(1, 2) #I\n", + " #J\n", + " context_vec = context_vec.contiguous().view(b, num_tokens, self.d_out)\n", + " context_vec = self.out_proj(context_vec) #K\n", + " return context_vec\n", + "\n", + "GPT_CONFIG_124M = {\n", + " \"vocab_size\": 50257, # Vocabulary size\n", + " \"context_length\": 1024, # Context length\n", + " \"emb_dim\": 768, # Embedding dimension\n", + " \"n_heads\": 12, # Number of attention heads\n", + " \"n_layers\": 12, # Number of layers\n", + " \"drop_rate\": 0.1, # Dropout rate\n", + " \"qkv_bias\": False # Query-Key-Value bias\n", + "}\n", + "torch.manual_seed(123)\n", + "model = GPTModel(GPT_CONFIG_124M)\n", + "model.eval()\n", + "\n", + "\n", + "torch.save(model.state_dict(), \"model.pth\")" + ] + }, + { + "cell_type": "markdown", + "id": "77373a01", + "metadata": {}, + "source": [ + "在上面的代码中,“model.pth”是保存state_dict的文件名。 \n", + "从技术上讲我们可以使用任何文件扩展名,但.pth扩展名是PyTorch文件的常规约定" + ] + }, + { + "cell_type": "markdown", + "id": "07bf1918", + "metadata": {}, + "source": [ + "然后,通过 state_dict 保存模型权重后,我们可以按照以下代码将模型权重加载到新的 GPTModel 模型实例中:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4183fef5", + "metadata": {}, + "outputs": [], + "source": [ + "model = GPTModel(GPT_CONFIG_124M)\n", + "model.load_state_dict(torch.load(\"model.pth\"))\n", + "model.eval()" + ] + }, + { + "cell_type": "markdown", + "id": "8d502a45", + "metadata": {}, + "source": [ + "正如第 4 章所讨论的,dropout通过在训练期间随机“丢弃”层的神经元来帮助防止模型对训练数据过拟合。\n", + "然而,在推理过程中,我们不想随机丢弃网络学到的任何信息。\n", + "使用 model.eval() 能够将模型切换到评估模式进行推理,禁用模型的 dropout 层。" + ] + }, + 
{ + "cell_type": "markdown", + "id": "92b5cd12", + "metadata": {}, + "source": [ + "如果我们计划稍后继续预训练模型,例如使用本章早些时候定义的train_model_simple函数,建议同时保存优化器的状态。\n", + "这样做可以在重新开始训练时保持优化器的所有参数和状态,从而更有效地继续训练过程。" + ] + }, + { + "cell_type": "markdown", + "id": "6973802e", + "metadata": {}, + "source": [ + "AdamW 等自适应优化器,会为每个模型权重存储附加参数。\n", + "AdamW 利用历史数据动态调整每个模型参数的学习率。\n", + "如果没有它,优化器将重置,模型可能学习效果不佳,甚至无法正确收敛,这意味着它将失去生成连贯文本的能力。\n", + "使用torch.save,我们可以按照以下方式保存模型和优化器state_dict内容:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0378ce57", + "metadata": {}, + "outputs": [], + "source": [ + "torch.save({\n", + " \"model_state_dict\": model.state_dict(),\n", + " \"optimizer_state_dict\": optimizer.state_dict(),\n", + " },\n", + " \"model_and_optimizer.pth\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "93f84fd7", + "metadata": {}, + "source": [ + "接着,我们可以通过首先使用torch.load加载保存的数据,然后使用load_state_dict方法来恢复模型和优化器的状态,具体操作如下:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e97ebb32", + "metadata": {}, + "outputs": [], + "source": [ + "checkpoint = torch.load(\"model_and_optimizer.pth\")\n", + "model = GPTModel(GPT_CONFIG_124M)\n", + "model.load_state_dict(checkpoint[\"model_state_dict\"])\n", + "optimizer = torch.optim.AdamW(model.parameters(), lr=5e-4, weight_decay=0.1)\n", + "optimizer.load_state_dict(checkpoint[\"optimizer_state_dict\"])\n", + "model.train();" + ] + }, + { + "cell_type": "markdown", + "id": "476721c3", + "metadata": {}, + "source": [ + "### 练习5.4" + ] + }, + { + "cell_type": "markdown", + "id": "879d6847", + "metadata": {}, + "source": [ + "在保存权重后,在一个新的Python会话或Jupyter笔记本文件中加载模型和优化器,并使用train_model_simple函数继续对其进行预训练1个周期。这样做可以无缝地继续之前的训练过程,从而提高模型的性能和适应性。" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python (cell)", + "language": "python", + "name": "cell" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": 
"text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/Translated_Book/ch05/5.5 从 OpenAI 加载预训练权重.ipynb b/Translated_Book/ch05/5.5 从 OpenAI 加载预训练权重.ipynb new file mode 100644 index 0000000..11e4823 --- /dev/null +++ b/Translated_Book/ch05/5.5 从 OpenAI 加载预训练权重.ipynb @@ -0,0 +1,671 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "70846747", + "metadata": {}, + "source": [ + "# 5.5 从 OpenAI 加载预训练权重" + ] + }, + { + "cell_type": "markdown", + "id": "76eadfe2", + "metadata": {}, + "source": [ + "之前,出于教育目的,我们使用包含一本短篇故事书的有限数据集训练了一个小型 GPT-2 模型。\n", + "这样使我们能够专注于基础知识,而不需要大量的时间和精力。" + ] + }, + { + "cell_type": "markdown", + "id": "91bb48c9", + "metadata": {}, + "source": [ + "幸运的是,OpenAI 公开分享了他们的 GPT-2 模型的权重,从而去除了我们自己投资数万到数十万美元在大型语料库上重新训练模型的需要。" + ] + }, + { + "cell_type": "markdown", + "id": "11c57649", + "metadata": {}, + "source": [ + "在本节的其余部分中,我们将这些权重加载到 GPTModel 类中并使用该模型进行文本生成。\n", + "例如,这里的权重是指存储在 PyTorch 的 Linear 和 Embedding 层的 .weight 属性中的权重参数。\n", + "我们在训练模型时曾通过model.parameters()访问过它们。" + ] + }, + { + "cell_type": "markdown", + "id": "bc26c95b", + "metadata": {}, + "source": [ + "在接下来的章节中,我们将重用这些预训练的权重来对模型进行微调,以便执行文本分类任务,并遵循类似于ChatGPT的指令。" + ] + }, + { + "cell_type": "markdown", + "id": "1c33e158", + "metadata": {}, + "source": [ + "请注意,OpenAI 最初是通过 TensorFlow 保存 GPT-2 的权重的,因此我们需要安装 TensorFlow 来在 Python 中加载这些权重。\n", + "此外,下面的代码将使用一个名为 tqdm 的进度条工具来跟踪下载过程,我们也需要安装这个工具。\n", + "这些安装步骤确保了在加载模型权重和监控下载进度时的技术兼容性和用户友好性。" + ] + }, + { + "cell_type": "markdown", + "id": "bee9f2ea", + "metadata": {}, + "source": [ + "您可以通过在终端中执行以下命令来安装这些库:" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "6bf08a98", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Note: you may need to restart the kernel to use updated packages.\n" + ] + } + ], + "source": [ + "pip install 
\"tensorflow>=2.15.0\" \"tqdm>=4.66\"" + ] + }, + { + "cell_type": "markdown", + "id": "1ba02175", + "metadata": {}, + "source": [ + "下载代码相对较长,大部分是模板代码,没有太多有趣之处。\n", + "因此,我们不在本章中花费宝贵的篇幅讨论从互联网获取文件的Python代码。\n", + "相反,我们将直接从本章的在线仓库下载名为gpt_download.py的Python模块。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fd30c4b4", + "metadata": {}, + "outputs": [], + "source": [ + "import urllib.request\n", + "url = (\n", + " \"https://raw.githubusercontent.com/rasbt/\"\n", + " \"LLMs-from-scratch/main/ch05/\"\n", + " \"01_main-chapter-code/gpt_download.py\"\n", + ")\n", + "filename = url.split('/')[-1]\n", + "urllib.request.urlretrieve(url, filename)" + ] + }, + { + "cell_type": "markdown", + "id": "219396c3", + "metadata": {}, + "source": [ + "接下来,在将此文件下载到Python会话的本地目录后,鼓励读者简要检查这个文件的内容,以确保文件已正确保存并且包含有效的Python代码。" + ] + }, + { + "cell_type": "markdown", + "id": "54f59460", + "metadata": {}, + "source": [ + "现在,我们可以从 gpt_download.py 文件中导入 download_and_load_gpt2 函数,如下代码所示,该函数会将 GPT-2 架构设置(settings)和权重参数(params)加载到我们的 Python 会话中:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "70dcdbce", + "metadata": {}, + "outputs": [], + "source": [ + "from gpt_download import download_and_load_gpt2\n", + "settings, params = download_and_load_gpt2(model_size=\"124M\", models_dir=\"gpt2\")" + ] + }, + { + "cell_type": "markdown", + "id": "86395747", + "metadata": {}, + "source": [ + "执行上述代码会下载与 124M 参数 GPT-2 模型相关的以下 7 个文件:" + ] + }, + { + "cell_type": "markdown", + "id": "1d7e435a", + "metadata": {}, + "source": [ + "checkpoint: 100%|███████████████████████████| 77.0/77.0 [00:00<00:00, 63.9kiB/s] \\\n", + "encoder.json: 100%|█████████████████████████| 1.04M/1.04M [00:00<00:00, 2.20MiB/s] \\\n", + "hparams.json: 100%|██████████████████████████| 90.0/90.0 [00:00<00:00, 78.3kiB/s] \\\n", + "model.ckpt.data-00000-of-00001: 100%|███████| 498M/498M [01:09<00:00, 7.16MiB/s] \\\n", + "model.ckpt.index: 100%|█████████████████████| 5.21k/5.21k [00:00<00:00, 
3.24MiB/s] \\\n", + "model.ckpt.meta: 100%|██████████████████████| 471k/471k [00:00<00:00, 2.46MiB/s] \\\n", + "vocab.bpe: 100%|████████████████████████████| 456k/456k [00:00<00:00, 1.70MiB/s]" + ] + }, + { + "cell_type": "markdown", + "id": "a5a1d0be", + "metadata": {}, + "source": [ + "### 下载指南更新" + ] + }, + { + "cell_type": "markdown", + "id": "db268f67", + "metadata": {}, + "source": [ + "如果下载代码对您不起作用,可能是由于间歇性的互联网连接问题、服务器问题,或者OpenAI分享开源GPT-2模型权重的方式发生了变化。\n", + "在这种情况下,请访问本章的在线代码仓库 https://github.com/rasbt/LLMs-from-scratch 以获取备用和更新的指南,并请通过Manning论坛联系我们以获取更多帮助。" + ] + }, + { + "cell_type": "markdown", + "id": "a689afec", + "metadata": {}, + "source": [ + "前面的代码执行完成后,我们来检查一下settings和params的内容:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "71208769", + "metadata": {}, + "outputs": [], + "source": [ + "print(\"Settings:\", settings)\n", + "print(\"Parameter dictionary keys:\", params.keys())" + ] + }, + { + "cell_type": "markdown", + "id": "98e33ad5", + "metadata": {}, + "source": [ + "内容如下:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8290e4e9", + "metadata": {}, + "outputs": [], + "source": [ + "Settings: {'n_vocab': 50257, 'n_ctx': 1024, 'n_embd': 768, 'n_head': 12, 'n_layer': 12}\n", + "Parameter dictionary keys: dict_keys(['blocks', 'b', 'g', 'wpe', 'wte'])" + ] + }, + { + "cell_type": "markdown", + "id": "9eeabe01", + "metadata": {}, + "source": [ + "settings 和 params 都是Python字典。\n", + "settings 字典存储了与我们手动定义的 GPT_CONFIG_124M 设置相似的LLM架构设置。params 字典包含了实际的权重张量。\n", + "请注意,我们仅打印了字典键,因为打印权重内容会占用太多屏幕空间,然而,我们可以通过 print(params) 打印整个字典或通过相应的字典键选择个别张量来检查这些权重张量,例如,嵌入层权重:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "06eedf61", + "metadata": {}, + "outputs": [], + "source": [ + "print(params[\"wte\"])\n", + "print(\"Token embedding weight tensor dimensions:\", params[\"wte\"].shape)" + ] + }, + { + "cell_type": "markdown", + "id": "e557356c", + "metadata": {}, + "source": [ + 
"token嵌入层的权重如下:" + ] + }, + { + "cell_type": "markdown", + "id": "e048b5f3", + "metadata": {}, + "source": [ + "[[-0.11010301 ... -0.1363697 0.01506208 0.04531523] \\\n", + "[ 0.04034033 ... 0.08605453 0.00253983 0.04318958] \\\n", + "[-0.12746179 ... 0.08991534 -0.12972379 -0.08785918] \\\n", + "... \\\n", + "[-0.04453601 ... 0.10435229 0.09783269 -0.06952604] \\\n", + "[ 0.1860082 ... -0.09625227 0.07847701 -0.02245961] \\\n", + "[ 0.05135201 ... 0.00704835 0.15519823 0.12067825]] \\\n", + "Token embedding weight tensor dimensions: (50257, 768)" + ] + }, + { + "cell_type": "markdown", + "id": "58bce5ac", + "metadata": {}, + "source": [ + "我们通过download_and_load_gpt2(model_size=\"124M\", ...)设置下载并加载了最小的GPT-2模型的权重。\n", + "然而,请注意,OpenAI还分享了更大模型的权重:\"355M\"、\"774M\"和\"1558M\"。这些不同大小的GPT模型的整体架构是相同的,如图5.17所示。" + ] + }, + { + "cell_type": "markdown", + "id": "da22cb09", + "metadata": {}, + "source": [ + "**图5.17 GPT-2大型语言模型有几种不同的模型大小,参数数量从1.24亿到15.58亿不等。\n", + "它们的核心架构是相同的,唯一的区别在于嵌入大小以及像注意力头和变换器块这样的单个组件重复的次数。**" + ] + }, + { + "cell_type": "markdown", + "id": "7663e379", + "metadata": {}, + "source": [ + "![fig5.17](https://github.com/datawhalechina/llms-from-scratch-cn/blob/main/Translated_Book/img/fig-5-17.jpg?raw=true)" + ] + }, + { + "cell_type": "markdown", + "id": "3b826489", + "metadata": {}, + "source": [ + "如图5.17所示,不同大小的GPT-2模型的总体架构保持不变,不同的是各种架构元素重复的次数不同,以及嵌入大小有所不同。\n", + "本章剩余的代码也与这些更大的模型兼容。" + ] + }, + { + "cell_type": "markdown", + "id": "9350010d", + "metadata": {}, + "source": [ + "在将GPT-2模型权重加载到Python之后,我们仍需要将它们从settings和params字典转移到我们的GPTModel实例中。" + ] + }, + { + "cell_type": "markdown", + "id": "2cf4a9ff", + "metadata": {}, + "source": [ + "首先,我们创建一个字典,列出不同 GPT 模型大小之间的差异,如图 5.17 所示:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ac0f6830", + "metadata": {}, + "outputs": [], + "source": [ + "model_configs = {\n", + " \"gpt2-small (124M)\": {\"emb_dim\": 768, \"n_layers\": 12, \"n_heads\": 12},\n", + " \"gpt2-medium (355M)\": 
{\"emb_dim\": 1024, \"n_layers\": 24, \"n_heads\": 16},\n", + " \"gpt2-large (774M)\": {\"emb_dim\": 1280, \"n_layers\": 36, \"n_heads\": 20},\n", + " \"gpt2-xl (1558M)\": {\"emb_dim\": 1600, \"n_layers\": 48, \"n_heads\": 25},\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "c3fa70cb", + "metadata": {}, + "source": [ + "假设我们有兴趣加载最小的模型“gpt2-small (124M)”。我们可以使用 model_configs 表中的相应设置来更新我们在本章前面定义和使用的完整 GPT_CONFIG_124M,操作如下所示:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e6e09cf3", + "metadata": {}, + "outputs": [], + "source": [ + "model_name = \"gpt2-small (124M)\"\n", + "NEW_CONFIG = GPT_CONFIG_124M.copy()\n", + "NEW_CONFIG.update(model_configs[model_name])" + ] + }, + { + "cell_type": "markdown", + "id": "9d4c4686", + "metadata": {}, + "source": [ + "细心的读者可能还记得,我们之前设置了256个token的长度,但OpenAI的原始GPT-2模型是以1,024个token的长度训练的,因此我们必须相应地更新 NEW_CONFIG:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9e413216", + "metadata": {}, + "outputs": [], + "source": [ + "NEW_CONFIG.update({\"context_length\": 1024})" + ] + }, + { + "cell_type": "markdown", + "id": "a6f47d6c", + "metadata": {}, + "source": [ + "此外,OpenAI在多头注意力模块的线性层中使用偏置向量来实现查询(query)、键(key)和值(value)矩阵的计算。\n", + "在大型语言模型(LLM)中,偏置向量已不常使用,因为它们并未提升模型性能,因此是不必要的。\n", + "然而,由于我们正在使用预训练的权重,我们需要匹配设置以保持一致性,并启用这些偏置向量:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7ecc7e04", + "metadata": {}, + "outputs": [], + "source": [ + "NEW_CONFIG.update({\"qkv_bias\": True})" + ] + }, + { + "cell_type": "markdown", + "id": "a54c6405", + "metadata": {}, + "source": [ + "我们现在可以使用更新后的 NEW_CONFIG 字典来初始化新的 GPTModel 实例:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "95671bf2", + "metadata": {}, + "outputs": [], + "source": [ + "gpt = GPTModel(NEW_CONFIG)\n", + "gpt.eval()" + ] + }, + { + "cell_type": "markdown", + "id": "da55b5b1", + "metadata": {}, + "source": [ + "默认情况下,GPTModel 实例使用随机权重进行初始化以进行预训练。\n", + "使用 OpenAI 
模型权重的最后一步是用我们加载到 params 字典中的权重覆盖这些随机权重。" + ] + }, + { + "cell_type": "markdown", + "id": "7056e320", + "metadata": {}, + "source": [ + "为此,我们首先定义一个小的分配实用函数,该函数检查两个张量或数组(左侧和右侧)是否具有相同的维度或形状,并返回右侧张量作为可训练的PyTorch参数:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f0e57649", + "metadata": {}, + "outputs": [], + "source": [ + "def assign(left, right):\n", + " if left.shape != right.shape:\n", + " raise ValueError(f\"Shape mismatch. Left: {left.shape}, Right: {right.shape}\")\n", + " return torch.nn.Parameter(torch.tensor(right))" + ] + }, + { + "cell_type": "markdown", + "id": "92eb189b", + "metadata": {}, + "source": [ + "接下来,我们定义一个名为 load_weights_into_gpt 函数,该函数将 params 字典中的权重加载到 GPTModel 实例 gpt 中:" + ] + }, + { + "cell_type": "markdown", + "id": "733da119", + "metadata": {}, + "source": [ + "### 代码示例5.5 将 OpenAI 权重加载到 GPT 模型代码中" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a6b3474b", + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "\n", + "def load_weights_into_gpt(gpt, params):\n", + " gpt.pos_emb.weight = assign(gpt.pos_emb.weight, params['wpe']) #A\n", + " gpt.tok_emb.weight = assign(gpt.tok_emb.weight, params['wte'])\n", + "\n", + " for b in range(len(params[\"blocks\"])): #B\n", + " q_w, k_w, v_w = np.split( #C\n", + " (params[\"blocks\"][b][\"attn\"][\"c_attn\"])[\"w\"], 3, axis=-1)\n", + " gpt.trf_blocks[b].att.W_query.weight = assign(\n", + " gpt.trf_blocks[b].att.W_query.weight, q_w.T)\n", + " gpt.trf_blocks[b].att.W_key.weight = assign(\n", + " gpt.trf_blocks[b].att.W_key.weight, k_w.T)\n", + " gpt.trf_blocks[b].att.W_value.weight = assign(\n", + " gpt.trf_blocks[b].att.W_value.weight, v_w.T)\n", + "\n", + " q_b, k_b, v_b = np.split(\n", + " (params[\"blocks\"][b][\"attn\"][\"c_attn\"])[\"b\"], 3, axis=-1)\n", + " gpt.trf_blocks[b].att.W_query.bias = assign(\n", + " gpt.trf_blocks[b].att.W_query.bias, q_b)\n", + " gpt.trf_blocks[b].att.W_key.bias = assign(\n", + " 
gpt.trf_blocks[b].att.W_key.bias, k_b)\n", + " gpt.trf_blocks[b].att.W_value.bias = assign(\n", + " gpt.trf_blocks[b].att.W_value.bias, v_b)\n", + "\n", + " gpt.trf_blocks[b].att.out_proj.weight = assign(\n", + " gpt.trf_blocks[b].att.out_proj.weight,\n", + " params[\"blocks\"][b][\"attn\"][\"c_proj\"][\"w\"].T)\n", + " gpt.trf_blocks[b].att.out_proj.bias = assign(\n", + " gpt.trf_blocks[b].att.out_proj.bias,\n", + " params[\"blocks\"][b][\"attn\"][\"c_proj\"][\"b\"])\n", + "\n", + " gpt.trf_blocks[b].ff.layers[0].weight = assign(\n", + " gpt.trf_blocks[b].ff.layers[0].weight,\n", + " params[\"blocks\"][b][\"mlp\"][\"c_fc\"][\"w\"].T)\n", + " gpt.trf_blocks[b].ff.layers[0].bias = assign(\n", + " gpt.trf_blocks[b].ff.layers[0].bias,\n", + " params[\"blocks\"][b][\"mlp\"][\"c_fc\"][\"b\"])\n", + " gpt.trf_blocks[b].ff.layers[2].weight = assign(\n", + " gpt.trf_blocks[b].ff.layers[2].weight,\n", + " params[\"blocks\"][b][\"mlp\"][\"c_proj\"][\"w\"].T)\n", + " gpt.trf_blocks[b].ff.layers[2].bias = assign(\n", + " gpt.trf_blocks[b].ff.layers[2].bias,\n", + " params[\"blocks\"][b][\"mlp\"][\"c_proj\"][\"b\"])\n", + " gpt.trf_blocks[b].norm1.scale = assign(\n", + " gpt.trf_blocks[b].norm1.scale,\n", + " params[\"blocks\"][b][\"ln_1\"][\"g\"])\n", + " \n", + " gpt.trf_blocks[b].norm1.shift = assign(\n", + " gpt.trf_blocks[b].norm1.shift,\n", + " params[\"blocks\"][b][\"ln_1\"][\"b\"])\n", + " gpt.trf_blocks[b].norm2.scale = assign(\n", + " gpt.trf_blocks[b].norm2.scale,\n", + " params[\"blocks\"][b][\"ln_2\"][\"g\"])\n", + " gpt.trf_blocks[b].norm2.shift = assign(\n", + " gpt.trf_blocks[b].norm2.shift,\n", + " params[\"blocks\"][b][\"ln_2\"][\"b\"])\n", + "\n", + " gpt.final_norm.scale = assign(gpt.final_norm.scale, params[\"g\"])\n", + " gpt.final_norm.shift = assign(gpt.final_norm.shift, params[\"b\"])\n", + " gpt.out_head.weight = assign(gpt.out_head.weight, params[\"wte\"]) #D" + ] + }, + { + "cell_type": "markdown", + "id": "48238389", + "metadata": {}, + "source": [ 
+ "在 load_weights_into_gpt 函数中,我们仔细地将 OpenAI 实现的权重与 GPTModel 实现相匹配。\n", + "举一个具体的例子,OpenAI 将第一个 Transformer 块的输出投影层的权重张量存储为 params[\"blocks\"][0][\"attn\"][\"c_proj\"][\"w\"]。\n", + "在我们的实现中,这个权重张量对应于gpt.trf_blocks[0].att.out_proj.weight,其中 gpt 是 GPTModel 实例。" + ] + }, + { + "cell_type": "markdown", + "id": "9d951581", + "metadata": {}, + "source": [ + "由于 OpenAI 使用的命名约定与我们的命名约定略有不同,因此开发 load_weights_into_gpt 函数需要进行大量猜测。\n", + "但是,如果我们尝试匹配两个具有不同维度的张量,则assign函数会抛出 ValueError 来提醒我们。\n", + "此外,如果我们在此函数中犯了错误,我们会注意到这一点,因为生成的 GPT 模型将无法生成连贯的文本。" + ] + }, + { + "cell_type": "markdown", + "id": "51ff6b06", + "metadata": {}, + "source": [ + "让我们不要只是理论上讨论 load_weights_into_gpt,而是实际操作一下,将 OpenAI 的模型权重加载到我们的 GPTModel 实例 gpt 中:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1ce3eb5f", + "metadata": {}, + "outputs": [], + "source": [ + "load_weights_into_gpt(gpt, params)\n", + "gpt.to(device)" + ] + }, + { + "cell_type": "markdown", + "id": "c17561ae", + "metadata": {}, + "source": [ + "如果模型加载正确,我们现在可以使用之前的生成函数来生成新文本:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "44087302", + "metadata": {}, + "outputs": [], + "source": [ + "torch.manual_seed(123)\n", + "token_ids = generate(\n", + " model=gpt,\n", + " idx=text_to_token_ids(\"Every effort moves you\", tokenizer),\n", + " max_new_tokens=25,\n", + " context_size=NEW_CONFIG[\"context_length\"],\n", + " top_k=50,\n", + " temperature=1.5\n", + ")\n", + "print(\"Output text:\\n\", token_ids_to_text(token_ids, tokenizer))" + ] + }, + { + "cell_type": "markdown", + "id": "661fc904", + "metadata": {}, + "source": [ + "输出文本如下:" + ] + }, + { + "cell_type": "markdown", + "id": "fed36ab9", + "metadata": {}, + "source": [ + "Output text: \\\n", + "Every effort moves you toward finding an ideal new way to practice something! \\\n", + "What makes us want to be on top of that?" 
+ ] + }, + { + "cell_type": "markdown", + "id": "7ad4ddf7", + "metadata": {}, + "source": [ + "我们可以确信模型权重加载正确,因为模型能够生成连贯的文本。\n", + "在这个过程中的任何一个微小错误都会导致模型失败。" + ] + }, + { + "cell_type": "markdown", + "id": "01500d1d", + "metadata": {}, + "source": [ + "在接下来的章节中,我们将继续使用这个预训练模型,并对其进行微调,以便进行文本分类和执行指令。" + ] + }, + { + "cell_type": "markdown", + "id": "40edb065", + "metadata": {}, + "source": [ + "### 练习 5.5" + ] + }, + { + "cell_type": "markdown", + "id": "175548ee", + "metadata": {}, + "source": [ + "使用 OpenAI 在The Verdict数据集上的预训练权重计算 GPTModel 的训练和验证集损失。" + ] + }, + { + "cell_type": "markdown", + "id": "aa6f4836", + "metadata": {}, + "source": [ + "### 练习 5.6" + ] + }, + { + "cell_type": "markdown", + "id": "6ccc9404", + "metadata": {}, + "source": [ + "请尝试不同大小的 GPT-2 模型,例如最大的 1558M 参数模型,并将生成的文本与我们在本章中加载的 124M 模型进行比较。" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python (cell)", + "language": "python", + "name": "cell" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/Translated_Book/ch05/5.6 小结.ipynb b/Translated_Book/ch05/5.6 小结.ipynb new file mode 100644 index 0000000..bc95db4 --- /dev/null +++ b/Translated_Book/ch05/5.6 小结.ipynb @@ -0,0 +1,89 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "af5bc148", + "metadata": {}, + "source": [ + "# 5.6 小结" + ] + }, + { + "cell_type": "markdown", + "id": "21cbb4fc", + "metadata": {}, + "source": [ + "- 当大型语言模型(LLM)生成文本时,它们每次输出一个token" + ] + }, + { + "cell_type": "markdown", + "id": "69f0f948", + "metadata": {}, + "source": [ + "- 在默认情况下,通过将模型输出转换为概率分数,并选择对应于最高概率分数的词汇,来生成下一个token,这种方法称为“贪婪解码”。" + ] + }, + { + "cell_type": "markdown", + "id": "d29d231f", + "metadata": {}, + "source": [ + "- 通过使用概率采样和温度缩放,我们可以影响生成文本的多样性和连贯性。" + ] + 
}, + { + "cell_type": "markdown", + "id": "2df2ffa4", + "metadata": {}, + "source": [ + "- 在训练过程中,可以使用训练集和验证集的损失(loss)来衡量大语言模型生成文本的质量。" + ] + }, + { + "cell_type": "markdown", + "id": "8b8cc1eb", + "metadata": {}, + "source": [ + "- 预训练一个大语言模型涉及改变其权重的操作以最小化训练损失。" + ] + }, + { + "cell_type": "markdown", + "id": "c7d5979d", + "metadata": {}, + "source": [ + "- 大语言模型的训练循环本身是深度学习中的标准程序,这个过程使用了常规的交叉熵损失和AdamW优化器。" + ] + }, + { + "cell_type": "markdown", + "id": "0ff620fb", + "metadata": {}, + "source": [ + "- 由于在大型文本语料库上预训练大语言模型既耗时又耗资源,因此我们可以选择加载OpenAI的公开可用的权重,作为自己在大数据集上预训练模型的替代方案。" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python (cell)", + "language": "python", + "name": "cell" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}