From d46a09335f3f924bd02ef1309ee457e3ecf37d26 Mon Sep 17 00:00:00 2001
From: aJupyter <ajupyter@163.com>
Date: Mon, 4 Mar 2024 13:55:56 +0800
Subject: [PATCH] feat: update ch02-02-03

---
 .../bpe_openai_gpt2-checkpoint.py             | 174 -------
 .../compare-bpe-tiktoken-checkpoint.ipynb     | 442 ----------------
 ch02/02_bonus_bytepair-encoder/README.md      |   6 +-
 .../bpe_openai_gpt2.cpython-311.pyc           | Bin 12758 -> 0 bytes
 .../bpe_openai_gpt2.py                        |  24 +-
 .../compare-bpe-tiktoken.ipynb                | 149 +++---
 ...eddings-and-linear-layers-checkpoint.ipynb | 486 ------------------
 ch02/03_bonus_embedding-vs-matmul/README.md   |   5 +-
 .../embeddings-and-linear-layers.ipynb        | 140 ++---
 9 files changed, 173 insertions(+), 1253 deletions(-)
 delete mode 100644 ch02/02_bonus_bytepair-encoder/.ipynb_checkpoints/bpe_openai_gpt2-checkpoint.py
 delete mode 100644 ch02/02_bonus_bytepair-encoder/.ipynb_checkpoints/compare-bpe-tiktoken-checkpoint.ipynb
 delete mode 100644 ch02/02_bonus_bytepair-encoder/__pycache__/bpe_openai_gpt2.cpython-311.pyc
 delete mode 100644 ch02/03_bonus_embedding-vs-matmul/.ipynb_checkpoints/embeddings-and-linear-layers-checkpoint.ipynb

diff --git a/ch02/02_bonus_bytepair-encoder/.ipynb_checkpoints/bpe_openai_gpt2-checkpoint.py b/ch02/02_bonus_bytepair-encoder/.ipynb_checkpoints/bpe_openai_gpt2-checkpoint.py
deleted file mode 100644
index f3d9575..0000000
--- a/ch02/02_bonus_bytepair-encoder/.ipynb_checkpoints/bpe_openai_gpt2-checkpoint.py
+++ /dev/null
@@ -1,174 +0,0 @@
-"""
-Byte pair encoding utilities
-
-Code from https://github.com/openai/gpt-2/blob/master/src/encoder.py
-
-And modified code (download_vocab) from
-https://github.com/openai/gpt-2/blob/master/download_model.py
-
-Modified MIT License
-
-Software Copyright (c) 2019 OpenAI
-
-We don’t claim ownership of the content you create with GPT-2, so it is yours to do with as you please.
-We only ask that you use GPT-2 responsibly and clearly indicate your content was created using GPT-2.
-
-Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
-associated documentation files (the "Software"), to deal in the Software without restriction,
-including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
-and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so,
-subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included
-in all copies or substantial portions of the Software.
-The above copyright notice and this permission notice need not be included
-with content created by the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
-INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
-BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
-TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
-OR OTHER DEALINGS IN THE SOFTWARE.
-
-
-"""
-
-import os
-import json
-import regex as re
-import requests
-from tqdm import tqdm
-from functools import lru_cache
-
-@lru_cache()
-def bytes_to_unicode():
-    """
-    Returns list of utf-8 byte and a corresponding list of unicode strings.
-    The reversible bpe codes work on unicode strings.
-    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
-    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
-    This is a significant percentage of your normal, say, 32K bpe vocab.
-    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
-    And avoids mapping to whitespace/control characters the bpe code barfs on.
-    """
-    bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
-    cs = bs[:]
-    n = 0
-    for b in range(2**8):
-        if b not in bs:
-            bs.append(b)
-            cs.append(2**8+n)
-            n += 1
-    cs = [chr(n) for n in cs]
-    return dict(zip(bs, cs))
-
-def get_pairs(word):
-    """Return set of symbol pairs in a word.
-
-    Word is represented as tuple of symbols (symbols being variable-length strings).
-    """
-    pairs = set()
-    prev_char = word[0]
-    for char in word[1:]:
-        pairs.add((prev_char, char))
-        prev_char = char
-    return pairs
-
-class Encoder:
-    def __init__(self, encoder, bpe_merges, errors='replace'):
-        self.encoder = encoder
-        self.decoder = {v:k for k,v in self.encoder.items()}
-        self.errors = errors # how to handle errors in decoding
-        self.byte_encoder = bytes_to_unicode()
-        self.byte_decoder = {v:k for k, v in self.byte_encoder.items()}
-        self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
-        self.cache = {}
-
-        # Should haved added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
-        self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
-
-    def bpe(self, token):
-        if token in self.cache:
-            return self.cache[token]
-        word = tuple(token)
-        pairs = get_pairs(word)
-
-        if not pairs:
-            return token
-
-        while True:
-            bigram = min(pairs, key = lambda pair: self.bpe_ranks.get(pair, float('inf')))
-            if bigram not in self.bpe_ranks:
-                break
-            first, second = bigram
-            new_word = []
-            i = 0
-            while i < len(word):
-                try:
-                    j = word.index(first, i)
-                    new_word.extend(word[i:j])
-                    i = j
-                except:
-                    new_word.extend(word[i:])
-                    break
-
-                if word[i] == first and i < len(word)-1 and word[i+1] == second:
-                    new_word.append(first+second)
-                    i += 2
-                else:
-                    new_word.append(word[i])
-                    i += 1
-            new_word = tuple(new_word)
-            word = new_word
-            if len(word) == 1:
-                break
-            else:
-                pairs = get_pairs(word)
-        word = ' '.join(word)
-        self.cache[token] = word
-        return word
-
-    def encode(self, text):
-        bpe_tokens = []
-        for token in re.findall(self.pat, text):
-            token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8'))
-            bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(' '))
-        return bpe_tokens
-
-    def decode(self, tokens):
-        text = ''.join([self.decoder[token] for token in tokens])
-        text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors=self.errors)
-        return text
-
-def get_encoder(model_name, models_dir):
-    with open(os.path.join(models_dir, model_name, 'encoder.json'), 'r') as f:
-        encoder = json.load(f)
-    with open(os.path.join(models_dir, model_name, 'vocab.bpe'), 'r', encoding="utf-8") as f:
-        bpe_data = f.read()
-    bpe_merges = [tuple(merge_str.split()) for merge_str in bpe_data.split('\n')[1:-1]]
-    return Encoder(
-        encoder=encoder,
-        bpe_merges=bpe_merges,
-    )
-
-
-def download_vocab():
-    # Modified code from
-    subdir = 'gpt2_model'
-    if not os.path.exists(subdir):
-        os.makedirs(subdir)
-    subdir = subdir.replace('\\','/') # needed for Windows
-
-    for filename in ['encoder.json', 'vocab.bpe']:
-
-        r = requests.get("https://openaipublic.blob.core.windows.net/gpt-2/models/117M" + "/" + filename, stream=True)
-
-        with open(os.path.join(subdir, filename), 'wb') as f:
-            file_size = int(r.headers["content-length"])
-            chunk_size = 1000
-            with tqdm(ncols=100, desc="Fetching " + filename, total=file_size, unit_scale=True) as pbar:
-                # 1k for chunk_size, since Ethernet packet size is around 1500 bytes
-                for chunk in r.iter_content(chunk_size=chunk_size):
-                    f.write(chunk)
-                    pbar.update(chunk_size)
diff --git a/ch02/02_bonus_bytepair-encoder/.ipynb_checkpoints/compare-bpe-tiktoken-checkpoint.ipynb b/ch02/02_bonus_bytepair-encoder/.ipynb_checkpoints/compare-bpe-tiktoken-checkpoint.ipynb
deleted file mode 100644
index 7448afb..0000000
--- a/ch02/02_bonus_bytepair-encoder/.ipynb_checkpoints/compare-bpe-tiktoken-checkpoint.ipynb
+++ /dev/null
@@ -1,442 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "id": "a9adc3bf-353c-411e-a471-0e92786e7103",
-   "metadata": {},
-   "source": [
-    "# Using BytePair encodding from `tiktoken`"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "id": "4036ffa3-0e5c-433a-a997-4ed7d33de0b2",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# !pip install tiktoken"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 4,
-   "id": "1c490fca-a48a-47fa-a299-322d1a08ad17",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "tiktoken version: 0.5.2\n"
-     ]
-    }
-   ],
-   "source": [
-    "import importlib.metadata\n",
-    "\n",
-    "print(\"tiktoken version:\", importlib.metadata.version(\"tiktoken\"))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 5,
-   "id": "0952667c-ce84-4f21-87db-59f52b44cec4",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import tiktoken\n",
-    "\n",
-    "tik_tokenizer = tiktoken.get_encoding(\"gpt2\")\n",
-    "\n",
-    "text = \"Hello, world. Is this-- a test?\""
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 6,
-   "id": "b039c350-18ad-48fb-8e6a-085702dfc330",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "[15496, 11, 995, 13, 1148, 428, 438, 257, 1332, 30]\n"
-     ]
-    }
-   ],
-   "source": [
-    "integers = tik_tokenizer.encode(text, allowed_special={\"<|endoftext|>\"})\n",
-    "\n",
-    "print(integers)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 7,
-   "id": "7b152ba4-04d3-41cc-849f-adedcfb8cabb",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Hello, world. Is this-- a test?\n"
-     ]
-    }
-   ],
-   "source": [
-    "strings = tik_tokenizer.decode(integers)\n",
-    "\n",
-    "print(strings)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 8,
-   "id": "cf148a1a-316b-43ec-b7ba-1b6d409ce837",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "50257\n"
-     ]
-    }
-   ],
-   "source": [
-    "print(tik_tokenizer.n_vocab)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "6a0b5d4f-2af9-40de-828c-063c4243e771",
-   "metadata": {},
-   "source": [
-    "# Using the original Byte-pair encoding implementation used in GPT-2"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 9,
-   "id": "0903108c-65cb-4ae1-967a-2155e25349c2",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from bpe_openai_gpt2 import get_encoder, download_vocab"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 10,
-   "id": "35dd8d7c-8c12-4b68-941a-0fd05882dd45",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Fetching encoder.json: 1.04Mit [00:28, 36.8kit/s]                                                   \n",
-      "Fetching vocab.bpe: 457kit [00:00, 458kit/s]                                                        \n"
-     ]
-    }
-   ],
-   "source": [
-    "download_vocab()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 14,
-   "id": "1888a7a9-9c40-4fe0-99b4-ebd20aa1ffd0",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "orig_tokenizer = get_encoder(model_name=\"gpt2_model\", models_dir=\".\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 15,
-   "id": "2740510c-a78a-4fba-ae18-2b156ba2dfef",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "[15496, 11, 995, 13, 1148, 428, 438, 257, 1332, 30]\n"
-     ]
-    }
-   ],
-   "source": [
-    "integers = orig_tokenizer.encode(text)\n",
-    "\n",
-    "print(integers)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 16,
-   "id": "434d115e-990d-42ad-88dd-31323a96e10f",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Hello, world. Is this-- a test?\n"
-     ]
-    }
-   ],
-   "source": [
-    "strings = orig_tokenizer.decode(integers)\n",
-    "\n",
-    "print(strings)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "4f63e8c6-707c-4d66-bcf8-dd790647cc86",
-   "metadata": {},
-   "source": [
-    "# Using the BytePair Tokenizer in HuggingFace transformers"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 12,
-   "id": "5bfff386-f725-4137-9c50-e5da0c38bea0",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# pip install transformers"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 13,
-   "id": "e9077bf4-f91f-42ad-ab76-f3d89128510e",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "'4.30.2'"
-      ]
-     },
-     "execution_count": 13,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "import transformers\n",
-    "\n",
-    "transformers.__version__"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "16e06ee5-c4ca-4211-8e24-dbfd84b1d85b",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "设置为国内可访问"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "id": "3e07ddc9-187e-4482-a7b5-7e4e9381d805",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "env: HF_ENDPOINT=https://hf-mirror.com\n"
-     ]
-    }
-   ],
-   "source": [
-    "%env HF_ENDPOINT=https://hf-mirror.com"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "id": "a9839137-b8ea-4a2c-85fc-9a63064cf8c8",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "afc151b540664287aa60a4cbe90cdfeb",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "vocab.json: 0.00B [00:00, ?B/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "9a5d584e4adf40bca215b409b693dc02",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "merges.txt: 0.00B [00:00, ?B/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "a126ee77a9f94e58b1dcccd68e6d5bb1",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "config.json:   0%|          | 0.00/367 [00:00<?, ?B/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    }
-   ],
-   "source": [
-    "from transformers import GPT2Tokenizer\n",
-    "\n",
-    "hf_tokenizer = GPT2Tokenizer.from_pretrained(\"gpt2\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 11,
-   "id": "222cbd69-6a3d-4868-9c1f-421ffc9d5fe1",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "[15496, 11, 995, 13, 1148, 428, 438, 257, 1332, 30]"
-      ]
-     },
-     "execution_count": 11,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "hf_tokenizer(strings)[\"input_ids\"]"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "907a1ade-3401-4f2e-9017-7f58a60cbd98",
-   "metadata": {},
-   "source": [
-    "# A quick performance benchmark"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 17,
-   "id": "a61bb445-b151-4a2f-8180-d4004c503754",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "with open('../01_main-chapter-code/the-verdict.txt', 'r', encoding='utf-8') as f:\n",
-    "    raw_text = f.read()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 18,
-   "id": "57f7c0a3-c1fd-4313-af34-68e78eb33653",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "9.14 ms ± 74.5 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
-     ]
-    }
-   ],
-   "source": [
-    "%timeit orig_tokenizer.encode(raw_text)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "036dd628-3591-46c9-a5ce-b20b105a8062",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "%timeit tik_tokenizer.encode(raw_text)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "b9c85b58-bfbc-465e-9a7e-477e53d55c90",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "%timeit hf_tokenizer(raw_text)[\"input_ids\"]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "7117107f-22a6-46b4-a442-712d50b3ac7a",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "%timeit hf_tokenizer(raw_text, max_length=5145, truncation=True)[\"input_ids\"]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "d81eaf6d-554b-44e3-aa19-2c3ae0030762",
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.11.5"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
diff --git a/ch02/02_bonus_bytepair-encoder/README.md b/ch02/02_bonus_bytepair-encoder/README.md
index 30a1946..c68e7b1 100644
--- a/ch02/02_bonus_bytepair-encoder/README.md
+++ b/ch02/02_bonus_bytepair-encoder/README.md
@@ -1,7 +1,7 @@
-# Chapter 2: Working with Text Data
+# 第2章:使用文本数据
 
 
 
-- [compare-bpe-tiktoken.ipynb](compare-bpe-tiktoken.ipynb) benchmarks various byte pair encoding implementations
-- [bpe_openai_gpt2.py](bpe_openai_gpt2.py) is the original bytepair encoder code used by OpenAI
+- [compare-bpe-tiktoken.ipynb](compare-bpe-tiktoken.ipynb) 对各种字节对编码实现进行基准测试
+- [bpe_openai_gpt2.py](bpe_openai_gpt2.py) 是OpenAI使用的原始字节对编码器代码
 
diff --git a/ch02/02_bonus_bytepair-encoder/__pycache__/bpe_openai_gpt2.cpython-311.pyc b/ch02/02_bonus_bytepair-encoder/__pycache__/bpe_openai_gpt2.cpython-311.pyc
deleted file mode 100644
index 7b7667219a927a1d7a4bc54cace1c0c261eeeafa..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 12758
zcmbtaeQX;?cHiZfNa};umo3GTv$mBuiYdu*;@EL~PAG}87|WC_$?;jS&5+!cM4KYz
z-IZmlRH7byINGN)aw@~jA?+zZnp2vj^`SYyI21T1m!c`qD@bDxVF?H@u1%5rRRvA}
z!~e9uH~bJuDZb=xxqLf2JM-qfH#6^JW`0p#Zsl<O=${hO!G}2Rzvv}<88V5x3rJk&
zL~fW9c~KYShk5qY4eQubKWt!6<FFA=eb6*+8aDA%)(|WiHxHY6PRB{+cYx)+EN)IT
zUg8RKhApD$5;ttsafdju<Ps;E-_fB4{^@JDRJ7oGnP>$p7j=VNbLphZ#BtV+i>f3{
zcmuK^h5TVL5E>OG)j%+y1|-F5?F@^OFd~P?g)vo~Q1-XAjRw@QNnfi!Jl+<bkV4)-
z+vtSavb)U}4Ex%~y^1QyZHnw~W7s6Qb>gDc>IjL#IB<^yBvC*klJJ-qo(cuSUeR+t
z?DzVbSsT{x(?$*f5KBSU%#mypM_fZfZ@@2w6v=8G43DT&URe@4!xI<f!04DNJmzl}
zcJFF`TId4@4wuz>ToS}^=nu1RsDeM}4U7w*PLh?ez=RMU5!5jWREJb4qzV_qlY(EC
zyb#(HBq1E?A8Ofc7nHCNP=$a(-(^Km!@!p=;AMHjL{Rc7QY*C-4h1g?Uga$CdeaCe
z6)BBKkR@dz98v;4DgvqbQB6iV5E27^YLxKjh?xTNG%=zusZiH3a$2p{en}n=C`uq4
zBBI75S@K;JMrALU5bY4Q1Tsbh|Cm=EmFz^WH*`^$0L!Ql_Nm@Lh_HFl(!|BI2!KY(
zN)nV=y^0d{2SAbtR{fLX5RO-+_C^9hNf90+p>4~`ep@qZM3lTiu)+kI{lugco>Yl-
zRSx(Ghus<o`Gb>eFp`w%b1*O-$h1JHSw}KKLwI&Zr#(H!F5)RMwk9Th!GJPm7sLRe
z_D!lNsZb^x8g?uAXba1NA_aql0V6)!-C64FtQR03$Oi3aM8w*i8pEt0QnF=9(j${{
z2u(sL=^>zm?N+pSM)IqaL&ZnJ!C-ib#EUT{VxFPD_gk$)_~iA4&(l!Lj?++B1*vHg
z#(<p>k?z&ZCuPhF+I<qFN&=HaNKT-vEIZ(C(x<5C_yA--5tdnd#ca0ziOq*3a0*fi
zC}8s<J72QH6T>fGRv4l~Jx*b;uY2gYW56l628I5Cz873w&Msk_V-WA#?80%^P*2~n
zAps=@9PXhLLSMJwaGwwkyWCxN!TH1f0q5YL&^I8sj`a7soLzRS%iY;~tjpy-By^yj
zyANu>1!VxFLwy1@m_g-oqF%Rf#5vH}1I&&NSFda6gx%Wh8gdiPZXk6C{f>blSLd-_
z$AHj(Y@oky(23@|fY$ADcMqTy=MkrSs8w*ektaA`z>6^0<LK>WO*xK%`T<5?XJ7w`
z0oS3PA)%+Qx63&&Xzg%<GDk<RGu;%J>g;v6j@X4R#}UUNqHaLw1Hl8VSeiQFc#o51
zp*;ug&LLNyo7m{=a}N#R)ovZ?8yLz}J?<KG+6BjeYmfxeJ<tdABu>;InYpQ++nL5f
zqGnX*dIUvCV8;fX)*LNePDd|rp?ll~h0&YVFUiojoB5PMJtvOiX$i`c9>3Q=Ciyc)
zh&?QTdi?L6gmJpgsr>3kmGU!}YRslM7{hl^>b+daf^WLQZ=!x06$_1DVTte5qM@ix
zJ~hU|Z-2Z+qUbxdYAVEBz-LL}x9`^K3>SebAew)ytIvsZh8Ha}x_nuS)<WD}+^Kyt
z`iMTlo#*A<5pFTBUvg-;h`z8E&qet4ON!Qrj`d1GFQU8-T$FR%jBf5}vGfw^cNMuc
zGrUUUC3C5z*%Sx9BK(7T84gaFcRz-hRlqeMsZh)cjEJI=?weFcTJ{M(a(&3klAV*&
zrkB~AT(MAqc`IZ+LZeD6!$pc(md?Xg!b(emZ-O+o2-7hYme0bt3zO^Dg8m+tydZ*H
z2&Oo_LXhmQu$_^VsYN!~>xUPmkfmhip1DXuV1(72f+}YQ`+PVc3dh`sGh`hflUVT`
z@O0o-DB*EQ9V06f44g%d(7vk!)y|@j=vCp0q&*J_+)qw0a+txF!;{F_a~M`i7DNel
z5*`tZlI$IY^FpE=B144RD=2}{5L`Mx5d_ObWl@TFVN{30^0+q$gX@Jweq#4wCMm{B
z1`(VN#sj%Qc3}z*6=4d7!)HN}>V<F>*cEk30?n%(hL(xu^<?azt|XL@w0B~H7=q26
z8VjiKs=R)wjf|2UhH+e#F&U+77YaVFJOURWlqT>#zct0b{3WTKZ^)7U_^tHg{q*A>
z*yFdqam)1aOAh7yITeK*QW@3w-?al==SDGPPgP<-=as<_#@ni5yQ-V38sy&Ppgr~e
zF}Ycn(zWhN@gap;5QJ}D!lRA4uf5HGcB*Yx`{!@};){R$;ESc%|5<wL^S}LRTW{|X
zrG?z}7R4`nRsUF<e{9$8wq3hDzHn$#@z97TU%w^eowtE?Puh+5z>VJxH~PNy01bcm
z_Y()Fo2Y`~QNx~$z*`Rl;jagkgGgjcJqxOoEdaAz+|ro7=a0U+J?g#u=-i{Zth*?|
zHJeffs+cmu7af&S`mijfOz12O#FPO(uA0(M2PT>gDV<MA>HLaZg<2`zr%*se<4-1;
zrZyB4CO?d9lC)9=k<N18Y~-vJuS(ZCWBRK-3q611U)=u2*qdXE=iWGzsA$$IniH1C
zHOu24Jozi*ulzTgei`^M@GsT3hCZtQr2f<Pk2WWE9np3jN$hlMJKYJ3`*-T^hfe%{
z_*DF~FEQ-bhW)>n#^QnTgcQ=GP&_=5kR}pC=d_`7i9T8DlM@zswj*gNjXLNzSBn?3
zW!AA=UN!&Z<rn8(T;UAHY6hZwvb1czBl=AAnPgeTYpyG<dGF=kx!!1R_En6vUmZ!5
zHELyz@v_EbS^0eB<?g8CUXqDAwhi)4SQ<4;<7~%rsqLD6z9Xi)TDnl0C~eS68{&or
zxfW7cGcrk~-+dXN&;%Dol`}ld2cKs;ba;o-&NnA87Mo!#_{F?&4YZjnqKoK9icB;V
zzaA>TA;LpR8>VN|${AibQ_af7aUZ5AjZ&roy`)z~co@|A$B`uEBTExd*HAccKM@0|
zlL)U!@uVvw`p71H5@~_+UO7Peq$MbYMiI1S^m%i7p0oKYn?bP~GfodWQhKi_VhkFn
z>6DStlCq%5^Bz(iDFeZx5osBfR1YgD?*OU}{FQzH3`HwfzTvWS&N=H$Zm5|*x==Ff
zVu|^I`GE!V3a2yHGY~Z-&E>CIu2|-uj*0QA?Fn;}W^Rg`o02tkQA^a4EZs2Mo9<2^
z&RwJ&=(_tDz;#aIu*AS&d7(tqUE+uJq8=f?K{NmwY2jelB$^<2bE?Ei0ly@tN+8Z4
z^rN5VSoWf&4K_%qW3R*7MmSIQ8wLfeHFqZWzJBKy4yIz;4t!b4F(B|pt+6ZL<`?2X
zshlSU?udrm`=3Yy)<80?+OjElYn7|%h)yMyletuyO_?iFG_VVm`GdR@@R9Rn0^q_^
zmc^-V&Zao-Ohq;=7WrDJk)u}RXd1Z-W;k^uvprarLg!Hvr_AJV&+s!k%t`*Qb#s<M
zP9`J_ysMXq<rIH5#h+KGC$g#*_JGuK#vWvbOsmI(6#~`c$+HK!Qb$ocYY#RZh_m}G
z{UX1(HO9xbrr!bA>ZR{ZwcJ8Y<x?x~Le6y4cOi2qkIaTcady8ow0Lxhj~$JjOTPm$
zGN+&4p+t755ja<o9p~{MkL(BrBRj;1@a)Nnzv!LWnNGM_;>FV^6_yL&o;kHMa#Gp(
z*t1(t4mMM&c|)p1%FH|?T$6DnRRT|o%|w$V%VAkz#w=AvHqMhRntsa^l&N238Y=Q8
zfRrAJEoC8N4zqezNf}w(mD0&ls)TwDOEM|_gjXe<hSiP{c{kOvQU$hFqR4xYQKZq#
z*-oC`P%O_rD*ZmdESKE0C2E)}OR{cWRNsz#w&UsBJDyJLct+dt47xe7>43KB04uVw
z0guw<2798RRcmOC8Z>i#lF5B(NNYLxSxeXLmaargx7N}P!6zCHX$^;1q2>BU81TnG
zkbc#9)B4N45Bn0WU0Q2bVvAGT;!M<cYxUj0T#>BZgopJT6IZiwVSju}I8i;JRZk$#
z9K|m^;Bq<#)YIH&I&KK?Crry-&eLk=Ik`OlJ(6b%2f6$Rs@Rc3e3jEnh*gU81jSYg
zCgC7w;nWh)khvoIC^v#d;!EcD^@ZAF#t<=plAL}<T&A1nLv0ZQo7Bb`(<nDnA{u7Q
z5pzM5RMhfW){?wz0ZofJ&6jPTo)KfDXz6&y5-C~CqjI%06p1-v$#(_{H6RSVWyTz8
z+Rh>M030IsARH&S(2p<ZPjFK_wYTbwM2u%yU&@^i=2<jO!<J<@Zy_AW0TX$cOc_jD
zKqdJBYc5E|v~C1z7ELUIHHU<6ty@6#8VM0K6a#;!p1ZMJFzSld1ZGMiB^Y@*%d&Rl
ziCLNPp~?r3(0P8&EtV|iMc=DJSu>ttQVQqE0ThnVCWF~MRA88B%rV16+V>iNox9Bc
zF)t#*<x^U`7F0M-(4x+r(g#8#&3gH1G?CK7ElU|ku+^Z#xigT1CLhFybVxPJqKW(F
z13~Y&PxKy~Zp>;rsKz|omia7EMFHU6EO+C{4-bC!g!A?j&RgQ|obiF<|JM8Oy@@AI
zXiuD&<(R6WDDJFuan-&2lE!O_7n-f|eo7gc-^$#@lzu!AVie2!*e5LbOBYfm=>m2g
zM485F%5Wwe2%#>KkGd4g%iEEbn<&HN3t)YJ9K6DDR#PSgCIvB-IV4Sam>bInQv4Z(
z+`5-w+tXJ%n9_sU>C!yTUqm70Kj8O)PdJ_PT?^00OSj-oR&QF^ACt7|rg&M?vaM$R
z*yYGvBx+dRP#<f$YFn^HjdyG{D;#gBPgd7OdshsbEsrhN*2QY$kDge)YljoH!&>cd
zv^#05jSandI&RyE`#(3-CL6Y0J`#1#Kb726AMHtQsEj(2Rn^!0z#do#yxub3vZ6Cp
zK9sC$___6G*2O(bTN8D5t<FC0NY*r5?|Z#(v1_SvseP&XZC9eERjX-@*R&?<>gOHv
zj(aOQ9ctzkt6HjhyC+f8qSds-Yg&>u_4Ceq_wL=P-W1~(_Qs^!)!XCM+n4wcI&T=>
zb-nMp;eGe;-`kdKH@j|C-fI7-`k!5&ZoGY<FMgmei`2RF=v%#sns%+GJzmqEtf`Gv
zUhSr~@2s}^_Mt?LU8}LjYwWoKD~()rO+J&~Y~kt}V?Vt5%EBv<LiOfo*RqXb$BT0p
zV~#5?M_*2EsQtWd>tfRzwl{6_#;^85%q#o24b`t*xN;$8xcu_m%LpbK1Tu&7dlE3S
zw(4l9LSE)ad-rs-a38lk+*zUb=Nm&(@yr+=K*y6Ygh|omuU4GNMR=I(xthhxaEJ$I
zX7u8>ECpr_TtvT^GiGQvcNIIj#X&=%`h+3XE#?{892$<bW4L6#%zcZQD{`pUH*jLn
zQtyMQ;kmh&nbI@4Q-uLHiS-Hm?~>bfjXw-~h=__U2VK^!>c4#d;sx$H4}?|Z-M`CY
zU0r?(b)nVBO_yo4C9TE?wIF2lnB~Bz6p}7X$OoslW~Z~nT+KXPmM!uV)Kqo?-1~z2
z0*+jKb3g7IM`K-DL(7s^YuJ^}$%jm@-8Zt;xB|WYXL2Z$#sQBcpA3&_cQ&o%n8MEI
zR6dD<-+{*{iYbo+WO;1##T$J$n`7de!KJ6QEqm}*bTby^=ZO%r{0ttck`X9l?5W8=
zz?)32FZ6QSfp=5Ramq0&6WB9vHl($)L51Q@``3&EMW&?y=3l=A&CArT>_ygeMPWZ(
zqGEpsK$;istaZ6|)57qg;cfF$Q=+y_t8K$dfyK|KVbqzd*?9fX>xc6D=;kB8vH#kB
zYbvq7Put&zFNw|l8emPoR@09fWg9;$YrKusq&(JdssDQ6;|uZrV;{Zp$t#~7I(hrh
z$@r<45{JCnAuq}$8hsjInNKV8#o3*-)yK{CYdvb3)YMUTKLj8p0!4t-+AF6B)~xv#
zP4R-A3RX*h$sEx=cq%UD)Kh-TBKgon9lPE)7Wxhp7{h-sx#bi1UNE|eE9SM(P$I8|
zE#}&0_-~uNSjgRw39m7#pG2+f%%yR#cIL`8$e!Mo9Si9wFK^D~)>`LKF6)8BH$9iW
zZx~~SH_H}}X>~hudFignX>3&cGnv>66xpfC`~M6UUL_W`0$_;8>wl2GZ?wmnw1;*s
zIkbmbbD8O>c#0b4S%8Y^#ipNyB4e*CdoQL;nTVL;SCGeiBU5J1`Kcx9Z_K&$lp|fc
zI_1(;e@dlB0cgtEDqowrGL@4e5;r#jBsM*#0oojz%>i%6R+pm@iJJz1MEwsmK-)ph
zcJMQs^R~@-Yc&4Cu=avCZgVDVKF#KXMJ6lExFUs4gQ>YBg)lqh9hW?wl-1+GVV21t
zrOP~?bCcd+`iuM=iZVUJl!i=fq}Og}H7d<oGo=WSPEv*eW^=z4oz-{_I%5s2+hK&a
zmIDL4(fH5`mje}g`n(pTJBAut--GxA#$DmMvYIy`SAB=Y+++l*+(k=`aP#r)z1(+E
zS2$VskZ+O~oUASbt_e=`B1gZ%`aBmZ-OjZ`B~R(n!6na~fLZ1XH)_8x)%s$pMFZ4K
z0VhQHw*}wtKnCyS<S=9Sew5P*u!!MXJei_U5u3sVfqSt!oiP?Hf$Ag1_pOBzgsLoc
z-!ID;!qzi5Ih5k%`<C<$A0i4lJ#kZJRs_rN1u8Wc2$Dj~DAeu6yJL&_`YR`wLMauR
z3n<>Etm4(lhfzoN0W|B<%bOOq+U3Fee`ZT9J$IP)eV7*Tv+S(1&2<fUm@fgfRe3yp
zFI~L$NX+tc+s|x?x@N7eIZ^evR`q!9Gn*$V-pV!?zWhI2AcTAznfA<@Ehz)-y`^+v
zg(6dREUn!POqK>Z9hWlTsF}!EU~?PWXI6IJjj4=GTJeYhImKrd^~|)WM|*^<4Ta<k
zj{_HDHC39&rTim&oX1}ofS_l&WlQ<1zPZuYf>(mE{fpB#4kju(w2F>foe4{iX6Zp#
zXMFfm$8y!ih31&!s(ry8Em_eck9}a^D=P9efz`bAQV6rw%l^EGfM*gF&uSIV-m3m|
zDsDNRupHMc$EjXJ{Kb>W>YD3Mz5dkIeGB`dW+J13ePMveXdp83@>ajDrZQv{m%78c
z_h!#;+`o2z`efojzjmNMesmx{cr1SOn0DX=^lzf_xK??5_W9+~s^o@E$?C=xqrS}a
z6$kJ&0n(Z#Wjkbhqt;QW|ESX7s4e-ZjtBg>ytboN|8c9KV|U5Ndw4)Uisdd^e<HPa
zcNWTr{HGaC&3(k4KyFH`@8d+c!U>_&6ohbunO()@utLWzl8dnBq7A>osXxPs284P4
zi|JarTKo(RHFk-{2v4hJqUrsTqMACDwj46oxA3+83f!9W9>qsop;-0#J2esA8EdvB
z(XzVQS12DSSu2zst)gcxu~aNe(;qQ_gYtJEmG^RzK`SPx@7#V(wiZ31Br6KBLrQk6
z%IO!e!ngK{6e(r&foU<f=aXMPQWS_HGHkIhv-aH$+xIgjb1Q6L$>vU3>0kgkCQ|S|
zfAT)x1|2p5;$#_M`UlxFyXjK|?8v3Rl@0^9;!KUyii3_gsjIYxBsFtTm|;@d+S~UY
zX&b^CnF-oh!ornq$~Rp>Cp56Fo7t!he1Y-$rAX~qx+NTEpv_$DVHgp81)(!>>c&qi
zVX8Nnvf_k->QQhwK>|{%e{3>z)}sWbrDmIak;tVHNegN?eW0Yw<KD9p)TY9`u#{Pr
z&Q0R{sUmk#(F+8ar-`LRHB~YOC5!_(GR>Y;8TM>sPlnKxaZ1K}$}~9vohxBi)r9kw
zq>s&X4wl?E`5a-JrK%RnV?@b+Mel|QoH)T|4biI5uxE-or>iS+XPlW)mES}O`YSh?
zy1r9hKij=rT0Xb;wS!j<#vV<SZq-V+GVQi(w$7E#cPGr7HS^}Ud2_P7`nA(nPRF_y
zs}tpqXyuR0cHgOKU*SrPb_Sy6&&z7&dgJw{;xCOQ>c`^eCgWx26J_VMvhz{?^YY4T
z+ZJ|Qx4&+WonO3=sM@7f?YhyFDBr7<?_J?aEcTo2%XOP$jyGNLN1xIf_9p7~X?6Re
zu4H{<^!eq=+F0k}*&98HM-SX`#fMMFD?N!yk5=iy35arg+&_A!am&w7|LpXQ>DcK+
zV~5t*f#|jpgaOPW!mXtUH|D(RUU1L5u@!#(@au;ct%;hQTFuTlyH|`<-Uw_AB?Dg(
z@~;W3=B<}vHej7nC18v3#=uv^<<|sO^VUl-Y->y1Td`zCtxzEK@unxbb{l_dbr`xf
zmwaL?!^<a|4P6hHe)6!M(%X1SZ)fS<hOVb9pFGV2HdmxdJRTgk_IT1``Y(VBM`@%{
z=>l7;kgp;mJxnbllOewv#ypWpd9f`;w$#iV31<4)fGUb3uoj+<K@Q4`s7BUC`ELMN
zSe{?eZQ~6q9DyGA%GOy6{gPbiEc+$7s(AKJa<*CaOLDdG!aK=5951}@09E~8Zoag6
zMOV!?u5bWf={a3_Hsf(CUy0}?3+kTZdBi2_!>?HW*ZE(&bQT^KIS1;hcy!}hz}&U}
E2jXn#ga7~l

diff --git a/ch02/02_bonus_bytepair-encoder/bpe_openai_gpt2.py b/ch02/02_bonus_bytepair-encoder/bpe_openai_gpt2.py
index f3d9575..0b7cfa1 100644
--- a/ch02/02_bonus_bytepair-encoder/bpe_openai_gpt2.py
+++ b/ch02/02_bonus_bytepair-encoder/bpe_openai_gpt2.py
@@ -41,6 +41,7 @@ import requests
 from tqdm import tqdm
 from functools import lru_cache
 
+# 定义一个函数将字节转换为Unicode字符
 @lru_cache()
 def bytes_to_unicode():
     """
@@ -52,6 +53,15 @@ def bytes_to_unicode():
     To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
     And avoids mapping to whitespace/control characters the bpe code barfs on.
     """
+    '''
+    返回一组UTF-8字节和相应的Unicode字符串列表。
+    可逆的BPE编码适用于Unicode字符串。
+    这意味着如果想要避免UNK（未知标记），则需要在词汇表中包含大量的Unicode字符。
+    当处理大约100亿标记的数据集时，最终需要大约5000个字符以确保良好的覆盖率。
+    这相当于正常情况下使用的32,000个BPE词汇表的显著比例。
+    为了避免这种情况，我们希望在UTF-8字节和Unicode字符串之间建立查找表。
+    同时避免映射到会让BPE代码出错的空白字符/控制字符上。
+    '''
     bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
     cs = bs[:]
     n = 0
@@ -63,11 +73,16 @@ def bytes_to_unicode():
     cs = [chr(n) for n in cs]
     return dict(zip(bs, cs))
 
+# 定义一个函数获取单词中的符号对
 def get_pairs(word):
     """Return set of symbol pairs in a word.
 
     Word is represented as tuple of symbols (symbols being variable-length strings).
     """
+    '''
+    返回单词中的符号对集合。
+    单词以符号元组的形式表示（其中符号是可变长度的字符串）。
+    '''
     pairs = set()
     prev_char = word[0]
     for char in word[1:]:
@@ -75,8 +90,10 @@ def get_pairs(word):
         prev_char = char
     return pairs
 
+# 定义一个使用字节对编码（BPE）进行编码和解码的Encoder类
 class Encoder:
     def __init__(self, encoder, bpe_merges, errors='replace'):
+        # 使用编码器字典、BPE合并和错误处理策略初始化Encoder
         self.encoder = encoder
         self.decoder = {v:k for k,v in self.encoder.items()}
         self.errors = errors # how to handle errors in decoding
@@ -89,6 +106,7 @@ class Encoder:
         self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
 
     def bpe(self, token):
+        # 对给定的标记执行字节对编码
         if token in self.cache:
             return self.cache[token]
         word = tuple(token)
@@ -130,6 +148,7 @@ class Encoder:
         return word
 
     def encode(self, text):
+        # 使用BPE对给定文本进行编码
         bpe_tokens = []
         for token in re.findall(self.pat, text):
             token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8'))
@@ -137,10 +156,11 @@ class Encoder:
         return bpe_tokens
 
     def decode(self, tokens):
+        # 将一系列标记解码回文本
         text = ''.join([self.decoder[token] for token in tokens])
         text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors=self.errors)
         return text
-
+# 定义一个函数获取特定模型的编码器
 def get_encoder(model_name, models_dir):
     with open(os.path.join(models_dir, model_name, 'encoder.json'), 'r') as f:
         encoder = json.load(f)
@@ -152,7 +172,7 @@ def get_encoder(model_name, models_dir):
         bpe_merges=bpe_merges,
     )
 
-
+# 定义一个函数下载GPT-2模型的词汇文件
 def download_vocab():
     # Modified code from
     subdir = 'gpt2_model'
diff --git a/ch02/02_bonus_bytepair-encoder/compare-bpe-tiktoken.ipynb b/ch02/02_bonus_bytepair-encoder/compare-bpe-tiktoken.ipynb
index a777c4a..5b32038 100644
--- a/ch02/02_bonus_bytepair-encoder/compare-bpe-tiktoken.ipynb
+++ b/ch02/02_bonus_bytepair-encoder/compare-bpe-tiktoken.ipynb
@@ -5,7 +5,8 @@
    "id": "a9adc3bf-353c-411e-a471-0e92786e7103",
    "metadata": {},
    "source": [
-    "# Using BytePair encodding from `tiktoken`"
+    "# 使用来自 `tiktoken` 的字节对编码\n",
+    "tiktoken是一个用于OpenAI模型的快速BPE标记器。（BPE标记器是一种基于字节对编码（Byte Pair Encoding，简称BPE）的文本标记方法。字节对编码是一种数据压缩技术，但在自然语言处理中，它也被用于创建词汇表和对文本进行分词。）"
    ]
   },
   {
@@ -20,7 +21,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 1,
    "id": "1c490fca-a48a-47fa-a299-322d1a08ad17",
    "metadata": {},
    "outputs": [
@@ -28,33 +29,33 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "tiktoken version: 0.5.2\n"
+      "tiktoken version: 0.6.0\n"
      ]
     }
    ],
    "source": [
     "import importlib.metadata\n",
-    "\n",
+    "# 打印出当前系统中安装的 tiktoken 库的版本号\n",
     "print(\"tiktoken version:\", importlib.metadata.version(\"tiktoken\"))"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 2,
    "id": "0952667c-ce84-4f21-87db-59f52b44cec4",
    "metadata": {},
    "outputs": [],
    "source": [
     "import tiktoken\n",
-    "\n",
+    "# 创建一个使用 GPT-2 模型的编码器对象\n",
     "tik_tokenizer = tiktoken.get_encoding(\"gpt2\")\n",
-    "\n",
+    "# 定义一个包含待编码文本的字符串变量（实际编码在下一个单元格中由 tik_tokenizer 完成）\n",
     "text = \"Hello, world. Is this-- a test?\""
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 3,
    "id": "b039c350-18ad-48fb-8e6a-085702dfc330",
    "metadata": {},
    "outputs": [
@@ -67,6 +68,7 @@
     }
    ],
    "source": [
+    "# 参数 allowed_special 指定输入文本中哪些特殊标记（如 <|endoftext|>）允许被编码为对应的特殊标记ID\n",
     "integers = tik_tokenizer.encode(text, allowed_special={\"<|endoftext|>\"})\n",
     "\n",
     "print(integers)"
@@ -74,7 +76,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 4,
    "id": "7b152ba4-04d3-41cc-849f-adedcfb8cabb",
    "metadata": {},
    "outputs": [
@@ -87,6 +89,7 @@
     }
    ],
    "source": [
+    "# 进行解码\n",
     "strings = tik_tokenizer.decode(integers)\n",
     "\n",
     "print(strings)"
@@ -94,7 +97,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 5,
    "id": "cf148a1a-316b-43ec-b7ba-1b6d409ce837",
    "metadata": {},
    "outputs": [
@@ -107,6 +110,7 @@
     }
    ],
    "source": [
+    "# 表示编码器的词汇表大小\n",
     "print(tik_tokenizer.n_vocab)"
    ]
   },
@@ -115,12 +119,12 @@
    "id": "6a0b5d4f-2af9-40de-828c-063c4243e771",
    "metadata": {},
    "source": [
-    "# Using the original Byte-pair encoding implementation used in GPT-2"
+    "# 使用在GPT-2中使用的原始字节对编码实现"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 6,
    "id": "0903108c-65cb-4ae1-967a-2155e25349c2",
    "metadata": {},
    "outputs": [],
@@ -130,7 +134,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 7,
    "id": "35dd8d7c-8c12-4b68-941a-0fd05882dd45",
    "metadata": {},
    "outputs": [
@@ -138,8 +142,8 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "Fetching encoder.json: 1.04Mit [00:28, 36.8kit/s]                                                   \n",
-      "Fetching vocab.bpe: 457kit [00:00, 458kit/s]                                                        \n"
+      "Fetching encoder.json: 1.04Mit [00:02, 502kit/s]                                                    \n",
+      "Fetching vocab.bpe: 457kit [00:02, 212kit/s]                                                        \n"
      ]
     }
    ],
@@ -149,7 +153,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 8,
    "id": "1888a7a9-9c40-4fe0-99b4-ebd20aa1ffd0",
    "metadata": {},
    "outputs": [],
@@ -159,7 +163,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": 9,
    "id": "2740510c-a78a-4fba-ae18-2b156ba2dfef",
    "metadata": {},
    "outputs": [
@@ -179,7 +183,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": 10,
    "id": "434d115e-990d-42ad-88dd-31323a96e10f",
    "metadata": {},
    "outputs": [
@@ -202,12 +206,14 @@
    "id": "4f63e8c6-707c-4d66-bcf8-dd790647cc86",
    "metadata": {},
    "source": [
-    "# Using the BytePair Tokenizer in HuggingFace transformers"
+    "# 使用HuggingFace Transformers中的BytePair Tokenizer\n",
+    "\r\n",
+    "\r\n"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 11,
    "id": "5bfff386-f725-4137-9c50-e5da0c38bea0",
    "metadata": {},
    "outputs": [],
@@ -217,17 +223,17 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 12,
    "id": "e9077bf4-f91f-42ad-ab76-f3d89128510e",
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "'4.30.2'"
+       "'4.33.3'"
       ]
      },
-     "execution_count": 13,
+     "execution_count": 12,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -240,47 +246,19 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
-   "id": "16e06ee5-c4ca-4211-8e24-dbfd84b1d85b",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "设置为国内可访问"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "id": "3e07ddc9-187e-4482-a7b5-7e4e9381d805",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "env: HF_ENDPOINT=https://hf-mirror.com\n"
-     ]
-    }
-   ],
-   "source": [
-    "%env HF_ENDPOINT=https://hf-mirror.com"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 13,
    "id": "a9839137-b8ea-4a2c-85fc-9a63064cf8c8",
    "metadata": {},
    "outputs": [
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "afc151b540664287aa60a4cbe90cdfeb",
+       "model_id": "ef488b57e4214b76a8913a4704de7e15",
        "version_major": 2,
        "version_minor": 0
       },
       "text/plain": [
-       "vocab.json: 0.00B [00:00, ?B/s]"
+       "vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]"
       ]
      },
      "metadata": {},
@@ -289,12 +267,12 @@
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "9a5d584e4adf40bca215b409b693dc02",
+       "model_id": "9ab86eb5125640dba6d59a5744f2d927",
        "version_major": 2,
        "version_minor": 0
       },
       "text/plain": [
-       "merges.txt: 0.00B [00:00, ?B/s]"
+       "merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]"
       ]
      },
      "metadata": {},
@@ -303,12 +281,26 @@
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "a126ee77a9f94e58b1dcccd68e6d5bb1",
+       "model_id": "073f03eb3ef541e092c8f344f65c34da",
        "version_major": 2,
        "version_minor": 0
       },
       "text/plain": [
-       "config.json:   0%|          | 0.00/367 [00:00<?, ?B/s]"
+       "tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "b92abbedc99a4bb9ad8bf5f3b3e8b140",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]"
       ]
      },
      "metadata": {},
@@ -317,13 +309,13 @@
    ],
    "source": [
     "from transformers import GPT2Tokenizer\n",
-    "\n",
+    "# 使用 HuggingFace Transformers 提供的 GPT2Tokenizer 类，创建一个预训练的 GPT-2 模型的标记器对象\n",
     "hf_tokenizer = GPT2Tokenizer.from_pretrained(\"gpt2\")"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 14,
    "id": "222cbd69-6a3d-4868-9c1f-421ffc9d5fe1",
    "metadata": {},
    "outputs": [
@@ -333,7 +325,7 @@
        "[15496, 11, 995, 13, 1148, 428, 438, 257, 1332, 30]"
       ]
      },
-     "execution_count": 11,
+     "execution_count": 14,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -347,12 +339,12 @@
    "id": "907a1ade-3401-4f2e-9017-7f58a60cbd98",
    "metadata": {},
    "source": [
-    "# A quick performance benchmark"
+    "# 快速性能基准测试"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 17,
+   "execution_count": 15,
    "id": "a61bb445-b151-4a2f-8180-d4004c503754",
    "metadata": {},
    "outputs": [],
@@ -363,7 +355,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 18,
+   "execution_count": 16,
    "id": "57f7c0a3-c1fd-4313-af34-68e78eb33653",
    "metadata": {},
    "outputs": [
@@ -371,17 +363,18 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "9.14 ms ± 74.5 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
+      "14.6 ms ± 201 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
      ]
     }
    ],
    "source": [
+    "# 测量其运行时间，从而进行性能评估\n",
     "%timeit orig_tokenizer.encode(raw_text)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 19,
+   "execution_count": 17,
    "id": "036dd628-3591-46c9-a5ce-b20b105a8062",
    "metadata": {},
    "outputs": [
@@ -389,7 +382,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "3.28 ms ± 2.66 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
+      "2.9 ms ± 42.8 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
      ]
     }
    ],
@@ -399,7 +392,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 20,
+   "execution_count": 18,
    "id": "b9c85b58-bfbc-465e-9a7e-477e53d55c90",
    "metadata": {},
    "outputs": [
@@ -414,7 +407,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "19.1 ms ± 2.43 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n"
+      "28.6 ms ± 643 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n"
      ]
     }
    ],
@@ -424,15 +417,15 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 21,
-   "id": "7117107f-22a6-46b4-a442-712d50b3ac7a",
+   "execution_count": 20,
+   "id": "67159770-e131-411a-b9fe-037b4d931c9d",
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "18.8 ms ± 2.41 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n"
+      "28.3 ms ± 601 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n"
      ]
     }
    ],
@@ -443,7 +436,15 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "d81eaf6d-554b-44e3-aa19-2c3ae0030762",
+   "id": "e1cf5928-b6c8-4493-9e51-cb2795b2482c",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5f70f164-5f73-479e-bfaf-914243016439",
    "metadata": {},
    "outputs": [],
    "source": []
@@ -465,7 +466,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.11.5"
+   "version": "3.8.17"
   }
  },
  "nbformat": 4,
diff --git a/ch02/03_bonus_embedding-vs-matmul/.ipynb_checkpoints/embeddings-and-linear-layers-checkpoint.ipynb b/ch02/03_bonus_embedding-vs-matmul/.ipynb_checkpoints/embeddings-and-linear-layers-checkpoint.ipynb
deleted file mode 100644
index 198e1b1..0000000
--- a/ch02/03_bonus_embedding-vs-matmul/.ipynb_checkpoints/embeddings-and-linear-layers-checkpoint.ipynb
+++ /dev/null
@@ -1,486 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "id": "063850ab-22b0-4838-b53a-9bb11757d9d0",
-   "metadata": {},
-   "source": [
-    "# Embedding Layers and Linear Layers"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "0315c598-701f-46ff-8806-15813cad0e51",
-   "metadata": {},
-   "source": [
-    "- Embedding layers in PyTorch accomplish the same as linear layers that perform matrix multiplications; the reason we use embedding layers is computational efficiency\n",
-    "- We will take a look at this relationship step by step using code examples in PyTorch"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "id": "061720f4-f025-4640-82a0-15098fa94cf9",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "PyTorch version: 2.1.0.post301\n"
-     ]
-    }
-   ],
-   "source": [
-    "import torch\n",
-    "\n",
-    "print(\"PyTorch version:\", torch.__version__)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "a7895a66-7f69-4f62-9361-5c9da2eb76ef",
-   "metadata": {},
-   "source": [
-    "## Using nn.Embedding"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "id": "cc489ea5-73db-40b9-959e-0d70cae25f40",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Suppose we have the following 3 training examples,\n",
-    "# which may represent token IDs in a LLM context\n",
-    "idx = torch.tensor([2, 3, 1])\n",
-    "\n",
-    "# The number of rows in the embedding matrix can be determined\n",
-    "# by obtaining the largest token ID + 1.\n",
-    "# If the highest token ID is 3, then we want 4 rows, for the possible\n",
-    "# token IDs 0, 1, 2, 3\n",
-    "num_idx = max(idx)+1\n",
-    "\n",
-    "# The desired embedding dimension is a hyperparameter\n",
-    "out_dim = 5"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "93d83a6e-8543-40af-b253-fe647640bf36",
-   "metadata": {},
-   "source": [
-    "- Let's implement a simple embedding layer:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "id": "60a7c104-36e1-4b28-bd02-a24a1099dc66",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# We use the random seed for reproducibility since\n",
-    "# weights in the embedding layer are initialized with\n",
-    "# small random values\n",
-    "torch.manual_seed(123)\n",
-    "\n",
-    "embedding = torch.nn.Embedding(num_idx, out_dim)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "dd96c00a-3297-4a50-8bfc-247aaea7e761",
-   "metadata": {},
-   "source": [
-    "We can optionally take a look at the embedding weights:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 4,
-   "id": "595f603e-8d2a-4171-8f94-eac8106b2e57",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "Parameter containing:\n",
-       "tensor([[ 0.3374, -0.1778, -0.3035, -0.5880,  1.5810],\n",
-       "        [ 1.3010,  1.2753, -0.2010, -0.1606, -0.4015],\n",
-       "        [ 0.6957, -1.8061, -1.1589,  0.3255, -0.6315],\n",
-       "        [-2.8400, -0.7849, -1.4096, -0.4076,  0.7953]], requires_grad=True)"
-      ]
-     },
-     "execution_count": 4,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "embedding.weight"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "c86eb562-61e2-4171-ab6e-b410a1fd5c18",
-   "metadata": {},
-   "source": [
-    "- We can then use the embedding layers to obtain the vector representation of a training example with ID 1:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 5,
-   "id": "8bbc0255-4805-4be9-9f4c-1d0d967ef9d5",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "tensor([[ 1.3010,  1.2753, -0.2010, -0.1606, -0.4015]],\n",
-       "       grad_fn=<EmbeddingBackward0>)"
-      ]
-     },
-     "execution_count": 5,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "embedding(torch.tensor([1]))"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "6a4d47f2-4691-47b8-9855-2528b6c285c9",
-   "metadata": {},
-   "source": [
-    "- Below is a visualization of what happens under the hood:"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "12ffd155-7190-44b1-b6b6-45b11d6fe83b",
-   "metadata": {},
-   "source": [
-    "<img src=\"images/1.png\" width=\"400px\">"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "87d1311b-cfb2-4afc-9e25-e4ecf35370d9",
-   "metadata": {},
-   "source": [
-    "- Similarly, we can use embedding layers to obtain the vector representation of a training example with ID 2:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 6,
-   "id": "c309266a-c601-4633-9404-2e10b1cdde8c",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "tensor([[ 0.6957, -1.8061, -1.1589,  0.3255, -0.6315]],\n",
-       "       grad_fn=<EmbeddingBackward0>)"
-      ]
-     },
-     "execution_count": 6,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "embedding(torch.tensor([2]))"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "7ad3b601-f68c-41b1-a28d-b624b94ef383",
-   "metadata": {},
-   "source": [
-    "<img src=\"images/2.png\" width=\"400px\">"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "27dd54bd-85ae-4887-9c5e-3139da361cf4",
-   "metadata": {},
-   "source": [
-    "- Now, let's convert all the training examples we have defined previously:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 7,
-   "id": "0191aa4b-f6a8-4b0d-9c36-65e82b81d071",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "tensor([[ 0.6957, -1.8061, -1.1589,  0.3255, -0.6315],\n",
-       "        [-2.8400, -0.7849, -1.4096, -0.4076,  0.7953],\n",
-       "        [ 1.3010,  1.2753, -0.2010, -0.1606, -0.4015]],\n",
-       "       grad_fn=<EmbeddingBackward0>)"
-      ]
-     },
-     "execution_count": 7,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "idx = torch.tensor([2, 3, 1])\n",
-    "embedding(idx)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "146cf8eb-c517-4cd4-aa91-0e818fed7651",
-   "metadata": {},
-   "source": [
-    "- Under the hood, it's still the same look-up concept:"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "b392eb43-0bda-4821-b446-b8dcbee8ae00",
-   "metadata": {},
-   "source": [
-    "<img src=\"images/3.png\" width=\"450px\">"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "f0fe863b-d6a3-48f3-ace5-09ecd0eb7b59",
-   "metadata": {},
-   "source": [
-    "## Using nn.Linear"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "138de6a4-2689-4c1f-96af-7899b2d82a4e",
-   "metadata": {},
-   "source": [
-    "- Now, we will demonstrate that the embedding layer above accomplishes exactly the same as `nn.Linear` layer on a one-hot encoded representation in PyTorch\n",
-    "- First, let's convert the token IDs into a one-hot representation:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 8,
-   "id": "b5bb56cf-bc73-41ab-b107-91a43f77bdba",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "tensor([[0, 0, 1, 0],\n",
-       "        [0, 0, 0, 1],\n",
-       "        [0, 1, 0, 0]])"
-      ]
-     },
-     "execution_count": 8,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "onehot = torch.nn.functional.one_hot(idx)\n",
-    "onehot"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "aa45dfdf-fb26-4514-a176-75224f5f179b",
-   "metadata": {},
-   "source": [
-    "- Next, we initialize a `Linear` layer, which caries out a matrix multiplication $X W^\\top$:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 9,
-   "id": "ae04c1ed-242e-4dd7-b8f7-4b7e4caae383",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "torch.manual_seed(123)\n",
-    "linear = torch.nn.Linear(num_idx, out_dim, bias=False)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "63efb98e-5cc4-4e8d-9fe6-ef0ad29ae2d7",
-   "metadata": {},
-   "source": [
-    "- Note that the linear layer in PyTorch is also initialized with small random weights; to directly compare it to the `Embedding` layer above, we have to use the same small random weights, which is why we reassign them here:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 10,
-   "id": "a3b90d69-761c-486e-bd19-b38a2988fe62",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "linear.weight = torch.nn.Parameter(embedding.weight.T.detach())"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "9116482d-f1f9-45e2-9bf3-7ef5e9003898",
-   "metadata": {},
-   "source": [
-    "- Now we can use the linear layer on the one-hot encoded representation of the inputs:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 11,
-   "id": "90d2b0dd-9f1d-4c0f-bb16-1f6ce6b8ac2c",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "tensor([[ 0.6957, -1.8061, -1.1589,  0.3255, -0.6315],\n",
-       "        [-2.8400, -0.7849, -1.4096, -0.4076,  0.7953],\n",
-       "        [ 1.3010,  1.2753, -0.2010, -0.1606, -0.4015]], grad_fn=<MmBackward0>)"
-      ]
-     },
-     "execution_count": 11,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "linear(onehot.float())"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "f6204bc8-92e2-4546-9cda-574fe1360fa2",
-   "metadata": {},
-   "source": [
-    "As we can see, this is exactly the same as what we got when we used the embedding layer:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 12,
-   "id": "2b057649-3176-4a54-b58c-fd8fbf818c61",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "tensor([[ 0.6957, -1.8061, -1.1589,  0.3255, -0.6315],\n",
-       "        [-2.8400, -0.7849, -1.4096, -0.4076,  0.7953],\n",
-       "        [ 1.3010,  1.2753, -0.2010, -0.1606, -0.4015]],\n",
-       "       grad_fn=<EmbeddingBackward0>)"
-      ]
-     },
-     "execution_count": 12,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "embedding(idx)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "0e447639-8952-460e-8c8f-cf9e23c368c9",
-   "metadata": {},
-   "source": [
-    "- What happens under the hood is the following computation for the first training example's token ID:"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "1830eccf-a707-4753-a24a-9b103f55594a",
-   "metadata": {},
-   "source": [
-    "<img src=\"images/4.png\" width=\"450px\">"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "9ce5211a-14e6-46aa-a3a8-14609f086e97",
-   "metadata": {},
-   "source": [
-    "- And for the second training example's token ID:"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "173f6026-a461-44da-b9c5-f571f8ec8bf3",
-   "metadata": {},
-   "source": [
-    "<img src=\"images/5.png\" width=\"450px\">"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "e2608049-f5d1-49a9-a14b-82695fc32e6a",
-   "metadata": {},
-   "source": [
-    "- Since all but one index in each one-hot encoded row are 0 (by design), this matrix multiplication is essentially the same as a look-up of the one-hot elements\n",
-    "- This use of the matrix multiplication on one-hot encodings is equivalent to the embedding layer look-up but can be inefficient if we work with large embedding matrices, because there are a lot of wasteful multiplications by zero"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "5eacc005-86fc-490c-8f6a-dc37d8a0df7c",
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "a1f63c81-1ee3-40a1-9ef2-14ff18fb4f05",
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "c71959bb-facf-44fd-8edb-b67f7752f034",
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.11.5"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
diff --git a/ch02/03_bonus_embedding-vs-matmul/README.md b/ch02/03_bonus_embedding-vs-matmul/README.md
index a1f67ef..5228286 100644
--- a/ch02/03_bonus_embedding-vs-matmul/README.md
+++ b/ch02/03_bonus_embedding-vs-matmul/README.md
@@ -1,3 +1,4 @@
-# Chapter 2: Working with Text Data
+# 第2章:使用文本数据
+
+- [embeddings-and-linear-layers.ipynb](embeddings-and-linear-layers.ipynb) 包含可选的（附加）代码，用于说明嵌入层与应用于独热编码向量的全连接层是等效的。
 
-- [embeddings-and-linear-layers.ipynb](embeddings-and-linear-layers.ipynb) contains optional (bonus) code to explain that embedding layers and fully connected layers applied to one-hot encoded vectors are equivalent.
diff --git a/ch02/03_bonus_embedding-vs-matmul/embeddings-and-linear-layers.ipynb b/ch02/03_bonus_embedding-vs-matmul/embeddings-and-linear-layers.ipynb
index 198e1b1..e068fc5 100644
--- a/ch02/03_bonus_embedding-vs-matmul/embeddings-and-linear-layers.ipynb
+++ b/ch02/03_bonus_embedding-vs-matmul/embeddings-and-linear-layers.ipynb
@@ -5,7 +5,7 @@
    "id": "063850ab-22b0-4838-b53a-9bb11757d9d0",
    "metadata": {},
    "source": [
-    "# Embedding Layers and Linear Layers"
+    "# Embedding层和 Linear层"
    ]
   },
   {
@@ -13,8 +13,8 @@
    "id": "0315c598-701f-46ff-8806-15813cad0e51",
    "metadata": {},
    "source": [
-    "- Embedding layers in PyTorch accomplish the same as linear layers that perform matrix multiplications; the reason we use embedding layers is computational efficiency\n",
-    "- We will take a look at this relationship step by step using code examples in PyTorch"
+    "- 在PyTorch中，嵌入层（Embedding layers）实现的功能与执行矩阵乘法的线性层相同；我们使用嵌入层的原因是为了提高计算效率。\n",
+    "- 我们将逐步使用PyTorch中的代码示例来查看这种关系。"
    ]
   },
   {
@@ -27,13 +27,12 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "PyTorch version: 2.1.0.post301\n"
+      "PyTorch version: 1.12.1+cu113\n"
      ]
     }
    ],
    "source": [
     "import torch\n",
-    "\n",
     "print(\"PyTorch version:\", torch.__version__)"
    ]
   },
@@ -47,22 +46,21 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 3,
    "id": "cc489ea5-73db-40b9-959e-0d70cae25f40",
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Suppose we have the following 3 training examples,\n",
-    "# which may represent token IDs in a LLM context\n",
+    "# 假设我们有以下 3 个训练样本，\n",
+    "# 这些样本可能表示大语言模型（LLM）上下文中的标记ID\n",
     "idx = torch.tensor([2, 3, 1])\n",
     "\n",
-    "# The number of rows in the embedding matrix can be determined\n",
-    "# by obtaining the largest token ID + 1.\n",
-    "# If the highest token ID is 3, then we want 4 rows, for the possible\n",
-    "# token IDs 0, 1, 2, 3\n",
-    "num_idx = max(idx)+1\n",
+    "# 嵌入矩阵的行数可以通过获取最大标记ID + 1 来确定。\n",
+    "# 如果最高的标记ID是3，则我们希望有4行，对应可能的\n",
+    "# 标记ID 0, 1, 2, 3\n",
+    "num_idx = max(idx) + 1\n",
     "\n",
-    "# The desired embedding dimension is a hyperparameter\n",
+    "# 所需的嵌入维度是一个超参数\n",
     "out_dim = 5"
    ]
   },
@@ -71,21 +69,21 @@
    "id": "93d83a6e-8543-40af-b253-fe647640bf36",
    "metadata": {},
    "source": [
-    "- Let's implement a simple embedding layer:"
+    "- 实现一个简单的嵌入层"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 4,
    "id": "60a7c104-36e1-4b28-bd02-a24a1099dc66",
    "metadata": {},
    "outputs": [],
    "source": [
-    "# We use the random seed for reproducibility since\n",
-    "# weights in the embedding layer are initialized with\n",
-    "# small random values\n",
+    "# 为了可重复性，我们使用随机种子，\n",
+    "# 因为嵌入层的权重是用小的随机值初始化的\n",
     "torch.manual_seed(123)\n",
     "\n",
+    "# 创建一个嵌入层，指定输入维度为 num_idx，输出维度为 out_dim\n",
     "embedding = torch.nn.Embedding(num_idx, out_dim)"
    ]
   },
@@ -94,12 +92,12 @@
    "id": "dd96c00a-3297-4a50-8bfc-247aaea7e761",
    "metadata": {},
    "source": [
-    "We can optionally take a look at the embedding weights:"
+    "查看嵌入权重数据情况"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 5,
    "id": "595f603e-8d2a-4171-8f94-eac8106b2e57",
    "metadata": {},
    "outputs": [
@@ -113,7 +111,7 @@
        "        [-2.8400, -0.7849, -1.4096, -0.4076,  0.7953]], requires_grad=True)"
       ]
      },
-     "execution_count": 4,
+     "execution_count": 5,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -127,12 +125,12 @@
    "id": "c86eb562-61e2-4171-ab6e-b410a1fd5c18",
    "metadata": {},
    "source": [
-    "- We can then use the embedding layers to obtain the vector representation of a training example with ID 1:"
+    "- 使用嵌入层来获取具有ID 1的训练样本的向量表示"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 6,
    "id": "8bbc0255-4805-4be9-9f4c-1d0d967ef9d5",
    "metadata": {},
    "outputs": [
@@ -143,7 +141,7 @@
        "       grad_fn=<EmbeddingBackward0>)"
       ]
      },
-     "execution_count": 5,
+     "execution_count": 6,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -157,7 +155,7 @@
    "id": "6a4d47f2-4691-47b8-9855-2528b6c285c9",
    "metadata": {},
    "source": [
-    "- Below is a visualization of what happens under the hood:"
+    "- 下面是底层操作的可视化"
    ]
   },
   {
@@ -173,12 +171,12 @@
    "id": "87d1311b-cfb2-4afc-9e25-e4ecf35370d9",
    "metadata": {},
    "source": [
-    "- Similarly, we can use embedding layers to obtain the vector representation of a training example with ID 2:"
+    "- 同样，我们可以使用嵌入层来获取具有ID 2的训练样本的向量表示："
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 7,
    "id": "c309266a-c601-4633-9404-2e10b1cdde8c",
    "metadata": {},
    "outputs": [
@@ -189,7 +187,7 @@
        "       grad_fn=<EmbeddingBackward0>)"
       ]
      },
-     "execution_count": 6,
+     "execution_count": 7,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -211,12 +209,12 @@
    "id": "27dd54bd-85ae-4887-9c5e-3139da361cf4",
    "metadata": {},
    "source": [
-    "- Now, let's convert all the training examples we have defined previously:"
+    "- 现在，让我们将之前定义的所有训练样本转换："
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 8,
    "id": "0191aa4b-f6a8-4b0d-9c36-65e82b81d071",
    "metadata": {},
    "outputs": [
@@ -229,12 +227,13 @@
        "       grad_fn=<EmbeddingBackward0>)"
       ]
      },
-     "execution_count": 7,
+     "execution_count": 8,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
+    "# 将原先的第三行变成现在的第一行，第四行变成现在的第二行，第二行变成现在的第三行\n",
     "idx = torch.tensor([2, 3, 1])\n",
     "embedding(idx)"
    ]
@@ -260,7 +259,7 @@
    "id": "f0fe863b-d6a3-48f3-ace5-09ecd0eb7b59",
    "metadata": {},
    "source": [
-    "## Using nn.Linear"
+    "## 使用 nn.Linear"
    ]
   },
   {
@@ -268,13 +267,13 @@
    "id": "138de6a4-2689-4c1f-96af-7899b2d82a4e",
    "metadata": {},
    "source": [
-    "- Now, we will demonstrate that the embedding layer above accomplishes exactly the same as `nn.Linear` layer on a one-hot encoded representation in PyTorch\n",
-    "- First, let's convert the token IDs into a one-hot representation:"
+    "- 接下来，我们将演示：上面的嵌入层与作用于One-Hot编码表示的 `nn.Linear` 层所实现的功能完全相同\n",
+    "- 首先，我们将标记ID转换为One-Hot表示："
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 12,
    "id": "b5bb56cf-bc73-41ab-b107-91a43f77bdba",
    "metadata": {},
    "outputs": [
@@ -286,7 +285,7 @@
        "        [0, 1, 0, 0]])"
       ]
      },
-     "execution_count": 8,
+     "execution_count": 12,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -301,18 +300,33 @@
    "id": "aa45dfdf-fb26-4514-a176-75224f5f179b",
    "metadata": {},
    "source": [
-    "- Next, we initialize a `Linear` layer, which caries out a matrix multiplication $X W^\\top$:"
+    "- 接下来，我们初始化一个 `Linear` 层，该层执行矩阵乘法 $X W^\\top$："
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 16,
    "id": "ae04c1ed-242e-4dd7-b8f7-4b7e4caae383",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Parameter containing:\n",
+      "tensor([[-0.2039,  0.0166, -0.2483,  0.1886],\n",
+      "        [-0.4260,  0.3665, -0.3634, -0.3975],\n",
+      "        [-0.3159,  0.2264, -0.1847,  0.1871],\n",
+      "        [-0.4244, -0.3034, -0.1836, -0.0983],\n",
+      "        [-0.3814,  0.3274, -0.1179,  0.1605]], requires_grad=True)\n"
+     ]
+    }
+   ],
    "source": [
     "torch.manual_seed(123)\n",
-    "linear = torch.nn.Linear(num_idx, out_dim, bias=False)"
+    "# 初始化一个不带偏置项的Linear层，输入维度为 num_idx，输出维度为 out_dim\n",
+    "linear = torch.nn.Linear(num_idx, out_dim, bias=False)\n",
+    "print(linear.weight)"
    ]
   },
   {
@@ -320,16 +334,17 @@
    "id": "63efb98e-5cc4-4e8d-9fe6-ef0ad29ae2d7",
    "metadata": {},
    "source": [
-    "- Note that the linear layer in PyTorch is also initialized with small random weights; to directly compare it to the `Embedding` layer above, we have to use the same small random weights, which is why we reassign them here:"
+    "- 请注意，PyTorch中的`linear`层也是用小的随机权重进行初始化的。为了与上面的 `Embedding` 层进行直接比较，我们必须使用相同的小随机权重，这就是我们在这里重新分配它们的原因："
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 17,
    "id": "a3b90d69-761c-486e-bd19-b38a2988fe62",
    "metadata": {},
    "outputs": [],
    "source": [
+    "# linear 层的权重就被重新赋值为与 embedding 层相同的小随机权重，以确保它们具有相同的初始化。这是为了使它们在后续操作中可以进行直接比较。\n",
     "linear.weight = torch.nn.Parameter(embedding.weight.T.detach())"
    ]
   },
@@ -338,12 +353,12 @@
    "id": "9116482d-f1f9-45e2-9bf3-7ef5e9003898",
    "metadata": {},
    "source": [
-    "- Now we can use the linear layer on the one-hot encoded representation of the inputs:"
+    "- 现在，我们可以使用线性层处理输入的One-Hot编码表示："
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 18,
    "id": "90d2b0dd-9f1d-4c0f-bb16-1f6ce6b8ac2c",
    "metadata": {},
    "outputs": [
@@ -355,7 +370,7 @@
        "        [ 1.3010,  1.2753, -0.2010, -0.1606, -0.4015]], grad_fn=<MmBackward0>)"
       ]
      },
-     "execution_count": 11,
+     "execution_count": 18,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -369,12 +384,12 @@
    "id": "f6204bc8-92e2-4546-9cda-574fe1360fa2",
    "metadata": {},
    "source": [
-    "As we can see, this is exactly the same as what we got when we used the embedding layer:"
+    "正如我们所看到的，这与我们使用嵌入层时得到的结果完全相同："
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 19,
    "id": "2b057649-3176-4a54-b58c-fd8fbf818c61",
    "metadata": {},
    "outputs": [
@@ -387,7 +402,7 @@
        "       grad_fn=<EmbeddingBackward0>)"
       ]
      },
-     "execution_count": 12,
+     "execution_count": 19,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -401,7 +416,7 @@
    "id": "0e447639-8952-460e-8c8f-cf9e23c368c9",
    "metadata": {},
    "source": [
-    "- What happens under the hood is the following computation for the first training example's token ID:"
+    "- 底层发生的计算如下，针对第一个训练样本的标记ID："
    ]
   },
   {
@@ -417,7 +432,7 @@
    "id": "9ce5211a-14e6-46aa-a3a8-14609f086e97",
    "metadata": {},
    "source": [
-    "- And for the second training example's token ID:"
+    "- 以及对于第二个训练样本的标记ID："
    ]
   },
   {
@@ -433,8 +448,9 @@
    "id": "e2608049-f5d1-49a9-a14b-82695fc32e6a",
    "metadata": {},
    "source": [
-    "- Since all but one index in each one-hot encoded row are 0 (by design), this matrix multiplication is essentially the same as a look-up of the one-hot elements\n",
-    "- This use of the matrix multiplication on one-hot encodings is equivalent to the embedding layer look-up but can be inefficient if we work with large embedding matrices, because there are a lot of wasteful multiplications by zero"
+    "- 由于每个独热编码行中除了一个索引外其余都为0（设计如此），这个矩阵乘法本质上就是对独热编码元素的查找\n",
+    "- 在独热编码上使用矩阵乘法与使用嵌入层查找是等效的，但如果我们使用大型嵌入矩阵，",
+    "这种方法可能效率较低，因为有很多不必要的零乘法。"
    ]
   },
   {
@@ -444,22 +460,6 @@
    "metadata": {},
    "outputs": [],
    "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "a1f63c81-1ee3-40a1-9ef2-14ff18fb4f05",
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "c71959bb-facf-44fd-8edb-b67f7752f034",
-   "metadata": {},
-   "outputs": [],
-   "source": []
   }
  ],
  "metadata": {
@@ -478,7 +478,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.11.5"
+   "version": "3.8.17"
   }
  },
  "nbformat": 4,