From d46a09335f3f924bd02ef1309ee457e3ecf37d26 Mon Sep 17 00:00:00 2001 From: aJupyter Date: Mon, 4 Mar 2024 13:55:56 +0800 Subject: [PATCH] feat: update ch02-02-03 --- .../bpe_openai_gpt2-checkpoint.py | 174 ------- .../compare-bpe-tiktoken-checkpoint.ipynb | 442 ---------------- ch02/02_bonus_bytepair-encoder/README.md | 6 +- .../bpe_openai_gpt2.cpython-311.pyc | Bin 12758 -> 0 bytes .../bpe_openai_gpt2.py | 24 +- .../compare-bpe-tiktoken.ipynb | 149 +++--- ...eddings-and-linear-layers-checkpoint.ipynb | 486 ------------------ ch02/03_bonus_embedding-vs-matmul/README.md | 5 +- .../embeddings-and-linear-layers.ipynb | 140 ++--- 9 files changed, 173 insertions(+), 1253 deletions(-) delete mode 100644 ch02/02_bonus_bytepair-encoder/.ipynb_checkpoints/bpe_openai_gpt2-checkpoint.py delete mode 100644 ch02/02_bonus_bytepair-encoder/.ipynb_checkpoints/compare-bpe-tiktoken-checkpoint.ipynb delete mode 100644 ch02/02_bonus_bytepair-encoder/__pycache__/bpe_openai_gpt2.cpython-311.pyc delete mode 100644 ch02/03_bonus_embedding-vs-matmul/.ipynb_checkpoints/embeddings-and-linear-layers-checkpoint.ipynb diff --git a/ch02/02_bonus_bytepair-encoder/.ipynb_checkpoints/bpe_openai_gpt2-checkpoint.py b/ch02/02_bonus_bytepair-encoder/.ipynb_checkpoints/bpe_openai_gpt2-checkpoint.py deleted file mode 100644 index f3d9575..0000000 --- a/ch02/02_bonus_bytepair-encoder/.ipynb_checkpoints/bpe_openai_gpt2-checkpoint.py +++ /dev/null @@ -1,174 +0,0 @@ -""" -Byte pair encoding utilities - -Code from https://github.com/openai/gpt-2/blob/master/src/encoder.py - -And modified code (download_vocab) from -https://github.com/openai/gpt-2/blob/master/download_model.py - -Modified MIT License - -Software Copyright (c) 2019 OpenAI - -We don’t claim ownership of the content you create with GPT-2, so it is yours to do with as you please. -We only ask that you use GPT-2 responsibly and clearly indicate your content was created using GPT-2. 
- -Permission is hereby granted, free of charge, to any person obtaining a copy of this software and -associated documentation files (the "Software"), to deal in the Software without restriction, -including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, -subject to the following conditions: - -The above copyright notice and this permission notice shall be included -in all copies or substantial portions of the Software. -The above copyright notice and this permission notice need not be included -with content created by the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, -INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS -BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, -TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE -OR OTHER DEALINGS IN THE SOFTWARE. - - -""" - -import os -import json -import regex as re -import requests -from tqdm import tqdm -from functools import lru_cache - -@lru_cache() -def bytes_to_unicode(): - """ - Returns list of utf-8 byte and a corresponding list of unicode strings. - The reversible bpe codes work on unicode strings. - This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. - When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. - This is a significant percentage of your normal, say, 32K bpe vocab. - To avoid that, we want lookup tables between utf-8 bytes and unicode strings. - And avoids mapping to whitespace/control characters the bpe code barfs on. 
- """ - bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1)) - cs = bs[:] - n = 0 - for b in range(2**8): - if b not in bs: - bs.append(b) - cs.append(2**8+n) - n += 1 - cs = [chr(n) for n in cs] - return dict(zip(bs, cs)) - -def get_pairs(word): - """Return set of symbol pairs in a word. - - Word is represented as tuple of symbols (symbols being variable-length strings). - """ - pairs = set() - prev_char = word[0] - for char in word[1:]: - pairs.add((prev_char, char)) - prev_char = char - return pairs - -class Encoder: - def __init__(self, encoder, bpe_merges, errors='replace'): - self.encoder = encoder - self.decoder = {v:k for k,v in self.encoder.items()} - self.errors = errors # how to handle errors in decoding - self.byte_encoder = bytes_to_unicode() - self.byte_decoder = {v:k for k, v in self.byte_encoder.items()} - self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges)))) - self.cache = {} - - # Should haved added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions - self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""") - - def bpe(self, token): - if token in self.cache: - return self.cache[token] - word = tuple(token) - pairs = get_pairs(word) - - if not pairs: - return token - - while True: - bigram = min(pairs, key = lambda pair: self.bpe_ranks.get(pair, float('inf'))) - if bigram not in self.bpe_ranks: - break - first, second = bigram - new_word = [] - i = 0 - while i < len(word): - try: - j = word.index(first, i) - new_word.extend(word[i:j]) - i = j - except: - new_word.extend(word[i:]) - break - - if word[i] == first and i < len(word)-1 and word[i+1] == second: - new_word.append(first+second) - i += 2 - else: - new_word.append(word[i]) - i += 1 - new_word = tuple(new_word) - word = new_word - if len(word) == 1: - break - else: - pairs = get_pairs(word) - word = ' '.join(word) - self.cache[token] = word - return word 
- - def encode(self, text): - bpe_tokens = [] - for token in re.findall(self.pat, text): - token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8')) - bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(' ')) - return bpe_tokens - - def decode(self, tokens): - text = ''.join([self.decoder[token] for token in tokens]) - text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors=self.errors) - return text - -def get_encoder(model_name, models_dir): - with open(os.path.join(models_dir, model_name, 'encoder.json'), 'r') as f: - encoder = json.load(f) - with open(os.path.join(models_dir, model_name, 'vocab.bpe'), 'r', encoding="utf-8") as f: - bpe_data = f.read() - bpe_merges = [tuple(merge_str.split()) for merge_str in bpe_data.split('\n')[1:-1]] - return Encoder( - encoder=encoder, - bpe_merges=bpe_merges, - ) - - -def download_vocab(): - # Modified code from - subdir = 'gpt2_model' - if not os.path.exists(subdir): - os.makedirs(subdir) - subdir = subdir.replace('\\','/') # needed for Windows - - for filename in ['encoder.json', 'vocab.bpe']: - - r = requests.get("https://openaipublic.blob.core.windows.net/gpt-2/models/117M" + "/" + filename, stream=True) - - with open(os.path.join(subdir, filename), 'wb') as f: - file_size = int(r.headers["content-length"]) - chunk_size = 1000 - with tqdm(ncols=100, desc="Fetching " + filename, total=file_size, unit_scale=True) as pbar: - # 1k for chunk_size, since Ethernet packet size is around 1500 bytes - for chunk in r.iter_content(chunk_size=chunk_size): - f.write(chunk) - pbar.update(chunk_size) diff --git a/ch02/02_bonus_bytepair-encoder/.ipynb_checkpoints/compare-bpe-tiktoken-checkpoint.ipynb b/ch02/02_bonus_bytepair-encoder/.ipynb_checkpoints/compare-bpe-tiktoken-checkpoint.ipynb deleted file mode 100644 index 7448afb..0000000 --- a/ch02/02_bonus_bytepair-encoder/.ipynb_checkpoints/compare-bpe-tiktoken-checkpoint.ipynb +++ /dev/null @@ -1,442 +0,0 @@ -{ - "cells": [ 
- { - "cell_type": "markdown", - "id": "a9adc3bf-353c-411e-a471-0e92786e7103", - "metadata": {}, - "source": [ - "# Using BytePair encodding from `tiktoken`" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "4036ffa3-0e5c-433a-a997-4ed7d33de0b2", - "metadata": {}, - "outputs": [], - "source": [ - "# !pip install tiktoken" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "1c490fca-a48a-47fa-a299-322d1a08ad17", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "tiktoken version: 0.5.2\n" - ] - } - ], - "source": [ - "import importlib.metadata\n", - "\n", - "print(\"tiktoken version:\", importlib.metadata.version(\"tiktoken\"))" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "0952667c-ce84-4f21-87db-59f52b44cec4", - "metadata": {}, - "outputs": [], - "source": [ - "import tiktoken\n", - "\n", - "tik_tokenizer = tiktoken.get_encoding(\"gpt2\")\n", - "\n", - "text = \"Hello, world. Is this-- a test?\"" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "b039c350-18ad-48fb-8e6a-085702dfc330", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[15496, 11, 995, 13, 1148, 428, 438, 257, 1332, 30]\n" - ] - } - ], - "source": [ - "integers = tik_tokenizer.encode(text, allowed_special={\"<|endoftext|>\"})\n", - "\n", - "print(integers)" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "7b152ba4-04d3-41cc-849f-adedcfb8cabb", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Hello, world. 
Is this-- a test?\n" - ] - } - ], - "source": [ - "strings = tik_tokenizer.decode(integers)\n", - "\n", - "print(strings)" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "cf148a1a-316b-43ec-b7ba-1b6d409ce837", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "50257\n" - ] - } - ], - "source": [ - "print(tik_tokenizer.n_vocab)" - ] - }, - { - "cell_type": "markdown", - "id": "6a0b5d4f-2af9-40de-828c-063c4243e771", - "metadata": {}, - "source": [ - "# Using the original Byte-pair encoding implementation used in GPT-2" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "0903108c-65cb-4ae1-967a-2155e25349c2", - "metadata": {}, - "outputs": [], - "source": [ - "from bpe_openai_gpt2 import get_encoder, download_vocab" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "35dd8d7c-8c12-4b68-941a-0fd05882dd45", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Fetching encoder.json: 1.04Mit [00:28, 36.8kit/s] \n", - "Fetching vocab.bpe: 457kit [00:00, 458kit/s] \n" - ] - } - ], - "source": [ - "download_vocab()" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "id": "1888a7a9-9c40-4fe0-99b4-ebd20aa1ffd0", - "metadata": {}, - "outputs": [], - "source": [ - "orig_tokenizer = get_encoder(model_name=\"gpt2_model\", models_dir=\".\")" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "id": "2740510c-a78a-4fba-ae18-2b156ba2dfef", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[15496, 11, 995, 13, 1148, 428, 438, 257, 1332, 30]\n" - ] - } - ], - "source": [ - "integers = orig_tokenizer.encode(text)\n", - "\n", - "print(integers)" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "id": "434d115e-990d-42ad-88dd-31323a96e10f", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Hello, world. 
Is this-- a test?\n" - ] - } - ], - "source": [ - "strings = orig_tokenizer.decode(integers)\n", - "\n", - "print(strings)" - ] - }, - { - "cell_type": "markdown", - "id": "4f63e8c6-707c-4d66-bcf8-dd790647cc86", - "metadata": {}, - "source": [ - "# Using the BytePair Tokenizer in HuggingFace transformers" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "5bfff386-f725-4137-9c50-e5da0c38bea0", - "metadata": {}, - "outputs": [], - "source": [ - "# pip install transformers" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "e9077bf4-f91f-42ad-ab76-f3d89128510e", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'4.30.2'" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import transformers\n", - "\n", - "transformers.__version__" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "16e06ee5-c4ca-4211-8e24-dbfd84b1d85b", - "metadata": {}, - "outputs": [], - "source": [ - "设置为国内可访问" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "3e07ddc9-187e-4482-a7b5-7e4e9381d805", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "env: HF_ENDPOINT=https://hf-mirror.com\n" - ] - } - ], - "source": [ - "%env HF_ENDPOINT=https://hf-mirror.com" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "a9839137-b8ea-4a2c-85fc-9a63064cf8c8", - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "afc151b540664287aa60a4cbe90cdfeb", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "vocab.json: 0.00B [00:00, ?B/s]" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "9a5d584e4adf40bca215b409b693dc02", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "merges.txt: 0.00B [00:00, ?B/s]" - ] - }, - 
"metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "a126ee77a9f94e58b1dcccd68e6d5bb1", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "config.json: 0%| | 0.00/367 [00:00$p7j=VNbLphZ#BtV+i>f3{ zcmuK^h5TVL5E>OG)j%+y1|-F5?F@^OFd~P?g)vo~Q1-XAjRw@QNnfi!Jl+gDc>IjL#IB<^yBvC*klJJ-qo(cuSUeR+t z?DzVbSsT{x(?$*f5KBSU%#mypM_fZfZ@@2w6v=8G43DT&URe@4!xIToS}^=nu1RsDeM}4U7w*PLh?ez=RMU5!5jWREJb4qzV_qlY(EC zyb#(HBq1E?A8Ofc7nHCNP=$a(-(^Km!@!p=;AMHjL{Rc7QY*C-4h1g?Uga$CdeaCe z6)BBKkR@dz98v;4DgvqbQB6iV5E27^YLxKjh?xTNG%=zusZiH3a$2p{en}n=C`uq4 zBBI75S@K;JMrALU5bY4Q1Tsbh|Cm=EmFz^WH*`^$0L!Ql_Nm@Lh_HFl(!|BI2!KY( zN)nV=y^0d{2SAbtR{fLX5RO-+_C^9hNf90+p>4~`ep@qZM3lTiu)+kI{lugco>Yl- zRSx(GhuseFp`w%b1*O-$h1JHSw}KKLwI&Zr#(H!F5)RMwk9Th!GJPm7sLRe z_D!lNsZb^x8g?uAXba1NA_aql0V6)!-C64FtQR03$Oi3aM8w*i8pEt0QnF=9(j${{ z2u(sL=^>zm?N+pSM)IqaL&ZnJ!C-ib#EUT{VxFPD_gk$)_~iA4&(l!Lj?++B1*vHg z#(

k?z&ZCuPhF+IfGRv4l~Jx*b;uY2gYW56l628I5Cz873w&Msk_V-WA#?80%^P*2~n zAps=@9PXhLLSMJwaGwwkyWCxN!TH1f0q5YL&^I8sj`a7soLzRS%iY;~tjpy-By^yj zyANu>1!VxFLwy1@m_g-oqF%Rf#5vH}1I&&NSFda6gx%Wh8gdiPZXk6C{f>blSLd-_ z$AHj(Y@oky(23@|fY$ADcMqTy=MkrSs8w*ektaA`z>6^0WO*xK%`T<5?XJ7w` z0oS3PA)%+Qx63&&Xzg%g;v6j@X4R#}UUNqHaLw1Hl8VSeiQFc#o51 zp*;ug&LLNyo7m{=a}N#R)ovZ?8yLz}J?;InYpQ++nL5f zqGnX*dIUvCV8;fX)*LNePDd|rp?ll~h0&YVFUiojoB5PMJtvOiX$i`c9>3Q=Ciyc) zh&?QTdi?L6gmJpgsr>3kmGU!}YRslM7{hl^>b+daf^WLQZ=!x06$_1DVTte5qM@ix zJ~hU|Z-2Z+qUbxdYAVEBz-LL}x9`^K3>SebAew)ytIvsZh8Ha}x_nuS)xUPmkfmhip1DXuV1(72f+}YQ`+PVc3dh`sGh`hflUVT` z@O0o-DB*EQ9V06f44g%d(7vk!)y|@j=vCp0q&*J_+)qw0a+txF!;{F_a~M`i7DNel z5*`tZlI$IY^FpE=B144RD=2}{5L`Mx5d_ObWl@TFVN{30^0+q$gX@Jweq#4wCMm{B z1`(VN#sj%Qc3}z*6=4d7!)HN}>V*cEk30?n%(hL(xu^#zct0b{3WTKZ^)7U_^tHg{q*A> z*yFdqam)1aOAh7yITeK*QW@3w-?al==SDGPPgP<-=as<_#@ni5yQ-V38sy&Ppgr~e zF}Ycn(zWhN@gap;5QJ}D!lRA4uf5HGcB*Yx`{!@};){R$;ESc%|5VJxH~PNy01bcm z_Y()Fo2Y`~QNx~$z*`Rl;jagkgGgjcJqxOoEdaAz+|ro7=a0U+J?g#u=-i{Zth*?| zHJeffs+cmu7af&S`mijfOz12O#FPO(uA0(M2PT>gDVHLaZg<2`zr%*se<4-1; zrZyB4CO?d9lC)9=kn#^QnTgcQ=GP&_=5kR}pC=d_`7i9T8DlM@zswj*gNjXLNzSBn?3 zW!AA=UN!&ZTA;LpR8>VN|${AibQ_af7aUZ5AjZ&roy`)z~co@|A$B`uEBTExd*HAccKM@0| zlL)U!@uVvw`p71H5@~_+UO7Peq$MbYMiI1S^m%i7p0oKYn?bP~GfodWQhKi_VhkFn z>6DStlCq%5^Bz(iDFeZx5osBfR1YgD?*OU}{FQzH3`HwfzTvWS&N=H$Zm5|*x==Ff zVu|^I`GE!V3a2yHGY~Z-&E>CIu2|-uj*0QA?Fn;}W^Rg`o02tkQA^a4EZs2Mo9<2^ z&RwJ&=(_tDz;#aIu*AS&d7(tqUE+uJq8=f?K{NmwY2jelB$^<2bE?Ei0ly@tN+8Z4 z^rN5VSoWf&4K_%qW3R*7MmSIQ8wLfeHFqZWzJBKy4yIz;4t!b4F(B|pt+6ZL<`?2X zshlSU?udrm`=3Yy)<80?+OjElYn7|%h)yMyletuyO_?iFG_VVm`GdR@@R9Rn0^q_^ zmc^-V&Zao-Ohq;=7WrDJk)u}RXd1Z-W;k^uvprarLg!Hvr_AJV&+s!k%t`*Qb#sZR{ZwcJ8YFV^6_yL&o;kHMa#Gp( z*t1(t4mMM&c|)p1%FH|?T$6DnRRT|o%|w$V%VAkz#w=AvHqMhRntsa^l&N238Y=Q8 zfRrAJEoC8N4zqezNf}w(mD0&ls)TwDOEM|_gjXe43KB04uVw z0guw<2798RRcmOC8Z>i#lF5B(NNYLxSxeXLmaargx7N}P!6zCHX$^;1q2>BU81TnG zkbc#9)B4N45Bn0WU0Q2bVvAGT;!MBZgopJT6IZiwVSju}I8i;JRZk$# 
z9K|m^;Bq<#)YIH&I&KK?Crry-&eLk=Ik`OlJ(6b%2f6$Rs@Rc3e3jEnh*gU81jSYg zCgC7w;nWh)khvoIC^v#d;!EcD^@ZAF#t<=plAL}M030IsARH&S(2pttQVQqE0ThnVCWF~MRA88B%rV16+V>iNox9Bc zF)t#*;rsKz|omia7EMFHU6EO+C{4-bC!g!A?j&RgQ|obiF<|JM8Oy@@AI zXiuD&<(R6WDDJFuan-&2lE!O_7n-f|eo7gc-^$#@lzu!AVie2!*e5LbOBYfm=>m2g zM485F%5Wwe2%#>KkGd4g%iEEbn<&HN3t)YJ9K6DDR#PSgCIvB-IV4Sam>bInQv4Z( z+`5-w+tXJ%n9_sU>C!yTUqm70Kj8O)PdJ_PT?^00OSj-oR&QF^ACt7|rg&M?vaM$R z*yYGvBx+dRP#cd zv^#05jSandI&RyE`#(3-CL6Y0J`#1#Kb726AMHtQsEj(2Rn^!0z#do#yxub3vZ6Cp zK9sC$___6G*2O(bTN8D5tgOHv zj(aOQ9ctzkt6HjhyC+f8qSds-Yg&>u_4Ceq_wL=P-W1~(_Qs^!)!XCM+n4wcI&T=> zb-nMp;eGe;-`kdKH@j|C-fI7-`k!5&ZoGYtfRzwl{6_#;^85%q#o24b`t*xN;$8xcu_m%LpbK1Tu&7dlE3S zw(4l9LSE)ad-rs-a38lk+*zUb=Nm&(@yr+=K*y6Ygh|omuU4GNMR=I(xthhxaEJ$I zX7u8>ECpr_TtvT^GiGQvcNIIj#X&=%`h+3XE#?{892$bfjXw-~h=__U2VK^!>c4#d;sx$H4}?|Z-M`CY zU0r?(b)nVBO_yo4C9TE?wIF2lnB~Bz6p}7X$OoslW~Z~nT+KXPmM!uV)Kqo?-1~z2 z0*+jKb3g7IM`K-DL(7s^YuJ^}$%jm@-8Zt;xB|WYXL2Z$#sQBcpA3&_cQ&o%n8MEI zR6dD<-+{*{iYbo+WO;1##T$J$n`7de!KJ6QEqm}*bTby^=ZO%r{0ttck`X9l?5W8= zz?)32FZ6QSfp=5Ramq0&6WB9vHl($)L51Q@``3&EMW&?y=3l=A&CArT>_ygeMPWZ( zqGEpsK$;istaZ6|)57qg;cfF$Q=+y_t8K$dfyK|KVbqzd*?9fX>xc6D=;kB8vH#kB zYbvq7Put&zFNw|l8emPoR@09fWg9;$YrKusq&(JdssDQ6;|uZrV;{Zp$t#~7I(hrh z$@r<45{JCnAuq}$8hsjInNKV8#o3*-)yK{CYdvb3)YMUTKLj8p0!4t-+AF6B)~xv# zP4R-A3RX*h$sEx=cq%UD)Kh-TBKgon9lPE)7Wxhp7{h-sx#bi1UNE|eE9SM(P$I8| zE#}&0_-~uNSjgRw39m7#pG2+f%%yR#cIL`8$e!Mo9Si9wFK^D~)>`LKF6)8BH$9iW zZx~~SH_H}}X>~hudFignX>3&cGnv>66xpfC`~M6UUL_W`0$_;8>wl2GZ?wmnw1;*s zIkbmbbD8O>c#0b4S%8Y^#ipNyB4e*CdoQL;nTVL;SCGeiBU5J1`Kcx9Z_K&$lp|fc zI_1(;e@dlB0cgtEDqowrGL@4e5;r#jBsM*#0oojz%>i%6R+pm@iJJz1MEwsmK-)ph zcJMQs^R~@-Yc&4Cu=avCZgVDVKF#KXMJ6lExFUs4gQ>YBg)lqh9hW?wl-1+GVV21t zrOP~?bCcd+`iuM=iZVUJl!i=fq}Og}H7dEtm4(lhfzoN0W|B<%bOOq+U3Fee`ZT9J$IP)eV7*Tv+S(1&2Z9hWlTsF}!EU~?PWXI6IJjj4=GTJeYhImKrd^~|)WM|*^<4Ta(mE{fpB#4kju(w2F>foe4{iX6Zp# zXMFfm$8y!ih31&!s(ry8Em_eck9}a^D=P9efz`bAQV6rw%l^EGfM*gF&uSIV-m3m| zDsDNRupHMc$EjXJ{Kb>W>YD3Mz5dkIeGB`dW+J13ePMveXdp83@>ajDrZQv{m%78c 
z_h!#;+`o2z`efojzjmNMesmx{cr1SOn0DX=^lzf_xK??5_W9+~s^o@E$?C=xqrS}a z6$kJ&0n(Z#Wjkbhqt;QW|ESX7s4e-ZjtBg>ytboN|8c9KV|U5Ndw4)Uisdd^e_&6ohbunO()@utLWzl8dnBq7A>osXxPs284P4 zi|JarTKo(RHFk-{2v4hJqUrsTqMACDwj46oxA3+83f!9W9>qsop;-0#J2esA8EdvB z(XzVQS12DSSu2zst)gcxu~aNe(;qQ_gYtJEmG^RzK`SPx@7#V(wiZ31Br6KBLrQk6 z%IO!e!ngK{6e(r&foUvU3>0kgkCQ|S| zfAT)x1|2p5;$#_M`UlxFyXjK|?8v3Rl@0^9;!KUyii3_gsjIYxBsFtTm|;@d+S~UY zX&b^CnF-oh!ornq$~Rp>Cp56Fo7t!he1Y-$rAX~qx+NTEpv_$DVHgp81)(!>>c&qi zVX8Nnvf_k->QQhwK>|{%e{3>z)}sWbrDmIak;tVHNegN?eW0YwY;8TM>sPlnKxaZ1K}$}~9vohxBi)r9kw zq>s&X4wl?E`5a-JrK%RnV?@b+Mel|QoH)T|4biI5uxE-or>iS+XPlW)mES}O`YSh? zy1r9hKij=rT0Xb;wS!j<#vVc`^eCgWx26J_VMvhz{?^YY4T z+ZJ|Qx4&+WonO3=sM@7f?YhyFDBr7DcK+ zV~5t*f#|jpgaOPW!mXtUH|D(RUU1L5u@!#(@au;ct%;hQTFuTlyH|`<-Uw_AB?Dg( z@~;W3=B<}vHej7nC18v3#=uv^<<|sO^VUl-Y->y1Td`zCtxzEK@unxbb{l_dbr`xf zmwaL?!^l7;kgp;mJxnbllOewv#ypWpd9f`;w$#iV31<4)fGUb3uoj+?HW*ZE(&bQT^KIS1;hcy!}hz}&U} E2jXn#ga7~l diff --git a/ch02/02_bonus_bytepair-encoder/bpe_openai_gpt2.py b/ch02/02_bonus_bytepair-encoder/bpe_openai_gpt2.py index f3d9575..0b7cfa1 100644 --- a/ch02/02_bonus_bytepair-encoder/bpe_openai_gpt2.py +++ b/ch02/02_bonus_bytepair-encoder/bpe_openai_gpt2.py @@ -41,6 +41,7 @@ import requests from tqdm import tqdm from functools import lru_cache +# # 定义一个函数将字节转换为Unicode字符 @lru_cache() def bytes_to_unicode(): """ @@ -52,6 +53,15 @@ def bytes_to_unicode(): To avoid that, we want lookup tables between utf-8 bytes and unicode strings. And avoids mapping to whitespace/control characters the bpe code barfs on. 
""" + ''' + 返回一组UTF-8字节和相应的Unicode字符串列表。 + 可逆的BPE编码适用于Unicode字符串。 + 这意味着如果想要避免UNK(未知标记),则需要在词汇表中包含大量的Unicode字符。 + 当处理大约100亿标记的数据集时,最终需要大约5000个字符以确保良好的覆盖率。 + 这相当于正常情况下使用的32,000个BPE词汇表的显著比例。 + 为了避免这种情况,我们希望在UTF-8字节和Unicode字符串之间建立查找表。 + 并且要避免将BPE代码映射到空格/控制字符上,以免出现问题。 + ''' bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1)) cs = bs[:] n = 0 @@ -63,11 +73,16 @@ def bytes_to_unicode(): cs = [chr(n) for n in cs] return dict(zip(bs, cs)) +# 定义一个函数获取单词中的符号对 def get_pairs(word): """Return set of symbol pairs in a word. Word is represented as tuple of symbols (symbols being variable-length strings). """ + ''' + 返回单词中的符号对集合。 + 单词以符号元组的形式表示(其中符号是可变长度的字符串)。 + ''' pairs = set() prev_char = word[0] for char in word[1:]: @@ -75,8 +90,10 @@ def get_pairs(word): prev_char = char return pairs +# 定义一个使用字节对编码(BPE)进行编码和解码的Encoder类 class Encoder: def __init__(self, encoder, bpe_merges, errors='replace'): + # # 使用编码器字典、BPE合并和错误处理策略初始化Encoder self.encoder = encoder self.decoder = {v:k for k,v in self.encoder.items()} self.errors = errors # how to handle errors in decoding @@ -89,6 +106,7 @@ class Encoder: self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""") def bpe(self, token): + # 对给定的标记执行字节对编码 if token in self.cache: return self.cache[token] word = tuple(token) @@ -130,6 +148,7 @@ class Encoder: return word def encode(self, text): + # 使用BPE对给定文本进行编码 bpe_tokens = [] for token in re.findall(self.pat, text): token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8')) @@ -137,10 +156,11 @@ class Encoder: return bpe_tokens def decode(self, tokens): + # 将一系列标记解码回文本 text = ''.join([self.decoder[token] for token in tokens]) text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors=self.errors) return text - +# 定义一个函数获取特定模型的编码器 def get_encoder(model_name, models_dir): with open(os.path.join(models_dir, model_name, 'encoder.json'), 'r') as f: encoder = 
json.load(f) @@ -152,7 +172,7 @@ def get_encoder(model_name, models_dir): bpe_merges=bpe_merges, ) - +# 定义一个函数下载GPT-2模型的词汇文件 def download_vocab(): # Modified code from subdir = 'gpt2_model' diff --git a/ch02/02_bonus_bytepair-encoder/compare-bpe-tiktoken.ipynb b/ch02/02_bonus_bytepair-encoder/compare-bpe-tiktoken.ipynb index a777c4a..5b32038 100644 --- a/ch02/02_bonus_bytepair-encoder/compare-bpe-tiktoken.ipynb +++ b/ch02/02_bonus_bytepair-encoder/compare-bpe-tiktoken.ipynb @@ -5,7 +5,8 @@ "id": "a9adc3bf-353c-411e-a471-0e92786e7103", "metadata": {}, "source": [ - "# Using BytePair encodding from `tiktoken`" + "# 使用来自 `tiktoken` 的字节对编码\n", + "tiktoken是一个用于OpenAI模型的快速BPE标记器。(BPE标记器是一种基于字节对编码(Byte Pair Encoding,简称BPE)的文本标记方法。字节对编码是一种数据压缩技术,但在自然语言处理中,它也被用于创建词汇表和对文本进行分词。)" ] }, { @@ -20,7 +21,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 1, "id": "1c490fca-a48a-47fa-a299-322d1a08ad17", "metadata": {}, "outputs": [ @@ -28,33 +29,33 @@ "name": "stdout", "output_type": "stream", "text": [ - "tiktoken version: 0.5.2\n" + "tiktoken version: 0.6.0\n" ] } ], "source": [ "import importlib.metadata\n", - "\n", + "# 打印出当前系统中安装的 tiktoken 库的版本号\n", "print(\"tiktoken version:\", importlib.metadata.version(\"tiktoken\"))" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 2, "id": "0952667c-ce84-4f21-87db-59f52b44cec4", "metadata": {}, "outputs": [], "source": [ "import tiktoken\n", - "\n", + "# 创建一个使用 GPT-2 模型的编码器对象\n", "tik_tokenizer = tiktoken.get_encoding(\"gpt2\")\n", - "\n", + "# ,定义一个包含文本的字符串变量,使用 tik_tokenizer 对象对文本进行编码\n", "text = \"Hello, world. 
Is this-- a test?\"" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 3, "id": "b039c350-18ad-48fb-8e6a-085702dfc330", "metadata": {}, "outputs": [ @@ -67,6 +68,7 @@ } ], "source": [ + "# 参数 allowed_special,该参数指定哪些特殊字符允许出现在编码结果\n", "integers = tik_tokenizer.encode(text, allowed_special={\"<|endoftext|>\"})\n", "\n", "print(integers)" @@ -74,7 +76,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 4, "id": "7b152ba4-04d3-41cc-849f-adedcfb8cabb", "metadata": {}, "outputs": [ @@ -87,6 +89,7 @@ } ], "source": [ + "# 进行解码\n", "strings = tik_tokenizer.decode(integers)\n", "\n", "print(strings)" @@ -94,7 +97,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 5, "id": "cf148a1a-316b-43ec-b7ba-1b6d409ce837", "metadata": {}, "outputs": [ @@ -107,6 +110,7 @@ } ], "source": [ + "# 表示编码器的词汇表大小\n", "print(tik_tokenizer.n_vocab)" ] }, @@ -115,12 +119,12 @@ "id": "6a0b5d4f-2af9-40de-828c-063c4243e771", "metadata": {}, "source": [ - "# Using the original Byte-pair encoding implementation used in GPT-2" + "# 使用在GPT-2中使用的原始字节对编码实现" ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 6, "id": "0903108c-65cb-4ae1-967a-2155e25349c2", "metadata": {}, "outputs": [], @@ -130,7 +134,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 7, "id": "35dd8d7c-8c12-4b68-941a-0fd05882dd45", "metadata": {}, "outputs": [ @@ -138,8 +142,8 @@ "name": "stderr", "output_type": "stream", "text": [ - "Fetching encoder.json: 1.04Mit [00:28, 36.8kit/s] \n", - "Fetching vocab.bpe: 457kit [00:00, 458kit/s] \n" + "Fetching encoder.json: 1.04Mit [00:02, 502kit/s] \n", + "Fetching vocab.bpe: 457kit [00:02, 212kit/s] \n" ] } ], @@ -149,7 +153,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 8, "id": "1888a7a9-9c40-4fe0-99b4-ebd20aa1ffd0", "metadata": {}, "outputs": [], @@ -159,7 +163,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 9, "id": 
"2740510c-a78a-4fba-ae18-2b156ba2dfef", "metadata": {}, "outputs": [ @@ -179,7 +183,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 10, "id": "434d115e-990d-42ad-88dd-31323a96e10f", "metadata": {}, "outputs": [ @@ -202,12 +206,14 @@ "id": "4f63e8c6-707c-4d66-bcf8-dd790647cc86", "metadata": {}, "source": [ - "# Using the BytePair Tokenizer in HuggingFace transformers" + "# 使用HuggingFace Transformers中的BytePair Tokenizer\n", + "\r\n", + "\r\n" ] }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 11, "id": "5bfff386-f725-4137-9c50-e5da0c38bea0", "metadata": {}, "outputs": [], @@ -217,17 +223,17 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 12, "id": "e9077bf4-f91f-42ad-ab76-f3d89128510e", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "'4.30.2'" + "'4.33.3'" ] }, - "execution_count": 13, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -240,47 +246,19 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "16e06ee5-c4ca-4211-8e24-dbfd84b1d85b", - "metadata": {}, - "outputs": [], - "source": [ - "设置为国内可访问" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "3e07ddc9-187e-4482-a7b5-7e4e9381d805", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "env: HF_ENDPOINT=https://hf-mirror.com\n" - ] - } - ], - "source": [ - "%env HF_ENDPOINT=https://hf-mirror.com" - ] - }, - { - "cell_type": "code", - "execution_count": 2, + "execution_count": 13, "id": "a9839137-b8ea-4a2c-85fc-9a63064cf8c8", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "afc151b540664287aa60a4cbe90cdfeb", + "model_id": "ef488b57e4214b76a8913a4704de7e15", "version_major": 2, "version_minor": 0 }, "text/plain": [ - "vocab.json: 0.00B [00:00, ?B/s]" + "vocab.json: 0%| | 0.00/1.04M [00:00)" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": 
"execute_result" - } - ], - "source": [ - "embedding(torch.tensor([1]))" - ] - }, - { - "cell_type": "markdown", - "id": "6a4d47f2-4691-47b8-9855-2528b6c285c9", - "metadata": {}, - "source": [ - "- Below is a visualization of what happens under the hood:" - ] - }, - { - "cell_type": "markdown", - "id": "12ffd155-7190-44b1-b6b6-45b11d6fe83b", - "metadata": {}, - "source": [ - "" - ] - }, - { - "cell_type": "markdown", - "id": "87d1311b-cfb2-4afc-9e25-e4ecf35370d9", - "metadata": {}, - "source": [ - "- Similarly, we can use embedding layers to obtain the vector representation of a training example with ID 2:" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "c309266a-c601-4633-9404-2e10b1cdde8c", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "tensor([[ 0.6957, -1.8061, -1.1589, 0.3255, -0.6315]],\n", - " grad_fn=)" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "embedding(torch.tensor([2]))" - ] - }, - { - "cell_type": "markdown", - "id": "7ad3b601-f68c-41b1-a28d-b624b94ef383", - "metadata": {}, - "source": [ - "" - ] - }, - { - "cell_type": "markdown", - "id": "27dd54bd-85ae-4887-9c5e-3139da361cf4", - "metadata": {}, - "source": [ - "- Now, let's convert all the training examples we have defined previously:" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "0191aa4b-f6a8-4b0d-9c36-65e82b81d071", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "tensor([[ 0.6957, -1.8061, -1.1589, 0.3255, -0.6315],\n", - " [-2.8400, -0.7849, -1.4096, -0.4076, 0.7953],\n", - " [ 1.3010, 1.2753, -0.2010, -0.1606, -0.4015]],\n", - " grad_fn=)" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "idx = torch.tensor([2, 3, 1])\n", - "embedding(idx)" - ] - }, - { - "cell_type": "markdown", - "id": "146cf8eb-c517-4cd4-aa91-0e818fed7651", - "metadata": {}, - "source": [ - "- Under the hood, 
it's still the same look-up concept:" - ] - }, - { - "cell_type": "markdown", - "id": "b392eb43-0bda-4821-b446-b8dcbee8ae00", - "metadata": {}, - "source": [ - "" - ] - }, - { - "cell_type": "markdown", - "id": "f0fe863b-d6a3-48f3-ace5-09ecd0eb7b59", - "metadata": {}, - "source": [ - "## Using nn.Linear" - ] - }, - { - "cell_type": "markdown", - "id": "138de6a4-2689-4c1f-96af-7899b2d82a4e", - "metadata": {}, - "source": [ - "- Now, we will demonstrate that the embedding layer above accomplishes exactly the same as `nn.Linear` layer on a one-hot encoded representation in PyTorch\n", - "- First, let's convert the token IDs into a one-hot representation:" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "b5bb56cf-bc73-41ab-b107-91a43f77bdba", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "tensor([[0, 0, 1, 0],\n", - " [0, 0, 0, 1],\n", - " [0, 1, 0, 0]])" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "onehot = torch.nn.functional.one_hot(idx)\n", - "onehot" - ] - }, - { - "cell_type": "markdown", - "id": "aa45dfdf-fb26-4514-a176-75224f5f179b", - "metadata": {}, - "source": [ - "- Next, we initialize a `Linear` layer, which caries out a matrix multiplication $X W^\\top$:" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "ae04c1ed-242e-4dd7-b8f7-4b7e4caae383", - "metadata": {}, - "outputs": [], - "source": [ - "torch.manual_seed(123)\n", - "linear = torch.nn.Linear(num_idx, out_dim, bias=False)" - ] - }, - { - "cell_type": "markdown", - "id": "63efb98e-5cc4-4e8d-9fe6-ef0ad29ae2d7", - "metadata": {}, - "source": [ - "- Note that the linear layer in PyTorch is also initialized with small random weights; to directly compare it to the `Embedding` layer above, we have to use the same small random weights, which is why we reassign them here:" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "a3b90d69-761c-486e-bd19-b38a2988fe62", - 
"metadata": {}, - "outputs": [], - "source": [ - "linear.weight = torch.nn.Parameter(embedding.weight.T.detach())" - ] - }, - { - "cell_type": "markdown", - "id": "9116482d-f1f9-45e2-9bf3-7ef5e9003898", - "metadata": {}, - "source": [ - "- Now we can use the linear layer on the one-hot encoded representation of the inputs:" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "90d2b0dd-9f1d-4c0f-bb16-1f6ce6b8ac2c", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "tensor([[ 0.6957, -1.8061, -1.1589, 0.3255, -0.6315],\n", - " [-2.8400, -0.7849, -1.4096, -0.4076, 0.7953],\n", - " [ 1.3010, 1.2753, -0.2010, -0.1606, -0.4015]], grad_fn=)" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "linear(onehot.float())" - ] - }, - { - "cell_type": "markdown", - "id": "f6204bc8-92e2-4546-9cda-574fe1360fa2", - "metadata": {}, - "source": [ - "As we can see, this is exactly the same as what we got when we used the embedding layer:" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "2b057649-3176-4a54-b58c-fd8fbf818c61", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "tensor([[ 0.6957, -1.8061, -1.1589, 0.3255, -0.6315],\n", - " [-2.8400, -0.7849, -1.4096, -0.4076, 0.7953],\n", - " [ 1.3010, 1.2753, -0.2010, -0.1606, -0.4015]],\n", - " grad_fn=)" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "embedding(idx)" - ] - }, - { - "cell_type": "markdown", - "id": "0e447639-8952-460e-8c8f-cf9e23c368c9", - "metadata": {}, - "source": [ - "- What happens under the hood is the following computation for the first training example's token ID:" - ] - }, - { - "cell_type": "markdown", - "id": "1830eccf-a707-4753-a24a-9b103f55594a", - "metadata": {}, - "source": [ - "" - ] - }, - { - "cell_type": "markdown", - "id": "9ce5211a-14e6-46aa-a3a8-14609f086e97", - "metadata": {}, - "source": [ - "- And 
for the second training example's token ID:" - ] - }, - { - "cell_type": "markdown", - "id": "173f6026-a461-44da-b9c5-f571f8ec8bf3", - "metadata": {}, - "source": [ - "" - ] - }, - { - "cell_type": "markdown", - "id": "e2608049-f5d1-49a9-a14b-82695fc32e6a", - "metadata": {}, - "source": [ - "- Since all but one index in each one-hot encoded row are 0 (by design), this matrix multiplication is essentially the same as a look-up of the one-hot elements\n", - "- This use of the matrix multiplication on one-hot encodings is equivalent to the embedding layer look-up but can be inefficient if we work with large embedding matrices, because there are a lot of wasteful multiplications by zero" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5eacc005-86fc-490c-8f6a-dc37d8a0df7c", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a1f63c81-1ee3-40a1-9ef2-14ff18fb4f05", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c71959bb-facf-44fd-8edb-b67f7752f034", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.5" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/ch02/03_bonus_embedding-vs-matmul/README.md b/ch02/03_bonus_embedding-vs-matmul/README.md index a1f67ef..5228286 100644 --- a/ch02/03_bonus_embedding-vs-matmul/README.md +++ b/ch02/03_bonus_embedding-vs-matmul/README.md @@ -1,3 +1,4 @@ -# Chapter 2: Working with Text Data +# 第2章:使用文本数据 + +- [embeddings-and-linear-layers.ipynb](embeddings-and-linear-layers.ipynb) 
包含可选(奖励)代码,以说明应用于独热编码向量的嵌入层和全连接层是等效的。 -- [embeddings-and-linear-layers.ipynb](embeddings-and-linear-layers.ipynb) contains optional (bonus) code to explain that embedding layers and fully connected layers applied to one-hot encoded vectors are equivalent. diff --git a/ch02/03_bonus_embedding-vs-matmul/embeddings-and-linear-layers.ipynb b/ch02/03_bonus_embedding-vs-matmul/embeddings-and-linear-layers.ipynb index 198e1b1..e068fc5 100644 --- a/ch02/03_bonus_embedding-vs-matmul/embeddings-and-linear-layers.ipynb +++ b/ch02/03_bonus_embedding-vs-matmul/embeddings-and-linear-layers.ipynb @@ -5,7 +5,7 @@ "id": "063850ab-22b0-4838-b53a-9bb11757d9d0", "metadata": {}, "source": [ - "# Embedding Layers and Linear Layers" + "# Embedding层和 Linear层" ] }, { @@ -13,8 +13,8 @@ "id": "0315c598-701f-46ff-8806-15813cad0e51", "metadata": {}, "source": [ - "- Embedding layers in PyTorch accomplish the same as linear layers that perform matrix multiplications; the reason we use embedding layers is computational efficiency\n", - "- We will take a look at this relationship step by step using code examples in PyTorch" + "- 在PyTorch中,嵌入层(Embedding layers)实现了执行矩阵乘法的线性层的相同功能;我们使用嵌入层的原因是为了提高计算效率。\n", + "- 我们将逐步使用PyTorch中的代码示例来查看这种关系。" ] }, { @@ -27,13 +27,12 @@ "name": "stdout", "output_type": "stream", "text": [ - "PyTorch version: 2.1.0.post301\n" + "PyTorch version: 1.12.1+cu113\n" ] } ], "source": [ "import torch\n", - "\n", "print(\"PyTorch version:\", torch.__version__)" ] }, @@ -47,22 +46,21 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "id": "cc489ea5-73db-40b9-959e-0d70cae25f40", "metadata": {}, "outputs": [], "source": [ - "# Suppose we have the following 3 training examples,\n", - "# which may represent token IDs in a LLM context\n", + "# 假设我们有以下 3 个训练样本,\n", + "# 这些样本可能表示语言模型(LM)上下文中的标记ID\n", "idx = torch.tensor([2, 3, 1])\n", "\n", - "# The number of rows in the embedding matrix can be determined\n", - "# by obtaining the largest token ID + 1.\n", - 
"# If the highest token ID is 3, then we want 4 rows, for the possible\n", - "# token IDs 0, 1, 2, 3\n", - "num_idx = max(idx)+1\n", + "# 嵌入矩阵的行数可以通过获取最大标记ID + 1 来确定。\n", + "# 如果最高的标记ID是3,则我们希望有4行,对应可能的\n", + "# 标记ID 0, 1, 2, 3\n", + "num_idx = max(idx) + 1\n", "\n", - "# The desired embedding dimension is a hyperparameter\n", + "# 所需的嵌入维度是一个超参数\n", "out_dim = 5" ] }, @@ -71,21 +69,21 @@ "id": "93d83a6e-8543-40af-b253-fe647640bf36", "metadata": {}, "source": [ - "- Let's implement a simple embedding layer:" + "- 实现一个简单的嵌入层" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "id": "60a7c104-36e1-4b28-bd02-a24a1099dc66", "metadata": {}, "outputs": [], "source": [ - "# We use the random seed for reproducibility since\n", - "# weights in the embedding layer are initialized with\n", - "# small random values\n", + "# 为了可重复性,我们使用随机种子,\n", + "# 因为嵌入层的权重是用小的随机值初始化的\n", "torch.manual_seed(123)\n", "\n", + "# 创建一个嵌入层,指定输入维度为 num_idx,输出维度为 out_dim\n", "embedding = torch.nn.Embedding(num_idx, out_dim)" ] }, @@ -94,12 +92,12 @@ "id": "dd96c00a-3297-4a50-8bfc-247aaea7e761", "metadata": {}, "source": [ - "We can optionally take a look at the embedding weights:" + "查看嵌入权重数据情况" ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "id": "595f603e-8d2a-4171-8f94-eac8106b2e57", "metadata": {}, "outputs": [ @@ -113,7 +111,7 @@ " [-2.8400, -0.7849, -1.4096, -0.4076, 0.7953]], requires_grad=True)" ] }, - "execution_count": 4, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -127,12 +125,12 @@ "id": "c86eb562-61e2-4171-ab6e-b410a1fd5c18", "metadata": {}, "source": [ - "- We can then use the embedding layers to obtain the vector representation of a training example with ID 1:" + "- 使用嵌入层来获取具有ID 1的训练样本的向量表示" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "id": "8bbc0255-4805-4be9-9f4c-1d0d967ef9d5", "metadata": {}, "outputs": [ @@ -143,7 +141,7 @@ " grad_fn=)" ] }, - "execution_count": 5, 
+ "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -157,7 +155,7 @@ "id": "6a4d47f2-4691-47b8-9855-2528b6c285c9", "metadata": {}, "source": [ - "- Below is a visualization of what happens under the hood:" + "- 下面是底层操作的可视化" ] }, { @@ -173,12 +171,12 @@ "id": "87d1311b-cfb2-4afc-9e25-e4ecf35370d9", "metadata": {}, "source": [ - "- Similarly, we can use embedding layers to obtain the vector representation of a training example with ID 2:" + "- 同样,我们可以使用嵌入层来获取具有ID 2的训练样本的向量表示:" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "id": "c309266a-c601-4633-9404-2e10b1cdde8c", "metadata": {}, "outputs": [ @@ -189,7 +187,7 @@ " grad_fn=)" ] }, - "execution_count": 6, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -211,12 +209,12 @@ "id": "27dd54bd-85ae-4887-9c5e-3139da361cf4", "metadata": {}, "source": [ - "- Now, let's convert all the training examples we have defined previously:" + "- 现在,让我们将之前定义的所有训练样本转换:" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 8, "id": "0191aa4b-f6a8-4b0d-9c36-65e82b81d071", "metadata": {}, "outputs": [ @@ -229,12 +227,13 @@ " grad_fn=)" ] }, - "execution_count": 7, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ + "# 将原先的第三行变成现在的第一行,第四行变成现在的第二行,第二行变成现在的第三行\n", "idx = torch.tensor([2, 3, 1])\n", "embedding(idx)" ] @@ -260,7 +259,7 @@ "id": "f0fe863b-d6a3-48f3-ace5-09ecd0eb7b59", "metadata": {}, "source": [ - "## Using nn.Linear" + "## 使用 nn.Linear" ] }, { @@ -268,13 +267,13 @@ "id": "138de6a4-2689-4c1f-96af-7899b2d82a4e", "metadata": {}, "source": [ - "- Now, we will demonstrate that the embedding layer above accomplishes exactly the same as `nn.Linear` layer on a one-hot encoded representation in PyTorch\n", - "- First, let's convert the token IDs into a one-hot representation:" + "- 接下来,我们将使用One-Hot编码,与embedding 层一样,在 `nn.Linear` 层进行操作\n", + "- 首先,我们将标记ID转换为One-Hot表示:" ] }, { "cell_type": "code", - 
"execution_count": 8, + "execution_count": 12, "id": "b5bb56cf-bc73-41ab-b107-91a43f77bdba", "metadata": {}, "outputs": [ @@ -286,7 +285,7 @@ " [0, 1, 0, 0]])" ] }, - "execution_count": 8, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -301,18 +300,33 @@ "id": "aa45dfdf-fb26-4514-a176-75224f5f179b", "metadata": {}, "source": [ - "- Next, we initialize a `Linear` layer, which caries out a matrix multiplication $X W^\\top$:" + "- 接下来,我们使用矩阵乘法$X W^\\top$ 来初始化一个Linear层" ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 16, "id": "ae04c1ed-242e-4dd7-b8f7-4b7e4caae383", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Parameter containing:\n", + "tensor([[-0.2039, 0.0166, -0.2483, 0.1886],\n", + " [-0.4260, 0.3665, -0.3634, -0.3975],\n", + " [-0.3159, 0.2264, -0.1847, 0.1871],\n", + " [-0.4244, -0.3034, -0.1836, -0.0983],\n", + " [-0.3814, 0.3274, -0.1179, 0.1605]], requires_grad=True)\n" + ] + } + ], "source": [ "torch.manual_seed(123)\n", - "linear = torch.nn.Linear(num_idx, out_dim, bias=False)" + "# 初始化一个Linear层,该层的权重矩阵是由 num_idx(输入维度)到 out_dim(输出维度)的一个线性层,而且没有偏置项\n", + "linear = torch.nn.Linear(num_idx, out_dim, bias=False)\n", + "print(linear.weight)" ] }, { @@ -320,16 +334,17 @@ "id": "63efb98e-5cc4-4e8d-9fe6-ef0ad29ae2d7", "metadata": {}, "source": [ - "- Note that the linear layer in PyTorch is also initialized with small random weights; to directly compare it to the `Embedding` layer above, we have to use the same small random weights, which is why we reassign them here:" + "- 请注意,PyTorch中的`linear`层也是用小的随机权重进行初始化的。为了与上面的 `Embedding` 层进行直接比较,我们必须使用相同的小随机权重,这就是我们在这里重新分配它们的原因:" ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 17, "id": "a3b90d69-761c-486e-bd19-b38a2988fe62", "metadata": {}, "outputs": [], "source": [ + "# linear 层的权重就被重新赋值为与 embedding 层相同的小随机权重,以确保它们具有相同的初始化。这是为了使它们在后续操作中可以进行直接比较。\n", "linear.weight = 
torch.nn.Parameter(embedding.weight.T.detach())" ] }, @@ -338,12 +353,12 @@ "id": "9116482d-f1f9-45e2-9bf3-7ef5e9003898", "metadata": {}, "source": [ - "- Now we can use the linear layer on the one-hot encoded representation of the inputs:" + "- 现在,我们可以使用线性层处理输入的One-Hot编码表示:" ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 18, "id": "90d2b0dd-9f1d-4c0f-bb16-1f6ce6b8ac2c", "metadata": {}, "outputs": [ @@ -355,7 +370,7 @@ " [ 1.3010, 1.2753, -0.2010, -0.1606, -0.4015]], grad_fn=)" ] }, - "execution_count": 11, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } @@ -369,12 +384,12 @@ "id": "f6204bc8-92e2-4546-9cda-574fe1360fa2", "metadata": {}, "source": [ - "As we can see, this is exactly the same as what we got when we used the embedding layer:" + "正如我们所看到的,这与我们使用嵌入层时得到的结果完全相同:" ] }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 19, "id": "2b057649-3176-4a54-b58c-fd8fbf818c61", "metadata": {}, "outputs": [ @@ -387,7 +402,7 @@ " grad_fn=)" ] }, - "execution_count": 12, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } @@ -401,7 +416,7 @@ "id": "0e447639-8952-460e-8c8f-cf9e23c368c9", "metadata": {}, "source": [ - "- What happens under the hood is the following computation for the first training example's token ID:" + "- 底层发生的计算如下,针对第一个训练样本的标记ID:" ] }, { @@ -417,7 +432,7 @@ "id": "9ce5211a-14e6-46aa-a3a8-14609f086e97", "metadata": {}, "source": [ - "- And for the second training example's token ID:" + "- 以及对于第二个训练样本的标记ID:" ] }, { @@ -433,8 +448,9 @@ "id": "e2608049-f5d1-49a9-a14b-82695fc32e6a", "metadata": {}, "source": [ - "- Since all but one index in each one-hot encoded row are 0 (by design), this matrix multiplication is essentially the same as a look-up of the one-hot elements\n", - "- This use of the matrix multiplication on one-hot encodings is equivalent to the embedding layer look-up but can be inefficient if we work with large embedding matrices, because there 
are a lot of wasteful multiplications by zero" + "- 由于每个独热编码行中除了一个索引外都为0(设计如此),这个矩阵乘法本质上就是对独热编码元素的查找。\n", + "- 在独热编码上使用矩阵乘法与使用嵌入层查找是等效的,但如果我们使用大型嵌入矩阵,这种方法可能效率较低,", + "因为有很多不必要的零乘法。" ] }, { @@ -444,22 +460,6 @@ "metadata": {}, "outputs": [], "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a1f63c81-1ee3-40a1-9ef2-14ff18fb4f05", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c71959bb-facf-44fd-8edb-b67f7752f034", - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { @@ -478,7 +478,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.5" + "version": "3.8.17" } }, "nbformat": 4,