mirror of
https://github.com/datawhalechina/llms-from-scratch-cn.git
synced 2026-06-06 00:04:42 +00:00
443 lines
9.2 KiB
Plaintext
443 lines
9.2 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "a9adc3bf-353c-411e-a471-0e92786e7103",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Using BytePair encoding from `tiktoken`"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 1,
|
|
"id": "4036ffa3-0e5c-433a-a997-4ed7d33de0b2",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# !pip install tiktoken"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 4,
|
|
"id": "1c490fca-a48a-47fa-a299-322d1a08ad17",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"tiktoken version: 0.5.2\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"import importlib.metadata\n",
|
|
"\n",
|
|
"print(\"tiktoken version:\", importlib.metadata.version(\"tiktoken\"))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 5,
|
|
"id": "0952667c-ce84-4f21-87db-59f52b44cec4",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"import tiktoken\n",
|
|
"\n",
|
|
"tik_tokenizer = tiktoken.get_encoding(\"gpt2\")\n",
|
|
"\n",
|
|
"text = \"Hello, world. Is this-- a test?\""
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 6,
|
|
"id": "b039c350-18ad-48fb-8e6a-085702dfc330",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"[15496, 11, 995, 13, 1148, 428, 438, 257, 1332, 30]\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"integers = tik_tokenizer.encode(text, allowed_special={\"<|endoftext|>\"})\n",
|
|
"\n",
|
|
"print(integers)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 7,
|
|
"id": "7b152ba4-04d3-41cc-849f-adedcfb8cabb",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Hello, world. Is this-- a test?\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"strings = tik_tokenizer.decode(integers)\n",
|
|
"\n",
|
|
"print(strings)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 8,
|
|
"id": "cf148a1a-316b-43ec-b7ba-1b6d409ce837",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"50257\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"print(tik_tokenizer.n_vocab)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "6a0b5d4f-2af9-40de-828c-063c4243e771",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Using the original Byte-pair encoding implementation used in GPT-2"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 9,
|
|
"id": "0903108c-65cb-4ae1-967a-2155e25349c2",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"from bpe_openai_gpt2 import get_encoder, download_vocab"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 10,
|
|
"id": "35dd8d7c-8c12-4b68-941a-0fd05882dd45",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stderr",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Fetching encoder.json: 1.04Mit [00:28, 36.8kit/s] \n",
|
|
"Fetching vocab.bpe: 457kit [00:00, 458kit/s] \n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"download_vocab()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 14,
|
|
"id": "1888a7a9-9c40-4fe0-99b4-ebd20aa1ffd0",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"orig_tokenizer = get_encoder(model_name=\"gpt2_model\", models_dir=\".\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 15,
|
|
"id": "2740510c-a78a-4fba-ae18-2b156ba2dfef",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"[15496, 11, 995, 13, 1148, 428, 438, 257, 1332, 30]\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"integers = orig_tokenizer.encode(text)\n",
|
|
"\n",
|
|
"print(integers)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 16,
|
|
"id": "434d115e-990d-42ad-88dd-31323a96e10f",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Hello, world. Is this-- a test?\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"strings = orig_tokenizer.decode(integers)\n",
|
|
"\n",
|
|
"print(strings)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "4f63e8c6-707c-4d66-bcf8-dd790647cc86",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Using the BytePair Tokenizer in HuggingFace transformers"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 12,
|
|
"id": "5bfff386-f725-4137-9c50-e5da0c38bea0",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# pip install transformers"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 13,
|
|
"id": "e9077bf4-f91f-42ad-ab76-f3d89128510e",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"'4.30.2'"
|
|
]
|
|
},
|
|
"execution_count": 13,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"import transformers\n",
|
|
"\n",
|
|
"transformers.__version__"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "16e06ee5-c4ca-4211-8e24-dbfd84b1d85b",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# 设置为国内可访问 (use a Hugging Face mirror endpoint that is reachable from mainland China)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 1,
|
|
"id": "3e07ddc9-187e-4482-a7b5-7e4e9381d805",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"env: HF_ENDPOINT=https://hf-mirror.com\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"%env HF_ENDPOINT=https://hf-mirror.com"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 2,
|
|
"id": "a9839137-b8ea-4a2c-85fc-9a63064cf8c8",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"application/vnd.jupyter.widget-view+json": {
|
|
"model_id": "afc151b540664287aa60a4cbe90cdfeb",
|
|
"version_major": 2,
|
|
"version_minor": 0
|
|
},
|
|
"text/plain": [
|
|
"vocab.json: 0.00B [00:00, ?B/s]"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
},
|
|
{
|
|
"data": {
|
|
"application/vnd.jupyter.widget-view+json": {
|
|
"model_id": "9a5d584e4adf40bca215b409b693dc02",
|
|
"version_major": 2,
|
|
"version_minor": 0
|
|
},
|
|
"text/plain": [
|
|
"merges.txt: 0.00B [00:00, ?B/s]"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
},
|
|
{
|
|
"data": {
|
|
"application/vnd.jupyter.widget-view+json": {
|
|
"model_id": "a126ee77a9f94e58b1dcccd68e6d5bb1",
|
|
"version_major": 2,
|
|
"version_minor": 0
|
|
},
|
|
"text/plain": [
|
|
"config.json: 0%| | 0.00/367 [00:00<?, ?B/s]"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
}
|
|
],
|
|
"source": [
|
|
"from transformers import GPT2Tokenizer\n",
|
|
"\n",
|
|
"hf_tokenizer = GPT2Tokenizer.from_pretrained(\"gpt2\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 11,
|
|
"id": "222cbd69-6a3d-4868-9c1f-421ffc9d5fe1",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"[15496, 11, 995, 13, 1148, 428, 438, 257, 1332, 30]"
|
|
]
|
|
},
|
|
"execution_count": 11,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"hf_tokenizer(strings)[\"input_ids\"]"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "907a1ade-3401-4f2e-9017-7f58a60cbd98",
|
|
"metadata": {},
|
|
"source": [
|
|
"# A quick performance benchmark"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 17,
|
|
"id": "a61bb445-b151-4a2f-8180-d4004c503754",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"with open('../01_main-chapter-code/the-verdict.txt', 'r', encoding='utf-8') as f:\n",
|
|
" raw_text = f.read()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 18,
|
|
"id": "57f7c0a3-c1fd-4313-af34-68e78eb33653",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"9.14 ms ± 74.5 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"%timeit orig_tokenizer.encode(raw_text)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "036dd628-3591-46c9-a5ce-b20b105a8062",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"%timeit tik_tokenizer.encode(raw_text)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "b9c85b58-bfbc-465e-9a7e-477e53d55c90",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"%timeit hf_tokenizer(raw_text)[\"input_ids\"]"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "7117107f-22a6-46b4-a442-712d50b3ac7a",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"%timeit hf_tokenizer(raw_text, max_length=5145, truncation=True)[\"input_ids\"]"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "d81eaf6d-554b-44e3-aa19-2c3ae0030762",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": []
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3 (ipykernel)",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.11.5"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 5
|
|
}
|