Mirror of https://github.com/datawhalechina/llms-from-scratch-cn.git, synced 2026-05-01 11:58:17 +08:00

Commit 7a18d5b868 — "Add the latest code" (补充最新代码)
Parent: 310cdb21f5
@@ -7,10 +7,19 @@
    "id": "c024bfa4-1a7a-4751-b5a1-827225a3478b"
   },
   "source": [
-    "<font size=\"1\">\n",
-    "Supplementary code for \"Build a Large Language Model From Scratch\": <a href=\"https://www.manning.com/books/build-a-large-language-model-from-scratch\">https://www.manning.com/books/build-a-large-language-model-from-scratch</a> by <a href=\"https://sebastianraschka.com\">Sebastian Raschka</a><br>\n",
-    "Code repository: <a href=\"https://github.com/rasbt/LLMs-from-scratch\">https://github.com/rasbt/LLMs-from-scratch</a>\n",
-    "</font>"
+    "<table style=\"width:100%\">\n",
+    "<tr>\n",
+    "<td style=\"vertical-align:middle; text-align:left;\">\n",
+    "<font size=\"2\">\n",
+    "Supplementary code for the <a href=\"http://mng.bz/orYv\">Build a Large Language Model From Scratch</a> book by <a href=\"https://sebastianraschka.com\">Sebastian Raschka</a><br>\n",
+    "<br>Code repository: <a href=\"https://github.com/rasbt/LLMs-from-scratch\">https://github.com/rasbt/LLMs-from-scratch</a>\n",
+    "</font>\n",
+    "</td>\n",
+    "<td style=\"vertical-align:middle; text-align:left;\">\n",
+    "<a href=\"http://mng.bz/orYv\"><img src=\"https://sebastianraschka.com/images/LLMs-from-scratch-images/cover-small.webp\" width=\"100px\"></a>\n",
+    "</td>\n",
+    "</tr>\n",
+    "</table>"
   ]
  },
  {
@@ -907,7 +916,7 @@
    "id": "ab8e056c-abe0-415f-b34d-df686204259e",
    "metadata": {},
    "source": [
-    "- To ensure that the model was loaded corrected, let's double-check that it generates coherent text"
+    "- To ensure that the model was loaded correctly, let's double-check that it generates coherent text."
   ]
  },
  {
@@ -951,7 +960,7 @@
    "id": "69162550-6a02-4ece-8db1-06c71d61946f",
    "metadata": {},
    "source": [
-    "- Before we finetune the model as a classifier, let's see if the model can perhaps already classify spam messages via prompting"
+    "- Before we finetune the model as a classifier, let's see whether the model can perhaps already classify spam messages via prompting."
   ]
  },
  {
@@ -991,8 +1000,8 @@
    "id": "1ce39ed0-2c77-410d-8392-dd15d4b22016",
    "metadata": {},
    "source": [
-    "- As we can see, the model is not very good at following instructions\n",
-    "- This is expected, since it has only been pretrained and not instruction-finetuned (instruction finetuning will be covered in the next chapter)"
+    "- As we can see, the model is not very good at following instructions.\n",
+    "- This is expected, since it has only been pretrained and not instruction-finetuned (instruction finetuning will be covered in the next chapter)."
   ]
  },
  {
@@ -5,10 +5,19 @@
    "id": "ba450fb1-8a26-4894-ab7a-5d7bfefe90ce",
    "metadata": {},
    "source": [
-    "<font size=\"1\">\n",
-    "Supplementary code for \"Build a Large Language Model From Scratch\": <a href=\"https://www.manning.com/books/build-a-large-language-model-from-scratch\">https://www.manning.com/books/build-a-large-language-model-from-scratch</a> by <a href=\"https://sebastianraschka.com\">Sebastian Raschka</a><br>\n",
-    "Code repository: <a href=\"https://github.com/rasbt/LLMs-from-scratch\">https://github.com/rasbt/LLMs-from-scratch</a>\n",
-    "</font>"
+    "<table style=\"width:100%\">\n",
+    "<tr>\n",
+    "<td style=\"vertical-align:middle; text-align:left;\">\n",
+    "<font size=\"2\">\n",
+    "Supplementary code for the <a href=\"http://mng.bz/orYv\">Build a Large Language Model From Scratch</a> book by <a href=\"https://sebastianraschka.com\">Sebastian Raschka</a><br>\n",
+    "<br>Code repository: <a href=\"https://github.com/rasbt/LLMs-from-scratch\">https://github.com/rasbt/LLMs-from-scratch</a>\n",
+    "</font>\n",
+    "</td>\n",
+    "<td style=\"vertical-align:middle; text-align:left;\">\n",
+    "<a href=\"http://mng.bz/orYv\"><img src=\"https://sebastianraschka.com/images/LLMs-from-scratch-images/cover-small.webp\" width=\"100px\"></a>\n",
+    "</td>\n",
+    "</tr>\n",
+    "</table>"
   ]
  },
  {
@@ -21,11 +21,30 @@ from gpt_download import download_and_load_gpt2
 from previous_chapters import GPTModel, load_weights_into_gpt


-def download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path):
+def download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path, test_mode=False):
     if data_file_path.exists():
         print(f"{data_file_path} already exists. Skipping download and extraction.")
         return

+    if test_mode:  # Try multiple times since CI sometimes has connectivity issues
+        max_retries = 5
+        delay = 5  # delay between retries in seconds
+        for attempt in range(max_retries):
+            try:
+                # Downloading the file
+                with urllib.request.urlopen(url, timeout=10) as response:
+                    with open(zip_path, "wb") as out_file:
+                        out_file.write(response.read())
+                break  # if download is successful, break out of the loop
+            except urllib.error.URLError as e:
+                print(f"Attempt {attempt + 1} failed: {e}")
+                if attempt < max_retries - 1:
+                    time.sleep(delay)  # wait before retrying
+                else:
+                    print("Failed to download file after several attempts.")
+                    return  # exit if all retries fail
+
+    else:  # Code as it appears in the chapter
+        # Downloading the file
+        with urllib.request.urlopen(url) as response:
+            with open(zip_path, "wb") as out_file:
@@ -238,6 +257,7 @@ if __name__ == "__main__":
     )
+    parser.add_argument(
+        "--test_mode",
+        default=False,
+        action="store_true",
+        help=("This flag runs the model in test mode for internal testing purposes. "
+              "Otherwise, it runs the model as it is used in the chapter (recommended).")
@@ -253,7 +273,7 @@ if __name__ == "__main__":
     extracted_path = "sms_spam_collection"
     data_file_path = Path(extracted_path) / "SMSSpamCollection.tsv"

-    download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)
+    download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path, test_mode=args.test_mode)
     df = pd.read_csv(data_file_path, sep="\t", header=None, names=["Label", "Text"])
     balanced_df = create_balanced_dataset(df)
     balanced_df["Label"] = balanced_df["Label"].map({"ham": 0, "spam": 1})
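A minimal usage sketch of the updated helper: the `extracted_path` and `data_file_path` values below come from the hunk above, while `url` and `zip_path` are hypothetical placeholders, since the diff does not show their actual values.

```python
from pathlib import Path

url = "https://example.com/sms_spam_collection.zip"  # placeholder; use the script's real URL
zip_path = "sms_spam_collection.zip"                 # placeholder zip name
extracted_path = "sms_spam_collection"
data_file_path = Path(extracted_path) / "SMSSpamCollection.tsv"

# test_mode=True enables the retry loop above (useful on flaky CI connections);
# test_mode=False performs the single-attempt download shown in the chapter.
download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path, test_mode=True)
```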
@@ -330,9 +350,7 @@ if __name__ == "__main__":
         }
         model = GPTModel(BASE_CONFIG)
         model.eval()

         device = "cpu"
         model.to(device)

     # Code as it is used in the main chapter
     else:
@@ -355,15 +373,18 @@ if __name__ == "__main__":

         BASE_CONFIG.update(model_configs[CHOOSE_MODEL])

+        assert train_dataset.max_length <= BASE_CONFIG["context_length"], (
+            f"Dataset length {train_dataset.max_length} exceeds model's context "
+            f"length {BASE_CONFIG['context_length']}. Reinitialize data sets with "
+            f"`max_length={BASE_CONFIG['context_length']}`"
+        )
+
         model_size = CHOOSE_MODEL.split(" ")[-1].lstrip("(").rstrip(")")
         settings, params = download_and_load_gpt2(model_size=model_size, models_dir="gpt2")

         model = GPTModel(BASE_CONFIG)
         load_weights_into_gpt(model, params)
         model.eval()

         device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         model.to(device)

     ########################################
     # Modify the pretrained model
@@ -376,6 +397,7 @@ if __name__ == "__main__":

     num_classes = 2
     model.out_head = torch.nn.Linear(in_features=BASE_CONFIG["emb_dim"], out_features=num_classes)
+    model.to(device)

     for param in model.trf_blocks[-1].parameters():
         param.requires_grad = True
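Worth noting: the added `model.to(device)` matters because a freshly constructed `out_head` lives on the CPU until moved. A minimal, self-contained sketch of this head-swap pattern (768 is the gpt2-small embedding size; the other numbers are illustrative):

```python
import torch

emb_dim, num_classes = 768, 2  # gpt2-small embedding size; ham/spam
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

head = torch.nn.Linear(in_features=emb_dim, out_features=num_classes)  # created on CPU
head.to(device)  # move the new classification head to the training device

hidden = torch.randn(8, emb_dim, device=device)  # e.g., last-token hidden states
print(head(hidden).shape)  # torch.Size([8, 2])
```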
@@ -14,15 +14,18 @@
 | 1 | gpt2-small (124M) | pretrained | last | last_block | longest train ex. (120) | 96.63% | 99.33% | 95.00% | 0.28 min | A100 |
 | 2 | gpt2-small (124M) | pretrained | first | last_block | longest train ex. (120) | 78.46% | 80.54% | 75.00% | 0.28 min | A100 |
 | 3 | gpt2-small (124M) | pretrained | last | last_layer | longest train ex. (120) | 78.65% | 79.87% | 72.00% | 0.25 min | A100 |
-| 4 | gpt2-small (124M) | pretrained | last | all | longest train ex. (120) | 99.62% | 96.64% | 96.67% | 0.69 min | A100 |
-| 5 | gpt2-medium (355M) | pretrained | last | last_block | longest train ex. (120) | 87.50% | 91.28% | 84.67% | 0.75 min | A100 |
-| 6 | gpt2-large (774M) | pretrained | last | last_block | longest train ex. (120) | 99.52% | 98.66% | 96.67% | 1.50 min | A100 |
-| 7 | gpt2-xl (1558M) | pretrained | last | last_block | longest train ex. (120) | 99.81% | 99.33% | 98.33% | 2.83 min | A100 |
-| 8 | gpt2-small (124M) | random | last | all | longest train ex. (120) | 100% | 96.64% | 93.67% | 0.69 min | A100 |
-| 9 | gpt2-small (124M) | pretrained | last | LoRA | longest train ex. (120) | 99.52% | 97.99% | 97.67% | 0.75 min | A100 |
-| 10 | gpt2-small (124M) | pretrained | last | last_block | context length (1024) | 83.08% | 87.92% | 78.33% | 2.46 min | A100 |
-| 11 | gpt2-small (124M) | pretrained | last | last_block | variable: no padding (batch size 1) | 100.00% | 98.66% | 98.00% | 1.75 min | A100 |
-| 11 | gpt2-small (124M) | pretrained | last | last_block | variable: no padding (batch size 8) | 99.33% | 98.66% | 98.33% | 1.70 min | A100 |
+| 4 | gpt2-small (124M) | pretrained | last | last_two_blocks | longest train ex. (120) | 98.85% | 98.66% | 98.33% | 0.33 min | A100 |
+| 5 | gpt2-small (124M) | pretrained | last | all | longest train ex. (120) | 99.62% | 96.64% | 96.67% | 0.69 min | A100 |
+| 6 | gpt2-medium (355M) | pretrained | last | last_block | longest train ex. (120) | 87.50% | 91.28% | 84.67% | 0.75 min | A100 |
+| 7 | gpt2-large (774M) | pretrained | last | last_block | longest train ex. (120) | 99.52% | 98.66% | 96.67% | 1.50 min | A100 |
+| 8 | gpt2-xl (1558M) | pretrained | last | last_block | longest train ex. (120) | 99.81% | 99.33% | 98.33% | 2.83 min | A100 |
+| 9 | gpt2-small (124M) | random | last | all | longest train ex. (120) | 100% | 96.64% | 93.67% | 0.69 min | A100 |
+| 10 | gpt2-small (124M) | pretrained | last | LoRA | longest train ex. (120) | 100.00% | 97.32% | 96.67% | 0.75 min | A100 |
+| 11 | gpt2-small (124M) | pretrained | last | last_block | context length (1024) | 83.08% | 87.92% | 78.33% | 2.46 min | A100 |
+| 12 | gpt2-small (124M) | pretrained | last | last_block | variable: no padding (batch size 1) | 100.00% | 98.66% | 98.00% | 1.75 min | A100 |
+| 13 | gpt2-small (124M) | pretrained | last | last_block | variable: no padding (batch size 8) | 99.33% | 98.66% | 98.33% | 1.70 min | A100 |
+| 14 | gpt2-small (124M) | pretrained | last | last_block | longest train ex. (120); but no causal mask | 99.23% | 98.66% | 95.33% | 0.29 min | A100 |
+| 15 | gpt2-small (124M) | pretrained | last | last_block | longest train ex. (120) and `ignore_index` for padding | 96.63% | 99.33% | 95.00% | 0.28 min | A100 |
@@ -32,17 +35,20 @@
 You can use the following code to reproduce the experiments:

 - Row 1: `python additional-experiments.py`
-- Row 2: `python additional-experiments.py --trainable_token first`
+- Row 2: `python additional-experiments.py --trainable_token_pos first`
 - Row 3: `python additional-experiments.py --trainable_layers last_layer`
-- Row 4: `python additional-experiments.py --trainable_layers all`
-- Row 5: `python additional-experiments.py --model_size "gpt2-medium (355M)"`
-- Row 6: `python additional-experiments.py --model_size "gpt2-large (774M)"`
-- Row 7: `python additional-experiments.py --model_size "gpt2-xl (1558M)"`
-- Row 8: `python additional-experiments.py --weights random --trainable_layers all`
-- Row 9: `python additional-experiments.py --trainable_layers lora --lora_rank 16 --lora_alpha 8`
-- Row 10: `python additional-experiments.py --context_length "model_context_length"`
-- Row 11: `python additional-experiments.py --no_padding --batch_size 1`
-- Row 12: `python additional-experiments.py --no_padding --batch_size 1 --accumulation_steps 8`
+- Row 4: `python additional-experiments.py --trainable_layers last_two_blocks`
+- Row 5: `python additional-experiments.py --trainable_layers all`
+- Row 6: `python additional-experiments.py --model_size "gpt2-medium (355M)"`
+- Row 7: `python additional-experiments.py --model_size "gpt2-large (774M)"`
+- Row 8: `python additional-experiments.py --model_size "gpt2-xl (1558M)"`
+- Row 9: `python additional-experiments.py --weights random --trainable_layers all`
+- Row 10: `python additional-experiments.py --trainable_layers lora --lora_rank 16 --lora_alpha 16`
+- Row 11: `python additional-experiments.py --context_length "model_context_length"`
+- Row 12: `python additional-experiments.py --no_padding --batch_size 1`
+- Row 13: `python additional-experiments.py --no_padding --batch_size 1 --accumulation_steps 8`
+- Row 14: `python additional-experiments.py --disable_causal_mask`
+- Row 15: `python additional-experiments.py --ignore_index 50256`

 I deliberately kept the LLM and dataset small, so if you don't have access to a GPU, you can run the training in about 15 minutes on a regular laptop such as a MacBook Air M3.
@@ -50,17 +56,13 @@

 ## Interpretation

 1. **Training the last output token vs. the first output token (row 1 vs. row 2)**: Training the last output token leads to better performance than training the first. This improvement is expected because of the causal self-attention mask.

 2. **Training the last transformer block vs. the last layer (row 1 vs. row 3)**: Training the entire last transformer block also gives better results than training only the last layer.

-3. **Training all layers vs. the last transformer block (row 1 vs. row 4)**: Training all layers shows a modest improvement of about 2% over training only the last transformer block, but it requires almost three times as long to train.
-
-4. **Using larger pretrained models (row 1 vs. row 5, and row 1 vs. rows 6 and 7)**: Using a 3x larger pretrained model leads to worse results. However, as expected, using a 5x larger model improves performance compared with the initial model, and the 12x larger model improves predictive performance even further. (The medium model may not have been pretrained very well, or this particular finetuning configuration does not work as well for it.)
-
-5. **Using a model with random weights vs. pretrained weights (row 1 vs. row 8)**: Using a model with random weights yields results that are only slightly worse, by 1.3%, than using pretrained weights.
-
-6. **Using LoRA (low-rank adaptation) vs. training all layers (row 9 vs. row 4)**: Keeping the model frozen and adding trainable LoRA layers is a viable alternative to training all model parameters and can even improve performance by 1% (see [Appendix E](../../appendix-E/01_main-chapter-code/appendix-E.ipynb) for more details). As indicated by the roughly 1% smaller gap between training and validation accuracy when using LoRA, this is likely due to less overfitting. Moreover, using LoRA is also slightly faster because fewer parameters need to be updated.
-
-7. **Padding the input to the full context length vs. the longest training example (row 1 vs. row 10)**: Padding the input to the fully supported context length gives significantly worse results.
-
-8. **Padding vs. no padding (row 1 vs. rows 11 and 12)**: The `--no_padding` option disables padding in the dataset, which requires training the model with a batch size of 1 since the inputs have variable lengths. This results in better test accuracy but takes longer to train. In row 12, we additionally enable gradient accumulation over 8 steps to reach the same batch size as in the other experiments, which helps reduce overfitting and slightly improves test-set accuracy.
+4. **Training the last transformer block vs. all layers (row 1 vs. row 5)**: Training all layers shows a modest improvement of about 2% over training only the last transformer block, but it requires almost three times as long in terms of training duration. Also, it does not perform as well as training only the last two of the 12 transformer blocks.
+5. **Using larger pretrained models (row 1 vs. row 6, and row 1 vs. rows 7 and 8)**: Using a 3x larger pretrained model leads to worse results. However, as expected, using a 5x larger model improves performance compared with the initial model, and the 12x larger model improves predictive performance even further. (The medium model may not have been pretrained very well, or this particular finetuning configuration does not work as well for it.)
+6. **Using a model with random weights vs. pretrained weights (row 1 vs. row 9)**: Using a model with random weights yields results that are only slightly worse, by 1.3%, than using pretrained weights.
+7. **Using LoRA (low-rank adaptation) vs. training all layers (row 10 vs. row 5)**: Keeping the model frozen and adding trainable LoRA layers is a viable alternative to training all model parameters (see [Appendix E](../../appendix-E/01_main-chapter-code/appendix-E.ipynb)) and can even improve performance by 1%. As indicated by the roughly 1% smaller gap between training and validation accuracy when using LoRA, this is likely due to less overfitting. Moreover, using LoRA is also slightly faster because fewer parameters need to be updated.
+8. **Padding the input to the full context length vs. the longest training example (row 1 vs. row 11)**: Padding the input to the fully supported context length gives significantly worse results.
+9. **Padding vs. no padding (row 1 vs. rows 12 and 13)**: The `--no_padding` option disables padding in the dataset, which requires training the model with a batch size of 1 since the inputs have variable lengths. This results in better test accuracy but takes longer to train. In row 13, we additionally enable gradient accumulation over 8 steps to reach the same batch size as in the other experiments, which helps reduce overfitting and slightly improves test-set accuracy.
+10. **Disabling the causal attention mask (row 1 vs. row 14)**: Disables the causal attention mask used in the multi-head attention module. This means all tokens can attend to all other tokens. The model accuracy improves slightly compared with the GPT model that uses the causal mask.
+11. **Ignoring the padding indices in the loss and backpropagation (row 1 vs. row 15)**: Setting `--ignore_index 50256` excludes the `|endoftext|` padding tokens from the `cross_entropy` loss function in PyTorch. In this case, it does not have any effect because we replaced the output layer so that the token IDs for the binary classification examples are 0 or 1. However, this setting is useful when instruction-finetuning models in chapter 7.
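To illustrate the `ignore_index` mechanism from point 11, a minimal sketch (the tensor values are made up; positions whose target equals `ignore_index` are excluded both from the loss value and from backpropagation):

```python
import torch
import torch.nn.functional as F

logits = torch.randn(4, 2)                # 4 examples, 2 classes (ham/spam)
targets = torch.tensor([0, 1, 50256, 1])  # 50256 marks a padded position

# The third example contributes neither to the loss nor to gradients.
loss = F.cross_entropy(logits, targets, ignore_index=50256)
print(loss)
```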
@@ -4,6 +4,7 @@
 # Code: https://github.com/rasbt/LLMs-from-scratch

 import argparse
+import math
 import os
 from pathlib import Path
 import time
@@ -23,8 +24,8 @@ from previous_chapters import GPTModel, load_weights_into_gpt
 class LoRALayer(torch.nn.Module):
     def __init__(self, in_dim, out_dim, rank, alpha):
         super().__init__()
-        std_dev = 1 / torch.sqrt(torch.tensor(rank).float())
-        self.A = torch.nn.Parameter(torch.randn(in_dim, rank) * std_dev)
+        self.A = torch.nn.Parameter(torch.empty(in_dim, rank))
+        torch.nn.init.kaiming_uniform_(self.A, a=math.sqrt(5))
         self.B = torch.nn.Parameter(torch.zeros(rank, out_dim))
         self.alpha = alpha
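Put together, the updated layer looks roughly like this — a self-contained sketch with an assumed `forward`, which the hunk does not show; the `alpha` scaling convention is an assumption:

```python
import math
import torch


class LoRALayer(torch.nn.Module):
    def __init__(self, in_dim, out_dim, rank, alpha):
        super().__init__()
        # A now uses the same Kaiming-uniform scheme as nn.Linear's default init;
        # B starts at zero, so the LoRA update is a no-op before training.
        self.A = torch.nn.Parameter(torch.empty(in_dim, rank))
        torch.nn.init.kaiming_uniform_(self.A, a=math.sqrt(5))
        self.B = torch.nn.Parameter(torch.zeros(rank, out_dim))
        self.alpha = alpha

    def forward(self, x):
        # Low-rank update x @ A @ B, scaled by alpha (assumed convention)
        return self.alpha * (x @ self.A @ self.B)


layer = LoRALayer(in_dim=768, out_dim=768, rank=16, alpha=16)
print(layer(torch.randn(2, 768)).shape)  # torch.Size([2, 768])
```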
@@ -153,7 +154,7 @@ def instantiate_model(choose_model, load_weights):

     if not load_weights:
         torch.manual_seed(123)
-    model = GPTModel(BASE_CONFIG)
+    model = GPTModel(BASE_CONFIG, disable_causal_mask=args.disable_causal_mask)

     if load_weights:
         model_size = choose_model.split(" ")[-1].lstrip("(").rstrip(")")
@@ -164,14 +165,16 @@ def instantiate_model(choose_model, load_weights):
     return model


-def calc_loss_batch(input_batch, target_batch, model, device, trainable_token=-1):
+def calc_loss_batch(input_batch, target_batch, model, device,
+                    trainable_token_pos=-1, ignore_index=-100):
     input_batch, target_batch = input_batch.to(device), target_batch.to(device)
-    logits = model(input_batch)[:, trainable_token, :]  # Logits of last output token
-    loss = torch.nn.functional.cross_entropy(logits, target_batch)
+    logits = model(input_batch)[:, trainable_token_pos, :]  # Logits of last output token
+    loss = torch.nn.functional.cross_entropy(logits, target_batch, ignore_index=ignore_index)
     return loss


-def calc_loss_loader(data_loader, model, device, num_batches=None, trainable_token=-1):
+def calc_loss_loader(data_loader, model, device,
+                     num_batches=None, trainable_token_pos=-1, ignore_index=-100):
     total_loss = 0.
     if len(data_loader) == 0:
         return float("nan")
@@ -183,7 +186,10 @@ def calc_loss_loader(data_loader, model, device, num_batches=None, trainable_tok
         num_batches = min(num_batches, len(data_loader))
     for i, (input_batch, target_batch) in enumerate(data_loader):
         if i < num_batches:
-            loss = calc_loss_batch(input_batch, target_batch, model, device, trainable_token=trainable_token)
+            loss = calc_loss_batch(
+                input_batch, target_batch, model, device,
+                trainable_token_pos=trainable_token_pos, ignore_index=ignore_index
+            )
             total_loss += loss.item()
         else:
             break
@@ -191,7 +197,7 @@ def calc_loss_loader(data_loader, model, device, num_batches=None, trainable_tok


 @torch.no_grad()  # Disable gradient tracking for efficiency
-def calc_accuracy_loader(data_loader, model, device, num_batches=None, trainable_token=-1):
+def calc_accuracy_loader(data_loader, model, device, num_batches=None, trainable_token_pos=-1):
     model.eval()
     correct_predictions, num_examples = 0, 0
@@ -202,7 +208,7 @@ def calc_accuracy_loader(data_loader, model, device, num_batches=None, trainable
     for i, (input_batch, target_batch) in enumerate(data_loader):
         if i < num_batches:
             input_batch, target_batch = input_batch.to(device), target_batch.to(device)
-            logits = model(input_batch)[:, trainable_token, :]  # Logits of last output token
+            logits = model(input_batch)[:, trainable_token_pos, :]  # Logits of last output token
             predicted_labels = torch.argmax(logits, dim=-1)

             num_examples += predicted_labels.shape[0]
@@ -212,18 +218,25 @@ def calc_accuracy_loader(data_loader, model, device, num_batches=None, trainable
     return correct_predictions / num_examples


-def evaluate_model(model, train_loader, val_loader, device, eval_iter, trainable_token=-1):
+def evaluate_model(model, train_loader, val_loader, device,
+                   eval_iter, trainable_token_pos=-1, ignore_index=-100):
     model.eval()
     with torch.no_grad():
-        train_loss = calc_loss_loader(train_loader, model, device, num_batches=eval_iter, trainable_token=trainable_token)
-        val_loss = calc_loss_loader(val_loader, model, device, num_batches=eval_iter, trainable_token=trainable_token)
+        train_loss = calc_loss_loader(
+            train_loader, model, device, num_batches=eval_iter,
+            trainable_token_pos=trainable_token_pos, ignore_index=ignore_index
+        )
+        val_loss = calc_loss_loader(
+            val_loader, model, device, num_batches=eval_iter,
+            trainable_token_pos=trainable_token_pos, ignore_index=ignore_index
+        )
     model.train()
     return train_loss, val_loss


 def train_classifier_simple(model, train_loader, val_loader, optimizer, device, num_epochs,
-                            eval_freq, eval_iter, tokenizer, max_steps=None, trainable_token=-1,
-                            accumulation_steps=1):
+                            eval_freq, eval_iter, tokenizer, max_steps=None, trainable_token_pos=-1,
+                            accumulation_steps=1, ignore_index=-100):
     # Initialize lists to track losses and tokens seen
     train_losses, val_losses, train_accs, val_accs = [], [], [], []
     examples_seen, global_step = 0, -1
@@ -233,7 +246,10 @@ def train_classifier_simple(model, train_loader, val_loader, optimizer, device,
         model.train()  # Set model to training mode

         for batch_idx, (input_batch, target_batch) in enumerate(train_loader):
-            loss = calc_loss_batch(input_batch, target_batch, model, device, trainable_token=trainable_token)
+            loss = calc_loss_batch(
+                input_batch, target_batch, model, device,
+                trainable_token_pos=trainable_token_pos, ignore_index=ignore_index
+            )

             # Use gradient accumulation if accumulation_steps > 1
             # See https://sebastianraschka.com/blog/2023/llm-grad-accumulation.html
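For readers unfamiliar with the gradient-accumulation pattern the comment above links to, a minimal sketch reusing the script's names (the exact placement of the loss scaling and `zero_grad` calls in the script may differ):

```python
accumulation_steps = 8  # effective batch size = loader batch size * 8

for batch_idx, (input_batch, target_batch) in enumerate(train_loader):
    loss = calc_loss_batch(input_batch, target_batch, model, device)
    loss = loss / accumulation_steps  # average the loss over the accumulated batches
    loss.backward()                   # gradients add up across iterations

    if (batch_idx + 1) % accumulation_steps == 0:
        optimizer.step()       # update weights only every N batches
        optimizer.zero_grad()  # then reset the accumulated gradients
```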
@@ -253,7 +269,9 @@ def train_classifier_simple(model, train_loader, val_loader, optimizer, device,
             # Optional evaluation step
             if global_step % eval_freq == 0:
                 train_loss, val_loss = evaluate_model(
-                    model, train_loader, val_loader, device, eval_iter, trainable_token=trainable_token)
+                    model, train_loader, val_loader, device, eval_iter,
+                    trainable_token_pos=trainable_token_pos, ignore_index=ignore_index
+                )
                 train_losses.append(train_loss)
                 val_losses.append(val_loss)
                 print(f"Ep {epoch+1} (Step {global_step:06d}): "
@@ -263,8 +281,8 @@ def train_classifier_simple(model, train_loader, val_loader, optimizer, device,
                 break

         # New: Calculate accuracy after each epoch
-        train_accuracy = calc_accuracy_loader(train_loader, model, device, num_batches=eval_iter, trainable_token=trainable_token)
-        val_accuracy = calc_accuracy_loader(val_loader, model, device, num_batches=eval_iter, trainable_token=trainable_token)
+        train_accuracy = calc_accuracy_loader(train_loader, model, device, num_batches=eval_iter, trainable_token_pos=trainable_token_pos)
+        val_accuracy = calc_accuracy_loader(val_loader, model, device, num_batches=eval_iter, trainable_token_pos=trainable_token_pos)
         print(f"Training accuracy: {train_accuracy*100:.2f}% | ", end="")
         print(f"Validation accuracy: {val_accuracy*100:.2f}%")
         train_accs.append(train_accuracy)
@@ -311,15 +329,15 @@ if __name__ == "__main__":
         type=str,
         default="last_block",
         help=(
-            "Which layers to train. Options: 'all', 'last_block', 'last_layer', 'lora'."
+            "Which layers to train. Options: 'all', 'last_block', 'last_two_blocks', 'last_layer', 'lora'."
         )
     )
     parser.add_argument(
-        "--trainable_token",
+        "--trainable_token_pos",
         type=str,
         default="last",
         help=(
-            "Which token to train. Options: 'first', 'last'."
+            "Which token position to train. Options: 'first', 'last'."
         )
     )
     parser.add_argument(
@@ -386,14 +404,32 @@ if __name__ == "__main__":
         )
     )

+    parser.add_argument(
+        "--disable_causal_mask",
+        action='store_true',
+        default=False,
+        help=(
+            "Disables the causal attention mask."
+        )
+    )
+    parser.add_argument(
+        "--ignore_index",
+        type=int,
+        default=-100,
+        help=(
+            "Sets the `ignore_index` in the cross entropy loss."
+        )
+    )
+
     args = parser.parse_args()

-    if args.trainable_token == "first":
-        args.trainable_token = 0
-    elif args.trainable_token == "last":
-        args.trainable_token = -1
+    if args.trainable_token_pos == "first":
+        args.trainable_token_pos = 0
+    elif args.trainable_token_pos == "last":
+        args.trainable_token_pos = -1
     else:
-        raise ValueError("Invalid --trainable_token argument")
+        raise ValueError("Invalid --trainable_token_pos argument")

     ###############################
     # Load model
@@ -426,11 +462,14 @@ if __name__ == "__main__":

     if args.trainable_layers == "last_layer":
         pass
-    elif args.trainable_layers == "last_block":
+    elif args.trainable_layers == "last_block" or args.trainable_layers == "last_two_blocks":
         for param in model.trf_blocks[-1].parameters():
             param.requires_grad = True
         for param in model.final_norm.parameters():
             param.requires_grad = True
+        if args.trainable_layers == "last_two_blocks":
+            for param in model.trf_blocks[-2].parameters():
+                param.requires_grad = True
     elif args.trainable_layers == "all":
         for param in model.parameters():
             param.requires_grad = True
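A quick way to sanity-check which parameters these branches leave trainable — a sketch, assuming `model` is the `GPTModel` instance created earlier in the script:

```python
n_trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
n_total = sum(p.numel() for p in model.parameters())
print(f"Trainable parameters: {n_trainable:,} of {n_total:,}")
```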
@@ -509,6 +548,12 @@ if __name__ == "__main__":
         drop_last=False,
     )

+    assert train_dataset.max_length <= model.pos_emb.weight.shape[0], (
+        f"Dataset length {train_dataset.max_length} exceeds model's context "
+        f"length {model.pos_emb.weight.shape[0]}. Reinitialize data sets with "
+        f"`max_length={model.pos_emb.weight.shape[0]}`"
+    )
+
     ###############################
     # Train model
     ###############################
@@ -520,7 +565,7 @@ if __name__ == "__main__":
     train_losses, val_losses, train_accs, val_accs, examples_seen = train_classifier_simple(
         model, train_loader, val_loader, optimizer, device,
         num_epochs=args.num_epochs, eval_freq=50, eval_iter=5,
-        tokenizer=tokenizer, max_steps=None, trainable_token=args.trainable_token,
+        tokenizer=tokenizer, max_steps=None, trainable_token_pos=args.trainable_token_pos,
         accumulation_steps=args.accumulation_steps
     )
@@ -532,9 +577,9 @@ if __name__ == "__main__":
     # Evaluate model
     ###############################

-    train_accuracy = calc_accuracy_loader(train_loader, model, device, trainable_token=args.trainable_token)
-    val_accuracy = calc_accuracy_loader(val_loader, model, device, trainable_token=args.trainable_token)
-    test_accuracy = calc_accuracy_loader(test_loader, model, device, trainable_token=args.trainable_token)
+    train_accuracy = calc_accuracy_loader(train_loader, model, device, trainable_token_pos=args.trainable_token_pos)
+    val_accuracy = calc_accuracy_loader(val_loader, model, device, trainable_token_pos=args.trainable_token_pos)
+    test_accuracy = calc_accuracy_loader(test_loader, model, device, trainable_token_pos=args.trainable_token_pos)

     print(f"Training accuracy: {train_accuracy*100:.2f}%")
     print(f"Validation accuracy: {val_accuracy*100:.2f}%")
@@ -60,7 +60,7 @@ def create_dataloader_v1(txt, batch_size=4, max_length=256,
 # Chapter 3
 #####################################
 class MultiHeadAttention(nn.Module):
-    def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False):
+    def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False, disable_causal_mask=False):
         super().__init__()
         assert d_out % num_heads == 0, "d_out must be divisible by n_heads"
@@ -73,7 +73,10 @@ class MultiHeadAttention(nn.Module):
         self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
         self.out_proj = nn.Linear(d_out, d_out)  # Linear layer to combine head outputs
         self.dropout = nn.Dropout(dropout)
-        self.register_buffer('mask', torch.triu(torch.ones(context_length, context_length), diagonal=1))
+
+        if not disable_causal_mask:
+            self.register_buffer('mask', torch.triu(torch.ones(context_length, context_length), diagonal=1))
+        self.disable_causal_mask = disable_causal_mask

     def forward(self, x):
         b, num_tokens, d_in = x.shape
@@ -96,6 +99,7 @@ class MultiHeadAttention(nn.Module):
         # Compute scaled dot-product attention (aka self-attention) with a causal mask
         attn_scores = queries @ keys.transpose(2, 3)  # Dot product for each head

+        if not self.disable_causal_mask:
             # Original mask truncated to the number of tokens and converted to boolean
             mask_bool = self.mask.bool()[:num_tokens, :num_tokens]
@@ -157,7 +161,7 @@ class FeedForward(nn.Module):


 class TransformerBlock(nn.Module):
-    def __init__(self, cfg):
+    def __init__(self, cfg, disable_causal_mask=False):
         super().__init__()
         self.att = MultiHeadAttention(
             d_in=cfg["emb_dim"],
@@ -165,7 +169,9 @@ class TransformerBlock(nn.Module):
             context_length=cfg["context_length"],
             num_heads=cfg["n_heads"],
             dropout=cfg["drop_rate"],
-            qkv_bias=cfg["qkv_bias"])
+            qkv_bias=cfg["qkv_bias"],
+            disable_causal_mask=disable_causal_mask
+        )
         self.ff = FeedForward(cfg)
         self.norm1 = LayerNorm(cfg["emb_dim"])
         self.norm2 = LayerNorm(cfg["emb_dim"])
@@ -190,14 +196,14 @@ class TransformerBlock(nn.Module):


 class GPTModel(nn.Module):
-    def __init__(self, cfg):
+    def __init__(self, cfg, disable_causal_mask=False):
         super().__init__()
         self.tok_emb = nn.Embedding(cfg["vocab_size"], cfg["emb_dim"])
         self.pos_emb = nn.Embedding(cfg["context_length"], cfg["emb_dim"])
         self.drop_emb = nn.Dropout(cfg["drop_rate"])

         self.trf_blocks = nn.Sequential(
-            *[TransformerBlock(cfg) for _ in range(cfg["n_layers"])])
+            *[TransformerBlock(cfg, disable_causal_mask) for _ in range(cfg["n_layers"])])

         self.final_norm = LayerNorm(cfg["emb_dim"])
         self.out_head = nn.Linear(cfg["emb_dim"], cfg["vocab_size"], bias=False)
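A minimal sketch of how the new flag threads through the model. The config values below are the standard gpt2-small settings used elsewhere in the repository; treat them as an assumption, since this hunk does not show them.

```python
BASE_CONFIG = {
    "vocab_size": 50257, "context_length": 1024, "emb_dim": 768,
    "n_heads": 12, "n_layers": 12, "drop_rate": 0.0, "qkv_bias": True,
}

# With disable_causal_mask=True, every token can attend to every other token,
# effectively turning the decoder into a bidirectional encoder for classification.
model = GPTModel(BASE_CONFIG, disable_causal_mask=True)
```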
@@ -310,7 +316,7 @@ def load_weights_into_gpt(gpt, params):
     gpt.out_head.weight = assign(gpt.out_head.weight, params["wte"])


-def generate(model, idx, max_new_tokens, context_size, temperature, top_k=None):
+def generate(model, idx, max_new_tokens, context_size, temperature=0.0, top_k=None, eos_id=None):
     # For-loop is the same as before: Get logits, and only focus on last time step
     for _ in range(max_new_tokens):
         idx_cond = idx[:, -context_size:]
@@ -339,6 +345,9 @@ def generate(model, idx, max_new_tokens, context_size, temperature, top_k=None):
         else:
             idx_next = torch.argmax(logits, dim=-1, keepdim=True)  # (batch_size, 1)

+        if idx_next == eos_id:  # Stop generating early if end-of-sequence token is encountered and eos_id is specified
+            break
+
         # Same as before: append sampled index to the running sequence
         idx = torch.cat((idx, idx_next), dim=1)  # (batch_size, num_tokens+1)
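A usage sketch of the updated `generate` signature, assuming `model` is the GPT instance loaded by this script (the prompt is illustrative; 50256 is the GPT-2 `<|endoftext|>` token ID):

```python
import tiktoken
import torch

tokenizer = tiktoken.get_encoding("gpt2")
idx = torch.tensor(tokenizer.encode("Every effort moves you")).unsqueeze(0)

token_ids = generate(
    model=model, idx=idx,
    max_new_tokens=25, context_size=1024,
    temperature=0.0,  # greedy decoding now works by default
    eos_id=50256,     # stop early at the <|endoftext|> token
)
print(tokenizer.decode(token_ids.squeeze(0).tolist()))
```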
ch06/03_bonus_imdb-classification/README.md (new file, 127 lines)
@@ -0,0 +1,127 @@
# Additional experiments classifying the sentiment of 50k IMDB movie reviews

## Step 1: Install dependencies

Install the additional dependencies via:

```bash
pip install -r requirements-extra.txt
```

## Step 2: Download the dataset

The code uses the 50k movie reviews from IMDb ([dataset](https://ai.stanford.edu/~amaas/data/sentiment/)) to predict whether a movie review is positive or negative.

Run the following code to create the `train.csv`, `validation.csv`, and `test.csv` datasets:

```bash
python download-prepare-dataset.py
```

## Step 3: Run the models

The 124M GPT-2 model used in the main chapter, starting with pretrained weights and training only the last transformer block plus the output layer:

```bash
python train-gpt.py
```

```
Ep 1 (Step 000000): Train loss 2.829, Val loss 3.433
Ep 1 (Step 000050): Train loss 1.440, Val loss 1.669
Ep 1 (Step 000100): Train loss 0.879, Val loss 1.037
Ep 1 (Step 000150): Train loss 0.838, Val loss 0.866
...
Ep 1 (Step 004300): Train loss 0.174, Val loss 0.202
Ep 1 (Step 004350): Train loss 0.309, Val loss 0.190
Training accuracy: 88.75% | Validation accuracy: 91.25%
Ep 2 (Step 004400): Train loss 0.263, Val loss 0.205
Ep 2 (Step 004450): Train loss 0.226, Val loss 0.188
...
Ep 2 (Step 008650): Train loss 0.189, Val loss 0.171
Ep 2 (Step 008700): Train loss 0.225, Val loss 0.179
Training accuracy: 85.00% | Validation accuracy: 90.62%
Ep 3 (Step 008750): Train loss 0.206, Val loss 0.187
Ep 3 (Step 008800): Train loss 0.198, Val loss 0.172
...
Training accuracy: 96.88% | Validation accuracy: 90.62%
Training completed in 18.62 minutes.

Evaluating on the full datasets ...

Training accuracy: 93.66%
Validation accuracy: 90.02%
Test accuracy: 89.96%
```

---

A 66M-parameter encoder model, [DistilBERT](https://arxiv.org/abs/1910.01108) (distilled from the 340M-parameter BERT model), starting with pretrained weights and training only the last transformer block plus the output layer:

```bash
python train-bert-hf.py
```

```
Ep 1 (Step 000000): Train loss 0.693, Val loss 0.697
Ep 1 (Step 000050): Train loss 0.532, Val loss 0.596
Ep 1 (Step 000100): Train loss 0.431, Val loss 0.446
...
Ep 1 (Step 004300): Train loss 0.234, Val loss 0.351
Ep 1 (Step 004350): Train loss 0.190, Val loss 0.222
Training accuracy: 88.75% | Validation accuracy: 88.12%
Ep 2 (Step 004400): Train loss 0.258, Val loss 0.270
Ep 2 (Step 004450): Train loss 0.204, Val loss 0.295
...
Ep 2 (Step 008650): Train loss 0.088, Val loss 0.246
Ep 2 (Step 008700): Train loss 0.084, Val loss 0.247
Training accuracy: 98.75% | Validation accuracy: 90.62%
Ep 3 (Step 008750): Train loss 0.067, Val loss 0.209
Ep 3 (Step 008800): Train loss 0.059, Val loss 0.256
...
Ep 3 (Step 013050): Train loss 0.068, Val loss 0.280
Ep 3 (Step 013100): Train loss 0.064, Val loss 0.306
Training accuracy: 99.38% | Validation accuracy: 87.50%
Training completed in 16.70 minutes.

Evaluating on the full datasets ...

Training accuracy: 98.87%
Validation accuracy: 90.98%
Test accuracy: 90.81%
```

---

A 355M-parameter encoder model, [RoBERTa](https://arxiv.org/abs/1907.11692), starting with pretrained weights and training only the last transformer block plus the output layer:

```bash
python train-bert-hf.py --bert_model roberta
```

---

A scikit-learn logistic regression model as a baseline:

```bash
python train-sklearn-logreg.py
```

```
Dummy classifier:
Training Accuracy: 50.01%
Validation Accuracy: 50.14%
Test Accuracy: 49.91%


Logistic regression classifier:
Training Accuracy: 99.80%
Validation Accuracy: 88.60%
Test Accuracy: 88.84%
```
ch06/03_bonus_imdb-classification/download-prepare-dataset.py (new file, 84 lines)
@@ -0,0 +1,84 @@
# Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt).
# Source for "Build a Large Language Model From Scratch"
#   - https://www.manning.com/books/build-a-large-language-model-from-scratch
# Code: https://github.com/rasbt/LLMs-from-scratch

import os
import sys
import tarfile
import time
import urllib.request
import pandas as pd


def reporthook(count, block_size, total_size):
    global start_time
    if count == 0:
        start_time = time.time()
    else:
        duration = time.time() - start_time
        progress_size = int(count * block_size)
        percent = count * block_size * 100 / total_size

        speed = int(progress_size / (1024 * duration)) if duration else 0
        sys.stdout.write(
            f"\r{int(percent)}% | {progress_size / (1024**2):.2f} MB "
            f"| {speed:.2f} MB/s | {duration:.2f} sec elapsed"
        )
        sys.stdout.flush()


def download_and_extract_dataset(dataset_url, target_file, directory):
    if not os.path.exists(directory):
        if os.path.exists(target_file):
            os.remove(target_file)
        urllib.request.urlretrieve(dataset_url, target_file, reporthook)
        print("\nExtracting dataset ...")
        with tarfile.open(target_file, "r:gz") as tar:
            tar.extractall()
    else:
        print(f"Directory `{directory}` already exists. Skipping download.")


def load_dataset_to_dataframe(basepath="aclImdb", labels={"pos": 1, "neg": 0}):
    data_frames = []  # List to store each chunk of DataFrame
    for subset in ("test", "train"):
        for label in ("pos", "neg"):
            path = os.path.join(basepath, subset, label)
            for file in sorted(os.listdir(path)):
                with open(os.path.join(path, file), "r", encoding="utf-8") as infile:
                    # Create a DataFrame for each file and add it to the list
                    data_frames.append(pd.DataFrame({"text": [infile.read()], "label": [labels[label]]}))
    # Concatenate all DataFrame chunks together
    df = pd.concat(data_frames, ignore_index=True)
    df = df.sample(frac=1, random_state=123).reset_index(drop=True)  # Shuffle the DataFrame
    return df


def partition_and_save(df, sizes=(35000, 5000, 10000)):
    # Shuffle the DataFrame
    df_shuffled = df.sample(frac=1, random_state=123).reset_index(drop=True)

    # Get indices for where to split the data
    train_end = sizes[0]
    val_end = sizes[0] + sizes[1]

    # Split the DataFrame
    train = df_shuffled.iloc[:train_end]
    val = df_shuffled.iloc[train_end:val_end]
    test = df_shuffled.iloc[val_end:]

    # Save to CSV files
    train.to_csv("train.csv", index=False)
    val.to_csv("validation.csv", index=False)
    test.to_csv("test.csv", index=False)


if __name__ == "__main__":
    dataset_url = "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
    print("Downloading dataset ...")
    download_and_extract_dataset(dataset_url, "aclImdb_v1.tar.gz", "aclImdb")
    print("Creating data frames ...")
    df = load_dataset_to_dataframe()
    print("Partitioning and saving data frames ...")
    partition_and_save(df)
ch06/03_bonus_imdb-classification/gpt_download.py (new file, 99 lines)
@@ -0,0 +1,99 @@
# Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt).
# Source for "Build a Large Language Model From Scratch"
#   - https://www.manning.com/books/build-a-large-language-model-from-scratch
# Code: https://github.com/rasbt/LLMs-from-scratch


import os
import requests
import json
import numpy as np
import tensorflow as tf
from tqdm import tqdm


def download_and_load_gpt2(model_size, models_dir):
    # Validate model size
    allowed_sizes = ("124M", "355M", "774M", "1558M")
    if model_size not in allowed_sizes:
        raise ValueError(f"Model size not in {allowed_sizes}")

    # Define paths
    model_dir = os.path.join(models_dir, model_size)
    base_url = "https://openaipublic.blob.core.windows.net/gpt-2/models"
    filenames = [
        "checkpoint", "encoder.json", "hparams.json",
        "model.ckpt.data-00000-of-00001", "model.ckpt.index",
        "model.ckpt.meta", "vocab.bpe"
    ]

    # Download files
    os.makedirs(model_dir, exist_ok=True)
    for filename in filenames:
        file_url = os.path.join(base_url, model_size, filename)
        file_path = os.path.join(model_dir, filename)
        download_file(file_url, file_path)

    # Load settings and params
    tf_ckpt_path = tf.train.latest_checkpoint(model_dir)
    settings = json.load(open(os.path.join(model_dir, "hparams.json")))
    params = load_gpt2_params_from_tf_ckpt(tf_ckpt_path, settings)

    return settings, params


def download_file(url, destination):
    # Send a GET request to download the file in streaming mode
    response = requests.get(url, stream=True)

    # Get the total file size from headers, defaulting to 0 if not present
    file_size = int(response.headers.get("content-length", 0))

    # Check if file exists and has the same size
    if os.path.exists(destination):
        file_size_local = os.path.getsize(destination)
        if file_size == file_size_local:
            print(f"File already exists and is up-to-date: {destination}")
            return

    # Define the block size for reading the file
    block_size = 1024  # 1 Kilobyte

    # Initialize the progress bar with total file size
    progress_bar_description = url.split("/")[-1]  # Extract filename from URL
    with tqdm(total=file_size, unit="iB", unit_scale=True, desc=progress_bar_description) as progress_bar:
        # Open the destination file in binary write mode
        with open(destination, "wb") as file:
            # Iterate over the file data in chunks
            for chunk in response.iter_content(block_size):
                progress_bar.update(len(chunk))  # Update progress bar
                file.write(chunk)  # Write the chunk to the file


def load_gpt2_params_from_tf_ckpt(ckpt_path, settings):
    # Initialize parameters dictionary with empty blocks for each layer
    params = {"blocks": [{} for _ in range(settings["n_layer"])]}

    # Iterate over each variable in the checkpoint
    for name, _ in tf.train.list_variables(ckpt_path):
        # Load the variable and remove singleton dimensions
        variable_array = np.squeeze(tf.train.load_variable(ckpt_path, name))

        # Process the variable name to extract relevant parts
        variable_name_parts = name.split("/")[1:]  # Skip the 'model/' prefix

        # Identify the target dictionary for the variable
        target_dict = params
        if variable_name_parts[0].startswith("h"):
            layer_number = int(variable_name_parts[0][1:])
            target_dict = params["blocks"][layer_number]

        # Recursively access or create nested dictionaries
        for key in variable_name_parts[1:-1]:
            target_dict = target_dict.setdefault(key, {})

        # Assign the variable array to the last key
        last_key = variable_name_parts[-1]
        target_dict[last_key] = variable_array

    return params
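A usage sketch for this helper; the directory name matches the one used by the training scripts above, and the printed values are what the GPT-2 `hparams.json` and checkpoint layout should yield:

```python
settings, params = download_and_load_gpt2(model_size="124M", models_dir="gpt2")

print(settings)       # e.g. {'n_vocab': 50257, 'n_ctx': 1024, 'n_embd': 768, ...}
print(params.keys())  # dict_keys(['blocks', 'b', 'g', 'wpe', 'wte'])
```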
ch06/03_bonus_imdb-classification/previous_chapters.py (new file, 321 lines)
@@ -0,0 +1,321 @@
# Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt).
# Source for "Build a Large Language Model From Scratch"
#   - https://www.manning.com/books/build-a-large-language-model-from-scratch
# Code: https://github.com/rasbt/LLMs-from-scratch
#
# This file collects all the relevant code that we covered thus far
# throughout Chapters 2-5.
# This file can be run as a standalone script.

import numpy as np
import tiktoken
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

#####################################
# Chapter 2
#####################################


class GPTDatasetV1(Dataset):
    def __init__(self, txt, tokenizer, max_length, stride):
        self.tokenizer = tokenizer
        self.input_ids = []
        self.target_ids = []

        # Tokenize the entire text
        token_ids = tokenizer.encode(txt)

        # Use a sliding window to chunk the book into overlapping sequences of max_length
        for i in range(0, len(token_ids) - max_length, stride):
            input_chunk = token_ids[i:i + max_length]
            target_chunk = token_ids[i + 1: i + max_length + 1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.target_ids[idx]


def create_dataloader_v1(txt, batch_size=4, max_length=256,
                         stride=128, shuffle=True, drop_last=True):
    # Initialize the tokenizer
    tokenizer = tiktoken.get_encoding("gpt2")

    # Create dataset
    dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)

    # Create dataloader
    dataloader = DataLoader(
        dataset, batch_size=batch_size, shuffle=shuffle, drop_last=drop_last)

    return dataloader


#####################################
# Chapter 3
#####################################
class MultiHeadAttention(nn.Module):
    def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False):
        super().__init__()
        assert d_out % num_heads == 0, "d_out must be divisible by n_heads"

        self.d_out = d_out
        self.num_heads = num_heads
        self.head_dim = d_out // num_heads  # Reduce the projection dim to match desired output dim

        self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.out_proj = nn.Linear(d_out, d_out)  # Linear layer to combine head outputs
        self.dropout = nn.Dropout(dropout)
        self.register_buffer('mask', torch.triu(torch.ones(context_length, context_length), diagonal=1))

    def forward(self, x):
        b, num_tokens, d_in = x.shape

        keys = self.W_key(x)  # Shape: (b, num_tokens, d_out)
        queries = self.W_query(x)
        values = self.W_value(x)

        # We implicitly split the matrix by adding a `num_heads` dimension
        # Unroll last dim: (b, num_tokens, d_out) -> (b, num_tokens, num_heads, head_dim)
        keys = keys.view(b, num_tokens, self.num_heads, self.head_dim)
        values = values.view(b, num_tokens, self.num_heads, self.head_dim)
        queries = queries.view(b, num_tokens, self.num_heads, self.head_dim)

        # Transpose: (b, num_tokens, num_heads, head_dim) -> (b, num_heads, num_tokens, head_dim)
        keys = keys.transpose(1, 2)
        queries = queries.transpose(1, 2)
        values = values.transpose(1, 2)

        # Compute scaled dot-product attention (aka self-attention) with a causal mask
        attn_scores = queries @ keys.transpose(2, 3)  # Dot product for each head

        # Original mask truncated to the number of tokens and converted to boolean
        mask_bool = self.mask.bool()[:num_tokens, :num_tokens]

        # Use the mask to fill attention scores
        attn_scores.masked_fill_(mask_bool, -torch.inf)

        attn_weights = torch.softmax(attn_scores / keys.shape[-1]**0.5, dim=-1)
        attn_weights = self.dropout(attn_weights)

        # Shape: (b, num_tokens, num_heads, head_dim)
        context_vec = (attn_weights @ values).transpose(1, 2)

        # Combine heads, where self.d_out = self.num_heads * self.head_dim
        context_vec = context_vec.reshape(b, num_tokens, self.d_out)
        context_vec = self.out_proj(context_vec)  # optional projection

        return context_vec


#####################################
# Chapter 4
#####################################
class LayerNorm(nn.Module):
    def __init__(self, emb_dim):
        super().__init__()
        self.eps = 1e-5
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))

    def forward(self, x):
        mean = x.mean(dim=-1, keepdim=True)
        var = x.var(dim=-1, keepdim=True, unbiased=False)
        norm_x = (x - mean) / torch.sqrt(var + self.eps)
        return self.scale * norm_x + self.shift


class GELU(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, x):
        return 0.5 * x * (1 + torch.tanh(
            torch.sqrt(torch.tensor(2.0 / torch.pi)) *
            (x + 0.044715 * torch.pow(x, 3))
        ))


class FeedForward(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(cfg["emb_dim"], 4 * cfg["emb_dim"]),
            GELU(),
            nn.Linear(4 * cfg["emb_dim"], cfg["emb_dim"]),
        )

    def forward(self, x):
        return self.layers(x)


class TransformerBlock(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.att = MultiHeadAttention(
            d_in=cfg["emb_dim"],
            d_out=cfg["emb_dim"],
            context_length=cfg["context_length"],
            num_heads=cfg["n_heads"],
            dropout=cfg["drop_rate"],
            qkv_bias=cfg["qkv_bias"])
        self.ff = FeedForward(cfg)
        self.norm1 = LayerNorm(cfg["emb_dim"])
        self.norm2 = LayerNorm(cfg["emb_dim"])
        self.drop_resid = nn.Dropout(cfg["drop_rate"])

    def forward(self, x):
        # Shortcut connection for attention block
        shortcut = x
        x = self.norm1(x)
        x = self.att(x)   # Shape [batch_size, num_tokens, emb_size]
        x = self.drop_resid(x)
        x = x + shortcut  # Add the original input back

        # Shortcut connection for feed-forward block
        shortcut = x
        x = self.norm2(x)
        x = self.ff(x)
        x = self.drop_resid(x)
        x = x + shortcut  # Add the original input back

        return x


class GPTModel(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.tok_emb = nn.Embedding(cfg["vocab_size"], cfg["emb_dim"])
        self.pos_emb = nn.Embedding(cfg["context_length"], cfg["emb_dim"])
        self.drop_emb = nn.Dropout(cfg["drop_rate"])

        self.trf_blocks = nn.Sequential(
            *[TransformerBlock(cfg) for _ in range(cfg["n_layers"])])

        self.final_norm = LayerNorm(cfg["emb_dim"])
        self.out_head = nn.Linear(cfg["emb_dim"], cfg["vocab_size"], bias=False)

    def forward(self, in_idx):
        batch_size, seq_len = in_idx.shape
        tok_embeds = self.tok_emb(in_idx)
        pos_embeds = self.pos_emb(torch.arange(seq_len, device=in_idx.device))
        x = tok_embeds + pos_embeds  # Shape [batch_size, num_tokens, emb_size]
        x = self.drop_emb(x)
        x = self.trf_blocks(x)
        x = self.final_norm(x)
        logits = self.out_head(x)
        return logits


def generate_text_simple(model, idx, max_new_tokens, context_size):
    # idx is (B, T) array of indices in the current context
    for _ in range(max_new_tokens):

        # Crop current context if it exceeds the supported context size
        # E.g., if LLM supports only 5 tokens, and the context size is 10
        # then only the last 5 tokens are used as context
        idx_cond = idx[:, -context_size:]

        # Get the predictions
        with torch.no_grad():
            logits = model(idx_cond)

        # Focus only on the last time step
        # (batch, n_token, vocab_size) becomes (batch, vocab_size)
        logits = logits[:, -1, :]

        # Get the idx of the vocab entry with the highest logits value
        idx_next = torch.argmax(logits, dim=-1, keepdim=True)  # (batch, 1)

        # Append sampled index to the running sequence
        idx = torch.cat((idx, idx_next), dim=1)  # (batch, n_tokens+1)

    return idx


#####################################
# Chapter 5
#####################################
def assign(left, right):
    if left.shape != right.shape:
        raise ValueError(f"Shape mismatch. Left: {left.shape}, Right: {right.shape}")
    return torch.nn.Parameter(torch.tensor(right))


def load_weights_into_gpt(gpt, params):
    gpt.pos_emb.weight = assign(gpt.pos_emb.weight, params['wpe'])
    gpt.tok_emb.weight = assign(gpt.tok_emb.weight, params['wte'])

    for b in range(len(params["blocks"])):
        q_w, k_w, v_w = np.split(
            (params["blocks"][b]["attn"]["c_attn"])["w"], 3, axis=-1)
        gpt.trf_blocks[b].att.W_query.weight = assign(
            gpt.trf_blocks[b].att.W_query.weight, q_w.T)
        gpt.trf_blocks[b].att.W_key.weight = assign(
            gpt.trf_blocks[b].att.W_key.weight, k_w.T)
        gpt.trf_blocks[b].att.W_value.weight = assign(
            gpt.trf_blocks[b].att.W_value.weight, v_w.T)

        q_b, k_b, v_b = np.split(
            (params["blocks"][b]["attn"]["c_attn"])["b"], 3, axis=-1)
        gpt.trf_blocks[b].att.W_query.bias = assign(
            gpt.trf_blocks[b].att.W_query.bias, q_b)
        gpt.trf_blocks[b].att.W_key.bias = assign(
            gpt.trf_blocks[b].att.W_key.bias, k_b)
        gpt.trf_blocks[b].att.W_value.bias = assign(
            gpt.trf_blocks[b].att.W_value.bias, v_b)

        gpt.trf_blocks[b].att.out_proj.weight = assign(
            gpt.trf_blocks[b].att.out_proj.weight,
            params["blocks"][b]["attn"]["c_proj"]["w"].T)
        gpt.trf_blocks[b].att.out_proj.bias = assign(
            gpt.trf_blocks[b].att.out_proj.bias,
            params["blocks"][b]["attn"]["c_proj"]["b"])

        gpt.trf_blocks[b].ff.layers[0].weight = assign(
            gpt.trf_blocks[b].ff.layers[0].weight,
            params["blocks"][b]["mlp"]["c_fc"]["w"].T)
        gpt.trf_blocks[b].ff.layers[0].bias = assign(
            gpt.trf_blocks[b].ff.layers[0].bias,
            params["blocks"][b]["mlp"]["c_fc"]["b"])
        gpt.trf_blocks[b].ff.layers[2].weight = assign(
            gpt.trf_blocks[b].ff.layers[2].weight,
            params["blocks"][b]["mlp"]["c_proj"]["w"].T)
        gpt.trf_blocks[b].ff.layers[2].bias = assign(
            gpt.trf_blocks[b].ff.layers[2].bias,
            params["blocks"][b]["mlp"]["c_proj"]["b"])

        gpt.trf_blocks[b].norm1.scale = assign(
            gpt.trf_blocks[b].norm1.scale,
            params["blocks"][b]["ln_1"]["g"])
        gpt.trf_blocks[b].norm1.shift = assign(
            gpt.trf_blocks[b].norm1.shift,
            params["blocks"][b]["ln_1"]["b"])
        gpt.trf_blocks[b].norm2.scale = assign(
            gpt.trf_blocks[b].norm2.scale,
            params["blocks"][b]["ln_2"]["g"])
        gpt.trf_blocks[b].norm2.shift = assign(
            gpt.trf_blocks[b].norm2.shift,
            params["blocks"][b]["ln_2"]["b"])

    gpt.final_norm.scale = assign(gpt.final_norm.scale, params["g"])
    gpt.final_norm.shift = assign(gpt.final_norm.shift, params["b"])
    gpt.out_head.weight = assign(gpt.out_head.weight, params["wte"])


def text_to_token_ids(text, tokenizer):
    encoded = tokenizer.encode(text, allowed_special={'<|endoftext|>'})
    encoded_tensor = torch.tensor(encoded).unsqueeze(0)  # add batch dimension
    return encoded_tensor


def token_ids_to_text(token_ids, tokenizer):
    flat = token_ids.squeeze(0)  # remove batch dimension
    return tokenizer.decode(flat.tolist())
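A usage sketch for the dataloader helper defined above (the sample text is illustrative):

```python
raw_text = "In the heart of the city stood the old library, a relic from a bygone era."

dataloader = create_dataloader_v1(raw_text, batch_size=2, max_length=4, stride=4, shuffle=False)
inputs, targets = next(iter(dataloader))
print(inputs.shape, targets.shape)  # torch.Size([2, 4]) torch.Size([2, 4])
```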
ch06/03_bonus_imdb-classification/requirements-extra.txt (new file, 2 lines)
@@ -0,0 +1,2 @@
transformers>=4.33.2
scikit-learn>=1.3.0
ch06/03_bonus_imdb-classification/sklearn-baseline.ipynb (new file, 277 lines)
@@ -0,0 +1,277 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "8968a681-2db1-4840-bb73-7d6c95986825",
   "metadata": {},
   "source": [
    "<table style=\"width:100%\">\n",
    "<tr>\n",
    "<td style=\"vertical-align:middle; text-align:left;\">\n",
    "<font size=\"2\">\n",
    "Supplementary code for the <a href=\"http://mng.bz/orYv\">Build a Large Language Model From Scratch</a> book by <a href=\"https://sebastianraschka.com\">Sebastian Raschka</a><br>\n",
    "<br>Code repository: <a href=\"https://github.com/rasbt/LLMs-from-scratch\">https://github.com/rasbt/LLMs-from-scratch</a>\n",
    "</font>\n",
    "</td>\n",
    "<td style=\"vertical-align:middle; text-align:left;\">\n",
    "<a href=\"http://mng.bz/orYv\"><img src=\"https://sebastianraschka.com/images/LLMs-from-scratch-images/cover-small.webp\" width=\"100px\"></a>\n",
    "</td>\n",
    "</tr>\n",
    "</table>"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8b6e1cdd-b14e-4368-bdbb-9bf7ab821791",
   "metadata": {},
   "source": [
    "# Scikit-learn logistic regression model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "c2a72242-6197-4bef-aa05-696a152350d5",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "100% | 80.23 MB | 4.37 MB/s | 18.38 sec elapsed"
     ]
    }
   ],
   "source": [
    "!python download-prepare-dataset.py"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "69f32433-e19c-4066-b806-8f30b408107f",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "train_df = pd.read_csv(\"train.csv\")\n",
    "val_df = pd.read_csv(\"validation.csv\")\n",
    "test_df = pd.read_csv(\"test.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "0808b212-fe91-48d9-80b8-55519f8835d5",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>text</th>\n",
       "      <th>label</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>The only reason I saw \"Shakedown\" was that it ...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>This is absolute drivel, designed to shock and...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Lots of scenes and dialogue are flat-out goofy...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>** and 1/2 stars out of **** Lifeforce is one ...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>I learned a thing: you have to take this film ...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                text  label\n",
       "0  The only reason I saw \"Shakedown\" was that it ...      0\n",
       "1  This is absolute drivel, designed to shock and...      0\n",
       "2  Lots of scenes and dialogue are flat-out goofy...      1\n",
       "3  ** and 1/2 stars out of **** Lifeforce is one ...      1\n",
       "4  I learned a thing: you have to take this film ...      1"
      ]
     },
|
||||
"execution_count": 16,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"train_df.head()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "fae87bc1-14ca-4f89-8e12-49f77b0ec00d",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Scikit-learn baseline"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 17,
|
||||
"id": "180318b7-de18-4b05-b84a-ba97c72b9d8e",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from sklearn.feature_extraction.text import CountVectorizer\n",
|
||||
"from sklearn.linear_model import LogisticRegression\n",
|
||||
"from sklearn.metrics import accuracy_score"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 20,
|
||||
"id": "25090b7c-f516-4be2-8083-3a7187fe4635",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"vectorizer = CountVectorizer()\n",
|
||||
"\n",
|
||||
"X_train = vectorizer.fit_transform(train_df[\"text\"])\n",
|
||||
"X_val = vectorizer.transform(val_df[\"text\"])\n",
|
||||
"X_test = vectorizer.transform(test_df[\"text\"])\n",
|
||||
"\n",
|
||||
"y_train, y_val, y_test = train_df[\"label\"], val_df[\"label\"], test_df[\"label\"]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 22,
|
||||
"id": "0247de3a-88f0-4b9c-becd-157baf3acf49",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def eval(model, X_train, y_train, X_val, y_val, X_test, y_test):\n",
|
||||
" # Making predictions\n",
|
||||
" y_pred_train = model.predict(X_train)\n",
|
||||
" y_pred_val = model.predict(X_val)\n",
|
||||
" y_pred_test = model.predict(X_test)\n",
|
||||
" \n",
|
||||
" # Calculating accuracy and balanced accuracy\n",
|
||||
" accuracy_train = accuracy_score(y_train, y_pred_train)\n",
|
||||
" balanced_accuracy_train = balanced_accuracy_score(y_train, y_pred_train)\n",
|
||||
" \n",
|
||||
" accuracy_val = accuracy_score(y_val, y_pred_val)\n",
|
||||
" balanced_accuracy_val = balanced_accuracy_score(y_val, y_pred_val)\n",
|
||||
"\n",
|
||||
" accuracy_test = accuracy_score(y_test, y_pred_test)\n",
|
||||
" balanced_accuracy_test = balanced_accuracy_score(y_test, y_pred_test)\n",
|
||||
" \n",
|
||||
" # Printing the results\n",
|
||||
" print(f\"Training Accuracy: {accuracy_train*100:.2f}%\")\n",
|
||||
" print(f\"Validation Accuracy: {accuracy_val*100:.2f}%\")\n",
|
||||
" print(f\"Test Accuracy: {accuracy_test*100:.2f}%\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 23,
|
||||
"id": "c29c6dfc-f72d-40ab-8cb5-783aad1a15ab",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Training Accuracy: 50.01%\n",
|
||||
"Validation Accuracy: 50.14%\n",
|
||||
"Test Accuracy: 49.91%\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from sklearn.dummy import DummyClassifier\n",
|
||||
"\n",
|
||||
"# Create a dummy classifier with the strategy to predict the most frequent class\n",
|
||||
"dummy_clf = DummyClassifier(strategy=\"most_frequent\")\n",
|
||||
"dummy_clf.fit(X_train, y_train)\n",
|
||||
"\n",
|
||||
"eval(dummy_clf, X_train, y_train, X_val, y_val, X_test, y_test)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 24,
|
||||
"id": "088a8a3a-3b74-4d10-a51b-cb662569ae39",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Training Accuracy: 99.80%\n",
|
||||
"Validation Accuracy: 88.62%\n",
|
||||
"Test Accuracy: 88.85%\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"model = LogisticRegression(max_iter=1000)\n",
|
||||
"model.fit(X_train, y_train)\n",
|
||||
"eval(model, X_train, y_train, X_val, y_val, X_test, y_test)"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.4"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
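The eval function above also references balanced accuracy (commented out, as in the accompanying train-sklearn-logreg.py script). A minimal self-contained sketch, with made-up toy labels, of why that metric can matter:

from sklearn.metrics import accuracy_score, balanced_accuracy_score

y_true = [0, 0, 0, 0, 1]   # toy labels: class 0 dominates
y_pred = [0, 0, 0, 0, 0]   # a classifier that always predicts the majority class
print(accuracy_score(y_true, y_pred))           # 0.80 -- looks deceptively good
print(balanced_accuracy_score(y_true, y_pred))  # 0.50 -- averages per-class recall

The IMDB splits used here are balanced, so plain accuracy is an adequate metric in this notebook, and the two scores would nearly coincide.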
ch06/03_bonus_imdb-classification/train-bert-hf.py (new file, 301 lines)
@@ -0,0 +1,301 @@
# Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt).
# Source for "Build a Large Language Model From Scratch"
#   - https://www.manning.com/books/build-a-large-language-model-from-scratch
# Code: https://github.com/rasbt/LLMs-from-scratch

import argparse
from pathlib import Path
import time

import pandas as pd
import torch
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

from transformers import AutoTokenizer, AutoModelForSequenceClassification


class IMDBDataset(Dataset):
    def __init__(self, csv_file, tokenizer, max_length=None, pad_token_id=0):
        self.data = pd.read_csv(csv_file)
        self.max_length = max_length if max_length is not None else self._longest_encoded_length(tokenizer)

        # Pre-tokenize texts, truncating each to max_length tokens
        self.encoded_texts = [
            tokenizer.encode(text)[:self.max_length]
            for text in self.data["text"]
        ]
        # Pad sequences to the longest sequence
        self.encoded_texts = [
            et + [pad_token_id] * (self.max_length - len(et))
            for et in self.encoded_texts
        ]

    def __getitem__(self, index):
        encoded = self.encoded_texts[index]
        label = self.data.iloc[index]["label"]
        return torch.tensor(encoded, dtype=torch.long), torch.tensor(label, dtype=torch.long)

    def __len__(self):
        return len(self.data)

    def _longest_encoded_length(self, tokenizer):
        max_length = 0
        for text in self.data["text"]:
            encoded_length = len(tokenizer.encode(text))
            if encoded_length > max_length:
                max_length = encoded_length
        return max_length


def calc_loss_batch(input_batch, target_batch, model, device):
    input_batch, target_batch = input_batch.to(device), target_batch.to(device)
    logits = model(input_batch).logits  # Hugging Face models return an output object
    loss = torch.nn.functional.cross_entropy(logits, target_batch)
    return loss


# Same as in chapter 5
def calc_loss_loader(data_loader, model, device, num_batches=None):
    total_loss = 0.
    if num_batches is None:
        num_batches = len(data_loader)
    else:
        # Reduce the number of batches to match the total number of batches in the data loader
        # if num_batches exceeds the number of batches in the data loader
        num_batches = min(num_batches, len(data_loader))
    for i, (input_batch, target_batch) in enumerate(data_loader):
        if i < num_batches:
            loss = calc_loss_batch(input_batch, target_batch, model, device)
            total_loss += loss.item()
        else:
            break
    return total_loss / num_batches


@torch.no_grad()  # Disable gradient tracking for efficiency
def calc_accuracy_loader(data_loader, model, device, num_batches=None):
    model.eval()
    correct_predictions, num_examples = 0, 0

    if num_batches is None:
        num_batches = len(data_loader)
    else:
        num_batches = min(num_batches, len(data_loader))
    for i, (input_batch, target_batch) in enumerate(data_loader):
        if i < num_batches:
            input_batch, target_batch = input_batch.to(device), target_batch.to(device)
            logits = model(input_batch).logits
            predicted_labels = torch.argmax(logits, dim=1)
            num_examples += predicted_labels.shape[0]
            correct_predictions += (predicted_labels == target_batch).sum().item()
        else:
            break
    return correct_predictions / num_examples


def evaluate_model(model, train_loader, val_loader, device, eval_iter):
    model.eval()
    with torch.no_grad():
        train_loss = calc_loss_loader(train_loader, model, device, num_batches=eval_iter)
        val_loss = calc_loss_loader(val_loader, model, device, num_batches=eval_iter)
    model.train()
    return train_loss, val_loss


def train_classifier_simple(model, train_loader, val_loader, optimizer, device, num_epochs,
                            eval_freq, eval_iter, tokenizer, max_steps=None):
    # Initialize lists to track losses and examples seen
    train_losses, val_losses, train_accs, val_accs = [], [], [], []
    examples_seen, global_step = 0, -1

    # Main training loop
    for epoch in range(num_epochs):
        model.train()  # Set model to training mode

        for input_batch, target_batch in train_loader:
            optimizer.zero_grad()  # Reset loss gradients from previous batch iteration
            loss = calc_loss_batch(input_batch, target_batch, model, device)
            loss.backward()  # Calculate loss gradients
            optimizer.step()  # Update model weights using loss gradients
            examples_seen += input_batch.shape[0]  # Track examples instead of tokens
            global_step += 1

            # Optional evaluation step
            if global_step % eval_freq == 0:
                train_loss, val_loss = evaluate_model(
                    model, train_loader, val_loader, device, eval_iter)
                train_losses.append(train_loss)
                val_losses.append(val_loss)
                print(f"Ep {epoch+1} (Step {global_step:06d}): "
                      f"Train loss {train_loss:.3f}, Val loss {val_loss:.3f}")

            if max_steps is not None and global_step > max_steps:
                break

        # Calculate accuracy after each epoch
        train_accuracy = calc_accuracy_loader(train_loader, model, device, num_batches=eval_iter)
        val_accuracy = calc_accuracy_loader(val_loader, model, device, num_batches=eval_iter)
        print(f"Training accuracy: {train_accuracy*100:.2f}% | ", end="")
        print(f"Validation accuracy: {val_accuracy*100:.2f}%")
        train_accs.append(train_accuracy)
        val_accs.append(val_accuracy)

        if max_steps is not None and global_step > max_steps:
            break

    return train_losses, val_losses, train_accs, val_accs, examples_seen


if __name__ == "__main__":

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--trainable_layers",
        type=str,
        default="last_block",
        help=(
            "Which layers to train. Options: 'all', 'last_block', 'last_layer'."
        )
    )
    parser.add_argument(
        "--bert_model",
        type=str,
        default="distilbert",
        help=(
            "Which BERT-style model to finetune. Options: 'distilbert', 'roberta'."
        )
    )
    args = parser.parse_args()

    ###############################
    # Load model
    ###############################

    torch.manual_seed(123)
    if args.bert_model == "distilbert":

        model = AutoModelForSequenceClassification.from_pretrained(
            "distilbert-base-uncased", num_labels=2
        )
        # Re-initialize the two-class classification head
        model.classifier = torch.nn.Linear(in_features=768, out_features=2)

        # Freeze all parameters first; the selected layers are unfrozen below
        for param in model.parameters():
            param.requires_grad = False

        if args.trainable_layers == "last_layer":
            for param in model.classifier.parameters():
                param.requires_grad = True
        elif args.trainable_layers == "last_block":
            for param in model.classifier.parameters():
                param.requires_grad = True
            for param in model.pre_classifier.parameters():
                param.requires_grad = True
            for param in model.distilbert.transformer.layer[-1].parameters():
                param.requires_grad = True
        elif args.trainable_layers == "all":
            for param in model.parameters():
                param.requires_grad = True
        else:
            raise ValueError("Invalid --trainable_layers argument.")

        tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

    elif args.bert_model == "roberta":

        model = AutoModelForSequenceClassification.from_pretrained(
            "FacebookAI/roberta-large", num_labels=2
        )
        # Re-initialize the output projection of the classification head
        model.classifier.out_proj = torch.nn.Linear(in_features=1024, out_features=2)

        # Freeze all parameters first; the selected layers are unfrozen below
        for param in model.parameters():
            param.requires_grad = False

        if args.trainable_layers == "last_layer":
            for param in model.classifier.parameters():
                param.requires_grad = True
        elif args.trainable_layers == "last_block":
            for param in model.classifier.parameters():
                param.requires_grad = True
            for param in model.roberta.encoder.layer[-1].parameters():
                param.requires_grad = True
        elif args.trainable_layers == "all":
            for param in model.parameters():
                param.requires_grad = True
        else:
            raise ValueError("Invalid --trainable_layers argument.")

        tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-large")

    else:
        raise ValueError("Selected --bert_model not supported.")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()

    ###############################
    # Instantiate dataloaders
    ###############################

    pad_token_id = tokenizer.pad_token_id

    base_path = Path(".")

    train_dataset = IMDBDataset(base_path / "train.csv", max_length=256, tokenizer=tokenizer, pad_token_id=pad_token_id)
    val_dataset = IMDBDataset(base_path / "validation.csv", max_length=256, tokenizer=tokenizer, pad_token_id=pad_token_id)
    test_dataset = IMDBDataset(base_path / "test.csv", max_length=256, tokenizer=tokenizer, pad_token_id=pad_token_id)

    num_workers = 0
    batch_size = 8

    train_loader = DataLoader(
        dataset=train_dataset,
        batch_size=batch_size,
        shuffle=True,
        num_workers=num_workers,
        drop_last=True,
    )

    val_loader = DataLoader(
        dataset=val_dataset,
        batch_size=batch_size,
        num_workers=num_workers,
        drop_last=False,
    )

    test_loader = DataLoader(
        dataset=test_dataset,
        batch_size=batch_size,
        num_workers=num_workers,
        drop_last=False,
    )

    ###############################
    # Train model
    ###############################

    start_time = time.time()
    torch.manual_seed(123)
    optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, weight_decay=0.1)

    num_epochs = 3
    train_losses, val_losses, train_accs, val_accs, examples_seen = train_classifier_simple(
        model, train_loader, val_loader, optimizer, device,
        num_epochs=num_epochs, eval_freq=50, eval_iter=20,
        tokenizer=tokenizer, max_steps=None
    )

    end_time = time.time()
    execution_time_minutes = (end_time - start_time) / 60
    print(f"Training completed in {execution_time_minutes:.2f} minutes.")

    ###############################
    # Evaluate model
    ###############################

    print("\nEvaluating on the full datasets ...\n")

    train_accuracy = calc_accuracy_loader(train_loader, model, device)
    val_accuracy = calc_accuracy_loader(val_loader, model, device)
    test_accuracy = calc_accuracy_loader(test_loader, model, device)

    print(f"Training accuracy: {train_accuracy*100:.2f}%")
    print(f"Validation accuracy: {val_accuracy*100:.2f}%")
    print(f"Test accuracy: {test_accuracy*100:.2f}%")
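A minimal inference sketch for the finetuned Hugging Face classifier (model, tokenizer, and device as set up in the script above; the review text is made up for illustration):

import torch

model.eval()
text = "This movie was a complete waste of time."  # hypothetical review
encoded = tokenizer(text, return_tensors="pt", truncation=True, max_length=256)
with torch.no_grad():
    logits = model(encoded["input_ids"].to(device)).logits
print("positive" if torch.argmax(logits, dim=-1).item() == 1 else "negative")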
ch06/03_bonus_imdb-classification/train-gpt.py (new file, 366 lines)
@@ -0,0 +1,366 @@
# Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt).
# Source for "Build a Large Language Model From Scratch"
#   - https://www.manning.com/books/build-a-large-language-model-from-scratch
# Code: https://github.com/rasbt/LLMs-from-scratch

import argparse
from pathlib import Path
import time

import pandas as pd
import tiktoken
import torch
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

from gpt_download import download_and_load_gpt2
from previous_chapters import GPTModel, load_weights_into_gpt


class IMDBDataset(Dataset):
    def __init__(self, csv_file, tokenizer, max_length=None, pad_token_id=50256):
        self.data = pd.read_csv(csv_file)
        self.max_length = max_length if max_length is not None else self._longest_encoded_length(tokenizer)

        # Pre-tokenize texts, truncating each to max_length tokens
        self.encoded_texts = [
            tokenizer.encode(text)[:self.max_length]
            for text in self.data["text"]
        ]
        # Pad sequences to the longest sequence
        self.encoded_texts = [
            et + [pad_token_id] * (self.max_length - len(et))
            for et in self.encoded_texts
        ]

    def __getitem__(self, index):
        encoded = self.encoded_texts[index]
        label = self.data.iloc[index]["label"]
        return torch.tensor(encoded, dtype=torch.long), torch.tensor(label, dtype=torch.long)

    def __len__(self):
        return len(self.data)

    def _longest_encoded_length(self, tokenizer):
        max_length = 0
        for text in self.data["text"]:
            encoded_length = len(tokenizer.encode(text))
            if encoded_length > max_length:
                max_length = encoded_length
        return max_length


def instantiate_model(choose_model, load_weights):

    BASE_CONFIG = {
        "vocab_size": 50257,     # Vocabulary size
        "context_length": 1024,  # Context length
        "drop_rate": 0.0,        # Dropout rate
        "qkv_bias": True         # Query-key-value bias
    }

    model_configs = {
        "gpt2-small (124M)": {"emb_dim": 768, "n_layers": 12, "n_heads": 12},
        "gpt2-medium (355M)": {"emb_dim": 1024, "n_layers": 24, "n_heads": 16},
        "gpt2-large (774M)": {"emb_dim": 1280, "n_layers": 36, "n_heads": 20},
        "gpt2-xl (1558M)": {"emb_dim": 1600, "n_layers": 48, "n_heads": 25},
    }

    BASE_CONFIG.update(model_configs[choose_model])

    if not load_weights:
        torch.manual_seed(123)
    model = GPTModel(BASE_CONFIG)

    if load_weights:
        model_size = choose_model.split(" ")[-1].lstrip("(").rstrip(")")
        settings, params = download_and_load_gpt2(model_size=model_size, models_dir="gpt2")
        load_weights_into_gpt(model, params)

    model.eval()
    return model


def calc_loss_batch(input_batch, target_batch, model, device, trainable_token=-1):
    input_batch, target_batch = input_batch.to(device), target_batch.to(device)
    logits = model(input_batch)[:, trainable_token, :]  # Logits of the selected token position
    loss = torch.nn.functional.cross_entropy(logits, target_batch)
    return loss


def calc_loss_loader(data_loader, model, device, num_batches=None, trainable_token=-1):
    total_loss = 0.
    if len(data_loader) == 0:
        return float("nan")
    elif num_batches is None:
        num_batches = len(data_loader)
    else:
        # Reduce the number of batches to match the total number of batches in the data loader
        # if num_batches exceeds the number of batches in the data loader
        num_batches = min(num_batches, len(data_loader))
    for i, (input_batch, target_batch) in enumerate(data_loader):
        if i < num_batches:
            loss = calc_loss_batch(input_batch, target_batch, model, device, trainable_token=trainable_token)
            total_loss += loss.item()
        else:
            break
    return total_loss / num_batches


@torch.no_grad()  # Disable gradient tracking for efficiency
def calc_accuracy_loader(data_loader, model, device, num_batches=None, trainable_token=-1):
    model.eval()
    correct_predictions, num_examples = 0, 0

    if num_batches is None:
        num_batches = len(data_loader)
    else:
        num_batches = min(num_batches, len(data_loader))
    for i, (input_batch, target_batch) in enumerate(data_loader):
        if i < num_batches:
            input_batch, target_batch = input_batch.to(device), target_batch.to(device)
            logits = model(input_batch)[:, trainable_token, :]  # Logits of the selected token position
            predicted_labels = torch.argmax(logits, dim=-1)

            num_examples += predicted_labels.shape[0]
            correct_predictions += (predicted_labels == target_batch).sum().item()
        else:
            break
    return correct_predictions / num_examples


def evaluate_model(model, train_loader, val_loader, device, eval_iter, trainable_token=-1):
    model.eval()
    with torch.no_grad():
        train_loss = calc_loss_loader(train_loader, model, device, num_batches=eval_iter, trainable_token=trainable_token)
        val_loss = calc_loss_loader(val_loader, model, device, num_batches=eval_iter, trainable_token=trainable_token)
    model.train()
    return train_loss, val_loss


def train_classifier_simple(model, train_loader, val_loader, optimizer, device, num_epochs,
                            eval_freq, eval_iter, tokenizer, max_steps=None, trainable_token=-1):
    # Initialize lists to track losses and examples seen
    train_losses, val_losses, train_accs, val_accs = [], [], [], []
    examples_seen, global_step = 0, -1

    # Main training loop
    for epoch in range(num_epochs):
        model.train()  # Set model to training mode

        for input_batch, target_batch in train_loader:
            optimizer.zero_grad()  # Reset loss gradients from previous batch iteration
            loss = calc_loss_batch(input_batch, target_batch, model, device, trainable_token=trainable_token)
            loss.backward()  # Calculate loss gradients
            optimizer.step()  # Update model weights using loss gradients
            examples_seen += input_batch.shape[0]  # Track examples instead of tokens
            global_step += 1

            # Optional evaluation step
            if global_step % eval_freq == 0:
                train_loss, val_loss = evaluate_model(
                    model, train_loader, val_loader, device, eval_iter, trainable_token=trainable_token)
                train_losses.append(train_loss)
                val_losses.append(val_loss)
                print(f"Ep {epoch+1} (Step {global_step:06d}): "
                      f"Train loss {train_loss:.3f}, Val loss {val_loss:.3f}")

            if max_steps is not None and global_step > max_steps:
                break

        # Calculate accuracy after each epoch
        train_accuracy = calc_accuracy_loader(train_loader, model, device, num_batches=eval_iter, trainable_token=trainable_token)
        val_accuracy = calc_accuracy_loader(val_loader, model, device, num_batches=eval_iter, trainable_token=trainable_token)
        print(f"Training accuracy: {train_accuracy*100:.2f}% | ", end="")
        print(f"Validation accuracy: {val_accuracy*100:.2f}%")
        train_accs.append(train_accuracy)
        val_accs.append(val_accuracy)

        if max_steps is not None and global_step > max_steps:
            break

    return train_losses, val_losses, train_accs, val_accs, examples_seen


if __name__ == "__main__":

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--model_size",
        type=str,
        default="gpt2-small (124M)",
        help=(
            "Which GPT model to use. Options: 'gpt2-small (124M)', 'gpt2-medium (355M)',"
            " 'gpt2-large (774M)', 'gpt2-xl (1558M)'."
        )
    )
    parser.add_argument(
        "--weights",
        type=str,
        default="pretrained",
        help=(
            "Whether to use 'pretrained' or 'random' weights."
        )
    )
    parser.add_argument(
        "--trainable_layers",
        type=str,
        default="last_block",
        help=(
            "Which layers to train. Options: 'all', 'last_block', 'last_layer'."
        )
    )
    parser.add_argument(
        "--trainable_token",
        type=str,
        default="last",
        help=(
            "Which token position to train on. Options: 'first', 'last'."
        )
    )
    parser.add_argument(
        "--context_length",
        type=str,
        default="256",
        help=(
            "The context length of the data inputs. "
            "Options: 'longest_training_example', 'model_context_length', or an integer value."
        )
    )

    args = parser.parse_args()

    if args.trainable_token == "first":
        args.trainable_token = 0
    elif args.trainable_token == "last":
        args.trainable_token = -1
    else:
        raise ValueError("Invalid --trainable_token argument")

    ###############################
    # Load model
    ###############################

    if args.weights == "pretrained":
        load_weights = True
    elif args.weights == "random":
        load_weights = False
    else:
        raise ValueError("Invalid --weights argument.")

    model = instantiate_model(args.model_size, load_weights)
    for param in model.parameters():
        param.requires_grad = False

    if args.model_size == "gpt2-small (124M)":
        in_features = 768
    elif args.model_size == "gpt2-medium (355M)":
        in_features = 1024
    elif args.model_size == "gpt2-large (774M)":
        in_features = 1280
    elif args.model_size == "gpt2-xl (1558M)":
        in_features = 1600
    else:
        raise ValueError("Invalid --model_size argument")

    torch.manual_seed(123)
    # Replace the language-modeling head with a two-class classification head
    model.out_head = torch.nn.Linear(in_features=in_features, out_features=2)

    if args.trainable_layers == "last_layer":
        pass  # only the freshly initialized output head is trainable
    elif args.trainable_layers == "last_block":
        for param in model.trf_blocks[-1].parameters():
            param.requires_grad = True
        for param in model.final_norm.parameters():
            param.requires_grad = True
    elif args.trainable_layers == "all":
        for param in model.parameters():
            param.requires_grad = True
    else:
        raise ValueError("Invalid --trainable_layers argument.")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    ###############################
    # Instantiate dataloaders
    ###############################

    base_path = Path(".")

    tokenizer = tiktoken.get_encoding("gpt2")

    train_dataset = None
    if args.context_length == "model_context_length":
        max_length = model.pos_emb.weight.shape[0]
    elif args.context_length == "longest_training_example":
        train_dataset = IMDBDataset(base_path / "train.csv", max_length=None, tokenizer=tokenizer)
        max_length = train_dataset.max_length
    else:
        try:
            max_length = int(args.context_length)
        except ValueError:
            raise ValueError("Invalid --context_length argument")

    if train_dataset is None:
        train_dataset = IMDBDataset(base_path / "train.csv", max_length=max_length, tokenizer=tokenizer)
    val_dataset = IMDBDataset(base_path / "validation.csv", max_length=max_length, tokenizer=tokenizer)
    test_dataset = IMDBDataset(base_path / "test.csv", max_length=max_length, tokenizer=tokenizer)

    num_workers = 0
    batch_size = 8

    train_loader = DataLoader(
        dataset=train_dataset,
        batch_size=batch_size,
        shuffle=True,
        num_workers=num_workers,
        drop_last=True,
    )

    val_loader = DataLoader(
        dataset=val_dataset,
        batch_size=batch_size,
        num_workers=num_workers,
        drop_last=False,
    )

    test_loader = DataLoader(
        dataset=test_dataset,
        batch_size=batch_size,
        num_workers=num_workers,
        drop_last=False,
    )

    ###############################
    # Train model
    ###############################

    start_time = time.time()
    torch.manual_seed(123)
    optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, weight_decay=0.1)

    num_epochs = 3
    train_losses, val_losses, train_accs, val_accs, examples_seen = train_classifier_simple(
        model, train_loader, val_loader, optimizer, device,
        num_epochs=num_epochs, eval_freq=50, eval_iter=20,
        tokenizer=tokenizer, max_steps=None, trainable_token=args.trainable_token
    )

    end_time = time.time()
    execution_time_minutes = (end_time - start_time) / 60
    print(f"Training completed in {execution_time_minutes:.2f} minutes.")

    ###############################
    # Evaluate model
    ###############################

    print("\nEvaluating on the full datasets ...\n")

    train_accuracy = calc_accuracy_loader(train_loader, model, device, trainable_token=args.trainable_token)
    val_accuracy = calc_accuracy_loader(val_loader, model, device, trainable_token=args.trainable_token)
    test_accuracy = calc_accuracy_loader(test_loader, model, device, trainable_token=args.trainable_token)

    print(f"Training accuracy: {train_accuracy*100:.2f}%")
    print(f"Validation accuracy: {val_accuracy*100:.2f}%")
    print(f"Test accuracy: {test_accuracy*100:.2f}%")
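All of the script's behavior is controlled through the argparse flags defined above; a typical invocation that spells out the defaults is:

python train-gpt.py --model_size "gpt2-small (124M)" --weights pretrained --trainable_layers last_block --trainable_token last --context_length 256

Passing --weights random gives a useful ablation: it trains the same classification head on an untrained GPT backbone, isolating how much of the accuracy comes from pretraining.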
ch06/03_bonus_imdb-classification/train-sklearn-logreg.py (new file, 75 lines)
@@ -0,0 +1,75 @@
# Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt).
# Source for "Build a Large Language Model From Scratch"
#   - https://www.manning.com/books/build-a-large-language-model-from-scratch
# Code: https://github.com/rasbt/LLMs-from-scratch

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
# from sklearn.metrics import balanced_accuracy_score
from sklearn.dummy import DummyClassifier


def load_dataframes():
    df_train = pd.read_csv("train.csv")
    df_val = pd.read_csv("validation.csv")
    df_test = pd.read_csv("test.csv")

    return df_train, df_val, df_test


def eval(model, X_train, y_train, X_val, y_val, X_test, y_test):
    # Making predictions
    y_pred_train = model.predict(X_train)
    y_pred_val = model.predict(X_val)
    y_pred_test = model.predict(X_test)

    # Calculating accuracy (balanced accuracy is left commented out)
    accuracy_train = accuracy_score(y_train, y_pred_train)
    # balanced_accuracy_train = balanced_accuracy_score(y_train, y_pred_train)

    accuracy_val = accuracy_score(y_val, y_pred_val)
    # balanced_accuracy_val = balanced_accuracy_score(y_val, y_pred_val)

    accuracy_test = accuracy_score(y_test, y_pred_test)
    # balanced_accuracy_test = balanced_accuracy_score(y_test, y_pred_test)

    # Printing the results
    print(f"Training Accuracy: {accuracy_train*100:.2f}%")
    print(f"Validation Accuracy: {accuracy_val*100:.2f}%")
    print(f"Test Accuracy: {accuracy_test*100:.2f}%")

    # print(f"\nTraining Balanced Accuracy: {balanced_accuracy_train*100:.2f}%")
    # print(f"Validation Balanced Accuracy: {balanced_accuracy_val*100:.2f}%")
    # print(f"Test Balanced Accuracy: {balanced_accuracy_test*100:.2f}%")


if __name__ == "__main__":
    df_train, df_val, df_test = load_dataframes()

    #########################################
    # Convert text into bag-of-words model
    #########################################
    vectorizer = CountVectorizer()

    X_train = vectorizer.fit_transform(df_train["text"])
    X_val = vectorizer.transform(df_val["text"])
    X_test = vectorizer.transform(df_test["text"])
    y_train, y_val, y_test = df_train["label"], df_val["label"], df_test["label"]

    #####################################
    # Model training and evaluation
    #####################################

    # Create a dummy classifier with the strategy to predict the most frequent class
    dummy_clf = DummyClassifier(strategy="most_frequent")
    dummy_clf.fit(X_train, y_train)

    print("Dummy classifier:")
    eval(dummy_clf, X_train, y_train, X_val, y_val, X_test, y_test)

    print("\n\nLogistic regression classifier:")
    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, y_train)
    eval(model, X_train, y_train, X_val, y_val, X_test, y_test)
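Like the notebook, this script expects the train.csv, validation.csv, and test.csv produced by download-prepare-dataset.py in the working directory and runs as python train-sklearn-logreg.py. Once the model is fit, a short sketch (using the vectorizer and model names from the __main__ block above) for inspecting which bag-of-words features drive the logistic-regression decision:

import numpy as np

feature_names = vectorizer.get_feature_names_out()
coefs = model.coef_.ravel()  # one weight per vocabulary word
print([feature_names[i] for i in np.argsort(coefs)[-5:]])  # words pushing toward label 1
print([feature_names[i] for i in np.argsort(coefs)[:5]])   # words pushing toward label 0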