diff --git a/ch06/01_main-chapter-code/ch06.ipynb b/ch06/01_main-chapter-code/ch06.ipynb index efff3fe..f9e6085 100644 --- a/ch06/01_main-chapter-code/ch06.ipynb +++ b/ch06/01_main-chapter-code/ch06.ipynb @@ -7,10 +7,19 @@ "id": "c024bfa4-1a7a-4751-b5a1-827225a3478b" }, "source": [ - "\n", - "Supplementary code for \"Build a Large Language Model From Scratch\": https://www.manning.com/books/build-a-large-language-model-from-scratch by Sebastian Raschka
\n", - "Code repository: https://github.com/rasbt/LLMs-from-scratch\n", - "
" + "\n", + "\n", + "\n", + "\n", + "\n", + "
\n", + "\n", + "Supplementary code for the Build a Large Language Model From Scratch book by Sebastian Raschka
\n", + "
Code repository: https://github.com/rasbt/LLMs-from-scratch\n", + "
\n", + "
\n", + "\n", + "
" ] }, { @@ -907,7 +916,7 @@ "id": "ab8e056c-abe0-415f-b34d-df686204259e", "metadata": {}, "source": [ - "- To ensure that the model was loaded corrected, let's double-check that it generates coherent text" + "- 为了确保模型加载正确,让我们仔细检查它是否生成连贯的文本。" ] }, { @@ -951,7 +960,7 @@ "id": "69162550-6a02-4ece-8db1-06c71d61946f", "metadata": {}, "source": [ - "- Before we finetune the model as a classifier, let's see if the model can perhaps already classify spam messages via prompting" + "- 在我们将模型微调为分类器之前,让我们看看模型是否已经可以通过提示对垃圾邮件进行分类。" ] }, { @@ -991,8 +1000,8 @@ "id": "1ce39ed0-2c77-410d-8392-dd15d4b22016", "metadata": {}, "source": [ - "- As we can see, the model is not very good at following instructions\n", - "- This is expected, since it has only been pretrained and not instruction-finetuned (instruction finetuning will be covered in the next chapter)" + "- 正如我们所看到的,该模型不太擅长遵循指令\n", + "- 这是预料之中的,因为它只经过了预训练,没有进行指令微调(指令微调将在下一章中介绍)" ] }, { diff --git a/ch06/01_main-chapter-code/exercise-solutions.ipynb b/ch06/01_main-chapter-code/exercise-solutions.ipynb index b3a781b..0e2d502 100644 --- a/ch06/01_main-chapter-code/exercise-solutions.ipynb +++ b/ch06/01_main-chapter-code/exercise-solutions.ipynb @@ -5,10 +5,19 @@ "id": "ba450fb1-8a26-4894-ab7a-5d7bfefe90ce", "metadata": {}, "source": [ - "\n", - "Supplementary code for \"Build a Large Language Model From Scratch\": https://www.manning.com/books/build-a-large-language-model-from-scratch by Sebastian Raschka
\n", - "Code repository: https://github.com/rasbt/LLMs-from-scratch\n", - "
" + "\n", + "\n", + "\n", + "\n", + "\n", + "
\n", + "\n", + "Supplementary code for the Build a Large Language Model From Scratch book by Sebastian Raschka
\n", + "
Code repository: https://github.com/rasbt/LLMs-from-scratch\n", + "
\n", + "
\n", + "\n", + "
" ] }, { diff --git a/ch06/01_main-chapter-code/gpt-class-finetune.py b/ch06/01_main-chapter-code/gpt-class-finetune.py index b1c7053..bc5666b 100644 --- a/ch06/01_main-chapter-code/gpt-class-finetune.py +++ b/ch06/01_main-chapter-code/gpt-class-finetune.py @@ -21,15 +21,34 @@ from gpt_download import download_and_load_gpt2 from previous_chapters import GPTModel, load_weights_into_gpt -def download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path): +def download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path, test_mode=False): if data_file_path.exists(): print(f"{data_file_path} already exists. Skipping download and extraction.") return - # Downloading the file - with urllib.request.urlopen(url) as response: - with open(zip_path, "wb") as out_file: - out_file.write(response.read()) + if test_mode: # Try multiple times since CI sometimes has connectivity issues + max_retries = 5 + delay = 5 # delay between retries in seconds + for attempt in range(max_retries): + try: + # Downloading the file + with urllib.request.urlopen(url, timeout=10) as response: + with open(zip_path, "wb") as out_file: + out_file.write(response.read()) + break # if download is successful, break out of the loop + except urllib.error.URLError as e: + print(f"Attempt {attempt + 1} failed: {e}") + if attempt < max_retries - 1: + time.sleep(delay) # wait before retrying + else: + print("Failed to download file after several attempts.") + return # exit if all retries fail + + else: # Code as it appears in the chapter + # Downloading the file + with urllib.request.urlopen(url) as response: + with open(zip_path, "wb") as out_file: + out_file.write(response.read()) # Unzipping the file with zipfile.ZipFile(zip_path, "r") as zip_ref: @@ -238,6 +257,7 @@ if __name__ == "__main__": ) parser.add_argument( "--test_mode", + default=False, action="store_true", help=("This flag runs the model in test mode for internal testing purposes. " "Otherwise, it runs the model as it is used in the chapter (recommended).") @@ -253,7 +273,7 @@ if __name__ == "__main__": extracted_path = "sms_spam_collection" data_file_path = Path(extracted_path) / "SMSSpamCollection.tsv" - download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path) + download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path, test_mode=args.test_mode) df = pd.read_csv(data_file_path, sep="\t", header=None, names=["Label", "Text"]) balanced_df = create_balanced_dataset(df) balanced_df["Label"] = balanced_df["Label"].map({"ham": 0, "spam": 1}) @@ -330,9 +350,7 @@ if __name__ == "__main__": } model = GPTModel(BASE_CONFIG) model.eval() - device = "cpu" - model.to(device) # Code as it is used in the main chapter else: @@ -355,15 +373,18 @@ if __name__ == "__main__": BASE_CONFIG.update(model_configs[CHOOSE_MODEL]) + assert train_dataset.max_length <= BASE_CONFIG["context_length"], ( + f"Dataset length {train_dataset.max_length} exceeds model's context " + f"length {BASE_CONFIG['context_length']}. 
Reinitialize data sets with " + f"`max_length={BASE_CONFIG['context_length']}`" + ) + model_size = CHOOSE_MODEL.split(" ")[-1].lstrip("(").rstrip(")") settings, params = download_and_load_gpt2(model_size=model_size, models_dir="gpt2") model = GPTModel(BASE_CONFIG) load_weights_into_gpt(model, params) - model.eval() - device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - model.to(device) ######################################## # Modify and pretrained model @@ -376,6 +397,7 @@ if __name__ == "__main__": num_classes = 2 model.out_head = torch.nn.Linear(in_features=BASE_CONFIG["emb_dim"], out_features=num_classes) + model.to(device) for param in model.trf_blocks[-1].parameters(): param.requires_grad = True diff --git a/ch06/01_main-chapter-code/gpt_download.py b/ch06/01_main-chapter-code/gpt_download.py index 0d695d2..11d648c 100644 --- a/ch06/01_main-chapter-code/gpt_download.py +++ b/ch06/01_main-chapter-code/gpt_download.py @@ -96,4 +96,4 @@ def load_gpt2_params_from_tf_ckpt(ckpt_path, settings): last_key = variable_name_parts[-1] target_dict[last_key] = variable_array - return params + return params \ No newline at end of file diff --git a/ch06/02_bonus_additional-experiments/README.md b/ch06/02_bonus_additional-experiments/README.md index d19d9ee..a2d9645 100644 --- a/ch06/02_bonus_additional-experiments/README.md +++ b/ch06/02_bonus_additional-experiments/README.md @@ -11,18 +11,21 @@ | | Model | Weights | Trainable token | Trainable layers | Context length | Training acc | Validation acc | Test acc | Training time | CPU/GPU | | ---- | ------------------ | ---------- | --------------- | ---------------- | ----------------------- | ------------ | -------------- | -------- | ------------- | ------- | -| 1 | gpt2-small (124M) | pretrained | last | last_block | longest train ex. (120) | 96.63% | 99.33% | 95.00% | 0.28 min | A100 | -| 2 | gpt2-small (124M) | pretrained | first | last_block | longest train ex. (120) | 78.46% | 80.54% | 75.00% | 0.28 min | A100 | -| 3 | gpt2-small (124M) | pretrained | last | last_layer | longest train ex. (120) | 78.65% | 79.87% | 72.00% | 0.25 min | A100 | -| 4 | gpt2-small (124M) | pretrained | last | all | longest train ex. (120) | 99.62% | 96.64% | 96.67% | 0.69 min | A100 | -| 5 | gpt2-medium (355M) | pretrained | last | last_block | longest train ex. (120) | 87.50% | 91.28% | 84.67% | 0.75 min | A100 | -| 6 | gpt2-large (774M) | pretrained | last | last_block | longest train ex. (120) | 99.52% | 98.66% | 96.67% | 1.50 min | A100 | -| 7 | gpt2-xl (1558M) | pretrained | last | last_block | longest train ex. (120) | 99.81% | 99.33% | 98.33% | 2.83 min | A100 | -| 8 | gpt2-small (124M) | random | last | all | longest train ex. (120) | 100% | 96.64% | 93.67% | 0.69 min | A100 | -| 9 | gpt2-small (124M) | pretrained | last | LoRA | longest train ex. (120) | 99.52% | 97.99% | 97.67% | 0.75 min | A100 | -| 10 | gpt2-small (124M) | pretrained | last | last_block | context length (1024) | 83.08% | 87.92% | 78.33% | 2.46 min | A100 | -| 11 | gpt2-small (124M) | pretrained | last | last_block | variable: no padding (batch size 1) | 100.00% | 98.66% | 98.00% | 1.75 min | A100 | -| 11 | gpt2-small (124M) | pretrained | last | last_block | variable: no padding (batch size 8) | 99.33% | 98.66% | 98.33% | 1.70 min | A100 | +| 1 | gpt2-small (124M) | pretrained | last | last_block | longest train ex. (120) | 96.63% | 99.33% | 95.00% | 0.28 min | A100 | +| 2 | gpt2-small (124M) | pretrained | first | last_block | longest train ex. 
(120) | 78.46% | 80.54% | 75.00% | 0.28 min | A100 | +| 3 | gpt2-small (124M) | pretrained | last | last_layer | longest train ex. (120) | 78.65% | 79.87% | 72.00% | 0.25 min | A100 | +| 4 | gpt2-small (124M) | pretrained | last | last_two_blocks | longest train ex. (120) | 98.85% | 98.66% | 98.33% | 0.33 min | A100 | +| 5 | gpt2-small (124M) | pretrained | last | all | longest train ex. (120) | 99.62% | 96.64% | 96.67% | 0.69 min | A100 | +| 6 | gpt2-medium (355M) | pretrained | last | last_block | longest train ex. (120) | 87.50% | 91.28% | 84.67% | 0.75 min | A100 | +| 7 | gpt2-large (774M) | pretrained | last | last_block | longest train ex. (120) | 99.52% | 98.66% | 96.67% | 1.50 min | A100 | +| 8 | gpt2-xl (1558M) | pretrained | last | last_block | longest train ex. (120) | 99.81% | 99.33% | 98.33% | 2.83 min | A100 | +| 9 | gpt2-small (124M) | random | last | all | longest train ex. (120) | 100% | 96.64% | 93.67% | 0.69 min | A100 | +| 10 | gpt2-small (124M) | pretrained | last | LoRA | longest train ex. (120) | 100.00% | 97.32% | 96.67% | 0.75 min | A100 | +| 11 | gpt2-small (124M) | pretrained | last | last_block | context length (1024) | 83.08% | 87.92% | 78.33% | 2.46 min | A100 | +| 12 | gpt2-small (124M) | pretrained | last | last_block | variable: no padding (batch size 1) | 100.00% | 98.66% | 98.00% | 1.75 min | A100 | +| 13 | gpt2-small (124M) | pretrained | last | last_block | variable: no padding (batch size 8) | 99.33% | 98.66% | 98.33% | 1.70 min | A100 | +| 14 | gpt2-small (124M) | pretrained | last | last_block | longest train ex. (120); but no causal mask | 99.23% | 98.66% | 95.33% | 0.29 min | A100 | +| 15 | gpt2-small (124M) | pretrained | last | last_block | longest train ex. (120) and `ignore_index` for padding | 96.63% | 99.33% | 95.00% | 0.28 min | A100 |   @@ -32,17 +35,20 @@ 您可以使用以下代码来重现实验: - Row 1: `python additional-experiments.py` -- Row 2: `python additional-experiments.py --trainable_token first` +- Row 2: `python additional-experiments.py --trainable_token_pos first` - Row 3: `python additional-experiments.py --trainable_layers last_layer` -- Row 4: `python additional-experiments.py --trainable_layers all` -- Row 5: `python additional-experiments.py --model_size "gpt2-medium (355M)"` -- Row 6: `python additional-experiments.py --model_size "gpt2-large (774M)"` -- Row 7: `python additional-experiments.py --model_size "gpt2-xl (1558M)"` -- Row 8: `python additional-experiments.py --weights random --trainable_layers all` -- Row 9: `python additional-experiments.py --trainable_layers lora --lora_rank 16 --lora_alpha 8` -- Row 10: `python additional-experiments.py --context_length "model_context_length"` -- Row 11: `python additional-experiments.py --no_padding --batch_size 1` -- Row 12: `python additional-experiments.py --no_padding --batch_size 1 --accumulation_steps 8` +- Row 4: `python additional-experiments.py --trainable_layers last_two_blocks` +- Row 5: `python additional-experiments.py --trainable_layers all` +- Row 6: `python additional-experiments.py --model_size "gpt2-medium (355M)"` +- Row 7: `python additional-experiments.py --model_size "gpt2-large (774M)"` +- Row 8: `python additional-experiments.py --model_size "gpt2-xl (1558M)"` +- Row 9: `python additional-experiments.py --weights random --trainable_layers all` +- Row 10: `python additional-experiments.py --trainable_layers lora --lora_rank 16 --lora_alpha 16` +- Row 11: `python additional-experiments.py --context_length "model_context_length"` +- Row 12: `python additional-experiments.py 
--no_padding --batch_size 1` +- Row 13: `python additional-experiments.py --no_padding --batch_size 1 --accumulation_steps 8` +- Row 14: `python additional-experiments.py --disable_causal_mask` +- Row 15: `python additional-experiments.py --ignore_index 50256` 我特意将 LLM 和数据集保持得较小,因此,如果您无法使用 GPU,您可以在 MacBook Air M3 等普通笔记本电脑上运行大约 15 分钟的训练。 @@ -50,17 +56,13 @@ ## 解释 1. **训练最后一个输出标记与第一个输出标记(第 1 行与第 2 行)**:与第一个输出标记相比,训练最后一个输出标记会带来更好的性能。由于因果自注意力掩模,这种改进是可以预期的。 - 2. **训练最后一个 Transformer 块与最后一层(第 1 行与第 3 行)**:训练整个最后一个 Transformer 块也比仅训练最后一层获得更好的结果。 - 3. **训练所有层与最后一个 Transformer 块(第 1 行与第 4 行)**:训练所有层比仅训练最后一个 Transformer 块显示出约 2% 的适度改进,但它需要的时间几乎是三倍的训练时间。 - -4. **使用更大的预训练模型(第 1 行与第 5 行,以及第 1 行与第 6 行和第 7 行)**:采用 3 倍大的预训练模型会导致更差的结果。 然而,正如预期的那样,与初始模型相比,使用大 5 倍的模型可以提高性能。 同样,12 倍大的模型进一步提高了预测性能。(中等模型可能没有经过很好的预训练,或者特定的微调配置对该模型效果不佳。) - -5. **使用具有随机权重的模型与预训练权重(第 1 行与第 8 行)**:使用具有随机权重的模型产生的结果仅比使用预训练权重稍差 1.3%。 - -6. **使用 LoRA(低阶适应)与训练所有层(第 9 行与第 4 行)**:保持模型冻结并添加可训练的 LoRA 层是训练所有模型参数的可行替代方案,甚至可以将性能提高 1%(请参阅[附录 E](../../appendix-E/01_main-chapter-code/appendix-E.ipynb)查看更多细节)。 从使用 LoRA 时训练和验证准确率之间的差距降低 1% 可以看出,这可能是由于过度拟合较少。 此外,使用 LoRA 的速度也稍快一些,因为需要更新的参数较少。 - -7. **将输入填充到完整上下文长度与最长训练示例(第 1 行与第 10 行)**:将输入填充到完整支持的上下文长度结果明显更差。 - -8. **填充与无填充(第 1 行与第 11 行和第 12 行)**:`--no_padding` 选项禁用数据集中的填充,这需要使用批量大小 1 来训练模型,因为输入具有可变长度。 这会带来更好的测试准确率,但需要更长的训练时间。 在第 12 行中,我们另外启用了 8 个步骤的梯度累积,以实现与其他实验相同的批量大小,这有助于减少过度拟合并略微提高测试集的准确性。 \ No newline at end of file +4. **训练最后一个 Transformer 块与所有层(第 1 行与第 5 行)**:训练所有层比仅训练最后一个 Transformer 块显示出约 2% 的适度改进,但就时间而言,它需要几乎三倍的时间训练持续时间。 此外,它仅训练 12 个变压器块中的最后两个,其性能也不佳。 +5. **使用更大的预训练模型(第 1 行与第 5 行,以及第 1 行与第 7 行和第 8 行)**:采用 3 倍大的预训练模型会导致更差的结果。然而,正如预期的那样,与初始模型相比,使用大 5 倍的模型可以提高性能。同样,12 倍大的模型进一步提高了预测性能。(中等模型可能没有经过很好的预训练,或者特定的微调配置对该模型效果不佳。) +6. **使用具有随机权重的模型与预训练权重(第 1 行与第 9 行)**:使用具有随机权重的模型产生的结果仅比使用预训练权重稍差 1.3%。 +7. **使用 LoRA(低阶适应)与训练所有层(第 10 行与第 5 行)**:保持模型冻结并添加可训练的 LoRA 层是训练所有模型参数的可行替代方案(请参阅[附录 E](../../appendix-E/01_main-chapter-code/appendix-E.ipynb)),甚至可以将性能提高 1%。 从使用 LoRA 时训练和验证准确率之间的差距降低约 1% 可以看出,这可能是由于过度拟合较少。此外,使用 LoRA 的速度也稍快一些,因为需要更新的参数较少。 +8. **将输入填充到完整上下文长度与最长训练示例(第 1 行与第 11 行)**:将输入填充到完整支持的上下文长度结果明显更差。 +9. **填充与无填充(第 1 行与第 12 行和第 13 行)**:`--no_padding` 选项禁用数据集中的填充,这需要使用批量大小 1 来训练模型,因为输入具有变量 长度。 这会带来更好的测试精度,但需要更长的训练时间。 在第 12 行中,我们另外启用了 8 个步骤的梯度累积,以实现与其他实验相同的批量大小,这有助于减少过度拟合并略微提高测试集的准确性。 +10. **禁用因果注意掩码(第 1 行与第 14 行)**:禁用多头注意模块中使用的因果注意掩码。这意味着所有Token都可以参加所有其他Token。 与带有因果掩模的 GPT 模型相比,模型精度略有提高。 +11. 
**忽略损失和反向传播中的填充索引(第 1 行与第 15 行)**:设置 `--ignore_index 50256` 会排除 PyTorch 中 `cross_entropy` 损失函数中的 `|endoftext|` 填充标记。 在这种情况下,它没有任何效果,因为我们替换了输出层,以便二元分类示例的标记 ID 为 0 或 1。 然而,当第 7 章中的指令微调模型时,此设置很有用。 \ No newline at end of file diff --git a/ch06/02_bonus_additional-experiments/additional-experiments.py b/ch06/02_bonus_additional-experiments/additional-experiments.py index 7492ed6..bcfc0b8 100644 --- a/ch06/02_bonus_additional-experiments/additional-experiments.py +++ b/ch06/02_bonus_additional-experiments/additional-experiments.py @@ -4,6 +4,7 @@ # Code: https://github.com/rasbt/LLMs-from-scratch import argparse +import math import os from pathlib import Path import time @@ -23,8 +24,8 @@ from previous_chapters import GPTModel, load_weights_into_gpt class LoRALayer(torch.nn.Module): def __init__(self, in_dim, out_dim, rank, alpha): super().__init__() - std_dev = 1 / torch.sqrt(torch.tensor(rank).float()) - self.A = torch.nn.Parameter(torch.randn(in_dim, rank) * std_dev) + self.A = torch.nn.Parameter(torch.empty(in_dim, rank)) + torch.nn.init.kaiming_uniform_(self.A, a=math.sqrt(5)) self.B = torch.nn.Parameter(torch.zeros(rank, out_dim)) self.alpha = alpha @@ -153,7 +154,7 @@ def instantiate_model(choose_model, load_weights): if not load_weights: torch.manual_seed(123) - model = GPTModel(BASE_CONFIG) + model = GPTModel(BASE_CONFIG, disable_causal_mask=args.disable_causal_mask) if load_weights: model_size = choose_model.split(" ")[-1].lstrip("(").rstrip(")") @@ -164,14 +165,16 @@ def instantiate_model(choose_model, load_weights): return model -def calc_loss_batch(input_batch, target_batch, model, device, trainable_token=-1): +def calc_loss_batch(input_batch, target_batch, model, device, + trainable_token_pos=-1, ignore_index=-100): input_batch, target_batch = input_batch.to(device), target_batch.to(device) - logits = model(input_batch)[:, trainable_token, :] # Logits of last output token - loss = torch.nn.functional.cross_entropy(logits, target_batch) + logits = model(input_batch)[:, trainable_token_pos, :] # Logits of last output token + loss = torch.nn.functional.cross_entropy(logits, target_batch, ignore_index=ignore_index) return loss -def calc_loss_loader(data_loader, model, device, num_batches=None, trainable_token=-1): +def calc_loss_loader(data_loader, model, device, + num_batches=None, trainable_token_pos=-1, ignore_index=-100): total_loss = 0. 
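    # Note: `ignore_index=-100` matches the default of torch.nn.functional.cross_entropy,
    # so the loss is unchanged unless a different value is passed in (for example,
    # `--ignore_index 50256` to exclude <|endoftext|> padding tokens, as described in the README above).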
if len(data_loader) == 0: return float("nan") @@ -183,7 +186,10 @@ def calc_loss_loader(data_loader, model, device, num_batches=None, trainable_tok num_batches = min(num_batches, len(data_loader)) for i, (input_batch, target_batch) in enumerate(data_loader): if i < num_batches: - loss = calc_loss_batch(input_batch, target_batch, model, device, trainable_token=trainable_token) + loss = calc_loss_batch( + input_batch, target_batch, model, device, + trainable_token_pos=trainable_token_pos, ignore_index=ignore_index + ) total_loss += loss.item() else: break @@ -191,7 +197,7 @@ def calc_loss_loader(data_loader, model, device, num_batches=None, trainable_tok @torch.no_grad() # Disable gradient tracking for efficiency -def calc_accuracy_loader(data_loader, model, device, num_batches=None, trainable_token=-1): +def calc_accuracy_loader(data_loader, model, device, num_batches=None, trainable_token_pos=-1): model.eval() correct_predictions, num_examples = 0, 0 @@ -202,7 +208,7 @@ def calc_accuracy_loader(data_loader, model, device, num_batches=None, trainable for i, (input_batch, target_batch) in enumerate(data_loader): if i < num_batches: input_batch, target_batch = input_batch.to(device), target_batch.to(device) - logits = model(input_batch)[:, trainable_token, :] # Logits of last output token + logits = model(input_batch)[:, trainable_token_pos, :] # Logits of last output token predicted_labels = torch.argmax(logits, dim=-1) num_examples += predicted_labels.shape[0] @@ -212,18 +218,25 @@ def calc_accuracy_loader(data_loader, model, device, num_batches=None, trainable return correct_predictions / num_examples -def evaluate_model(model, train_loader, val_loader, device, eval_iter, trainable_token=-1): +def evaluate_model(model, train_loader, val_loader, device, + eval_iter, trainable_token_pos=-1, ignore_index=-100): model.eval() with torch.no_grad(): - train_loss = calc_loss_loader(train_loader, model, device, num_batches=eval_iter, trainable_token=trainable_token) - val_loss = calc_loss_loader(val_loader, model, device, num_batches=eval_iter, trainable_token=trainable_token) + train_loss = calc_loss_loader( + train_loader, model, device, num_batches=eval_iter, + trainable_token_pos=trainable_token_pos, ignore_index=ignore_index + ) + val_loss = calc_loss_loader( + val_loader, model, device, num_batches=eval_iter, + trainable_token_pos=trainable_token_pos, ignore_index=ignore_index + ) model.train() return train_loss, val_loss def train_classifier_simple(model, train_loader, val_loader, optimizer, device, num_epochs, - eval_freq, eval_iter, tokenizer, max_steps=None, trainable_token=-1, - accumulation_steps=1): + eval_freq, eval_iter, tokenizer, max_steps=None, trainable_token_pos=-1, + accumulation_steps=1, ignore_index=-100): # Initialize lists to track losses and tokens seen train_losses, val_losses, train_accs, val_accs = [], [], [], [] examples_seen, global_step = 0, -1 @@ -233,7 +246,10 @@ def train_classifier_simple(model, train_loader, val_loader, optimizer, device, model.train() # Set model to training mode for batch_idx, (input_batch, target_batch) in enumerate(train_loader): - loss = calc_loss_batch(input_batch, target_batch, model, device, trainable_token=trainable_token) + loss = calc_loss_batch( + input_batch, target_batch, model, device, + trainable_token_pos=trainable_token_pos, ignore_index=ignore_index + ) # Use gradient accumulation if accumulation_steps > 1 # See https://sebastianraschka.com/blog/2023/llm-grad-accumulation.html @@ -253,7 +269,9 @@ def 
train_classifier_simple(model, train_loader, val_loader, optimizer, device, # Optional evaluation step if global_step % eval_freq == 0: train_loss, val_loss = evaluate_model( - model, train_loader, val_loader, device, eval_iter, trainable_token=trainable_token) + model, train_loader, val_loader, device, eval_iter, + trainable_token_pos=trainable_token_pos, ignore_index=ignore_index + ) train_losses.append(train_loss) val_losses.append(val_loss) print(f"Ep {epoch+1} (Step {global_step:06d}): " @@ -263,8 +281,8 @@ def train_classifier_simple(model, train_loader, val_loader, optimizer, device, break # New: Calculate accuracy after each epoch - train_accuracy = calc_accuracy_loader(train_loader, model, device, num_batches=eval_iter, trainable_token=trainable_token) - val_accuracy = calc_accuracy_loader(val_loader, model, device, num_batches=eval_iter, trainable_token=trainable_token) + train_accuracy = calc_accuracy_loader(train_loader, model, device, num_batches=eval_iter, trainable_token_pos=trainable_token_pos) + val_accuracy = calc_accuracy_loader(val_loader, model, device, num_batches=eval_iter, trainable_token_pos=trainable_token_pos) print(f"Training accuracy: {train_accuracy*100:.2f}% | ", end="") print(f"Validation accuracy: {val_accuracy*100:.2f}%") train_accs.append(train_accuracy) @@ -311,15 +329,15 @@ if __name__ == "__main__": type=str, default="last_block", help=( - "Which layers to train. Options: 'all', 'last_block', 'last_layer', 'lora'." + "Which layers to train. Options: 'all', 'last_block', 'last_two_blocks', 'last_layer', 'lora'." ) ) parser.add_argument( - "--trainable_token", + "--trainable_token_pos", type=str, default="last", help=( - "Which token to train. Options: 'first', 'last'." + "Which token position to train. Options: 'first', 'last'." ) ) parser.add_argument( @@ -386,14 +404,32 @@ if __name__ == "__main__": ) ) + parser.add_argument( + "--disable_causal_mask", + action='store_true', + default=False, + help=( + "Disables the causal attention mask." + ) + ) + + parser.add_argument( + "--ignore_index", + type=int, + default=-100, + help=( + "Sets the `ignore_index` in the cross entropy loss." + ) + ) + args = parser.parse_args() - if args.trainable_token == "first": - args.trainable_token = 0 - elif args.trainable_token == "last": - args.trainable_token = -1 + if args.trainable_token_pos == "first": + args.trainable_token_pos = 0 + elif args.trainable_token_pos == "last": + args.trainable_token_pos = -1 else: - raise ValueError("Invalid --trainable_token argument") + raise ValueError("Invalid --trainable_token_pos argument") ############################### # Load model @@ -426,11 +462,14 @@ if __name__ == "__main__": if args.trainable_layers == "last_layer": pass - elif args.trainable_layers == "last_block": + elif args.trainable_layers == "last_block" or args.trainable_layers == "last_two_blocks": for param in model.trf_blocks[-1].parameters(): param.requires_grad = True for param in model.final_norm.parameters(): param.requires_grad = True + if args.trainable_layers == "last_two_blocks": + for param in model.trf_blocks[-2].parameters(): + param.requires_grad = True elif args.trainable_layers == "all": for param in model.parameters(): param.requires_grad = True @@ -509,6 +548,12 @@ if __name__ == "__main__": drop_last=False, ) + assert train_dataset.max_length <= model.pos_emb.weight.shape[0], ( + f"Dataset length {train_dataset.max_length} exceeds model's context " + f"length {model.pos_emb.weight.shape[0]}. 
Reinitialize data sets with " + f"`max_length={model.pos_emb.weight.shape[0]}`" + ) + ############################### # Train model ############################### @@ -520,7 +565,7 @@ if __name__ == "__main__": train_losses, val_losses, train_accs, val_accs, examples_seen = train_classifier_simple( model, train_loader, val_loader, optimizer, device, num_epochs=args.num_epochs, eval_freq=50, eval_iter=5, - tokenizer=tokenizer, max_steps=None, trainable_token=args.trainable_token, + tokenizer=tokenizer, max_steps=None, trainable_token_pos=args.trainable_token_pos, accumulation_steps=args.accumulation_steps ) @@ -532,9 +577,9 @@ if __name__ == "__main__": # Evaluate model ############################### - train_accuracy = calc_accuracy_loader(train_loader, model, device, trainable_token=args.trainable_token) - val_accuracy = calc_accuracy_loader(val_loader, model, device, trainable_token=args.trainable_token) - test_accuracy = calc_accuracy_loader(test_loader, model, device, trainable_token=args.trainable_token) + train_accuracy = calc_accuracy_loader(train_loader, model, device, trainable_token_pos=args.trainable_token_pos) + val_accuracy = calc_accuracy_loader(val_loader, model, device, trainable_token_pos=args.trainable_token_pos) + test_accuracy = calc_accuracy_loader(test_loader, model, device, trainable_token_pos=args.trainable_token_pos) print(f"Training accuracy: {train_accuracy*100:.2f}%") print(f"Validation accuracy: {val_accuracy*100:.2f}%") diff --git a/ch06/02_bonus_additional-experiments/previous_chapters.py b/ch06/02_bonus_additional-experiments/previous_chapters.py index 8d6f827..66367c4 100644 --- a/ch06/02_bonus_additional-experiments/previous_chapters.py +++ b/ch06/02_bonus_additional-experiments/previous_chapters.py @@ -60,7 +60,7 @@ def create_dataloader_v1(txt, batch_size=4, max_length=256, # Chapter 3 ##################################### class MultiHeadAttention(nn.Module): - def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False): + def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False, disable_causal_mask=False): super().__init__() assert d_out % num_heads == 0, "d_out must be divisible by n_heads" @@ -73,7 +73,10 @@ class MultiHeadAttention(nn.Module): self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias) self.out_proj = nn.Linear(d_out, d_out) # Linear layer to combine head outputs self.dropout = nn.Dropout(dropout) - self.register_buffer('mask', torch.triu(torch.ones(context_length, context_length), diagonal=1)) + + if not disable_causal_mask: + self.register_buffer('mask', torch.triu(torch.ones(context_length, context_length), diagonal=1)) + self.disable_causal_mask = disable_causal_mask def forward(self, x): b, num_tokens, d_in = x.shape @@ -96,11 +99,12 @@ class MultiHeadAttention(nn.Module): # Compute scaled dot-product attention (aka self-attention) with a causal mask attn_scores = queries @ keys.transpose(2, 3) # Dot product for each head - # Original mask truncated to the number of tokens and converted to boolean - mask_bool = self.mask.bool()[:num_tokens, :num_tokens] + if not self.disable_causal_mask: + # Original mask truncated to the number of tokens and converted to boolean + mask_bool = self.mask.bool()[:num_tokens, :num_tokens] - # Use the mask to fill attention scores - attn_scores.masked_fill_(mask_bool, -torch.inf) + # Use the mask to fill attention scores + attn_scores.masked_fill_(mask_bool, -torch.inf) attn_weights = torch.softmax(attn_scores / keys.shape[-1]**0.5, dim=-1) attn_weights = 
self.dropout(attn_weights) @@ -157,7 +161,7 @@ class FeedForward(nn.Module): class TransformerBlock(nn.Module): - def __init__(self, cfg): + def __init__(self, cfg, disable_causal_mask=False): super().__init__() self.att = MultiHeadAttention( d_in=cfg["emb_dim"], @@ -165,7 +169,9 @@ class TransformerBlock(nn.Module): context_length=cfg["context_length"], num_heads=cfg["n_heads"], dropout=cfg["drop_rate"], - qkv_bias=cfg["qkv_bias"]) + qkv_bias=cfg["qkv_bias"], + disable_causal_mask=disable_causal_mask + ) self.ff = FeedForward(cfg) self.norm1 = LayerNorm(cfg["emb_dim"]) self.norm2 = LayerNorm(cfg["emb_dim"]) @@ -190,14 +196,14 @@ class TransformerBlock(nn.Module): class GPTModel(nn.Module): - def __init__(self, cfg): + def __init__(self, cfg, disable_causal_mask=False): super().__init__() self.tok_emb = nn.Embedding(cfg["vocab_size"], cfg["emb_dim"]) self.pos_emb = nn.Embedding(cfg["context_length"], cfg["emb_dim"]) self.drop_emb = nn.Dropout(cfg["drop_rate"]) self.trf_blocks = nn.Sequential( - *[TransformerBlock(cfg) for _ in range(cfg["n_layers"])]) + *[TransformerBlock(cfg, disable_causal_mask) for _ in range(cfg["n_layers"])]) self.final_norm = LayerNorm(cfg["emb_dim"]) self.out_head = nn.Linear(cfg["emb_dim"], cfg["vocab_size"], bias=False) @@ -310,7 +316,7 @@ def load_weights_into_gpt(gpt, params): gpt.out_head.weight = assign(gpt.out_head.weight, params["wte"]) -def generate(model, idx, max_new_tokens, context_size, temperature, top_k=None): +def generate(model, idx, max_new_tokens, context_size, temperature=0.0, top_k=None, eos_id=None): # For-loop is the same as before: Get logits, and only focus on last time step for _ in range(max_new_tokens): idx_cond = idx[:, -context_size:] @@ -339,6 +345,9 @@ def generate(model, idx, max_new_tokens, context_size, temperature, top_k=None): else: idx_next = torch.argmax(logits, dim=-1, keepdim=True) # (batch_size, 1) + if idx_next == eos_id: # Stop generating early if end-of-sequence token is encountered and eos_id is specified + break + # Same as before: append sampled index to the running sequence idx = torch.cat((idx, idx_next), dim=1) # (batch_size, num_tokens+1) diff --git a/ch06/03_bonus_imdb-classification/README.md b/ch06/03_bonus_imdb-classification/README.md new file mode 100644 index 0000000..fdb8ac1 --- /dev/null +++ b/ch06/03_bonus_imdb-classification/README.md @@ -0,0 +1,127 @@ +# 对 50k IMDB 电影评论的情感进行分类的附加实验 + +  +## Step 1: 安装依赖 + +通过下列命令安装额外的依赖项 + +```bash +pip install -r requirements-extra.txt +``` + +  +## Step 2: 下载数据集 + +这些代码使用 IMDb 中的 50k 电影评论来预测电影评论是正面还是负面。 ([数据集](https://ai.stanford.edu/~amaas/data/sentiment/)) + +运行以下代码来创建`train.csv`, `validation.csv`, 和 `test.csv`数据集: + +```bash +python download-prepare-dataset.py +``` + + +  +## Step 3: 运行模型 + +主要章节中使用的 124M GPT-2 模型,从预训练权重开始,仅训练最后一个 Transformer 块加上输出层: + +```bash +python train-gpt.py +``` + +``` +Ep 1 (Step 000000): Train loss 2.829, Val loss 3.433 +Ep 1 (Step 000050): Train loss 1.440, Val loss 1.669 +Ep 1 (Step 000100): Train loss 0.879, Val loss 1.037 +Ep 1 (Step 000150): Train loss 0.838, Val loss 0.866 +... +Ep 1 (Step 004300): Train loss 0.174, Val loss 0.202 +Ep 1 (Step 004350): Train loss 0.309, Val loss 0.190 +Training accuracy: 88.75% | Validation accuracy: 91.25% +Ep 2 (Step 004400): Train loss 0.263, Val loss 0.205 +Ep 2 (Step 004450): Train loss 0.226, Val loss 0.188 +... 
+Ep 2 (Step 008650): Train loss 0.189, Val loss 0.171 +Ep 2 (Step 008700): Train loss 0.225, Val loss 0.179 +Training accuracy: 85.00% | Validation accuracy: 90.62% +Ep 3 (Step 008750): Train loss 0.206, Val loss 0.187 +Ep 3 (Step 008800): Train loss 0.198, Val loss 0.172 +... +Training accuracy: 96.88% | Validation accuracy: 90.62% +Training completed in 18.62 minutes. + +Evaluating on the full datasets ... + +Training accuracy: 93.66% +Validation accuracy: 90.02% +Test accuracy: 89.96% +``` + +--- + +一个 66M 参数的编码器模型 [DistilBERT](https://arxiv.org/abs/1910.01108)(从 340M 参数 BERT 模型蒸馏而来),从预训练权重开始,仅训练最后一个 Transformer 块和输出层: + + +```bash +python train-bert-hf.py +``` + +``` +Ep 1 (Step 000000): Train loss 0.693, Val loss 0.697 +Ep 1 (Step 000050): Train loss 0.532, Val loss 0.596 +Ep 1 (Step 000100): Train loss 0.431, Val loss 0.446 +... +Ep 1 (Step 004300): Train loss 0.234, Val loss 0.351 +Ep 1 (Step 004350): Train loss 0.190, Val loss 0.222 +Training accuracy: 88.75% | Validation accuracy: 88.12% +Ep 2 (Step 004400): Train loss 0.258, Val loss 0.270 +Ep 2 (Step 004450): Train loss 0.204, Val loss 0.295 +... +Ep 2 (Step 008650): Train loss 0.088, Val loss 0.246 +Ep 2 (Step 008700): Train loss 0.084, Val loss 0.247 +Training accuracy: 98.75% | Validation accuracy: 90.62% +Ep 3 (Step 008750): Train loss 0.067, Val loss 0.209 +Ep 3 (Step 008800): Train loss 0.059, Val loss 0.256 +... +Ep 3 (Step 013050): Train loss 0.068, Val loss 0.280 +Ep 3 (Step 013100): Train loss 0.064, Val loss 0.306 +Training accuracy: 99.38% | Validation accuracy: 87.50% +Training completed in 16.70 minutes. + +Evaluating on the full datasets ... + +Training accuracy: 98.87% +Validation accuracy: 90.98% +Test accuracy: 90.81% +``` + +--- + +一个355M 参数量的编码器模型 [RoBERTa](https://arxiv.org/abs/1907.11692) ,从预训练权重开始,仅训练最后一个 Transformer 块和输出层: + + +```bash +python train-bert-hf.py --bert_model roberta +``` + +--- + +一个scikit-learn Logistic 回归模型作为基线。 + +```bash +python train-sklearn-logreg.py +``` + +``` +Dummy classifier: +Training Accuracy: 50.01% +Validation Accuracy: 50.14% +Test Accuracy: 49.91% + + +Logistic regression classifier: +Training Accuracy: 99.80% +Validation Accuracy: 88.60% +Test Accuracy: 88.84% +``` diff --git a/ch06/03_bonus_imdb-classification/download-prepare-dataset.py b/ch06/03_bonus_imdb-classification/download-prepare-dataset.py new file mode 100644 index 0000000..f5ab61c --- /dev/null +++ b/ch06/03_bonus_imdb-classification/download-prepare-dataset.py @@ -0,0 +1,84 @@ +# Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt). 
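# (The script added below downloads the 50k IMDB movie-review archive, extracts it,
#  shuffles the reviews, and writes train.csv / validation.csv / test.csv splits of
#  35,000 / 5,000 / 10,000 rows for the classification experiments in this folder.)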
+# Source for "Build a Large Language Model From Scratch" +# - https://www.manning.com/books/build-a-large-language-model-from-scratch +# Code: https://github.com/rasbt/LLMs-from-scratch + +import os +import sys +import tarfile +import time +import urllib.request +import pandas as pd + + +def reporthook(count, block_size, total_size): + global start_time + if count == 0: + start_time = time.time() + else: + duration = time.time() - start_time + progress_size = int(count * block_size) + percent = count * block_size * 100 / total_size + + speed = int(progress_size / (1024 * duration)) if duration else 0 + sys.stdout.write( + f"\r{int(percent)}% | {progress_size / (1024**2):.2f} MB " + f"| {speed:.2f} MB/s | {duration:.2f} sec elapsed" + ) + sys.stdout.flush() + + +def download_and_extract_dataset(dataset_url, target_file, directory): + if not os.path.exists(directory): + if os.path.exists(target_file): + os.remove(target_file) + urllib.request.urlretrieve(dataset_url, target_file, reporthook) + print("\nExtracting dataset ...") + with tarfile.open(target_file, "r:gz") as tar: + tar.extractall() + else: + print(f"Directory `{directory}` already exists. Skipping download.") + + +def load_dataset_to_dataframe(basepath="aclImdb", labels={"pos": 1, "neg": 0}): + data_frames = [] # List to store each chunk of DataFrame + for subset in ("test", "train"): + for label in ("pos", "neg"): + path = os.path.join(basepath, subset, label) + for file in sorted(os.listdir(path)): + with open(os.path.join(path, file), "r", encoding="utf-8") as infile: + # Create a DataFrame for each file and add it to the list + data_frames.append(pd.DataFrame({"text": [infile.read()], "label": [labels[label]]})) + # Concatenate all DataFrame chunks together + df = pd.concat(data_frames, ignore_index=True) + df = df.sample(frac=1, random_state=123).reset_index(drop=True) # Shuffle the DataFrame + return df + + +def partition_and_save(df, sizes=(35000, 5000, 10000)): + # Shuffle the DataFrame + df_shuffled = df.sample(frac=1, random_state=123).reset_index(drop=True) + + # Get indices for where to split the data + train_end = sizes[0] + val_end = sizes[0] + sizes[1] + + # Split the DataFrame + train = df_shuffled.iloc[:train_end] + val = df_shuffled.iloc[train_end:val_end] + test = df_shuffled.iloc[val_end:] + + # Save to CSV files + train.to_csv("train.csv", index=False) + val.to_csv("validation.csv", index=False) + test.to_csv("test.csv", index=False) + + +if __name__ == "__main__": + dataset_url = "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz" + print("Downloading dataset ...") + download_and_extract_dataset(dataset_url, "aclImdb_v1.tar.gz", "aclImdb") + print("Creating data frames ...") + df = load_dataset_to_dataframe() + print("Partitioning and saving data frames ...") + partition_and_save(df) diff --git a/ch06/03_bonus_imdb-classification/gpt_download.py b/ch06/03_bonus_imdb-classification/gpt_download.py new file mode 100644 index 0000000..0d695d2 --- /dev/null +++ b/ch06/03_bonus_imdb-classification/gpt_download.py @@ -0,0 +1,99 @@ +# Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt). 
+# Source for "Build a Large Language Model From Scratch" +# - https://www.manning.com/books/build-a-large-language-model-from-scratch +# Code: https://github.com/rasbt/LLMs-from-scratch + + +import os +import requests +import json +import numpy as np +import tensorflow as tf +from tqdm import tqdm + + +def download_and_load_gpt2(model_size, models_dir): + # Validate model size + allowed_sizes = ("124M", "355M", "774M", "1558M") + if model_size not in allowed_sizes: + raise ValueError(f"Model size not in {allowed_sizes}") + + # Define paths + model_dir = os.path.join(models_dir, model_size) + base_url = "https://openaipublic.blob.core.windows.net/gpt-2/models" + filenames = [ + "checkpoint", "encoder.json", "hparams.json", + "model.ckpt.data-00000-of-00001", "model.ckpt.index", + "model.ckpt.meta", "vocab.bpe" + ] + + # Download files + os.makedirs(model_dir, exist_ok=True) + for filename in filenames: + file_url = os.path.join(base_url, model_size, filename) + file_path = os.path.join(model_dir, filename) + download_file(file_url, file_path) + + # Load settings and params + tf_ckpt_path = tf.train.latest_checkpoint(model_dir) + settings = json.load(open(os.path.join(model_dir, "hparams.json"))) + params = load_gpt2_params_from_tf_ckpt(tf_ckpt_path, settings) + + return settings, params + + +def download_file(url, destination): + # Send a GET request to download the file in streaming mode + response = requests.get(url, stream=True) + + # Get the total file size from headers, defaulting to 0 if not present + file_size = int(response.headers.get("content-length", 0)) + + # Check if file exists and has the same size + if os.path.exists(destination): + file_size_local = os.path.getsize(destination) + if file_size == file_size_local: + print(f"File already exists and is up-to-date: {destination}") + return + + # Define the block size for reading the file + block_size = 1024 # 1 Kilobyte + + # Initialize the progress bar with total file size + progress_bar_description = url.split("/")[-1] # Extract filename from URL + with tqdm(total=file_size, unit="iB", unit_scale=True, desc=progress_bar_description) as progress_bar: + # Open the destination file in binary write mode + with open(destination, "wb") as file: + # Iterate over the file data in chunks + for chunk in response.iter_content(block_size): + progress_bar.update(len(chunk)) # Update progress bar + file.write(chunk) # Write the chunk to the file + + +def load_gpt2_params_from_tf_ckpt(ckpt_path, settings): + # Initialize parameters dictionary with empty blocks for each layer + params = {"blocks": [{} for _ in range(settings["n_layer"])]} + + # Iterate over each variable in the checkpoint + for name, _ in tf.train.list_variables(ckpt_path): + # Load the variable and remove singleton dimensions + variable_array = np.squeeze(tf.train.load_variable(ckpt_path, name)) + + # Process the variable name to extract relevant parts + variable_name_parts = name.split("/")[1:] # Skip the 'model/' prefix + + # Identify the target dictionary for the variable + target_dict = params + if variable_name_parts[0].startswith("h"): + layer_number = int(variable_name_parts[0][1:]) + target_dict = params["blocks"][layer_number] + + # Recursively access or create nested dictionaries + for key in variable_name_parts[1:-1]: + target_dict = target_dict.setdefault(key, {}) + + # Assign the variable array to the last key + last_key = variable_name_parts[-1] + target_dict[last_key] = variable_array + + return params diff --git 
a/ch06/03_bonus_imdb-classification/previous_chapters.py b/ch06/03_bonus_imdb-classification/previous_chapters.py new file mode 100644 index 0000000..4fc0f7e --- /dev/null +++ b/ch06/03_bonus_imdb-classification/previous_chapters.py @@ -0,0 +1,321 @@ +# Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt). +# Source for "Build a Large Language Model From Scratch" +# - https://www.manning.com/books/build-a-large-language-model-from-scratch +# Code: https://github.com/rasbt/LLMs-from-scratch +# +# This file collects all the relevant code that we covered thus far +# throughout Chapters 2-5. +# This file can be run as a standalone script. + +import numpy as np +import tiktoken +import torch +import torch.nn as nn +from torch.utils.data import Dataset, DataLoader + +##################################### +# Chapter 2 +##################################### + + +class GPTDatasetV1(Dataset): + def __init__(self, txt, tokenizer, max_length, stride): + self.tokenizer = tokenizer + self.input_ids = [] + self.target_ids = [] + + # Tokenize the entire text + token_ids = tokenizer.encode(txt) + + # Use a sliding window to chunk the book into overlapping sequences of max_length + for i in range(0, len(token_ids) - max_length, stride): + input_chunk = token_ids[i:i + max_length] + target_chunk = token_ids[i + 1: i + max_length + 1] + self.input_ids.append(torch.tensor(input_chunk)) + self.target_ids.append(torch.tensor(target_chunk)) + + def __len__(self): + return len(self.input_ids) + + def __getitem__(self, idx): + return self.input_ids[idx], self.target_ids[idx] + + +def create_dataloader_v1(txt, batch_size=4, max_length=256, + stride=128, shuffle=True, drop_last=True): + # Initialize the tokenizer + tokenizer = tiktoken.get_encoding("gpt2") + + # Create dataset + dataset = GPTDatasetV1(txt, tokenizer, max_length, stride) + + # Create dataloader + dataloader = DataLoader( + dataset, batch_size=batch_size, shuffle=shuffle, drop_last=drop_last) + + return dataloader + + +##################################### +# Chapter 3 +##################################### +class MultiHeadAttention(nn.Module): + def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False): + super().__init__() + assert d_out % num_heads == 0, "d_out must be divisible by n_heads" + + self.d_out = d_out + self.num_heads = num_heads + self.head_dim = d_out // num_heads # Reduce the projection dim to match desired output dim + + self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias) + self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias) + self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias) + self.out_proj = nn.Linear(d_out, d_out) # Linear layer to combine head outputs + self.dropout = nn.Dropout(dropout) + self.register_buffer('mask', torch.triu(torch.ones(context_length, context_length), diagonal=1)) + + def forward(self, x): + b, num_tokens, d_in = x.shape + + keys = self.W_key(x) # Shape: (b, num_tokens, d_out) + queries = self.W_query(x) + values = self.W_value(x) + + # We implicitly split the matrix by adding a `num_heads` dimension + # Unroll last dim: (b, num_tokens, d_out) -> (b, num_tokens, num_heads, head_dim) + keys = keys.view(b, num_tokens, self.num_heads, self.head_dim) + values = values.view(b, num_tokens, self.num_heads, self.head_dim) + queries = queries.view(b, num_tokens, self.num_heads, self.head_dim) + + # Transpose: (b, num_tokens, num_heads, head_dim) -> (b, num_heads, num_tokens, head_dim) + keys = keys.transpose(1, 2) + queries = queries.transpose(1, 2) + values = 
values.transpose(1, 2) + + # Compute scaled dot-product attention (aka self-attention) with a causal mask + attn_scores = queries @ keys.transpose(2, 3) # Dot product for each head + + # Original mask truncated to the number of tokens and converted to boolean + mask_bool = self.mask.bool()[:num_tokens, :num_tokens] + + # Use the mask to fill attention scores + attn_scores.masked_fill_(mask_bool, -torch.inf) + + attn_weights = torch.softmax(attn_scores / keys.shape[-1]**0.5, dim=-1) + attn_weights = self.dropout(attn_weights) + + # Shape: (b, num_tokens, num_heads, head_dim) + context_vec = (attn_weights @ values).transpose(1, 2) + + # Combine heads, where self.d_out = self.num_heads * self.head_dim + context_vec = context_vec.reshape(b, num_tokens, self.d_out) + context_vec = self.out_proj(context_vec) # optional projection + + return context_vec + + +##################################### +# Chapter 4 +##################################### +class LayerNorm(nn.Module): + def __init__(self, emb_dim): + super().__init__() + self.eps = 1e-5 + self.scale = nn.Parameter(torch.ones(emb_dim)) + self.shift = nn.Parameter(torch.zeros(emb_dim)) + + def forward(self, x): + mean = x.mean(dim=-1, keepdim=True) + var = x.var(dim=-1, keepdim=True, unbiased=False) + norm_x = (x - mean) / torch.sqrt(var + self.eps) + return self.scale * norm_x + self.shift + + +class GELU(nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x): + return 0.5 * x * (1 + torch.tanh( + torch.sqrt(torch.tensor(2.0 / torch.pi)) * + (x + 0.044715 * torch.pow(x, 3)) + )) + + +class FeedForward(nn.Module): + def __init__(self, cfg): + super().__init__() + self.layers = nn.Sequential( + nn.Linear(cfg["emb_dim"], 4 * cfg["emb_dim"]), + GELU(), + nn.Linear(4 * cfg["emb_dim"], cfg["emb_dim"]), + ) + + def forward(self, x): + return self.layers(x) + + +class TransformerBlock(nn.Module): + def __init__(self, cfg): + super().__init__() + self.att = MultiHeadAttention( + d_in=cfg["emb_dim"], + d_out=cfg["emb_dim"], + context_length=cfg["context_length"], + num_heads=cfg["n_heads"], + dropout=cfg["drop_rate"], + qkv_bias=cfg["qkv_bias"]) + self.ff = FeedForward(cfg) + self.norm1 = LayerNorm(cfg["emb_dim"]) + self.norm2 = LayerNorm(cfg["emb_dim"]) + self.drop_resid = nn.Dropout(cfg["drop_rate"]) + + def forward(self, x): + # Shortcut connection for attention block + shortcut = x + x = self.norm1(x) + x = self.att(x) # Shape [batch_size, num_tokens, emb_size] + x = self.drop_resid(x) + x = x + shortcut # Add the original input back + + # Shortcut connection for feed-forward block + shortcut = x + x = self.norm2(x) + x = self.ff(x) + x = self.drop_resid(x) + x = x + shortcut # Add the original input back + + return x + + +class GPTModel(nn.Module): + def __init__(self, cfg): + super().__init__() + self.tok_emb = nn.Embedding(cfg["vocab_size"], cfg["emb_dim"]) + self.pos_emb = nn.Embedding(cfg["context_length"], cfg["emb_dim"]) + self.drop_emb = nn.Dropout(cfg["drop_rate"]) + + self.trf_blocks = nn.Sequential( + *[TransformerBlock(cfg) for _ in range(cfg["n_layers"])]) + + self.final_norm = LayerNorm(cfg["emb_dim"]) + self.out_head = nn.Linear(cfg["emb_dim"], cfg["vocab_size"], bias=False) + + def forward(self, in_idx): + batch_size, seq_len = in_idx.shape + tok_embeds = self.tok_emb(in_idx) + pos_embeds = self.pos_emb(torch.arange(seq_len, device=in_idx.device)) + x = tok_embeds + pos_embeds # Shape [batch_size, num_tokens, emb_size] + x = self.drop_emb(x) + x = self.trf_blocks(x) + x = self.final_norm(x) + logits = 
self.out_head(x) + return logits + + +def generate_text_simple(model, idx, max_new_tokens, context_size): + # idx is (B, T) array of indices in the current context + for _ in range(max_new_tokens): + + # Crop current context if it exceeds the supported context size + # E.g., if LLM supports only 5 tokens, and the context size is 10 + # then only the last 5 tokens are used as context + idx_cond = idx[:, -context_size:] + + # Get the predictions + with torch.no_grad(): + logits = model(idx_cond) + + # Focus only on the last time step + # (batch, n_token, vocab_size) becomes (batch, vocab_size) + logits = logits[:, -1, :] + + # Get the idx of the vocab entry with the highest logits value + idx_next = torch.argmax(logits, dim=-1, keepdim=True) # (batch, 1) + + # Append sampled index to the running sequence + idx = torch.cat((idx, idx_next), dim=1) # (batch, n_tokens+1) + + return idx + + +##################################### +# Chapter 5 +##################################### +def assign(left, right): + if left.shape != right.shape: + raise ValueError(f"Shape mismatch. Left: {left.shape}, Right: {right.shape}") + return torch.nn.Parameter(torch.tensor(right)) + + +def load_weights_into_gpt(gpt, params): + gpt.pos_emb.weight = assign(gpt.pos_emb.weight, params['wpe']) + gpt.tok_emb.weight = assign(gpt.tok_emb.weight, params['wte']) + + for b in range(len(params["blocks"])): + q_w, k_w, v_w = np.split( + (params["blocks"][b]["attn"]["c_attn"])["w"], 3, axis=-1) + gpt.trf_blocks[b].att.W_query.weight = assign( + gpt.trf_blocks[b].att.W_query.weight, q_w.T) + gpt.trf_blocks[b].att.W_key.weight = assign( + gpt.trf_blocks[b].att.W_key.weight, k_w.T) + gpt.trf_blocks[b].att.W_value.weight = assign( + gpt.trf_blocks[b].att.W_value.weight, v_w.T) + + q_b, k_b, v_b = np.split( + (params["blocks"][b]["attn"]["c_attn"])["b"], 3, axis=-1) + gpt.trf_blocks[b].att.W_query.bias = assign( + gpt.trf_blocks[b].att.W_query.bias, q_b) + gpt.trf_blocks[b].att.W_key.bias = assign( + gpt.trf_blocks[b].att.W_key.bias, k_b) + gpt.trf_blocks[b].att.W_value.bias = assign( + gpt.trf_blocks[b].att.W_value.bias, v_b) + + gpt.trf_blocks[b].att.out_proj.weight = assign( + gpt.trf_blocks[b].att.out_proj.weight, + params["blocks"][b]["attn"]["c_proj"]["w"].T) + gpt.trf_blocks[b].att.out_proj.bias = assign( + gpt.trf_blocks[b].att.out_proj.bias, + params["blocks"][b]["attn"]["c_proj"]["b"]) + + gpt.trf_blocks[b].ff.layers[0].weight = assign( + gpt.trf_blocks[b].ff.layers[0].weight, + params["blocks"][b]["mlp"]["c_fc"]["w"].T) + gpt.trf_blocks[b].ff.layers[0].bias = assign( + gpt.trf_blocks[b].ff.layers[0].bias, + params["blocks"][b]["mlp"]["c_fc"]["b"]) + gpt.trf_blocks[b].ff.layers[2].weight = assign( + gpt.trf_blocks[b].ff.layers[2].weight, + params["blocks"][b]["mlp"]["c_proj"]["w"].T) + gpt.trf_blocks[b].ff.layers[2].bias = assign( + gpt.trf_blocks[b].ff.layers[2].bias, + params["blocks"][b]["mlp"]["c_proj"]["b"]) + + gpt.trf_blocks[b].norm1.scale = assign( + gpt.trf_blocks[b].norm1.scale, + params["blocks"][b]["ln_1"]["g"]) + gpt.trf_blocks[b].norm1.shift = assign( + gpt.trf_blocks[b].norm1.shift, + params["blocks"][b]["ln_1"]["b"]) + gpt.trf_blocks[b].norm2.scale = assign( + gpt.trf_blocks[b].norm2.scale, + params["blocks"][b]["ln_2"]["g"]) + gpt.trf_blocks[b].norm2.shift = assign( + gpt.trf_blocks[b].norm2.shift, + params["blocks"][b]["ln_2"]["b"]) + + gpt.final_norm.scale = assign(gpt.final_norm.scale, params["g"]) + gpt.final_norm.shift = assign(gpt.final_norm.shift, params["b"]) + gpt.out_head.weight = 
assign(gpt.out_head.weight, params["wte"]) + + +def text_to_token_ids(text, tokenizer): + encoded = tokenizer.encode(text, allowed_special={'<|endoftext|>'}) + encoded_tensor = torch.tensor(encoded).unsqueeze(0) # add batch dimension + return encoded_tensor + + +def token_ids_to_text(token_ids, tokenizer): + flat = token_ids.squeeze(0) # remove batch dimension + return tokenizer.decode(flat.tolist()) diff --git a/ch06/03_bonus_imdb-classification/requirements-extra.txt b/ch06/03_bonus_imdb-classification/requirements-extra.txt new file mode 100644 index 0000000..7ab8694 --- /dev/null +++ b/ch06/03_bonus_imdb-classification/requirements-extra.txt @@ -0,0 +1,2 @@ +transformers>=4.33.2 +scikit-learn>=1.3.0 \ No newline at end of file diff --git a/ch06/03_bonus_imdb-classification/sklearn-baseline.ipynb b/ch06/03_bonus_imdb-classification/sklearn-baseline.ipynb new file mode 100644 index 0000000..dd25829 --- /dev/null +++ b/ch06/03_bonus_imdb-classification/sklearn-baseline.ipynb @@ -0,0 +1,277 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "8968a681-2db1-4840-bb73-7d6c95986825", + "metadata": {}, + "source": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "
\n", + "\n", + "Supplementary code for the Build a Large Language Model From Scratch book by Sebastian Raschka
\n", + "
Code repository: https://github.com/rasbt/LLMs-from-scratch\n", + "
\n", + "
\n", + "\n", + "
" + ] + }, + { + "cell_type": "markdown", + "id": "8b6e1cdd-b14e-4368-bdbb-9bf7ab821791", + "metadata": {}, + "source": [ + "# Scikit-learn Logistic 回归模型" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "c2a72242-6197-4bef-aa05-696a152350d5", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "100% | 80.23 MB | 4.37 MB/s | 18.38 sec elapsed" + ] + } + ], + "source": [ + "!python download-prepare-dataset.py" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "69f32433-e19c-4066-b806-8f30b408107f", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "train_df = pd.read_csv(\"train.csv\")\n", + "val_df = pd.read_csv(\"validation.csv\")\n", + "test_df = pd.read_csv(\"test.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "0808b212-fe91-48d9-80b8-55519f8835d5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
textlabel
0The only reason I saw \"Shakedown\" was that it ...0
1This is absolute drivel, designed to shock and...0
2Lots of scenes and dialogue are flat-out goofy...1
3** and 1/2 stars out of **** Lifeforce is one ...1
4I learned a thing: you have to take this film ...1
\n", + "
" + ], + "text/plain": [ + " text label\n", + "0 The only reason I saw \"Shakedown\" was that it ... 0\n", + "1 This is absolute drivel, designed to shock and... 0\n", + "2 Lots of scenes and dialogue are flat-out goofy... 1\n", + "3 ** and 1/2 stars out of **** Lifeforce is one ... 1\n", + "4 I learned a thing: you have to take this film ... 1" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "fae87bc1-14ca-4f89-8e12-49f77b0ec00d", + "metadata": {}, + "source": [ + "## Scikit-learn baseline" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "180318b7-de18-4b05-b84a-ba97c72b9d8e", + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.feature_extraction.text import CountVectorizer\n", + "from sklearn.linear_model import LogisticRegression\n", + "from sklearn.metrics import accuracy_score" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "25090b7c-f516-4be2-8083-3a7187fe4635", + "metadata": {}, + "outputs": [], + "source": [ + "vectorizer = CountVectorizer()\n", + "\n", + "X_train = vectorizer.fit_transform(train_df[\"text\"])\n", + "X_val = vectorizer.transform(val_df[\"text\"])\n", + "X_test = vectorizer.transform(test_df[\"text\"])\n", + "\n", + "y_train, y_val, y_test = train_df[\"label\"], val_df[\"label\"], test_df[\"label\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "0247de3a-88f0-4b9c-becd-157baf3acf49", + "metadata": {}, + "outputs": [], + "source": [ + "def eval(model, X_train, y_train, X_val, y_val, X_test, y_test):\n", + " # Making predictions\n", + " y_pred_train = model.predict(X_train)\n", + " y_pred_val = model.predict(X_val)\n", + " y_pred_test = model.predict(X_test)\n", + " \n", + " # Calculating accuracy and balanced accuracy\n", + " accuracy_train = accuracy_score(y_train, y_pred_train)\n", + " balanced_accuracy_train = balanced_accuracy_score(y_train, y_pred_train)\n", + " \n", + " accuracy_val = accuracy_score(y_val, y_pred_val)\n", + " balanced_accuracy_val = balanced_accuracy_score(y_val, y_pred_val)\n", + "\n", + " accuracy_test = accuracy_score(y_test, y_pred_test)\n", + " balanced_accuracy_test = balanced_accuracy_score(y_test, y_pred_test)\n", + " \n", + " # Printing the results\n", + " print(f\"Training Accuracy: {accuracy_train*100:.2f}%\")\n", + " print(f\"Validation Accuracy: {accuracy_val*100:.2f}%\")\n", + " print(f\"Test Accuracy: {accuracy_test*100:.2f}%\")" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "c29c6dfc-f72d-40ab-8cb5-783aad1a15ab", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Training Accuracy: 50.01%\n", + "Validation Accuracy: 50.14%\n", + "Test Accuracy: 49.91%\n" + ] + } + ], + "source": [ + "from sklearn.dummy import DummyClassifier\n", + "\n", + "# Create a dummy classifier with the strategy to predict the most frequent class\n", + "dummy_clf = DummyClassifier(strategy=\"most_frequent\")\n", + "dummy_clf.fit(X_train, y_train)\n", + "\n", + "eval(dummy_clf, X_train, y_train, X_val, y_val, X_test, y_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "088a8a3a-3b74-4d10-a51b-cb662569ae39", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Training Accuracy: 99.80%\n", + "Validation Accuracy: 88.62%\n", + "Test Accuracy: 88.85%\n" + ] + } + ], + "source": [ + "model = 
LogisticRegression(max_iter=1000)\n",
+    "model.fit(X_train, y_train)\n",
+    "eval(model, X_train, y_train, X_val, y_val, X_test, y_test)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.4"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/ch06/03_bonus_imdb-classification/train-bert-hf.py b/ch06/03_bonus_imdb-classification/train-bert-hf.py
new file mode 100644
index 0000000..8d9c796
--- /dev/null
+++ b/ch06/03_bonus_imdb-classification/train-bert-hf.py
@@ -0,0 +1,301 @@
+# Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt).
+# Source for "Build a Large Language Model From Scratch"
+# - https://www.manning.com/books/build-a-large-language-model-from-scratch
+# Code: https://github.com/rasbt/LLMs-from-scratch
+
+import argparse
+from pathlib import Path
+import time
+
+import pandas as pd
+import torch
+from torch.utils.data import DataLoader
+from torch.utils.data import Dataset
+
+from transformers import AutoTokenizer, AutoModelForSequenceClassification
+
+
+class IMDBDataset(Dataset):
+    def __init__(self, csv_file, tokenizer, max_length=None, pad_token_id=50256):
+        self.data = pd.read_csv(csv_file)
+        self.max_length = max_length if max_length is not None else self._longest_encoded_length(tokenizer)
+
+        # Pre-tokenize texts
+        self.encoded_texts = [
+            tokenizer.encode(text)[:self.max_length]
+            for text in self.data["text"]
+        ]
+        # Pad sequences to the longest sequence
+        self.encoded_texts = [
+            et + [pad_token_id] * (self.max_length - len(et))
+            for et in self.encoded_texts
+        ]
+
+    def __getitem__(self, index):
+        encoded = self.encoded_texts[index]
+        label = self.data.iloc[index]["label"]
+        return torch.tensor(encoded, dtype=torch.long), torch.tensor(label, dtype=torch.long)
+
+    def __len__(self):
+        return len(self.data)
+
+    def _longest_encoded_length(self, tokenizer):
+        max_length = 0
+        for text in self.data["text"]:
+            encoded_length = len(tokenizer.encode(text))
+            if encoded_length > max_length:
+                max_length = encoded_length
+        return max_length
+
+
+def calc_loss_batch(input_batch, target_batch, model, device):
+    input_batch, target_batch = input_batch.to(device), target_batch.to(device)
+    # logits = model(input_batch)[:, -1, :]  # Logits of last output token
+    logits = model(input_batch).logits
+    loss = torch.nn.functional.cross_entropy(logits, target_batch)
+    return loss
+
+
+# Same as in chapter 5
+def calc_loss_loader(data_loader, model, device, num_batches=None):
+    total_loss = 0.
+ if num_batches is None: + num_batches = len(data_loader) + else: + # Reduce the number of batches to match the total number of batches in the data loader + # if num_batches exceeds the number of batches in the data loader + num_batches = min(num_batches, len(data_loader)) + for i, (input_batch, target_batch) in enumerate(data_loader): + if i < num_batches: + loss = calc_loss_batch(input_batch, target_batch, model, device) + total_loss += loss.item() + else: + break + return total_loss / num_batches + + +@torch.no_grad() # Disable gradient tracking for efficiency +def calc_accuracy_loader(data_loader, model, device, num_batches=None): + model.eval() + correct_predictions, num_examples = 0, 0 + + if num_batches is None: + num_batches = len(data_loader) + else: + num_batches = min(num_batches, len(data_loader)) + for i, (input_batch, target_batch) in enumerate(data_loader): + if i < num_batches: + input_batch, target_batch = input_batch.to(device), target_batch.to(device) + # logits = model(input_batch)[:, -1, :] # Logits of last output token + logits = model(input_batch).logits + predicted_labels = torch.argmax(logits, dim=1) + num_examples += predicted_labels.shape[0] + correct_predictions += (predicted_labels == target_batch).sum().item() + else: + break + return correct_predictions / num_examples + + +def evaluate_model(model, train_loader, val_loader, device, eval_iter): + model.eval() + with torch.no_grad(): + train_loss = calc_loss_loader(train_loader, model, device, num_batches=eval_iter) + val_loss = calc_loss_loader(val_loader, model, device, num_batches=eval_iter) + model.train() + return train_loss, val_loss + + +def train_classifier_simple(model, train_loader, val_loader, optimizer, device, num_epochs, + eval_freq, eval_iter, tokenizer, max_steps=None): + # Initialize lists to track losses and tokens seen + train_losses, val_losses, train_accs, val_accs = [], [], [], [] + examples_seen, global_step = 0, -1 + + # Main training loop + for epoch in range(num_epochs): + model.train() # Set model to training mode + + for input_batch, target_batch in train_loader: + optimizer.zero_grad() # Reset loss gradients from previous epoch + loss = calc_loss_batch(input_batch, target_batch, model, device) + loss.backward() # Calculate loss gradients + optimizer.step() # Update model weights using loss gradients + examples_seen += input_batch.shape[0] # New: track examples instead of tokens + global_step += 1 + + # Optional evaluation step + if global_step % eval_freq == 0: + train_loss, val_loss = evaluate_model( + model, train_loader, val_loader, device, eval_iter) + train_losses.append(train_loss) + val_losses.append(val_loss) + print(f"Ep {epoch+1} (Step {global_step:06d}): " + f"Train loss {train_loss:.3f}, Val loss {val_loss:.3f}") + + if max_steps is not None and global_step > max_steps: + break + + # New: Calculate accuracy after each epoch + train_accuracy = calc_accuracy_loader(train_loader, model, device, num_batches=eval_iter) + val_accuracy = calc_accuracy_loader(val_loader, model, device, num_batches=eval_iter) + print(f"Training accuracy: {train_accuracy*100:.2f}% | ", end="") + print(f"Validation accuracy: {val_accuracy*100:.2f}%") + train_accs.append(train_accuracy) + val_accs.append(val_accuracy) + + if max_steps is not None and global_step > max_steps: + break + + return train_losses, val_losses, train_accs, val_accs, examples_seen + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument( + "--trainable_layers", + type=str, + 
default="last_block", + help=( + "Which layers to train. Options: 'all', 'last_block', 'last_layer'." + ) + ) + parser.add_argument( + "--bert_model", + type=str, + default="distilbert", + help=( + "Which layers to train. Options: 'all', 'last_block', 'last_layer'." + ) + ) + args = parser.parse_args() + + ############################### + # Load model + ############################### + + torch.manual_seed(123) + if args.bert_model == "distilbert": + + model = AutoModelForSequenceClassification.from_pretrained( + "distilbert-base-uncased", num_labels=2 + ) + model.out_head = torch.nn.Linear(in_features=768, out_features=2) + + if args.trainable_layers == "last_layer": + pass + elif args.trainable_layers == "last_block": + for param in model.pre_classifier.parameters(): + param.requires_grad = True + for param in model.distilbert.transformer.layer[-1].parameters(): + param.requires_grad = True + elif args.trainable_layers == "all": + for param in model.parameters(): + param.requires_grad = True + else: + raise ValueError("Invalid --trainable_layers argument.") + + tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased") + + elif args.bert_model == "roberta": + + model = AutoModelForSequenceClassification.from_pretrained( + "FacebookAI/roberta-large", num_labels=2 + ) + model.classifier.out_proj = torch.nn.Linear(in_features=1024, out_features=2) + + if args.trainable_layers == "last_layer": + pass + elif args.trainable_layers == "last_block": + for param in model.classifier.parameters(): + param.requires_grad = True + for param in model.roberta.encoder.layer[-1].parameters(): + param.requires_grad = True + elif args.trainable_layers == "all": + for param in model.parameters(): + param.requires_grad = True + else: + raise ValueError("Invalid --trainable_layers argument.") + + tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-large") + + else: + raise ValueError("Selected --bert_model not supported.") + + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + model.to(device) + model.eval() + + ############################### + # Instantiate dataloaders + ############################### + + pad_token_id = tokenizer.encode(tokenizer.pad_token) + + base_path = Path(".") + + train_dataset = IMDBDataset(base_path / "train.csv", max_length=256, tokenizer=tokenizer, pad_token_id=pad_token_id) + val_dataset = IMDBDataset(base_path / "validation.csv", max_length=256, tokenizer=tokenizer, pad_token_id=pad_token_id) + test_dataset = IMDBDataset(base_path / "test.csv", max_length=256, tokenizer=tokenizer, pad_token_id=pad_token_id) + + num_workers = 0 + batch_size = 8 + + train_loader = DataLoader( + dataset=train_dataset, + batch_size=batch_size, + shuffle=True, + num_workers=num_workers, + drop_last=True, + ) + + val_loader = DataLoader( + dataset=val_dataset, + batch_size=batch_size, + num_workers=num_workers, + drop_last=False, + ) + + test_loader = DataLoader( + dataset=test_dataset, + batch_size=batch_size, + num_workers=num_workers, + drop_last=False, + ) + + ############################### + # Train model + ############################### + + start_time = time.time() + torch.manual_seed(123) + optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, weight_decay=0.1) + + num_epochs = 3 + train_losses, val_losses, train_accs, val_accs, examples_seen = train_classifier_simple( + model, train_loader, val_loader, optimizer, device, + num_epochs=num_epochs, eval_freq=50, eval_iter=20, + tokenizer=tokenizer, max_steps=None + ) + + end_time = time.time() + 
execution_time_minutes = (end_time - start_time) / 60 + print(f"Training completed in {execution_time_minutes:.2f} minutes.") + + ############################### + # Evaluate model + ############################### + + print("\nEvaluating on the full datasets ...\n") + + train_accuracy = calc_accuracy_loader(train_loader, model, device) + val_accuracy = calc_accuracy_loader(val_loader, model, device) + test_accuracy = calc_accuracy_loader(test_loader, model, device) + + print(f"Training accuracy: {train_accuracy*100:.2f}%") + print(f"Validation accuracy: {val_accuracy*100:.2f}%") + print(f"Test accuracy: {test_accuracy*100:.2f}%") diff --git a/ch06/03_bonus_imdb-classification/train-gpt.py b/ch06/03_bonus_imdb-classification/train-gpt.py new file mode 100644 index 0000000..2f47ece --- /dev/null +++ b/ch06/03_bonus_imdb-classification/train-gpt.py @@ -0,0 +1,366 @@ +# Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt). +# Source for "Build a Large Language Model From Scratch" +# - https://www.manning.com/books/build-a-large-language-model-from-scratch +# Code: https://github.com/rasbt/LLMs-from-scratch + +import argparse +from pathlib import Path +import time + +import pandas as pd +import tiktoken +import torch +from torch.utils.data import DataLoader +from torch.utils.data import Dataset + +from gpt_download import download_and_load_gpt2 +from previous_chapters import GPTModel, load_weights_into_gpt + + +class IMDBDataset(Dataset): + def __init__(self, csv_file, tokenizer, max_length=None, pad_token_id=50256): + self.data = pd.read_csv(csv_file) + self.max_length = max_length if max_length is not None else self._longest_encoded_length(tokenizer) + + # Pre-tokenize texts + self.encoded_texts = [ + tokenizer.encode(text)[:self.max_length] + for text in self.data["text"] + ] + # Pad sequences to the longest sequence + self.encoded_texts = [ + et + [pad_token_id] * (self.max_length - len(et)) + for et in self.encoded_texts + ] + + def __getitem__(self, index): + encoded = self.encoded_texts[index] + label = self.data.iloc[index]["label"] + return torch.tensor(encoded, dtype=torch.long), torch.tensor(label, dtype=torch.long) + + def __len__(self): + return len(self.data) + + def _longest_encoded_length(self, tokenizer): + max_length = 0 + for text in self.data["text"]: + encoded_length = len(tokenizer.encode(text)) + if encoded_length > max_length: + max_length = encoded_length + return max_length + + +def instantiate_model(choose_model, load_weights): + + BASE_CONFIG = { + "vocab_size": 50257, # Vocabulary size + "context_length": 1024, # Context length + "drop_rate": 0.0, # Dropout rate + "qkv_bias": True # Query-key-value bias + } + + model_configs = { + "gpt2-small (124M)": {"emb_dim": 768, "n_layers": 12, "n_heads": 12}, + "gpt2-medium (355M)": {"emb_dim": 1024, "n_layers": 24, "n_heads": 16}, + "gpt2-large (774M)": {"emb_dim": 1280, "n_layers": 36, "n_heads": 20}, + "gpt2-xl (1558M)": {"emb_dim": 1600, "n_layers": 48, "n_heads": 25}, + } + + BASE_CONFIG.update(model_configs[choose_model]) + + if not load_weights: + torch.manual_seed(123) + model = GPTModel(BASE_CONFIG) + + if load_weights: + model_size = choose_model.split(" ")[-1].lstrip("(").rstrip(")") + settings, params = download_and_load_gpt2(model_size=model_size, models_dir="gpt2") + load_weights_into_gpt(model, params) + + model.eval() + return model + + +def calc_loss_batch(input_batch, target_batch, model, device, trainable_token=-1): + input_batch, target_batch = input_batch.to(device), 
target_batch.to(device) + logits = model(input_batch)[:, trainable_token, :] # Logits of last output token + loss = torch.nn.functional.cross_entropy(logits, target_batch) + return loss + + +def calc_loss_loader(data_loader, model, device, num_batches=None, trainable_token=-1): + total_loss = 0. + if len(data_loader) == 0: + return float("nan") + elif num_batches is None: + num_batches = len(data_loader) + else: + # Reduce the number of batches to match the total number of batches in the data loader + # if num_batches exceeds the number of batches in the data loader + num_batches = min(num_batches, len(data_loader)) + for i, (input_batch, target_batch) in enumerate(data_loader): + if i < num_batches: + loss = calc_loss_batch(input_batch, target_batch, model, device, trainable_token=trainable_token) + total_loss += loss.item() + else: + break + return total_loss / num_batches + + +@torch.no_grad() # Disable gradient tracking for efficiency +def calc_accuracy_loader(data_loader, model, device, num_batches=None, trainable_token=-1): + model.eval() + correct_predictions, num_examples = 0, 0 + + if num_batches is None: + num_batches = len(data_loader) + else: + num_batches = min(num_batches, len(data_loader)) + for i, (input_batch, target_batch) in enumerate(data_loader): + if i < num_batches: + input_batch, target_batch = input_batch.to(device), target_batch.to(device) + logits = model(input_batch)[:, trainable_token, :] # Logits of last output token + predicted_labels = torch.argmax(logits, dim=-1) + + num_examples += predicted_labels.shape[0] + correct_predictions += (predicted_labels == target_batch).sum().item() + else: + break + return correct_predictions / num_examples + + +def evaluate_model(model, train_loader, val_loader, device, eval_iter, trainable_token=-1): + model.eval() + with torch.no_grad(): + train_loss = calc_loss_loader(train_loader, model, device, num_batches=eval_iter, trainable_token=trainable_token) + val_loss = calc_loss_loader(val_loader, model, device, num_batches=eval_iter, trainable_token=trainable_token) + model.train() + return train_loss, val_loss + + +def train_classifier_simple(model, train_loader, val_loader, optimizer, device, num_epochs, + eval_freq, eval_iter, tokenizer, max_steps=None, trainable_token=-1): + # Initialize lists to track losses and tokens seen + train_losses, val_losses, train_accs, val_accs = [], [], [], [] + examples_seen, global_step = 0, -1 + + # Main training loop + for epoch in range(num_epochs): + model.train() # Set model to training mode + + for input_batch, target_batch in train_loader: + optimizer.zero_grad() # Reset loss gradients from previous epoch + loss = calc_loss_batch(input_batch, target_batch, model, device, trainable_token=trainable_token) + loss.backward() # Calculate loss gradients + optimizer.step() # Update model weights using loss gradients + examples_seen += input_batch.shape[0] # New: track examples instead of tokens + global_step += 1 + + # Optional evaluation step + if global_step % eval_freq == 0: + train_loss, val_loss = evaluate_model( + model, train_loader, val_loader, device, eval_iter, trainable_token=trainable_token) + train_losses.append(train_loss) + val_losses.append(val_loss) + print(f"Ep {epoch+1} (Step {global_step:06d}): " + f"Train loss {train_loss:.3f}, Val loss {val_loss:.3f}") + + if max_steps is not None and global_step > max_steps: + break + + # New: Calculate accuracy after each epoch + train_accuracy = calc_accuracy_loader(train_loader, model, device, num_batches=eval_iter, 
trainable_token=trainable_token) + val_accuracy = calc_accuracy_loader(val_loader, model, device, num_batches=eval_iter, trainable_token=trainable_token) + print(f"Training accuracy: {train_accuracy*100:.2f}% | ", end="") + print(f"Validation accuracy: {val_accuracy*100:.2f}%") + train_accs.append(train_accuracy) + val_accs.append(val_accuracy) + + if max_steps is not None and global_step > max_steps: + break + + return train_losses, val_losses, train_accs, val_accs, examples_seen + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument( + "--model_size", + type=str, + default="gpt2-small (124M)", + help=( + "Which GPT model to use. Options: 'gpt2-small (124M)', 'gpt2-medium (355M)'," + " 'gpt2-large (774M)', 'gpt2-xl (1558M)'." + ) + ) + parser.add_argument( + "--weights", + type=str, + default="pretrained", + help=( + "Whether to use 'pretrained' or 'random' weights." + ) + ) + parser.add_argument( + "--trainable_layers", + type=str, + default="last_block", + help=( + "Which layers to train. Options: 'all', 'last_block', 'last_layer'." + ) + ) + parser.add_argument( + "--trainable_token", + type=str, + default="last", + help=( + "Which token to train. Options: 'first', 'last'." + ) + ) + parser.add_argument( + "--context_length", + type=str, + default="256", + help=( + "The context length of the data inputs." + "Options: 'longest_training_example', 'model_context_length' or integer value." + ) + ) + + args = parser.parse_args() + + if args.trainable_token == "first": + args.trainable_token = 0 + elif args.trainable_token == "last": + args.trainable_token = -1 + else: + raise ValueError("Invalid --trainable_token argument") + + ############################### + # Load model + ############################### + + if args.weights == "pretrained": + load_weights = True + elif args.weights == "random": + load_weights = False + else: + raise ValueError("Invalid --weights argument.") + + model = instantiate_model(args.model_size, load_weights) + for param in model.parameters(): + param.requires_grad = False + + if args.model_size == "gpt2-small (124M)": + in_features = 768 + elif args.model_size == "gpt2-medium (355M)": + in_features = 1024 + elif args.model_size == "gpt2-large (774M)": + in_features = 1280 + elif args.model_size == "gpt2-xl (1558M)": + in_features = 1600 + else: + raise ValueError("Invalid --model_size argument") + + torch.manual_seed(123) + model.out_head = torch.nn.Linear(in_features=in_features, out_features=2) + + if args.trainable_layers == "last_layer": + pass + elif args.trainable_layers == "last_block": + for param in model.trf_blocks[-1].parameters(): + param.requires_grad = True + for param in model.final_norm.parameters(): + param.requires_grad = True + elif args.trainable_layers == "all": + for param in model.parameters(): + param.requires_grad = True + else: + raise ValueError("Invalid --trainable_layers argument.") + + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + model.to(device) + + ############################### + # Instantiate dataloaders + ############################### + + base_path = Path(".") + + tokenizer = tiktoken.get_encoding("gpt2") + + train_dataset = None + if args.context_length == "model_context_length": + max_length = model.pos_emb.weight.shape[0] + elif args.context_length == "longest_training_example": + train_dataset = IMDBDataset(base_path / "train.csv", max_length=None, tokenizer=tokenizer) + max_length = train_dataset.max_length + else: + try: + max_length = 
int(args.context_length) + except ValueError: + raise ValueError("Invalid --context_length argument") + + if train_dataset is None: + train_dataset = IMDBDataset(base_path / "train.csv", max_length=max_length, tokenizer=tokenizer) + val_dataset = IMDBDataset(base_path / "validation.csv", max_length=max_length, tokenizer=tokenizer) + test_dataset = IMDBDataset(base_path / "test.csv", max_length=max_length, tokenizer=tokenizer) + + num_workers = 0 + batch_size = 8 + + train_loader = DataLoader( + dataset=train_dataset, + batch_size=batch_size, + shuffle=True, + num_workers=num_workers, + drop_last=True, + ) + + val_loader = DataLoader( + dataset=val_dataset, + batch_size=batch_size, + num_workers=num_workers, + drop_last=False, + ) + + test_loader = DataLoader( + dataset=test_dataset, + batch_size=batch_size, + num_workers=num_workers, + drop_last=False, + ) + + ############################### + # Train model + ############################### + + start_time = time.time() + torch.manual_seed(123) + optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, weight_decay=0.1) + + num_epochs = 3 + train_losses, val_losses, train_accs, val_accs, examples_seen = train_classifier_simple( + model, train_loader, val_loader, optimizer, device, + num_epochs=num_epochs, eval_freq=50, eval_iter=20, + tokenizer=tokenizer, max_steps=None, trainable_token=args.trainable_token + ) + + end_time = time.time() + execution_time_minutes = (end_time - start_time) / 60 + print(f"Training completed in {execution_time_minutes:.2f} minutes.") + + ############################### + # Evaluate model + ############################### + + print("\nEvaluating on the full datasets ...\n") + + train_accuracy = calc_accuracy_loader(train_loader, model, device, trainable_token=args.trainable_token) + val_accuracy = calc_accuracy_loader(val_loader, model, device, trainable_token=args.trainable_token) + test_accuracy = calc_accuracy_loader(test_loader, model, device, trainable_token=args.trainable_token) + + print(f"Training accuracy: {train_accuracy*100:.2f}%") + print(f"Validation accuracy: {val_accuracy*100:.2f}%") + print(f"Test accuracy: {test_accuracy*100:.2f}%") diff --git a/ch06/03_bonus_imdb-classification/train-sklearn-logreg.py b/ch06/03_bonus_imdb-classification/train-sklearn-logreg.py new file mode 100644 index 0000000..7842d12 --- /dev/null +++ b/ch06/03_bonus_imdb-classification/train-sklearn-logreg.py @@ -0,0 +1,75 @@ +# Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt). 
+# Source for "Build a Large Language Model From Scratch" +# - https://www.manning.com/books/build-a-large-language-model-from-scratch +# Code: https://github.com/rasbt/LLMs-from-scratch + +import pandas as pd +from sklearn.feature_extraction.text import CountVectorizer +from sklearn.linear_model import LogisticRegression +from sklearn.metrics import accuracy_score +# from sklearn.metrics import balanced_accuracy_score +from sklearn.dummy import DummyClassifier + + +def load_dataframes(): + df_train = pd.read_csv("train.csv") + df_val = pd.read_csv("validation.csv") + df_test = pd.read_csv("test.csv") + + return df_train, df_val, df_test + + +def eval(model, X_train, y_train, X_val, y_val, X_test, y_test): + # Making predictions + y_pred_train = model.predict(X_train) + y_pred_val = model.predict(X_val) + y_pred_test = model.predict(X_test) + + # Calculating accuracy and balanced accuracy + accuracy_train = accuracy_score(y_train, y_pred_train) + # balanced_accuracy_train = balanced_accuracy_score(y_train, y_pred_train) + + accuracy_val = accuracy_score(y_val, y_pred_val) + # balanced_accuracy_val = balanced_accuracy_score(y_val, y_pred_val) + + accuracy_test = accuracy_score(y_test, y_pred_test) + # balanced_accuracy_test = balanced_accuracy_score(y_test, y_pred_test) + + # Printing the results + print(f"Training Accuracy: {accuracy_train*100:.2f}%") + print(f"Validation Accuracy: {accuracy_val*100:.2f}%") + print(f"Test Accuracy: {accuracy_test*100:.2f}%") + + # print(f"\nTraining Balanced Accuracy: {balanced_accuracy_train*100:.2f}%") + # print(f"Validation Balanced Accuracy: {balanced_accuracy_val*100:.2f}%") + # print(f"Test Balanced Accuracy: {balanced_accuracy_test*100:.2f}%") + + +if __name__ == "__main__": + df_train, df_val, df_test = load_dataframes() + + ######################################### + # Convert text into bag-of-words model + vectorizer = CountVectorizer() + ######################################### + + X_train = vectorizer.fit_transform(df_train["text"]) + X_val = vectorizer.transform(df_val["text"]) + X_test = vectorizer.transform(df_test["text"]) + y_train, y_val, y_test = df_train["label"], df_val["label"], df_test["label"] + + ##################################### + # Model training and evaluation + ##################################### + + # Create a dummy classifier with the strategy to predict the most frequent class + dummy_clf = DummyClassifier(strategy="most_frequent") + dummy_clf.fit(X_train, y_train) + + print("Dummy classifier:") + eval(dummy_clf, X_train, y_train, X_val, y_val, X_test, y_test) + + print("\n\nLogistic regression classifier:") + model = LogisticRegression(max_iter=1000) + model.fit(X_train, y_train) + eval(model, X_train, y_train, X_val, y_val, X_test, y_test)