Merge pull request #6 from Beyondzjl/main

附录A第三部分
This commit is contained in:
Ethan-Chen-plus
2024-03-04 09:45:52 +08:00
committed by GitHub
4 changed files with 174 additions and 117 deletions
+30 -29
View File
@@ -1,10 +1,10 @@
# Appendix A: Introduction to PyTorch (Part 3) # 附录A :PyTorch的介绍(第三部分)
import torch import torch
import torch.nn.functional as F import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader from torch.utils.data import Dataset, DataLoader
# NEW imports: # 导入新的库
import os import os
import torch.multiprocessing as mp import torch.multiprocessing as mp
from torch.utils.data.distributed import DistributedSampler from torch.utils.data.distributed import DistributedSampler
@@ -12,22 +12,23 @@ from torch.nn.parallel import DistributedDataParallel as DDP
from torch.distributed import init_process_group, destroy_process_group from torch.distributed import init_process_group, destroy_process_group
# NEW: function to initialize a distributed process group (1 process / GPU) # 创建一个新的函数用于初始化一个分布式进程(每个GPU一个进程)
# this allows communication among processes # 该函数允许进程之间的通信
def ddp_setup(rank, world_size): def ddp_setup(rank, world_size):
""" """
Arguments: 参数:
rank: a unique process ID rank:特定的进程编号(进程ID)
world_size: total number of processes in the group world_size:组内的进程总数
""" """
# rank of machine running rank:0 process
# here, we assume all GPUs are on the same machine # 正在运行的机器编号 ID:进程0
# 这里的前提是假设所有的GPU在同一台机器上
os.environ["MASTER_ADDR"] = "localhost" os.environ["MASTER_ADDR"] = "localhost"
# any free port on the machine # 机器上任意的空闲端口号
os.environ["MASTER_PORT"] = "12345" os.environ["MASTER_PORT"] = "12345"
# initialize process group # 初始化进程
# Windows users may have to use "gloo" instead of "nccl" as backend # Windows 用户使用"gloo"来替代下面代码中的"nccl"
# nccl: NVIDIA Collective Communication Library # nccl: NVIDIA Collective Communication Library
init_process_group(backend="nccl", rank=rank, world_size=world_size) init_process_group(backend="nccl", rank=rank, world_size=world_size)
torch.cuda.set_device(rank) torch.cuda.set_device(rank)
@@ -52,15 +53,15 @@ class NeuralNetwork(torch.nn.Module):
super().__init__() super().__init__()
self.layers = torch.nn.Sequential( self.layers = torch.nn.Sequential(
# 1st hidden layer # 第一个隐藏层
torch.nn.Linear(num_inputs, 30), torch.nn.Linear(num_inputs, 30),
torch.nn.ReLU(), torch.nn.ReLU(),
# 2nd hidden layer # 第二个隐藏层
torch.nn.Linear(30, 20), torch.nn.Linear(30, 20),
torch.nn.ReLU(), torch.nn.ReLU(),
# output layer # 输出层
torch.nn.Linear(20, num_outputs), torch.nn.Linear(20, num_outputs),
) )
@@ -91,11 +92,11 @@ def prepare_dataset():
train_loader = DataLoader( train_loader = DataLoader(
dataset=train_ds, dataset=train_ds,
batch_size=2, batch_size=2,
shuffle=False, # NEW: False because of DistributedSampler below shuffle=False, # 由于下方使用了 DistributedSampler,这里设置为 False
pin_memory=True, pin_memory=True,
drop_last=True, drop_last=True,
# NEW: chunk batches across GPUs without overlapping samples: # 在多个GPU上划分批次,确保批次之间不重叠样本
sampler=DistributedSampler(train_ds) # NEW sampler=DistributedSampler(train_ds)
) )
test_loader = DataLoader( test_loader = DataLoader(
dataset=test_ds, dataset=test_ds,
@@ -105,33 +106,33 @@ def prepare_dataset():
return train_loader, test_loader return train_loader, test_loader
# NEW: wrapper # 包装器
def main(rank, world_size, num_epochs): def main(rank, world_size, num_epochs):
ddp_setup(rank, world_size) # NEW: initialize process groups ddp_setup(rank, world_size) # 初始化分布式进程组
train_loader, test_loader = prepare_dataset() train_loader, test_loader = prepare_dataset()
model = NeuralNetwork(num_inputs=2, num_outputs=2) model = NeuralNetwork(num_inputs=2, num_outputs=2)
model.to(rank) model.to(rank)
optimizer = torch.optim.SGD(model.parameters(), lr=0.5) optimizer = torch.optim.SGD(model.parameters(), lr=0.5)
model = DDP(model, device_ids=[rank]) # NEW: wrap model with DDP model = DDP(model, device_ids=[rank]) # 使用分布式数据并行(DDP)将模型进行包装
# the core model is now accessible as model.module # 现在核心模型可以通过 model.module 访问
for epoch in range(num_epochs): for epoch in range(num_epochs):
model.train() model.train()
for features, labels in train_loader: for features, labels in train_loader:
features, labels = features.to(rank), labels.to(rank) # New: use rank features, labels = features.to(rank), labels.to(rank)
logits = model(features) logits = model(features)
loss = F.cross_entropy(logits, labels) # Loss function loss = F.cross_entropy(logits, labels) # 损失函数
optimizer.zero_grad() optimizer.zero_grad()
loss.backward() loss.backward()
optimizer.step() optimizer.step()
### LOGGING ### 日志
print(f"[GPU{rank}] Epoch: {epoch+1:03d}/{num_epochs:03d}" print(f"[GPU{rank}] Epoch: {epoch+1:03d}/{num_epochs:03d}"
f" | Batchsize {labels.shape[0]:03d}" f" | Batchsize {labels.shape[0]:03d}"
f" | Train/Val Loss: {loss:.2f}") f" | Train/Val Loss: {loss:.2f}")
@@ -142,7 +143,7 @@ def main(rank, world_size, num_epochs):
test_acc = compute_accuracy(model, test_loader, device=rank) test_acc = compute_accuracy(model, test_loader, device=rank)
print(f"[GPU{rank}] Test accuracy", test_acc) print(f"[GPU{rank}] Test accuracy", test_acc)
destroy_process_group() # NEW: cleanly exit distributed mode destroy_process_group() # 清理退出分布式模式
def compute_accuracy(model, dataloader, device): def compute_accuracy(model, dataloader, device):
@@ -169,10 +170,10 @@ if __name__ == "__main__":
torch.manual_seed(123) torch.manual_seed(123)
# NEW: spawn new processes # 新建进程
# note that spawn will automatically pass the rank # 请注意,spawn 会自动传递进程编号(rank)
num_epochs = 3 num_epochs = 3
world_size = torch.cuda.device_count() world_size = torch.cuda.device_count()
mp.spawn(main, args=(world_size, num_epochs), nprocs=world_size) mp.spawn(main, args=(world_size, num_epochs), nprocs=world_size)
# nprocs=world_size spawns one process per GPU # nprocs=world_size 会为每个GPU生成一个进程
+104 -54
View File
@@ -5,7 +5,7 @@
"id": "ca7fc8a0-280c-4979-b0c7-fc3a99b3b785", "id": "ca7fc8a0-280c-4979-b0c7-fc3a99b3b785",
"metadata": {}, "metadata": {},
"source": [ "source": [
"# Appendix A: Introduction to PyTorch (Part 1)" "# 附录A:PyTorch的介绍(第一部分)"
] ]
}, },
{ {
@@ -13,12 +13,12 @@
"id": "f5bf13d2-8fc2-483e-88cc-6b4310221e68", "id": "f5bf13d2-8fc2-483e-88cc-6b4310221e68",
"metadata": {}, "metadata": {},
"source": [ "source": [
"## A.1 What is PyTorch" "## A.1 什么是PyTorch"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 1, "execution_count": null,
"id": "96ee5660-5327-48e2-9104-a882b3b2afa4", "id": "96ee5660-5327-48e2-9104-a882b3b2afa4",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@@ -32,13 +32,13 @@
], ],
"source": [ "source": [
"import torch\n", "import torch\n",
"\n", "# 显示PyTorch的版本\n",
"print(torch.__version__)" "print(torch.__version__)"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 2, "execution_count": null,
"id": "f73ad4e4-7ec6-4467-a9e9-0cdf6d195264", "id": "f73ad4e4-7ec6-4467-a9e9-0cdf6d195264",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@@ -51,6 +51,7 @@
} }
], ],
"source": [ "source": [
"# 显示PyTorch是否是GPU版本,False表示CPU版本,True表示GPU版本\n",
"print(torch.cuda.is_available())" "print(torch.cuda.is_available())"
] ]
}, },
@@ -59,7 +60,7 @@
"id": "2100cf2e-7459-4ab3-92a8-43e86ab35a9b", "id": "2100cf2e-7459-4ab3-92a8-43e86ab35a9b",
"metadata": {}, "metadata": {},
"source": [ "source": [
"## A.2 Understanding tensors" "## A.2 理解张量"
] ]
}, },
{ {
@@ -67,7 +68,7 @@
"id": "26d7f785-e048-42bc-9182-a556af6bb7f4", "id": "26d7f785-e048-42bc-9182-a556af6bb7f4",
"metadata": {}, "metadata": {},
"source": [ "source": [
"### A.2.1 Scalars, vectors, matrices, and tensors" "### A.2.1 标量、向量、矩阵和张量\n"
] ]
}, },
{ {
@@ -80,22 +81,22 @@
"import torch\n", "import torch\n",
"import numpy as np\n", "import numpy as np\n",
"\n", "\n",
"# create a 0D tensor (scalar) from a Python integer\n", "# 用Python整数创建一个0维张量\n",
"tensor0d = torch.tensor(1)\n", "tensor0d = torch.tensor(1)\n",
"\n", "\n",
"# create a 1D tensor (vector) from a Python list\n", "# 用Python列表创建一个1维张量(向量)\n",
"tensor1d = torch.tensor([1, 2, 3])\n", "tensor1d = torch.tensor([1, 2, 3])\n",
"\n", "\n",
"# create a 2D tensor from a nested Python list\n", "# 用嵌套的Python列表创建一个2维张量(矩阵)\n",
"tensor2d = torch.tensor([[1, 2], [3, 4]])\n", "tensor2d = torch.tensor([[1, 2], [3, 4]])\n",
"\n", "\n",
"# create a 3D tensor from a nested Python list\n", "# 用嵌套的Python列表创建一个3维张量\n",
"tensor3d_1 = torch.tensor([[[1, 2], [3, 4]], [[5, 6], [7, 8]]])\n", "tensor3d_1 = torch.tensor([[[1, 2], [3, 4]], [[5, 6], [7, 8]]])\n",
"\n", "\n",
"# create a 3D tensor from NumPy array\n", "# 从NumPy数组创建一个3维张量\n",
"ary3d = np.array([[[1, 2], [3, 4]], [[5, 6], [7, 8]]])\n", "ary3d = np.array([[[1, 2], [3, 4]], [[5, 6], [7, 8]]])\n",
"tensor3d_2 = torch.tensor(ary3d) # Copies NumPy array\n", "tensor3d_2 = torch.tensor(ary3d) # 复制NumPy数组\n",
"tensor3d_3 = torch.from_numpy(ary3d) # Shares memory with NumPy array" "tensor3d_3 = torch.from_numpy(ary3d) # 与NumPy数组共享内存"
] ]
}, },
{ {
@@ -118,7 +119,7 @@
], ],
"source": [ "source": [
"ary3d[0, 0, 0] = 999\n", "ary3d[0, 0, 0] = 999\n",
"print(tensor3d_2) # remains unchanged" "print(tensor3d_2) # 保持不变"
] ]
}, },
{ {
@@ -140,7 +141,7 @@
} }
], ],
"source": [ "source": [
"print(tensor3d_3) # changes because of memory sharing" "print(tensor3d_3) # 由于内存共享而发生改变"
] ]
}, },
{ {
@@ -148,7 +149,7 @@
"id": "63dec48d-2b60-41a2-ac06-fef7e718605a", "id": "63dec48d-2b60-41a2-ac06-fef7e718605a",
"metadata": {}, "metadata": {},
"source": [ "source": [
"### A.2.2 Tensor data types" "### A.2.2 张量的数据类型"
] ]
}, },
{ {
@@ -213,7 +214,7 @@
"id": "2020deb5-aa02-4524-b311-c010f4ad27ff", "id": "2020deb5-aa02-4524-b311-c010f4ad27ff",
"metadata": {}, "metadata": {},
"source": [ "source": [
"### A.2.3 Common PyTorch tensor operations" "### A.2.3 PyTorch中常见的张量操作"
] ]
}, },
{ {
@@ -257,7 +258,7 @@
} }
], ],
"source": [ "source": [
"tensor2d.shape" "tensor2d.shape # 张量形状"
] ]
}, },
{ {
@@ -280,7 +281,7 @@
} }
], ],
"source": [ "source": [
"tensor2d.reshape(3, 2)" "tensor2d.reshape(3, 2) # 修改形状"
] ]
}, },
{ {
@@ -303,7 +304,7 @@
} }
], ],
"source": [ "source": [
"tensor2d.view(3, 2)" "tensor2d.view(3, 2) # 以视图(view)方式修改张量形状"
] ]
}, },
{ {
@@ -326,7 +327,7 @@
} }
], ],
"source": [ "source": [
"tensor2d.T" "tensor2d.T # 转置张量"
] ]
}, },
{ {
@@ -348,7 +349,7 @@
} }
], ],
"source": [ "source": [
"tensor2d.matmul(tensor2d.T)" "tensor2d.matmul(tensor2d.T) # 张量乘法:tensor2d与其转置相乘"
] ]
}, },
{ {
@@ -370,7 +371,7 @@
} }
], ],
"source": [ "source": [
"tensor2d @ tensor2d.T" "tensor2d @ tensor2d.T # 张量乘法的另一种实现方式:tensor2d与其转置相乘"
] ]
}, },
{ {
@@ -378,7 +379,7 @@
"id": "4c15bdeb-78e2-4870-8a4f-a9f591666f38", "id": "4c15bdeb-78e2-4870-8a4f-a9f591666f38",
"metadata": {}, "metadata": {},
"source": [ "source": [
"## A.3 Seeing models as computation graphs" "## A.3 把模型作为计算图"
] ]
}, },
{ {
@@ -398,13 +399,13 @@
"source": [ "source": [
"import torch.nn.functional as F\n", "import torch.nn.functional as F\n",
"\n", "\n",
"y = torch.tensor([1.0]) # true label\n", "y = torch.tensor([1.0]) # 真实标签\n",
"x1 = torch.tensor([1.1]) # input feature\n", "x1 = torch.tensor([1.1]) # 输入特征\n",
"w1 = torch.tensor([2.2]) # weight parameter\n", "w1 = torch.tensor([2.2]) # 权重参数\n",
"b = torch.tensor([0.0]) # bias unit\n", "b = torch.tensor([0.0]) # 偏置单元\n",
"\n", "\n",
"z = x1 * w1 + b # net input\n", "z = x1 * w1 + b # 网络输入\n",
"a = torch.sigmoid(z) # activation & output\n", "a = torch.sigmoid(z) # 激活函数 & 输出\n",
"\n", "\n",
"loss = F.binary_cross_entropy(a, y)\n", "loss = F.binary_cross_entropy(a, y)\n",
"print(loss)" "print(loss)"
@@ -415,7 +416,7 @@
"id": "f9424f26-2bac-47e7-b834-92ece802247c", "id": "f9424f26-2bac-47e7-b834-92ece802247c",
"metadata": {}, "metadata": {},
"source": [ "source": [
"## A.4 Automatic differentiation made easy" "## A.4 自动求导"
] ]
}, },
{ {
@@ -470,7 +471,7 @@
} }
], ],
"source": [ "source": [
"loss.backward()\n", "loss.backward()# 反向传播\n",
"\n", "\n",
"print(w1.grad)\n", "print(w1.grad)\n",
"print(b.grad)" "print(b.grad)"
@@ -481,7 +482,7 @@
"id": "f53bdd7d-44e6-40ab-8a5a-4eef74ef35dc", "id": "f53bdd7d-44e6-40ab-8a5a-4eef74ef35dc",
"metadata": {}, "metadata": {},
"source": [ "source": [
"## A.5 Implementing multilayer neural networks" "## A.5 多层神经网络的实现"
] ]
}, },
{ {
@@ -497,15 +498,15 @@
"\n", "\n",
" self.layers = torch.nn.Sequential(\n", " self.layers = torch.nn.Sequential(\n",
" \n", " \n",
" # 1st hidden layer\n", " # 第一个隐藏层\n",
" torch.nn.Linear(num_inputs, 30),\n", " torch.nn.Linear(num_inputs, 30),\n",
" torch.nn.ReLU(),\n", " torch.nn.ReLU(),\n",
"\n", "\n",
" # 2nd hidden layer\n", " # 第二个隐藏层\n",
" torch.nn.Linear(30, 20),\n", " torch.nn.Linear(30, 20),\n",
" torch.nn.ReLU(),\n", " torch.nn.ReLU(),\n",
"\n", "\n",
" # output layer\n", " # 输出层\n",
" torch.nn.Linear(20, num_outputs),\n", " torch.nn.Linear(20, num_outputs),\n",
" )\n", " )\n",
"\n", "\n",
@@ -566,7 +567,7 @@
], ],
"source": [ "source": [
"num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)\n", "num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)\n",
"print(\"Total number of trainable model parameters:\", num_params)" "print(\"Total number of trainable model parameters:\", num_params)# 打印训练模型的参数"
] ]
}, },
{ {
@@ -592,7 +593,7 @@
} }
], ],
"source": [ "source": [
"print(model.layers[0].weight)" "print(model.layers[0].weight) # 打印神经网络模型的第一层的权重"
] ]
}, },
{ {
@@ -618,9 +619,13 @@
} }
], ],
"source": [ "source": [
"# 设置随机数种子,以确保可复现性\n",
"torch.manual_seed(123)\n", "torch.manual_seed(123)\n",
"\n", "\n",
"# 假设 NeuralNetwork 是一个神经网络类,且其构造函数接受两个参数,分别为输入特征的维度和输出特征的维度\n",
"model = NeuralNetwork(50, 3)\n", "model = NeuralNetwork(50, 3)\n",
"\n",
"# 打印神经网络模型的第一层的权重\n",
"print(model.layers[0].weight)" "print(model.layers[0].weight)"
] ]
}, },
@@ -639,6 +644,7 @@
} }
], ],
"source": [ "source": [
"# 打印神经网络模型的第一层权重的形状\n",
"print(model.layers[0].weight.shape)" "print(model.layers[0].weight.shape)"
] ]
}, },
@@ -657,10 +663,16 @@
} }
], ],
"source": [ "source": [
"# 设置随机数种子,以确保可复现性\n",
"torch.manual_seed(123)\n", "torch.manual_seed(123)\n",
"\n", "\n",
"# 模型输入特征的维度为 50\n",
"X = torch.rand((1, 50))\n", "X = torch.rand((1, 50))\n",
"\n",
"# 使用模型进行前向传播计算输出\n",
"out = model(X)\n", "out = model(X)\n",
"\n",
"# 打印输出结果\n",
"print(out)" "print(out)"
] ]
}, },
@@ -679,6 +691,7 @@
} }
], ],
"source": [ "source": [
"# 使用 torch.no_grad() 上下文管理器,以便在推断时不计算梯度\n",
"with torch.no_grad():\n", "with torch.no_grad():\n",
" out = model(X)\n", " out = model(X)\n",
"print(out)" "print(out)"
@@ -709,7 +722,7 @@
"id": "19858180-0f26-43a8-b2c3-7ed40abf9f85", "id": "19858180-0f26-43a8-b2c3-7ed40abf9f85",
"metadata": {}, "metadata": {},
"source": [ "source": [
"## A.6 Setting up efficient data loaders" "## A.6 建立高效的数据加载器"
] ]
}, },
{ {
@@ -719,6 +732,7 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"# 定义输入特征张量 X_train\n",
"X_train = torch.tensor([\n", "X_train = torch.tensor([\n",
" [-1.2, 3.1],\n", " [-1.2, 3.1],\n",
" [-0.9, 2.9],\n", " [-0.9, 2.9],\n",
@@ -727,6 +741,7 @@
" [2.7, -1.5]\n", " [2.7, -1.5]\n",
"])\n", "])\n",
"\n", "\n",
"# 定义对应的标签张量 y_train\n",
"y_train = torch.tensor([0, 0, 0, 1, 1])" "y_train = torch.tensor([0, 0, 0, 1, 1])"
] ]
}, },
@@ -756,18 +771,19 @@
"\n", "\n",
"\n", "\n",
"class ToyDataset(Dataset):\n", "class ToyDataset(Dataset):\n",
" # 初始化 ToyDataset 类\n",
" def __init__(self, X, y):\n", " def __init__(self, X, y):\n",
" self.features = X\n", " self.features = X\n",
" self.labels = y\n", " self.labels = y\n",
"\n", " # 获取指定索引的数据\n",
" def __getitem__(self, index):\n", " def __getitem__(self, index):\n",
" one_x = self.features[index]\n", " one_x = self.features[index]\n",
" one_y = self.labels[index] \n", " one_y = self.labels[index] \n",
" return one_x, one_y\n", " return one_x, one_y\n",
"\n", " # 获取数据集的长度\n",
" def __len__(self):\n", " def __len__(self):\n",
" return self.labels.shape[0]\n", " return self.labels.shape[0]\n",
"\n", "# 创建训练数据集和测试数据集实例\n",
"train_ds = ToyDataset(X_train, y_train)\n", "train_ds = ToyDataset(X_train, y_train)\n",
"test_ds = ToyDataset(X_test, y_test)" "test_ds = ToyDataset(X_test, y_test)"
] ]
@@ -804,6 +820,11 @@
"\n", "\n",
"torch.manual_seed(123)\n", "torch.manual_seed(123)\n",
"\n", "\n",
"# 创建训练数据加载器 train_loader\n",
"# dataset 参数传入了您定义的 ToyDataset 类的实例 train_ds\n",
"# batch_size 参数指定了每个批次包含的样本数量\n",
"# shuffle 参数指定是否在每个 epoch 之前对数据进行洗牌\n",
"# num_workers 参数指定用于数据加载的子进程数量\n",
"train_loader = DataLoader(\n", "train_loader = DataLoader(\n",
" dataset=train_ds,\n", " dataset=train_ds,\n",
" batch_size=2,\n", " batch_size=2,\n",
@@ -821,6 +842,11 @@
"source": [ "source": [
"test_ds = ToyDataset(X_test, y_test)\n", "test_ds = ToyDataset(X_test, y_test)\n",
"\n", "\n",
"# 创建测试数据加载器 test_loader\n",
"# dataset 参数传入了您定义的 ToyDataset 类的实例 test_ds\n",
"# batch_size 参数指定了每个批次包含的样本数量\n",
"# shuffle 参数指定是否在每个 epoch 之前对数据进行洗牌,这里设为 False 表示不洗牌\n",
"# num_workers 参数指定用于数据加载的子进程数量\n",
"test_loader = DataLoader(\n", "test_loader = DataLoader(\n",
" dataset=test_ds,\n", " dataset=test_ds,\n",
" batch_size=2,\n", " batch_size=2,\n",
@@ -848,7 +874,9 @@
} }
], ],
"source": [ "source": [
"# 迭代训练数据加载器 train_loader\n",
"for idx, (x, y) in enumerate(train_loader):\n", "for idx, (x, y) in enumerate(train_loader):\n",
" # 打印每个批次的索引、输入特征和对应的标签\n",
" print(f\"Batch {idx+1}:\", x, y)" " print(f\"Batch {idx+1}:\", x, y)"
] ]
}, },
@@ -884,7 +912,7 @@
"id": "d904ca82-e50f-4f3d-a3ac-fc6ca53dd00e", "id": "d904ca82-e50f-4f3d-a3ac-fc6ca53dd00e",
"metadata": {}, "metadata": {},
"source": [ "source": [
"## A.7 A typical training loop" "## A.7 一个示例训练轮次"
] ]
}, },
{ {
@@ -923,19 +951,19 @@
"\n", "\n",
" logits = model(features)\n", " logits = model(features)\n",
" \n", " \n",
" loss = F.cross_entropy(logits, labels) # Loss function\n", " loss = F.cross_entropy(logits, labels) # 损失函数\n",
" \n", " \n",
" optimizer.zero_grad()\n", " optimizer.zero_grad()\n",
" loss.backward()\n", " loss.backward()\n",
" optimizer.step()\n", " optimizer.step()\n",
" \n", " \n",
" ### LOGGING\n", " ### 日志\n",
" print(f\"Epoch: {epoch+1:03d}/{num_epochs:03d}\"\n", " print(f\"Epoch: {epoch+1:03d}/{num_epochs:03d}\"\n",
" f\" | Batch {batch_idx:03d}/{len(train_loader):03d}\"\n", " f\" | Batch {batch_idx:03d}/{len(train_loader):03d}\"\n",
" f\" | Train/Val Loss: {loss:.2f}\")\n", " f\" | Train/Val Loss: {loss:.2f}\")\n",
"\n", "\n",
" model.eval()\n", " model.eval()\n",
" # Optional model evaluation" " # 可选的模型评估指标"
] ]
}, },
{ {
@@ -985,10 +1013,16 @@
} }
], ],
"source": [ "source": [
"# 设置 PyTorch 的打印选项,以关闭科学计数法\n",
"torch.set_printoptions(sci_mode=False)\n", "torch.set_printoptions(sci_mode=False)\n",
"\n",
"# 假设 outputs 是模型的输出张量\n",
"\n",
"# 对模型的输出进行 softmax 操作,计算类别概率\n",
"probas = torch.softmax(outputs, dim=1)\n", "probas = torch.softmax(outputs, dim=1)\n",
"print(probas)\n", "print(probas)\n",
"\n", "\n",
"# 获取模型的预测结果,即具有最大概率的类别\n",
"predictions = torch.argmax(outputs, dim=1)\n", "predictions = torch.argmax(outputs, dim=1)\n",
"print(predictions)" "print(predictions)"
] ]
@@ -1008,6 +1042,7 @@
} }
], ],
"source": [ "source": [
"# 使用 torch.argmax() 函数沿着 dim=1 维度获取每个样本最大值的索引,即模型的预测结果\n",
"predictions = torch.argmax(outputs, dim=1)\n", "predictions = torch.argmax(outputs, dim=1)\n",
"print(predictions)" "print(predictions)"
] ]
@@ -1062,21 +1097,36 @@
"outputs": [], "outputs": [],
"source": [ "source": [
"def compute_accuracy(model, dataloader):\n", "def compute_accuracy(model, dataloader):\n",
" \"\"\"\n",
" 计算模型在给定数据加载器上的准确率。\n",
"\n", "\n",
" 参数:\n",
" model (torch.nn.Module): 待评估的模型。\n",
" dataloader (torch.utils.data.DataLoader): 包含输入数据的数据加载器。\n",
"\n",
" 返回:\n",
" float: 准确率值。\n",
" \"\"\"\n",
" # 将模型设为评估模式\n",
" model = model.eval()\n", " model = model.eval()\n",
" correct = 0.0\n", " correct = 0.0\n",
" total_examples = 0\n", " total_examples = 0\n",
" \n", " \n",
" # 遍历数据加载器\n",
" for idx, (features, labels) in enumerate(dataloader):\n", " for idx, (features, labels) in enumerate(dataloader):\n",
" \n", " \n",
" # 使用 no_grad 上下文,以便不跟踪梯度\n",
" with torch.no_grad():\n", " with torch.no_grad():\n",
" # 使用模型进行前向传播获取预测结果\n",
" logits = model(features)\n", " logits = model(features)\n",
" \n", " \n",
" # 获取预测结果并计算正确预测的数量\n",
" predictions = torch.argmax(logits, dim=1)\n", " predictions = torch.argmax(logits, dim=1)\n",
" compare = labels == predictions\n", " compare = labels == predictions\n",
" correct += torch.sum(compare)\n", " correct += torch.sum(compare)\n",
" total_examples += len(compare)\n", " total_examples += len(compare)\n",
"\n", "\n",
" # 计算并返回准确率\n",
" return (correct / total_examples).item()" " return (correct / total_examples).item()"
] ]
}, },
@@ -1127,7 +1177,7 @@
"id": "4d5cd469-3a45-4394-944b-3ce543f41dac", "id": "4d5cd469-3a45-4394-944b-3ce543f41dac",
"metadata": {}, "metadata": {},
"source": [ "source": [
"## A.8 Saving and loading models" "## A.8 保存并加载模型"
] ]
}, },
{ {
@@ -1158,7 +1208,7 @@
} }
], ],
"source": [ "source": [
"model = NeuralNetwork(2, 2) # needs to match the original model exactly\n", "model = NeuralNetwork(2, 2) # 需要与原始模型完全匹配\n",
"model.load_state_dict(torch.load(\"model.pth\"))" "model.load_state_dict(torch.load(\"model.pth\"))"
] ]
}, },
@@ -1167,7 +1217,7 @@
"id": "f891c013-43da-4a05-973d-997be313d2d8", "id": "f891c013-43da-4a05-973d-997be313d2d8",
"metadata": {}, "metadata": {},
"source": [ "source": [
"## A.9 Optimizing training performance with GPUs" "## A.9 使用GPU来优化训练性能"
] ]
}, },
{ {
@@ -1175,7 +1225,7 @@
"id": "e68ae888-cabf-49c9-bad6-ecdce774db57", "id": "e68ae888-cabf-49c9-bad6-ecdce774db57",
"metadata": {}, "metadata": {},
"source": [ "source": [
"### A.9.1 PyTorch computations on GPU devices" "### A.9.1 在GPU上进行 PyTorch 的运算"
] ]
}, },
{ {
@@ -1183,7 +1233,7 @@
"id": "141c845f-efe3-4614-b376-b8b7a9a2c887", "id": "141c845f-efe3-4614-b376-b8b7a9a2c887",
"metadata": {}, "metadata": {},
"source": [ "source": [
"See [code-part2.ipynb](code-part2.ipynb)" "See [code-part2.ipynb](code-part2.ipynb) "
] ]
}, },
{ {
@@ -1191,7 +1241,7 @@
"id": "99811829-b817-42ea-b03e-d35374debcc0", "id": "99811829-b817-42ea-b03e-d35374debcc0",
"metadata": {}, "metadata": {},
"source": [ "source": [
"### A.9.2 Single-GPU training" "### A.9.2 单个GPU的训练"
] ]
}, },
{ {
@@ -1207,7 +1257,7 @@
"id": "db6eb2d1-a341-4489-b04b-635c26945333", "id": "db6eb2d1-a341-4489-b04b-635c26945333",
"metadata": {}, "metadata": {},
"source": [ "source": [
"### A.9.3 Training with multiple GPUs" "### A.9.3 多GPU的训练"
] ]
}, },
{ {
@@ -6,7 +6,7 @@
"id": "O9i6kzBsZVaZ" "id": "O9i6kzBsZVaZ"
}, },
"source": [ "source": [
"# Appendix A: Introduction to PyTorch (Part 2)" "# 附录A:PyTorch的介绍(第二部分)"
] ]
}, },
{ {
@@ -15,7 +15,7 @@
"id": "ppbG5d-NZezH" "id": "ppbG5d-NZezH"
}, },
"source": [ "source": [
"## A.9 Optimizing training performance with GPUs" "## A.9 使用GPU优化训练性能"
] ]
}, },
{ {
@@ -24,7 +24,7 @@
"id": "6jH0J_DPZhbn" "id": "6jH0J_DPZhbn"
}, },
"source": [ "source": [
"### A.9.1 PyTorch computations on GPU devices" "### A.9.1 在GPU上进行 PyTorch 计算"
] ]
}, },
{ {
@@ -48,13 +48,13 @@
], ],
"source": [ "source": [
"import torch\n", "import torch\n",
"\n", "# 显示PyTorch的版本\n",
"print(torch.__version__)" "print(torch.__version__)"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 2, "execution_count": null,
"metadata": { "metadata": {
"colab": { "colab": {
"base_uri": "https://localhost:8080/" "base_uri": "https://localhost:8080/"
@@ -72,6 +72,7 @@
} }
], ],
"source": [ "source": [
"# 显示PyTorch是否支持GPU\n",
"print(torch.cuda.is_available())" "print(torch.cuda.is_available())"
] ]
}, },
@@ -124,6 +125,7 @@
} }
], ],
"source": [ "source": [
"# 将两个张量移动到CUDA设备上\n",
"tensor_1 = tensor_1.to(\"cuda\")\n", "tensor_1 = tensor_1.to(\"cuda\")\n",
"tensor_2 = tensor_2.to(\"cuda\")\n", "tensor_2 = tensor_2.to(\"cuda\")\n",
"\n", "\n",
@@ -165,7 +167,7 @@
"id": "c8j1cWDcWAMf" "id": "c8j1cWDcWAMf"
}, },
"source": [ "source": [
"## A.9.2 Single-GPU training" "## A.9.2 单GPU训练"
] ]
}, },
{ {
@@ -264,15 +266,15 @@
"\n", "\n",
" self.layers = torch.nn.Sequential(\n", " self.layers = torch.nn.Sequential(\n",
"\n", "\n",
" # 1st hidden layer\n", " # 第一个隐藏层\n",
" torch.nn.Linear(num_inputs, 30),\n", " torch.nn.Linear(num_inputs, 30),\n",
" torch.nn.ReLU(),\n", " torch.nn.ReLU(),\n",
"\n", "\n",
" # 2nd hidden layer\n", " # 第二个隐藏层\n",
" torch.nn.Linear(30, 20),\n", " torch.nn.Linear(30, 20),\n",
" torch.nn.ReLU(),\n", " torch.nn.ReLU(),\n",
"\n", "\n",
" # output layer\n", " # 输出层\n",
" torch.nn.Linear(20, num_outputs),\n", " torch.nn.Linear(20, num_outputs),\n",
" )\n", " )\n",
"\n", "\n",
@@ -309,14 +311,22 @@
"import torch.nn.functional as F\n", "import torch.nn.functional as F\n",
"\n", "\n",
"\n", "\n",
"# 设置随机数种子,以确保可复现性\n",
"torch.manual_seed(123)\n", "torch.manual_seed(123)\n",
"\n",
"# 创建神经网络模型\n",
"model = NeuralNetwork(num_inputs=2, num_outputs=2)\n", "model = NeuralNetwork(num_inputs=2, num_outputs=2)\n",
"\n", "\n",
"device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\") # NEW\n", "# 根据设备可用情况选择设备\n",
"model = model.to(device) # NEW\n", "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
"\n", "\n",
"# 将模型移动到所选设备上\n",
"model = model.to(device)\n",
"\n",
"# 定义优化器,使用随机梯度下降 (SGD)\n",
"optimizer = torch.optim.SGD(model.parameters(), lr=0.5)\n", "optimizer = torch.optim.SGD(model.parameters(), lr=0.5)\n",
"\n", "\n",
"# 定义训练循环的 epoch 数量\n",
"num_epochs = 3\n", "num_epochs = 3\n",
"\n", "\n",
"for epoch in range(num_epochs):\n", "for epoch in range(num_epochs):\n",
@@ -324,21 +334,21 @@
" model.train()\n", " model.train()\n",
" for batch_idx, (features, labels) in enumerate(train_loader):\n", " for batch_idx, (features, labels) in enumerate(train_loader):\n",
"\n", "\n",
" features, labels = features.to(device), labels.to(device) # NEW\n", " features, labels = features.to(device), labels.to(device) \n",
" logits = model(features)\n", " logits = model(features)\n",
" loss = F.cross_entropy(logits, labels) # Loss function\n", " loss = F.cross_entropy(logits, labels) # 损失函数\n",
"\n", "\n",
" optimizer.zero_grad()\n", " optimizer.zero_grad()\n",
" loss.backward()\n", " loss.backward()\n",
" optimizer.step()\n", " optimizer.step()\n",
"\n", "\n",
" ### LOGGING\n", " ### 训练日志\n",
" print(f\"Epoch: {epoch+1:03d}/{num_epochs:03d}\"\n", " print(f\"Epoch: {epoch+1:03d}/{num_epochs:03d}\"\n",
" f\" | Batch {batch_idx:03d}/{len(train_loader):03d}\"\n", " f\" | Batch {batch_idx:03d}/{len(train_loader):03d}\"\n",
" f\" | Train/Val Loss: {loss:.2f}\")\n", " f\" | Train/Val Loss: {loss:.2f}\")\n",
"\n", "\n",
" model.eval()\n", " model.eval()\n",
" # Optional model evaluation" " # 可选的模型评估"
] ]
}, },
{ {
@@ -349,6 +359,7 @@
}, },
"outputs": [], "outputs": [],
"source": [ "source": [
"# 使用accuracy(准确率)作为指标\n",
"def compute_accuracy(model, dataloader, device):\n", "def compute_accuracy(model, dataloader, device):\n",
"\n", "\n",
" model = model.eval()\n", " model = model.eval()\n",
@@ -356,17 +367,17 @@
" total_examples = 0\n", " total_examples = 0\n",
"\n", "\n",
" for idx, (features, labels) in enumerate(dataloader):\n", " for idx, (features, labels) in enumerate(dataloader):\n",
"\n", " # 将数据移动到指定的设备上\n",
" features, labels = features.to(device), labels.to(device) # New\n", " features, labels = features.to(device), labels.to(device) # New\n",
"\n", "\n",
" with torch.no_grad():\n", " with torch.no_grad():\n",
" logits = model(features)\n", " logits = model(features)\n",
"\n", " # 获取预测结果并计算准确数量\n",
" predictions = torch.argmax(logits, dim=1)\n", " predictions = torch.argmax(logits, dim=1)\n",
" compare = labels == predictions\n", " compare = labels == predictions\n",
" correct += torch.sum(compare)\n", " correct += torch.sum(compare)\n",
" total_examples += len(compare)\n", " total_examples += len(compare)\n",
"\n", " # 计算并返回准确率\n",
" return (correct / total_examples).item()" " return (correct / total_examples).item()"
] ]
}, },
@@ -4,7 +4,7 @@
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"## Exercise A.3" "## 练习 A.3"
] ]
}, },
{ {
@@ -21,15 +21,15 @@
"\n", "\n",
" self.layers = torch.nn.Sequential(\n", " self.layers = torch.nn.Sequential(\n",
" \n", " \n",
" # 1st hidden layer\n", " # 第一个隐藏层\n",
" torch.nn.Linear(num_inputs, 30),\n", " torch.nn.Linear(num_inputs, 30),\n",
" torch.nn.ReLU(),\n", " torch.nn.ReLU(),\n",
"\n", "\n",
" # 2nd hidden layer\n", " # 第二个隐藏层\n",
" torch.nn.Linear(30, 20),\n", " torch.nn.Linear(30, 20),\n",
" torch.nn.ReLU(),\n", " torch.nn.ReLU(),\n",
"\n", "\n",
" # output layer\n", " # 输出层\n",
" torch.nn.Linear(20, num_outputs),\n", " torch.nn.Linear(20, num_outputs),\n",
" )\n", " )\n",
"\n", "\n",
@@ -62,7 +62,7 @@
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"## Exercise A.4" "## 练习 A.4"
] ]
}, },
{ {
@@ -74,7 +74,7 @@
"outputs": [], "outputs": [],
"source": [ "source": [
"import torch\n", "import torch\n",
"\n", "# 创建随机向量\n",
"a = torch.rand(100, 200)\n", "a = torch.rand(100, 200)\n",
"b = torch.rand(200, 300)" "b = torch.rand(200, 300)"
] ]
@@ -99,6 +99,9 @@
} }
], ],
"source": [ "source": [
"# 使用 @ 符号进行矩阵相乘,并计算执行时间\n",
"# %timeit 是 IPython 提供的魔术命令,用于多次执行代码以获取平均执行时间\n",
"# 它会自动选择执行次数以确保结果的准确性\n",
"%timeit a @ b" "%timeit a @ b"
] ]
}, },
@@ -110,6 +113,7 @@
}, },
"outputs": [], "outputs": [],
"source": [ "source": [
"# 将 a 和 b 移动到 CUDA 设备上以利用 GPU 加速计算\n",
"a, b = a.to(\"cuda\"), b.to(\"cuda\")" "a, b = a.to(\"cuda\"), b.to(\"cuda\")"
] ]
}, },
@@ -135,15 +139,6 @@
"source": [ "source": [
"%timeit a @ b" "%timeit a @ b"
] ]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "Zqqa-To2L749"
},
"outputs": [],
"source": []
} }
], ],
"metadata": { "metadata": {
@@ -168,7 +163,7 @@
"name": "python", "name": "python",
"nbconvert_exporter": "python", "nbconvert_exporter": "python",
"pygments_lexer": "ipython3", "pygments_lexer": "ipython3",
"version": "3.10.6" "version": "3.11.5"
} }
}, },
"nbformat": 4, "nbformat": 4,