mirror of
https://github.com/datawhalechina/llms-from-scratch-cn.git
synced 2026-05-03 13:02:35 +00:00
File diff suppressed because it is too large
Load Diff
@@ -5,7 +5,7 @@
|
|||||||
"id": "51c9672d-8d0c-470d-ac2d-1271f8ec3f14",
|
"id": "51c9672d-8d0c-470d-ac2d-1271f8ec3f14",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"source": [
|
"source": [
|
||||||
"# Chapter 3 Exercise solutions"
|
"# Chapter 3 习题解答"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -13,12 +13,12 @@
|
|||||||
"id": "33dfa199-9aee-41d4-a64b-7e3811b9a616",
|
"id": "33dfa199-9aee-41d4-a64b-7e3811b9a616",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"source": [
|
"source": [
|
||||||
"# Exercise 3.1"
|
"# 3.1"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 5,
|
"execution_count": 1,
|
||||||
"id": "5fee2cf5-61c3-4167-81b5-44ea155bbaf2",
|
"id": "5fee2cf5-61c3-4167-81b5-44ea155bbaf2",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
@@ -39,7 +39,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 58,
|
"execution_count": 2,
|
||||||
"id": "62ea289c-41cd-4416-89dd-dde6383a6f70",
|
"id": "62ea289c-41cd-4416-89dd-dde6383a6f70",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
@@ -72,7 +72,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 59,
|
"execution_count": 3,
|
||||||
"id": "7b035143-f4e8-45fb-b398-dec1bd5153d4",
|
"id": "7b035143-f4e8-45fb-b398-dec1bd5153d4",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
@@ -103,7 +103,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 60,
|
"execution_count": 4,
|
||||||
"id": "7591d79c-c30e-406d-adfd-20c12eb448f6",
|
"id": "7591d79c-c30e-406d-adfd-20c12eb448f6",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
@@ -115,7 +115,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 61,
|
"execution_count": 5,
|
||||||
"id": "ddd0f54f-6bce-46cc-a428-17c2a56557d0",
|
"id": "ddd0f54f-6bce-46cc-a428-17c2a56557d0",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
@@ -130,7 +130,7 @@
|
|||||||
" [-0.5299, -0.1081]], grad_fn=<MmBackward0>)"
|
" [-0.5299, -0.1081]], grad_fn=<MmBackward0>)"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
"execution_count": 61,
|
"execution_count": 5,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"output_type": "execute_result"
|
"output_type": "execute_result"
|
||||||
}
|
}
|
||||||
@@ -141,7 +141,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 62,
|
"execution_count": 6,
|
||||||
"id": "340908f8-1144-4ddd-a9e1-a1c5c3d592f5",
|
"id": "340908f8-1144-4ddd-a9e1-a1c5c3d592f5",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
@@ -156,7 +156,7 @@
|
|||||||
" [-0.5299, -0.1081]], grad_fn=<MmBackward0>)"
|
" [-0.5299, -0.1081]], grad_fn=<MmBackward0>)"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
"execution_count": 62,
|
"execution_count": 6,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"output_type": "execute_result"
|
"output_type": "execute_result"
|
||||||
}
|
}
|
||||||
@@ -170,15 +170,15 @@
|
|||||||
"id": "33543edb-46b5-4b01-8704-f7f101230544",
|
"id": "33543edb-46b5-4b01-8704-f7f101230544",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"source": [
|
"source": [
|
||||||
"# Exercise 3.2"
|
"# 3.2"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"id": "0588e209-1644-496a-8dae-7630b4ef9083",
|
"id": "1fc1a301",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"source": [
|
"source": [
|
||||||
"If we want to have an output dimension of 2, as earlier in single-head attention, we can change the projection dimension `d_out` to 1:"
|
"如果我们想要多头注意力机制的输出和之前单头注意力机制一样为 2,我们可以将输出维度 `d_out` 设置为 1:"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -227,7 +227,7 @@
|
|||||||
"id": "92bdabcb-06cf-4576-b810-d883bbd313ba",
|
"id": "92bdabcb-06cf-4576-b810-d883bbd313ba",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"source": [
|
"source": [
|
||||||
"# Exercise 3.3"
|
"# 3.3"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -249,7 +249,7 @@
|
|||||||
"id": "375d5290-8e8b-4149-958e-1efb58a69191",
|
"id": "375d5290-8e8b-4149-958e-1efb58a69191",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"source": [
|
"source": [
|
||||||
"Optionally, the number of parameters is as follows:"
|
"上述实现的参数量为:"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -280,7 +280,9 @@
|
|||||||
"id": "a56c1d47-9b95-4bd1-a517-580a6f779c52",
|
"id": "a56c1d47-9b95-4bd1-a517-580a6f779c52",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"source": [
|
"source": [
|
||||||
"The GPT-2 model has 117M parameters in total, but as we can see, most of its parameters are not in the multi-head attention module itself."
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"GPT-2 模型有 117M 的参数,但正如我们所见,绝大部分参数其实都不是来源于多头注意力机制(而是线性层)。"
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
@@ -300,7 +302,7 @@
|
|||||||
"name": "python",
|
"name": "python",
|
||||||
"nbconvert_exporter": "python",
|
"nbconvert_exporter": "python",
|
||||||
"pygments_lexer": "ipython3",
|
"pygments_lexer": "ipython3",
|
||||||
"version": "3.11.5"
|
"version": "3.9.18"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"nbformat": 4,
|
"nbformat": 4,
|
||||||
|
|||||||
Reference in New Issue
Block a user