mirror of
https://github.com/datawhalechina/llms-from-scratch-cn.git
synced 2026-05-01 11:58:17 +08:00
296 lines
11 KiB
Plaintext
296 lines
11 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 1,
|
||
"id": "dd05f32c-a90f-4122-b6d7-a5ec7b3b9ba0",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"env: HF_ENDPOINT=https://hf-mirror.com\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"%env HF_ENDPOINT=https://hf-mirror.com"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 2,
|
||
"id": "54f03217-da8d-4a05-9c85-9e0301a597e7",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"import os\n",
|
||
"\n",
|
||
"# 设置 HF_HOME 环境变量 设置下载路径\n",
|
||
"os.environ['HF_HOME'] = '/data1/ckw'"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "94cab483-b247-4aa8-9557-d15e459244af",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# 这个时候,由于OpenELM还没有官方发布在transformer,所以需要改下源码(已经有了更好的办法,因此不需要改源码了)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "e2f3081d-f795-4f86-b80e-e915ae56b426",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# /data1/ckw/micromamba/envs/kewei-ai/lib/python3.11/site-packages/transformers/models/auto/tokenization_auto.py:909"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "db03e7fd-d06f-4e78-842f-66c8e02043bd",
|
||
"metadata": {},
|
||
"source": [
|
||
"#### 1.3 AutoModelForCausalLM代码\n",
|
||
"\n",
|
||
"```python\n",
|
||
"class AutoModelForCausalLM:\n",
|
||
" def __init__(self):\n",
|
||
" raise EnvironmentError(\n",
|
||
" \"AutoModelForCausalLM is designed to be instantiated \"\n",
|
||
" \"using the `AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path)` or \"\n",
|
||
" \"`AutoModelForCausalLM.from_config(config)` methods.\"\n",
|
||
" )\n",
|
||
"\n",
|
||
"\t@classmethod\n",
|
||
" @replace_list_option_in_docstrings(MODEL_FOR_CAUSAL_LM_MAPPING, use_model_types=False)\n",
|
||
" def from_config(cls, config):\n",
|
||
"\n",
|
||
" if type(config) in MODEL_FOR_CAUSAL_LM_MAPPING.keys():\n",
|
||
" return MODEL_FOR_CAUSAL_LM_MAPPING[type(config)](config)\n",
|
||
" raise ValueError(\n",
|
||
" \"Unrecognized configuration class {} for this kind of AutoModel: {}.\\n\"\n",
|
||
" \"Model type should be one of {}.\".format(\n",
|
||
" config.__class__, cls.__name__, \", \".join(c.__name__ for c in MODEL_FOR_CAUSAL_LM_MAPPING.keys())\n",
|
||
" )\n",
|
||
" )\n",
|
||
"\n",
|
||
"\n",
|
||
"\t@classmethod\n",
|
||
" @replace_list_option_in_docstrings(MODEL_FOR_CAUSAL_LM_MAPPING)\n",
|
||
" @add_start_docstrings(\n",
|
||
" \"Instantiate one of the model classes of the library---with a causal language modeling head---from a \"\n",
|
||
" \"pretrained model.\",\n",
|
||
" AUTO_MODEL_PRETRAINED_DOCSTRING,\n",
|
||
" )\n",
|
||
" def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):\n",
|
||
" config = kwargs.pop(\"config\", None)\n",
|
||
" if not isinstance(config, PretrainedConfig):\n",
|
||
" config, kwargs = AutoConfig.from_pretrained(\n",
|
||
" pretrained_model_name_or_path, return_unused_kwargs=True, **kwargs\n",
|
||
" )\n",
|
||
"\n",
|
||
" if type(config) in MODEL_FOR_CAUSAL_LM_MAPPING.keys():\n",
|
||
" return MODEL_FOR_CAUSAL_LM_MAPPING[type(config)].from_pretrained(\n",
|
||
" pretrained_model_name_or_path, *model_args, config=config, **kwargs\n",
|
||
" )\n",
|
||
" raise ValueError(\n",
|
||
" \"Unrecognized configuration class {} for this kind of AutoModel: {}.\\n\"\n",
|
||
" \"Model type should be one of {}.\".format(\n",
|
||
" config.__class__, cls.__name__, \", \".join(c.__name__ for c in MODEL_FOR_CAUSAL_LM_MAPPING.keys())\n",
|
||
" )\n",
|
||
" )\n",
|
||
"```"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 5,
|
||
"id": "744c6db7-53f9-4911-adcb-4f0618693071",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"application/vnd.jupyter.widget-view+json": {
|
||
"model_id": "7dd376f050c3496b904a5a545f499e07",
|
||
"version_major": 2,
|
||
"version_minor": 0
|
||
},
|
||
"text/plain": [
|
||
"tokenizer_config.json: 0%| | 0.00/265 [00:00<?, ?B/s]"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"application/vnd.jupyter.widget-view+json": {
|
||
"model_id": "4936fbb98c5446ebb60f4bdb288ddc73",
|
||
"version_major": 2,
|
||
"version_minor": 0
|
||
},
|
||
"text/plain": [
|
||
"tokenizer.model: 0%| | 0.00/500k [00:00<?, ?B/s]"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"application/vnd.jupyter.widget-view+json": {
|
||
"model_id": "080e814bd03542aeb4a9f882c67ed06a",
|
||
"version_major": 2,
|
||
"version_minor": 0
|
||
},
|
||
"text/plain": [
|
||
"tokenizer.json: 0.00B [00:00, ?B/s]"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"application/vnd.jupyter.widget-view+json": {
|
||
"model_id": "d04a2f9f4a57490bb70e88af4ab10008",
|
||
"version_major": 2,
|
||
"version_minor": 0
|
||
},
|
||
"text/plain": [
|
||
"added_tokens.json: 0%| | 0.00/21.0 [00:00<?, ?B/s]"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"application/vnd.jupyter.widget-view+json": {
|
||
"model_id": "6a728b39e23043459b8c2bddef6e8845",
|
||
"version_major": 2,
|
||
"version_minor": 0
|
||
},
|
||
"text/plain": [
|
||
"special_tokens_map.json: 0%| | 0.00/435 [00:00<?, ?B/s]"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
|
||
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n"
|
||
]
|
||
},
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"'\\nDataWhalechina is an organization founded at Shanghai Jiao Tong University that helps learners learn artificial intelligence. The organization aims to provide AI-related courses to students in China.\\n\\nThis repository contains the code for the following courses:\\n\\n1. [Introduction to AI: Neural Networks and Classification](https://www.datawhalechina.com/courses/introduction-to-ai-neural-networks-and-classification/)\\n2. [Introduction to AI: Deep Learning and Applications](https://www.datawhalechina.com/courses/introduction-to-ai-deep-learning-and-applications/)\\n3. [Introduction to AI: Algorithms and Applications](https://www.datawhalechina.com/courses/introduction-to-ai-algorithms-and-applications/)\\n4. [Introduction to AI: Data Preparation and Model Evaluation](https://www.datawhalechina.com/courses/introduction-to-ai-data-preparation-and-model-evaluation/)\\n5. [Introduction to AI: Building and Evaluating AI Models](https://www.datawhalechina.com/courses/introduction-to-ai-building-and-evaluating-ai'"
|
||
]
|
||
},
|
||
"execution_count": 5,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"from transformers import AutoTokenizer\n",
|
||
"from modeling_openelm import OpenELMForCausalLM\n",
|
||
"\n",
|
||
"model = OpenELMForCausalLM.from_pretrained(\"Apple/OpenELM-270M-Instruct\")#trust_remote_code=True\n",
|
||
"# tokenizer = AutoTokenizer.from_pretrained(\"Apple/OpenELM-270M-Instruct\")Llama-2-7b-hf\n",
|
||
"tokenizer = AutoTokenizer.from_pretrained(\"NousResearch/Llama-2-7b-chat-hf\")\n",
|
||
"prompt = '\\nDataWhalechina is an organization founded at Shanghai Jiao Tong University that helps learners learn artificial intelligence.'\n",
|
||
"inputs = tokenizer(prompt, return_tensors=\"pt\")\n",
|
||
"\n",
|
||
"# Generate\n",
|
||
"generate_ids = model.generate(inputs.input_ids, max_length=300)\n",
|
||
"tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "raw",
|
||
"id": "6c0f8954-aca3-496b-86e4-843cdb00b104",
|
||
"metadata": {},
|
||
"source": [
|
||
"上面这个openelm的回复,感觉还比较贴合datawhale的实际情况哈,速度也是很快的,没得说,不过链接是编的哈哈"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 6,
|
||
"id": "060b86f9-fda5-4d9f-8292-4d9464c7b2ef",
|
||
"metadata": {
|
||
"scrolled": true
|
||
},
|
||
"outputs": [
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
|
||
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n"
|
||
]
|
||
},
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"\"\\nDataWhalechina is an organization founded at Shanghai Jiao Tong University that helps learners \\nimprove their Chinese language skills through data-driven learning.\\n\\n## Data\\n\\nThe DataWhalechina platform collects data from various sources, including:\\n\\n1. [China's National Database of Vocabulary and Phrase Structure](https://www.national-database.gov.cn/): This database contains vocabulary and phrase structure definitions for 1,000,000+ Chinese words and phrases.\\n\\n2. [China's National Academic Database of Literature and Culture](https://academic.lib.shu.edu.cn/): This database contains articles, books, and speeches written in Chinese by Chinese scholars.\\n\\n3. [China's National Knowledge Incorporation Database](https://knowledge.cn/): This database contains data on intellectual property rights, patents, and copyrights.\\n\\n4. [China's National Bureau of Statistics](https://www.stat.gov.cn/): This database contains statistics on population, living standards, and purchasing power.\\n\\n5. [China's National Bureau of Census](https://www.census.gov.cn/): This database contains\""
|
||
]
|
||
},
|
||
"execution_count": 6,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"prompt = '\\nDataWhalechina is an organization founded at Shanghai Jiao Tong University that helps learners '\n",
|
||
"inputs = tokenizer(prompt, return_tensors=\"pt\")\n",
|
||
"\n",
|
||
"# Generate\n",
|
||
"generate_ids = model.generate(inputs.input_ids, max_length=300)\n",
|
||
"tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "raw",
|
||
"id": "052ab03d-f739-40e5-9f48-e8ab3d0f5f19",
|
||
"metadata": {},
|
||
"source": [
|
||
"如果提示内容给的比较短,可能会在事实上面出一点小问题"
|
||
]
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": "kewei-ai",
|
||
"language": "python",
|
||
"name": "kewei-ai"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.11.5"
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 5
|
||
}
|