llms-from-scratch-cn/Model_Architecture_Discussions/openelm/openelm.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "dd05f32c-a90f-4122-b6d7-a5ec7b3b9ba0",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "env: HF_ENDPOINT=https://hf-mirror.com\n"
     ]
    }
   ],
   "source": [
    "%env HF_ENDPOINT=https://hf-mirror.com"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "54f03217-da8d-4a05-9c85-9e0301a597e7",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "# Choose where Hugging Face stores downloaded files (HF_HOME).\n",
    "# setdefault keeps a value that is already exported in the environment, so the\n",
    "# path below is only a fallback; change it to a directory that exists on your machine.\n",
    "# Run this cell before the cell that imports transformers.\n",
    "os.environ.setdefault('HF_HOME', '/data1/ckw')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "94cab483-b247-4aa8-9557-d15e459244af",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 这个时候，由于 OpenELM 还没有被官方集成进 transformers 库，所以原本需要改一下源码（现在已经有了更好的办法，因此不需要改源码了）"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e2f3081d-f795-4f86-b80e-e915ae56b426",
   "metadata": {},
   "outputs": [],
   "source": [
    "# /data1/ckw/micromamba/envs/kewei-ai/lib/python3.11/site-packages/transformers/models/auto/tokenization_auto.py:909"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "db03e7fd-d06f-4e78-842f-66c8e02043bd",
   "metadata": {},
   "source": [
    "#### 1.3 AutoModelForCausalLM代码\n",
    "\n",
    "```python\n",
    "class AutoModelForCausalLM:\n",
    "    def __init__(self):\n",
    "        raise EnvironmentError(\n",
    "            \"AutoModelForCausalLM is designed to be instantiated \"\n",
    "            \"using the `AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path)` or \"\n",
    "            \"`AutoModelForCausalLM.from_config(config)` methods.\"\n",
    "        )\n",
    "\n",
    "    @classmethod\n",
    "    @replace_list_option_in_docstrings(MODEL_FOR_CAUSAL_LM_MAPPING, use_model_types=False)\n",
    "    def from_config(cls, config):\n",
    "\n",
    "        if type(config) in MODEL_FOR_CAUSAL_LM_MAPPING.keys():\n",
    "            return MODEL_FOR_CAUSAL_LM_MAPPING[type(config)](config)\n",
    "        raise ValueError(\n",
    "            \"Unrecognized configuration class {} for this kind of AutoModel: {}.\\n\"\n",
    "            \"Model type should be one of {}.\".format(\n",
    "                config.__class__, cls.__name__, \", \".join(c.__name__ for c in MODEL_FOR_CAUSAL_LM_MAPPING.keys())\n",
    "            )\n",
    "        )\n",
    "\n",
    "\n",
    "    @classmethod\n",
    "    @replace_list_option_in_docstrings(MODEL_FOR_CAUSAL_LM_MAPPING)\n",
    "    @add_start_docstrings(\n",
    "        \"Instantiate one of the model classes of the library---with a causal language modeling head---from a \"\n",
    "        \"pretrained model.\",\n",
    "        AUTO_MODEL_PRETRAINED_DOCSTRING,\n",
    "    )\n",
    "    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):\n",
    "        config = kwargs.pop(\"config\", None)\n",
    "        if not isinstance(config, PretrainedConfig):\n",
    "            config, kwargs = AutoConfig.from_pretrained(\n",
    "                pretrained_model_name_or_path, return_unused_kwargs=True, **kwargs\n",
    "            )\n",
    "\n",
    "        if type(config) in MODEL_FOR_CAUSAL_LM_MAPPING.keys():\n",
    "            return MODEL_FOR_CAUSAL_LM_MAPPING[type(config)].from_pretrained(\n",
    "                pretrained_model_name_or_path, *model_args, config=config, **kwargs\n",
    "            )\n",
    "        raise ValueError(\n",
    "            \"Unrecognized configuration class {} for this kind of AutoModel: {}.\\n\"\n",
    "            \"Model type should be one of {}.\".format(\n",
    "                config.__class__, cls.__name__, \", \".join(c.__name__ for c in MODEL_FOR_CAUSAL_LM_MAPPING.keys())\n",
    "            )\n",
    "        )\n",
    "```"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "744c6db7-53f9-4911-adcb-4f0618693071",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "7dd376f050c3496b904a5a545f499e07",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "tokenizer_config.json:   0%|          | 0.00/265 [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "4936fbb98c5446ebb60f4bdb288ddc73",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "080e814bd03542aeb4a9f882c67ed06a",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "tokenizer.json: 0.00B [00:00, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "d04a2f9f4a57490bb70e88af4ab10008",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "added_tokens.json:   0%|          | 0.00/21.0 [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "6a728b39e23043459b8c2bddef6e8845",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "special_tokens_map.json:   0%|          | 0.00/435 [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "'\\nDataWhalechina is an organization founded at Shanghai Jiao Tong University that helps learners learn artificial intelligence. The organization aims to provide AI-related courses to students in China.\\n\\nThis repository contains the code for the following courses:\\n\\n1. [Introduction to AI: Neural Networks and Classification](https://www.datawhalechina.com/courses/introduction-to-ai-neural-networks-and-classification/)\\n2. [Introduction to AI: Deep Learning and Applications](https://www.datawhalechina.com/courses/introduction-to-ai-deep-learning-and-applications/)\\n3. [Introduction to AI: Algorithms and Applications](https://www.datawhalechina.com/courses/introduction-to-ai-algorithms-and-applications/)\\n4. [Introduction to AI: Data Preparation and Model Evaluation](https://www.datawhalechina.com/courses/introduction-to-ai-data-preparation-and-model-evaluation/)\\n5. [Introduction to AI: Building and Evaluating AI Models](https://www.datawhalechina.com/courses/introduction-to-ai-building-and-evaluating-ai'"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from transformers import AutoTokenizer\n",
    "from modeling_openelm import OpenELMForCausalLM\n",
    "\n",
    "model = OpenELMForCausalLM.from_pretrained(\"Apple/OpenELM-270M-Instruct\")#trust_remote_code=True\n",
    "# tokenizer = AutoTokenizer.from_pretrained(\"Apple/OpenELM-270M-Instruct\")Llama-2-7b-hf\n",
    "tokenizer = AutoTokenizer.from_pretrained(\"NousResearch/Llama-2-7b-chat-hf\")\n",
    "prompt = '\\nDataWhalechina is an organization founded at Shanghai Jiao Tong University that helps learners learn artificial intelligence.'\n",
    "inputs = tokenizer(prompt, return_tensors=\"pt\")\n",
    "\n",
    "# Generate\n",
    "generate_ids = model.generate(inputs.input_ids, max_length=300)\n",
    "tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]"
   ]
  },
  {
   "cell_type": "raw",
   "id": "6c0f8954-aca3-496b-86e4-843cdb00b104",
   "metadata": {},
   "source": [
    "上面这个openelm的回复，感觉还比较贴合datawhale的实际情况哈，速度也是很快的，没得说，不过链接是编的哈哈"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "060b86f9-fda5-4d9f-8292-4d9464c7b2ef",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "\"\\nDataWhalechina is an organization founded at Shanghai Jiao Tong University that helps learners \\nimprove their Chinese language skills through data-driven learning.\\n\\n## Data\\n\\nThe DataWhalechina platform collects data from various sources, including:\\n\\n1.  [China's National Database of Vocabulary and Phrase Structure](https://www.national-database.gov.cn/): This database contains vocabulary and phrase structure definitions for 1,000,000+ Chinese words and phrases.\\n\\n2.  [China's National Academic Database of Literature and Culture](https://academic.lib.shu.edu.cn/): This database contains articles, books, and speeches written in Chinese by Chinese scholars.\\n\\n3.  [China's National Knowledge Incorporation Database](https://knowledge.cn/): This database contains data on intellectual property rights, patents, and copyrights.\\n\\n4.  [China's National Bureau of Statistics](https://www.stat.gov.cn/): This database contains statistics on population, living standards, and purchasing power.\\n\\n5.  [China's National Bureau of Census](https://www.census.gov.cn/): This database contains\""
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "prompt = '\\nDataWhalechina is an organization founded at Shanghai Jiao Tong University that helps learners '\n",
    "inputs = tokenizer(prompt, return_tensors=\"pt\")\n",
    "\n",
    "# Generate\n",
    "generate_ids = model.generate(inputs.input_ids, max_length=300)\n",
    "tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]"
   ]
  },
  {
   "cell_type": "raw",
   "id": "052ab03d-f739-40e5-9f48-e8ab3d0f5f19",
   "metadata": {},
   "source": [
    "如果提示内容给的比较短，可能会在事实上面出一点小问题"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "kewei-ai",
   "language": "python",
   "name": "kewei-ai"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}