add openelm

2026-05-01 11:58:17 +08:00 · 2024-06-01 17:33:19 +08:00 · 2024-06-01 17:33:19 +08:00 · 4c86b986e8
commit 4c86b986e8
parent 7fa3701ca8
3 changed files with 1611 additions and 0 deletions
--- a/Model_Architecture_Discussions/openelm/configuration_openelm.py
+++ b/Model_Architecture_Discussions/openelm/configuration_openelm.py
@ -0,0 +1,313 @@
 """Implements HF OpenELMConfig based on PretrainedConfig"""
 from numbers import Number
 from typing import List, Optional, Union
 import numpy as np
 from transformers import PretrainedConfig
 def make_divisible(
    v: Union[float, int],
    divisor: Optional[int] = 8,
    min_value: Optional[Union[float, int]] = None,
 ) -> Union[float, int]:
    """
    Round ``v`` to a nearby multiple of ``divisor``.

    Adapted from the TensorFlow slim MobileNet implementation:
    https://github.com/tensorflow/models/blob/2cfc99eff5e5eb729c6793d2f3d03aa1c9be2b15/research/slim/nets/mobilenet/mobilenet.py#L62

    Args:
        v: value to round.
        divisor: the result is built from multiples of this number (default 8).
        min_value: lower bound for the result; falls back to ``divisor`` when None.
    Returns:
        The rounded value, bumped up by one ``divisor`` step whenever the
        rounding would otherwise shrink ``v`` by more than 10%.
    """
    lower_bound = divisor if min_value is None else min_value
    # Round half up to the closest multiple of the divisor.
    nearest_multiple = (int(v + divisor / 2) // divisor) * divisor
    result = max(lower_bound, nearest_multiple)
    # Rounding down must not lose more than 10% of the requested value.
    if result < 0.9 * v:
        result += divisor
    return result
 def compute_heads(model_dim: int, head_dim: int) -> int:
    """Return how many attention heads of size ``head_dim`` fit in ``model_dim``.
    Args:
        model_dim: Model dimension.
        head_dim: Head dimension.
    Returns:
        The quotient ``model_dim // head_dim``.
    Raises:
        ValueError: if ``model_dim`` is not an exact multiple of ``head_dim``.
    """
    num_heads, leftover = divmod(model_dim, head_dim)
    if leftover != 0:
        raise ValueError(
            f"Model dimension should be divisible by head dimension. Got: {model_dim} and {head_dim}."
        )
    return num_heads
 # Preset hyper-parameters keyed by model variant name. The variants differ only in
 # depth, width and head size; every other setting is shared.
 OpenELM_CONFIGS = {
    variant: {
        "num_transformer_layers": depth,
        "model_dim": width,
        "head_dim": per_head_dim,
        "num_gqa_groups": 4,
        "normalize_qk_projections": True,
        "share_input_output_layers": True,
        # Vary the FFN and QKV multipliers to create variable FFN and attention layers respectively.
        "ffn_multipliers": (0.5, 4.0),
        "qkv_multipliers": (0.5, 1.0),
    }
    for variant, depth, width, per_head_dim in (
        ("OpenELM-270M", 16, 1280, 64),
        ("OpenELM-450M", 20, 1536, 64),
        ("OpenELM-1_1B", 28, 2048, 64),
        ("OpenELM-3B", 36, 3072, 128),
    )
 }
 class OpenELMConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`OpenELMModel`]. It is used to instantiate an OpenELM model according to the specified arguments, defining the model architecture.
    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.
    Args:
        vocab_size (`int`, *optional*, defaults to 32000):
            Vocabulary size of the OpenELM model.
        max_context_length (`int`, *optional*, defaults to 2048):
            Maximum number of input tokens.
        num_transformer_layers (`int`, *optional*, defaults to 12):
            Number of hidden layers in the Transformer decoder.
        model_dim (`int`, *optional*, defaults to 2048):
            Dimension of the hidden representations.
        head_dim (`int`, *optional*, defaults to 128):
            The attention head dimension.
        qkv_multipliers (`Union[Number, List[Number]]`, *optional*, defaults to 1.0):
            If the qkv_multipliers is a Number, then all attention layers have the same latent dimensions,
            resulting in uniform allocation of parameters.
            If the qkv_multipliers is a List of two Numbers, then each attention layer has different latent dimensions
            assuming qkv_multipliers[0] != qkv_multipliers[1]. This results in variable allocation of parameters in attention layer.
            This scaling is known as layer-wise or block-wise scaling: https://arxiv.org/abs/2008.00623
        num_query_heads (`Union[int, None]`, *optional*, defaults to None):
            The number of query heads, computed from `compute_heads(model_dim=model_dim, head_dim=head_dim)`
            when None. After construction the attribute of the same name holds a per-layer list of
            query-head counts derived from `qkv_multipliers`.
        num_gqa_groups (`int`, *optional*, defaults to 1):
            This variable allows to switch between multi-head attention, group query attention, and multi-query attention.
            When num_gqa_groups == 1, then it is multi-head attention.
            When 1 < num_gqa_groups < num_heads and num_heads is divisible by num_gqa_groups, then it is group query attention
            When num_gqa_groups == num_heads, then it is multi-query attention
        ffn_multipliers (`Union[Number, List[Number]]`, *optional*, defaults to 4.0):
            Feed-forward network (FFN) multipliers.
            If the ffn_multipliers is a Number, then all FFN layers have the same latent dimensions,
            resulting in uniform allocation of parameters.
            If the ffn_multipliers is a List of two Numbers, then each FFN layer has different latent dimensions
            assuming ffn_multipliers[0] != ffn_multipliers[1]. This results in variable allocation of parameters in FFN layer.
            This scaling is known as layer-wise or block-wise scaling: https://arxiv.org/abs/2008.00623
            A List with one entry per transformer layer is used as given.
        ffn_with_glu (`bool`, *optional*, defaults to True):
            Whether to use FFN with Gated Linear Unit (GLU)
        ffn_dim_divisor (`int`, *optional*, defaults to 256):
            The ffn layer dimension divisor.
        activation_fn_name (`str` or `function`, *optional*, defaults to `"swish"`):
            The non-linear activation function (function or string) in the decoder.
        normalization_layer_name (`str` or `function`, *optional*, defaults to `"rms_norm"`):
            Type of normalization layer.
        normalize_qk_projections (`bool`, *optional*, defaults to False):
            Whether to normalize queries and keys after projections
        share_input_output_layers (`bool`, *optional*, defaults to False):
            Whether to share the embedding between input and output linear layer
        rope_freq_constant (`int`, *optional*, defaults to 10000):
            The base period of the RoPE embeddings.
        rope_max_length (`int`, *optional*, defaults to 4096):
            That rope_max_length is set to twice of max_context_length.
            This allows flexibility in token lengths during training or fine-tuning.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        bos_token_id (`int`, *optional*, defaults to 1):
            Beginning of stream token id.
        eos_token_id (`int`, *optional*, defaults to 2):
            End of stream token id.
    """
    model_type = "openelm"

    def __init__(
        self,
        vocab_size: int = 32000,
        max_context_length: int = 2048,
        num_transformer_layers: int = 12,
        model_dim: int = 2048,
        head_dim: int = 128,
        qkv_multipliers: Union[Number, List[Number]] = 1.0,
        num_query_heads: Union[int, None] = None,
        num_gqa_groups: int = 1,
        ffn_multipliers: Union[Number, List[Number]] = 4.0,
        ffn_with_glu: bool = True,
        ffn_dim_divisor: int = 256,
        activation_fn_name: str = "swish",
        normalization_layer_name: str = "rms_norm",
        normalize_qk_projections: bool = False,
        share_input_output_layers: bool = False,
        rope_freq_constant: int = 10000,
        rope_max_length: int = 4096,
        initializer_range: float = 0.02,
        use_cache: bool = True,
        bos_token_id: int = 1,
        eos_token_id: int = 2,
        **kwargs,
    ) -> None:
        """Store the hyper-parameters, derive the per-layer values, then initialise the base config.

        Raises:
            ValueError: if `num_query_heads` is None and `model_dim` is not divisible by `head_dim`.
            NotImplementedError: if `qkv_multipliers` or `ffn_multipliers` has an unsupported form.
            AssertionError: if a per-layer consistency check in `__post_init__` fails.
        """
        self.vocab_size = vocab_size
        self.max_context_length = max_context_length
        self.num_transformer_layers = num_transformer_layers
        self.model_dim = model_dim
        self.head_dim = head_dim
        self.qkv_multipliers = qkv_multipliers
        # Provisional value only: __post_init__ overwrites it with a per-layer list.
        self.num_query_heads = (
            compute_heads(model_dim=model_dim, head_dim=head_dim)
            if num_query_heads is None
            else num_query_heads
        )
        self.num_gqa_groups = num_gqa_groups
        self.ffn_multipliers = ffn_multipliers
        self.ffn_with_glu = ffn_with_glu
        self.ffn_dim_divisor = ffn_dim_divisor
        self.activation_fn_name = activation_fn_name
        self.normalization_layer_name = normalization_layer_name
        self.normalize_qk_projections = normalize_qk_projections
        self.share_input_output_layers = share_input_output_layers
        self.rope_freq_constant = rope_freq_constant
        self.rope_max_length = rope_max_length
        self.initializer_range = initializer_range
        # Derived attributes are computed before the base-class constructor runs.
        self.__post_init__()
        super().__init__(
            use_cache=use_cache,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            **kwargs,
        )

    def __post_init__(self) -> None:
        """Expand the multipliers into per-layer settings.

        Sets `num_query_heads` and `num_kv_heads` to lists with one entry per
        transformer layer and rewrites `ffn_multipliers` as a per-layer list.

        Raises:
            NotImplementedError: if `qkv_multipliers` is neither a number nor a
                two-element list/tuple, or if `ffn_multipliers` is neither a
                number nor a list/tuple.
            AssertionError: if a list-valued `ffn_multipliers` has a length other
                than 2 or `num_transformer_layers`, or if a layer's query-head
                count is not divisible by its key/value-head count.
        """
        if self.num_gqa_groups is not None:
            head_multiple_of = self.num_gqa_groups
        else:
            # NOTE(review): the kv-head computation below divides by
            # self.num_gqa_groups, so a None value still fails there with TypeError.
            head_multiple_of = 2
        # Query dimensions are rounded to a multiple of head_dim * head_multiple_of.
        qkv_divisor = self.head_dim * head_multiple_of
        if isinstance(self.qkv_multipliers, Number):
            # All attention layers have the same latent dimensions, resulting in uniform allocation of parameters.
            qkv_dim = make_divisible(
                self.model_dim * self.qkv_multipliers,
                divisor=qkv_divisor,
            )
            query_dims = [int(qkv_dim)] * self.num_transformer_layers
        elif (
            isinstance(self.qkv_multipliers, (tuple, list))
            and len(self.qkv_multipliers) == 2
        ):
            # Each attention layer has different latent dimensions assuming qkv_multipliers[0] != qkv_multipliers[1].
            # This results in variable allocation of parameters in attention layer.
            # This scaling is known as layer-wise or block-wise scaling: https://arxiv.org/abs/2008.00623
            qkv_multipliers = [
                round(v, 2)
                for v in np.linspace(
                    self.qkv_multipliers[0],
                    self.qkv_multipliers[1],
                    num=self.num_transformer_layers,
                    dtype=float,
                )
            ]
            # Make sure that scaled model dimension is divisible by scaled head dimension.
            query_dims = [
                int(make_divisible(self.model_dim * m, divisor=qkv_divisor))
                for m in qkv_multipliers
            ]
        else:
            # Report the attribute: the local `qkv_multipliers` is not bound on this path.
            raise NotImplementedError(
                f"QKV multipliers should be a single number or a list containing exactly two numbers. Got: {self.qkv_multipliers}."
            )
        # compute the number of query, key, and value heads
        # For multi-head and multi-query attention, the number of heads for query, key, and value are the same.
        # For group query attention, the number of key and value heads are the same.
        self.num_query_heads = [
            int(compute_heads(q_dim, self.head_dim)) for q_dim in query_dims
        ]
        self.num_kv_heads = [
            q_heads // self.num_gqa_groups for q_heads in self.num_query_heads
        ]
        # Feed-forward network (FFN) multipliers
        if isinstance(self.ffn_multipliers, Number):
            # All FFN layers have the same latent dimensions, resulting in uniform allocation of parameters.
            self.ffn_multipliers = [self.ffn_multipliers] * self.num_transformer_layers
        elif isinstance(self.ffn_multipliers, (tuple, list)):
            # Each FFN layer has different latent dimensions assuming ffn_multipliers[0] != ffn_multipliers[1].
            # This results in variable allocation of parameters in FFN layer.
            # This scaling is known as layer-wise or block-wise scaling: https://arxiv.org/abs/2008.00623
            if len(self.ffn_multipliers) == 2:
                self.ffn_multipliers = [
                    round(v, 2)
                    for v in np.linspace(
                        self.ffn_multipliers[0],
                        self.ffn_multipliers[1],
                        num=self.num_transformer_layers,
                        dtype=float,
                    )
                ]
            else:
                # Otherwise the list must already provide one multiplier per layer.
                assert (
                    len(self.ffn_multipliers) == self.num_transformer_layers
                ), f"{len(self.ffn_multipliers)=}!={self.num_transformer_layers=}"
        else:
            raise NotImplementedError(
                f"FFN multipliers should be a single number or a list containing exactly two numbers. Got: {self.ffn_multipliers}."
            )
        # check num_query_heads divisible by num_kv_heads for every layer
        for q_heads, kv_heads in zip(self.num_query_heads, self.num_kv_heads):
            assert q_heads % kv_heads == 0
--- a/Model_Architecture_Discussions/openelm/modeling_openelm.py
+++ b/Model_Architecture_Discussions/openelm/modeling_openelm.py
--- a/Model_Architecture_Discussions/openelm/openelm.ipynb
+++ b/Model_Architecture_Discussions/openelm/openelm.ipynb
@ -0,0 +1,295 @@
 {
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "dd05f32c-a90f-4122-b6d7-a5ec7b3b9ba0",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "env: HF_ENDPOINT=https://hf-mirror.com\n"
     ]
    }
   ],
   "source": [
    "%env HF_ENDPOINT=https://hf-mirror.com"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "54f03217-da8d-4a05-9c85-9e0301a597e7",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "# 设置 HF_HOME 环境变量 设置下载路径\n",
    "os.environ['HF_HOME'] = '/data1/ckw'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "94cab483-b247-4aa8-9557-d15e459244af",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 这个时候，由于OpenELM还没有官方发布在transformer，所以需要改下源码(已经有了更好的办法,因此不需要改源码了)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e2f3081d-f795-4f86-b80e-e915ae56b426",
   "metadata": {},
   "outputs": [],
   "source": [
    "# /data1/ckw/micromamba/envs/kewei-ai/lib/python3.11/site-packages/transformers/models/auto/tokenization_auto.py:909"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "db03e7fd-d06f-4e78-842f-66c8e02043bd",
   "metadata": {},
   "source": [
    "#### 1.3 AutoModelForCausalLM代码\n",
    "\n",
    "```python\n",
    "class AutoModelForCausalLM:\n",
    "    def __init__(self):\n",
    "        raise EnvironmentError(\n",
    "            \"AutoModelForCausalLM is designed to be instantiated \"\n",
    "            \"using the `AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path)` or \"\n",
    "            \"`AutoModelForCausalLM.from_config(config)` methods.\"\n",
    "        )\n",
    "\n",
    "    @classmethod\n",
    "    @replace_list_option_in_docstrings(MODEL_FOR_CAUSAL_LM_MAPPING, use_model_types=False)\n",
    "    def from_config(cls, config):\n",
    "\n",
    "        if type(config) in MODEL_FOR_CAUSAL_LM_MAPPING.keys():\n",
    "            return MODEL_FOR_CAUSAL_LM_MAPPING[type(config)](config)\n",
    "        raise ValueError(\n",
    "            \"Unrecognized configuration class {} for this kind of AutoModel: {}.\\n\"\n",
    "            \"Model type should be one of {}.\".format(\n",
    "                config.__class__, cls.__name__, \", \".join(c.__name__ for c in MODEL_FOR_CAUSAL_LM_MAPPING.keys())\n",
    "            )\n",
    "        )\n",
    "\n",
    "\n",
    "    @classmethod\n",
    "    @replace_list_option_in_docstrings(MODEL_FOR_CAUSAL_LM_MAPPING)\n",
    "    @add_start_docstrings(\n",
    "        \"Instantiate one of the model classes of the library---with a causal language modeling head---from a \"\n",
    "        \"pretrained model.\",\n",
    "        AUTO_MODEL_PRETRAINED_DOCSTRING,\n",
    "    )\n",
    "    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):\n",
    "        config = kwargs.pop(\"config\", None)\n",
    "        if not isinstance(config, PretrainedConfig):\n",
    "            config, kwargs = AutoConfig.from_pretrained(\n",
    "                pretrained_model_name_or_path, return_unused_kwargs=True, **kwargs\n",
    "            )\n",
    "\n",
    "        if type(config) in MODEL_FOR_CAUSAL_LM_MAPPING.keys():\n",
    "            return MODEL_FOR_CAUSAL_LM_MAPPING[type(config)].from_pretrained(\n",
    "                pretrained_model_name_or_path, *model_args, config=config, **kwargs\n",
    "            )\n",
    "        raise ValueError(\n",
    "            \"Unrecognized configuration class {} for this kind of AutoModel: {}.\\n\"\n",
    "            \"Model type should be one of {}.\".format(\n",
    "                config.__class__, cls.__name__, \", \".join(c.__name__ for c in MODEL_FOR_CAUSAL_LM_MAPPING.keys())\n",
    "            )\n",
    "        )\n",
    "```"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "744c6db7-53f9-4911-adcb-4f0618693071",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "7dd376f050c3496b904a5a545f499e07",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "tokenizer_config.json:   0%|          | 0.00/265 [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "4936fbb98c5446ebb60f4bdb288ddc73",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "080e814bd03542aeb4a9f882c67ed06a",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "tokenizer.json: 0.00B [00:00, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "d04a2f9f4a57490bb70e88af4ab10008",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "added_tokens.json:   0%|          | 0.00/21.0 [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "6a728b39e23043459b8c2bddef6e8845",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "special_tokens_map.json:   0%|          | 0.00/435 [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "'\\nDataWhalechina is an organization founded at Shanghai Jiao Tong University that helps learners learn artificial intelligence. The organization aims to provide AI-related courses to students in China.\\n\\nThis repository contains the code for the following courses:\\n\\n1. [Introduction to AI: Neural Networks and Classification](https://www.datawhalechina.com/courses/introduction-to-ai-neural-networks-and-classification/)\\n2. [Introduction to AI: Deep Learning and Applications](https://www.datawhalechina.com/courses/introduction-to-ai-deep-learning-and-applications/)\\n3. [Introduction to AI: Algorithms and Applications](https://www.datawhalechina.com/courses/introduction-to-ai-algorithms-and-applications/)\\n4. [Introduction to AI: Data Preparation and Model Evaluation](https://www.datawhalechina.com/courses/introduction-to-ai-data-preparation-and-model-evaluation/)\\n5. [Introduction to AI: Building and Evaluating AI Models](https://www.datawhalechina.com/courses/introduction-to-ai-building-and-evaluating-ai'"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from transformers import AutoTokenizer\n",
    "from modeling_openelm import OpenELMForCausalLM\n",
    "\n",
    "model = OpenELMForCausalLM.from_pretrained(\"Apple/OpenELM-270M-Instruct\")#trust_remote_code=True\n",
    "# tokenizer = AutoTokenizer.from_pretrained(\"Apple/OpenELM-270M-Instruct\")Llama-2-7b-hf\n",
    "tokenizer = AutoTokenizer.from_pretrained(\"NousResearch/Llama-2-7b-chat-hf\")\n",
    "prompt = '\\nDataWhalechina is an organization founded at Shanghai Jiao Tong University that helps learners learn artificial intelligence.'\n",
    "inputs = tokenizer(prompt, return_tensors=\"pt\")\n",
    "\n",
    "# Generate\n",
    "generate_ids = model.generate(inputs.input_ids, max_length=300)\n",
    "tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]"
   ]
  },
  {
   "cell_type": "raw",
   "id": "6c0f8954-aca3-496b-86e4-843cdb00b104",
   "metadata": {},
   "source": [
    "上面这个openelm的回复，感觉还比较贴合datawhale的实际情况哈，速度也是很快的，没得说，不过链接是编的哈哈"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "060b86f9-fda5-4d9f-8292-4d9464c7b2ef",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "\"\\nDataWhalechina is an organization founded at Shanghai Jiao Tong University that helps learners \\nimprove their Chinese language skills through data-driven learning.\\n\\n## Data\\n\\nThe DataWhalechina platform collects data from various sources, including:\\n\\n1.  [China's National Database of Vocabulary and Phrase Structure](https://www.national-database.gov.cn/): This database contains vocabulary and phrase structure definitions for 1,000,000+ Chinese words and phrases.\\n\\n2.  [China's National Academic Database of Literature and Culture](https://academic.lib.shu.edu.cn/): This database contains articles, books, and speeches written in Chinese by Chinese scholars.\\n\\n3.  [China's National Knowledge Incorporation Database](https://knowledge.cn/): This database contains data on intellectual property rights, patents, and copyrights.\\n\\n4.  [China's National Bureau of Statistics](https://www.stat.gov.cn/): This database contains statistics on population, living standards, and purchasing power.\\n\\n5.  [China's National Bureau of Census](https://www.census.gov.cn/): This database contains\""
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "prompt = '\\nDataWhalechina is an organization founded at Shanghai Jiao Tong University that helps learners '\n",
    "inputs = tokenizer(prompt, return_tensors=\"pt\")\n",
    "\n",
    "# Generate\n",
    "generate_ids = model.generate(inputs.input_ids, max_length=300)\n",
    "tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]"
   ]
  },
  {
   "cell_type": "raw",
   "id": "052ab03d-f739-40e5-9f48-e8ab3d0f5f19",
   "metadata": {},
   "source": [
    "如果提示内容给的比较短，可能会在事实上面出一点小问题"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "kewei-ai",
   "language": "python",
   "name": "kewei-ai"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
 }