mirror of
https://github.com/datawhalechina/llms-from-scratch-cn.git
synced 2026-05-01 11:58:17 +08:00
add openelm
This commit is contained in:
parent
7fa3701ca8
commit
4c86b986e8
313
Model_Architecture_Discussions/openelm/configuration_openelm.py
Normal file
313
Model_Architecture_Discussions/openelm/configuration_openelm.py
Normal file
@ -0,0 +1,313 @@
|
|||||||
|
"""Implements HF OpenELMConfig based on PretrainedConfig"""
|
||||||
|
from numbers import Number
|
||||||
|
from typing import List, Optional, Union
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
from transformers import PretrainedConfig
|
||||||
|
|
||||||
|
|
||||||
|
def make_divisible(
    v: Union[float, int],
    divisor: Optional[int] = 8,
    min_value: Optional[Union[float, int]] = None,
) -> Union[float, int]:
    """Round ``v`` to the nearest multiple of ``divisor``.

    This function is taken from the original tf repo.
    It ensures that all layers have a channel number that is divisible by the divisor.
    It can be seen at:
    https://github.com/tensorflow/models/blob/2cfc99eff5e5eb729c6793d2f3d03aa1c9be2b15/research/slim/nets/mobilenet/mobilenet.py#L62

    Args:
        v: input value
        divisor: value the result must be a multiple of. Defaults to 8;
            ``None`` is treated as the default of 8.
        min_value: lower bound on the returned value. Defaults to ``divisor``
            when ``None``.

    Returns:
        new_v: ``v`` rounded to a multiple of ``divisor``, never below
        ``min_value`` and never more than 10% below ``v``.
    """
    # The signature advertises Optional[int]; fall back to the default
    # instead of failing on ``None / 2`` below.
    if divisor is None:
        divisor = 8
    if min_value is None:
        min_value = divisor
    # Adding half the divisor before the floor division rounds to nearest.
    new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
    # Make sure that round down does not go down by more than 10%.
    if new_v < 0.9 * v:
        new_v += divisor
    return new_v
|
||||||
|
|
||||||
|
|
||||||
|
def compute_heads(model_dim: int, head_dim: int) -> int:
    """Compute the number of heads.

    Args:
        model_dim: Model dimension.
        head_dim: Head dimension. Must be a positive integer.

    Returns:
        An integer denoting number of heads in multi-head attention is returned.

    Raises:
        ValueError: if head dimension is not positive, or if model dimension
            is not divisible by head dimension.
    """
    # Guard first: a zero head_dim would otherwise surface as a
    # ZeroDivisionError from the modulo below, and a negative one would
    # yield a negative head count.
    if head_dim <= 0:
        raise ValueError(f"Head dimension should be positive. Got: {head_dim}.")
    if model_dim % head_dim != 0:
        raise ValueError(
            f"Model dimension should be divisible by head dimension. Got: {model_dim} and {head_dim}."
        )
    return model_dim // head_dim
|
||||||
|
|
||||||
|
|
||||||
|
# Keyword-argument presets for OpenELMConfig, keyed by model name.
# All presets share the same GQA grouping, QK normalization, weight tying and
# multiplier ranges; they differ only in depth, model width and head width.
OpenELM_CONFIGS = {
    "OpenELM-270M": dict(
        num_transformer_layers=16,
        model_dim=1280,
        head_dim=64,
        num_gqa_groups=4,
        normalize_qk_projections=True,
        share_input_output_layers=True,
        # Vary the FFN and QKV multipliers to create variable FFN and attention layers respectively.
        ffn_multipliers=(0.5, 4.0),
        qkv_multipliers=(0.5, 1.0),
    ),
    "OpenELM-450M": dict(
        num_transformer_layers=20,
        model_dim=1536,
        head_dim=64,
        num_gqa_groups=4,
        normalize_qk_projections=True,
        share_input_output_layers=True,
        # Vary the FFN and QKV multipliers to create variable FFN and attention layers respectively.
        ffn_multipliers=(0.5, 4.0),
        qkv_multipliers=(0.5, 1.0),
    ),
    "OpenELM-1_1B": dict(
        num_transformer_layers=28,
        model_dim=2048,
        head_dim=64,
        num_gqa_groups=4,
        normalize_qk_projections=True,
        share_input_output_layers=True,
        # Vary the FFN and QKV multipliers to create variable FFN and attention layers respectively.
        ffn_multipliers=(0.5, 4.0),
        qkv_multipliers=(0.5, 1.0),
    ),
    "OpenELM-3B": dict(
        num_transformer_layers=36,
        model_dim=3072,
        head_dim=128,
        num_gqa_groups=4,
        normalize_qk_projections=True,
        share_input_output_layers=True,
        # Vary the FFN and QKV multipliers to create variable FFN and attention layers respectively.
        ffn_multipliers=(0.5, 4.0),
        qkv_multipliers=(0.5, 1.0),
    ),
}
|
||||||
|
|
||||||
|
|
||||||
|
class OpenELMConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`OpenELMModel`]. It is used to instantiate an OpenELM model according to the specified arguments, defining the model architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 32000):
            Vocabulary size of the OpenELM model.
        max_context_length (`int`, *optional*, defaults to 2048):
            Maximum number of input tokens.
        num_transformer_layers (`int`, *optional*, defaults to 12):
            Number of hidden layers in the Transformer decoder.
        model_dim (`int`, *optional*, defaults to 2048):
            Dimension of the hidden representations.
        head_dim (`int`, *optional*, defaults to 128):
            The attention head dimension.
        qkv_multipliers (`Union[Number, List[Number]]`, *optional*, defaults to 1.0):
            If the qkv_multipliers is a Number, then all attention layers have the same latent dimensions,
            resulting in uniform allocation of parameters.
            If the qkv_multipliers is a List of Number, then each attention layer have different latent dimensions
            assuming qkv_multipliers[0] != qkv_multipliers[1]. This results in variable allocation of parameters in attention layer.
            This scaling is known as layer-wise or block-wise scaling: https://arxiv.org/abs/2008.00623
        num_query_heads (`Union[int, None]`, *optional*, defaults to None):
            The number of query heads, computed from `compute_heads(model_dim=model_dim, head_dim=head_dim)`.
            Note that the stored attribute is replaced by a per-layer list derived from `qkv_multipliers`.
        num_gqa_groups (`int`, *optional*, defaults to 1):
            This variable allows to switch between multi-head attention, group query attention, and multi-query attention.
            When num_gqa_groups == 1, then it is multi-head attention.
            When 1 < num_gqa_groups < num_heads and num_heads is divisible by num_gqa_groups, then it is group query attention
            When num_gqa_groups == num_heads, then it is multi-query attention
        ffn_multipliers (`Union[Number, List[Number]]`, *optional*, defaults to 4.0):
            Feed-forward network (FFN) multipliers.
            If the ffn_multipliers is a Number, then all FFN layers have the same latent dimensions,
            resulting in uniform allocation of parameters.
            If the ffn_multipliers is a List of Number, then each FFN layer have different latent dimensions
            assuming ffn_multipliers[0] != ffn_multipliers[1]. This results in variable allocation of parameters in FFN layer.
            This scaling is known as layer-wise or block-wise scaling: https://arxiv.org/abs/2008.00623
        ffn_with_glu (`bool`, *optional*, defaults to True):
            Whether to use FFN with Gated Linear Unit (GLU)
        ffn_dim_divisor (`int`, *optional*, defaults to 256):
            The ffn layer dimension divisor.
        activation_fn_name (`str` or `function`, *optional*, defaults to `"swish"`):
            The non-linear activation function (function or string) in the decoder.
        normalization_layer_name (`str` or `function`, *optional*, defaults to `"rms_norm"`):
            Type of normalization layer.
        normalize_qk_projections (`bool`, *optional*, defaults to False):
            Whether to normalize queries and keys after projections
        share_input_output_layers (`bool`, *optional*, defaults to False):
            Whether to share the embedding between input and output linear layer
        rope_freq_constant (`int`, *optional*, defaults to 10000):
            The base period of the RoPE embeddings.
        rope_max_length (`int`, *optional*, defaults to 4096):
            That rope_max_length is set to twice of max_context_length.
            This allows flexibility in token lengths during training or fine-tuning.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        bos_token_id (`int`, *optional*, defaults to 1):
            Beginning of stream token id.
        eos_token_id (`int`, *optional*, defaults to 2):
            End of stream token id.
    """

    model_type = "openelm"

    def __init__(
        self,
        vocab_size: int = 32000,
        max_context_length: int = 2048,
        num_transformer_layers: int = 12,
        model_dim: int = 2048,
        head_dim: int = 128,
        qkv_multipliers: Union[Number, List[Number]] = 1.0,
        num_query_heads: Union[int, None] = None,
        num_gqa_groups: int = 1,
        ffn_multipliers: Union[Number, List[Number]] = 4.0,
        ffn_with_glu: bool = True,
        ffn_dim_divisor: int = 256,
        activation_fn_name: str = "swish",
        normalization_layer_name: str = "rms_norm",
        normalize_qk_projections: bool = False,
        share_input_output_layers: bool = False,
        rope_freq_constant: int = 10000,
        rope_max_length: int = 4096,
        initializer_range: float = 0.02,
        use_cache: bool = True,
        bos_token_id: int = 1,
        eos_token_id: int = 2,
        **kwargs,
    ) -> None:
        self.vocab_size = vocab_size
        self.max_context_length = max_context_length
        self.num_transformer_layers = num_transformer_layers
        self.model_dim = model_dim
        self.head_dim = head_dim
        self.qkv_multipliers = qkv_multipliers
        self.num_gqa_groups = num_gqa_groups
        self.ffn_multipliers = ffn_multipliers
        self.ffn_with_glu = ffn_with_glu
        self.ffn_dim_divisor = ffn_dim_divisor
        self.activation_fn_name = activation_fn_name
        self.normalization_layer_name = normalization_layer_name
        self.normalize_qk_projections = normalize_qk_projections
        self.share_input_output_layers = share_input_output_layers
        self.rope_freq_constant = rope_freq_constant
        self.rope_max_length = rope_max_length
        # When no explicit value is given, derive the head count from the
        # model/head dimensions (raises ValueError if they are incompatible).
        # __post_init__ below overwrites this with a per-layer list.
        self.num_query_heads = (
            compute_heads(model_dim=model_dim, head_dim=head_dim)
            if num_query_heads is None
            else num_query_heads
        )
        self.initializer_range = initializer_range

        # Expand the multipliers into per-layer values before handing the
        # remaining arguments to PretrainedConfig.
        self.__post_init__()
        super().__init__(
            use_cache=use_cache,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            **kwargs,
        )

    def __post_init__(self) -> None:
        """Derive the per-layer head counts and FFN multipliers.

        Sets ``num_query_heads`` and ``num_kv_heads`` to lists with one entry
        per transformer layer, and expands ``ffn_multipliers`` into a
        per-layer list.

        Raises:
            NotImplementedError: if ``qkv_multipliers`` is neither a number nor
                a two-element list/tuple, or if ``ffn_multipliers`` is neither
                a number nor a list/tuple.
            AssertionError: if ``ffn_multipliers`` is a list/tuple whose length
                is neither 2 nor ``num_transformer_layers``, or if a layer's
                query head count is not a multiple of its key/value head count.
        """
        # Query dimensions are rounded to a multiple of
        # head_dim * head_multiple_of so that the head count of every layer is
        # divisible by the number of GQA groups.
        # NOTE(review): the None fallback of 2 is kept from the original, but
        # num_gqa_groups=None still fails at the floor division further down.
        if self.num_gqa_groups is not None:
            head_multiple_of = self.num_gqa_groups
        else:
            head_multiple_of = 2

        if isinstance(self.qkv_multipliers, Number):
            # All attention layers have the same latent dimensions, resulting in uniform allocation of parameters.
            qkv_dim = make_divisible(
                self.model_dim * self.qkv_multipliers,
                divisor=self.head_dim * head_multiple_of,
            )
            query_dims = [int(qkv_dim)] * self.num_transformer_layers

        elif (
            isinstance(self.qkv_multipliers, (tuple, list))
            and len(self.qkv_multipliers) == 2
        ):
            # Each attention layer have different latent dimensions assuming qkv_multipliers[0] != qkv_multipliers[1].
            # This results in variable allocation of parameters in attention layer.
            # This scaling is known as layer-wise or block-wise scaling: https://arxiv.org/abs/2008.00623
            qkv_multipliers = [
                round(v, 2)
                for v in np.linspace(
                    self.qkv_multipliers[0],
                    self.qkv_multipliers[1],
                    num=self.num_transformer_layers,
                    dtype=float,
                )
            ]
            # Make sure that scaled model dimension is divisible by scaled head dimension.
            query_dims = [
                int(
                    make_divisible(
                        self.model_dim * m, divisor=self.head_dim * head_multiple_of
                    )
                )
                for m in qkv_multipliers
            ]
        else:
            # Report the attribute: no local ``qkv_multipliers`` exists on this
            # path, so formatting it would raise NameError instead.
            raise NotImplementedError(
                f"QKV multipliers should be a single number or a list containing exactly two numbers. Got: {self.qkv_multipliers}."
            )

        # compute the number of query, key, and value heads
        # For multi-head and multi-query attention, the number of heads for query, key, and value are the same.
        # For group query attention, the number of key and value heads are the same.
        self.num_query_heads = [
            int(compute_heads(q_dim, self.head_dim)) for q_dim in query_dims
        ]
        self.num_kv_heads = [
            q_heads // self.num_gqa_groups for q_heads in self.num_query_heads
        ]

        # Feed-forward network (FFN) multipliers
        if isinstance(self.ffn_multipliers, Number):
            # All FFN layers have the same latent dimensions, resulting in uniform allocation of parameters.
            self.ffn_multipliers = [self.ffn_multipliers] * self.num_transformer_layers
        elif isinstance(self.ffn_multipliers, (tuple, list)):
            # Each FFN layer have different latent dimensions assuming ffn_multipliers[0] != ffn_multipliers[1].
            # This results in variable allocation of parameters in FFN layer.
            # This scaling is known as layer-wise or block-wise scaling: https://arxiv.org/abs/2008.00623
            if len(self.ffn_multipliers) == 2:
                self.ffn_multipliers = [
                    round(v, 2)
                    for v in np.linspace(
                        self.ffn_multipliers[0],
                        self.ffn_multipliers[1],
                        num=self.num_transformer_layers,
                        dtype=float,
                    )
                ]
            else:
                # Any other length must already be a fully expanded
                # per-layer list.
                assert (
                    len(self.ffn_multipliers) == self.num_transformer_layers
                ), f"{len(self.ffn_multipliers)=}!={self.num_transformer_layers=}"
        else:
            # Report the FFN value; the original formatted the QKV local here.
            raise NotImplementedError(
                f"FFN multipliers should be a single number or a list containing exactly two numbers. Got: {self.ffn_multipliers}."
            )

        # check num_query_heads divisible by num_kv_heads for every layer
        for q_heads, kv_heads in zip(self.num_query_heads, self.num_kv_heads):
            assert q_heads % kv_heads == 0
|
||||||
1003
Model_Architecture_Discussions/openelm/modeling_openelm.py
Normal file
1003
Model_Architecture_Discussions/openelm/modeling_openelm.py
Normal file
File diff suppressed because it is too large
Load Diff
295
Model_Architecture_Discussions/openelm/openelm.ipynb
Normal file
295
Model_Architecture_Discussions/openelm/openelm.ipynb
Normal file
@ -0,0 +1,295 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"id": "dd05f32c-a90f-4122-b6d7-a5ec7b3b9ba0",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"env: HF_ENDPOINT=https://hf-mirror.com\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"%env HF_ENDPOINT=https://hf-mirror.com"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"id": "54f03217-da8d-4a05-9c85-9e0301a597e7",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import os\n",
|
||||||
|
"\n",
|
||||||
|
"# 设置 HF_HOME 环境变量 设置下载路径\n",
|
||||||
|
"os.environ['HF_HOME'] = '/data1/ckw'"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "94cab483-b247-4aa8-9557-d15e459244af",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# 这个时候,由于OpenELM还没有官方发布在transformer,所以需要改下源码(已经有了更好的办法,因此不需要改源码了)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "e2f3081d-f795-4f86-b80e-e915ae56b426",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# /data1/ckw/micromamba/envs/kewei-ai/lib/python3.11/site-packages/transformers/models/auto/tokenization_auto.py:909"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "db03e7fd-d06f-4e78-842f-66c8e02043bd",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"#### 1.3 AutoModelForCausalLM代码\n",
|
||||||
|
"\n",
|
||||||
|
"```python\n",
|
||||||
|
"class AutoModelForCausalLM:\n",
|
||||||
|
" def __init__(self):\n",
|
||||||
|
" raise EnvironmentError(\n",
|
||||||
|
" \"AutoModelForCausalLM is designed to be instantiated \"\n",
|
||||||
|
" \"using the `AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path)` or \"\n",
|
||||||
|
" \"`AutoModelForCausalLM.from_config(config)` methods.\"\n",
|
||||||
|
" )\n",
|
||||||
|
"\n",
|
||||||
|
"\t@classmethod\n",
|
||||||
|
" @replace_list_option_in_docstrings(MODEL_FOR_CAUSAL_LM_MAPPING, use_model_types=False)\n",
|
||||||
|
" def from_config(cls, config):\n",
|
||||||
|
"\n",
|
||||||
|
" if type(config) in MODEL_FOR_CAUSAL_LM_MAPPING.keys():\n",
|
||||||
|
" return MODEL_FOR_CAUSAL_LM_MAPPING[type(config)](config)\n",
|
||||||
|
" raise ValueError(\n",
|
||||||
|
" \"Unrecognized configuration class {} for this kind of AutoModel: {}.\\n\"\n",
|
||||||
|
" \"Model type should be one of {}.\".format(\n",
|
||||||
|
" config.__class__, cls.__name__, \", \".join(c.__name__ for c in MODEL_FOR_CAUSAL_LM_MAPPING.keys())\n",
|
||||||
|
" )\n",
|
||||||
|
" )\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"\t@classmethod\n",
|
||||||
|
" @replace_list_option_in_docstrings(MODEL_FOR_CAUSAL_LM_MAPPING)\n",
|
||||||
|
" @add_start_docstrings(\n",
|
||||||
|
" \"Instantiate one of the model classes of the library---with a causal language modeling head---from a \"\n",
|
||||||
|
" \"pretrained model.\",\n",
|
||||||
|
" AUTO_MODEL_PRETRAINED_DOCSTRING,\n",
|
||||||
|
" )\n",
|
||||||
|
" def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):\n",
|
||||||
|
" config = kwargs.pop(\"config\", None)\n",
|
||||||
|
" if not isinstance(config, PretrainedConfig):\n",
|
||||||
|
" config, kwargs = AutoConfig.from_pretrained(\n",
|
||||||
|
" pretrained_model_name_or_path, return_unused_kwargs=True, **kwargs\n",
|
||||||
|
" )\n",
|
||||||
|
"\n",
|
||||||
|
" if type(config) in MODEL_FOR_CAUSAL_LM_MAPPING.keys():\n",
|
||||||
|
" return MODEL_FOR_CAUSAL_LM_MAPPING[type(config)].from_pretrained(\n",
|
||||||
|
" pretrained_model_name_or_path, *model_args, config=config, **kwargs\n",
|
||||||
|
" )\n",
|
||||||
|
" raise ValueError(\n",
|
||||||
|
" \"Unrecognized configuration class {} for this kind of AutoModel: {}.\\n\"\n",
|
||||||
|
" \"Model type should be one of {}.\".format(\n",
|
||||||
|
" config.__class__, cls.__name__, \", \".join(c.__name__ for c in MODEL_FOR_CAUSAL_LM_MAPPING.keys())\n",
|
||||||
|
" )\n",
|
||||||
|
" )\n",
|
||||||
|
"```"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 5,
|
||||||
|
"id": "744c6db7-53f9-4911-adcb-4f0618693071",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"application/vnd.jupyter.widget-view+json": {
|
||||||
|
"model_id": "7dd376f050c3496b904a5a545f499e07",
|
||||||
|
"version_major": 2,
|
||||||
|
"version_minor": 0
|
||||||
|
},
|
||||||
|
"text/plain": [
|
||||||
|
"tokenizer_config.json: 0%| | 0.00/265 [00:00<?, ?B/s]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"application/vnd.jupyter.widget-view+json": {
|
||||||
|
"model_id": "4936fbb98c5446ebb60f4bdb288ddc73",
|
||||||
|
"version_major": 2,
|
||||||
|
"version_minor": 0
|
||||||
|
},
|
||||||
|
"text/plain": [
|
||||||
|
"tokenizer.model: 0%| | 0.00/500k [00:00<?, ?B/s]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"application/vnd.jupyter.widget-view+json": {
|
||||||
|
"model_id": "080e814bd03542aeb4a9f882c67ed06a",
|
||||||
|
"version_major": 2,
|
||||||
|
"version_minor": 0
|
||||||
|
},
|
||||||
|
"text/plain": [
|
||||||
|
"tokenizer.json: 0.00B [00:00, ?B/s]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"application/vnd.jupyter.widget-view+json": {
|
||||||
|
"model_id": "d04a2f9f4a57490bb70e88af4ab10008",
|
||||||
|
"version_major": 2,
|
||||||
|
"version_minor": 0
|
||||||
|
},
|
||||||
|
"text/plain": [
|
||||||
|
"added_tokens.json: 0%| | 0.00/21.0 [00:00<?, ?B/s]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"application/vnd.jupyter.widget-view+json": {
|
||||||
|
"model_id": "6a728b39e23043459b8c2bddef6e8845",
|
||||||
|
"version_major": 2,
|
||||||
|
"version_minor": 0
|
||||||
|
},
|
||||||
|
"text/plain": [
|
||||||
|
"special_tokens_map.json: 0%| | 0.00/435 [00:00<?, ?B/s]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "stderr",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
|
||||||
|
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"'\\nDataWhalechina is an organization founded at Shanghai Jiao Tong University that helps learners learn artificial intelligence. The organization aims to provide AI-related courses to students in China.\\n\\nThis repository contains the code for the following courses:\\n\\n1. [Introduction to AI: Neural Networks and Classification](https://www.datawhalechina.com/courses/introduction-to-ai-neural-networks-and-classification/)\\n2. [Introduction to AI: Deep Learning and Applications](https://www.datawhalechina.com/courses/introduction-to-ai-deep-learning-and-applications/)\\n3. [Introduction to AI: Algorithms and Applications](https://www.datawhalechina.com/courses/introduction-to-ai-algorithms-and-applications/)\\n4. [Introduction to AI: Data Preparation and Model Evaluation](https://www.datawhalechina.com/courses/introduction-to-ai-data-preparation-and-model-evaluation/)\\n5. [Introduction to AI: Building and Evaluating AI Models](https://www.datawhalechina.com/courses/introduction-to-ai-building-and-evaluating-ai'"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 5,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"from transformers import AutoTokenizer\n",
|
||||||
|
"from modeling_openelm import OpenELMForCausalLM\n",
|
||||||
|
"\n",
|
||||||
|
"model = OpenELMForCausalLM.from_pretrained(\"Apple/OpenELM-270M-Instruct\")#trust_remote_code=True\n",
|
||||||
|
"# tokenizer = AutoTokenizer.from_pretrained(\"Apple/OpenELM-270M-Instruct\")Llama-2-7b-hf\n",
|
||||||
|
"tokenizer = AutoTokenizer.from_pretrained(\"NousResearch/Llama-2-7b-chat-hf\")\n",
|
||||||
|
"prompt = '\\nDataWhalechina is an organization founded at Shanghai Jiao Tong University that helps learners learn artificial intelligence.'\n",
|
||||||
|
"inputs = tokenizer(prompt, return_tensors=\"pt\")\n",
|
||||||
|
"\n",
|
||||||
|
"# Generate\n",
|
||||||
|
"generate_ids = model.generate(inputs.input_ids, max_length=300)\n",
|
||||||
|
"tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "raw",
|
||||||
|
"id": "6c0f8954-aca3-496b-86e4-843cdb00b104",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"上面这个openelm的回复,感觉还比较贴合datawhale的实际情况哈,速度也是很快的,没得说,不过链接是编的哈哈"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 6,
|
||||||
|
"id": "060b86f9-fda5-4d9f-8292-4d9464c7b2ef",
|
||||||
|
"metadata": {
|
||||||
|
"scrolled": true
|
||||||
|
},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stderr",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
|
||||||
|
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"\"\\nDataWhalechina is an organization founded at Shanghai Jiao Tong University that helps learners \\nimprove their Chinese language skills through data-driven learning.\\n\\n## Data\\n\\nThe DataWhalechina platform collects data from various sources, including:\\n\\n1. [China's National Database of Vocabulary and Phrase Structure](https://www.national-database.gov.cn/): This database contains vocabulary and phrase structure definitions for 1,000,000+ Chinese words and phrases.\\n\\n2. [China's National Academic Database of Literature and Culture](https://academic.lib.shu.edu.cn/): This database contains articles, books, and speeches written in Chinese by Chinese scholars.\\n\\n3. [China's National Knowledge Incorporation Database](https://knowledge.cn/): This database contains data on intellectual property rights, patents, and copyrights.\\n\\n4. [China's National Bureau of Statistics](https://www.stat.gov.cn/): This database contains statistics on population, living standards, and purchasing power.\\n\\n5. [China's National Bureau of Census](https://www.census.gov.cn/): This database contains\""
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 6,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"prompt = '\\nDataWhalechina is an organization founded at Shanghai Jiao Tong University that helps learners '\n",
|
||||||
|
"inputs = tokenizer(prompt, return_tensors=\"pt\")\n",
|
||||||
|
"\n",
|
||||||
|
"# Generate\n",
|
||||||
|
"generate_ids = model.generate(inputs.input_ids, max_length=300)\n",
|
||||||
|
"tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "raw",
|
||||||
|
"id": "052ab03d-f739-40e5-9f48-e8ab3d0f5f19",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"如果提示内容给的比较短,可能会在事实上面出一点小问题"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "kewei-ai",
|
||||||
|
"language": "python",
|
||||||
|
"name": "kewei-ai"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.11.5"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
|
}
|
||||||
Loading…
Reference in New Issue
Block a user