add openelm

This commit is contained in:
kewei 2024-06-01 17:33:19 +08:00
parent 7fa3701ca8
commit 4c86b986e8
3 changed files with 1611 additions and 0 deletions

View File

@ -0,0 +1,313 @@
"""Implements HF OpenELMConfig based on PretrainedConfig"""
from numbers import Number
from typing import List, Optional, Union
import numpy as np
from transformers import PretrainedConfig
def make_divisible(
    v: Union[float, int],
    divisor: Optional[int] = 8,
    min_value: Optional[Union[float, int]] = None,
) -> Union[float, int]:
    """
    Round ``v`` to the nearest multiple of ``divisor``, bounded below by ``min_value``.

    Adapted from the original TensorFlow MobileNet code, where it is used to keep
    layer channel counts divisible by ``divisor``:
    https://github.com/tensorflow/models/blob/2cfc99eff5e5eb729c6793d2f3d03aa1c9be2b15/research/slim/nets/mobilenet/mobilenet.py#L62

    Args:
        v: Input value to round.
        divisor: Step the value is rounded to. Defaults to 8.
        min_value: Lower bound for the result. When ``None``, ``divisor`` is used.

    Returns:
        The rounded value. If rounding would shrink ``v`` by more than 10%,
        one extra ``divisor`` is added.
    """
    lower_bound = divisor if min_value is None else min_value
    # Adding half a divisor before the floor division rounds to the nearest multiple.
    nearest_multiple = int(v + divisor / 2) // divisor * divisor
    rounded = max(lower_bound, nearest_multiple)
    # Make sure that rounding down does not lose more than 10% of the input.
    if rounded < 0.9 * v:
        rounded = rounded + divisor
    return rounded
def compute_heads(model_dim: int, head_dim: int) -> int:
    """Derive the attention head count from the model and head dimensions.

    Args:
        model_dim: Model dimension.
        head_dim: Head dimension.

    Returns:
        ``model_dim // head_dim``, i.e. the number of heads in multi-head attention.

    Raises:
        ValueError: if model dimension is not divisible by head dimension.
    """
    num_heads, remainder = divmod(model_dim, head_dim)
    if remainder != 0:
        raise ValueError(
            f"Model dimension should be divisible by head dimension. Got: {model_dim} and {head_dim}."
        )
    return num_heads
# Keyword-argument presets for the published OpenELM model sizes, keyed by model name.
# Every preset gives two-element FFN and QKV multipliers (first value, last value) so that
# FFN and attention widths vary across layers instead of being uniform.
OpenELM_CONFIGS = {
    "OpenELM-270M": {
        "num_transformer_layers": 16,
        "model_dim": 1280,
        "head_dim": 64,
        "num_gqa_groups": 4,
        "normalize_qk_projections": True,
        "share_input_output_layers": True,
        "ffn_multipliers": (0.5, 4.0),
        "qkv_multipliers": (0.5, 1.0),
    },
    "OpenELM-450M": {
        "num_transformer_layers": 20,
        "model_dim": 1536,
        "head_dim": 64,
        "num_gqa_groups": 4,
        "normalize_qk_projections": True,
        "share_input_output_layers": True,
        "ffn_multipliers": (0.5, 4.0),
        "qkv_multipliers": (0.5, 1.0),
    },
    "OpenELM-1_1B": {
        "num_transformer_layers": 28,
        "model_dim": 2048,
        "head_dim": 64,
        "num_gqa_groups": 4,
        "normalize_qk_projections": True,
        "share_input_output_layers": True,
        "ffn_multipliers": (0.5, 4.0),
        "qkv_multipliers": (0.5, 1.0),
    },
    "OpenELM-3B": {
        "num_transformer_layers": 36,
        "model_dim": 3072,
        "head_dim": 128,
        "num_gqa_groups": 4,
        "normalize_qk_projections": True,
        "share_input_output_layers": True,
        "ffn_multipliers": (0.5, 4.0),
        "qkv_multipliers": (0.5, 1.0),
    },
}
class OpenELMConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`OpenELMModel`]. It is used to instantiate an
    OpenELM model according to the specified arguments, defining the model architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 32000):
            Vocabulary size of the OpenELM model.
        max_context_length (`int`, *optional*, defaults to 2048):
            Maximum number of input tokens.
        num_transformer_layers (`int`, *optional*, defaults to 12):
            Number of hidden layers in the Transformer decoder.
        model_dim (`int`, *optional*, defaults to 2048):
            Dimension of the hidden representations.
        head_dim (`int`, *optional*, defaults to 128):
            The attention head dimension.
        qkv_multipliers (`Union[Number, List[Number]]`, *optional*, defaults to 1.0):
            If the qkv_multipliers is a Number, then all attention layers have the same latent dimensions,
            resulting in uniform allocation of parameters.
            If the qkv_multipliers is a List of exactly two Numbers, the per-layer multipliers are linearly
            interpolated between them, so each attention layer has different latent dimensions
            (assuming qkv_multipliers[0] != qkv_multipliers[1]). This results in variable allocation of
            parameters in attention layer.
            This scaling is known as layer-wise or block-wise scaling: https://arxiv.org/abs/2008.00623
        num_query_heads (`Union[int, None]`, *optional*, defaults to None):
            The number of query heads. When None it is computed from
            `compute_heads(model_dim=model_dim, head_dim=head_dim)`. After initialization the attribute is
            replaced by a list with one entry per layer, derived from `qkv_multipliers`.
        num_gqa_groups (`int`, *optional*, defaults to 1):
            This variable allows to switch between multi-head attention, group query attention, and multi-query attention.
            When num_gqa_groups == 1, then it is multi-head attention.
            When 1 < num_gqa_groups < num_heads and num_heads is divisible by num_gqa_groups, then it is group query attention
            When num_gqa_groups == num_heads, then it is multi-query attention
        ffn_multipliers (`Union[Number, List[Number]]`, *optional*, defaults to 4.0):
            Feed-forward network (FFN) multipliers.
            If the ffn_multipliers is a Number, then all FFN layers have the same latent dimensions,
            resulting in uniform allocation of parameters.
            If the ffn_multipliers is a List of two Numbers, the per-layer multipliers are linearly interpolated
            between them (variable allocation of parameters in FFN layer). A list with one entry per transformer
            layer is also accepted and used as is.
            This scaling is known as layer-wise or block-wise scaling: https://arxiv.org/abs/2008.00623
        ffn_with_glu (`bool`, *optional*, defaults to True):
            Whether to use FFN with Gated Linear Unit (GLU)
        ffn_dim_divisor (`int`, *optional*, defaults to 256):
            The ffn layer dimension divisor.
        activation_fn_name (`str` or `function`, *optional*, defaults to `"swish"`):
            The non-linear activation function (function or string) in the decoder.
        normalization_layer_name (`str` or `function`, *optional*, defaults to `"rms_norm"`):
            Type of normalization layer.
        normalize_qk_projections (`bool`, *optional*, defaults to False):
            Whether to normalize queries and keys after projections
        share_input_output_layers (`bool`, *optional*, defaults to False):
            Whether to share the embedding between input and output linear layer
        rope_freq_constant (`int`, *optional*, defaults to 10000):
            The base period of the RoPE embeddings.
        rope_max_length (`int`, *optional*, defaults to 4096):
            The default is twice the default max_context_length.
            This allows flexibility in token lengths during training or fine-tuning.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        bos_token_id (`int`, *optional*, defaults to 1):
            Beginning of stream token id.
        eos_token_id (`int`, *optional*, defaults to 2):
            End of stream token id.
    """

    model_type = "openelm"

    def __init__(
        self,
        vocab_size: int = 32000,
        max_context_length: int = 2048,
        num_transformer_layers: int = 12,
        model_dim: int = 2048,
        head_dim: int = 128,
        qkv_multipliers: Union[Number, List[Number]] = 1.0,
        num_query_heads: Union[int, None] = None,
        num_gqa_groups: int = 1,
        ffn_multipliers: Union[Number, List[Number]] = 4.0,
        ffn_with_glu: bool = True,
        ffn_dim_divisor: int = 256,
        activation_fn_name: str = "swish",
        normalization_layer_name: str = "rms_norm",
        normalize_qk_projections: bool = False,
        share_input_output_layers: bool = False,
        rope_freq_constant: int = 10000,
        rope_max_length: int = 4096,
        initializer_range: float = 0.02,
        use_cache: bool = True,
        bos_token_id: int = 1,
        eos_token_id: int = 2,
        **kwargs,
    ) -> None:
        self.vocab_size = vocab_size
        self.max_context_length = max_context_length
        self.num_transformer_layers = num_transformer_layers
        self.model_dim = model_dim
        self.head_dim = head_dim
        self.qkv_multipliers = qkv_multipliers
        self.num_gqa_groups = num_gqa_groups
        self.ffn_multipliers = ffn_multipliers
        self.ffn_with_glu = ffn_with_glu
        self.ffn_dim_divisor = ffn_dim_divisor
        self.activation_fn_name = activation_fn_name
        self.normalization_layer_name = normalization_layer_name
        self.normalize_qk_projections = normalize_qk_projections
        self.share_input_output_layers = share_input_output_layers
        self.rope_freq_constant = rope_freq_constant
        self.rope_max_length = rope_max_length
        # When no head count is given, compute_heads also validates that
        # model_dim is divisible by head_dim (it raises ValueError otherwise).
        self.num_query_heads = (
            compute_heads(model_dim=model_dim, head_dim=head_dim)
            if num_query_heads is None
            else num_query_heads
        )
        self.initializer_range = initializer_range
        # Derive the per-layer head counts and FFN multipliers before handing
        # the remaining arguments to PretrainedConfig.
        self.__post_init__()
        super().__init__(
            use_cache=use_cache,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            **kwargs,
        )

    @staticmethod
    def _interpolate_multipliers(start, end, num_layers):
        """Return `num_layers` evenly spaced multipliers from `start` to `end`, each rounded to 2 decimals."""
        return [
            round(v, 2)
            for v in np.linspace(start, end, num=num_layers, dtype=float)
        ]

    def __post_init__(self) -> None:
        """Expand the scalar / two-element multipliers into per-layer settings.

        Overwrites `num_query_heads` with a per-layer list, sets `num_kv_heads`
        (per-layer list) and turns `ffn_multipliers` into a per-layer list.

        Raises:
            NotImplementedError: if `qkv_multipliers` is neither a number nor a
                two-element tuple/list, or if `ffn_multipliers` is neither a
                number nor a tuple/list.
            AssertionError: if `ffn_multipliers` is a sequence whose length is
                neither 2 nor `num_transformer_layers`.
        """
        if self.num_gqa_groups is not None:
            head_multiple_of = self.num_gqa_groups
        else:
            # NOTE(review): the KV-head computation below divides by
            # num_gqa_groups, so a None value still fails there; confirm
            # whether None is meant to be supported.
            head_multiple_of = 2

        # Query dimensions are rounded to a multiple of head_dim * head_multiple_of
        # so every layer has a whole number of heads that splits evenly into groups.
        qkv_divisor = self.head_dim * head_multiple_of

        if isinstance(self.qkv_multipliers, Number):
            # All attention layers have the same latent dimensions, resulting in uniform allocation of parameters.
            qkv_dim = make_divisible(
                self.model_dim * self.qkv_multipliers,
                divisor=qkv_divisor,
            )
            query_dims = [int(qkv_dim)] * self.num_transformer_layers
        elif (
            isinstance(self.qkv_multipliers, (tuple, list))
            and len(self.qkv_multipliers) == 2
        ):
            # Each attention layer has different latent dimensions assuming qkv_multipliers[0] != qkv_multipliers[1].
            # This results in variable allocation of parameters in attention layer.
            # This scaling is known as layer-wise or block-wise scaling: https://arxiv.org/abs/2008.00623
            qkv_multipliers = self._interpolate_multipliers(
                self.qkv_multipliers[0],
                self.qkv_multipliers[1],
                self.num_transformer_layers,
            )
            # Make sure that scaled model dimension is divisible by scaled head dimension.
            query_dims = [
                int(make_divisible(self.model_dim * m, divisor=qkv_divisor))
                for m in qkv_multipliers
            ]
        else:
            # Report the attribute: the local `qkv_multipliers` is not bound in this branch.
            raise NotImplementedError(
                f"QKV multipliers should be a single number or a list containing exactly two numbers. Got: {self.qkv_multipliers}."
            )

        # compute the number of query, key, and value heads
        # For multi-head and multi-query attention, the number of heads for query, key, and value are the same.
        # For group query attention, the number of key and value heads are the same.
        self.num_query_heads = [
            int(compute_heads(q_dim, self.head_dim)) for q_dim in query_dims
        ]
        self.num_kv_heads = [
            q_heads // self.num_gqa_groups for q_heads in self.num_query_heads
        ]

        # Feed-forward network (FFN) multipliers
        if isinstance(self.ffn_multipliers, Number):
            # All FFN layers have the same latent dimensions, resulting in uniform allocation of parameters.
            self.ffn_multipliers = [self.ffn_multipliers] * self.num_transformer_layers
        elif isinstance(self.ffn_multipliers, (tuple, list)):
            # Each FFN layer has different latent dimensions assuming ffn_multipliers[0] != ffn_multipliers[1].
            # This results in variable allocation of parameters in FFN layer.
            # This scaling is known as layer-wise or block-wise scaling: https://arxiv.org/abs/2008.00623
            if len(self.ffn_multipliers) == 2:
                self.ffn_multipliers = self._interpolate_multipliers(
                    self.ffn_multipliers[0],
                    self.ffn_multipliers[1],
                    self.num_transformer_layers,
                )
            else:
                # Otherwise the sequence must already hold one multiplier per layer.
                assert (
                    len(self.ffn_multipliers) == self.num_transformer_layers
                ), f"{len(self.ffn_multipliers)=}!={self.num_transformer_layers=}"
        else:
            raise NotImplementedError(
                f"FFN multipliers should be a single number or a list containing exactly two numbers. Got: {self.ffn_multipliers}."
            )

        # check num_query_heads divisible by num_kv_heads for every layer
        for q_heads, kv_heads in zip(self.num_query_heads, self.num_kv_heads):
            assert q_heads % kv_heads == 0

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,295 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "dd05f32c-a90f-4122-b6d7-a5ec7b3b9ba0",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"env: HF_ENDPOINT=https://hf-mirror.com\n"
]
}
],
"source": [
"%env HF_ENDPOINT=https://hf-mirror.com"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "54f03217-da8d-4a05-9c85-9e0301a597e7",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"\n",
"# 设置 HF_HOME 环境变量 设置下载路径\n",
"os.environ['HF_HOME'] = '/data1/ckw'"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "94cab483-b247-4aa8-9557-d15e459244af",
"metadata": {},
"outputs": [],
"source": [
"# 这个时候由于 OpenELM 还没有官方发布在 transformers 中,所以需要改下源码(已经有了更好的办法,因此不需要改源码了)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e2f3081d-f795-4f86-b80e-e915ae56b426",
"metadata": {},
"outputs": [],
"source": [
"# /data1/ckw/micromamba/envs/kewei-ai/lib/python3.11/site-packages/transformers/models/auto/tokenization_auto.py:909"
]
},
{
"cell_type": "markdown",
"id": "db03e7fd-d06f-4e78-842f-66c8e02043bd",
"metadata": {},
"source": [
"#### 1.3 AutoModelForCausalLM代码\n",
"\n",
"```python\n",
"class AutoModelForCausalLM:\n",
" def __init__(self):\n",
" raise EnvironmentError(\n",
" \"AutoModelForCausalLM is designed to be instantiated \"\n",
" \"using the `AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path)` or \"\n",
" \"`AutoModelForCausalLM.from_config(config)` methods.\"\n",
" )\n",
"\n",
"\t@classmethod\n",
" @replace_list_option_in_docstrings(MODEL_FOR_CAUSAL_LM_MAPPING, use_model_types=False)\n",
" def from_config(cls, config):\n",
"\n",
" if type(config) in MODEL_FOR_CAUSAL_LM_MAPPING.keys():\n",
" return MODEL_FOR_CAUSAL_LM_MAPPING[type(config)](config)\n",
" raise ValueError(\n",
" \"Unrecognized configuration class {} for this kind of AutoModel: {}.\\n\"\n",
" \"Model type should be one of {}.\".format(\n",
" config.__class__, cls.__name__, \", \".join(c.__name__ for c in MODEL_FOR_CAUSAL_LM_MAPPING.keys())\n",
" )\n",
" )\n",
"\n",
"\n",
"\t@classmethod\n",
" @replace_list_option_in_docstrings(MODEL_FOR_CAUSAL_LM_MAPPING)\n",
" @add_start_docstrings(\n",
" \"Instantiate one of the model classes of the library---with a causal language modeling head---from a \"\n",
" \"pretrained model.\",\n",
" AUTO_MODEL_PRETRAINED_DOCSTRING,\n",
" )\n",
" def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):\n",
" config = kwargs.pop(\"config\", None)\n",
" if not isinstance(config, PretrainedConfig):\n",
" config, kwargs = AutoConfig.from_pretrained(\n",
" pretrained_model_name_or_path, return_unused_kwargs=True, **kwargs\n",
" )\n",
"\n",
" if type(config) in MODEL_FOR_CAUSAL_LM_MAPPING.keys():\n",
" return MODEL_FOR_CAUSAL_LM_MAPPING[type(config)].from_pretrained(\n",
" pretrained_model_name_or_path, *model_args, config=config, **kwargs\n",
" )\n",
" raise ValueError(\n",
" \"Unrecognized configuration class {} for this kind of AutoModel: {}.\\n\"\n",
" \"Model type should be one of {}.\".format(\n",
" config.__class__, cls.__name__, \", \".join(c.__name__ for c in MODEL_FOR_CAUSAL_LM_MAPPING.keys())\n",
" )\n",
" )\n",
"```"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "744c6db7-53f9-4911-adcb-4f0618693071",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "7dd376f050c3496b904a5a545f499e07",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"tokenizer_config.json: 0%| | 0.00/265 [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "4936fbb98c5446ebb60f4bdb288ddc73",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"tokenizer.model: 0%| | 0.00/500k [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "080e814bd03542aeb4a9f882c67ed06a",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"tokenizer.json: 0.00B [00:00, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "d04a2f9f4a57490bb70e88af4ab10008",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"added_tokens.json: 0%| | 0.00/21.0 [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "6a728b39e23043459b8c2bddef6e8845",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"special_tokens_map.json: 0%| | 0.00/435 [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n"
]
},
{
"data": {
"text/plain": [
"'\\nDataWhalechina is an organization founded at Shanghai Jiao Tong University that helps learners learn artificial intelligence. The organization aims to provide AI-related courses to students in China.\\n\\nThis repository contains the code for the following courses:\\n\\n1. [Introduction to AI: Neural Networks and Classification](https://www.datawhalechina.com/courses/introduction-to-ai-neural-networks-and-classification/)\\n2. [Introduction to AI: Deep Learning and Applications](https://www.datawhalechina.com/courses/introduction-to-ai-deep-learning-and-applications/)\\n3. [Introduction to AI: Algorithms and Applications](https://www.datawhalechina.com/courses/introduction-to-ai-algorithms-and-applications/)\\n4. [Introduction to AI: Data Preparation and Model Evaluation](https://www.datawhalechina.com/courses/introduction-to-ai-data-preparation-and-model-evaluation/)\\n5. [Introduction to AI: Building and Evaluating AI Models](https://www.datawhalechina.com/courses/introduction-to-ai-building-and-evaluating-ai'"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from transformers import AutoTokenizer\n",
"from modeling_openelm import OpenELMForCausalLM\n",
"\n",
"model = OpenELMForCausalLM.from_pretrained(\"Apple/OpenELM-270M-Instruct\")#trust_remote_code=True\n",
"# tokenizer = AutoTokenizer.from_pretrained(\"Apple/OpenELM-270M-Instruct\")Llama-2-7b-hf\n",
"tokenizer = AutoTokenizer.from_pretrained(\"NousResearch/Llama-2-7b-chat-hf\")\n",
"prompt = '\\nDataWhalechina is an organization founded at Shanghai Jiao Tong University that helps learners learn artificial intelligence.'\n",
"inputs = tokenizer(prompt, return_tensors=\"pt\")\n",
"\n",
"# Generate\n",
"generate_ids = model.generate(inputs.input_ids, max_length=300)\n",
"tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]"
]
},
{
"cell_type": "raw",
"id": "6c0f8954-aca3-496b-86e4-843cdb00b104",
"metadata": {},
"source": [
"上面这个 OpenELM 的回复感觉还比较贴合 Datawhale 的实际情况哈,速度也是很快的,没得说,不过链接是编的,哈哈。"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "060b86f9-fda5-4d9f-8292-4d9464c7b2ef",
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n"
]
},
{
"data": {
"text/plain": [
"\"\\nDataWhalechina is an organization founded at Shanghai Jiao Tong University that helps learners \\nimprove their Chinese language skills through data-driven learning.\\n\\n## Data\\n\\nThe DataWhalechina platform collects data from various sources, including:\\n\\n1. [China's National Database of Vocabulary and Phrase Structure](https://www.national-database.gov.cn/): This database contains vocabulary and phrase structure definitions for 1,000,000+ Chinese words and phrases.\\n\\n2. [China's National Academic Database of Literature and Culture](https://academic.lib.shu.edu.cn/): This database contains articles, books, and speeches written in Chinese by Chinese scholars.\\n\\n3. [China's National Knowledge Incorporation Database](https://knowledge.cn/): This database contains data on intellectual property rights, patents, and copyrights.\\n\\n4. [China's National Bureau of Statistics](https://www.stat.gov.cn/): This database contains statistics on population, living standards, and purchasing power.\\n\\n5. [China's National Bureau of Census](https://www.census.gov.cn/): This database contains\""
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"prompt = '\\nDataWhalechina is an organization founded at Shanghai Jiao Tong University that helps learners '\n",
"inputs = tokenizer(prompt, return_tensors=\"pt\")\n",
"\n",
"# Generate\n",
"generate_ids = model.generate(inputs.input_ids, max_length=300)\n",
"tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]"
]
},
{
"cell_type": "raw",
"id": "052ab03d-f739-40e5-9f48-e8ab3d0f5f19",
"metadata": {},
"source": [
"如果提示内容给的比较短,可能会在事实上面出一点小问题"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "kewei-ai",
"language": "python",
"name": "kewei-ai"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.5"
}
},
"nbformat": 4,
"nbformat_minor": 5
}