diff --git a/README.md b/README.md
index 462c046..5e981b4 100644
--- a/README.md
+++ b/README.md
@@ -565,7 +565,7 @@ MiniMind训练数据集下载地址: [ModelScope](https://www.modelscope.cn/da
 ├── lora_medical.jsonl (34MB)
 ├── pretrain_hq.jsonl (1.6GB, ✨)
 ├── r1_mix_1024.jsonl (340MB)
-├── rlaif-mini.jsonl (1MB)
+├── rlaif-mini.jsonl (1MB, ✨)
 ├── sft_1024.jsonl (5.6GB)
 ├── sft_2048.jsonl (9GB)
 ├── sft_512.jsonl (7.5GB)
@@ -578,13 +578,28 @@ MiniMind训练数据集下载地址: [ModelScope](https://www.modelscope.cn/da
 * `dpo.jsonl`✨ --RLHF阶段数据集(已精简优化,适合快速训练)
 * `lora_identity.jsonl` --自我认知数据集(例如:你是谁?我是minimind...),推荐用于lora训练(亦可用于全参SFT,勿被名字局限)
 * `lora_medical.jsonl` --医疗问答数据集,推荐用于lora训练(亦可用于全参SFT,勿被名字局限)
-* `pretrain_hq.jsonl`✨ --预训练数据集,整合自匠数科技
-* `r1_mix_1024.jsonl` --DeepSeek-R1-1.5B蒸馏数据,每条数据字符最大长度为1024(因此训练时设置max_seq_len=1024)
+* `pretrain_hq.jsonl`✨ --预训练数据集,整合自匠数科技(推荐设置`max_seq_len≈320`)
+* `r1_mix_1024.jsonl` --DeepSeek-R1-1.5B蒸馏数据,每条数据字符最大长度为1024(推荐设置`max_seq_len≈720`)
 * `rlaif-mini.jsonl` --RLAIF训练数据集,从SFT数据集中随机采样1万条高质量对话,用于PPO/GRPO/SPO等强化学习算法训练
-* `sft_1024.jsonl` --整合自Qwen2.5蒸馏数据(是sft_2048的子集),每条数据字符最大长度为1024(因此训练时设置max_seq_len=1024)
-* `sft_2048.jsonl` --整合自Qwen2.5蒸馏数据,每条数据字符最大长度为2048(因此训练时设置max_seq_len=2048)
-* `sft_512.jsonl` --整合自匠数科技SFT数据,每条数据字符最大长度为512(因此训练时设置max_seq_len=512)
-* `sft_mini_512.jsonl`✨ --极简整合自匠数科技SFT数据+Qwen2.5蒸馏数据(用于快速训练Zero模型),每条数据字符最大长度为512(因此训练时设置max_seq_len=512)
+* `sft_1024.jsonl` --整合自Qwen2.5蒸馏数据(是sft_2048的子集),每条数据字符最大长度为1024(推荐设置`max_seq_len≈650`)
+* `sft_2048.jsonl` --整合自Qwen2.5蒸馏数据,每条数据字符最大长度为2048(推荐设置`max_seq_len≈1400`)
+* `sft_512.jsonl` --整合自匠数科技SFT数据,每条数据字符最大长度为512(推荐设置`max_seq_len≈350`)
+* `sft_mini_512.jsonl`✨ --极简整合自匠数科技SFT数据+Qwen2.5蒸馏数据(用于快速训练Zero模型),每条数据字符最大长度为512(推荐设置`max_seq_len≈340`)
+
+
+训练参数`max_seq_len`目前指的是tokens长度,而非绝对字符数。
+本项目tokenizer在中文文本上大约`1.5~1.7 字符/token`,纯英文的压缩比在`4~5 字符/token`,不同数据分布会有波动。
+数据集命名标注的“最大长度”均为字符数,100长度的字符串可粗略换算成`100/1.5≈67`的tokens长度。
+
+例如:
+
+* 中文:`白日依山尽`5个字符可能被拆分为[`白日`,`依`,`山`,`尽`] 4个tokens;
+* 英文:`The sun sets in the west`24个字符可能被拆分为[`The `,`sun `,`sets `,`in `,`the`,`west`] 6个tokens
+
+“推荐设置”给出了各个数据集上最大tokens长度的粗略估计。
+须知max_seq_len可以激进/保守/均衡地调整,因为更大或更小均无法避免副作用:一些样本短于max_seq_len后被padding浪费算力,一些样本长于max_seq_len后被截断语意。
+
+在 `算力效率` <---> `语义完整性` 之间找到一个平衡点即可
diff --git a/README_en.md b/README_en.md
index fd1929b..f17ca0d 100644
--- a/README_en.md
+++ b/README_en.md
@@ -564,7 +564,7 @@ Place the downloaded dataset files in the `./dataset/` directory (✨ are recomm
 ├── lora_medical.jsonl (34MB)
 ├── pretrain_hq.jsonl (1.6GB, ✨)
 ├── r1_mix_1024.jsonl (340MB)
-├── rlaif-mini.jsonl (1MB)
+├── rlaif-mini.jsonl (1MB, ✨)
 ├── sft_1024.jsonl (5.6GB)
 ├── sft_2048.jsonl (9GB)
 ├── sft_512.jsonl (7.5GB)
@@ -577,13 +577,28 @@ Place the downloaded dataset files in the `./dataset/` directory (✨ are recomm
 * `dpo.jsonl`✨ --RLHF stage dataset (optimized and simplified, suitable for fast training)
 * `lora_identity.jsonl` --Self-awareness dataset (e.g., Who are you? I am minimind...), recommended for lora training (can also be used for full-parameter SFT, don't be limited by the name)
 * `lora_medical.jsonl` --Medical Q&A dataset, recommended for lora training (can also be used for full-parameter SFT, don't be limited by the name)
-* `pretrain_hq.jsonl`✨ --Pretraining dataset, integrated from JiangShu Technology
-* `r1_mix_1024.jsonl` --DeepSeek-R1-1.5B distilled data, maximum character length per entry is 1024 (therefore set max_seq_len=1024 when training)
+* `pretrain_hq.jsonl`✨ --Pretraining dataset, integrated from JiangShu Technology (recommended `max_seq_len≈320`)
+* `r1_mix_1024.jsonl` --DeepSeek-R1-1.5B distilled data, maximum character length per entry is 1024 (recommended `max_seq_len≈720`)
 * `rlaif-mini.jsonl` --RLAIF training dataset, randomly sampled 10,000 high-quality conversations from SFT dataset for training reinforcement learning algorithms like PPO/GRPO/SPO
-* `sft_1024.jsonl` --Integrated from Qwen2.5 distilled data (a subset of sft_2048), maximum character length per entry is 1024 (therefore set max_seq_len=1024 when training)
-* `sft_2048.jsonl` --Integrated from Qwen2.5 distilled data, maximum character length per entry is 2048 (therefore set max_seq_len=2048 when training)
-* `sft_512.jsonl` --Integrated from JiangShu Technology SFT data, maximum character length per entry is 512 (therefore set max_seq_len=512 when training)
-* `sft_mini_512.jsonl`✨ --Minimal integration from JiangShu Technology SFT data + Qwen2.5 distilled data (for quick training of Zero models), maximum character length per entry is 512 (therefore set max_seq_len=512 when training)
+* `sft_1024.jsonl` --Integrated from Qwen2.5 distilled data (a subset of sft_2048), maximum character length per entry is 1024 (recommended `max_seq_len≈650`)
+* `sft_2048.jsonl` --Integrated from Qwen2.5 distilled data, maximum character length per entry is 2048 (recommended `max_seq_len≈1400`)
+* `sft_512.jsonl` --Integrated from JiangShu Technology SFT data, maximum character length per entry is 512 (recommended `max_seq_len≈350`)
+* `sft_mini_512.jsonl`✨ --Minimal integration from JiangShu Technology SFT data + Qwen2.5 distilled data (for quick training of Zero models), maximum character length per entry is 512 (recommended `max_seq_len≈340`)
+
+
+Training parameter `max_seq_len` currently refers to the **token length**, not the absolute number of characters.
+For this project's tokenizer, typical Chinese text is roughly `1.5~1.7 chars/token`, while pure English text is roughly `4~5 chars/token` (it varies with data distribution).
+The “max length” annotated in dataset names is measured in **characters**. For example, a 100-character Chinese string can be roughly converted to `100/1.5≈67` tokens.
+
+For example:
+
+* Chinese: `白日依山尽` (5 chars) may be tokenized into [`白日`, `依`, `山`, `尽`] (4 tokens)
+* English: `The sun sets in the west` (24 chars) may be tokenized into [`The `, `sun `, `sets `, `in `, `the`, `west`] (6 tokens)
+
+The “recommended setting” above provides a rough estimate of the max token length for each dataset.
+Note that `max_seq_len` can be tuned aggressively / conservatively / in a balanced way: a larger value increases padding waste, while a smaller value increases truncation.
+
+Just find a balance between `compute efficiency` <---> `semantic completeness`.
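The recommended values above can be sanity-checked against any of the `.jsonl` files. The sketch below is not part of this changeset: it loads a tokenizer via `transformers.AutoTokenizer`, samples a dataset, and reports the observed chars/token ratio plus a percentile of the token-length distribution. The tokenizer directory `./model`, the `text` field, the sample size, and the 95th-percentile cutoff are illustrative assumptions, not project settings.

```python
# Sketch: estimate chars/token and a candidate max_seq_len for a .jsonl dataset.
# Assumptions: the tokenizer lives in ./model and pretrain-style rows carry a "text" field.
import json
import numpy as np
from transformers import AutoTokenizer

def suggest_max_seq_len(jsonl_path, tokenizer_dir="./model", percentile=95, sample=5000):
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir)
    token_lens, char_lens = [], []
    with open(jsonl_path, "r", encoding="utf-8") as f:
        for i, line in enumerate(f):
            if i >= sample:  # only scan the head of the file to keep this quick
                break
            row = json.loads(line)
            text = row.get("text", line) if isinstance(row, dict) else line
            ids = tokenizer(text, add_special_tokens=False)["input_ids"]
            token_lens.append(len(ids))
            char_lens.append(len(text))
    chars_per_token = sum(char_lens) / max(sum(token_lens), 1)  # ~1.5-1.7 for Chinese text
    return chars_per_token, int(np.percentile(token_lens, percentile))

ratio, cutoff = suggest_max_seq_len("./dataset/pretrain_hq.jsonl")
print(f"chars/token ≈ {ratio:.2f}, 95th-percentile token length ≈ {cutoff}")
```

Raising the percentile trades more padding for less truncation, which is exactly the `compute efficiency` <---> `semantic completeness` balance described above.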
diff --git a/dataset/lm_dataset.py b/dataset/lm_dataset.py
index 42b0c0d..89f68f3 100644
--- a/dataset/lm_dataset.py
+++ b/dataset/lm_dataset.py
@@ -1,14 +1,9 @@
 import json
-import random
-import re
-
 import pandas as pd
 import numpy as np
 from torch.utils.data import Dataset, DataLoader
 import torch
-from sklearn.model_selection import train_test_split
 import os
-import ast
 
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
diff --git a/trainer/train_distill_reason.py b/trainer/train_distill_reason.py
index 111fe92..e86fe9d 100644
--- a/trainer/train_distill_reason.py
+++ b/trainer/train_distill_reason.py
@@ -110,7 +110,7 @@ if __name__ == "__main__":
     parser.add_argument("--save_interval", type=int, default=100, help="模型保存间隔")
     parser.add_argument('--hidden_size', default=512, type=int, help="隐藏层维度")
     parser.add_argument('--num_hidden_layers', default=8, type=int, help="隐藏层数量")
-    parser.add_argument('--max_seq_len', default=1024, type=int, help="训练的最大截断长度")
+    parser.add_argument('--max_seq_len', default=720, type=int, help="训练的最大截断长度(中文1token≈1.5~1.7字符)")
     parser.add_argument('--use_moe', default=0, type=int, choices=[0, 1], help="是否使用MoE架构(0=否,1=是)")
     parser.add_argument("--data_path", type=str, default="../dataset/r1_mix_1024.jsonl", help="推理蒸馏数据路径")
     parser.add_argument('--from_weight', default='dpo', type=str, help="基于哪个权重训练,默认dpo")
diff --git a/trainer/train_distillation.py b/trainer/train_distillation.py
index 58d2b87..e4b4ecd 100644
--- a/trainer/train_distillation.py
+++ b/trainer/train_distillation.py
@@ -146,7 +146,7 @@ if __name__ == "__main__":
     parser.add_argument("--grad_clip", type=float, default=1.0, help="梯度裁剪阈值")
     parser.add_argument("--log_interval", type=int, default=100, help="日志打印间隔")
     parser.add_argument("--save_interval", type=int, default=100, help="模型保存间隔")
-    parser.add_argument("--max_seq_len", type=int, default=512, help="训练的最大截断长度")
+    parser.add_argument("--max_seq_len", type=int, default=340, help="训练的最大截断长度(中文1token≈1.5~1.7字符)")
     parser.add_argument("--data_path", type=str, default="../dataset/sft_mini_512.jsonl", help="训练数据路径")
     parser.add_argument('--student_hidden_size', default=512, type=int, help="学生模型隐藏层维度")
     parser.add_argument('--student_num_layers', default=8, type=int, help="学生模型隐藏层数量")
diff --git a/trainer/train_dpo.py b/trainer/train_dpo.py
index c48f32d..a55a450 100644
--- a/trainer/train_dpo.py
+++ b/trainer/train_dpo.py
@@ -136,7 +136,7 @@ if __name__ == "__main__":
     parser.add_argument("--save_interval", type=int, default=100, help="模型保存间隔")
     parser.add_argument('--hidden_size', default=512, type=int, help="隐藏层维度")
     parser.add_argument('--num_hidden_layers', default=8, type=int, help="隐藏层数量")
-    parser.add_argument('--max_seq_len', default=1024, type=int, help="训练的最大截断长度")
+    parser.add_argument('--max_seq_len', default=1024, type=int, help="训练的最大截断长度(中文1token≈1.5~1.7字符)")
     parser.add_argument('--use_moe', default=0, type=int, choices=[0, 1], help="是否使用MoE架构(0=否,1=是)")
     parser.add_argument("--data_path", type=str, default="../dataset/dpo.jsonl", help="DPO训练数据路径")
     parser.add_argument('--from_weight', default='full_sft', type=str, help="基于哪个权重训练")
diff --git a/trainer/train_full_sft.py b/trainer/train_full_sft.py
index 8b380c7..f0489fb 100644
--- a/trainer/train_full_sft.py
+++ b/trainer/train_full_sft.py
@@ -98,7 +98,7 @@ if __name__ == "__main__":
     parser.add_argument("--save_interval", type=int, default=100, help="模型保存间隔")
     parser.add_argument('--hidden_size', default=512, type=int, help="隐藏层维度")
     parser.add_argument('--num_hidden_layers', default=8, type=int, help="隐藏层数量")
-    parser.add_argument('--max_seq_len', default=512, type=int, help="训练的最大截断长度")
+    parser.add_argument('--max_seq_len', default=340, type=int, help="训练的最大截断长度(中文1token≈1.5~1.7字符)")
     parser.add_argument('--use_moe', default=0, type=int, choices=[0, 1], help="是否使用MoE架构(0=否,1=是)")
     parser.add_argument("--data_path", type=str, default="../dataset/sft_mini_512.jsonl", help="训练数据路径")
     parser.add_argument('--from_weight', default='pretrain', type=str, help="基于哪个权重训练,为none则不基于任何权重训练")
diff --git a/trainer/train_lora.py b/trainer/train_lora.py
index 4e2d30f..474ae18 100644
--- a/trainer/train_lora.py
+++ b/trainer/train_lora.py
@@ -92,7 +92,7 @@ if __name__ == "__main__":
     parser.add_argument("--save_interval", type=int, default=1, help="模型保存间隔")
     parser.add_argument('--hidden_size', default=512, type=int, help="隐藏层维度")
     parser.add_argument('--num_hidden_layers', default=8, type=int, help="隐藏层数量")
-    parser.add_argument('--max_seq_len', default=512, type=int, help="训练的最大截断长度")
+    parser.add_argument('--max_seq_len', default=340, type=int, help="训练的最大截断长度(中文1token≈1.5~1.7字符)")
     parser.add_argument('--use_moe', default=0, type=int, choices=[0, 1], help="是否使用MoE架构(0=否,1=是)")
     parser.add_argument("--data_path", type=str, default="../dataset/lora_identity.jsonl", help="LoRA训练数据路径")
     parser.add_argument('--from_weight', default='full_sft', type=str, help="基于哪个权重训练,默认full_sft")
diff --git a/trainer/train_pretrain.py b/trainer/train_pretrain.py
index d8b7b06..d02d9b5 100644
--- a/trainer/train_pretrain.py
+++ b/trainer/train_pretrain.py
@@ -97,7 +97,7 @@ if __name__ == "__main__":
     parser.add_argument("--save_interval", type=int, default=100, help="模型保存间隔")
     parser.add_argument('--hidden_size', default=512, type=int, help="隐藏层维度")
     parser.add_argument('--num_hidden_layers', default=8, type=int, help="隐藏层数量")
-    parser.add_argument('--max_seq_len', default=512, type=int, help="训练的最大截断长度")
+    parser.add_argument('--max_seq_len', default=340, type=int, help="训练的最大截断长度(中文1token≈1.5~1.7字符)")
     parser.add_argument('--use_moe', default=0, type=int, choices=[0, 1], help="是否使用MoE架构(0=否,1=是)")
     parser.add_argument("--data_path", type=str, default="../dataset/pretrain_hq.jsonl", help="预训练数据路径")
     parser.add_argument('--from_weight', default='none', type=str, help="基于哪个权重训练,为none则从头开始")