This commit is contained in:
jingyaogong 2025-04-26 10:07:55 +08:00
parent a62faf34bd
commit 274483cb1b
4 changed files with 31076 additions and 8 deletions

0
dataset/__init__.py Normal file
View File

View File

@ -35,9 +35,8 @@ class PretrainDataset(Dataset):
sample = self.samples[index]
# 构建输入文本
text = f"{self.tokenizer.bos_token}{str(sample['text'])}{self.tokenizer.eos_token}"
encoding = self.tokenizer(
text,
str(sample['text']),
max_length=self.max_length,
padding='max_length',
truncation=True,
@ -58,8 +57,8 @@ class SFTDataset(Dataset):
self.tokenizer = tokenizer
self.max_length = max_length
self.samples = self.load_data(jsonl_path)
self.bos_id = tokenizer('<s>assistant', add_special_tokens=False).input_ids
self.eos_id = tokenizer('</s>', add_special_tokens=False).input_ids
self.bos_id = tokenizer('<|im_start|>assistant', add_special_tokens=False).input_ids
self.eos_id = tokenizer('<|im_end|>', add_special_tokens=False).input_ids
def __len__(self):
return len(self.samples)
@ -126,8 +125,8 @@ class DPODataset(Dataset):
self.tokenizer = tokenizer
self.max_length = max_length
self.padding = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 0
self.bos_id = tokenizer('<s>assistant', add_special_tokens=False).input_ids
self.eos_id = tokenizer('</s>', add_special_tokens=False).input_ids
self.bos_id = tokenizer('<|im_start|>assistant', add_special_tokens=False).input_ids
self.eos_id = tokenizer('<|im_end|>', add_special_tokens=False).input_ids
with open(file_path, 'r', encoding='utf-8') as f:
self.data = []
for line in f:
@ -202,8 +201,8 @@ class RLAIFDataset(Dataset):
self.tokenizer = tokenizer
self.max_length = max_length
self.samples = self.load_data(jsonl_path)
self.bos_id = tokenizer('<s>assistant', add_special_tokens=False).input_ids
self.eos_id = tokenizer('</s>', add_special_tokens=False).input_ids
self.bos_id = tokenizer('<|im_start|>assistant', add_special_tokens=False).input_ids
self.eos_id = tokenizer('<|im_end|>', add_special_tokens=False).input_ids
def __len__(self):
return len(self.samples)

31026
model/tokenizer.json Normal file

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,43 @@
{
"add_bos_token": false,
"add_eos_token": false,
"add_prefix_space": false,
"added_tokens_decoder": {
"0": {
"content": "<|endoftext|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"1": {
"content": "<|im_start|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"2": {
"content": "<|im_end|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
}
},
"additional_special_tokens": [],
"bos_token": "<|im_start|>",
"clean_up_tokenization_spaces": false,
"eos_token": "<|im_end|>",
"legacy": true,
"model_max_length": 32768,
"pad_token": "<|endoftext|>",
"sp_model_kwargs": {},
"spaces_between_special_tokens": false,
"tokenizer_class": "PreTrainedTokenizerFast",
"unk_token": "<|endoftext|>",
"chat_template": "{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{{ '<|im_start|>system\\n' + system_message + '<|im_end|>\\n' }}{% else %}{{ '<|im_start|>system\\nYou are a helpful assistant<|im_end|>\\n' }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|im_start|>user\\n' + content + '<|im_end|>\\n<|im_start|>assistant\\n' }}{% elif message['role'] == 'assistant' %}{{ content + '<|im_end|>' + '\\n' }}{% endif %}{% endfor %}"
}