from torch.utils.data import Dataset
import torch
import json
import os
import random
from datasets import load_dataset, Features, Sequence, Value
os.environ["TOKENIZERS_PARALLELISM"] = "false"
def pre_processing_chat(conversations, add_system_ratio=0.2):
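    """Optionally prepend a random system prompt to a conversation.

    Conversations containing tool definitions are returned unchanged; otherwise, if the first
    message is not a system message, one is sampled from SYSTEM_PROMPTS with probability
    add_system_ratio.
    """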
    # Keep tool-use data intact; do not modify it
if any(conv.get('tools') for conv in conversations): return conversations
SYSTEM_PROMPTS = [
"你是一个知识丰富的AI,尽力为用户提供准确的信息。",
"你是minimind,一个小巧但有用的语言模型。",
"你是一个专业的AI助手,请提供有价值的回答。",
"你是minimind,请尽力帮助用户解决问题。",
"你是一个可靠的AI,请给出准确的回答。",
"You are a helpful AI assistant.",
"You are minimind, a lightweight intelligent assistant.",
"You are a friendly chatbot. Please answer the user's questions carefully.",
"You are a knowledgeable AI. Try your best to provide accurate information.",
"You are minimind, a small but useful language model."
]
    # Probabilistically prepend a system prompt
if conversations[0].get('role') != 'system':
if random.random() < add_system_ratio:
return [{'role': 'system', 'content': random.choice(SYSTEM_PROMPTS)}] + conversations
return conversations
def post_processing_chat(prompt_content, empty_think_ratio=0.2):
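    """With probability 1 - empty_think_ratio, remove an empty think block from the rendered prompt."""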
    # With probability 1 - empty_think_ratio (80% by default), strip the empty <think></think> block
    if '<think>\n\n</think>\n\n' in prompt_content and random.random() > empty_think_ratio:
        prompt_content = prompt_content.replace('<think>\n\n</think>\n\n', '')
return prompt_content
class PretrainDataset(Dataset):
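    """Pre-training dataset: tokenizes each sample's 'text' field, wraps it with BOS/EOS,
    pads to max_length, and masks padding positions in the labels with -100."""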
def __init__(self, data_path, tokenizer, max_length=512):
super().__init__()
self.tokenizer = tokenizer
self.max_length = max_length
self.samples = load_dataset('json', data_files=data_path, split='train')
def __len__(self):
return len(self.samples)
def __getitem__(self, index):
sample = self.samples[index]
tokens = self.tokenizer(str(sample['text']), add_special_tokens=False, max_length=self.max_length - 2, truncation=True).input_ids
tokens = [self.tokenizer.bos_token_id] + tokens + [self.tokenizer.eos_token_id]
input_ids = tokens + [self.tokenizer.pad_token_id] * (self.max_length - len(tokens))
input_ids = torch.tensor(input_ids, dtype=torch.long)
labels = input_ids.clone()
labels[input_ids == self.tokenizer.pad_token_id] = -100
return input_ids, labels
class SFTDataset(Dataset):
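    """Supervised fine-tuning dataset.

    Conversations are rendered with the tokenizer's chat template; labels supervise only the
    assistant responses, i.e. the spans between the bos_token + "assistant" marker and the
    closing eos_token marker, while every other position is set to -100.
    """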
def __init__(self, jsonl_path, tokenizer, max_length=1024):
super().__init__()
self.tokenizer = tokenizer
self.max_length = max_length
        features = Features({'conversations': [{
            'role': Value('string'), 'content': Value('string'), 'reasoning_content': Value('string'),
            'tools': Value('string'), 'tool_calls': Value('string')
        }]})
self.samples = load_dataset('json', data_files=jsonl_path, split='train', features=features)
self.bos_id = tokenizer(f'{tokenizer.bos_token}assistant\n', add_special_tokens=False).input_ids
self.eos_id = tokenizer(f'{tokenizer.eos_token}\n', add_special_tokens=False).input_ids
def __len__(self):
return len(self.samples)
def create_chat_prompt(self, conversations):
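        """Render a conversation with the chat template, extracting tool definitions from the
        system message and decoding JSON-encoded tool_calls fields."""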
messages = []
tools = None
for message in conversations:
message = dict(message)
if message.get("role") == "system" and message.get("tools"):
tools = json.loads(message["tools"]) if isinstance(message["tools"], str) else message["tools"]
if message.get("tool_calls") and isinstance(message["tool_calls"], str):
message["tool_calls"] = json.loads(message["tool_calls"])
messages.append(message)
return self.tokenizer.apply_chat_template(
messages,
tokenize=False,
add_generation_prompt=False,
tools=tools
)
def generate_labels(self, input_ids):
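        """Build labels that keep token ids only inside assistant spans (between the bos/eos
        marker sequences); all other positions are -100."""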
labels = [-100] * len(input_ids)
i = 0
while i < len(input_ids):
if input_ids[i:i + len(self.bos_id)] == self.bos_id:
start = i + len(self.bos_id)
end = start
while end < len(input_ids):
if input_ids[end:end + len(self.eos_id)] == self.eos_id:
break
end += 1
for j in range(start, min(end + len(self.eos_id), self.max_length)):
labels[j] = input_ids[j]
i = end + len(self.eos_id) if end < len(input_ids) else len(input_ids)
else:
i += 1
return labels
def __getitem__(self, index):
sample = self.samples[index]
conversations = pre_processing_chat(sample['conversations'])
prompt = self.create_chat_prompt(conversations)
prompt = post_processing_chat(prompt)
input_ids = self.tokenizer(prompt).input_ids[:self.max_length]
input_ids += [self.tokenizer.pad_token_id] * (self.max_length - len(input_ids))
labels = self.generate_labels(input_ids)
        # # === Debug printing ===
# print(f"\n--- Sample {index} ---")
# for i, (x, y) in enumerate(zip(input_ids[:-1], labels[1:])):
# print(f"{i:3d}: X={self.tokenizer.decode([x])!r:16s} ---> Y={self.tokenizer.decode([input_ids[i+1]])!r:16s} label={y}")
# # ================
return torch.tensor(input_ids, dtype=torch.long), torch.tensor(labels, dtype=torch.long)
class DPODataset(Dataset):
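    """Preference dataset for DPO: returns shifted input/target pairs and loss masks for the
    chosen and rejected completions, where only assistant spans contribute to the loss."""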
def __init__(self, file_path, tokenizer, max_length=4096):
super().__init__()
self.tokenizer = tokenizer
self.max_length = max_length
self.padding = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 0
self.bos_id = tokenizer(f'{tokenizer.bos_token}assistant\n', add_special_tokens=False).input_ids
self.eos_id = tokenizer(f'{tokenizer.eos_token}\n', add_special_tokens=False).input_ids
self.samples = load_dataset('json', data_files=file_path, split='train')
def __len__(self):
return len(self.samples)
def __getitem__(self, index):
sample = self.samples[index]
        chosen = sample['chosen']  # a list of {role, content} messages
        rejected = sample['rejected']  # same structure as 'chosen'
chosen_prompt = self.tokenizer.apply_chat_template(
chosen, tokenize=False, add_generation_prompt=False
)
chosen_prompt = post_processing_chat(chosen_prompt)
rejected_prompt = self.tokenizer.apply_chat_template(
rejected, tokenize=False, add_generation_prompt=False
)
rejected_prompt = post_processing_chat(rejected_prompt)
chosen_encoding = self.tokenizer(
chosen_prompt, truncation=True, max_length=self.max_length, padding='max_length'
)
rejected_encoding = self.tokenizer(
rejected_prompt, truncation=True, max_length=self.max_length, padding='max_length'
)
chosen_input_ids = chosen_encoding['input_ids']
chosen_loss_mask = self.generate_loss_mask(chosen_input_ids)
rejected_input_ids = rejected_encoding['input_ids']
rejected_loss_mask = self.generate_loss_mask(rejected_input_ids)
x_chosen = torch.tensor(chosen_input_ids[:-1], dtype=torch.long)
y_chosen = torch.tensor(chosen_input_ids[1:], dtype=torch.long)
mask_chosen = torch.tensor(chosen_loss_mask[1:], dtype=torch.long)
x_rejected = torch.tensor(rejected_input_ids[:-1], dtype=torch.long)
y_rejected = torch.tensor(rejected_input_ids[1:], dtype=torch.long)
mask_rejected = torch.tensor(rejected_loss_mask[1:], dtype=torch.long)
return {
'x_chosen': x_chosen,
'y_chosen': y_chosen,
'mask_chosen': mask_chosen,
'x_rejected': x_rejected,
'y_rejected': y_rejected,
'mask_rejected': mask_rejected
}
def generate_loss_mask(self, input_ids):
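        """Return a 0/1 mask that marks assistant spans (between the bos/eos marker sequences)."""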
loss_mask = [0] * len(input_ids)
i = 0
while i < len(input_ids):
if input_ids[i:i + len(self.bos_id)] == self.bos_id:
start = i + len(self.bos_id)
end = start
while end < len(input_ids):
if input_ids[end:end + len(self.eos_id)] == self.eos_id:
break
end += 1
for j in range(start, min(end + len(self.eos_id), self.max_length)):
loss_mask[j] = 1
i = end + len(self.eos_id) if end < len(input_ids) else len(input_ids)
else:
i += 1
return loss_mask
class RLAIFDataset(Dataset):
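    """Prompt-only dataset for RLAIF-style rollouts: renders every message except the last as a
    generation prompt, enabling thinking mode per sample with probability thinking_ratio."""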
def __init__(self, jsonl_path, tokenizer, max_length=1024, thinking_ratio=0.5):
super().__init__()
self.tokenizer = tokenizer
self.max_length = max_length
        self.thinking_ratio = thinking_ratio  # probability of enabling thinking mode per sample
self.samples = load_dataset('json', data_files=jsonl_path, split='train')
self.bos_id = tokenizer(f'{tokenizer.bos_token}assistant', add_special_tokens=False).input_ids
self.eos_id = tokenizer(f'{tokenizer.eos_token}', add_special_tokens=False).input_ids
def __len__(self):
return len(self.samples)
def create_chat_prompt(self, conversations):
conversations = pre_processing_chat(conversations)
use_thinking = random.random() < self.thinking_ratio
return self.tokenizer.apply_chat_template(
conversations[:-1],
tokenize=False,
open_thinking=use_thinking,
add_generation_prompt=True
)
def __getitem__(self, index):
sample = self.samples[index]
prompt = self.create_chat_prompt(sample['conversations'])
return {
'prompt': prompt,
'answer': ""
}
class AgentRLDataset(Dataset):
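    """Agent RL dataset: yields the conversation history without its final turn, the tool
    definitions parsed from the system message, and the ground-truth answer stored under 'gt'."""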
def __init__(self, jsonl_path, tokenizer, max_length=1024):
super().__init__()
self.tokenizer = tokenizer
self.max_length = max_length
self.samples = []
with open(jsonl_path, 'r', encoding='utf-8') as f:
for line in f:
self.samples.append(json.loads(line.strip()))
def __len__(self):
return len(self.samples)
def parse_conversations(self, conversations):
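        """Split a conversation into (messages without the final turn, parsed tool definitions)."""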
messages = []
tools = None
for message in conversations:
message = dict(message)
if message.get("role") == "system" and message.get("tools"):
tools = json.loads(message["tools"]) if isinstance(message["tools"], str) else message["tools"]
messages.append(message)
return messages[:-1], tools
def __getitem__(self, index):
sample = self.samples[index]
messages, tools = self.parse_conversations(sample['conversations'])
return {'messages': messages, 'tools': tools, 'gt': sample['gt']}
if __name__ == "__main__":
pass
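    # Minimal smoke test (a sketch only; the tokenizer directory and JSONL path below are
    # placeholders -- point them at your local files before uncommenting):
    # from transformers import AutoTokenizer
    # tokenizer = AutoTokenizer.from_pretrained('./model')  # hypothetical tokenizer path
    # ds = SFTDataset('./dataset/sft_data.jsonl', tokenizer, max_length=1024)  # hypothetical data path
    # input_ids, labels = ds[0]
    # print(input_ids.shape, labels.shape, int((labels != -100).sum()))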