From 1713c24114b6ef802da59005c6e1b4e5773a8c39 Mon Sep 17 00:00:00 2001
From: jingyaogong
Date: Wed, 29 Oct 2025 10:36:28 +0800
Subject: [PATCH] [fix] model device

---
 trainer/train_distill_reason.py |  2 +-
 trainer/train_distillation.py   |  4 ++--
 trainer/train_dpo.py            |  4 ++--
 trainer/train_full_sft.py       |  2 +-
 trainer/train_grpo.py           | 22 +++++++--------------
 trainer/train_lora.py           |  2 +-
 trainer/train_ppo.py            | 35 ++++++++++++++-------------------
 trainer/train_pretrain.py       |  2 +-
 trainer/train_spo.py            | 22 +++++++--------------
 9 files changed, 37 insertions(+), 58 deletions(-)

diff --git a/trainer/train_distill_reason.py b/trainer/train_distill_reason.py
index 3c882cd..a14a3aa 100644
--- a/trainer/train_distill_reason.py
+++ b/trainer/train_distill_reason.py
@@ -140,7 +140,7 @@ if __name__ == "__main__":
         wandb.init(project=args.wandb_project, name=wandb_run_name, id=wandb_id, resume=resume)
 
     # ========== 5. Define model, data, optimizer ==========
-    model, tokenizer = init_model(lm_config, args.from_weight)
+    model, tokenizer = init_model(lm_config, args.from_weight, device=args.device)
     train_ds = SFTDataset(args.data_path, tokenizer, max_length=args.max_seq_len)
     train_sampler = DistributedSampler(train_ds) if dist.is_initialized() else None
     scaler = torch.cuda.amp.GradScaler(enabled=(args.dtype == 'float16'))
diff --git a/trainer/train_distillation.py b/trainer/train_distillation.py
index 848bf52..2ef0c17 100644
--- a/trainer/train_distillation.py
+++ b/trainer/train_distillation.py
@@ -184,9 +184,9 @@ if __name__ == "__main__":
         wandb.init(project=args.wandb_project, name=wandb_run_name, id=wandb_id, resume=resume)
 
     # ========== 5. Define student and teacher models ==========
-    model, tokenizer = init_model(lm_config_student, args.from_student_weight)
+    model, tokenizer = init_model(lm_config_student, args.from_student_weight, device=args.device)
     Logger(f'Total student model parameters: {sum(p.numel() for p in model.parameters()) / 1e6:.3f} M')
-    teacher_model, _ = init_model(lm_config_teacher, args.from_teacher_weight)
+    teacher_model, _ = init_model(lm_config_teacher, args.from_teacher_weight, device=args.device)
     teacher_model.eval()
     teacher_model.requires_grad_(False)
     Logger(f'Total teacher model parameters: {sum(p.numel() for p in teacher_model.parameters()) / 1e6:.3f} M')
diff --git a/trainer/train_dpo.py b/trainer/train_dpo.py
index fc2eb82..9dfe63b 100644
--- a/trainer/train_dpo.py
+++ b/trainer/train_dpo.py
@@ -166,10 +166,10 @@ if __name__ == "__main__":
         wandb.init(project=args.wandb_project, name=wandb_run_name, id=wandb_id, resume=resume)
 
     # ========== 5. Define model and reference model ==========
-    model, tokenizer = init_model(lm_config, args.from_weight)
+    model, tokenizer = init_model(lm_config, args.from_weight, device=args.device)
     Logger(f'Total policy model parameters: {sum(p.numel() for p in model.parameters()) / 1e6:.3f} M')
     # Initialize the reference model (ref_model is frozen)
-    ref_model, _ = init_model(lm_config, args.from_weight)
+    ref_model, _ = init_model(lm_config, args.from_weight, device=args.device)
     ref_model.eval()
     ref_model.requires_grad_(False)
     Logger(f'Total reference model parameters: {sum(p.numel() for p in ref_model.parameters()) / 1e6:.3f} M')
diff --git a/trainer/train_full_sft.py b/trainer/train_full_sft.py
index 6702159..9001dab 100644
--- a/trainer/train_full_sft.py
+++ b/trainer/train_full_sft.py
@@ -128,7 +128,7 @@ if __name__ == "__main__":
         wandb.init(project=args.wandb_project, name=wandb_run_name, id=wandb_id, resume=resume)
 
     # ========== 5. Define model, data, optimizer ==========
-    model, tokenizer = init_model(lm_config, args.from_weight)
+    model, tokenizer = init_model(lm_config, args.from_weight, device=args.device)
     train_ds = SFTDataset(args.data_path, tokenizer, max_length=args.max_seq_len)
     train_sampler = DistributedSampler(train_ds) if dist.is_initialized() else None
     scaler = torch.cuda.amp.GradScaler(enabled=(args.dtype == 'float16'))
diff --git a/trainer/train_grpo.py b/trainer/train_grpo.py
index 13727fe..afebc7c 100755
--- a/trainer/train_grpo.py
+++ b/trainer/train_grpo.py
@@ -19,7 +19,7 @@ from torch.optim.lr_scheduler import CosineAnnealingLR
 from transformers import AutoModel
 from model.model_minimind import MiniMindConfig, MiniMindForCausalLM
 from dataset.lm_dataset import RLAIFDataset
-from trainer.trainer_utils import Logger, is_main_process, lm_checkpoint, init_distributed_mode, setup_seed, SkipBatchSampler
+from trainer.trainer_utils import Logger, is_main_process, lm_checkpoint, init_distributed_mode, setup_seed, SkipBatchSampler, init_model
 
 warnings.filterwarnings('ignore')
 
@@ -240,25 +240,17 @@ if __name__ == "__main__":
         wandb.init(project=args.wandb_project, name=wandb_run_name, id=wandb_id, resume=resume)
 
     # ========== 5. Initialize model and data ==========
-    tokenizer = AutoTokenizer.from_pretrained('../model/')
-    moe_suffix = '_moe' if lm_config.use_moe else ''
     base_weight = "reason" if args.reasoning == 1 else "full_sft"
-    ckp = f'{args.save_dir}/{base_weight}_{lm_config.hidden_size}{moe_suffix}.pth'
-    state_dict = torch.load(ckp, map_location=args.device)
     # Policy model
-    model = MiniMindForCausalLM(lm_config)
-    model.load_state_dict(state_dict, strict=False)
-    model = model.to(args.device)
-    Logger(f'Total policy model parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad) / 1e6:.3f} M')
+    model, tokenizer = init_model(lm_config, base_weight, device=args.device)
     # Reference model
-    ref_model = MiniMindForCausalLM(lm_config)
-    ref_model.load_state_dict(state_dict, strict=False)
-    ref_model.eval().requires_grad_(False)
-    ref_model = ref_model.to(args.device)
+    ref_model, _ = init_model(lm_config, base_weight, device=args.device)
+    ref_model = ref_model.eval().requires_grad_(False)
     # Reward model
     reward_model = AutoModel.from_pretrained(
-        args.reward_model_path, device_map="cuda", torch_dtype=torch.float16, trust_remote_code=True
-    ).to(args.device).eval().requires_grad_(False)
+        args.reward_model_path, torch_dtype=torch.float16, trust_remote_code=True
+    )
+    reward_model = reward_model.to(args.device).eval().requires_grad_(False)
     reward_tokenizer = AutoTokenizer.from_pretrained(args.reward_model_path, trust_remote_code=True)
     # Data and optimizer
     train_ds = RLAIFDataset(args.data_path, tokenizer, max_length=lm_config.max_seq_len)
diff --git a/trainer/train_lora.py b/trainer/train_lora.py
index 1473d60..a88a1af 100644
--- a/trainer/train_lora.py
+++ b/trainer/train_lora.py
@@ -123,7 +123,7 @@ if __name__ == "__main__":
         wandb.init(project=args.wandb_project, name=wandb_run_name, id=wandb_id, resume=resume)
 
     # ========== 5. Define model, apply LoRA, freeze non-LoRA parameters ==========
-    model, tokenizer = init_model(lm_config, args.from_weight)
+    model, tokenizer = init_model(lm_config, args.from_weight, device=args.device)
     apply_lora(model)
 
     # Parameter statistics
diff --git a/trainer/train_ppo.py b/trainer/train_ppo.py
index 51b0813..5b49aa3 100644
--- a/trainer/train_ppo.py
+++ b/trainer/train_ppo.py
@@ -20,7 +20,7 @@ from torch.optim.lr_scheduler import CosineAnnealingLR
 from transformers import AutoModel
 from model.model_minimind import MiniMindConfig, MiniMindForCausalLM
 from dataset.lm_dataset import RLAIFDataset
-from trainer.trainer_utils import Logger, is_main_process, lm_checkpoint, init_distributed_mode, setup_seed, SkipBatchSampler
+from trainer.trainer_utils import Logger, is_main_process, lm_checkpoint, init_distributed_mode, setup_seed, SkipBatchSampler, init_model
 
 warnings.filterwarnings('ignore')
 
@@ -290,33 +290,28 @@ if __name__ == "__main__":
         wandb.init(project=args.wandb_project, name=wandb_run_name, id=wandb_id, resume=resume)
 
     # ========== 5. Initialize model and data ==========
-    tokenizer = AutoTokenizer.from_pretrained('../model/', padding_side='left')
-    moe_suffix = '_moe' if lm_config.use_moe else ''
     base_weight = "reason" if args.reasoning == 1 else "full_sft"
+    # Actor model
+    actor_model, tokenizer = init_model(lm_config, base_weight, device=args.device)
+    tokenizer.padding_side = 'left'  # PPO requires left-side padding
+    # Old Actor model
+    old_actor_model, _ = init_model(lm_config, base_weight, device=args.device)
+    old_actor_model = old_actor_model.eval().requires_grad_(False)
+    # Reference model
+    ref_model, _ = init_model(lm_config, base_weight, device=args.device)
+    ref_model = ref_model.eval().requires_grad_(False)
+    # Critic model
+    moe_suffix = '_moe' if lm_config.use_moe else ''
     ckp = f'{args.save_dir}/{base_weight}_{lm_config.hidden_size}{moe_suffix}.pth'
     state_dict = torch.load(ckp, map_location=args.device)
-    # Actor model
-    actor_model = MiniMindForCausalLM(lm_config)
-    actor_model.load_state_dict(state_dict, strict=False)
-    actor_model = actor_model.to(args.device)
-    Logger(f'Total Actor model parameters: {sum(p.numel() for p in actor_model.parameters() if p.requires_grad) / 1e6:.3f} M')
-    # Old Actor model
-    old_actor_model = MiniMindForCausalLM(lm_config)
-    old_actor_model.load_state_dict(state_dict, strict=False)
-    old_actor_model = old_actor_model.eval().requires_grad_(False).to(args.device)
-    # Reference model
-    ref_model = MiniMindForCausalLM(lm_config)
-    ref_model.load_state_dict(state_dict, strict=False)
-    ref_model = ref_model.eval().requires_grad_(False).to(args.device)
-    # Critic model
     critic_model = CriticModel(lm_config)
     critic_model.load_state_dict(state_dict, strict=False)
     critic_model = critic_model.to(args.device)
-    Logger(f'Total Critic model parameters: {sum(p.numel() for p in critic_model.parameters() if p.requires_grad) / 1e6:.3f} M')
     # Reward model
     reward_model = AutoModel.from_pretrained(
-        args.reward_model_path, device_map="cuda", torch_dtype=torch.float32, trust_remote_code=True
-    ).to(args.device).eval().requires_grad_(False)
+        args.reward_model_path, torch_dtype=torch.float16, trust_remote_code=True
+    )
+    reward_model = reward_model.to(args.device).eval().requires_grad_(False)
     reward_tokenizer = AutoTokenizer.from_pretrained(args.reward_model_path, trust_remote_code=True)
     # Data and optimizer
     train_ds = RLAIFDataset(args.data_path, tokenizer, max_length=(args.max_seq_len + args.max_gen_len))
diff --git a/trainer/train_pretrain.py b/trainer/train_pretrain.py
index 87c79a7..83705dd 100644
--- a/trainer/train_pretrain.py
+++ b/trainer/train_pretrain.py
@@ -127,7 +127,7 @@ if __name__ == "__main__":
         wandb.init(project=args.wandb_project, name=wandb_run_name, id=wandb_id, resume=resume)
 
     # ========== 5. Define model, data, optimizer ==========
-    model, tokenizer = init_model(lm_config, args.from_weight)
+    model, tokenizer = init_model(lm_config, args.from_weight, device=args.device)
     train_ds = PretrainDataset(args.data_path, tokenizer, max_length=args.max_seq_len)
     train_sampler = DistributedSampler(train_ds) if dist.is_initialized() else None
     scaler = torch.cuda.amp.GradScaler(enabled=(args.dtype == 'float16'))
diff --git a/trainer/train_spo.py b/trainer/train_spo.py
index 64c4e9f..34a7451 100755
--- a/trainer/train_spo.py
+++ b/trainer/train_spo.py
@@ -19,7 +19,7 @@ from torch.optim.lr_scheduler import CosineAnnealingLR
 from transformers import AutoModel
 from model.model_minimind import MiniMindConfig, MiniMindForCausalLM
 from dataset.lm_dataset import RLAIFDataset
-from trainer.trainer_utils import Logger, is_main_process, lm_checkpoint, init_distributed_mode, setup_seed, SkipBatchSampler
+from trainer.trainer_utils import Logger, is_main_process, lm_checkpoint, init_distributed_mode, setup_seed, SkipBatchSampler, init_model
 
 warnings.filterwarnings('ignore')
 
@@ -287,25 +287,17 @@ if __name__ == "__main__":
         wandb.init(project=args.wandb_project, name=wandb_run_name, id=wandb_id, resume=resume)
 
     # ========== 5. Initialize models (Policy, Ref, Reward), Value Tracker, and data ==========
-    tokenizer = AutoTokenizer.from_pretrained('../model/')
-    moe_suffix = '_moe' if lm_config.use_moe else ''
     base_weight = "reason" if args.reasoning == 1 else "full_sft"
-    ckp = f'{args.save_dir}/{base_weight}_{lm_config.hidden_size}{moe_suffix}.pth'
-    state_dict = torch.load(ckp, map_location=args.device)
     # Policy model
-    model = MiniMindForCausalLM(lm_config)
-    model.load_state_dict(state_dict, strict=False)
-    model = model.to(args.device)
-    Logger(f'Total policy model parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad) / 1e6:.3f} M')
+    model, tokenizer = init_model(lm_config, base_weight, device=args.device)
     # Reference model
-    ref_model = MiniMindForCausalLM(lm_config)
-    ref_model.load_state_dict(state_dict, strict=False)
-    ref_model.eval().requires_grad_(False)
-    ref_model = ref_model.to(args.device)
+    ref_model, _ = init_model(lm_config, base_weight, device=args.device)
+    ref_model = ref_model.eval().requires_grad_(False)
     # Reward model
     reward_model = AutoModel.from_pretrained(
-        args.reward_model_path, device_map="cuda", torch_dtype=torch.float16, trust_remote_code=True
-    ).to(args.device).eval().requires_grad_(False)
+        args.reward_model_path, torch_dtype=torch.float16, trust_remote_code=True
+    )
+    reward_model = reward_model.to(args.device).eval().requires_grad_(False)
     reward_tokenizer = AutoTokenizer.from_pretrained(args.reward_model_path, trust_remote_code=True)
     # Value Tracker
     value_tracker = AutoAdaptiveValueTracker(rho_mode='kl', rho_const=0.9, D_half=0.06, clip_lower=0.5, clip_upper=0.96)
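
Context for this patch: every call site now funnels through trainer_utils.init_model and passes the rank-local device explicitly, instead of deserializing a checkpoint and calling .to() by hand in each trainer. The helper itself is not touched by this patch; the sketch below is a minimal reconstruction of what it plausibly looks like, inferred from the inlined loading code removed above. The save_dir default and the print-based logging are assumptions, not the repository's actual implementation.

    # Minimal sketch of the shared helper, reconstructed from the removed
    # call-site code; save_dir default and logging are assumptions.
    import torch
    from transformers import AutoTokenizer
    from model.model_minimind import MiniMindForCausalLM

    def init_model(lm_config, from_weight, device='cuda', save_dir='../out'):
        """Build the tokenizer and a MiniMind model, load weights, move to `device`."""
        tokenizer = AutoTokenizer.from_pretrained('../model/')
        model = MiniMindForCausalLM(lm_config)
        # Checkpoint naming mirrors the removed call-site code:
        # {save_dir}/{from_weight}_{hidden_size}[_moe].pth
        moe_suffix = '_moe' if lm_config.use_moe else ''
        ckp = f'{save_dir}/{from_weight}_{lm_config.hidden_size}{moe_suffix}.pth'
        # map_location keeps each DDP rank from deserializing onto cuda:0.
        state_dict = torch.load(ckp, map_location=device)
        model.load_state_dict(state_dict, strict=False)
        print(f'Total model parameters: {sum(p.numel() for p in model.parameters()) / 1e6:.3f} M')
        return model.to(device), tokenizer

    # Hypothetical usage, mirroring the updated trainers: each rank passes its
    # own device so the policy and the frozen reference land on the same GPU.
    #   model, tokenizer = init_model(lm_config, 'full_sft', device=f'cuda:{local_rank}')
    #   ref_model, _ = init_model(lm_config, 'full_sft', device=f'cuda:{local_rank}')
    #   ref_model = ref_model.eval().requires_grad_(False)

The reward-model change follows the same device logic: device_map="cuda" pins loading to the default CUDA device, which can conflict with the per-rank args.device used under distributed training, so the model is now loaded normally and then moved with .to(args.device). In train_ppo.py this also switches the reward model from float32 to float16, matching the GRPO and SPO trainers.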