From 1713c24114b6ef802da59005c6e1b4e5773a8c39 Mon Sep 17 00:00:00 2001
From: jingyaogong
Date: Wed, 29 Oct 2025 10:36:28 +0800
Subject: [PATCH] [fix] model device

---
 trainer/train_distill_reason.py |  2 +-
 trainer/train_distillation.py   |  4 ++--
 trainer/train_dpo.py            |  4 ++--
 trainer/train_full_sft.py       |  2 +-
 trainer/train_grpo.py           | 22 +++++++--------------
 trainer/train_lora.py           |  2 +-
 trainer/train_ppo.py            | 35 ++++++++++++++-------------------
 trainer/train_pretrain.py       |  2 +-
 trainer/train_spo.py            | 22 +++++++--------------
 9 files changed, 37 insertions(+), 58 deletions(-)

diff --git a/trainer/train_distill_reason.py b/trainer/train_distill_reason.py
index 3c882cd..a14a3aa 100644
--- a/trainer/train_distill_reason.py
+++ b/trainer/train_distill_reason.py
@@ -140,7 +140,7 @@ if __name__ == "__main__":
         wandb.init(project=args.wandb_project, name=wandb_run_name, id=wandb_id, resume=resume)
 
     # ========== 5. Define model, data, optimizer ==========
-    model, tokenizer = init_model(lm_config, args.from_weight)
+    model, tokenizer = init_model(lm_config, args.from_weight, device=args.device)
     train_ds = SFTDataset(args.data_path, tokenizer, max_length=args.max_seq_len)
     train_sampler = DistributedSampler(train_ds) if dist.is_initialized() else None
     scaler = torch.cuda.amp.GradScaler(enabled=(args.dtype == 'float16'))
diff --git a/trainer/train_distillation.py b/trainer/train_distillation.py
index 848bf52..2ef0c17 100644
--- a/trainer/train_distillation.py
+++ b/trainer/train_distillation.py
@@ -184,9 +184,9 @@ if __name__ == "__main__":
         wandb.init(project=args.wandb_project, name=wandb_run_name, id=wandb_id, resume=resume)
 
     # ========== 5. Define student and teacher models ==========
-    model, tokenizer = init_model(lm_config_student, args.from_student_weight)
+    model, tokenizer = init_model(lm_config_student, args.from_student_weight, device=args.device)
     Logger(f'Total student model parameters: {sum(p.numel() for p in model.parameters()) / 1e6:.3f} M')
-    teacher_model, _ = init_model(lm_config_teacher, args.from_teacher_weight)
+    teacher_model, _ = init_model(lm_config_teacher, args.from_teacher_weight, device=args.device)
     teacher_model.eval()
     teacher_model.requires_grad_(False)
     Logger(f'Total teacher model parameters: {sum(p.numel() for p in teacher_model.parameters()) / 1e6:.3f} M')
diff --git a/trainer/train_dpo.py b/trainer/train_dpo.py
index fc2eb82..9dfe63b 100644
--- a/trainer/train_dpo.py
+++ b/trainer/train_dpo.py
@@ -166,10 +166,10 @@ if __name__ == "__main__":
         wandb.init(project=args.wandb_project, name=wandb_run_name, id=wandb_id, resume=resume)
 
     # ========== 5. Define model and reference model ==========
-    model, tokenizer = init_model(lm_config, args.from_weight)
+    model, tokenizer = init_model(lm_config, args.from_weight, device=args.device)
     Logger(f'Total policy model parameters: {sum(p.numel() for p in model.parameters()) / 1e6:.3f} M')
     # Initialize the reference model (ref_model is frozen)
-    ref_model, _ = init_model(lm_config, args.from_weight)
+    ref_model, _ = init_model(lm_config, args.from_weight, device=args.device)
     ref_model.eval()
     ref_model.requires_grad_(False)
     Logger(f'Total reference model parameters: {sum(p.numel() for p in ref_model.parameters()) / 1e6:.3f} M')
diff --git a/trainer/train_full_sft.py b/trainer/train_full_sft.py
index 6702159..9001dab 100644
--- a/trainer/train_full_sft.py
+++ b/trainer/train_full_sft.py
@@ -128,7 +128,7 @@ if __name__ == "__main__":
         wandb.init(project=args.wandb_project, name=wandb_run_name, id=wandb_id, resume=resume)
 
     # ========== 5. Define model, data, optimizer ==========
-    model, tokenizer = init_model(lm_config, args.from_weight)
+    model, tokenizer = init_model(lm_config, args.from_weight, device=args.device)
     train_ds = SFTDataset(args.data_path, tokenizer, max_length=args.max_seq_len)
     train_sampler = DistributedSampler(train_ds) if dist.is_initialized() else None
     scaler = torch.cuda.amp.GradScaler(enabled=(args.dtype == 'float16'))
diff --git a/trainer/train_grpo.py b/trainer/train_grpo.py
index 13727fe..afebc7c 100755
--- a/trainer/train_grpo.py
+++ b/trainer/train_grpo.py
@@ -19,7 +19,7 @@ from torch.optim.lr_scheduler import CosineAnnealingLR
 from transformers import AutoModel
 from model.model_minimind import MiniMindConfig, MiniMindForCausalLM
 from dataset.lm_dataset import RLAIFDataset
-from trainer.trainer_utils import Logger, is_main_process, lm_checkpoint, init_distributed_mode, setup_seed, SkipBatchSampler
+from trainer.trainer_utils import Logger, is_main_process, lm_checkpoint, init_distributed_mode, setup_seed, SkipBatchSampler, init_model
 
 warnings.filterwarnings('ignore')
 
@@ -240,25 +240,17 @@ if __name__ == "__main__":
         wandb.init(project=args.wandb_project, name=wandb_run_name, id=wandb_id, resume=resume)
 
     # ========== 5. Initialize model and data ==========
-    tokenizer = AutoTokenizer.from_pretrained('../model/')
-    moe_suffix = '_moe' if lm_config.use_moe else ''
     base_weight = "reason" if args.reasoning == 1 else "full_sft"
-    ckp = f'{args.save_dir}/{base_weight}_{lm_config.hidden_size}{moe_suffix}.pth'
-    state_dict = torch.load(ckp, map_location=args.device)
     # Policy model
-    model = MiniMindForCausalLM(lm_config)
-    model.load_state_dict(state_dict, strict=False)
-    model = model.to(args.device)
-    Logger(f'Total policy model parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad) / 1e6:.3f} M')
+    model, tokenizer = init_model(lm_config, base_weight, device=args.device)
     # Reference model
-    ref_model = MiniMindForCausalLM(lm_config)
-    ref_model.load_state_dict(state_dict, strict=False)
-    ref_model.eval().requires_grad_(False)
-    ref_model = ref_model.to(args.device)
+    ref_model, _ = init_model(lm_config, base_weight, device=args.device)
+    ref_model = ref_model.eval().requires_grad_(False)
     # Reward model
     reward_model = AutoModel.from_pretrained(
-        args.reward_model_path, device_map="cuda", torch_dtype=torch.float16, trust_remote_code=True
-    ).to(args.device).eval().requires_grad_(False)
+        args.reward_model_path, torch_dtype=torch.float16, trust_remote_code=True
+    )
+    reward_model = reward_model.to(args.device).eval().requires_grad_(False)
     reward_tokenizer = AutoTokenizer.from_pretrained(args.reward_model_path, trust_remote_code=True)
     # Data and optimizer
     train_ds = RLAIFDataset(args.data_path, tokenizer, max_length=lm_config.max_seq_len)
diff --git a/trainer/train_lora.py b/trainer/train_lora.py
index 1473d60..a88a1af 100644
--- a/trainer/train_lora.py
+++ b/trainer/train_lora.py
@@ -123,7 +123,7 @@ if __name__ == "__main__":
         wandb.init(project=args.wandb_project, name=wandb_run_name, id=wandb_id, resume=resume)
 
     # ========== 5. Define model, apply LoRA, freeze non-LoRA parameters ==========
-    model, tokenizer = init_model(lm_config, args.from_weight)
+    model, tokenizer = init_model(lm_config, args.from_weight, device=args.device)
     apply_lora(model)
 
     # Parameter statistics
diff --git a/trainer/train_ppo.py b/trainer/train_ppo.py
index 51b0813..5b49aa3 100644
--- a/trainer/train_ppo.py
+++ b/trainer/train_ppo.py
@@ -20,7 +20,7 @@ from torch.optim.lr_scheduler import CosineAnnealingLR
 from transformers import AutoModel
 from model.model_minimind import MiniMindConfig, MiniMindForCausalLM
 from dataset.lm_dataset import RLAIFDataset
-from trainer.trainer_utils import Logger, is_main_process, lm_checkpoint, init_distributed_mode, setup_seed, SkipBatchSampler
+from trainer.trainer_utils import Logger, is_main_process, lm_checkpoint, init_distributed_mode, setup_seed, SkipBatchSampler, init_model
 
 warnings.filterwarnings('ignore')
 
@@ -290,33 +290,28 @@ if __name__ == "__main__":
         wandb.init(project=args.wandb_project, name=wandb_run_name, id=wandb_id, resume=resume)
 
     # ========== 5. Initialize model and data ==========
-    tokenizer = AutoTokenizer.from_pretrained('../model/', padding_side='left')
-    moe_suffix = '_moe' if lm_config.use_moe else ''
     base_weight = "reason" if args.reasoning == 1 else "full_sft"
+    # Actor model
+    actor_model, tokenizer = init_model(lm_config, base_weight, device=args.device)
+    tokenizer.padding_side = 'left'  # PPO requires left-side padding
+    # Old Actor model
+    old_actor_model, _ = init_model(lm_config, base_weight, device=args.device)
+    old_actor_model = old_actor_model.eval().requires_grad_(False)
+    # Reference model
+    ref_model, _ = init_model(lm_config, base_weight, device=args.device)
+    ref_model = ref_model.eval().requires_grad_(False)
+    # Critic model
+    moe_suffix = '_moe' if lm_config.use_moe else ''
     ckp = f'{args.save_dir}/{base_weight}_{lm_config.hidden_size}{moe_suffix}.pth'
     state_dict = torch.load(ckp, map_location=args.device)
-    # Actor model
-    actor_model = MiniMindForCausalLM(lm_config)
-    actor_model.load_state_dict(state_dict, strict=False)
-    actor_model = actor_model.to(args.device)
-    Logger(f'Total Actor model parameters: {sum(p.numel() for p in actor_model.parameters() if p.requires_grad) / 1e6:.3f} M')
-    # Old Actor model
-    old_actor_model = MiniMindForCausalLM(lm_config)
-    old_actor_model.load_state_dict(state_dict, strict=False)
-    old_actor_model = old_actor_model.eval().requires_grad_(False).to(args.device)
-    # Reference model
-    ref_model = MiniMindForCausalLM(lm_config)
-    ref_model.load_state_dict(state_dict, strict=False)
-    ref_model = ref_model.eval().requires_grad_(False).to(args.device)
-    # Critic model
     critic_model = CriticModel(lm_config)
     critic_model.load_state_dict(state_dict, strict=False)
     critic_model = critic_model.to(args.device)
-    Logger(f'Total Critic model parameters: {sum(p.numel() for p in critic_model.parameters() if p.requires_grad) / 1e6:.3f} M')
     # Reward model
     reward_model = AutoModel.from_pretrained(
-        args.reward_model_path, device_map="cuda", torch_dtype=torch.float32, trust_remote_code=True
-    ).to(args.device).eval().requires_grad_(False)
+        args.reward_model_path, torch_dtype=torch.float16, trust_remote_code=True
+    )
+    reward_model = reward_model.to(args.device).eval().requires_grad_(False)
     reward_tokenizer = AutoTokenizer.from_pretrained(args.reward_model_path, trust_remote_code=True)
     # Data and optimizer
     train_ds = RLAIFDataset(args.data_path, tokenizer, max_length=(args.max_seq_len + args.max_gen_len))
diff --git a/trainer/train_pretrain.py b/trainer/train_pretrain.py
index 87c79a7..83705dd 100644
--- a/trainer/train_pretrain.py
+++ b/trainer/train_pretrain.py
@@ -127,7 +127,7 @@ if __name__ == "__main__":
         wandb.init(project=args.wandb_project, name=wandb_run_name, id=wandb_id, resume=resume)
 
     # ========== 5. Define model, data, optimizer ==========
-    model, tokenizer = init_model(lm_config, args.from_weight)
+    model, tokenizer = init_model(lm_config, args.from_weight, device=args.device)
     train_ds = PretrainDataset(args.data_path, tokenizer, max_length=args.max_seq_len)
     train_sampler = DistributedSampler(train_ds) if dist.is_initialized() else None
     scaler = torch.cuda.amp.GradScaler(enabled=(args.dtype == 'float16'))
diff --git a/trainer/train_spo.py b/trainer/train_spo.py
index 64c4e9f..34a7451 100755
--- a/trainer/train_spo.py
+++ b/trainer/train_spo.py
@@ -19,7 +19,7 @@ from torch.optim.lr_scheduler import CosineAnnealingLR
 from transformers import AutoModel
 from model.model_minimind import MiniMindConfig, MiniMindForCausalLM
 from dataset.lm_dataset import RLAIFDataset
-from trainer.trainer_utils import Logger, is_main_process, lm_checkpoint, init_distributed_mode, setup_seed, SkipBatchSampler
+from trainer.trainer_utils import Logger, is_main_process, lm_checkpoint, init_distributed_mode, setup_seed, SkipBatchSampler, init_model
 
 warnings.filterwarnings('ignore')
 
@@ -287,25 +287,17 @@ if __name__ == "__main__":
         wandb.init(project=args.wandb_project, name=wandb_run_name, id=wandb_id, resume=resume)
 
     # ========== 5. Initialize models (Policy, Ref, Reward), Value Tracker, and data ==========
-    tokenizer = AutoTokenizer.from_pretrained('../model/')
-    moe_suffix = '_moe' if lm_config.use_moe else ''
     base_weight = "reason" if args.reasoning == 1 else "full_sft"
-    ckp = f'{args.save_dir}/{base_weight}_{lm_config.hidden_size}{moe_suffix}.pth'
-    state_dict = torch.load(ckp, map_location=args.device)
     # Policy model
-    model = MiniMindForCausalLM(lm_config)
-    model.load_state_dict(state_dict, strict=False)
-    model = model.to(args.device)
-    Logger(f'Total policy model parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad) / 1e6:.3f} M')
+    model, tokenizer = init_model(lm_config, base_weight, device=args.device)
     # Reference model
-    ref_model = MiniMindForCausalLM(lm_config)
-    ref_model.load_state_dict(state_dict, strict=False)
-    ref_model.eval().requires_grad_(False)
-    ref_model = ref_model.to(args.device)
+    ref_model, _ = init_model(lm_config, base_weight, device=args.device)
+    ref_model = ref_model.eval().requires_grad_(False)
     # Reward model
     reward_model = AutoModel.from_pretrained(
-        args.reward_model_path, device_map="cuda", torch_dtype=torch.float16, trust_remote_code=True
-    ).to(args.device).eval().requires_grad_(False)
+        args.reward_model_path, torch_dtype=torch.float16, trust_remote_code=True
+    )
+    reward_model = reward_model.to(args.device).eval().requires_grad_(False)
     reward_tokenizer = AutoTokenizer.from_pretrained(args.reward_model_path, trust_remote_code=True)
     # Value Tracker
     value_tracker = AutoAdaptiveValueTracker(rho_mode='kl', rho_const=0.9, D_half=0.06, clip_lower=0.5, clip_upper=0.96)
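
Context for this patch: every call site now funnels through trainer_utils.init_model and passes the rank-local device explicitly, instead of deserializing a checkpoint and calling .to() by hand in each trainer. The helper itself is not touched by this patch; the sketch below is a minimal reconstruction of what it plausibly looks like, inferred from the inlined loading code removed above. The save_dir default and the print-based logging are assumptions, not the repository's actual implementation.

    # Minimal sketch of the shared helper, reconstructed from the removed
    # call-site code; save_dir default and logging are assumptions.
    import torch
    from transformers import AutoTokenizer
    from model.model_minimind import MiniMindForCausalLM

    def init_model(lm_config, from_weight, device='cuda', save_dir='../out'):
        """Build the tokenizer and a MiniMind model, load weights, move to `device`."""
        tokenizer = AutoTokenizer.from_pretrained('../model/')
        model = MiniMindForCausalLM(lm_config)
        # Checkpoint naming mirrors the removed call-site code:
        # {save_dir}/{from_weight}_{hidden_size}[_moe].pth
        moe_suffix = '_moe' if lm_config.use_moe else ''
        ckp = f'{save_dir}/{from_weight}_{lm_config.hidden_size}{moe_suffix}.pth'
        # map_location keeps each DDP rank from deserializing onto cuda:0.
        state_dict = torch.load(ckp, map_location=device)
        model.load_state_dict(state_dict, strict=False)
        print(f'Total model parameters: {sum(p.numel() for p in model.parameters()) / 1e6:.3f} M')
        return model.to(device), tokenizer

    # Hypothetical usage, mirroring the updated trainers: each rank passes its
    # own device so the policy and the frozen reference land on the same GPU.
    #   model, tokenizer = init_model(lm_config, 'full_sft', device=f'cuda:{local_rank}')
    #   ref_model, _ = init_model(lm_config, 'full_sft', device=f'cuda:{local_rank}')
    #   ref_model = ref_model.eval().requires_grad_(False)

The reward-model change follows the same device logic: device_map="cuda" pins loading to the default CUDA device, which can conflict with the per-rank args.device used under distributed training, so the model is now loaded normally and then moved with .to(args.device). In train_ppo.py this also switches the reward model from float32 to float16, matching the GRPO and SPO trainers.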