mirror of
https://github.com/jingyaogong/minimind.git
synced 2026-06-06 00:04:50 +00:00
[fix] ddp exit hang
This commit is contained in:
@@ -487,4 +487,6 @@ if __name__ == "__main__":
|
||||
else:
|
||||
rl_train_epoch(epoch, loader, len(loader), rollout_engine, ref_model, reward_model, 0, wandb, use_sglang = (args.rollout_engine == "sglang"))
|
||||
|
||||
if dist.is_initialized(): dist.destroy_process_group()
|
||||
if dist.is_initialized():
|
||||
dist.barrier()
|
||||
dist.destroy_process_group()
|
||||
|
||||
@@ -243,4 +243,6 @@ if __name__ == "__main__":
|
||||
train_epoch(epoch, loader, len(loader), teacher_model, lm_config_student, 0, wandb, args.alpha, args.temperature)
|
||||
|
||||
# ========== 9. 清理分布进程 ==========
|
||||
if dist.is_initialized(): dist.destroy_process_group()
|
||||
if dist.is_initialized():
|
||||
dist.barrier()
|
||||
dist.destroy_process_group()
|
||||
@@ -223,4 +223,6 @@ if __name__ == "__main__":
|
||||
train_epoch(epoch, loader, len(loader), ref_model, lm_config, 0, wandb, args.beta)
|
||||
|
||||
# ========== 9. 清理分布进程 ==========
|
||||
if dist.is_initialized(): dist.destroy_process_group()
|
||||
if dist.is_initialized():
|
||||
dist.barrier()
|
||||
dist.destroy_process_group()
|
||||
@@ -168,4 +168,6 @@ if __name__ == "__main__":
|
||||
train_epoch(epoch, loader, len(loader), 0, wandb)
|
||||
|
||||
# ========== 9. 清理分布进程 ==========
|
||||
if dist.is_initialized(): dist.destroy_process_group()
|
||||
if dist.is_initialized():
|
||||
dist.barrier()
|
||||
dist.destroy_process_group()
|
||||
@@ -329,4 +329,6 @@ if __name__ == "__main__":
|
||||
grpo_train_epoch(epoch, loader, len(loader), rollout_engine, ref_model, reward_model, 0, wandb, use_sglang = (args.rollout_engine == "sglang"))
|
||||
|
||||
# ========== 9. 清理分布进程 ==========
|
||||
if dist.is_initialized(): dist.destroy_process_group()
|
||||
if dist.is_initialized():
|
||||
dist.barrier()
|
||||
dist.destroy_process_group()
|
||||
@@ -181,4 +181,6 @@ if __name__ == "__main__":
|
||||
train_epoch(epoch, loader, len(loader), lora_params, 0, wandb)
|
||||
|
||||
# ========== 10. 清理分布进程 ==========
|
||||
if dist.is_initialized(): dist.destroy_process_group()
|
||||
if dist.is_initialized():
|
||||
dist.barrier()
|
||||
dist.destroy_process_group()
|
||||
@@ -432,4 +432,6 @@ if __name__ == "__main__":
|
||||
ppo_train_epoch(epoch, loader, len(loader), rollout_engine, ref_model, actor_scheduler, critic_scheduler, reward_model, 0, wandb, use_sglang = (args.rollout_engine == "sglang"))
|
||||
|
||||
# ========== 9. 清理分布进程 ==========
|
||||
if dist.is_initialized(): dist.destroy_process_group()
|
||||
if dist.is_initialized():
|
||||
dist.barrier()
|
||||
dist.destroy_process_group()
|
||||
@@ -167,4 +167,6 @@ if __name__ == "__main__":
|
||||
train_epoch(epoch, loader, len(loader), 0, wandb)
|
||||
|
||||
# ========== 9. 清理分布进程 ==========
|
||||
if dist.is_initialized(): dist.destroy_process_group()
|
||||
if dist.is_initialized():
|
||||
dist.barrier()
|
||||
dist.destroy_process_group()
|
||||
Reference in New Issue
Block a user