From 805744e60a130456fa9b0de8014a0d1703a2d571 Mon Sep 17 00:00:00 2001
From: jingyaogong
Date: Thu, 23 Oct 2025 19:08:42 +0800
Subject: [PATCH] [fix] loss-issues-430

---
 trainer/train_distill_reason.py | 2 +-
 trainer/train_distillation.py   | 2 +-
 trainer/train_dpo.py            | 2 +-
 trainer/train_full_sft.py       | 2 +-
 trainer/train_grpo.py           | 2 +-
 trainer/train_lora.py           | 2 +-
 trainer/train_ppo.py            | 4 ++--
 trainer/train_pretrain.py       | 2 +-
 trainer/train_spo.py            | 2 +-
 9 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/trainer/train_distill_reason.py b/trainer/train_distill_reason.py
index 9c1c9eb..525490e 100644
--- a/trainer/train_distill_reason.py
+++ b/trainer/train_distill_reason.py
@@ -79,7 +79,7 @@ def train_epoch(epoch, wandb):
         if step % args.log_interval == 0 or step == iter_per_epoch - 1:
             spend_time = time.time() - start_time
             Logger(
-                'Epoch:[{}/{}]({}/{}) loss:{:.3f} lr:{:.12f} epoch_Time:{}min:'.format(
+                'Epoch:[{}/{}]({}/{}) loss:{:.6f} lr:{:.12f} epoch_Time:{}min:'.format(
                     epoch + 1,
                     args.epochs,
                     step,
diff --git a/trainer/train_distillation.py b/trainer/train_distillation.py
index 582e36d..377e5f3 100644
--- a/trainer/train_distillation.py
+++ b/trainer/train_distillation.py
@@ -113,7 +113,7 @@ def train_epoch(epoch, wandb, alpha=0.0, temperature=1.0):
         if step % args.log_interval == 0 or step == iter_per_epoch - 1:
             spend_time = time.time() - start_time
             Logger(
-                'Epoch:[{}/{}]({}/{}) loss:{:.4f} lr:{:.12f} epoch_Time:{}min:'.format(
+                'Epoch:[{}/{}]({}/{}) loss:{:.6f} lr:{:.12f} epoch_Time:{}min:'.format(
                     epoch,
                     args.epochs - 1,
                     step,
diff --git a/trainer/train_dpo.py b/trainer/train_dpo.py
index cdb9d92..4b72afa 100644
--- a/trainer/train_dpo.py
+++ b/trainer/train_dpo.py
@@ -103,7 +103,7 @@ def train_epoch(epoch, wandb):
         if step % args.log_interval == 0 or step == iter_per_epoch - 1:
             spend_time = time.time() - start_time
             Logger(
-                'Epoch:[{}/{}]({}/{}) loss:{:.3f} lr:{:.12f} epoch_Time:{}min:'.format(
+                'Epoch:[{}/{}]({}/{}) loss:{:.6f} lr:{:.12f} epoch_Time:{}min:'.format(
                     epoch + 1,
                     args.epochs,
                     step,
diff --git a/trainer/train_full_sft.py b/trainer/train_full_sft.py
index bae0dff..7c601a4 100644
--- a/trainer/train_full_sft.py
+++ b/trainer/train_full_sft.py
@@ -66,7 +66,7 @@ def train_epoch(epoch, wandb):
         if step % args.log_interval == 0 or step == iter_per_epoch - 1:
             spend_time = time.time() - start_time
             Logger(
-                'Epoch:[{}/{}]({}/{}) loss:{:.3f} lr:{:.12f} epoch_Time:{}min:'.format(
+                'Epoch:[{}/{}]({}/{}) loss:{:.6f} lr:{:.12f} epoch_Time:{}min:'.format(
                     epoch + 1,
                     args.epochs,
                     step,
diff --git a/trainer/train_grpo.py b/trainer/train_grpo.py
index 8aeb349..7c33a94 100755
--- a/trainer/train_grpo.py
+++ b/trainer/train_grpo.py
@@ -169,7 +169,7 @@ def grpo_train_epoch(epoch, wandb):
 
             Logger(
                 f'Epoch: {epoch}, Step: {step + 1}/{iter_per_epoch}, '
-                f'Actor Loss: {policy_loss_val:.4f}, Reward: {avg_reward_val:.4f}, '
+                f'Actor Loss: {policy_loss_val:.6f}, Reward: {avg_reward_val:.6f}, '
                 f'Avg Response Len: {avg_len_val:.2f}, LR: {current_lr:.2e}')
 
             if wandb and (not ddp or dist.get_rank() == 0):
diff --git a/trainer/train_lora.py b/trainer/train_lora.py
index 4c73126..e2659f4 100644
--- a/trainer/train_lora.py
+++ b/trainer/train_lora.py
@@ -67,7 +67,7 @@ def train_epoch(epoch, wandb):
         if step % args.log_interval == 0 or step == iter_per_epoch - 1:
             spend_time = time.time() - start_time
             Logger(
-                'Epoch:[{}/{}]({}/{}) loss:{:.3f} lr:{:.12f} epoch_Time:{}min:'.format(
+                'Epoch:[{}/{}]({}/{}) loss:{:.6f} lr:{:.12f} epoch_Time:{}min:'.format(
                     epoch + 1,
                     args.epochs,
                     step,
diff --git a/trainer/train_ppo.py b/trainer/train_ppo.py
index f8a8d38..6ddbb64 100644
--- a/trainer/train_ppo.py
+++ b/trainer/train_ppo.py
@@ -193,8 +193,8 @@ def ppo_train_epoch(epoch: int, wandb_run, old_actor_model, ref_model, actor_sch
             })
 
             Logger(f"Epoch: {epoch}, Step: {step + 1}/{len(train_loader)}, "
-                   f"Actor Loss: {actor_loss_val:.4f}, Critic Loss: {critic_loss_val:.4f}, "
-                   f"Reward: {reward_val:.4f}, KL: {kl_val:.4f}, KL_ref: {kl_ref_val:.4f}, "
+                   f"Actor Loss: {actor_loss_val:.6f}, Critic Loss: {critic_loss_val:.6f}, "
+                   f"Reward: {reward_val:.6f}, KL: {kl_val:.6f}, KL_ref: {kl_ref_val:.6f}, "
                    f"Avg Response Len: {avg_len_val:.2f}, Actor LR: {actor_lr:.2e}, Critic LR: {critic_lr:.2e}")
 
             if (step + 1) % args.update_old_actor_freq == 0:
diff --git a/trainer/train_pretrain.py b/trainer/train_pretrain.py
index ab6f68f..13b83fc 100644
--- a/trainer/train_pretrain.py
+++ b/trainer/train_pretrain.py
@@ -66,7 +66,7 @@ def train_epoch(epoch, wandb):
         if step % args.log_interval == 0 or step == iter_per_epoch - 1:
             spend_time = time.time() - start_time
             Logger(
-                'Epoch:[{}/{}]({}/{}) loss:{:.3f} lr:{:.12f} epoch_Time:{}min:'.format(
+                'Epoch:[{}/{}]({}/{}) loss:{:.6f} lr:{:.12f} epoch_Time:{}min:'.format(
                     epoch + 1,
                     args.epochs,
                     step,
diff --git a/trainer/train_spo.py b/trainer/train_spo.py
index 7ea15bf..1b81791 100755
--- a/trainer/train_spo.py
+++ b/trainer/train_spo.py
@@ -216,7 +216,7 @@ def spo_train_epoch(epoch, wandb, value_tracker):
 
             Logger(
                 f'Epoch: {epoch}, Step: {step + 1}/{iter_per_epoch}, '
-                f'Actor Loss: {policy_loss_val:.4f}, Reward: {avg_reward_val:.4f}, '
+                f'Actor Loss: {policy_loss_val:.6f}, Reward: {avg_reward_val:.6f}, '
                 f'Baseline: {avg_baseline_val:.4f}, KL: {kl_val:.4f}, Rho: {rho:.4f}, Avg Response Len: {avg_len_val:.2f}, LR: {current_lr:.2e}')
 
             if wandb and (not ddp or dist.get_rank() == 0):
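
Note (illustration only, not part of the patch): every hunk above widens the console-log precision of loss and reward values from three or four decimal places to six. A minimal Python sketch of why this matters, with made-up loss values: at {:.3f}, two consecutive losses that differ by less than 1e-3 print identically, so slow late-stage improvement can look like a stalled loss, while {:.6f} still shows the movement.

    # Hypothetical consecutive loss values, chosen so they agree to 3 decimals.
    prev_loss, curr_loss = 2.302431, 2.302113

    # Old format: both render as 2.302, the loss appears frozen.
    print('loss:{:.3f} -> loss:{:.3f}'.format(prev_loss, curr_loss))
    # Patched format: the small improvement is visible.
    print('loss:{:.6f} -> loss:{:.6f}'.format(prev_loss, curr_loss))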