This commit is contained in:
yuyu5333 2026-03-25 13:08:38 +08:00 committed by GitHub
commit fe7fc29435
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
22 changed files with 4695 additions and 1 deletions

6
.gitignore vendored
View File

@ -1,4 +1,8 @@
__pycache__
model/__pycache__
out
website/
docs-minimind/
docs-minimind/
logfile
dataset
checkpoints

1
minimind_sdk/__init__.py Normal file
View File

@ -0,0 +1 @@
from .client import MinimindClient

65
minimind_sdk/client.py Normal file
View File

@ -0,0 +1,65 @@
import json
import urllib.error
import urllib.parse
import urllib.request
class MinimindClient:
    """Small HTTP client for the MiniMind training web service.

    Every call is a single JSON request issued through ``urllib``. Transport
    and HTTP-status failures are re-raised as ``RuntimeError`` with the
    original exception attached as ``__cause__``.
    """

    def __init__(self, base_url, api_key=None, timeout=10):
        """Store connection settings.

        Args:
            base_url: Service root URL; trailing slashes are stripped.
            api_key: Optional bearer token. ``None`` is stored as ``''``.
            timeout: Timeout passed to ``urlopen`` for each request.
        """
        self.base_url = base_url.rstrip('/')
        self.api_key = api_key or ''
        self.timeout = timeout

    @staticmethod
    def _segment(value):
        """Percent-encode *value* for use inside a URL path.

        Spaces and non-ASCII characters would otherwise make ``urlopen``
        reject the URL. ``/`` is left untouched so that callers passing
        slash-separated values keep their previous behaviour.
        NOTE(review): values that are already percent-encoded will be
        encoded a second time - confirm no caller pre-encodes.
        """
        return urllib.parse.quote(str(value))

    def _request(self, method, path, body=None, expect_text=False):
        """Send one request and return the decoded response.

        Args:
            method: HTTP verb.
            path: Path appended verbatim to ``base_url``.
            body: Optional object serialised as the JSON request body.
            expect_text: When true return the body as text instead of
                parsing it as JSON.

        Raises:
            RuntimeError: On ``HTTPError`` (message includes status code and
                response body) or ``URLError``.
        """
        url = f"{self.base_url}{path}"
        headers = {
            'Content-Type': 'application/json',
            'Cache-Control': 'no-cache'
        }
        if self.api_key:
            headers['Authorization'] = f"Bearer {self.api_key}"
        data = None
        if body is not None:
            data = json.dumps(body).encode('utf-8')
        req = urllib.request.Request(url, data=data, headers=headers, method=method)
        try:
            with urllib.request.urlopen(req, timeout=self.timeout) as resp:
                raw = resp.read()
                if expect_text:
                    return raw.decode('utf-8', errors='replace')
                return json.loads(raw.decode('utf-8'))
        except urllib.error.HTTPError as e:
            # HTTPError must be handled first: it is a subclass of URLError.
            msg = e.read().decode('utf-8', errors='replace')
            raise RuntimeError(f"HTTP {e.code}: {msg}") from e
        except urllib.error.URLError as e:
            raise RuntimeError(str(e)) from e

    def register(self, name, email):
        """POST to ``/api/register`` and adopt the returned ``api_key`` if any."""
        res = self._request('POST', '/api/register', {'name': name, 'email': email})
        self.api_key = res.get('api_key', self.api_key)
        return res

    def start_training(self, train_type, **params):
        """POST ``train_type`` plus any extra keyword params to ``/train``."""
        payload = {'train_type': train_type}
        payload.update(params or {})
        res = self._request('POST', '/train', payload)
        return res

    def get_processes(self):
        """GET ``/processes`` and return the parsed JSON."""
        return self._request('GET', '/processes', None)

    def get_logs(self, process_id):
        """GET ``/logs/<process_id>`` and return the body as text."""
        return self._request('GET', f"/logs/{self._segment(process_id)}", None, expect_text=True)

    def stop(self, process_id):
        """POST to ``/stop/<process_id>``."""
        return self._request('POST', f"/stop/{self._segment(process_id)}", None)

    def delete(self, process_id):
        """POST to ``/delete/<process_id>``."""
        return self._request('POST', f"/delete/{self._segment(process_id)}", None)

    def get_logfiles(self):
        """GET ``/logfiles`` and return the parsed JSON."""
        return self._request('GET', '/logfiles', None)

    def get_logfile_content(self, filename):
        """GET ``/logfile-content/<filename>`` and return the body as text."""
        return self._request('GET', f"/logfile-content/{self._segment(filename)}", None, expect_text=True)

    def delete_logfile(self, filename):
        """Send DELETE to ``/delete-logfile/<filename>``."""
        return self._request('DELETE', f"/delete-logfile/{self._segment(filename)}", None)

81
trainer_web/dispatcher.py Normal file
View File

@ -0,0 +1,81 @@
import sys
import os
# Training script launched for each supported train_type.
_TRAIN_SCRIPTS = {
    'pretrain': '../trainer/train_pretrain.py',
    'sft': '../trainer/train_full_sft.py',
    'lora': '../trainer/train_lora.py',
    'dpo': '../trainer/train_dpo.py',
    'ppo': '../trainer/train_ppo.py',
    'grpo': '../trainer/train_grpo.py',
    'spo': '../trainer/train_spo.py',
}

# Options forwarded whenever the key is present, even if the value is empty.
_PRESENCE_OPTIONS = {
    'pretrain': ('save_weight',),
    'sft': ('save_weight',),
    'lora': ('lora_name',),
}

# Options forwarded only when the key is present with a truthy value.
_TRUTHY_OPTIONS = {
    'dpo': ('beta', 'accumulation_steps', 'grad_clip'),
    'ppo': ('clip_epsilon', 'vf_coef', 'kl_coef', 'reasoning',
            'update_old_actor_freq', 'reward_model_path'),
    'grpo': ('beta', 'num_generations', 'reasoning', 'reward_model_path'),
    'spo': ('beta', 'reasoning', 'reward_model_path'),
}

# Keys never forwarded by the generic pass-through loop.
_RESERVED_KEYS = frozenset([
    'train_type', 'save_weight', 'lora_name', 'train_monitor', 'beta',
    'accumulation_steps', 'grad_clip', 'gpu_num', 'clip_epsilon', 'vf_coef',
    'kl_coef', 'reasoning', 'update_old_actor_freq', 'reward_model_path',
    'num_generations',
])

# Train types for which 'from_weight' is not forwarded.
_NO_FROM_WEIGHT = frozenset(['ppo', 'grpo', 'spo'])


def build_command(train_type, params, gpu_num, use_torchrun):
    """Build the argv list that launches a training script.

    Args:
        train_type: One of the keys of ``_TRAIN_SCRIPTS``.
        params: Mapping of option name to value. Type-specific options are
            emitted first, then every remaining non-reserved key in mapping
            order as ``--<key> <value>``.
        gpu_num: Value passed to ``torchrun --nproc_per_node``.
        use_torchrun: Launch through ``torchrun`` when true, otherwise through
            the current Python interpreter.

    Returns:
        A list of strings, or ``None`` when ``train_type`` is not recognised.
    """
    script_path = _TRAIN_SCRIPTS.get(train_type)
    if script_path is None:
        return None

    if use_torchrun:
        cmd = ['torchrun', '--nproc_per_node', str(gpu_num), script_path]
    else:
        cmd = [sys.executable, script_path]

    # Every value goes through str(): an argv list handed to a process
    # launcher must contain only strings, and params may carry numbers.
    for key in _PRESENCE_OPTIONS.get(train_type, ()):
        if key in params:
            cmd.extend([f'--{key}', str(params[key])])
    for key in _TRUTHY_OPTIONS.get(train_type, ()):
        if params.get(key):
            cmd.extend([f'--{key}', str(params[key])])

    skip_from_weight = train_type in _NO_FROM_WEIGHT
    for key, value in params.items():
        if key in _RESERVED_KEYS or (skip_from_weight and key == 'from_weight'):
            continue
        cmd.extend([f'--{key}', str(value)])

    # Both monitors enable --use_wandb; only wandb also gets a project name.
    monitor = params.get('train_monitor')
    if monitor in ('wandb', 'swanlab'):
        cmd.append('--use_wandb')
        if monitor == 'wandb':
            cmd.extend(['--wandb_project', 'minimind_training'])
    return cmd

91
trainer_web/start_web_ui.sh Executable file
View File

@ -0,0 +1,91 @@
#!/bin/bash
# Launch train_web_ui.py in the background, record its PID, discover the port
# it reports in its log, and verify it answers /healthz. Exits 0 once the
# health check passes and 1 on any failure.
# Resolve the directory containing this script (macOS-compatible).
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
cd "$SCRIPT_DIR"
# Refuse to start if a previous instance is still alive; drop a stale PID file.
if [ -f "train_web_ui.pid" ]; then
pid=$(cat "train_web_ui.pid")
if ps -p "$pid" > /dev/null 2>&1; then
echo "Web UI 服务已经在运行 (PID: $pid)"
exit 1
else
echo "删除旧的PID文件"
rm "train_web_ui.pid"
fi
fi
# Create the log directory.
LOG_DIR="../logfile"
mkdir -p "$LOG_DIR"
# Build a timestamped log file name.
TIMESTAMP=$(date +"%Y%m%d_%H%M%S")
LOG_FILE="$LOG_DIR/web_ui_$TIMESTAMP.log"
echo "启动 MiniMind Web UI 服务..."
echo "日志文件: $LOG_FILE"
# Dependency pre-check: the embedded Python exits 1 if flask or psutil cannot be imported.
python - <<'PY'
import sys
missing = []
for m in ('flask', 'psutil'):
try:
__import__(m)
except Exception as e:
missing.append(f"{m}: {e.__class__.__name__} {e}")
if missing:
print("依赖缺失或不可用:\n" + "\n".join(missing))
sys.exit(1)
PY
if [ $? -ne 0 ]; then
echo "启动失败:请先安装缺失依赖,例如 'pip install flask psutil'"
exit 1
fi
# Start the service with nohup; stdout and stderr both go to the log file.
nohup python -u train_web_ui.py > "$LOG_FILE" 2>&1 &
# Save the PID of the background job.
echo $! > "train_web_ui.pid"
# Poll the log for the actual port: 20 attempts, 0.5 s apart (about 10 s).
PORT=""
for i in {1..20}; do
# Extract an address of the form http://0.0.0.0:12345, then keep only the port.
PORT=$(grep -Eo 'http://0\.0\.0\.0:[0-9]+' "$LOG_FILE" | tail -n1 | awk -F: '{print $NF}')
if [ -n "$PORT" ]; then
break
fi
sleep 0.5
done
# NOTE: no fallback port is applied here. If no port was found in the log, the
# health check below is skipped and the script proceeds to the failure path.
# Health check: wait up to about 10 s for /healthz to report "status": "ok".
if [ -n "$PORT" ]; then
for i in {1..20}; do
if curl -s "http://localhost:$PORT/healthz" | grep -Eq '"status"[[:space:]]*:[[:space:]]*"ok"'; then
echo "服务已启动! PID: $(cat "train_web_ui.pid")"
echo "访问地址: http://localhost:$PORT"
echo "停止命令: kill $(cat "train_web_ui.pid") or bash trainer_web/stop_web_ui.sh"
exit 0
fi
sleep 0.5
done
fi
# Startup failure: print the log tail, stop the process if it is still alive,
# remove the PID file and exit non-zero.
echo "服务启动失败,请查看日志"
tail -n 50 "$LOG_FILE" || true
if [ -f "train_web_ui.pid" ]; then
pid=$(cat "train_web_ui.pid")
if ps -p "$pid" > /dev/null 2>&1; then
kill "$pid" >/dev/null 2>&1 || true
fi
rm -f "train_web_ui.pid"
fi
exit 1

File diff suppressed because it is too large Load Diff

Binary file not shown.

After

Width:  |  Height:  |  Size: 615 KiB

View File

@ -0,0 +1,362 @@
import { openTab as _openTab } from './ui/tabs.js';
import { initTrainForm } from './train/form.js';
import { startProcessPolling, stopProcessPolling, loadProcesses } from './processes/list.js';
import { loadLogFiles } from './logfiles/list.js';
import { refreshLog } from './processes/logs.js';
// Tab lifecycle callbacks handed to the tab switcher.
const hooks = {
  // Entering the process tab: load once right away, then begin polling.
  onEnterProcesses() {
    loadProcesses().then(() => {
      startProcessPolling();
    });
  },
  // Leaving the process tab: stop polling.
  onLeaveProcesses() {
    stopProcessPolling();
  },
  // Entering the log-file tab: reload the list.
  onEnterLogfiles() {
    loadLogFiles();
  },
};

// Global entry point for tab switching; forwards the hooks above.
window.openTab = function (evt, tabName) {
  return _openTab(evt, tabName, hooks);
};

// Folder picker: always opens the server-side browser and never attempts
// local file system access.
window.selectFolder = function (inputId) {
  openRemoteFileBrowser(inputId);
};

// Remote file browser state (supports picking files and folders).
let currentFileBrowserTarget = null; // id of the input that receives the choice
let currentBrowsePath = './';        // path most recently requested
let selectedFilePath = null;         // pending selection, if any
let currentSelectionMode = 'auto';   // 'file', 'folder', or 'auto'
/**
 * Open the remote file browser modal for the given input element id.
 *
 * The id decides the selection mode: 'data_path' picks files, 'save_dir' and
 * any id containing 'reward_model_path' pick folders, everything else is
 * 'auto'. Does nothing further if the modal element is missing.
 */
function openRemoteFileBrowser(inputId) {
  console.log('openRemoteFileBrowser called with:', inputId);
  currentFileBrowserTarget = inputId;

  // Derive the selection mode from the input id.
  const isFileTarget = inputId === 'data_path';
  const isFolderTarget =
    !isFileTarget && (inputId === 'save_dir' || inputId.includes('reward_model_path'));
  if (isFileTarget) {
    currentSelectionMode = 'file';
    console.log('Mode set to: FILE selection');
  } else if (isFolderTarget) {
    currentSelectionMode = 'folder';
    console.log('Mode set to: FOLDER selection');
  } else {
    currentSelectionMode = 'auto';
    console.log('Mode set to: AUTO selection');
  }

  const modal = document.getElementById('file-browser-modal');
  if (!modal) {
    console.error('Modal element not found!');
    return;
  }
  modal.classList.remove('hidden');
  console.log('Modal opened successfully');

  // Reset any previous selection.
  selectedFilePath = null;
  const pathField = document.getElementById('selected-path');
  if (pathField) {
    pathField.value = '';
    console.log('Selected path input cleared');
  }

  // Load the shortcut buttons and the initial listing.
  loadQuickPaths();
  browsePath('./');
}
/**
 * Hide the file browser modal and reset all browser state.
 *
 * This runs on every Escape keypress, so it must tolerate pages where the
 * modal element does not exist; state is reset either way.
 */
function closeFileBrowser() {
  const modal = document.getElementById('file-browser-modal');
  if (modal) {
    modal.classList.add('hidden');
  }
  currentFileBrowserTarget = null;
  currentBrowsePath = './';
  selectedFilePath = null;
  currentSelectionMode = 'auto';
}
/**
 * Copy the pending selection into the target input and close the browser.
 * Alerts the user when nothing is selected or the target input is missing.
 */
function confirmFileSelection() {
  console.log('confirmFileSelection called');
  console.log('selectedFilePath:', selectedFilePath);
  console.log('currentFileBrowserTarget:', currentFileBrowserTarget);

  if (!(selectedFilePath && currentFileBrowserTarget)) {
    console.log('Missing selection or target');
    alert('请先选择文件或文件夹');
    return;
  }

  const targetElement = document.getElementById(currentFileBrowserTarget);
  console.log('targetElement:', targetElement);
  if (!targetElement) {
    console.error('Target element not found:', currentFileBrowserTarget);
    alert('错误:无法找到目标输入框');
    return;
  }

  targetElement.value = selectedFilePath;
  console.log('Value set successfully');
  closeFileBrowser();
}
/**
 * Browse to the parent of the current directory.
 *
 * Prefers the parent path stored on window by the last listing; otherwise
 * derives it by cutting the current path at its last '/'.
 */
function navigateToParent() {
  const backendParent = window.currentParentPath;
  if (backendParent) {
    browsePath(backendParent);
    return;
  }
  if (!currentBrowsePath || currentBrowsePath === './') {
    return;
  }
  // Fallback: compute the parent from the current path string.
  let derived = './';
  if (currentBrowsePath.includes('/')) {
    derived = currentBrowsePath.substring(0, currentBrowsePath.lastIndexOf('/'));
  }
  browsePath(derived || './');
}
/**
 * Mark the directory currently being browsed as the pending selection.
 * The modal stays open so the user can keep browsing.
 */
function selectCurrentDirectory() {
  const here = currentBrowsePath;
  selectedFilePath = here;
  document.getElementById('selected-path').value = here;
}
/**
 * Fetch /api/quick-paths and render one shortcut button per entry.
 * Failures are logged as warnings and otherwise ignored.
 */
async function loadQuickPaths() {
  try {
    const reply = await fetch('/api/quick-paths');
    const data = await reply.json();
    const container = document.getElementById('quick-paths');
    container.innerHTML = '';

    if (!(data.paths && data.paths.length > 0)) {
      return;
    }
    data.paths.forEach((entry) => {
      const button = document.createElement('button');
      button.className = 'quick-path-btn';
      button.textContent = entry.name;
      button.onclick = () => browsePath(entry.path);
      button.title = entry.path;
      container.appendChild(button);
    });
  } catch (error) {
    console.warn('加载快捷路径失败:', error);
  }
}
/**
 * Request the listing for `path` from /api/browse and render it.
 *
 * The browse state is updated and the pending selection cleared before the
 * request is sent. An `error` field in the reply or any thrown exception is
 * reported with an alert.
 */
async function browsePath(path) {
  console.log('browsePath called with:', path);
  try {
    currentBrowsePath = path;
    selectedFilePath = null; // drop the previous selection
    document.getElementById('selected-path').value = ''; // and its display

    // Refresh the help text for the active selection mode.
    updateHelpText();

    const query = encodeURIComponent(path);
    const reply = await fetch(`/api/browse?path=${query}`);
    const listing = await reply.json();
    if (listing.error) {
      alert(`浏览失败: ${listing.error}`);
      return;
    }

    renderFileList(listing);
    console.log('File list rendered successfully');
  } catch (error) {
    console.error('浏览路径失败:', error);
    alert('浏览路径失败,请检查网络连接');
  }
}
/**
 * Render a directory listing into the file list element.
 *
 * Reads `relative_path` / `current_path`, `parent` and `items` from `data`.
 * Directories are listed before files; files are omitted in folder mode.
 */
function renderFileList(data) {
  const fileList = document.getElementById('file-list');
  fileList.innerHTML = '';

  // Update the path display and parent pointer BEFORE the empty-directory
  // early return. Otherwise entering an empty directory leaves the previous
  // directory's path on screen and makes "up" jump from the wrong place.
  document.getElementById('current-path').textContent = data.relative_path || data.current_path;
  window.currentParentPath = data.parent;

  if (!data.items || data.items.length === 0) {
    fileList.innerHTML = '<div style="padding: 2rem; text-align: center; color: var(--text-secondary);">此目录为空</div>';
    return;
  }

  const directories = data.items.filter((item) => item.type === 'directory');
  const files = data.items.filter((item) => item.type === 'file');

  directories.forEach((item) => {
    fileList.appendChild(createFileItem(item, '📁'));
  });

  // Files are shown only in file or auto mode.
  if (currentSelectionMode !== 'folder') {
    files.forEach((item) => {
      fileList.appendChild(createFileItem(item, '📄'));
    });
  }
}
/**
 * Build the clickable row element for one listing entry.
 *
 * @param item  Listing entry; `name`, `type` and `size` are read here.
 * @param icon  Text shown in the icon column.
 * @returns the row <div>, wired to selectFileItem on click.
 */
function createFileItem(item, icon) {
  // File names come from the server's file system and may contain markup
  // characters, so they are escaped before being placed into innerHTML.
  const escapeHtml = (value) =>
    String(value).replace(/[&<>"']/g, (ch) => ({
      '&': '&amp;',
      '<': '&lt;',
      '>': '&gt;',
      '"': '&quot;',
      "'": '&#39;',
    }[ch]));

  const div = document.createElement('div');
  div.className = 'file-item';

  if (currentSelectionMode === 'file' && item.type === 'directory') {
    // File mode: folders are for navigation only.
    div.classList.add('navigable');
  } else if (currentSelectionMode === 'folder' && item.type === 'file') {
    // Folder mode: files cannot be chosen.
    div.classList.add('disabled');
  }

  div.onclick = (event) => selectFileItem(item, event);
  div.innerHTML = `
<span class="file-icon">${icon}</span>
<span class="file-name">${escapeHtml(item.name)}</span>
<span class="file-info">${item.type === 'file' ? formatFileSize(item.size) : '文件夹'}</span>
`;
  return div;
}
/**
 * Handle a click on a listing row.
 *
 * Directories are entered in file and auto mode and selected in folder mode.
 * Files are selected in file and auto mode and ignored in folder mode.
 * Clicks on rows carrying the 'disabled' class are ignored.
 */
function selectFileItem(item, event) {
  console.log('selectFileItem called with:', item);
  console.log('currentSelectionMode:', currentSelectionMode);
  console.log('event:', event);

  const row = event && event.currentTarget;
  if (row && row.classList.contains('disabled')) {
    console.log('Clicked disabled item, ignoring');
    return;
  }

  // Record the item as the pending selection and move the highlight to it.
  const markSelected = () => {
    selectedFilePath = item.path;
    document.getElementById('selected-path').value = item.path;
    document.querySelectorAll('.file-item').forEach((node) => node.classList.remove('selected'));
    if (row) {
      row.classList.add('selected');
    }
  };

  const inFolderMode = currentSelectionMode === 'folder';

  if (item.type === 'directory') {
    if (inFolderMode) {
      console.log('Folder mode: selecting directory');
      markSelected();
      console.log('Directory selected:', selectedFilePath);
      return;
    }
    if (currentSelectionMode === 'file') {
      console.log('File mode: navigating into directory');
    } else {
      console.log('Auto mode: navigating into directory');
    }
    browsePath(item.path);
    return;
  }

  if (inFolderMode) {
    console.log('File clicked in folder mode, ignoring');
    return;
  }
  console.log('Selecting file:', item.path);
  markSelected();
  console.log('File selected:', selectedFilePath);
}
/**
 * Format a byte count using 1024-based units with at most one decimal.
 *
 * Values of 1 TB and above previously indexed past the unit table and
 * produced "… undefined"; the table now extends to PB and larger values are
 * expressed in PB. Non-finite, zero or negative input yields '0 B'.
 */
function formatFileSize(bytes) {
  const units = ['B', 'KB', 'MB', 'GB', 'TB', 'PB'];
  let value = Number(bytes);
  if (!Number.isFinite(value) || value <= 0) return '0 B';

  // Repeated division by 1024 is exact in binary floating point, which
  // avoids the rounding surprises of a log-based unit index.
  let unitIndex = 0;
  while (value >= 1024 && unitIndex < units.length - 1) {
    value /= 1024;
    unitIndex += 1;
  }
  return parseFloat(value.toFixed(1)) + ' ' + units[unitIndex];
}
/**
 * Set the help line and modal title to match the current selection mode.
 * Does nothing when the help element is absent.
 */
function updateHelpText() {
  const helpText = document.querySelector('.file-browser-help');
  const modalTitle = document.getElementById('modal-title');
  if (!helpText) return;

  const fileCopy = {
    text: '💡 请选择文件:点击文件选择,点击文件夹进入目录,使用📍选择当前目录',
    title: '选择文件',
  };
  const folderCopy = {
    text: '💡 请选择文件夹:点击文件夹选择,点击文件无效,使用📍选择当前目录',
    title: '选择文件夹',
  };
  const autoCopy = {
    text: '💡 点击文件夹进入目录,点击文件选择文件,使用📍选择当前目录',
    title: '选择文件或文件夹',
  };

  let copy = autoCopy;
  if (currentSelectionMode === 'file') {
    copy = fileCopy;
  } else if (currentSelectionMode === 'folder') {
    copy = folderCopy;
  }

  helpText.textContent = copy.text;
  if (modalTitle) {
    modalTitle.textContent = copy.title;
  }
}
// Close the file browser when Escape is pressed anywhere in the document.
document.addEventListener('keydown', (event) => {
  if (event.key !== 'Escape') return;
  closeFileBrowser();
});

// Close the file browser when the modal backdrop itself is clicked.
document.addEventListener('DOMContentLoaded', () => {
  const modal = document.getElementById('file-browser-modal');
  if (!modal) return;
  modal.addEventListener('click', (event) => {
    if (event.target === modal) {
      closeFileBrowser();
    }
  });
});
// Expose the file browser and process management functions on window so
// they can be reached from outside this module.
Object.assign(window, {
  // Re-assigns the handler already installed on window earlier in this file.
  selectFolder: window.selectFolder,
  openRemoteFileBrowser,
  closeFileBrowser,
  confirmFileSelection,
  navigateToParent,
  selectCurrentDirectory,
  // Refresh the process list now, then restart the polling timer so the
  // next tick is a full interval away.
  refreshProcesses: () =>
    loadProcesses().then(() => {
      stopProcessPolling();
      startProcessPolling();
    }),
  refreshLogs: loadLogFiles,
  refreshLog,
});

window.addEventListener('load', () => {
  initTrainForm();
  // Polling is not started here; it begins when the process tab is entered.
  // The initial process list is still loaded once.
  loadProcesses();
});

View File

@ -0,0 +1,194 @@
import { getLogFiles, getLogFileContent, deleteLogFile as apiDeleteLogFile } from '../services/apiClient.js';
import { el } from '../utils/dom.js';
import { showNotification } from '../ui/notify.js';
import { showConfirmDialog } from '../ui/dialog.js';
/**
 * Fetch the log file list and render it grouped by training type.
 *
 * Files are sorted newest first by `modified_time`. The training type is
 * derived from a marker substring in the file name and is also written to
 * each entry as `train_type`. Returns the promise from getLogFiles().
 */
export function loadLogFiles() {
  // Checked in order; the first marker contained in the name wins.
  const markers = [
    ['train_pretrain_', 'pretrain'],
    ['train_sft_', 'sft'],
    ['train_lora_', 'lora'],
    ['train_dpo_', 'dpo'],
    ['train_ppo_', 'ppo'],
    ['train_grpo_', 'grpo'],
    ['train_spo_', 'spo'],
  ];
  const order = ['pretrain', 'sft', 'lora', 'dpo', 'ppo', 'grpo', 'spo', '未知'];

  return getLogFiles().then((data) => {
    const list = document.getElementById('logfiles-list');
    list.innerHTML = '';
    if (data.length === 0) {
      list.innerHTML = '<p>暂无日志文件</p>';
      return;
    }

    data.sort((a, b) => new Date(b.modified_time) - new Date(a.modified_time));

    const groups = {};
    for (const file of data) {
      const hit = markers.find(([marker]) => file.filename.includes(marker));
      const type = hit ? hit[1] : '自定义训练';
      file.train_type = type;
      if (!groups[type]) groups[type] = [];
      groups[type].push(file);
    }

    // Known types first in fixed order, then any remaining groups.
    const known = order.filter((t) => groups[t]);
    const others = Object.keys(groups).filter((t) => !order.includes(t));
    for (const t of [...known, ...others]) {
      list.appendChild(createTypeGroupWithToggle(t, groups[t]));
    }
  });
}
/**
 * Build a collapsible group (header and content) for one training type.
 * The group starts expanded; both the header and its button toggle it.
 */
function createTypeGroupWithToggle(trainType, files) {
  const header = el('div', { class: 'process-type-header' });
  header.dataset.expanded = 'true';

  const heading = el('h3', { class: 'process-type-title', text: getTrainTypeDisplayName(trainType) });
  const button = el('button', { class: 'toggle-btn' });
  button.innerHTML = '▼';
  button.onclick = (e) => {
    // Stop the click from also reaching the header's own handler.
    e.stopPropagation();
    toggleGroup(header);
  };
  header.appendChild(heading);
  header.appendChild(button);
  header.onclick = () => toggleGroup(header);

  const body = el('div', { class: 'process-type-content' });
  for (const file of files) {
    addLogFileItemToGroup(body, file);
  }

  const group = el('div', { class: 'process-type-group' });
  group.appendChild(header);
  group.appendChild(body);
  return group;
}
/**
 * Collapse or expand the content element that follows `header`.
 *
 * Expanding measures the content, sets max-height from 0 to the measured
 * height, and after 300 ms lifts the height limit again.
 */
function toggleGroup(header) {
  const expanded = header.dataset.expanded === 'true';
  const content = header.nextElementSibling;
  const toggle = header.querySelector('.toggle-btn');

  if (expanded) {
    header.dataset.expanded = 'false';
    content.style.maxHeight = '0';
    content.style.overflow = 'hidden';
    toggle.innerHTML = '▶';
    return;
  }

  content.style.overflow = 'hidden';
  content.style.maxHeight = 'none';
  const h = content.scrollHeight;
  content.style.maxHeight = '0';
  content.offsetHeight; // read a layout property between the two max-height writes
  header.dataset.expanded = 'true';
  content.style.maxHeight = h + 'px';
  setTimeout(() => {
    // If the group was collapsed again within the delay, leave it collapsed;
    // without this guard the stale timer would re-open it.
    if (header.dataset.expanded !== 'true') return;
    content.style.maxHeight = 'none';
    content.style.overflow = 'visible';
  }, 300);
  toggle.innerHTML = '▼';
}
/**
 * Map a training type key to its display label.
 * Keys without a label are returned unchanged.
 */
function getTrainTypeDisplayName(trainType) {
  const labels = {
    pretrain: '预训练 (Pretrain)',
    sft: '全参数监督微调 (SFT - Full)',
    lora: 'LoRA监督微调 (SFT - Lora)',
    dpo: '直接偏好优化 (RL - DPO)',
    ppo: 'PPO',
    grpo: 'GRPO',
    spo: 'SPO',
  };
  const label = labels[trainType];
  return label ? label : trainType;
}
/**
 * Append one log file row (name, timestamp, view and delete buttons, hidden
 * log container) to `parent` and wire its buttons.
 */
function addLogFileItemToGroup(parent, logfile) {
  // File names and timestamps are escaped before being placed in markup.
  const escapeHtml = (value) =>
    String(value).replace(/[&<>"']/g, (ch) => ({
      '&': '&amp;',
      '<': '&lt;',
      '>': '&gt;',
      '"': '&quot;',
      "'": '&#39;',
    }[ch]));

  // Must match the id derivation used by viewLogFile. Otherwise names with
  // characters outside [a-zA-Z0-9_.-] produce an id the viewer cannot find.
  const safeId = logfile.filename.replace(/[^a-zA-Z0-9_.-]/g, '_').replace(/\./g, '-');
  const name = escapeHtml(logfile.filename);

  const item = el('div', { class: 'process-item' });
  item.innerHTML = `
<div class="process-info">
<div><strong>${name}</strong></div>
<div>
<span class="process-status status-completed">已保存</span>
<span style="margin-left: 10px; color: #999; font-size: 0.9em;">${escapeHtml(logfile.modified_time)}</span>
</div>
</div>
<div>
<button class="btn-logs" data-view="${name}">查看日志</button>
<button class="btn-delete" data-del="${name}">删除</button>
</div>
<div id="log-content-${safeId}" class="logs-container hidden"></div>
`;
  parent.appendChild(item);
  bindItemButtons(item, logfile);
}
/**
 * Attach click handlers to the view and delete buttons inside `item`.
 * A button that is not present is skipped.
 */
function bindItemButtons(item, logfile) {
  const { filename } = logfile;

  const viewButton = item.querySelector('[data-view]');
  if (viewButton) {
    viewButton.addEventListener('click', () => viewLogFile(filename, viewButton));
  }

  const deleteButton = item.querySelector('[data-del]');
  if (deleteButton) {
    deleteButton.addEventListener('click', () => deleteLogFile(filename, deleteButton));
  }
}
/**
 * Ask for confirmation, then delete a log file through the API.
 *
 * On success the row is removed (and its group too if it becomes empty).
 * On failure an error notification is shown and the button is restored.
 */
function deleteLogFile(filename, button) {
  const onConfirmed = () => {
    const row = button.closest('.process-item');
    const body = row.closest('.process-type-content');
    const section = body.closest('.process-type-group');
    const originalLabel = button.textContent;

    button.textContent = '删除中...';
    button.disabled = true;

    apiDeleteLogFile(filename)
      .then((data) => {
        if (!data.success) {
          throw new Error(data.message || '删除失败');
        }
        row.remove();
        if (body.children.length === 0) {
          section.remove();
        } else {
          // Re-measure the content of an expanded group after the removal.
          const header = body.previousElementSibling;
          if (header && header.dataset.expanded === 'true') {
            body.style.maxHeight = 'none';
            const measured = body.scrollHeight;
            body.style.maxHeight = measured + 'px';
          }
        }
        showNotification(`日志文件 "${filename}" 已成功删除`);
      })
      .catch((e) => {
        showNotification(`删除失败: ${e.message}`, 'error');
        button.textContent = originalLabel;
        button.disabled = false;
      });
  };

  showConfirmDialog(`确定要删除日志文件 "${filename}" 吗?此操作无法恢复。`, onConfirmed);
}
/**
 * Toggle the inline log viewer of one log file row.
 *
 * Showing the viewer expands the enclosing group if it is collapsed, loads
 * the file content through the API and re-measures the group height.
 */
function viewLogFile(filename, button) {
  const safe = filename.replace(/[^a-zA-Z0-9_.-]/g, '_').replace(/\./g, '-');
  const item = button.closest('.process-item');

  // Look up by id first. If the id was generated with a different
  // sanitisation, fall back to the row's own log container rather than
  // dereferencing null.
  const container =
    item.querySelector(`#log-content-${safe}`) || item.querySelector('.logs-container');
  if (!container) {
    console.error('Log container not found for file:', filename);
    return;
  }

  const content = item.closest('.process-type-content');
  const header = content ? content.previousElementSibling : null;
  if (content && header && header.dataset.expanded !== 'true') toggleGroup(header);

  if (!container.classList.contains('hidden')) {
    container.classList.add('hidden');
    updateContentHeight(content, header);
    return;
  }

  container.classList.remove('hidden');
  container.textContent = '加载中...';
  getLogFileContent(filename)
    .then((logs) => {
      container.textContent = logs;
      container.scrollTop = 0;
      updateContentHeight(content, header);
    })
    .catch((e) => {
      container.textContent = `获取日志失败: ${e.message}`;
      updateContentHeight(content, header);
    });
}
/**
 * Re-measure an expanded group's content and update its max-height.
 *
 * Does nothing unless both elements exist and the header is marked expanded.
 * When the height changed (or was unlimited) the new pixel height is applied
 * and the limit is lifted again after 300 ms if the group is still expanded.
 */
function updateContentHeight(content, header) {
  if (!(content && header && header.dataset.expanded === 'true')) {
    return;
  }

  const previous = content.style.maxHeight;
  content.style.maxHeight = 'none';
  const measured = content.scrollHeight;

  const unchanged = previous !== 'none' && parseInt(previous) === measured;
  if (unchanged) {
    content.style.maxHeight = previous;
    return;
  }

  content.style.maxHeight = measured + 'px';
  setTimeout(() => {
    if (header.dataset.expanded === 'true') content.style.maxHeight = 'none';
  }, 300);
}

View File

@ -0,0 +1,634 @@
import { getProcesses, stopProcess as apiStop, deleteProcess as apiDelete } from '../services/apiClient.js';
import { showNotification } from '../ui/notify.js';
import { showConfirmDialog } from '../ui/dialog.js';
import { el, clearChildren } from '../utils/dom.js';
import { showLogs, refreshLog, clearLogTimerFor } from './logs.js';
/**
 * Derive a human-readable "time remaining" string.
 *
 * First tries to read a time from `logText` using several English and
 * Chinese patterns (captures are read as hours, minutes, seconds). If none
 * yields a non-zero time, falls back to the number of epochs left, and
 * finally to a placeholder.
 *
 * @param current  Current epoch number.
 * @param total    Total epoch count.
 * @param logText  Log text to search.
 */
function calculateRemainingTime(current, total, logText) {
  const timePatterns = [
    /remaining[\s:=]\s*(\d+)[\s:]?(\d+)?[\s:]?(\d+)?/i, // remaining: 1:30:45 or remaining: 90
    /ETA[\s:=]\s*(\d+):(\d+):(\d+)/i, // ETA: 1:30:45
    /预计剩余[\s:=]\s*(\d+)[\s小时]*[\s:]?(\d+)?[\s分钟]*/i, // 预计剩余: 1小时30分钟
    /剩余时间[\s:=]\s*(\d+)[\s小时]*[\s:]?(\d+)?[\s分钟]*/i, // 剩余时间: 1小时30分钟
    /time left[\s:=]\s*(\d+)[\s:]?(\d+)?[\s:]?(\d+)?/i, // time left: 1:30:45
    /还需[\s:=]\s*(\d+)[\s小时]*[\s:]?(\d+)?[\s分钟]*/i // 还需: 1小时30分钟
  ];

  for (const pattern of timePatterns) {
    const match = logText.match(pattern);
    if (!match) continue;

    const hours = parseInt(match[1], 10) || 0;
    const minutes = parseInt(match[2], 10) || 0;
    const seconds = parseInt(match[3], 10) || 0;
    if (hours > 0 || minutes > 0 || seconds > 0) {
      const parts = [];
      if (hours > 0) parts.push(`${hours}小时`);
      if (minutes > 0) parts.push(`${minutes}分钟`);
      // Seconds are shown only when there are no hours or minutes. They now
      // carry a unit like the other parts; previously a bare number was
      // returned.
      if (seconds > 0 && hours === 0 && minutes === 0) parts.push(`${seconds}秒`);
      return parts.join('');
    }
  }

  // No usable time in the log: report the remaining epoch count instead.
  if (current > 0 && current < total) {
    const remainingEpochs = total - current;
    return `${remainingEpochs}个epoch`;
  }
  return '计算中...';
}
/**
 * Compute the progress summary for one process entry.
 *
 * Order of precedence:
 *   1. a process that is not running gets the default (empty) progress;
 *   2. a `progress` object on the entry is mapped field by field;
 *   3. otherwise the last 2000 characters of `logs` are parsed for epoch,
 *      step, loss and learning-rate values.
 * Log parsing produces a result only when an epoch total greater than zero
 * is found; otherwise the default progress is returned.
 */
function calculateProgress(process) {
  const defaultProgress = {
    percentage: 0,
    current: 0,
    total: 0,
    remaining: '计算中...',
    loss: null,
    epoch: null,
    lr: null
  };

  if (!process.running) return defaultProgress;

  // Structured progress supplied with the process data.
  if (process.progress) {
    return {
      percentage: process.progress.percentage || 0,
      current: process.progress.current_epoch || 0,
      total: process.progress.total_epochs || 0,
      remaining: process.progress.remaining_time || '计算中...',
      loss: process.progress.current_loss || null,
      epoch: process.progress.current_epoch ? `${process.progress.current_epoch}/${process.progress.total_epochs}` : null,
      lr: process.progress.current_lr || null,
      step: process.progress.current_step && process.progress.total_steps ?
        `${process.progress.current_step}/${process.progress.total_steps}` : null,
      currentStep: process.progress.current_step || 0,
      totalSteps: process.progress.total_steps || 0
    };
  }

  if (!process.logs) return defaultProgress;

  const logText = process.logs.slice(-2000); // most recent 2000 characters

  // Return the LAST match of `pattern` in the log tail, or null.
  // String.prototype.matchAll throws a TypeError for a non-global RegExp, so
  // a global copy of the pattern is used.
  const lastMatchOf = (pattern) => {
    const flags = pattern.flags.includes('g') ? pattern.flags : pattern.flags + 'g';
    let last = null;
    for (const m of logText.matchAll(new RegExp(pattern.source, flags))) last = m;
    return last;
  };

  // First pattern (in list order) that matches wins; its most recent match
  // supplies the "current/total" pair.
  const latestPair = (patterns) => {
    for (const pattern of patterns) {
      const m = lastMatchOf(pattern);
      if (m) return [parseInt(m[1], 10), parseInt(m[2], 10)];
    }
    return null;
  };

  // First pattern whose most recent match parses to a plausible number wins.
  const latestNumber = (patterns, isPlausible) => {
    for (const pattern of patterns) {
      const m = lastMatchOf(pattern);
      if (!m) continue;
      const value = parseFloat(m[1]);
      if (!isNaN(value) && isPlausible(value)) return value;
    }
    return null;
  };

  const epochPatterns = [
    /epoch\s+(\d+)\s*\/\s*(\d+)/i, // epoch 3/10
    /Epoch\s+(\d+)\s*of\s*(\d+)/i, // Epoch 3 of 10
    /\[(\d+)\/(\d+)\]/i, // [3/10]
    /epoch\s*[:]\s*(\d+)\s*\/\s*(\d+)/i, // epoch: 3/10
    /第\s*(\d+)\s*轮\s*\/\s*共\s*(\d+)\s*轮/i // 第3轮/共10轮
  ];
  const stepPatterns = [
    /step\s+(\d+)\s*\/\s*(\d+)/i, // step 150/1000
    /Step\s+(\d+)\s*of\s*(\d+)/i, // Step 150 of 1000
    /\[(\d+)\/(\d+)\]/i, // [150/1000]
    /step\s*[:]\s*(\d+)\s*\/\s*(\d+)/i, // step: 150/1000
    /第\s*(\d+)\s*步\s*\/\s*共\s*(\d+)\s*步/i, // 第150步/共1000步
    /步数\s*(\d+)\s*\/\s*(\d+)/i, // 步数 150/1000
    /batch\s+(\d+)\s*\/\s*(\d+)/i, // batch 150/1000
    /Batch\s+(\d+)\s*of\s*(\d+)/i // Batch 150 of 1000
  ];
  const lossPatterns = [
    /loss[\s:=]\s*([\d.]+(?:e[+-]?\d+)?)/i, // loss: 4.32 or loss = 4.32
    /training_loss[\s:=]\s*([\d.]+(?:e[+-]?\d+)?)/i, // training_loss: 4.32
    /train_loss[\s:=]\s*([\d.]+(?:e[+-]?\d+)?)/i, // train_loss: 4.32
    /Loss[\s:=]\s*([\d.]+(?:e[+-]?\d+)?)/i, // Loss: 4.32
    /训练损失[\s:=]\s*([\d.]+(?:e[+-]?\d+)?)/i, // 训练损失: 4.32
    /损失[\s:=]\s*([\d.]+(?:e[+-]?\d+)?)/i, // 损失: 4.32
    /\s+([\d.]+(?:e[+-]?\d+)?)\s*loss/i, // 4.32 loss
    /\s+([\d.]+(?:e[+-]?\d+)?)\s*训练损失/i, // 4.32 训练损失
    /(?:loss|损失|training_loss|train_loss)\s*=\s*([\d.]+(?:e[+-]?\d+)?)/i // loss = 4.32
  ];
  const lrPatterns = [
    /lr[\s:=]\s*([\d.e+-]+)/i, // lr: 1e-4
    /learning_rate[\s:=]\s*([\d.e+-]+)/i, // learning_rate: 1e-4
    /LR[\s:=]\s*([\d.e+-]+)/i, // LR: 1e-4
    /学习率[\s:=]\s*([\d.e+-]+)/i // 学习率: 1e-4
  ];

  // Epoch and step now use the most recent occurrence in the tail, like loss
  // and lr, so the display does not lag behind by the length of the window.
  const epochPair = latestPair(epochPatterns);
  const current = epochPair ? epochPair[0] : 0;
  const total = epochPair ? epochPair[1] : 0;

  // Without a usable epoch total there is nothing to report.
  if (!(total > 0)) return defaultProgress;

  const stepPair = latestPair(stepPatterns);
  const currentStep = stepPair ? stepPair[0] : 0;
  const totalSteps = stepPair ? stepPair[1] : 0;
  const stepInfo = stepPair ? `${currentStep}/${totalSteps}` : null;

  // Values outside these ranges are treated as false matches.
  const lossValue = latestNumber(lossPatterns, (v) => v > 0 && v < 100);
  const currentLoss = lossValue === null ? null : lossValue.toFixed(4);
  const lrValue = latestNumber(lrPatterns, (v) => v > 0 && v < 1);
  const currentLr = lrValue === null ? null : lrValue.toExponential(2);

  let finalPercentage = Math.round((current / total) * 100);
  if (totalSteps > 0 && currentStep > 0) {
    // Epoch share plus the step share inside the current epoch, where each
    // epoch accounts for 1/total of the whole.
    const epochPercentage = (current / total) * 100;
    const stepPercentageInEpoch = (currentStep / totalSteps) * 100;
    const stepContribution = stepPercentageInEpoch / total;
    finalPercentage = Math.min(100, Math.max(0, Math.round(epochPercentage + stepContribution)));
  }

  return {
    percentage: finalPercentage,
    current,
    total,
    remaining: calculateRemainingTime(current, total, logText),
    loss: currentLoss,
    epoch: `${current}/${total}`,
    lr: currentLr,
    step: stepInfo,
    currentStep,
    totalSteps
  };
}
// Handle of the active polling interval, or null when polling is stopped.
let processPollingTimer = null;

/**
 * (Re)start the 2-second polling interval.
 * Each tick refreshes process status only while the active tab's text
 * contains '进程'.
 */
export function startProcessPolling() {
  if (processPollingTimer) {
    clearInterval(processPollingTimer);
  }

  const tick = () => {
    const activeTab = document.querySelector('.tab.active');
    if (!activeTab || !activeTab.textContent.includes('进程')) return;
    checkProcessStatusChanges();
  };
  processPollingTimer = setInterval(tick, 2000);
}
/**
 * Stop the polling interval if one is active and forget its handle.
 */
export function stopProcessPolling() {
  if (!processPollingTimer) return;
  clearInterval(processPollingTimer);
  processPollingTimer = null;
}
/**
 * Fetch the process list and update the rows already in the DOM.
 *
 * A row whose status changed is rebuilt (with an error notification when the
 * new status is '出错'); a row whose process is still running only gets its
 * progress refreshed. Processes without a row are skipped. Any failure in
 * the chain shows a connection-error notification.
 */
export function checkProcessStatusChanges() {
  return getProcesses()
    .then((data) => {
      let updatedCount = 0;

      for (const proc of data) {
        const row = document.querySelector(`[data-process-id="${proc.id}"]`);
        if (!row) continue;

        const statusChanged = row.dataset.processStatus !== proc.status;
        if (statusChanged) {
          updateProcessItem(row, proc);
          if (proc.status === '出错') showNotification(`进程 ${proc.train_type} 已出错`, 'error');
          updatedCount++;
        } else if (proc.running) {
          // Same status, but a running process still needs fresh progress.
          updateProcessProgress(row, proc);
          updatedCount++;
        }
      }

      // Debug output; may be removed for production.
      if (updatedCount > 0) {
        console.log(`[${new Date().toLocaleTimeString()}] 更新了 ${updatedCount} 个进程的进度信息`);
      }
    })
    .catch(() => {
      showNotification('连接服务器失败,请刷新页面重试', 'error');
    });
}
/**
 * Fetch all processes and rebuild the process list, grouped by training type.
 *
 * Processes are sorted newest first by `start_time`. Groups for known
 * training types appear in a fixed order, followed by any other types.
 * Returns the promise from getProcesses().
 */
export function loadProcesses() {
  return getProcesses().then((data) => {
    const list = document.getElementById('process-list');
    clearChildren(list);
    if (data.length === 0) {
      list.innerHTML = '<p>暂无训练进程</p>';
      return;
    }

    data.sort((a, b) => new Date(b.start_time) - new Date(a.start_time));

    const groups = {};
    data.forEach((p) => {
      if (!groups[p.train_type]) groups[p.train_type] = [];
      groups[p.train_type].push(p);
    });

    // Includes ppo, grpo and spo so these groups get a stable position, the
    // same as in the log file view, instead of depending on which process
    // happened to be seen first.
    const order = ['pretrain', 'sft', 'lora', 'dpo', 'ppo', 'grpo', 'spo'];
    const types = [...order.filter((t) => groups[t]), ...Object.keys(groups).filter((t) => !order.includes(t))];
    types.forEach((t) => {
      list.appendChild(createTypeGroupWithToggle(t, groups[t]));
    });
  });
}
/**
 * Build a collapsible group element holding all processes of one type.
 *
 * @param {string} trainType Training type key used for the group title.
 * @param {Array<object>} processes Processes rendered inside the group.
 * @returns {HTMLElement} The group element (header followed by content).
 */
function createTypeGroupWithToggle(trainType, processes) {
  const header = el('div', { class: 'process-type-header' });
  header.dataset.expanded = 'true';

  const title = el('h3', { class: 'process-type-title', text: getTrainTypeDisplayName(trainType) });

  const toggle = el('button', { class: 'toggle-btn' });
  toggle.innerHTML = '▼';
  toggle.onclick = (event) => {
    // Keep the click from also reaching the header's own handler.
    event.stopPropagation();
    toggleGroup(header);
  };

  header.appendChild(title);
  header.appendChild(toggle);
  header.onclick = () => toggleGroup(header);

  const content = el('div', { class: 'process-type-content' });
  processes.forEach((proc) => {
    addProcessItemToGroup(content, proc);
  });

  const group = el('div', { class: 'process-type-group' });
  group.appendChild(header);
  group.appendChild(content);
  return group;
}
/**
 * Collapse or expand the content element that follows a group header.
 *
 * The expanded state is tracked in `header.dataset.expanded`; the toggle
 * button glyph is updated to match.
 *
 * @param {HTMLElement} header Group header whose next sibling is the content.
 */
function toggleGroup(header) {
  const isExpanded = header.dataset.expanded === 'true';
  const content = header.nextElementSibling;
  const toggle = header.querySelector('.toggle-btn');

  if (isExpanded) {
    header.dataset.expanded = 'false';
    content.style.maxHeight = '0';
    content.style.overflow = 'hidden';
    toggle.innerHTML = '▶';
    return;
  }

  // Measure the natural height first, then go from 0 to that height.
  content.style.overflow = 'hidden';
  content.style.maxHeight = 'none';
  const targetHeight = content.scrollHeight;
  content.style.maxHeight = '0';
  // The value is read and discarded - presumably to force a layout pass so
  // the following height change is applied as a separate step.
  void content.offsetHeight;
  header.dataset.expanded = 'true';
  content.style.maxHeight = `${targetHeight}px`;

  // After 300 ms lift the height limit so later content changes are not clipped.
  setTimeout(() => {
    content.style.maxHeight = 'none';
    content.style.overflow = 'visible';
  }, 300);

  toggle.innerHTML = '▼';
}
/**
 * Map a training type key to its human-readable display name.
 *
 * @param {string} trainType Training type key (e.g. 'pretrain', 'sft').
 * @returns {string} The display name, or `trainType` itself when the key
 *   is not one of the known types.
 */
function getTrainTypeDisplayName(trainType) {
  const names = {
    pretrain: '预训练 (Pretrain)',
    sft: '全参数监督微调 (SFT - Full)',
    lora: 'LoRA监督微调 (SFT - Lora)',
    dpo: '直接偏好优化 (RL - DPO)',
    ppo: 'PPO',
    grpo: 'GRPO',
    spo: 'SPO',
  };
  // Only consult own keys: a bare `names[trainType]` would also resolve
  // inherited members such as 'constructor' or 'toString' and return a
  // function instead of a string.
  const isKnown = Object.prototype.hasOwnProperty.call(names, trainType);
  return (isKnown && names[trainType]) || trainType;
}
/**
 * Render one process as a `.process-item` element and append it to `parent`.
 *
 * The item carries the process id, status, monitor type and SwanLab URL in
 * its dataset, shows a progress section while the process is running, and
 * gets its action buttons wired up via `bindItemButtons`.
 *
 * @param {HTMLElement} parent Container the new item is appended to.
 * @param {object} process Process record; reads `id`, `status`, `running`,
 *   `start_time`, `train_monitor` and `swanlab_url`.
 */
export function addProcessItemToGroup(parent, process) {
  // Values from the server are interpolated into an HTML template below,
  // so escape them to keep markup characters from being parsed as HTML.
  const HTML_ESCAPES = { '&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', "'": '&#39;' };
  const esc = (value) => String(value).replace(/[&<>"']/g, (ch) => HTML_ESCAPES[ch]);

  const item = el('div', { class: 'process-item' });

  let statusClass = 'status-completed';
  if (process.status === '运行中') statusClass = 'status-running';
  else if (process.status === '手动停止') statusClass = 'status-manual-stop';
  else if (process.status === '出错') statusClass = 'status-error';

  // Normalise once so the dataset value and the button visibility agree:
  // a missing monitor is treated as 'none' in both places.
  const monitor = process.train_monitor || 'none';

  item.dataset.processId = process.id;
  item.dataset.processStatus = process.status;
  item.dataset.trainMonitor = monitor;
  item.dataset.swanlabUrl = process.swanlab_url || '';

  const id = esc(process.id);
  const showDelete = !process.running;
  const showSwanlab = monitor !== 'none';
  const swanBtn = showSwanlab ? `<button class="btn-swanlab" data-swan="${id}">SwanLab</button>` : '';

  // Compute the progress information shown while the process is running.
  const progressInfo = calculateProgress(process);
  const metric = (label, value) =>
    value
      ? `<div class="metric-item"><span class="metric-label">${label}</span><span class="metric-value">${esc(value)}</span></div>`
      : '';
  const stepSuffix = progressInfo.step ? ` (${esc(progressInfo.step)})` : '';

  const progressBar = process.running ? `
    <div class="progress-container">
      <div class="progress-bar">
        <div class="progress-fill" style="width: ${esc(progressInfo.percentage)}%"></div>
      </div>
      <div class="progress-info">
        <span>进度: ${esc(progressInfo.current)}/${esc(progressInfo.total)}${stepSuffix}</span>
        <span>剩余时间: ${esc(progressInfo.remaining)}</span>
      </div>
      <div class="progress-metrics">
        ${metric('Loss:', progressInfo.loss)}
        ${metric('Epoch:', progressInfo.epoch)}
        ${metric('Step:', progressInfo.step)}
        ${metric('LR:', progressInfo.lr)}
      </div>
    </div>
  ` : '';

  item.innerHTML = `
    <div class="process-info">
      <div><strong>${esc(process.start_time)}</strong></div>
      <div><span class="process-status ${statusClass}">${esc(process.status)}</span></div>
    </div>
    ${progressBar}
    <div>
      <button class="btn-logs" data-show="${id}">查看日志</button>
      <button class="btn-logs" data-refresh="${id}">刷新日志</button>
      ${swanBtn}
      ${process.running ? `<button class="btn-stop" data-stop="${id}">停止训练</button>` : ''}
      ${showDelete ? `<button class="btn-delete" data-del="${id}">删除</button>` : ''}
    </div>
    <div id="logs-${id}" class="logs-container hidden"></div>
  `;

  parent.appendChild(item);
  bindItemButtons(item, process);
}
/**
 * Attach click handlers to the action buttons rendered inside an item.
 *
 * Buttons are located by their `data-*` marker attribute; markers that are
 * not present in the item are skipped.
 *
 * @param {HTMLElement} item Rendered process item.
 * @param {object} process Process record; its `id` is passed to handlers.
 */
function bindItemButtons(item, process) {
  const actions = [
    ['[data-show]', (id) => showLogs(id)],
    ['[data-refresh]', (id) => refreshLog(id)],
    ['[data-swan]', (id) => checkAndOpenSwanlab(id)],
    ['[data-stop]', (id) => stopProcess(id)],
    ['[data-del]', (id) => deleteProcess(id)],
  ];

  actions.forEach(([selector, run]) => {
    const button = item.querySelector(selector);
    if (!button) return;
    button.addEventListener('click', () => run(process.id));
  });
}
/**
 * Refresh only the progress section of an already rendered process item.
 *
 * Updates the progress bar width, the progress/remaining-time texts and
 * the metric values (Loss / Epoch / Step / LR). Metrics with no current
 * value keep whatever they displayed before.
 *
 * @param {HTMLElement} item Rendered process item.
 * @param {object} process Process record passed to `calculateProgress`.
 */
export function updateProcessProgress(item, process) {
  const progressInfo = calculateProgress(process);

  const progressFill = item.querySelector('.progress-fill');
  const progressText = item.querySelector('.progress-info span:first-child');
  const remainingText = item.querySelector('.progress-info span:last-child');
  const metricsContainer = item.querySelector('.progress-metrics');

  if (progressFill) {
    progressFill.style.width = `${progressInfo.percentage}%`;
  }
  if (progressText) {
    const stepText = progressInfo.step ? ` (${progressInfo.step})` : '';
    progressText.textContent = `进度: ${progressInfo.current}/${progressInfo.total}${stepText}`;
  }
  if (remainingText) {
    remainingText.textContent = `剩余时间: ${progressInfo.remaining}`;
  }
  if (!metricsContainer) return;

  // Metric items are rendered only when their value exists, so their
  // position inside the container is not fixed. Identify each item by its
  // label text instead of by position; otherwise a value would be written
  // into the wrong metric whenever an earlier one is absent.
  const metrics = [
    ['Loss:', progressInfo.loss],
    ['Epoch:', progressInfo.epoch],
    ['Step:', progressInfo.step],
    ['LR:', progressInfo.lr],
  ];

  const rendered = new Map();
  metricsContainer.querySelectorAll('.metric-item').forEach((node) => {
    const labelEl = node.querySelector('.metric-label');
    if (labelEl) rendered.set(labelEl.textContent.trim(), node);
  });

  metrics.forEach(([label, value], index) => {
    if (!value) return;

    let node = rendered.get(label);
    if (!node) {
      // The metric was not available when the item was first rendered:
      // create it now and insert it at its canonical position.
      node = document.createElement('div');
      node.className = 'metric-item';
      const labelEl = document.createElement('span');
      labelEl.className = 'metric-label';
      labelEl.textContent = label;
      const newValueEl = document.createElement('span');
      newValueEl.className = 'metric-value';
      node.appendChild(labelEl);
      node.appendChild(newValueEl);

      const successor = metrics
        .slice(index + 1)
        .map(([laterLabel]) => rendered.get(laterLabel))
        .find(Boolean) || null;
      metricsContainer.insertBefore(node, successor);
      rendered.set(label, node);
    }

    const valueEl = node.querySelector('.metric-value');
    if (valueEl) valueEl.textContent = value;
  });
}
/**
 * Bring a rendered process item in line with a fresh process record.
 *
 * Updates the dataset, the status badge and the set of action buttons
 * (SwanLab / stop / delete), and clears the log refresh timer once the
 * process is no longer running.
 *
 * @param {HTMLElement} item Rendered process item.
 * @param {object} process Process record; reads `id`, `status`, `running`,
 *   `train_monitor` and `swanlab_url`.
 */
export function updateProcessItem(item, process) {
  // A missing monitor is treated as 'none' both in the dataset and for the
  // button visibility, so the two can never disagree.
  const monitor = process.train_monitor || 'none';

  item.dataset.processStatus = process.status;
  item.dataset.trainMonitor = monitor;
  if (process.swanlab_url) item.dataset.swanlabUrl = process.swanlab_url;

  const statusEl = item.querySelector('.process-status');
  if (statusEl) {
    statusEl.classList.remove('status-running', 'status-manual-stop', 'status-error', 'status-completed');
    let cls = 'status-completed';
    if (process.status === '运行中') cls = 'status-running';
    else if (process.status === '手动停止') cls = 'status-manual-stop';
    else if (process.status === '出错') cls = 'status-error';
    statusEl.classList.add(cls);
    statusEl.textContent = process.status;
  }

  // Locate the action-button container through a button it holds.
  // Positional selectors such as 'div:nth-child(2)' or 'div:last-child'
  // match the status <div> nested inside '.process-info' first, which would
  // put new buttons next to the status badge.
  const anchorBtn = item.querySelector('.btn-logs');
  const btnContainer = anchorBtn ? anchorBtn.parentElement : null;

  const existingSwan = item.querySelector('.btn-swanlab');
  const showSwan = monitor !== 'none';
  if (showSwan && !existingSwan && btnContainer) {
    const swanBtn = el('button', { class: 'btn-swanlab' });
    swanBtn.textContent = 'SwanLab';
    swanBtn.onclick = () => checkAndOpenSwanlab(process.id);
    const stop = btnContainer.querySelector('.btn-stop');
    if (stop) btnContainer.insertBefore(swanBtn, stop);
    else btnContainer.appendChild(swanBtn);
  } else if (!showSwan && existingSwan) {
    existingSwan.remove();
  }

  const stopBtn = item.querySelector('.btn-stop');
  if (stopBtn) {
    if (!process.running) stopBtn.remove();
  } else if (process.running && btnContainer) {
    const newStop = el('button', { class: 'btn-stop' });
    newStop.textContent = '停止训练';
    newStop.onclick = () => stopProcess(process.id);
    btnContainer.appendChild(newStop);
  }

  const delBtn = item.querySelector('.btn-delete');
  if (!process.running) {
    if (!delBtn && btnContainer) {
      const newDelete = el('button', { class: 'btn-delete' });
      newDelete.textContent = '删除';
      newDelete.onclick = () => deleteProcess(process.id);
      btnContainer.appendChild(newDelete);
    }
  } else if (delBtn) {
    delBtn.remove();
  }

  // A finished process no longer needs its log auto-refresh timer.
  if (!process.running) clearLogTimerFor(process.id);
}
/**
 * Ask for confirmation, delete a process on the server and remove its item
 * from the page with a fade-out.
 *
 * When the removed item was the last one of its group the group is faded
 * out and removed as well; when no items remain at all the empty-list
 * placeholder is shown.
 *
 * @param {*} processId Identifier of the process to delete.
 */
export function deleteProcess(processId) {
  // Show the placeholder when the page holds no process items any more.
  const showPlaceholderIfEmpty = () => {
    const left = document.querySelectorAll('.process-item');
    if (left.length === 0) {
      const list = document.getElementById('process-list');
      list.innerHTML = '<p>暂无训练进程</p>';
    }
  };

  // Fade a now-empty group out and remove it 300 ms later.
  const fadeOutGroup = (group) => {
    group.style.transition = 'opacity 0.3s, transform 0.3s';
    group.style.opacity = '0';
    group.style.transform = 'translateY(-10px)';
    setTimeout(() => {
      if (group.parentNode) group.parentNode.removeChild(group);
      showPlaceholderIfEmpty();
    }, 300);
  };

  // Detach the item and tidy up its surrounding group.
  const detachItem = (item) => {
    const content = item.closest('.process-type-content');
    const group = content ? content.closest('.process-type-group') : null;
    item.parentNode.removeChild(item);
    if (!content) return;

    const remaining = content.querySelectorAll('.process-item');
    if (remaining.length === 0 && group) {
      setTimeout(() => fadeOutGroup(group), 100);
      return;
    }

    // Re-sync the max-height of an expanded group with its new content.
    const header = content.previousElementSibling;
    if (header && header.dataset.expanded === 'true') content.style.maxHeight = content.scrollHeight + 'px';
    showPlaceholderIfEmpty();
  };

  const onDeleted = () => {
    const item = document.querySelector(`[data-process-id="${processId}"]`);
    if (item && item.parentNode) {
      item.style.transition = 'opacity 0.3s, transform 0.3s';
      item.style.opacity = '0';
      item.style.transform = 'translateX(-20px)';
      setTimeout(() => detachItem(item), 300);
    }
    clearLogTimerFor(processId);
    showNotification('训练进程已删除', 'success');
  };

  const onDeleteFailed = () => {
    showNotification('删除进程失败,请刷新页面重试', 'error');
  };

  showConfirmDialog('确定要删除这个训练进程吗?此操作不可恢复。', () => {
    apiDelete(processId).then(onDeleted).catch(onDeleteFailed);
  });
}
/**
 * Ask for confirmation and stop a running process.
 *
 * On success the item is immediately marked as manually stopped, then the
 * process list is fetched again and the item is synchronised with the
 * server's record. Cancelling the dialog shows an informational notice.
 *
 * @param {*} processId Identifier of the process to stop.
 */
export function stopProcess(processId) {
  // Optimistically reflect the stop in the rendered item.
  const markStopped = (item) => {
    item.dataset.processStatus = '手动停止';
    const badge = item.querySelector('.process-status');
    if (badge) {
      badge.classList.remove('status-running', 'status-error', 'status-completed');
      badge.classList.add('status-manual-stop');
      badge.textContent = '手动停止';
    }
    const stopBtn = item.querySelector('.btn-stop');
    if (stopBtn) stopBtn.remove();
    clearLogTimerFor(processId);
  };

  const onStopped = () => {
    const item = document.querySelector(`[data-process-id="${processId}"]`);
    if (item) markStopped(item);
    showNotification('训练进程已停止', 'info');

    // Follow up with the authoritative state; failures here are ignored.
    getProcesses()
      .then((processes) => {
        const fresh = processes.find((proc) => proc.id === processId);
        if (fresh && item) updateProcessItem(item, fresh);
      })
      .catch(() => {});
  };

  const onStopFailed = () => {
    showNotification('停止进程失败', 'error');
  };

  const onConfirmed = () => {
    apiStop(processId).then(onStopped).catch(onStopFailed);
  };

  const onCancelled = () => {
    showNotification('已取消停止操作', 'info');
  };

  showConfirmDialog('确定要停止这个训练进程吗?', onConfirmed, onCancelled);
}
/**
 * Open the SwanLab page of a process.
 *
 * Uses the URL cached in the item's dataset when present; otherwise fetches
 * the process list, caches the URL on the item and opens it. Shows a
 * notification when monitoring is disabled or no URL is available.
 *
 * @param {*} processId Identifier of the process.
 */
export function checkAndOpenSwanlab(processId) {
  const item = document.querySelector(`[data-process-id="${processId}"]`);

  const monitor = item ? item.dataset.trainMonitor : 'none';
  if (monitor === 'none') {
    showNotification('此训练未启用监控功能', 'info');
    return;
  }

  const cachedUrl = item ? item.dataset.swanlabUrl : '';
  if (cachedUrl && cachedUrl.trim() !== '') {
    openSwanlab(cachedUrl);
    return;
  }

  // No cached URL yet: look it up in the current process list.
  const useFetchedUrl = (processes) => {
    const match = processes.find((proc) => proc.id === processId);
    if (!(match && match.swanlab_url)) {
      showNotification('SwanLab链接尚未生成请稍后再试', 'info');
      return;
    }
    const fetchedUrl = match.swanlab_url;
    if (item) item.dataset.swanlabUrl = fetchedUrl;
    openSwanlab(fetchedUrl);
  };

  const reportFailure = () => {
    showNotification('获取SwanLab链接失败请稍后再试', 'error');
  };

  getProcesses().then(useFetchedUrl).catch(reportFailure);
}
/**
 * Open a SwanLab URL in a new tab and report the outcome.
 *
 * @param {string} url URL to open; rejected with a notice when invalid.
 */
function openSwanlab(url) {
  if (!isValidUrl(url)) {
    showNotification('SwanLab链接无效或尚未生成', 'info');
    return;
  }

  const opened = window.open(url, '_blank');
  if (!opened) {
    // window.open returned a falsy handle, so no new window was obtained.
    showNotification('无法打开新窗口,请检查浏览器设置', 'error');
    return;
  }
  showNotification('正在打开SwanLab页面', 'info');
}
/**
 * Check whether a value is a well-formed absolute http(s) URL.
 *
 * Only the `http:` and `https:` schemes are accepted, so values such as
 * `javascript:...` or `data:...` are rejected before they can be handed to
 * `window.open`. Strings that merely start with "http://" but do not parse
 * as a URL are rejected as well.
 *
 * @param {*} url Candidate value; it is converted to a string first.
 * @returns {boolean} True only for a parsable http/https URL.
 */
function isValidUrl(url) {
  let parsed;
  try {
    parsed = new URL(String(url).trim());
  } catch {
    return false;
  }
  return parsed.protocol === 'http:' || parsed.protocol === 'https:';
}

View File

@ -0,0 +1,73 @@
import { getLogs } from '../services/apiClient.js';
import { setHidden } from '../utils/dom.js';
// Active log auto-refresh interval handles, keyed by process id.
const logTimers = new Map();

/**
 * Toggle the log panel of a process.
 *
 * A hidden panel is shown, filled with the current log content and put on
 * auto-refresh; a visible panel is hidden and its refresh timer cleared.
 *
 * @param {*} processId Identifier of the process whose logs are toggled.
 */
export function showLogs(processId) {
  const container = document.getElementById(`logs-${processId}`);
  if (!container) return;

  const wasHidden = container.classList.contains('hidden');
  setHidden(container, false);

  if (!wasHidden) {
    // The panel was already open: this call closes it again.
    setHidden(container, true);
    clearTimer(processId);
    return;
  }

  loadLogContent(processId, container);
  resetTimer(processId, container);
}
/**
 * Reload the log content of a process, but only while its panel is visible.
 *
 * @param {*} processId Identifier of the process whose logs are reloaded.
 */
export function refreshLog(processId) {
  const container = document.getElementById(`logs-${processId}`);
  const isVisible = Boolean(container) && !container.classList.contains('hidden');
  if (!isVisible) return;

  loadLogContent(processId, container);
  resetTimer(processId, container);
}
/**
 * Public entry point for cancelling a process's log auto-refresh timer.
 *
 * @param {*} processId Identifier of the process whose timer is cleared.
 */
export function clearLogTimerFor(processId) {
  // Thin exported wrapper around the module-private helper.
  clearTimer(processId);
}
/**
 * Tell whether a log auto-refresh timer is registered for a process.
 *
 * @param {*} processId Identifier of the process.
 * @returns {boolean} True when a timer entry exists for `processId`.
 */
export function isLogTimerActive(processId) {
  const active = logTimers.has(processId);
  return active;
}
/**
 * Restart the log auto-refresh timer of a process.
 *
 * Any existing timer is cleared first. A new 1-second timer is started only
 * when the process item reports the running status; the timer cancels
 * itself once the panel is hidden or the process is no longer running.
 *
 * @param {*} processId Identifier of the process.
 * @param {HTMLElement} container Log panel that receives the content.
 */
function resetTimer(processId, container) {
  clearTimer(processId);

  // Re-query on every check: the item may be re-rendered or removed.
  const isRunning = () => {
    const node = document.querySelector(`[data-process-id="${processId}"]`);
    return node && node.dataset.processStatus === '运行中';
  };

  if (!isRunning()) return;

  const tick = () => {
    if (container.classList.contains('hidden') || !isRunning()) {
      clearTimer(processId);
      return;
    }
    loadLogContent(processId, container);
  };

  logTimers.set(processId, setInterval(tick, 1000));
}
/**
 * Cancel and forget the log auto-refresh timer of a process, if any.
 *
 * @param {*} processId Identifier of the process.
 */
function clearTimer(processId) {
  const handle = logTimers.get(processId);
  if (!handle) return;
  clearInterval(handle);
  logTimers.delete(processId);
}
/**
 * Fetch the logs of a process and display them in the given panel.
 *
 * The panel is scrolled to the bottom when it was within 10px of the bottom
 * before the update, or when the text did not change. On failure an error
 * message is written unless the panel text already contains the failure
 * marker.
 *
 * @param {*} processId Identifier of the process.
 * @param {HTMLElement} container Log panel that receives the content.
 * @returns {Promise<void>}
 */
function loadLogContent(processId, container) {
  const previousText = container.textContent;
  const distanceToTop = container.scrollHeight - container.scrollTop;
  const wasAtBottom = distanceToTop <= container.clientHeight + 10;

  const render = (logs) => {
    container.textContent = logs;
    if (wasAtBottom || previousText === container.textContent) {
      container.scrollTop = container.scrollHeight;
    }
  };

  const renderFailure = (err) => {
    if (container.textContent.includes('加载失败')) return;
    container.textContent = `加载日志失败: ${err.message}`;
  };

  return getLogs(processId).then(render).catch(renderFailure);
}

View File

@ -0,0 +1,75 @@
// Default per-attempt timeout in milliseconds.
const defaultTimeout = 10000;

/**
 * `fetch` wrapper adding a per-attempt timeout, no-cache headers and retries.
 *
 * Each attempt is aborted after `timeout` ms. A non-OK response is turned
 * into an `Error('HTTP <status>')`. Failed attempts are retried after
 * `timeout / 2` ms, up to `retries` additional times, except that:
 *   - a timed-out attempt is not retried and rejects with a timeout error;
 *   - HTTP 4xx responses other than 408 and 429 are not retried, because
 *     repeating the same request cannot change a client error and would
 *     only delay the failure and re-send non-idempotent requests.
 *
 * @param {string} url Request URL.
 * @param {object} [options] `fetch` options; the cache-related headers set
 *   here take precedence over same-named headers in `options.headers`.
 * @param {number} [timeout] Per-attempt timeout in milliseconds.
 * @param {number} [retries] Number of retries after the first attempt.
 * @returns {Promise<Response>} Resolves with an OK response.
 */
export function fetchWithTimeoutAndRetry(url, options = {}, timeout = defaultTimeout, retries = 3) {
  const controller = new AbortController();
  const timeoutId = setTimeout(() => controller.abort(), timeout);
  const fetchOptions = {
    ...options,
    headers: {
      ...options.headers,
      'Cache-Control': 'no-cache, no-store, must-revalidate',
      Pragma: 'no-cache',
      Expires: '0',
    },
    signal: controller.signal,
  };

  return fetch(url, fetchOptions)
    .then((response) => {
      clearTimeout(timeoutId);
      if (!response.ok) {
        const httpError = new Error(`HTTP ${response.status}`);
        // Remember the status so the retry policy below can inspect it.
        httpError.status = response.status;
        throw httpError;
      }
      return response;
    })
    .catch((error) => {
      clearTimeout(timeoutId);
      if (error.name === 'AbortError') throw new Error('请求超时');

      const status = error.status;
      const isPermanentClientError =
        status >= 400 && status < 500 && status !== 408 && status !== 429;

      if (retries > 0 && !isPermanentClientError) {
        return new Promise((resolve) => {
          setTimeout(() => {
            resolve(fetchWithTimeoutAndRetry(url, options, timeout, retries - 1));
          }, timeout / 2);
        });
      }
      throw error;
    });
}
/**
 * Fetch the process list from `/processes`.
 *
 * @returns {Promise<*>} Resolves with the parsed JSON body.
 */
export function getProcesses() {
  const parseJson = (response) => response.json();
  return fetchWithTimeoutAndRetry('/processes').then(parseJson);
}
/**
 * Fetch the log text of a process from `/logs/<processId>`.
 *
 * @param {*} processId Identifier of the process.
 * @returns {Promise<string>} Resolves with the response body as text.
 */
export function getLogs(processId) {
  const endpoint = `/logs/${processId}`;
  const readText = (response) => response.text();
  return fetchWithTimeoutAndRetry(endpoint).then(readText);
}
/**
 * Start a training run by POSTing the form payload to `/train` as JSON.
 *
 * @param {object} payload Training parameters.
 * @returns {Promise<*>} Resolves with the parsed JSON body.
 */
export function startTrain(payload) {
  const request = {
    method: 'POST',
    headers: { 'Content-Type': 'application/json', 'Cache-Control': 'no-cache' },
    body: JSON.stringify(payload),
  };
  const parseJson = (response) => response.json();
  return fetchWithTimeoutAndRetry('/train', request).then(parseJson);
}
/**
 * Ask the server to stop a process via POST `/stop/<processId>`.
 *
 * @param {*} processId Identifier of the process.
 * @returns {Promise<object>} Resolves with the parsed JSON body, or `{}`
 *   when the body cannot be parsed as JSON.
 */
export function stopProcess(processId) {
  const endpoint = `/stop/${processId}`;
  const parseJsonOrEmpty = (response) => response.json().catch(() => ({}));
  return fetchWithTimeoutAndRetry(endpoint, { method: 'POST' }).then(parseJsonOrEmpty);
}
/**
 * Ask the server to delete a process via POST `/delete/<processId>`.
 *
 * @param {*} processId Identifier of the process.
 * @returns {Promise<object>} Resolves with the parsed JSON body, or `{}`
 *   when the body cannot be parsed as JSON.
 */
export function deleteProcess(processId) {
  const endpoint = `/delete/${processId}`;
  const parseJsonOrEmpty = (response) => response.json().catch(() => ({}));
  return fetchWithTimeoutAndRetry(endpoint, { method: 'POST' }).then(parseJsonOrEmpty);
}
/**
 * Fetch the list of log files from `/logfiles`.
 *
 * @returns {Promise<*>} Resolves with the parsed JSON body.
 */
export function getLogFiles() {
  const parseJson = (response) => response.json();
  return fetchWithTimeoutAndRetry('/logfiles').then(parseJson);
}
/**
 * Fetch the content of one log file.
 *
 * @param {string} filename Log file name; URL-encoded into the path.
 * @returns {Promise<string>} Resolves with the response body as text.
 */
export function getLogFileContent(filename) {
  const endpoint = `/logfile-content/${encodeURIComponent(filename)}`;
  const readText = (response) => response.text();
  return fetchWithTimeoutAndRetry(endpoint).then(readText);
}
/**
 * Delete one log file on the server via an HTTP DELETE request.
 *
 * @param {string} filename Log file name; URL-encoded into the path.
 * @returns {Promise<*>} Resolves with the parsed JSON body.
 */
export function deleteLogFile(filename) {
  const endpoint = `/delete-logfile/${encodeURIComponent(filename)}`;
  const request = {
    method: 'DELETE',
    headers: { 'Cache-Control': 'no-cache' },
  };
  const parseJson = (response) => response.json();
  return fetchWithTimeoutAndRetry(endpoint, request).then(parseJson);
}

View File

@ -0,0 +1,29 @@
// localStorage key under which the API key is stored.
const KEY = 'minimind_api_key';

/**
 * Read the stored API key.
 *
 * @returns {string} The stored key, or '' when nothing is stored or the
 *   storage access throws.
 */
export function getApiKey() {
  let stored;
  try {
    stored = localStorage.getItem(KEY);
  } catch (_) {
    return '';
  }
  return stored || '';
}
/**
 * Persist the API key; a falsy value is stored as ''.
 *
 * Storage errors are deliberately ignored (best effort).
 *
 * @param {string} k API key to store.
 */
export function setApiKey(k) {
  const value = k || '';
  try {
    localStorage.setItem(KEY, value);
  } catch (_) {
    // Storage access threw: nothing to do.
  }
}
/**
 * Register this client via POST `/api/register` and remember the API key.
 *
 * @param {object} [payload] Registration data; `{}` is sent when falsy.
 * @returns {Promise<*>} Resolves with the parsed JSON response. Rejects
 *   with `Error('register_failed')` on a non-OK response.
 */
export function registerClient(payload) {
  const request = {
    method: 'POST',
    headers: { 'Content-Type': 'application/json', 'Cache-Control': 'no-cache' },
    body: JSON.stringify(payload || {}),
  };

  const parseOrFail = (response) => {
    if (!response.ok) throw new Error('register_failed');
    return response.json();
  };

  const rememberKey = (result) => {
    if (result && result.api_key) setApiKey(result.api_key);
    return result;
  };

  return fetch('/api/register', request).then(parseOrFail).then(rememberKey);
}

View File

@ -0,0 +1,128 @@
import { startTrain } from '../services/apiClient.js';
import { showNotification } from '../ui/notify.js';
/**
 * Wire up the training form: training-type switching, GPU selectors and
 * the submit handler.
 */
export function initTrainForm() {
  const typeSelect = document.getElementById('train_type');
  if (typeSelect) {
    typeSelect.addEventListener('change', onTrainTypeChange);
    // Fire one synthetic change so the handler runs for the initial value.
    typeSelect.dispatchEvent(new Event('change'));
  }

  initGpuSelectors();

  const trainForm = document.getElementById('train-form');
  if (trainForm) trainForm.addEventListener('submit', onSubmit);
}
/**
 * `change` handler of the training-type select (`this` is the select).
 *
 * Shows or hides the type-specific fields and parameter cards, then loads
 * the default parameter values of the selected training type.
 */
function onTrainTypeChange() {
  const v = this.value;

  const displayFor = (shown) => (shown ? 'block' : 'none');
  const setAll = (selector, shown) => {
    document.querySelectorAll(selector).forEach((node) => {
      node.style.display = displayFor(shown);
    });
  };
  const setOne = (selector, shown) => {
    const node = document.querySelector(selector);
    if (node) node.style.display = displayFor(shown);
  };

  // Visibility must be settled before defaults are applied, because
  // setDefaults skips fields that sit inside a hidden parameter card.
  setAll('.pretrain-sft', ['pretrain', 'sft', 'dpo', 'ppo', 'grpo', 'spo'].includes(v));
  setAll('.from-weight', !['ppo', 'grpo', 'spo'].includes(v));
  setAll('.lora', v === 'lora');
  setAll('.dpo', v === 'dpo');
  setAll('.ppo', v === 'ppo');
  setOne('.parameter-card.dpo', v === 'dpo');
  setOne('.parameter-card.ppo', v === 'ppo');
  setAll('.grpo', v === 'grpo');
  setAll('.spo', v === 'spo');
  setOne('.parameter-card.grpo', v === 'grpo');
  setOne('.parameter-card.spo', v === 'spo');

  // Default parameter values per training type.
  const presets = {
    pretrain: { save_dir: '../out', save_weight: 'pretrain', epochs: '1', batch_size: '32', learning_rate: '5e-4', data_path: '../dataset/pretrain_hq.jsonl', from_weight: 'none', log_interval: '100', save_interval: '100', hidden_size: '512', num_hidden_layers: '8', max_seq_len: '512', use_moe: '0' },
    sft: { save_dir: '../out', save_weight: 'full_sft', epochs: '2', batch_size: '16', learning_rate: '5e-7', data_path: '../dataset/sft_mini_512.jsonl', from_weight: 'pretrain', log_interval: '100', save_interval: '100', hidden_size: '512', num_hidden_layers: '8', max_seq_len: '512', use_moe: '0' },
    lora: { save_dir: '../out/lora', lora_name: 'lora_identity', epochs: '50', batch_size: '32', learning_rate: '1e-4', data_path: '../dataset/lora_identity.jsonl', from_weight: 'full_sft', log_interval: '10', save_interval: '1', hidden_size: '512', num_hidden_layers: '8', max_seq_len: '512', use_moe: '0' },
    dpo: { save_dir: '../out', save_weight: 'dpo', epochs: '1', batch_size: '4', learning_rate: '4e-8', data_path: '../dataset/dpo.jsonl', from_weight: 'full_sft', log_interval: '100', save_interval: '100', beta: '0.1', hidden_size: '512', num_hidden_layers: '8', max_seq_len: '1024', use_moe: '0' },
    ppo: { save_dir: '../out', save_weight: 'ppo_actor', epochs: '1', batch_size: '2', learning_rate: '8e-8', data_path: '../dataset/rlaif-mini.jsonl', log_interval: '1', save_interval: '10', clip_epsilon: '0.1', vf_coef: '0.5', kl_coef: '0.02', reasoning: '1', update_old_actor_freq: '4', reward_model_path: '../../internlm2-1_8b-reward', hidden_size: '512', num_hidden_layers: '8', max_seq_len: '66', use_moe: '0' },
    grpo: { save_dir: '../out', save_weight: 'grpo', epochs: '1', batch_size: '2', learning_rate: '8e-8', data_path: '../dataset/rlaif-mini.jsonl', log_interval: '1', save_interval: '10', beta: '0.02', num_generations: '8', reasoning: '1', reward_model_path: '../../internlm2-1_8b-reward', hidden_size: '512', num_hidden_layers: '8', max_seq_len: '66', use_moe: '0' },
    spo: { save_dir: '../out', save_weight: 'spo', epochs: '1', batch_size: '2', learning_rate: '1e-7', data_path: '../dataset/rlaif-mini.jsonl', log_interval: '1', save_interval: '10', beta: '0.02', reasoning: '1', reward_model_path: '../../internlm2-1_8b-reward', hidden_size: '512', num_hidden_layers: '8', max_seq_len: '66', use_moe: '0' },
  };

  if (Object.prototype.hasOwnProperty.call(presets, v)) setDefaults(presets[v]);
}
/**
 * Write default values into the form fields named by the map's keys.
 *
 * A field is skipped when it sits inside a `.parameter-card` whose inline
 * display style is 'none'.
 *
 * @param {Object<string, string>} map Field name to default value.
 */
function setDefaults(map) {
  Object.keys(map).forEach((fieldName) => {
    const defaultValue = map[fieldName];
    document.querySelectorAll(`[name="${fieldName}"]`).forEach((field) => {
      const card = field.closest('.parameter-card');
      if (card && card.style.display === 'none') return;
      field.value = defaultValue;
    });
  });
}
/**
 * Initialise the training-mode select and the GPU selection sections.
 *
 * Reads `window.hasGpu` and `window.gpuCount`. Without a GPU the mode is
 * forced to 'cpu'; with GPUs the GPU-count input is prefilled. The single /
 * multi GPU sections follow the selected mode.
 */
function initGpuSelectors() {
  const gpuAvailable = window.hasGpu === true;
  const gpuTotal = Number(window.gpuCount || 0);

  const modeSel = document.getElementById('training_mode');
  const singleSection = document.getElementById('single-gpu-selection');
  const multiSection = document.getElementById('multi-gpu-selection');
  if (!modeSel) return;

  const setDisplay = (section, shown) => {
    if (section) section.style.display = shown ? 'block' : 'none';
  };

  const syncSections = () => {
    const mode = modeSel.value;
    setDisplay(singleSection, mode === 'single_gpu');
    setDisplay(multiSection, mode === 'multi_gpu');
  };

  if (gpuAvailable) {
    const gpuNumInput = document.getElementById('gpu_num');
    if (gpuNumInput && gpuTotal > 0) gpuNumInput.value = gpuTotal;
  } else {
    modeSel.value = 'cpu';
    setDisplay(singleSection, false);
    setDisplay(multiSection, false);
  }

  syncSections();
  modeSel.addEventListener('change', syncSections);
}
/**
 * Submit handler of the training form.
 *
 * Collects the values of all named fields that are not inside a hidden
 * parameter card, adapts the `device` / `gpu_num` fields to the selected
 * training mode, and starts the training through the API after a short
 * delay.
 *
 * @param {Event} e Submit event; `e.currentTarget` is the form.
 */
function onSubmit(e) {
  e.preventDefault();
  const form = e.currentTarget;

  const modeSelect = form.querySelector('#training_mode');
  const trainingMode = modeSelect ? modeSelect.value : 'cpu';

  const payload = {};
  form.querySelectorAll('input, select, textarea').forEach((field) => {
    const fieldName = field.name;
    if (!fieldName || fieldName === 'training_mode') return;

    // Fields inside a hidden parameter card do not belong to this run.
    const card = field.closest('.parameter-card');
    if (card && card.style.display === 'none') return;

    // Unchecked checkboxes are left out of the payload.
    if (field.type === 'checkbox' && !field.checked) return;

    if (fieldName === 'gpu_num') {
      const multiSection = document.getElementById('multi-gpu-selection');
      const multiVisible = multiSection && multiSection.style.display !== 'none';
      if (!multiVisible) return;
    }

    let fieldValue = field.value;
    if (fieldName === 'device') {
      if (trainingMode === 'single_gpu') fieldValue = `cuda:${fieldValue}`;
      else if (trainingMode === 'cpu') fieldValue = 'cpu';
      else return;
    }

    payload[fieldName] = fieldValue;
  });

  showNotification('正在启动训练...', 'info');

  const switchToProcessTab = () => {
    const processTab = document.querySelector('.tab[onclick*="processes"]');
    if (processTab) processTab.click();
  };

  const onResult = (result) => {
    if (!result.success) {
      showNotification('训练启动失败:' + result.error, 'error');
      return;
    }
    showNotification('训练已开始!', 'success');
    setTimeout(switchToProcessTab, 1000);
  };

  const onRequestError = () => {
    showNotification('启动训练中,请耐心等待...', 'info');
  };

  setTimeout(() => {
    startTrain(payload).then(onResult).catch(onRequestError);
  }, 1000);
}

View File

@ -0,0 +1,51 @@
/**
 * Show a modal confirmation dialog.
 *
 * An already open dialog is removed first. Confirming runs `onConfirm`;
 * cancelling or clicking the overlay background runs `onCancel`. The
 * dialog reacts to the first choice only and is always closed afterwards,
 * even when the callback throws.
 *
 * @param {string} message Dialog text. NOTE(review): it is inserted as
 *   HTML, so callers must not pass unescaped untrusted text.
 * @param {Function} onConfirm Called when the user confirms.
 * @param {Function|null} [onCancel] Called when the user cancels.
 */
export function showConfirmDialog(message, onConfirm, onCancel = null) {
  const existing = document.querySelector('.custom-dialog');
  if (existing && existing.parentNode && existing.parentNode.classList.contains('dialog-overlay')) {
    document.body.removeChild(existing.parentNode);
  }

  const overlay = document.createElement('div');
  overlay.className = 'dialog-overlay';
  const container = document.createElement('div');
  container.className = 'custom-dialog';
  container.innerHTML = `
    <div class="dialog-content">
      <div class="dialog-message">${message}</div>
      <div class="dialog-actions">
        <button class="dialog-button dialog-cancel">取消</button>
        <button class="dialog-button dialog-confirm">确认</button>
      </div>
    </div>
  `;
  overlay.appendChild(container);
  document.body.appendChild(overlay);

  // Add the 'show' class slightly later than the insertion.
  setTimeout(() => {
    overlay.classList.add('show');
    container.classList.add('show');
  }, 10);

  // The dialog stays in the DOM for 300 ms while it closes, so without this
  // guard a double click would run the callback twice.
  let settled = false;
  const settle = (callback) => {
    if (settled) return;
    settled = true;
    try {
      if (callback) callback();
    } finally {
      closeDialog(overlay);
    }
  };

  const confirmBtn = container.querySelector('.dialog-confirm');
  confirmBtn.addEventListener('click', () => settle(onConfirm));

  const cancelBtn = container.querySelector('.dialog-cancel');
  cancelBtn.addEventListener('click', () => settle(onCancel));

  overlay.addEventListener('click', (event) => {
    // Only a click on the backdrop itself counts as cancel.
    if (event.target === overlay) settle(onCancel);
  });
}
/**
 * Close a dialog: drop its 'show' classes and remove the overlay from the
 * document body 300 ms later.
 *
 * @param {HTMLElement} overlay Dialog overlay element.
 */
export function closeDialog(overlay) {
  overlay.classList.remove('show');

  const dialog = overlay.querySelector('.custom-dialog');
  if (dialog) dialog.classList.remove('show');

  const detach = () => {
    // Skip when the overlay has already been removed in the meantime.
    if (!overlay.parentNode) return;
    document.body.removeChild(overlay);
  };
  setTimeout(detach, 300);
}

View File

@ -0,0 +1,16 @@
/**
 * Show a transient notification.
 *
 * The element is appended to the body, gets its 'show' class after 10 ms,
 * loses it after 3 s and is removed from the document 300 ms after that.
 *
 * @param {string} message Text to display.
 * @param {string} [type] Variant suffix for the `notification-<type>` class.
 */
export function showNotification(message, type = 'success') {
  const toast = document.createElement('div');
  toast.className = `notification notification-${type}`;
  toast.textContent = message;
  document.body.appendChild(toast);

  const reveal = () => {
    toast.classList.add('show');
  };
  const detach = () => {
    if (toast.parentNode) document.body.removeChild(toast);
  };
  const dismiss = () => {
    toast.classList.remove('show');
    setTimeout(detach, 300);
  };

  setTimeout(reveal, 10);
  setTimeout(dismiss, 3000);
}

View File

@ -0,0 +1,15 @@
import { qsa } from '../utils/dom.js';
/**
 * Switch the visible tab.
 *
 * Hides every tab content, deactivates every tab button, shows the content
 * with id `tabName`, activates the clicked button and runs the matching
 * lifecycle hooks.
 *
 * @param {Event} evt Click event; `evt.currentTarget` is the tab button.
 * @param {string} tabName Id of the tab content to show.
 * @param {object} [hooks] Optional callbacks: `onLeaveProcesses`,
 *   `onEnterProcesses`, `onEnterLogfiles`.
 */
export function openTab(evt, tabName, hooks = {}) {
  qsa('.tab-content').forEach((panel) => {
    panel.classList.add('hidden');
  });
  qsa('.tab').forEach((button) => {
    button.classList.remove('active');
  });

  const panelToShow = document.getElementById(tabName);
  if (panelToShow) panelToShow.classList.remove('hidden');

  const clicked = evt && evt.currentTarget;
  if (clicked) clicked.classList.add('active');

  const onProcessesTab = tabName === 'processes';
  if (!onProcessesTab && hooks.onLeaveProcesses) hooks.onLeaveProcesses();
  if (onProcessesTab && hooks.onEnterProcesses) hooks.onEnterProcesses();
  if (tabName === 'logfiles' && hooks.onEnterLogfiles) hooks.onEnterLogfiles();
}

View File

@ -0,0 +1,34 @@
/**
 * Shorthand for `scope.querySelector(selector)`.
 *
 * @param {string} selector CSS selector.
 * @param {ParentNode} [scope] Root to search in; defaults to `document`.
 * @returns {Element|null} Whatever `querySelector` returns.
 */
export function qs(selector, scope = document) {
  const firstMatch = scope.querySelector(selector);
  return firstMatch;
}
/**
 * Shorthand for `scope.querySelectorAll(selector)` returning a real array.
 *
 * @param {string} selector CSS selector.
 * @param {ParentNode} [scope] Root to search in; defaults to `document`.
 * @returns {Array<Element>} The matches copied into an array.
 */
export function qsa(selector, scope = document) {
  const matches = scope.querySelectorAll(selector);
  // Copy the array-like result so array methods are available.
  return Array.prototype.slice.call(matches);
}
/**
 * Create an element and apply a set of attributes.
 *
 * The key `class` sets `className`, the key `text` sets `textContent`, and
 * every other key is applied with `setAttribute`.
 *
 * @param {string} tag Tag name of the element to create.
 * @param {object} [attrs] Attribute name to value.
 * @returns {HTMLElement} The new element.
 */
export function el(tag, attrs = {}) {
  const node = document.createElement(tag);
  Object.entries(attrs).forEach(([key, value]) => {
    switch (key) {
      case 'class':
        node.className = value;
        break;
      case 'text':
        node.textContent = value;
        break;
      default:
        node.setAttribute(key, value);
    }
  });
  return node;
}
/**
 * Add or remove the `hidden` class on a node; a falsy node is ignored.
 *
 * @param {Element|null} node Target element.
 * @param {boolean} hidden Truthy to add the class, falsy to remove it.
 */
export function setHidden(node, hidden) {
  if (!node) return;
  const method = hidden ? 'add' : 'remove';
  node.classList[method]('hidden');
}
/**
 * Set the text content of a node; a falsy node is ignored.
 *
 * @param {Node|null} node Target node.
 * @param {string} text New text content.
 */
export function setText(node, text) {
  if (node) {
    node.textContent = text;
  }
}
/**
 * Remove all child nodes of a node; a falsy node is ignored.
 *
 * @param {Node|null} node Node to empty.
 */
export function clearChildren(node) {
  if (!node) return;
  for (let child = node.firstChild; child; child = node.firstChild) {
    node.removeChild(child);
  }
}

27
trainer_web/stop_web_ui.sh Executable file
View File

@ -0,0 +1,27 @@
#!/bin/bash
# Stop the trainer Web UI service whose PID is recorded in train_web_ui.pid
# (located next to this script).
#
# The process is first asked to terminate with the default kill signal; if it
# is still alive after 2 seconds it is killed with signal 9. The PID file is
# removed once the service has been stopped, or when it turns out to be stale.

SCRIPT_DIR=$(dirname "$(readlink -f "$0")")
# Abort if the script directory cannot be entered; otherwise the PID file
# lookup below would silently run in the wrong directory.
cd "$SCRIPT_DIR" || exit 1

PID_FILE="train_web_ui.pid"

if [ -f "$PID_FILE" ]; then
    pid=$(cat "$PID_FILE")
    if ps -p "$pid" > /dev/null 2>&1; then
        echo "正在停止 Web UI 服务 (PID: $pid)"
        kill "$pid"
        sleep 2
        # Check whether the process actually went away.
        if ps -p "$pid" > /dev/null 2>&1; then
            echo "强制停止服务..."
            kill -9 "$pid"
        fi
        echo "正在保存进程信息..."
        echo "已保存到 'trainer_web/training_processes.json'"
        sleep 1
        # Drop the PID file so a later run does not see a stale PID;
        # -f keeps this quiet if the file is already gone.
        rm -f "$PID_FILE"
        echo "服务已停止"
    else
        echo "服务未运行但存在PID文件已删除"
        rm "$PID_FILE"
    fi
else
    echo "服务未运行未找到PID文件"
fi

View File

@ -0,0 +1,357 @@
<!DOCTYPE html>
<html lang="zh-CN">
<head>
<!-- Page metadata, the local stylesheet, and the Inter web font from Google Fonts. -->
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>MiniMind Training Lab</title>
<link rel="stylesheet" href="/static/css/style.css">
<!-- Preconnect hints for the two font hosts used by the stylesheet link below. -->
<link rel="preconnect" href="https://fonts.googleapis.com">
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
<link href="https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&display=swap" rel="stylesheet">
</head>
<body>
<!-- Page banner: logo image and title. -->
<div class="header">
<img src="/static/images/logo2.png" alt="MiniMind Logo" class="logo">
<h1>MiniMind Training Lab</h1>
</div>
<!-- Tab bar. Each button passes openTab the id of the panel it controls;
the "train" tab starts out active.
NOTE(review): inline handlers need openTab to be reachable as a global, while
the page script is loaded as a module. Presumably it assigns openTab onto
window; confirm in app.js. -->
<div class="tabs">
<button class="tab active" onclick="openTab(event, 'train')">开始训练</button>
<button class="tab" onclick="openTab(event, 'processes')">训练进程</button>
<button class="tab" onclick="openTab(event, 'logfiles')">日志文件</button>
</div>
<div id="train" class="tab-content">
<div class="form-container">
<h2 class="section-title">选择训练类型并配置参数</h2>
<form id="train-form" method="post" action="/train">
<!-- Basic training parameters: training type, epochs, batch size, learning
rate, log interval and dataset path. All fields in this card are required. -->
<div class="parameter-card">
<h3 class="card-title">基础训练参数</h3>
<div class="parameter-content">
<div class="form-group">
<label for="train_type">训练类型:</label>
<!-- The option values (pretrain, sft, lora, dpo, ppo, grpo, spo) are what
gets submitted as train_type. -->
<select id="train_type" name="train_type" required>
<option value="pretrain">🔤 Pretrain</option>
<option value="sft">🎯 SFT - Full</option>
<option value="lora">⚡ SFT - Lora</option>
<option value="dpo">🧠 RL - DPO</option>
<option value="ppo">🚀 RL - PPO</option>
<option value="grpo">💡 RL - GRPO</option>
<option value="spo">🔍 RL - SPO</option>
</select>
</div>
<div class="form-group">
<label for="epochs">训练轮数:</label>
<input type="number" id="epochs" name="epochs" min="1" value="10" required>
</div>
<div class="form-group">
<label for="batch_size">Batch Size</label>
<input type="number" id="batch_size" name="batch_size" min="1" value="32" required>
</div>
<div class="form-group">
<label for="learning_rate">学习率:</label>
<!-- Free-text field (not type=number); the default is in scientific notation. -->
<input type="text" id="learning_rate" name="learning_rate" value="5e-4" required>
</div>
<div class="form-group">
<label for="log_interval">日志打印间隔:</label>
<input type="number" id="log_interval" name="log_interval" min="1" value="100" required>
</div>
<div class="form-group">
<label for="data_path">数据路径:</label>
<div class="input-with-picker">
<input type="text" id="data_path" name="data_path" value="./dataset" required>
<!-- Folder picker; selectFolder receives the id of the input to fill.
selectFolder is not defined in this template, so presumably the page script
provides it; confirm. -->
<button type="button" class="btn-picker" onclick="selectFolder('data_path')" title="选择文件夹">
📁
</button>
</div>
</div>
</div>
</div>
<!-- Reinforcement-learning parameters (DPO). Rendered hidden via inline
style; presumably revealed by client-side script when the DPO training type
is selected (confirm in app.js). -->
<div class="parameter-card dpo" style="display: none;">
<h3 class="card-title">强化学习参数 (DPO)</h3>
<div class="parameter-content">
<div class="form-group">
<label for="beta">DPO Beta 参数:</label>
<!-- Optional; the placeholder only shows a suggested value and is not submitted. -->
<input type="text" id="beta" name="beta" placeholder="0.1">
</div>
</div>
</div>
<!-- PPO reinforcement-learning parameters. Rendered hidden via inline style;
     presumably revealed by client-side script when the PPO training type is
     selected (confirm in app.js). All fields are optional; placeholders only
     show suggested values and are not submitted. -->
<div class="parameter-card ppo" style="display: none;">
  <h3 class="card-title">强化学习参数 (PPO)</h3>
  <div class="parameter-content">
    <div class="form-group">
      <label for="clip_epsilon">PPO剪切系数</label>
      <input type="text" id="clip_epsilon" name="clip_epsilon" placeholder="0.2">
    </div>
    <div class="form-group">
      <label for="vf_coef">价值函数系数:</label>
      <input type="text" id="vf_coef" name="vf_coef" placeholder="0.1">
    </div>
    <div class="form-group">
      <label for="kl_coef">KL散度惩罚系数</label>
      <input type="text" id="kl_coef" name="kl_coef" placeholder="0.01">
    </div>
    <div class="form-group">
      <label for="reasoning">是否使用Reasoning模式</label>
      <!-- Options carry visible labels (same wording as the use_moe select)
           so the dropdown does not show blank choices. -->
      <select id="reasoning" name="reasoning">
        <option value="0">❌ 否</option>
        <option value="1">✅ 是</option>
      </select>
    </div>
    <div class="form-group">
      <label for="update_old_actor_freq">更新旧Actor频率</label>
      <input type="number" id="update_old_actor_freq" name="update_old_actor_freq" placeholder="10" min="1">
    </div>
    <div class="form-group">
      <label for="reward_model_path">奖励模型路径:</label>
      <div class="input-with-picker">
        <input type="text" id="reward_model_path" name="reward_model_path" placeholder="path/to/reward/model">
        <!-- Folder picker; selectFolder receives the id of the input to fill. -->
        <button type="button" class="btn-picker" onclick="selectFolder('reward_model_path')" title="选择文件夹">
          📁
        </button>
      </div>
    </div>
  </div>
</div>
<!-- GRPO reinforcement-learning parameters. Rendered hidden via inline style;
     presumably revealed by client-side script when the GRPO training type is
     selected (confirm in app.js).
     NOTE(review): the names "beta", "reasoning" and "reward_model_path" are
     also used by inputs in the other RL cards of this form. Presumably the
     page script submits only the active card's values; confirm, since a
     plain form post would send every duplicate. -->
<div class="parameter-card grpo" style="display: none;">
  <h3 class="card-title">强化学习参数 (GRPO)</h3>
  <div class="parameter-content">
    <div class="form-group">
      <label for="beta_grpo">GRPO KL惩罚系数</label>
      <input type="text" id="beta_grpo" name="beta" placeholder="0.02">
    </div>
    <div class="form-group">
      <label for="num_generations">每个prompt生成样本数</label>
      <input type="number" id="num_generations" name="num_generations" placeholder="8" min="1">
    </div>
    <div class="form-group">
      <label for="reasoning_grpo">是否使用Reasoning模式</label>
      <!-- Options carry visible labels (same wording as the use_moe select)
           so the dropdown does not show blank choices. -->
      <select id="reasoning_grpo" name="reasoning">
        <option value="0">❌ 否</option>
        <option value="1">✅ 是</option>
      </select>
    </div>
    <div class="form-group">
      <label for="reward_model_path_grpo">奖励模型路径:</label>
      <div class="input-with-picker">
        <input type="text" id="reward_model_path_grpo" name="reward_model_path" placeholder="../../internlm2-1_8b-reward">
        <!-- Folder picker; selectFolder receives the id of the input to fill. -->
        <button type="button" class="btn-picker" onclick="selectFolder('reward_model_path_grpo')" title="选择文件夹">
          📁
        </button>
      </div>
    </div>
  </div>
</div>
<!-- SPO reinforcement-learning parameters. Rendered hidden via inline style;
     presumably revealed by client-side script when the SPO training type is
     selected (confirm in app.js).
     NOTE(review): the names "beta", "reasoning" and "reward_model_path" are
     also used by inputs in the other RL cards of this form. Presumably the
     page script submits only the active card's values; confirm. -->
<div class="parameter-card spo" style="display: none;">
  <h3 class="card-title">强化学习参数 (SPO)</h3>
  <div class="parameter-content">
    <div class="form-group">
      <label for="beta_spo">SPO KL惩罚系数</label>
      <input type="text" id="beta_spo" name="beta" placeholder="0.02">
    </div>
    <div class="form-group">
      <label for="reasoning_spo">是否使用Reasoning模式</label>
      <!-- Options carry visible labels (same wording as the use_moe select)
           so the dropdown does not show blank choices. -->
      <select id="reasoning_spo" name="reasoning">
        <option value="0">❌ 否</option>
        <option value="1">✅ 是</option>
      </select>
    </div>
    <div class="form-group">
      <label for="reward_model_path_spo">奖励模型路径:</label>
      <div class="input-with-picker">
        <input type="text" id="reward_model_path_spo" name="reward_model_path" placeholder="../../internlm2-1_8b-reward">
        <!-- Folder picker; selectFolder receives the id of the input to fill. -->
        <button type="button" class="btn-picker" onclick="selectFolder('reward_model_path_spo')" title="选择文件夹">
          📁
        </button>
      </div>
    </div>
  </div>
</div>
<!-- Model structure parameters: hidden size, layer count, maximum sequence
length and the MoE switch. The three numeric fields are required. -->
<div class="parameter-card">
<h3 class="card-title">模型结构参数</h3>
<div class="parameter-content">
<div class="form-group">
<label for="hidden_size">隐藏层维度:</label>
<input type="number" id="hidden_size" name="hidden_size" min="128" value="512" required>
</div>
<div class="form-group">
<label for="num_hidden_layers">隐藏层数量:</label>
<input type="number" id="num_hidden_layers" name="num_hidden_layers" min="1" value="8" required>
</div>
<div class="form-group">
<label for="max_seq_len">最大序列长度:</label>
<input type="number" id="max_seq_len" name="max_seq_len" min="64" value="512" required>
</div>
<div class="form-group">
<label for="use_moe">是否使用MoE架构</label>
<!-- Submitted as "0" (no, the first and therefore default option) or "1" (yes). -->
<select id="use_moe" name="use_moe">
<option value="0">❌ 否</option>
<option value="1">✅ 是</option>
</select>
</div>
</div>
</div>
<!-- Model saving and resuming: checkpoint directory, save interval, weight
     names and the auto-resume checkbox.
     The pretrain-sft, lora and from-weight classes mark rows that presumably
     are shown or hidden per training type by client-side script (confirm in
     app.js). -->
<div class="parameter-card">
  <h3 class="card-title">模型保存与恢复</h3>
  <div class="parameter-content">
    <div class="form-group">
      <label for="save_dir">模型保存目录:</label>
      <div class="input-with-picker">
        <input type="text" id="save_dir" name="save_dir" value="./checkpoints" required>
        <!-- Folder picker; selectFolder receives the id of the input to fill. -->
        <button type="button" class="btn-picker" onclick="selectFolder('save_dir')" title="选择文件夹">
          📁
        </button>
      </div>
    </div>
    <div class="form-group">
      <label for="save_interval">模型保存间隔:</label>
      <input type="number" id="save_interval" name="save_interval" min="1" value="1000" required>
    </div>
    <div class="form-group pretrain-sft">
      <label for="save_weight">保存权重前缀名:</label>
      <input type="text" id="save_weight" name="save_weight" value="model">
    </div>
    <div class="form-group lora">
      <label for="lora_name">LoRA权重名称</label>
      <input type="text" id="lora_name" name="lora_name" value="lora_adapter">
    </div>
    <div class="form-group from-weight">
      <label for="from_weight">基于哪个权重训练:</label>
      <input type="text" id="from_weight" name="from_weight" value="pretrained_model">
    </div>
    <div class="form-group">
      <div class="checkbox-group">
        <!-- Submitted as from_resume=1 only when checked. -->
        <input type="checkbox" id="from_resume" name="from_resume" value="1">
        <!-- The ampersand is escaped so the label is unambiguous HTML text. -->
        <label for="from_resume">是否自动检测&amp;续训</label>
      </div>
    </div>
  </div>
</div>
<!-- Other settings: training mode, GPU selection and training monitor.
     The single-GPU and multi-GPU rows are presumably toggled by client-side
     script based on the chosen training mode (confirm in app.js). -->
<div class="parameter-card">
  <h3 class="card-title">其他设置</h3>
  <div class="parameter-content">
    <div class="form-group">
      <label for="training_mode">训练方式:</label>
      <select id="training_mode" name="training_mode" required>
        <option value="single_gpu">🎮 单卡训练</option>
        <option value="multi_gpu">🚀 多卡训练</option>
        <option value="cpu">💻 CPU训练</option>
      </select>
    </div>
    <!-- Bounds are clamped so they stay valid when the server reports zero
         GPUs. Without the clamp the device input renders max=-1 against
         min=0, and gpu_num renders max=0 and value=0 against min=1, which
         leaves these required inputs permanently invalid. -->
    <div class="form-group" id="single-gpu-selection">
      <label for="device">GPU序号</label>
      <input type="number" id="device" name="device" min="0" max="{{ [gpu_count|default(1) - 1, 0]|max }}" value="0" required>
    </div>
    <div class="form-group" id="multi-gpu-selection" style="display: none;">
      <label for="gpu_num">多卡并行数:</label>
      <div class="input-with-picker">
        <input type="number" id="gpu_num" name="gpu_num" min="1" max="{{ [gpu_count|default(1), 1]|max }}" value="{{ [gpu_count|default(1), 1]|max }}" required>
        <!-- The hint still shows the real detected count, including zero. -->
        <span class="hint-text">(可用GPU数量: {{ gpu_count|default(0) }})</span>
      </div>
    </div>
    <div class="form-group">
      <label for="train_monitor">训练监控:</label>
      <select id="train_monitor" name="train_monitor">
        <option value="none">❌ 无监控</option>
        <option value="wandb">📊 使用WandB/SwanLab</option>
      </select>
    </div>
  </div>
</div>
<script>
// Publish the server-rendered GPU info as globals for client-side code.
// The template falls back to false / 0 when the variables are undefined.
window.hasGpu = {{ has_gpu|default(false)|tojson|safe }};
window.gpuCount = {{ gpu_count|default(0)|tojson|safe }};
</script>
<!-- Submit button for the training form. -->
<div class="submit-container">
<button type="submit" class="btn-primary">
<span class="btn-icon">🚀</span>
开始训练
</button>
</div>
</form>
</div>
</div>
<!-- "Processes" tab panel; starts hidden. The refresh button calls
refreshProcesses, which is not defined in this template. -->
<div id="processes" class="tab-content hidden">
<div class="section-header">
<h2 class="section-title">训练进程列表</h2>
<div class="section-actions">
<button class="btn-refresh" onclick="refreshProcesses()">
<span class="btn-icon">🔄</span>
刷新
</button>
</div>
</div>
<div id="process-list">
<!-- The process list is loaded dynamically by JavaScript -->
</div>
</div>
<!-- "Log files" tab panel; starts hidden. The refresh button calls
refreshLogs, which is not defined in this template. -->
<div id="logfiles" class="tab-content hidden">
<div class="section-header">
<h2 class="section-title">日志文件列表</h2>
<div class="section-actions">
<button class="btn-refresh" onclick="refreshLogs()">
<span class="btn-icon">🔄</span>
刷新
</button>
</div>
</div>
<div id="logfiles-list">
<!-- The log file list is loaded dynamically by JavaScript -->
</div>
</div>
<!-- File browser modal; starts hidden. Its buttons call closeFileBrowser,
selectCurrentDirectory, navigateToParent and confirmFileSelection, none of
which are defined in this template. -->
<div id="file-browser-modal" class="modal hidden">
<div class="modal-content">
<div class="modal-header">
<h3 id="modal-title">选择文件或文件夹</h3>
<button class="modal-close" onclick="closeFileBrowser()">&times;</button>
</div>
<div class="modal-body">
<div class="file-browser-nav">
<!-- Shows the directory being browsed; initial text is "./". -->
<div class="current-path" id="current-path">./</div>
<div class="nav-buttons">
<button class="btn-navigate" onclick="selectCurrentDirectory()" title="选择当前目录">📍</button>
<button class="btn-navigate" onclick="navigateToParent()" title="上级目录">⬆️</button>
</div>
</div>
<div class="quick-paths" id="quick-paths">
<!-- Quick-access paths are rendered here -->
</div>
<div class="file-browser-help">
💡 点击文件夹进入目录,点击文件选择文件,使用📍选择当前目录
</div>
<div class="file-list" id="file-list">
<!-- The file list is rendered here -->
</div>
</div>
<div class="modal-footer">
<!-- Read-only display of the current selection. -->
<input type="text" id="selected-path" placeholder="选择的文件或文件夹路径" readonly>
<button class="btn-primary" onclick="confirmFileSelection()">确认选择</button>
<button class="btn-secondary" onclick="closeFileBrowser()">取消</button>
</div>
</div>
</div>
<!-- Page script, loaded as an ES module. -->
<script type="module" src="/static/js/app.js"></script>
</body>
</html>

1091
trainer_web/train_web_ui.py Normal file

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1 @@
{}