mirror of
https://github.com/jingyaogong/minimind.git
synced 2026-05-01 11:48:14 +08:00
support Swanlab check
This commit is contained in:
parent
04477b72f2
commit
3a03cedd00
@ -33,8 +33,18 @@ nohup python -u train_web_ui.py > "$LOG_FILE" 2>&1 &
|
||||
# 保存PID
|
||||
echo $! > "train_web_ui.pid"
|
||||
|
||||
sleep 2
|
||||
# 等待服务启动并获取实际端口号
|
||||
sleep 3
|
||||
|
||||
# 从日志文件中提取实际使用的端口号
|
||||
# 查找包含"启动Flask服务器在 http://0.0.0.0:"的行并提取端口号
|
||||
PORT=$(grep -oP '启动Flask服务器在 http://0.0.0.0:\K[0-9]+' "$LOG_FILE" || echo "5000")
|
||||
|
||||
# 如果没有找到端口号,尝试查找"Running on http://0.0.0.0:"格式的日志
|
||||
if [ "$PORT" = "5000" ]; then
|
||||
PORT=$(grep -oP 'Running on http://0.0.0.0:\K[0-9]+' "$LOG_FILE" || echo "5000")
|
||||
fi
|
||||
|
||||
echo "服务已启动! PID: $(cat "train_web_ui.pid")"
|
||||
echo "访问地址: http://localhost:5000"
|
||||
echo "停止命令: kill $(cat "train_web_ui.pid") or ./trainer_web/stop_web_ui.sh"
|
||||
echo "访问地址: http://localhost:$PORT"
|
||||
echo "停止命令: kill $(cat "train_web_ui.pid") or bash trainer_web/stop_web_ui.sh"
|
||||
@ -341,6 +341,20 @@ button:active {
|
||||
margin-right: 10px;
|
||||
border-radius: 6px;
|
||||
}
|
||||
|
||||
.btn-swanlab {
|
||||
background: linear-gradient(135deg, #007bff 0%, #00bfff 100%);
|
||||
padding: 8px 15px;
|
||||
font-size: 14px;
|
||||
margin-right: 10px;
|
||||
border-radius: 6px;
|
||||
color: white;
|
||||
}
|
||||
|
||||
.btn-swanlab:hover {
|
||||
transform: translateY(-1px);
|
||||
box-shadow: 0 4px 10px rgba(0, 123, 255, 0.3);
|
||||
}
|
||||
.btn-delete {
|
||||
background: linear-gradient(135deg, #f44336 0%, #d32f2f 100%);
|
||||
padding: 8px 15px;
|
||||
|
||||
@ -226,10 +226,92 @@ function checkProcessStatusChanges() {
|
||||
});
|
||||
}
|
||||
|
||||
/**
 * Open the SwanLab page for a training process, looking the URL up first if needed.
 *
 * Resolution order:
 *   1. If the process has no list entry in the DOM, or its `trainMonitor`
 *      data attribute is 'none', show an info notification and stop.
 *   2. If the list entry already carries a non-blank `swanlabUrl` data
 *      attribute, open it directly.
 *   3. Otherwise fetch `/processes`, find the matching process, cache its
 *      `swanlab_url` on the list entry and open it.
 *
 * @param {string|number} processId - Id of the training process. Call sites
 *     pass it both as a quoted string (inline onclick) and as the raw
 *     `process.id` value, so ids are compared as strings.
 * @returns {void}
 */
function checkAndOpenSwanlab(processId) {
    const processItem = document.querySelector(`[data-process-id="${processId}"]`);

    // A process without a DOM entry is treated the same as monitoring disabled.
    if (!processItem || processItem.dataset.trainMonitor === 'none') {
        showNotification('此训练未启用监控功能', 'info');
        return;
    }

    // Fast path: the URL was already stored on the list entry.
    const cachedUrl = processItem.dataset.swanlabUrl;
    if (cachedUrl && cachedUrl.trim() !== '') {
        openSwanlab(processId, cachedUrl);
        return;
    }

    // Slow path: ask the backend for the current process list.
    const wantedId = String(processId);
    fetch('/processes')
        .then((response) => {
            // Surface HTTP failures explicitly instead of as a JSON parse error.
            if (!response.ok) {
                throw new Error(`HTTP ${response.status}`);
            }
            return response.json();
        })
        .then((processes) => {
            // Compare as strings so a numeric id from JSON still matches a
            // string id coming from an inline onclick attribute.
            const match = processes.find((candidate) => String(candidate.id) === wantedId);
            if (!match || !match.swanlab_url) {
                showNotification('SwanLab链接尚未生成,请稍后再试', 'info');
                return;
            }

            // Cache the URL so the next click takes the fast path.
            processItem.dataset.swanlabUrl = match.swanlab_url;
            openSwanlab(processId, match.swanlab_url);
        })
        .catch((error) => {
            console.error('获取进程信息失败:', error);
            showNotification('获取SwanLab链接失败,请稍后再试', 'error');
        });
}
|
||||
|
||||
/**
 * Open a SwanLab URL in a new browser tab and report the outcome to the user.
 *
 * Shows an info notification and does nothing else when the URL is missing,
 * blank, not a string, or rejected by `isValidUrl`. Shows an error
 * notification when `window.open` returns a falsy value (popup blocked).
 *
 * @param {string|number} processId - Id of the owning training process. Not
 *     used here; kept so existing call sites keep working.
 * @param {string} url - Address of the SwanLab page to open.
 * @returns {void}
 */
function openSwanlab(processId, url) {
    const isUsable = typeof url === 'string' && url.trim() !== '' && isValidUrl(url);
    if (!isUsable) {
        showNotification('SwanLab链接无效或尚未生成', 'info');
        return;
    }

    const newWindow = window.open(url, '_blank');
    if (!newWindow) {
        // A falsy handle means the browser refused to open the window.
        showNotification('无法打开新窗口,请检查浏览器设置', 'error');
        return;
    }

    // The URL originates from training-process output, so do not hand the
    // opened page a reference back to this window. Passing the "noopener"
    // feature instead would make window.open return null and break the
    // popup-blocked check above.
    newWindow.opener = null;
    showNotification('正在打开SwanLab页面', 'info');
}
|
||||
|
||||
/**
 * Tell whether a value is an absolute http(s) URL that is safe to open.
 *
 * Only strings that the URL parser accepts and whose scheme is `http:` or
 * `https:` pass. Other schemes (`javascript:`, `data:`, `ftp:`, ...),
 * unparseable strings and non-string values are rejected.
 *
 * @param {string} url - Candidate URL.
 * @returns {boolean} True when `url` parses as an http or https URL.
 */
function isValidUrl(url) {
    if (typeof url !== 'string') {
        return false;
    }

    let parsed;
    try {
        parsed = new URL(url);
    } catch (error) {
        // Not parseable as an absolute URL, so it cannot be opened reliably.
        return false;
    }

    // The parser lower-cases the scheme, so this check is case-insensitive.
    return parsed.protocol === 'http:' || parsed.protocol === 'https:';
}
|
||||
|
||||
// 更新单个进程项
|
||||
function updateProcessItem(processItem, process) {
|
||||
// 更新数据属性
|
||||
processItem.dataset.processStatus = process.status;
|
||||
processItem.dataset.trainMonitor = process.train_monitor || 'none';
|
||||
|
||||
// 更新SwanLab URL数据属性
|
||||
if (process.swanlab_url) {
|
||||
processItem.dataset.swanlabUrl = process.swanlab_url;
|
||||
}
|
||||
|
||||
// 更新状态类和文本
|
||||
const statusElement = processItem.querySelector('.process-status');
|
||||
@ -253,23 +335,56 @@ function updateProcessItem(processItem, process) {
|
||||
statusElement.textContent = process.status;
|
||||
}
|
||||
|
||||
// 更新SwanLab按钮
|
||||
const existingSwanlabButton = processItem.querySelector('.btn-swanlab');
|
||||
const buttonContainer = processItem.querySelector('div:nth-child(2)'); // 按钮容器是第二个div
|
||||
|
||||
// 只有当train_monitor不是'none'时才显示SwanLab按钮
|
||||
const shouldShowSwanlab = process.train_monitor !== 'none';
|
||||
|
||||
// 如果应该显示按钮但不存在,则创建并添加
|
||||
if (shouldShowSwanlab && !existingSwanlabButton && buttonContainer) {
|
||||
const swanlabButton = document.createElement('button');
|
||||
swanlabButton.className = 'btn-swanlab';
|
||||
swanlabButton.textContent = 'SwanLab';
|
||||
swanlabButton.onclick = function() {
|
||||
checkAndOpenSwanlab(process.id);
|
||||
};
|
||||
|
||||
// 插入到停止按钮之前
|
||||
const stopButton = buttonContainer.querySelector('.btn-stop');
|
||||
if (stopButton) {
|
||||
buttonContainer.insertBefore(swanlabButton, stopButton);
|
||||
} else {
|
||||
// 如果没有停止按钮,插入到刷新按钮之后
|
||||
const refreshButton = buttonContainer.querySelector('.btn-logs:nth-child(2)');
|
||||
if (refreshButton) {
|
||||
buttonContainer.insertBefore(swanlabButton, refreshButton.nextSibling);
|
||||
}
|
||||
}
|
||||
} else if (!shouldShowSwanlab && existingSwanlabButton) {
|
||||
// 如果不应该显示按钮但存在,则移除
|
||||
existingSwanlabButton.remove();
|
||||
} else if (existingSwanlabButton) {
|
||||
// 更新现有按钮的点击事件
|
||||
existingSwanlabButton.onclick = function() {
|
||||
checkAndOpenSwanlab(process.id);
|
||||
};
|
||||
}
|
||||
|
||||
// 更新停止按钮
|
||||
const stopButton = processItem.querySelector('.btn-stop');
|
||||
if (stopButton) {
|
||||
if (!process.running) {
|
||||
stopButton.remove();
|
||||
}
|
||||
} else if (process.running) {
|
||||
// 如果按钮不存在但进程仍在运行,添加停止按钮
|
||||
const buttonContainer = processItem.querySelector('div:last-child');
|
||||
if (buttonContainer) {
|
||||
} else if (process.running && buttonContainer) {
|
||||
const newStopButton = document.createElement('button');
|
||||
newStopButton.className = 'btn-stop';
|
||||
newStopButton.onclick = () => stopProcess(process.id);
|
||||
newStopButton.textContent = '停止训练';
|
||||
buttonContainer.appendChild(newStopButton);
|
||||
}
|
||||
}
|
||||
|
||||
// 处理删除按钮
|
||||
const deleteButton = processItem.querySelector('.btn-delete');
|
||||
@ -465,10 +580,18 @@ function addProcessItemToGroup(parentElement, process) {
|
||||
// 设置进程数据属性,用于后续检查状态
|
||||
processItem.dataset.processId = process.id;
|
||||
processItem.dataset.processStatus = process.status;
|
||||
processItem.dataset.trainMonitor = process.train_monitor || 'none';
|
||||
processItem.dataset.swanlabUrl = process.swanlab_url || '';
|
||||
|
||||
// 检查是否显示删除按钮(对于非运行中的进程)
|
||||
const showDeleteButton = !process.running;
|
||||
|
||||
// 只有当train_monitor不是'none'时才显示SwanLab按钮
|
||||
const showSwanlabButton = process.train_monitor !== 'none';
|
||||
const swanlabButton = showSwanlabButton ? `<button class="btn-swanlab" onclick="checkAndOpenSwanlab('${process.id}')">
|
||||
SwanLab
|
||||
</button>` : '';
|
||||
|
||||
processItem.innerHTML = `
|
||||
<div class="process-info">
|
||||
<div>
|
||||
@ -481,6 +604,7 @@ function addProcessItemToGroup(parentElement, process) {
|
||||
<div>
|
||||
<button class="btn-logs" onclick="showLogs('${process.id}')">查看日志</button>
|
||||
<button class="btn-logs" onclick="refreshLog('${process.id}')">刷新日志</button>
|
||||
${swanlabButton}
|
||||
${process.running ? `<button class="btn-stop" onclick="stopProcess('${process.id}')">停止训练</button>` : ''}
|
||||
${showDeleteButton ? `<button class="btn-delete" onclick="deleteProcess('${process.id}')">删除</button>` : ''}
|
||||
</div>
|
||||
|
||||
@ -175,7 +175,10 @@ def start_training_process(train_type, params):
|
||||
'log_file': log_file,
|
||||
'start_time': time.strftime('%Y-%m-%d %H:%M:%S'),
|
||||
'running': True,
|
||||
'error': False
|
||||
'error': False,
|
||||
'train_monitor': params.get('train_monitor', 'none'), # 保存训练监控设置
|
||||
'swanlab_url': None,
|
||||
'next_line_is_swanlab_url': False
|
||||
}
|
||||
|
||||
# 开始读取输出
|
||||
@ -186,6 +189,16 @@ def start_training_process(train_type, params):
|
||||
if output == '' and process.poll() is not None:
|
||||
break
|
||||
if output:
|
||||
# 检查是否是swanlab链接的行
|
||||
output_stripped = output.strip()
|
||||
if training_processes[process_id]['next_line_is_swanlab_url']:
|
||||
# 保存swanlab链接
|
||||
training_processes[process_id]['swanlab_url'] = output_stripped
|
||||
training_processes[process_id]['next_line_is_swanlab_url'] = False
|
||||
elif 'swanlab: 🚀 View run at' in output_stripped:
|
||||
# 标记下一行是swanlab链接
|
||||
training_processes[process_id]['next_line_is_swanlab_url'] = True
|
||||
|
||||
with open(log_file, 'a') as f:
|
||||
f.write(output)
|
||||
# 检查进程是否成功结束
|
||||
@ -240,7 +253,9 @@ def processes():
|
||||
'start_time': info['start_time'],
|
||||
'running': info['running'],
|
||||
'error': info['error'],
|
||||
'status': status
|
||||
'status': status,
|
||||
'train_monitor': info.get('train_monitor', 'none'), # 添加train_monitor字段
|
||||
'swanlab_url': info.get('swanlab_url') # 添加swanlab_url字段
|
||||
})
|
||||
return jsonify(result)
|
||||
|
||||
@ -265,12 +280,26 @@ def logs(process_id):
|
||||
return '日志文件不存在或已被删除'
|
||||
|
||||
try:
|
||||
# 使用高效的方法读取文件的最后200行
|
||||
# 这对于大文件特别有用,可以避免读取整个文件
|
||||
last_200_lines = []
|
||||
block_size = 8192 # 8KB blocks
|
||||
# 使用二进制模式读取,然后尝试解码以处理不同编码的日志文件
|
||||
def read_log_file_robust(file_path):
|
||||
# 尝试多种编码方式读取文件
|
||||
encodings = ['utf-8', 'latin-1', 'gbk', 'gb2312']
|
||||
for encoding in encodings:
|
||||
try:
|
||||
with open(file_path, 'r', encoding=encoding) as f:
|
||||
return f.read(), encoding
|
||||
except UnicodeDecodeError:
|
||||
continue
|
||||
# 如果所有编码都失败,使用二进制模式读取并替换不可解码的字符
|
||||
with open(file_path, 'rb') as f:
|
||||
content = f.read()
|
||||
return content.decode('utf-8', errors='replace'), 'binary_decoded'
|
||||
|
||||
with open(log_file, 'r', encoding='utf-8') as f:
|
||||
# 使用高效的方法读取文件的最后200行,确保以完整行为单位
|
||||
last_200_lines = []
|
||||
|
||||
# 先尝试使用二进制模式读取文件末尾的部分
|
||||
with open(log_file, 'rb') as f:
|
||||
# 尝试直接定位到文件末尾,然后向前读取
|
||||
f.seek(0, os.SEEK_END)
|
||||
file_size = f.tell()
|
||||
@ -278,7 +307,11 @@ def logs(process_id):
|
||||
# 计算需要读取的块数
|
||||
position = file_size
|
||||
blocks = []
|
||||
while position > 0:
|
||||
block_size = 8192 # 8KB blocks
|
||||
|
||||
# 确保我们有足够的数据来处理完整行
|
||||
found_complete_lines = False
|
||||
while position > 0 and not found_complete_lines:
|
||||
# 后退一个块的位置
|
||||
position -= block_size
|
||||
if position < 0:
|
||||
@ -291,19 +324,36 @@ def logs(process_id):
|
||||
block = f.read(block_size)
|
||||
blocks.append(block)
|
||||
|
||||
# 如果已经收集了足够的行,就停止
|
||||
combined_text = ''.join(reversed(blocks))
|
||||
lines = combined_text.splitlines(True)
|
||||
if len(lines) >= 200:
|
||||
# 获取最后200行
|
||||
last_200_lines = lines[-200:]
|
||||
break
|
||||
# 如果已经收集了足够的数据,尝试解码并检查行数
|
||||
combined_binary = b''.join(blocks)
|
||||
# 尝试解码,使用errors='replace'处理无法解码的字符
|
||||
try:
|
||||
combined_text = combined_binary.decode('utf-8', errors='replace')
|
||||
except:
|
||||
combined_text = combined_binary.decode('latin-1')
|
||||
|
||||
lines = combined_text.splitlines(True) # 使用True保留换行符
|
||||
|
||||
# 确保我们不返回不完整的第一行
|
||||
if len(lines) > 0:
|
||||
# 如果有足够的行,确保我们从一个完整行开始
|
||||
if len(lines) > 1:
|
||||
# 跳过可能不完整的第一行
|
||||
last_200_lines = lines[1:]
|
||||
else:
|
||||
last_200_lines = lines
|
||||
|
||||
# 如果我们有足够的行,停止读取
|
||||
if len(last_200_lines) >= 200:
|
||||
# 获取最后200行
|
||||
last_200_lines = last_200_lines[-200:]
|
||||
found_complete_lines = True
|
||||
|
||||
# 如果文件内容不足200行,或者上面的方法没有收集到足够的行
|
||||
if len(last_200_lines) < 200:
|
||||
# 重新读取整个文件(对于小文件)
|
||||
f.seek(0)
|
||||
all_lines = f.readlines()
|
||||
content, encoding = read_log_file_robust(log_file)
|
||||
all_lines = content.splitlines(True) # 使用True保留换行符
|
||||
last_200_lines = all_lines[-200:] if len(all_lines) > 200 else all_lines
|
||||
|
||||
return ''.join(last_200_lines)
|
||||
@ -351,9 +401,27 @@ def get_logfile_content(filename):
|
||||
log_file = os.path.join(log_dir, filename)
|
||||
|
||||
try:
|
||||
# 读取完整的日志文件内容
|
||||
with open(log_file, 'r', encoding='utf-8') as f:
|
||||
content = f.read()
|
||||
# 使用二进制模式读取文件,可以更可靠地保留原始换行符
|
||||
with open(log_file, 'rb') as f:
|
||||
content_bytes = f.read()
|
||||
|
||||
# 尝试多种编码方式解码,确保正确处理换行符
|
||||
encodings = ['utf-8', 'latin-1', 'gbk', 'gb2312']
|
||||
content = None
|
||||
|
||||
for encoding in encodings:
|
||||
try:
|
||||
# 解码文件内容,保留原始换行符
|
||||
content = content_bytes.decode(encoding)
|
||||
break
|
||||
except UnicodeDecodeError:
|
||||
continue
|
||||
|
||||
# 如果所有编码都失败,使用errors='replace'参数处理不可解码的字符
|
||||
if content is None:
|
||||
content = content_bytes.decode('utf-8', errors='replace')
|
||||
|
||||
# 确保返回的内容正确保留所有换行符
|
||||
return content
|
||||
except FileNotFoundError:
|
||||
return jsonify({'error': 'Log file not found'}), 404
|
||||
@ -470,7 +538,9 @@ def save_processes_info():
|
||||
'start_time': info['start_time'],
|
||||
'running': info['running'],
|
||||
'error': info.get('error', False),
|
||||
'manually_stopped': info.get('manually_stopped', False)
|
||||
'manually_stopped': info.get('manually_stopped', False),
|
||||
'train_monitor': info.get('train_monitor', 'none'), # 保存train_monitor
|
||||
'swanlab_url': info.get('swanlab_url') # 保存swanlab_url
|
||||
}
|
||||
|
||||
with open(PROCESSES_FILE, 'w', encoding='utf-8') as f:
|
||||
@ -488,6 +558,16 @@ def load_processes_info():
|
||||
|
||||
# 检查每个进程是否还在运行
|
||||
for pid, info in loaded_processes.items():
|
||||
# 确保所有需要的字段都存在
|
||||
if 'swanlab_url' not in info:
|
||||
info['swanlab_url'] = None
|
||||
if 'manually_stopped' not in info:
|
||||
info['manually_stopped'] = False
|
||||
if 'error' not in info:
|
||||
info['error'] = False
|
||||
if 'train_monitor' not in info:
|
||||
info['train_monitor'] = 'none'
|
||||
|
||||
if info['running']:
|
||||
try:
|
||||
# 检查进程是否还在运行
|
||||
@ -498,10 +578,16 @@ def load_processes_info():
|
||||
else:
|
||||
# 进程已停止
|
||||
info['running'] = False
|
||||
# 如果进程未被明确标记为完成或出错,则默认为手动停止
|
||||
if not info['error']:
|
||||
info['manually_stopped'] = True
|
||||
training_processes[pid] = info
|
||||
except (psutil.NoSuchProcess, psutil.AccessDenied):
|
||||
# 进程不存在或无权限访问
|
||||
info['running'] = False
|
||||
# 如果进程未被明确标记为完成或出错,则默认为手动停止
|
||||
if not info['error']:
|
||||
info['manually_stopped'] = True
|
||||
training_processes[pid] = info
|
||||
else:
|
||||
# 进程已停止,直接恢复
|
||||
|
||||
Loading…
Reference in New Issue
Block a user