support Swanlab check

This commit is contained in:
yuyu5333 2025-11-11 13:26:34 +00:00
parent 04477b72f2
commit 3a03cedd00
4 changed files with 263 additions and 29 deletions

View File

@ -33,8 +33,18 @@ nohup python -u train_web_ui.py > "$LOG_FILE" 2>&1 &
# 保存PID
echo $! > "train_web_ui.pid"
sleep 2
# Wait for the service to start so its listening address is present in the log.
sleep 3
# Extract the port actually in use from the log file.
# The application's own startup banner is tried first. `tail -n 1` keeps only
# the most recent match so PORT never holds several lines when the banner is
# logged more than once, and `|| true` keeps a missing match from aborting the
# script if it runs under `set -e` / `pipefail`.
PORT=$(grep -oP '启动Flask服务器在 http://0\.0\.0\.0:\K[0-9]+' "$LOG_FILE" | tail -n 1 || true)
# If the banner was not found, fall back to the "Running on http://0.0.0.0:" log format.
if [ -z "$PORT" ]; then
    PORT=$(grep -oP 'Running on http://0\.0\.0\.0:\K[0-9]+' "$LOG_FILE" | tail -n 1 || true)
fi
# Neither pattern matched: use 5000, the same default the script reported before.
PORT=${PORT:-5000}
echo "服务已启动! PID: $(cat "train_web_ui.pid")"
echo "访问地址: http://localhost:5000"
echo "停止命令: kill $(cat "train_web_ui.pid") or ./trainer_web/stop_web_ui.sh"
echo "访问地址: http://localhost:$PORT"
echo "停止命令: kill $(cat "train_web_ui.pid") or bash trainer_web/stop_web_ui.sh"

View File

@ -341,6 +341,20 @@ button:active {
margin-right: 10px;
border-radius: 6px;
}
/* SwanLab button: white text on a blue diagonal gradient. */
.btn-swanlab {
    color: white;
    background: linear-gradient(135deg, #007bff 0%, #00bfff 100%);
    border-radius: 6px;
    font-size: 14px;
    padding: 8px 15px;
    margin-right: 10px;
}
/* Hover state: shift the button up by 1px and add a translucent blue shadow. */
.btn-swanlab:hover {
    box-shadow: 0 4px 10px rgba(0, 123, 255, 0.3);
    transform: translateY(-1px);
}
.btn-delete {
background: linear-gradient(135deg, #f44336 0%, #d32f2f 100%);
padding: 8px 15px;

View File

@ -226,10 +226,92 @@ function checkProcessStatusChanges() {
});
}
/**
 * Open the SwanLab page of a training process, fetching the link if needed.
 *
 * The monitor setting and any cached link are read from the process element's
 * `data-train-monitor` / `data-swanlab-url` attributes. When no link is cached,
 * the process list is requested from `/processes`; a link found there is
 * written back to the element and then opened.
 *
 * @param {string|number} processId - Value of the element's `data-process-id`.
 * @returns {void}
 */
function checkAndOpenSwanlab(processId) {
    const processItem = document.querySelector(`[data-process-id="${processId}"]`);
    const trainMonitor = processItem ? processItem.dataset.trainMonitor : 'none';

    // Monitoring disabled, or the process element is not on the page.
    if (trainMonitor === 'none') {
        showNotification('此训练未启用监控功能', 'info');
        return;
    }

    // Prefer the link already cached on the element.
    const cachedUrl = processItem ? processItem.dataset.swanlabUrl : '';
    if (cachedUrl && cachedUrl.trim() !== '') {
        openSwanlab(processId, cachedUrl);
        return;
    }

    // No usable link yet: ask the backend for the latest process information.
    fetch('/processes')
        .then((response) => {
            // A non-2xx reply carries no process list; treat it as a failure
            // instead of trying to interpret an error body.
            if (!response.ok) {
                throw new Error(`HTTP ${response.status}`);
            }
            return response.json();
        })
        .then((data) => {
            const processList = Array.isArray(data) ? data : [];
            // Inline onclick handlers pass the id as a string while the JSON
            // payload may carry another type, so compare as strings.
            const wantedId = String(processId);
            const process = processList.find((p) => p && String(p.id) === wantedId);

            if (!process || !process.swanlab_url) {
                // The backend has not captured a link for this process.
                showNotification('SwanLab链接尚未生成请稍后再试', 'info');
                return;
            }

            // Cache the link on the element so the next click skips the request.
            if (processItem) {
                processItem.dataset.swanlabUrl = process.swanlab_url;
            }
            openSwanlab(processId, process.swanlab_url);
        })
        .catch((error) => {
            console.error('获取进程信息失败:', error);
            showNotification('获取SwanLab链接失败请稍后再试', 'error');
        });
}
/**
 * Open a SwanLab link in a new window/tab and notify the user of the outcome.
 *
 * @param {string|number} processId - Process the link belongs to (not used by
 *     this function; kept so existing callers keep working).
 * @param {string} url - Link to open; validated with `isValidUrl` first.
 * @returns {void}
 */
function openSwanlab(processId, url) {
    // Reject missing, non-string, blank or malformed links.
    if (typeof url !== 'string' || url.trim() === '' || !isValidUrl(url)) {
        showNotification('SwanLab链接无效或尚未生成', 'info');
        return;
    }

    const newWindow = window.open(url, '_blank');

    // A null handle means the popup was blocked.
    if (!newWindow) {
        showNotification('无法打开新窗口,请检查浏览器设置', 'error');
        return;
    }

    // Detach the opener so the opened page cannot navigate this tab through
    // `window.opener`. Passing the `noopener` feature instead would make
    // `window.open` return null and defeat the popup-block check above.
    try {
        newWindow.opener = null;
    } catch (error) {
        // Best effort: the page is already opening, so only log the failure.
        console.warn('Could not detach window.opener:', error);
    }

    showNotification('正在打开SwanLab页面', 'info');
}
/**
 * Check whether a value is an absolute http(s) URL.
 *
 * Only `http:` and `https:` are accepted. Other schemes that the URL parser
 * understands (for example `javascript:`) are rejected because the result is
 * handed to `window.open`. Strings the parser rejects are never valid, even
 * if they merely start with "http://".
 *
 * @param {*} url - Candidate value.
 * @returns {boolean} True when `url` is a string that parses as an http(s) URL.
 */
function isValidUrl(url) {
    if (typeof url !== 'string') {
        return false;
    }
    try {
        // The parser normalises the scheme to lower case.
        const { protocol } = new URL(url);
        return protocol === 'http:' || protocol === 'https:';
    } catch (error) {
        // Not parseable as an absolute URL.
        return false;
    }
}
// 更新单个进程项
function updateProcessItem(processItem, process) {
// 更新数据属性
processItem.dataset.processStatus = process.status;
processItem.dataset.trainMonitor = process.train_monitor || 'none';
// 更新SwanLab URL数据属性
if (process.swanlab_url) {
processItem.dataset.swanlabUrl = process.swanlab_url;
}
// 更新状态类和文本
const statusElement = processItem.querySelector('.process-status');
@ -253,23 +335,56 @@ function updateProcessItem(processItem, process) {
statusElement.textContent = process.status;
}
// 更新SwanLab按钮
const existingSwanlabButton = processItem.querySelector('.btn-swanlab');
const buttonContainer = processItem.querySelector('div:nth-child(2)'); // 按钮容器是第二个div
// 只有当train_monitor不是'none'时才显示SwanLab按钮
const shouldShowSwanlab = process.train_monitor !== 'none';
// 如果应该显示按钮但不存在,则创建并添加
if (shouldShowSwanlab && !existingSwanlabButton && buttonContainer) {
const swanlabButton = document.createElement('button');
swanlabButton.className = 'btn-swanlab';
swanlabButton.textContent = 'SwanLab';
swanlabButton.onclick = function() {
checkAndOpenSwanlab(process.id);
};
// 插入到停止按钮之前
const stopButton = buttonContainer.querySelector('.btn-stop');
if (stopButton) {
buttonContainer.insertBefore(swanlabButton, stopButton);
} else {
// 如果没有停止按钮,插入到刷新按钮之后
const refreshButton = buttonContainer.querySelector('.btn-logs:nth-child(2)');
if (refreshButton) {
buttonContainer.insertBefore(swanlabButton, refreshButton.nextSibling);
}
}
} else if (!shouldShowSwanlab && existingSwanlabButton) {
// 如果不应该显示按钮但存在,则移除
existingSwanlabButton.remove();
} else if (existingSwanlabButton) {
// 更新现有按钮的点击事件
existingSwanlabButton.onclick = function() {
checkAndOpenSwanlab(process.id);
};
}
// 更新停止按钮
const stopButton = processItem.querySelector('.btn-stop');
if (stopButton) {
if (!process.running) {
stopButton.remove();
}
} else if (process.running) {
// 如果按钮不存在但进程仍在运行,添加停止按钮
const buttonContainer = processItem.querySelector('div:last-child');
if (buttonContainer) {
} else if (process.running && buttonContainer) {
const newStopButton = document.createElement('button');
newStopButton.className = 'btn-stop';
newStopButton.onclick = () => stopProcess(process.id);
newStopButton.textContent = '停止训练';
buttonContainer.appendChild(newStopButton);
}
}
// 处理删除按钮
const deleteButton = processItem.querySelector('.btn-delete');
@ -465,10 +580,18 @@ function addProcessItemToGroup(parentElement, process) {
// 设置进程数据属性,用于后续检查状态
processItem.dataset.processId = process.id;
processItem.dataset.processStatus = process.status;
processItem.dataset.trainMonitor = process.train_monitor || 'none';
processItem.dataset.swanlabUrl = process.swanlab_url || '';
// 检查是否显示删除按钮(对于非运行中的进程)
const showDeleteButton = !process.running;
// 只有当train_monitor不是'none'时才显示SwanLab按钮
const showSwanlabButton = process.train_monitor !== 'none';
const swanlabButton = showSwanlabButton ? `<button class="btn-swanlab" onclick="checkAndOpenSwanlab('${process.id}')">
SwanLab
</button>` : '';
processItem.innerHTML = `
<div class="process-info">
<div>
@ -481,6 +604,7 @@ function addProcessItemToGroup(parentElement, process) {
<div>
<button class="btn-logs" onclick="showLogs('${process.id}')">查看日志</button>
<button class="btn-logs" onclick="refreshLog('${process.id}')">刷新日志</button>
${swanlabButton}
${process.running ? `<button class="btn-stop" onclick="stopProcess('${process.id}')">停止训练</button>` : ''}
${showDeleteButton ? `<button class="btn-delete" onclick="deleteProcess('${process.id}')">删除</button>` : ''}
</div>

View File

@ -175,7 +175,10 @@ def start_training_process(train_type, params):
'log_file': log_file,
'start_time': time.strftime('%Y-%m-%d %H:%M:%S'),
'running': True,
'error': False
'error': False,
'train_monitor': params.get('train_monitor', 'none'), # 保存训练监控设置
'swanlab_url': None,
'next_line_is_swanlab_url': False
}
# 开始读取输出
@ -186,6 +189,16 @@ def start_training_process(train_type, params):
if output == '' and process.poll() is not None:
break
if output:
# 检查是否是swanlab链接的行
output_stripped = output.strip()
if training_processes[process_id]['next_line_is_swanlab_url']:
# 保存swanlab链接
training_processes[process_id]['swanlab_url'] = output_stripped
training_processes[process_id]['next_line_is_swanlab_url'] = False
elif 'swanlab: 🚀 View run at' in output_stripped:
# 标记下一行是swanlab链接
training_processes[process_id]['next_line_is_swanlab_url'] = True
with open(log_file, 'a') as f:
f.write(output)
# 检查进程是否成功结束
@ -240,7 +253,9 @@ def processes():
'start_time': info['start_time'],
'running': info['running'],
'error': info['error'],
'status': status
'status': status,
'train_monitor': info.get('train_monitor', 'none'), # 添加train_monitor字段
'swanlab_url': info.get('swanlab_url') # 添加swanlab_url字段
})
return jsonify(result)
@ -265,12 +280,26 @@ def logs(process_id):
return '日志文件不存在或已被删除'
try:
# 使用高效的方法读取文件的最后200行
# 这对于大文件特别有用,可以避免读取整个文件
last_200_lines = []
block_size = 8192 # 8KB blocks
# 使用二进制模式读取,然后尝试解码以处理不同编码的日志文件
def read_log_file_robust(file_path):
    """Read a whole log file as text, trying several encodings in turn.

    Args:
        file_path: Path of the log file to read.

    Returns:
        A ``(content, encoding)`` tuple: the decoded text and the name of the
        encoding that succeeded (``'binary_decoded'`` for the last-resort
        fallback).
    """
    # Strict codecs first. latin-1 maps every possible byte, so it must come
    # last: placed earlier it would always succeed and the GBK/GB2312 entries
    # would never be tried, turning Chinese logs into mojibake.
    encodings = ['utf-8', 'gbk', 'gb2312', 'latin-1']
    for encoding in encodings:
        try:
            with open(file_path, 'r', encoding=encoding) as f:
                return f.read(), encoding
        except UnicodeDecodeError:
            continue
    # Last resort, only reachable if the catch-all codec is removed from the
    # list above: decode as UTF-8 and substitute undecodable bytes.
    with open(file_path, 'rb') as f:
        content = f.read()
    return content.decode('utf-8', errors='replace'), 'binary_decoded'
with open(log_file, 'r', encoding='utf-8') as f:
# 使用高效的方法读取文件的最后200行确保以完整行为单位
last_200_lines = []
# 先尝试使用二进制模式读取文件末尾的部分
with open(log_file, 'rb') as f:
# 尝试直接定位到文件末尾,然后向前读取
f.seek(0, os.SEEK_END)
file_size = f.tell()
@ -278,7 +307,11 @@ def logs(process_id):
# 计算需要读取的块数
position = file_size
blocks = []
while position > 0:
block_size = 8192 # 8KB blocks
# 确保我们有足够的数据来处理完整行
found_complete_lines = False
while position > 0 and not found_complete_lines:
# 后退一个块的位置
position -= block_size
if position < 0:
@ -291,19 +324,36 @@ def logs(process_id):
block = f.read(block_size)
blocks.append(block)
# 如果已经收集了足够的行,就停止
combined_text = ''.join(reversed(blocks))
lines = combined_text.splitlines(True)
if len(lines) >= 200:
# 如果已经收集了足够的数据,尝试解码并检查行数
combined_binary = b''.join(blocks)
# 尝试解码使用errors='replace'处理无法解码的字符
try:
combined_text = combined_binary.decode('utf-8', errors='replace')
except:
combined_text = combined_binary.decode('latin-1')
lines = combined_text.splitlines(True) # 使用True保留换行符
# 确保我们不返回不完整的第一行
if len(lines) > 0:
# 如果有足够的行,确保我们从一个完整行开始
if len(lines) > 1:
# 跳过可能不完整的第一行
last_200_lines = lines[1:]
else:
last_200_lines = lines
# 如果我们有足够的行,停止读取
if len(last_200_lines) >= 200:
# 获取最后200行
last_200_lines = lines[-200:]
break
last_200_lines = last_200_lines[-200:]
found_complete_lines = True
# 如果文件内容不足200行或者上面的方法没有收集到足够的行
if len(last_200_lines) < 200:
# 重新读取整个文件(对于小文件)
f.seek(0)
all_lines = f.readlines()
content, encoding = read_log_file_robust(log_file)
all_lines = content.splitlines(True) # 使用True保留换行符
last_200_lines = all_lines[-200:] if len(all_lines) > 200 else all_lines
return ''.join(last_200_lines)
@ -351,9 +401,27 @@ def get_logfile_content(filename):
log_file = os.path.join(log_dir, filename)
try:
# 读取完整的日志文件内容
with open(log_file, 'r', encoding='utf-8') as f:
content = f.read()
# 使用二进制模式读取文件,可以更可靠地保留原始换行符
with open(log_file, 'rb') as f:
content_bytes = f.read()
# 尝试多种编码方式解码,确保正确处理换行符
encodings = ['utf-8', 'latin-1', 'gbk', 'gb2312']
content = None
for encoding in encodings:
try:
# 解码文件内容,保留原始换行符
content = content_bytes.decode(encoding)
break
except UnicodeDecodeError:
continue
# 如果所有编码都失败使用errors='replace'参数处理不可解码的字符
if content is None:
content = content_bytes.decode('utf-8', errors='replace')
# 确保返回的内容正确保留所有换行符
return content
except FileNotFoundError:
return jsonify({'error': 'Log file not found'}), 404
@ -470,7 +538,9 @@ def save_processes_info():
'start_time': info['start_time'],
'running': info['running'],
'error': info.get('error', False),
'manually_stopped': info.get('manually_stopped', False)
'manually_stopped': info.get('manually_stopped', False),
'train_monitor': info.get('train_monitor', 'none'), # 保存train_monitor
'swanlab_url': info.get('swanlab_url') # 保存swanlab_url
}
with open(PROCESSES_FILE, 'w', encoding='utf-8') as f:
@ -488,6 +558,16 @@ def load_processes_info():
# 检查每个进程是否还在运行
for pid, info in loaded_processes.items():
# 确保所有需要的字段都存在
if 'swanlab_url' not in info:
info['swanlab_url'] = None
if 'manually_stopped' not in info:
info['manually_stopped'] = False
if 'error' not in info:
info['error'] = False
if 'train_monitor' not in info:
info['train_monitor'] = 'none'
if info['running']:
try:
# 检查进程是否还在运行
@ -498,10 +578,16 @@ def load_processes_info():
else:
# 进程已停止
info['running'] = False
# 如果进程未被明确标记为完成或出错,则默认为手动停止
if not info['error']:
info['manually_stopped'] = True
training_processes[pid] = info
except (psutil.NoSuchProcess, psutil.AccessDenied):
# 进程不存在或无权限访问
info['running'] = False
# 如果进程未被明确标记为完成或出错,则默认为手动停止
if not info['error']:
info['manually_stopped'] = True
training_processes[pid] = info
else:
# 进程已停止,直接恢复