From 3a03cedd00821331c399ce026d454358a1144e37 Mon Sep 17 00:00:00 2001 From: yuyu5333 <1812107659@qq.com> Date: Tue, 11 Nov 2025 13:26:34 +0000 Subject: [PATCH] support Swanlab check --- trainer_web/start_web_ui.sh | 16 +++- trainer_web/static/css/style.css | 14 ++++ trainer_web/static/js/script.js | 134 +++++++++++++++++++++++++++++-- trainer_web/train_web_ui.py | 128 ++++++++++++++++++++++++----- 4 files changed, 263 insertions(+), 29 deletions(-) diff --git a/trainer_web/start_web_ui.sh b/trainer_web/start_web_ui.sh index 9279e4a..012f15c 100755 --- a/trainer_web/start_web_ui.sh +++ b/trainer_web/start_web_ui.sh @@ -33,8 +33,18 @@ nohup python -u train_web_ui.py > "$LOG_FILE" 2>&1 & # 保存PID echo $! > "train_web_ui.pid" -sleep 2 +# 等待服务启动并获取实际端口号 +sleep 3 + +# 从日志文件中提取实际使用的端口号 +# 查找包含"启动Flask服务器在 http://0.0.0.0:"的行并提取端口号 +PORT=$(grep -oP '启动Flask服务器在 http://0.0.0.0:\K[0-9]+' "$LOG_FILE" || echo "5000") + +# 如果没有找到端口号,尝试查找"Running on http://0.0.0.0:"格式的日志 +if [ "$PORT" = "5000" ]; then + PORT=$(grep -oP 'Running on http://0.0.0.0:\K[0-9]+' "$LOG_FILE" || echo "5000") +fi echo "服务已启动! PID: $(cat "train_web_ui.pid")" -echo "访问地址: http://localhost:5000" -echo "停止命令: kill $(cat "train_web_ui.pid") or ./trainer_web/stop_web_ui.sh" \ No newline at end of file +echo "访问地址: http://localhost:$PORT" +echo "停止命令: kill $(cat "train_web_ui.pid") or bash trainer_web/stop_web_ui.sh" \ No newline at end of file diff --git a/trainer_web/static/css/style.css b/trainer_web/static/css/style.css index 9563084..8f8b864 100644 --- a/trainer_web/static/css/style.css +++ b/trainer_web/static/css/style.css @@ -341,6 +341,20 @@ button:active { margin-right: 10px; border-radius: 6px; } + +.btn-swanlab { + background: linear-gradient(135deg, #007bff 0%, #00bfff 100%); + padding: 8px 15px; + font-size: 14px; + margin-right: 10px; + border-radius: 6px; + color: white; +} + +.btn-swanlab:hover { + transform: translateY(-1px); + box-shadow: 0 4px 10px rgba(0, 123, 255, 0.3); +} .btn-delete { background: linear-gradient(135deg, #f44336 0%, #d32f2f 100%); padding: 8px 15px; diff --git a/trainer_web/static/js/script.js b/trainer_web/static/js/script.js index 6cfcf54..f01966f 100644 --- a/trainer_web/static/js/script.js +++ b/trainer_web/static/js/script.js @@ -226,10 +226,92 @@ function checkProcessStatusChanges() { }); } +// 检查并打开SwanLab链接 +function checkAndOpenSwanlab(processId) { + // 首先检查训练监控设置 + const processItem = document.querySelector(`[data-process-id="${processId}"]`); + const trainMonitor = processItem ? processItem.dataset.trainMonitor : 'none'; + + // 如果训练监控设置为'none',显示提示信息 + if (trainMonitor === 'none') { + showNotification('此训练未启用监控功能', 'info'); + return; + } + + // 首先从DOM中获取最新的URL + let currentUrl = processItem ? processItem.dataset.swanlabUrl : ''; + + // 如果没有URL或URL不完整,尝试从后端获取最新的进程信息 + if (!currentUrl || currentUrl.trim() === '') { + fetch('/processes') + .then(response => response.json()) + .then(data => { + const process = data.find(p => p.id === processId); + if (process && process.swanlab_url) { + currentUrl = process.swanlab_url; + // 更新DOM中的URL数据属性 + if (processItem) { + processItem.dataset.swanlabUrl = currentUrl; + } + openSwanlab(processId, currentUrl); + } else { + // 没有找到有效的链接 + showNotification('SwanLab链接尚未生成,请稍后再试', 'info'); + } + }) + .catch(error => { + console.error('获取进程信息失败:', error); + showNotification('获取SwanLab链接失败,请稍后再试', 'error'); + }); + } else { + // 有URL,直接打开 + openSwanlab(processId, currentUrl); + } +} + +// 打开SwanLab链接 +function openSwanlab(processId, url) { + // 检查URL是否有效 + if (!url || typeof url !== 'string' || url.trim() === '' || !isValidUrl(url)) { + showNotification('SwanLab链接无效或尚未生成', 'info'); + return; + } + + // 在新窗口打开链接 + const newWindow = window.open(url, '_blank'); + + // 检查窗口是否成功打开 + if (newWindow) { + // 显示成功通知 + showNotification('正在打开SwanLab页面', 'info'); + } else { + // 弹出窗口被阻止 + showNotification('无法打开新窗口,请检查浏览器设置', 'error'); + } +} + +// 检查URL是否有效 +function isValidUrl(url) { + try { + // 尝试创建URL对象,如果失败说明URL无效 + new URL(url); + return true; + } catch (error) { + // 简单检查是否以http或https开头 + return url.toLowerCase().startsWith('http://') || url.toLowerCase().startsWith('https://'); + } +} + // 更新单个进程项 function updateProcessItem(processItem, process) { // 更新数据属性 processItem.dataset.processStatus = process.status; + processItem.dataset.trainMonitor = process.train_monitor || 'none'; + + // 更新SwanLab URL数据属性 + if (process.swanlab_url) { + processItem.dataset.swanlabUrl = process.swanlab_url; + } // 更新状态类和文本 const statusElement = processItem.querySelector('.process-status'); @@ -253,23 +335,56 @@ function updateProcessItem(processItem, process) { statusElement.textContent = process.status; } + // 更新SwanLab按钮 + const existingSwanlabButton = processItem.querySelector('.btn-swanlab'); + const buttonContainer = processItem.querySelector('div:nth-child(2)'); // 按钮容器是第二个div + + // 只有当train_monitor不是'none'时才显示SwanLab按钮 + const shouldShowSwanlab = process.train_monitor !== 'none'; + + // 如果应该显示按钮但不存在,则创建并添加 + if (shouldShowSwanlab && !existingSwanlabButton && buttonContainer) { + const swanlabButton = document.createElement('button'); + swanlabButton.className = 'btn-swanlab'; + swanlabButton.textContent = 'SwanLab'; + swanlabButton.onclick = function() { + checkAndOpenSwanlab(process.id); + }; + + // 插入到停止按钮之前 + const stopButton = buttonContainer.querySelector('.btn-stop'); + if (stopButton) { + buttonContainer.insertBefore(swanlabButton, stopButton); + } else { + // 如果没有停止按钮,插入到刷新按钮之后 + const refreshButton = buttonContainer.querySelector('.btn-logs:nth-child(2)'); + if (refreshButton) { + buttonContainer.insertBefore(swanlabButton, refreshButton.nextSibling); + } + } + } else if (!shouldShowSwanlab && existingSwanlabButton) { + // 如果不应该显示按钮但存在,则移除 + existingSwanlabButton.remove(); + } else if (existingSwanlabButton) { + // 更新现有按钮的点击事件 + existingSwanlabButton.onclick = function() { + checkAndOpenSwanlab(process.id); + }; + } + // 更新停止按钮 const stopButton = processItem.querySelector('.btn-stop'); if (stopButton) { if (!process.running) { stopButton.remove(); } - } else if (process.running) { - // 如果按钮不存在但进程仍在运行,添加停止按钮 - const buttonContainer = processItem.querySelector('div:last-child'); - if (buttonContainer) { + } else if (process.running && buttonContainer) { const newStopButton = document.createElement('button'); newStopButton.className = 'btn-stop'; newStopButton.onclick = () => stopProcess(process.id); newStopButton.textContent = '停止训练'; buttonContainer.appendChild(newStopButton); } - } // 处理删除按钮 const deleteButton = processItem.querySelector('.btn-delete'); @@ -465,10 +580,18 @@ function addProcessItemToGroup(parentElement, process) { // 设置进程数据属性,用于后续检查状态 processItem.dataset.processId = process.id; processItem.dataset.processStatus = process.status; + processItem.dataset.trainMonitor = process.train_monitor || 'none'; + processItem.dataset.swanlabUrl = process.swanlab_url || ''; // 检查是否显示删除按钮(对于非运行中的进程) const showDeleteButton = !process.running; + // 只有当train_monitor不是'none'时才显示SwanLab按钮 + const showSwanlabButton = process.train_monitor !== 'none'; + const swanlabButton = showSwanlabButton ? `` : ''; + processItem.innerHTML = `
@@ -481,6 +604,7 @@ function addProcessItemToGroup(parentElement, process) {
+ ${swanlabButton} ${process.running ? `` : ''} ${showDeleteButton ? `` : ''}
diff --git a/trainer_web/train_web_ui.py b/trainer_web/train_web_ui.py index 9b573ec..fd2f455 100644 --- a/trainer_web/train_web_ui.py +++ b/trainer_web/train_web_ui.py @@ -175,7 +175,10 @@ def start_training_process(train_type, params): 'log_file': log_file, 'start_time': time.strftime('%Y-%m-%d %H:%M:%S'), 'running': True, - 'error': False + 'error': False, + 'train_monitor': params.get('train_monitor', 'none'), # 保存训练监控设置 + 'swanlab_url': None, + 'next_line_is_swanlab_url': False } # 开始读取输出 @@ -186,6 +189,16 @@ def start_training_process(train_type, params): if output == '' and process.poll() is not None: break if output: + # 检查是否是swanlab链接的行 + output_stripped = output.strip() + if training_processes[process_id]['next_line_is_swanlab_url']: + # 保存swanlab链接 + training_processes[process_id]['swanlab_url'] = output_stripped + training_processes[process_id]['next_line_is_swanlab_url'] = False + elif 'swanlab: 🚀 View run at' in output_stripped: + # 标记下一行是swanlab链接 + training_processes[process_id]['next_line_is_swanlab_url'] = True + with open(log_file, 'a') as f: f.write(output) # 检查进程是否成功结束 @@ -240,7 +253,9 @@ def processes(): 'start_time': info['start_time'], 'running': info['running'], 'error': info['error'], - 'status': status + 'status': status, + 'train_monitor': info.get('train_monitor', 'none'), # 添加train_monitor字段 + 'swanlab_url': info.get('swanlab_url') # 添加swanlab_url字段 }) return jsonify(result) @@ -265,12 +280,26 @@ def logs(process_id): return '日志文件不存在或已被删除' try: - # 使用高效的方法读取文件的最后200行 - # 这对于大文件特别有用,可以避免读取整个文件 - last_200_lines = [] - block_size = 8192 # 8KB blocks + # 使用二进制模式读取,然后尝试解码以处理不同编码的日志文件 + def read_log_file_robust(file_path): + # 尝试多种编码方式读取文件 + encodings = ['utf-8', 'latin-1', 'gbk', 'gb2312'] + for encoding in encodings: + try: + with open(file_path, 'r', encoding=encoding) as f: + return f.read(), encoding + except UnicodeDecodeError: + continue + # 如果所有编码都失败,使用二进制模式读取并替换不可解码的字符 + with open(file_path, 'rb') as f: + content = f.read() + return content.decode('utf-8', errors='replace'), 'binary_decoded' - with open(log_file, 'r', encoding='utf-8') as f: + # 使用高效的方法读取文件的最后200行,确保以完整行为单位 + last_200_lines = [] + + # 先尝试使用二进制模式读取文件末尾的部分 + with open(log_file, 'rb') as f: # 尝试直接定位到文件末尾,然后向前读取 f.seek(0, os.SEEK_END) file_size = f.tell() @@ -278,7 +307,11 @@ def logs(process_id): # 计算需要读取的块数 position = file_size blocks = [] - while position > 0: + block_size = 8192 # 8KB blocks + + # 确保我们有足够的数据来处理完整行 + found_complete_lines = False + while position > 0 and not found_complete_lines: # 后退一个块的位置 position -= block_size if position < 0: @@ -291,19 +324,36 @@ def logs(process_id): block = f.read(block_size) blocks.append(block) - # 如果已经收集了足够的行,就停止 - combined_text = ''.join(reversed(blocks)) - lines = combined_text.splitlines(True) - if len(lines) >= 200: - # 获取最后200行 - last_200_lines = lines[-200:] - break + # 如果已经收集了足够的数据,尝试解码并检查行数 + combined_binary = b''.join(blocks) + # 尝试解码,使用errors='replace'处理无法解码的字符 + try: + combined_text = combined_binary.decode('utf-8', errors='replace') + except: + combined_text = combined_binary.decode('latin-1') + + lines = combined_text.splitlines(True) # 使用True保留换行符 + + # 确保我们不返回不完整的第一行 + if len(lines) > 0: + # 如果有足够的行,确保我们从一个完整行开始 + if len(lines) > 1: + # 跳过可能不完整的第一行 + last_200_lines = lines[1:] + else: + last_200_lines = lines + + # 如果我们有足够的行,停止读取 + if len(last_200_lines) >= 200: + # 获取最后200行 + last_200_lines = last_200_lines[-200:] + found_complete_lines = True # 如果文件内容不足200行,或者上面的方法没有收集到足够的行 if len(last_200_lines) < 200: # 重新读取整个文件(对于小文件) - f.seek(0) - all_lines = f.readlines() + content, encoding = read_log_file_robust(log_file) + all_lines = content.splitlines(True) # 使用True保留换行符 last_200_lines = all_lines[-200:] if len(all_lines) > 200 else all_lines return ''.join(last_200_lines) @@ -351,9 +401,27 @@ def get_logfile_content(filename): log_file = os.path.join(log_dir, filename) try: - # 读取完整的日志文件内容 - with open(log_file, 'r', encoding='utf-8') as f: - content = f.read() + # 使用二进制模式读取文件,可以更可靠地保留原始换行符 + with open(log_file, 'rb') as f: + content_bytes = f.read() + + # 尝试多种编码方式解码,确保正确处理换行符 + encodings = ['utf-8', 'latin-1', 'gbk', 'gb2312'] + content = None + + for encoding in encodings: + try: + # 解码文件内容,保留原始换行符 + content = content_bytes.decode(encoding) + break + except UnicodeDecodeError: + continue + + # 如果所有编码都失败,使用errors='replace'参数处理不可解码的字符 + if content is None: + content = content_bytes.decode('utf-8', errors='replace') + + # 确保返回的内容正确保留所有换行符 return content except FileNotFoundError: return jsonify({'error': 'Log file not found'}), 404 @@ -470,7 +538,9 @@ def save_processes_info(): 'start_time': info['start_time'], 'running': info['running'], 'error': info.get('error', False), - 'manually_stopped': info.get('manually_stopped', False) + 'manually_stopped': info.get('manually_stopped', False), + 'train_monitor': info.get('train_monitor', 'none'), # 保存train_monitor + 'swanlab_url': info.get('swanlab_url') # 保存swanlab_url } with open(PROCESSES_FILE, 'w', encoding='utf-8') as f: @@ -488,6 +558,16 @@ def load_processes_info(): # 检查每个进程是否还在运行 for pid, info in loaded_processes.items(): + # 确保所有需要的字段都存在 + if 'swanlab_url' not in info: + info['swanlab_url'] = None + if 'manually_stopped' not in info: + info['manually_stopped'] = False + if 'error' not in info: + info['error'] = False + if 'train_monitor' not in info: + info['train_monitor'] = 'none' + if info['running']: try: # 检查进程是否还在运行 @@ -498,10 +578,16 @@ def load_processes_info(): else: # 进程已停止 info['running'] = False + # 如果进程未被明确标记为完成或出错,则默认为手动停止 + if not info['error']: + info['manually_stopped'] = True training_processes[pid] = info except (psutil.NoSuchProcess, psutil.AccessDenied): # 进程不存在或无权限访问 info['running'] = False + # 如果进程未被明确标记为完成或出错,则默认为手动停止 + if not info['error']: + info['manually_stopped'] = True training_processes[pid] = info else: # 进程已停止,直接恢复