# zydi-web/test_history/llama.py

import io
import os
import json
import base64
import requests
import re
from PIL import Image
from datetime import datetime, timedelta
from decord import VideoReader, cpu

# HTTP endpoint of the Ollama generate API on the local machine; every
# per-frame analysis request is POSTed to this URL.
OLLAMA_URL = "http://127.0.0.1:11434/api/generate"
class MediaAnalysisSystem:
    """Analyse video clips frame by frame with an Ollama vision model.

    A clip is decoded, a handful of evenly spaced frames is sampled, each
    frame is sent to the Ollama endpoint together with a fixed analysis
    prompt, and the textual answers are merged and mined for keywords.
    """

    # Model tag sent to Ollama with every request.
    MODEL_NAME = "llama3.2-vision"

    # (connect, read) timeouts in seconds. Without a timeout a stalled server
    # would block the whole batch forever; the read timeout is generous
    # because vision models can take a long time to emit their first token.
    REQUEST_TIMEOUT = (10, 600)

    # Prompt sent with every frame (kept verbatim, including its indentation).
    ANALYSIS_PROMPT = """Please provide a detailed analysis of this surveillance image, including the following aspects:
                1. Precise count of people in the scene
                2. Individual behavior analysis of each person
                3. Facial expression recognition and emotional state assessment
                4. Detailed description of overall scene and environment
                5. Interactions between people
                6. Detailed description of environmental conditions
                7. Items and furniture present in the environment
                8. Any suspicious or unusual activities
                9. Specific characteristics of people (estimated age range, gender, clothing)
                10. Movement patterns and directions of people
                11. Carried items or objects
                12. Group dynamics and gathering situations
                13. Analysis of video timestamp (if present)

                Please describe in a clear, organized format and highlight important findings."""

    def __init__(self):
        # NOTE(review): this limit is not consulted by encode_video, which
        # applies its own 3-8 frame bounds; kept for backward compatibility.
        self.MAX_NUM_FRAMES = 16

    @staticmethod
    def _sample_frame_indices(total_frames):
        """Return evenly spaced frame indices for a clip of *total_frames* frames.

        Roughly one frame per 30 is taken, clamped to between 3 and 8 frames,
        and never more than the clip contains (so very short clips do not
        yield the same frame twice). Each index is the midpoint of its
        equally sized segment.

        Raises:
            ValueError: if the clip has no frames.
        """
        if total_frames <= 0:
            raise ValueError("Video contains no frames")
        num_frames = min(max(3, total_frames // 30), 8, total_frames)
        gap = total_frames / num_frames
        return [int(i * gap + gap / 2) for i in range(num_frames)]

    def encode_video(self, video_data):
        """Decode raw video bytes and return the sampled frames as PIL images.

        Args:
            video_data: the encoded video file content as bytes.

        Returns:
            A list of ``PIL.Image`` objects, one per sampled frame.

        Raises:
            ValueError: if the decoded video has no frames.
        """
        video_file = io.BytesIO(video_data)
        vr = VideoReader(video_file, ctx=cpu(0))
        # Uniform (not random) sampling: 3 to 8 frames spread over the clip.
        frame_idx = self._sample_frame_indices(len(vr))
        frames = vr.get_batch(frame_idx).asnumpy()
        frames = [Image.fromarray(v.astype('uint8')) for v in frames]
        print('采样帧数:', len(frames))
        return frames

    def process_video(self, video_data, object_name):
        """Analyse one video and return the merged model answer plus keywords.

        Args:
            video_data: the encoded video file content as bytes.
            object_name: name of the video, used only in messages.

        Returns:
            A dict with ``original_answer`` (all per-frame answers joined by
            a separator), ``extracted_info`` (see :meth:`extract_info`) and
            ``num_frames`` (number of frames analysed).

        Raises:
            ValueError: if *video_data* is empty or the video has no frames.
            requests.RequestException: if a request to Ollama fails or
                times out (logged, then re-raised).
            RuntimeError: if Ollama answers with a non-200 status code.
        """
        if not video_data:
            raise ValueError(f"Empty video data for {object_name}")
        print(f"处理视频: {object_name}, 数据大小: {len(video_data)} bytes")

        frames = self.encode_video(video_data)
        all_responses = []

        # Analyse frame by frame; each request carries exactly one image.
        for i, frame in enumerate(frames, start=1):
            print(f"Analyzing frame {i}/{len(frames)}...")
            payload = {
                "model": self.MODEL_NAME,
                "prompt": self.ANALYSIS_PROMPT,
                "images": [self.image_to_base64(frame)],
            }

            try:
                # The context manager releases the streamed connection even
                # when reading stops early or an error is raised.
                with requests.post(
                    OLLAMA_URL,
                    json=payload,
                    stream=True,
                    timeout=self.REQUEST_TIMEOUT,
                ) as response:
                    if response.status_code != 200:
                        raise RuntimeError(f"Ollama API 错误: {response.status_code}")
                    all_responses.append(self.process_stream_response(response))
            except requests.RequestException as e:
                print(f"请求 Ollama API 时出错: {str(e)}")
                raise

        # Merge the per-frame answers into one text and mine it for keywords.
        combined_answer = "\n\n=== 视频总体分析 ===\n".join(all_responses)
        extracted_info = self.extract_info(combined_answer)

        return {
            "original_answer": combined_answer,
            "extracted_info": extracted_info,
            "num_frames": len(frames),
        }

    def process_stream_response(self, response):
        """Concatenate the ``response`` fragments of a line-delimited JSON stream.

        Args:
            response: an object whose ``iter_lines()`` yields one JSON
                document per line (empty lines are skipped).

        Returns:
            The joined text of all ``response`` fields seen up to and
            including the first document whose ``done`` field is truthy.
            Lines that are not valid JSON are reported and skipped.
        """
        full_response = []
        for line in response.iter_lines():
            if not line:
                continue
            try:
                json_response = json.loads(line)
            except json.JSONDecodeError:
                print(f"无法解析 JSON 行: {line}")
                continue
            if 'response' in json_response:
                full_response.append(json_response['response'])
            if json_response.get('done', False):
                break
        return ''.join(full_response)

    @staticmethod
    def image_to_base64(image):
        """Serialise *image* as PNG and return it as a base64 text string."""
        buffered = io.BytesIO()
        image.save(buffered, format="PNG")
        return base64.b64encode(buffered.getvalue()).decode()

    @staticmethod
    def extract_time_from_filename(object_name):
        """Derive a (start, end) time pair from a ``YYYYMMDD_HHMMSS...`` file name.

        The first two underscore-separated fields of the base name (the
        second one cut at its first dot) are parsed as date and time; the end
        time is always ten seconds after the start time.

        Returns:
            ``(start_time, end_time)`` as ``datetime`` objects. When the name
            does not follow the expected pattern (including names without any
            underscore), the current time is used as the start time.
        """
        filename = os.path.basename(object_name)
        parts = filename.split('_')

        try:
            time_str = parts[0] + '_' + parts[1].split('.')[0]
            start_time = datetime.strptime(time_str, "%Y%m%d_%H%M%S")
        except (IndexError, ValueError):
            # IndexError: no underscore in the name; ValueError: wrong format.
            print(f"无法从文件名 '{filename}' 解析时间。使用默认时间。")
            start_time = datetime.now()
        # Derive the end from the same start so the window is exactly 10 s.
        return start_time, start_time + timedelta(seconds=10)

    @staticmethod
    def extract_info(answer):
        """Mine a free-text model answer for simple keyword-based facts.

        Args:
            answer: the model's textual description.

        Returns:
            A dict with:
            ``environment`` - first matching environment keyword or None;
            ``num_people`` - a count parsed from the first matching people
            pattern, 0 when people are mentioned without a parseable number,
            None when no pattern matches;
            ``actions``, ``objects``, ``furniture``, ``emotions``,
            ``features`` - keywords found in the text, without duplicates.

        Environment and people detection ignore case. The keyword lists are
        matched as case-sensitive substrings (so short keywords can also hit
        inside longer words).
        """
        info = {
            "environment": None,
            "num_people": None,
            "actions": [],
            "objects": [],
            "furniture": [],
            "emotions": [],
            "features": [],
        }
        lowered = answer.lower()

        # "room" must come after the compound names that contain it,
        # otherwise "classroom", "bedroom", ... could never be reported.
        environments = (
            "office", "indoor", "outdoor", "meeting room", "classroom",
            "living room", "bedroom", "kitchen", "bathroom", "room",
            "hallway", "corridor",
        )
        info["environment"] = next(
            (env for env in environments if env in lowered), None
        )

        people_patterns = (
            r'(\d+)\s*(person|people|individual|employee|user|child|adult|female|male)',
            r'(one|two|three|four|five|six|seven|eight|nine|ten)\s*(person|people|individual|employee|user|child|adult|female|male)',
            r'(a|an|few)\s*(person|people|employee|user|child|adult|female|male)',
            r'several\s*(person|people|employee|user|child|adult|female|male)?',
            r'(male|female)',
            r'(adult|minor|teenager|elderly)\s*(person|group)',
            r'(employee|worker|student|customer|audience|visitor|passenger)',
            r'(crowd|public|people|mass)',
            r'(men|women|adults|children)',
        )
        word_to_count = {
            'a': 1, 'an': 1, 'one': 1,
            'two': 2, 'three': 3, 'four': 4, 'five': 5,
            'six': 6, 'seven': 7, 'eight': 8, 'nine': 9, 'ten': 10,
        }
        for pattern in people_patterns:
            match = re.search(pattern, lowered)
            if not match:
                continue
            # group(1) is None when "several" matches without a noun.
            token = match.group(1)
            if token is not None and token.isdigit():
                info["num_people"] = int(token)
            else:
                # People are mentioned but no number can be derived -> 0.
                info["num_people"] = word_to_count.get(token, 0)
            break

        # Each keyword appears once so the result lists hold no duplicates.
        actions = (
            "sleeping", "sitting", "drinking", "eating", "standing", "falling",
            "dancing", "squatting", "squat", "turning", "fall", "falling down",
            "lying down", "turning around", "turn", "jumping", "jump", "lying",
            "sleep", "talking", "getting up", "reading", "writing", "studying",
            "phone", "moving things", "sightseeing", "walking", "strolling",
            "walk", "using phone", "computer", "working", "laptop",
            "organizing",
        )
        emotions = (
            "happy", "angry", "sad", "surprised", "scared", "disgusted",
            "calm", "relaxed", "neutral", "focused", "thinking",
        )
        objects = (
            "water bottle", "office supplies", "documents", "computer", "fan",
            "mouse", "keyboard", "tissue", "book", "pen", "bag", "box",
            "water cup", "cup", "mug", "glass", "folder", "backpack",
            "bookshelf", "file cabinet", "phone",
        )
        furniture = (
            "chair", "table", "coffee table", "file cabinet", "bed", "sofa",
            "cabinet", "shelf", "camera", "cushion", "office chair", "TV",
            "whiteboard", "monitor", "storage rack", "file rack",
        )
        features = (
            "wearing glasses", "not wearing glasses", "long hair",
            "short hair", "wearing hat", "not wearing hat", "wearing mask",
            "not wearing mask", "male", "female", "fat", "thin", "tall",
            "short", "man", "woman", "adult",
        )

        keyword_groups = (
            ("actions", actions),
            ("objects", objects),
            ("furniture", furniture),
            ("features", features),
            ("emotions", emotions),
        )
        for key, keywords in keyword_groups:
            info[key] = [word for word in keywords if word in answer]

        return info

# Module-level MediaAnalysisSystem instance, created when this module is loaded.
media_analysis_system = MediaAnalysisSystem()

class MediaAnalysisError(Exception):
    """Custom exception type for media-analysis failures."""

def _save_results(results, output_file):
    """Write the accumulated *results* to *output_file* as UTF-8 JSON."""
    with open(output_file, 'w', encoding='utf-8') as out_fh:
        json.dump(results, out_fh, ensure_ascii=False, indent=2)


def process_video_folder(system, folder_path, output_path=None):
    """Analyse every video file in a folder and save the results as JSON.

    Args:
        system: object providing ``process_video(video_data, name)``.
        folder_path: directory that is scanned (non-recursively) for files
            with a ``.mp4``, ``.avi``, ``.mov`` or ``.mkv`` extension.
        output_path: directory for the result file; it is created when
            missing. Defaults to the current working directory.

    Returns:
        A dict mapping each video file name to
        ``{"video_analysis": {"llama3.2-vision": <result or {"error": msg}>}}``.
        Videos are processed in sorted name order, and the JSON file is
        rewritten after every video so partial results survive an abort.

    Raises:
        MediaAnalysisError: if *folder_path* is not an existing directory or
            contains no supported video file.
    """
    # Supported video formats.
    valid_extensions = {'.mp4', '.avi', '.mov', '.mkv'}
    results = {}

    # isdir (not exists): a regular file would make os.listdir fail below.
    if not os.path.isdir(folder_path):
        raise MediaAnalysisError(f"错误：文件夹 '{folder_path}' 不存在")

    if output_path is None:
        output_path = os.getcwd()
    else:
        # exist_ok avoids the check-then-create race of a separate exists().
        os.makedirs(output_path, exist_ok=True)

    # Regular files only, in a deterministic order.
    video_files = sorted(
        name for name in os.listdir(folder_path)
        if os.path.splitext(name)[1].lower() in valid_extensions
        and os.path.isfile(os.path.join(folder_path, name))
    )

    if not video_files:
        raise MediaAnalysisError(f"错误：在文件夹 '{folder_path}' 中未找到支持的视频文件")

    print(f"\n找到 {len(video_files)} 个视频文件，开始处理...\n")

    # Result file name: analysis_results_<folder>_<timestamp>.json
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    folder_name = os.path.basename(os.path.normpath(folder_path))
    output_file = os.path.join(output_path, f"analysis_results_{folder_name}_{timestamp}.json")

    for i, video_file in enumerate(video_files, 1):
        video_path = os.path.join(folder_path, video_file)
        print(f"正在处理 ({i}/{len(video_files)}): {video_file}")

        try:
            # Close the video file before the (long-running) analysis starts.
            with open(video_path, "rb") as video_fh:
                video_data = video_fh.read()
            result = system.process_video(video_data, video_file)
            results[video_file] = {
                "video_analysis": {
                    "llama3.2-vision": result
                }
            }
            _save_results(results, output_file)
            print(f"✓ 成功处理并保存: {video_file}")
        except Exception as e:
            # Batch boundary: one failing video must not stop the others.
            print(f"✗ 处理失败 {video_file}: {str(e)}")
            results[video_file] = {
                "video_analysis": {
                    "llama3.2-vision": {"error": str(e)}
                }
            }
            # Persist the failure as well so the file reflects every video.
            _save_results(results, output_file)

    print(f"\n所有分析结果已保存到: {output_file}")
    return results

# NOTE(review): MediaAnalysisError is already declared earlier in this module;
# this second declaration rebinds the name. Consider keeping only one of them.
class MediaAnalysisError(Exception):
    """Custom exception type raised for media-analysis errors."""

def main():
    """Interactive entry point: ask for the folders, run the batch, print a summary.

    Prompts for the video folder and an optional output folder (empty input
    means the current directory), processes every video in the folder and
    prints how many were analysed successfully. Errors are reported on
    stdout instead of being propagated.
    """
    try:
        system = MediaAnalysisSystem()

        folder_path = input("请输入视频文件夹路径: ").strip()
        # Empty input -> None, which selects the current directory.
        output_path = input("请输入结果保存路径 (直接回车使用当前目录): ").strip() or None

        results = process_video_folder(system, folder_path, output_path)

        # A failure is recorded as {"error": ...} nested under
        # video_analysis -> llama3.2-vision, so look there rather than at the
        # outer wrapper (which never has an "error" key).
        success_count = sum(
            1
            for entry in results.values()
            if "error" not in entry.get("video_analysis", {}).get("llama3.2-vision", {})
        )
        print(f"\n处理完成！成功: {success_count}/{len(results)}")

    except MediaAnalysisError as e:
        print(f"\n错误: {str(e)}")
    except Exception as e:
        # Top-level boundary of the script: report instead of a traceback.
        print(f"\n未预期的错误: {str(e)}")

# Run the interactive entry point only when this file is executed as a script.
if __name__ == "__main__":
    main()