Compare commits

14 Commits

| Author | SHA1 | Date |
|---|---|---|
| | a7180c8259 | |
| | 47d11ae529 | |
| | dd92099951 | |
| | 8a76e128c4 | |
| | 7eb60ea1d4 | |
| | a2a52f0b83 | |
| | 4d358c9e3b | |
| | db6bcffb5d | |
| | e87a737808 | |
| | 1ae9d7d8e2 | |
| | d83e00d087 | |
| | 6ac843b87e | |
| | b340db3415 | |
| | 17afef4abb | |
Dockerfile-CN (new file, 79 lines)
@@ -0,0 +1,79 @@
FROM python:3.11-slim

SHELL ["/bin/bash", "-o", "pipefail", "-c"]

# Satisfy the Chinese Mainland Internet environment
ENV PIP_INDEX_URL=https://mirrors.aliyun.com/pypi/simple/ \
    PIP_TRUSTED_HOST=mirrors.aliyun.com

RUN sed -i 's/deb.debian.org/mirrors.ustc.edu.cn/g' /etc/apt/sources.list.d/debian.sources

# Prevent Python from writing .pyc files, buffer stdout/stderr, and pin common tooling paths
ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    PIP_NO_CACHE_DIR=1 \
    PATH="/root/.local/bin:${PATH}" \
    PLAYWRIGHT_BROWSERS_PATH=/ms-playwright

# Install system dependencies required by the scientific Python stack, Playwright, and Streamlit
RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    curl \
    git \
    libgl1 \
    libglib2.0-0 \
    libgtk-3-0 \
    libpango-1.0-0 \
    libpangocairo-1.0-0 \
    libatk1.0-0 \
    libatk-bridge2.0-0 \
    libxcb1 \
    libxcomposite1 \
    libxdamage1 \
    libxext6 \
    libxfixes3 \
    libxi6 \
    libxtst6 \
    libnss3 \
    libxrandr2 \
    libxkbcommon0 \
    libasound2 \
    libx11-xcb1 \
    libxshmfence1 \
    libgbm1 \
    ffmpeg \
    tar \
    && apt-get clean && rm -rf /var/lib/apt/lists/*

# Install a pinned uv release and expose it on PATH
# Use a GitHub accelerated source to download the uv package (avoids direct access to astral.sh)
RUN curl -LsSf "https://wget.la/https://github.com/astral-sh/uv/releases/download/0.9.7/uv-x86_64-unknown-linux-musl.tar.gz" \
    | tar -xz -C /usr/local/bin --strip-components=1 && \
    uv --version

# Set the uv pip mirror for the Chinese Mainland Internet environment
ENV UV_INDEX_URL="https://mirrors.aliyun.com/pypi/simple/" \
    UV_TRUSTED_HOST="mirrors.aliyun.com"

WORKDIR /app

# Install Python dependencies first to leverage Docker layer caching
COPY requirements.txt ./
RUN uv pip install --system -r requirements.txt

# Install Playwright browser binaries (system deps already handled above)
RUN python -m playwright install chromium

# Seed .env from the example template
COPY .env.example .env

# Copy application source
COPY . .

# Ensure runtime directories exist even if ignored in build context
RUN mkdir -p /ms-playwright logs final_reports insight_engine_streamlit_reports media_engine_streamlit_reports query_engine_streamlit_reports

EXPOSE 5000 8501 8502 8503

# Default command launches the Flask orchestrator which starts Streamlit agents
CMD ["python", "app.py"]

@@ -11,13 +11,16 @@ import re

try:
    import torch

    TORCH_AVAILABLE = True
    torch.classes.__path__ = []
except ImportError:
    torch = None  # type: ignore
    TORCH_AVAILABLE = False

try:
    from transformers import AutoTokenizer, AutoModelForSequenceClassification

    TRANSFORMERS_AVAILABLE = True
except ImportError:
    AutoTokenizer = None  # type: ignore

@@ -28,6 +31,7 @@ except ImportError:

# INFO:若想跳过情感分析,可手动切换此开关为False
SENTIMENT_ANALYSIS_ENABLED = True


def _describe_missing_dependencies() -> str:
    missing = []
    if not TORCH_AVAILABLE:

@@ -36,14 +40,21 @@ def _describe_missing_dependencies() -> str:
        missing.append("Transformers")
    return " / ".join(missing)


# 添加项目根目录到路径,以便导入WeiboMultilingualSentiment
project_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
weibo_sentiment_path = os.path.join(project_root, "SentimentAnalysisModel", "WeiboMultilingualSentiment")
project_root = os.path.dirname(
    os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
)
weibo_sentiment_path = os.path.join(
    project_root, "SentimentAnalysisModel", "WeiboMultilingualSentiment"
)
sys.path.append(weibo_sentiment_path)


@dataclass
class SentimentResult:
    """情感分析结果数据类"""

    text: str
    sentiment_label: str
    confidence: float

@@ -53,9 +64,10 @@ class SentimentResult:
    analysis_performed: bool = True


@dataclass
@dataclass
class BatchSentimentResult:
    """批量情感分析结果数据类"""

    results: List[SentimentResult]
    total_processed: int
    success_count: int

@@ -69,7 +81,7 @@ class WeiboMultilingualSentimentAnalyzer:
    多语言情感分析器
    封装WeiboMultilingualSentiment模型,为AI Agent提供情感分析功能
    """

    def __init__(self):
        """初始化情感分析器"""
        self.model = None

@@ -78,14 +90,14 @@ class WeiboMultilingualSentimentAnalyzer:
        self.is_initialized = False
        self.is_disabled = False
        self.disable_reason: Optional[str] = None

        # 情感标签映射(5级分类)
        self.sentiment_map = {
            0: "非常负面",
            1: "负面",
            2: "中性",
            3: "正面",
            4: "非常正面"
            0: "非常负面",
            1: "负面",
            2: "中性",
            3: "正面",
            4: "非常正面",
        }

        if not SENTIMENT_ANALYSIS_ENABLED:

@@ -96,9 +108,13 @@ class WeiboMultilingualSentimentAnalyzer:

        if self.is_disabled:
            reason = self.disable_reason or "Sentiment analysis disabled."
            print(f"WeiboMultilingualSentimentAnalyzer initialized but disabled: {reason}")
            print(
                f"WeiboMultilingualSentimentAnalyzer initialized but disabled: {reason}"
            )
        else:
            print("WeiboMultilingualSentimentAnalyzer 已创建,调用 initialize() 来加载模型")
            print(
                "WeiboMultilingualSentimentAnalyzer 已创建,调用 initialize() 来加载模型"
            )

    def disable(self, reason: Optional[str] = None, drop_state: bool = False) -> None:
        """Disable sentiment analysis, optionally clearing loaded resources."""

@@ -127,17 +143,22 @@ class WeiboMultilingualSentimentAnalyzer:
        """Select the best available torch device."""
        if not TORCH_AVAILABLE:
            return None
        assert torch is not None
        if torch.cuda.is_available():
            return torch.device("cuda")
        mps_backend = getattr(torch.backends, "mps", None)
        if mps_backend and getattr(mps_backend, "is_available", lambda: False)() and getattr(mps_backend, "is_built", lambda: False)():
        if (
            mps_backend
            and getattr(mps_backend, "is_available", lambda: False)()
            and getattr(mps_backend, "is_built", lambda: False)()
        ):
            return torch.device("mps")
        return torch.device("cpu")

    def initialize(self) -> bool:
        """
        初始化模型和分词器

        Returns:
            是否初始化成功
        """

@@ -155,31 +176,37 @@ class WeiboMultilingualSentimentAnalyzer:
        if self.is_initialized:
            print("模型已经初始化,无需重复加载")
            return True

        try:
            print("正在加载多语言情感分析模型...")

            assert AutoTokenizer is not None
            assert AutoModelForSequenceClassification is not None

            # 使用多语言情感分析模型
            model_name = "tabularisai/multilingual-sentiment-analysis"
            local_model_path = os.path.join(weibo_sentiment_path, "model")

            # 检查本地是否已有模型
            if os.path.exists(local_model_path):
                print("从本地加载模型...")
                self.tokenizer = AutoTokenizer.from_pretrained(local_model_path)
                self.model = AutoModelForSequenceClassification.from_pretrained(local_model_path)
                self.model = AutoModelForSequenceClassification.from_pretrained(
                    local_model_path
                )
            else:
                print("首次使用,正在下载模型到本地...")
                # 下载并保存到本地
                self.tokenizer = AutoTokenizer.from_pretrained(model_name)
                self.model = AutoModelForSequenceClassification.from_pretrained(model_name)

                self.model = AutoModelForSequenceClassification.from_pretrained(
                    model_name
                )

                # 保存到本地
                os.makedirs(local_model_path, exist_ok=True)
                self.tokenizer.save_pretrained(local_model_path)
                self.model.save_pretrained(local_model_path)
                print(f"模型已保存到: {local_model_path}")

            # 设置设备
            device = self._select_device()
            if device is None:

@@ -198,46 +225,46 @@ class WeiboMultilingualSentimentAnalyzer:
                print("检测到 Apple MPS 设备,已使用 MPS 进行推理。")
            else:
                print("未检测到 GPU,自动使用 CPU 进行推理。")

            print(f"模型加载成功! 使用设备: {self.device}")
            print("支持语言: 中文、英文、西班牙文、阿拉伯文、日文、韩文等22种语言")
            print("情感等级: 非常负面、负面、中性、正面、非常正面")

            return True

        except Exception as e:
            error_message = f"模型加载失败: {e}"
            print(error_message)
            print("请检查网络连接或模型文件")
            self.disable(error_message, drop_state=True)
            return False

    def _preprocess_text(self, text: str) -> str:
        """
        文本预处理

        Args:
            text: 输入文本

        Returns:
            处理后的文本
        """
        # 基本文本清理
        if not text or not text.strip():
            return ""

        # 去除多余空格
        text = re.sub(r'\s+', ' ', text.strip())

        text = re.sub(r"\s+", " ", text.strip())

        return text

    def analyze_single_text(self, text: str) -> SentimentResult:
        """
        对单个文本进行情感分析

        Args:
            text: 要分析的文本

        Returns:
            SentimentResult对象
        """

@@ -249,7 +276,7 @@ class WeiboMultilingualSentimentAnalyzer:
                probability_distribution={},
                success=False,
                error_message=self.disable_reason or "情感分析功能已禁用",
                analysis_performed=False
                analysis_performed=False,
            )

        if not self.is_initialized:

@@ -260,7 +287,7 @@ class WeiboMultilingualSentimentAnalyzer:
                probability_distribution={},
                success=False,
                error_message="模型未初始化,请先调用initialize() 方法",
                analysis_performed=False
                analysis_performed=False,
            )

        try:

@@ -275,27 +302,29 @@ class WeiboMultilingualSentimentAnalyzer:
                    probability_distribution={},
                    success=False,
                    error_message="输入文本为空或无效内容",
                    analysis_performed=False
                    analysis_performed=False,
                )

            assert self.tokenizer is not None
            # 分词编码
            inputs = self.tokenizer(
                processed_text,
                max_length=512,
                padding=True,
                truncation=True,
                return_tensors='pt'
                return_tensors="pt",
            )

            # 转移到设备
            inputs = {k: v.to(self.device) for k, v in inputs.items()}

            # 预测
            assert torch is not None
            assert self.model is not None
            with torch.no_grad():
                outputs = self.model(**inputs)
                logits = outputs.logits
                probabilities = torch.softmax(logits, dim=1)
                prediction = torch.argmax(probabilities, dim=1).item()
                prediction = int(torch.argmax(probabilities, dim=1).item())

            # 构建结果
            confidence = probabilities[0][prediction].item()

@@ -311,7 +340,7 @@ class WeiboMultilingualSentimentAnalyzer:
                sentiment_label=label,
                confidence=confidence,
                probability_distribution=prob_dist,
                success=True
                success=True,
            )

        except Exception as e:

@@ -322,17 +351,19 @@ class WeiboMultilingualSentimentAnalyzer:
                probability_distribution={},
                success=False,
                error_message=f"预测时发生错误: {str(e)}",
                analysis_performed=False
                analysis_performed=False,
            )

    def analyze_batch(self, texts: List[str], show_progress: bool = True) -> BatchSentimentResult:
    def analyze_batch(
        self, texts: List[str], show_progress: bool = True
    ) -> BatchSentimentResult:
        """
        批量情感分析

        Args:
            texts: 文本列表
            show_progress: 是否显示进度

        Returns:
            BatchSentimentResult对象
        """

@@ -343,9 +374,9 @@ class WeiboMultilingualSentimentAnalyzer:
                success_count=0,
                failed_count=0,
                average_confidence=0.0,
                analysis_performed=not self.is_disabled and self.is_initialized
                analysis_performed=not self.is_disabled and self.is_initialized,
            )

        if self.is_disabled or not self.is_initialized:
            passthrough_results = [
                SentimentResult(

@@ -355,7 +386,7 @@ class WeiboMultilingualSentimentAnalyzer:
                    probability_distribution={},
                    success=False,
                    error_message=self.disable_reason or "情感分析功能不可用",
                    analysis_performed=False
                    analysis_performed=False,
                )
                for text in texts
            ]

@@ -365,42 +396,44 @@ class WeiboMultilingualSentimentAnalyzer:
                success_count=0,
                failed_count=len(texts),
                average_confidence=0.0,
                analysis_performed=False
                analysis_performed=False,
            )

        results = []
        success_count = 0
        total_confidence = 0.0

        for i, text in enumerate(texts):
            if show_progress and len(texts) > 1:
                print(f"处理进度: {i+1}/{len(texts)}")

                print(f"处理进度: {i + 1}/{len(texts)}")

            result = self.analyze_single_text(text)
            results.append(result)

            if result.success:
                success_count += 1
                total_confidence += result.confidence

        average_confidence = total_confidence / success_count if success_count > 0 else 0.0

        average_confidence = (
            total_confidence / success_count if success_count > 0 else 0.0
        )
        failed_count = len(texts) - success_count

        return BatchSentimentResult(
            results=results,
            total_processed=len(texts),
            success_count=success_count,
            failed_count=failed_count,
            average_confidence=average_confidence,
            analysis_performed=True
            analysis_performed=True,
        )

    def _build_passthrough_analysis(
        self,
        original_data: List[Dict[str, Any]],
        reason: str,
        texts: Optional[List[str]] = None,
        results: Optional[List[SentimentResult]] = None
        results: Optional[List[SentimentResult]] = None,
    ) -> Dict[str, Any]:
        """
        构建在情感分析不可用时的透传结果

@@ -416,33 +449,36 @@ class WeiboMultilingualSentimentAnalyzer:
                "sentiment_distribution": {},
                "high_confidence_results": [],
                "summary": f"情感分析未执行:{reason}",
                "original_texts": original_data
                "original_texts": original_data,
            }
        }

        if texts is not None:
            response["sentiment_analysis"]["passthrough_texts"] = texts

        if results is not None:
            response["sentiment_analysis"]["results"] = [
                result.__dict__ if isinstance(result, SentimentResult) else result
                for result in results
            ]

        return response

    def analyze_query_results(self, query_results: List[Dict[str, Any]],
                              text_field: str = "content",
                              min_confidence: float = 0.5) -> Dict[str, Any]:

    def analyze_query_results(
        self,
        query_results: List[Dict[str, Any]],
        text_field: str = "content",
        min_confidence: float = 0.5,
    ) -> Dict[str, Any]:
        """
        对查询结果进行情感分析
        专门用于分析从MediaCrawlerDB返回的查询结果

        Args:
            query_results: 查询结果列表,每个元素包含文本内容
            text_field: 文本内容字段名,默认为"content"
            min_confidence: 最小置信度阈值

        Returns:
            包含情感分析结果的字典
        """

@@ -452,14 +488,14 @@ class WeiboMultilingualSentimentAnalyzer:
                    "total_analyzed": 0,
                    "sentiment_distribution": {},
                    "high_confidence_results": [],
                    "summary": "没有内容需要分析"
                    "summary": "没有内容需要分析",
                }
            }

        # 提取文本内容
        texts_to_analyze = []
        original_data = []

        for item in query_results:
            # 尝试多个可能的文本字段
            text_content = ""

@@ -467,49 +503,52 @@ class WeiboMultilingualSentimentAnalyzer:
                if field in item and item[field]:
                    text_content = str(item[field])
                    break

            if text_content.strip():
                texts_to_analyze.append(text_content)
                original_data.append(item)

        if not texts_to_analyze:
            return {
                "sentiment_analysis": {
                    "total_analyzed": 0,
                    "sentiment_distribution": {},
                    "high_confidence_results": [],
                    "summary": "查询结果中没有找到可分析的文本内容"
                    "summary": "查询结果中没有找到可分析的文本内容",
                }
            }

        if self.is_disabled:
            return self._build_passthrough_analysis(
                original_data=original_data,
                reason=self.disable_reason or "情感分析模型不可用",
                texts=texts_to_analyze
                texts=texts_to_analyze,
            )

        # 执行批量情感分析
        print(f"正在对{len(texts_to_analyze)}条内容进行情感分析...")
        batch_result = self.analyze_batch(texts_to_analyze, show_progress=True)

        if not batch_result.analysis_performed:
            reason = self.disable_reason or "情感分析功能不可用"
            if batch_result.results:
                candidate_error = next((r.error_message for r in batch_result.results if r.error_message), None)
                candidate_error = next(
                    (r.error_message for r in batch_result.results if r.error_message),
                    None,
                )
                if candidate_error:
                    reason = candidate_error
            return self._build_passthrough_analysis(
                original_data=original_data,
                reason=reason,
                texts=texts_to_analyze,
                results=batch_result.results
                results=batch_result.results,
            )

        # 统计情感分布
        sentiment_distribution = {}
        high_confidence_results = []

        for result, original_item in zip(batch_result.results, original_data):
            if result.success:
                # 统计情感分布

@@ -517,24 +556,28 @@ class WeiboMultilingualSentimentAnalyzer:
                if sentiment not in sentiment_distribution:
                    sentiment_distribution[sentiment] = 0
                sentiment_distribution[sentiment] += 1

                # 收集高置信度结果
                if result.confidence >= min_confidence:
                    high_confidence_results.append({
                        "original_data": original_item,
                        "sentiment": result.sentiment_label,
                        "confidence": result.confidence,
                        "text_preview": result.text[:100] + "..." if len(result.text) > 100 else result.text
                    })

                    high_confidence_results.append(
                        {
                            "original_data": original_item,
                            "sentiment": result.sentiment_label,
                            "confidence": result.confidence,
                            "text_preview": result.text[:100] + "..."
                            if len(result.text) > 100
                            else result.text,
                        }
                    )

        # 生成情感分析摘要
        total_analyzed = batch_result.success_count
        if total_analyzed > 0:
            dominant_sentiment = max(sentiment_distribution.items(), key=lambda x: x[1])
            sentiment_summary = f"共分析{total_analyzed}条内容,主要情感倾向为'{dominant_sentiment[0]}'({dominant_sentiment[1]}条,占{dominant_sentiment[1]/total_analyzed*100:.1f}%)"
            sentiment_summary = f"共分析{total_analyzed}条内容,主要情感倾向为'{dominant_sentiment[0]}'({dominant_sentiment[1]}条,占{dominant_sentiment[1] / total_analyzed * 100:.1f}%)"
        else:
            sentiment_summary = "情感分析失败"

        return {
            "sentiment_analysis": {
                "total_analyzed": total_analyzed,

@@ -542,28 +585,46 @@ class WeiboMultilingualSentimentAnalyzer:
                "average_confidence": round(batch_result.average_confidence, 4),
                "sentiment_distribution": sentiment_distribution,
                "high_confidence_results": high_confidence_results,  # 返回所有高置信度结果,不做限制
                "summary": sentiment_summary
                "summary": sentiment_summary,
            }
        }

    def get_model_info(self) -> Dict[str, Any]:
        """
        获取模型信息

        Returns:
            模型信息字典
        """
        return {
            "model_name": "tabularisai/multilingual-sentiment-analysis",
            "supported_languages": [
                "中文", "英文", "西班牙文", "阿拉伯文", "日文", "韩文",
                "德文", "法文", "意大利文", "葡萄牙文", "俄文", "荷兰文",
                "波兰文", "土耳其文", "丹麦文", "希腊文", "芬兰文",
                "瑞典文", "挪威文", "匈牙利文", "捷克文", "保加利亚文"
                "中文",
                "英文",
                "西班牙文",
                "阿拉伯文",
                "日文",
                "韩文",
                "德文",
                "法文",
                "意大利文",
                "葡萄牙文",
                "俄文",
                "荷兰文",
                "波兰文",
                "土耳其文",
                "丹麦文",
                "希腊文",
                "芬兰文",
                "瑞典文",
                "挪威文",
                "匈牙利文",
                "捷克文",
                "保加利亚文",
            ],
            "sentiment_levels": list(self.sentiment_map.values()),
            "is_initialized": self.is_initialized,
            "device": str(self.device) if self.device else "未设置"
            "device": str(self.device) if self.device else "未设置",
        }

@@ -576,20 +637,23 @@ def enable_sentiment_analysis() -> bool:
    return multilingual_sentiment_analyzer.enable()


def disable_sentiment_analysis(reason: Optional[str] = None, drop_state: bool = False) -> None:
def disable_sentiment_analysis(
    reason: Optional[str] = None, drop_state: bool = False
) -> None:
    """Public helper to disable sentiment analysis at runtime."""
    multilingual_sentiment_analyzer.disable(reason=reason, drop_state=drop_state)


def analyze_sentiment(text_or_texts: Union[str, List[str]],
                      initialize_if_needed: bool = True) -> Union[SentimentResult, BatchSentimentResult]:
def analyze_sentiment(
    text_or_texts: Union[str, List[str]], initialize_if_needed: bool = True
) -> Union[SentimentResult, BatchSentimentResult]:
    """
    便捷的情感分析函数

    Args:
        text_or_texts: 单个文本或文本列表
        initialize_if_needed: 如果模型未初始化,是否自动初始化

    Returns:
        SentimentResult或BatchSentimentResult
    """

@@ -599,7 +663,7 @@ def analyze_sentiment(text_or_texts: Union[str, List[str]],
        and not multilingual_sentiment_analyzer.is_disabled
    ):
        multilingual_sentiment_analyzer.initialize()

    if isinstance(text_or_texts, str):
        return multilingual_sentiment_analyzer.analyze_single_text(text_or_texts)
    else:

@@ -610,24 +674,30 @@ def analyze_sentiment(text_or_texts: Union[str, List[str]],
if __name__ == "__main__":
    # 测试代码
    analyzer = WeiboMultilingualSentimentAnalyzer()

    if analyzer.initialize():
        # 测试单个文本
        result = analyzer.analyze_single_text("今天天气真好,心情特别棒!")
        print(f"单个文本分析: {result.sentiment_label} (置信度: {result.confidence:.4f})")

        print(
            f"单个文本分析: {result.sentiment_label} (置信度: {result.confidence:.4f})"
        )

        # 测试批量文本
        test_texts = [
            "这家餐厅的菜味道非常棒!",
            "服务态度太差了,很失望",
            "I absolutely love this product!",
            "The customer service was disappointing."
            "The customer service was disappointing.",
        ]

        batch_result = analyzer.analyze_batch(test_texts)
        print(f"\n批量分析: 成功 {batch_result.success_count}/{batch_result.total_processed}")

        print(
            f"\n批量分析: 成功 {batch_result.success_count}/{batch_result.total_processed}"
        )

        for result in batch_result.results:
            print(f"'{result.text[:30]}...' -> {result.sentiment_label} ({result.confidence:.4f})")
            print(
                f"'{result.text[:30]}...' -> {result.sentiment_label} ({result.confidence:.4f})"
            )
    else:
        print("模型初始化失败,无法进行测试")
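For orientation, here is a minimal usage sketch of the analyzer refactored above. The compare view does not show this module's file name, so the import path below is an assumption; the input rows merely mimic the MediaCrawlerDB-style records that analyze_query_results() expects.

```python
# Hypothetical import path -- the compare view does not show this module's file name.
from multilingual_sentiment import WeiboMultilingualSentimentAnalyzer, analyze_sentiment

# Rows shaped like the MediaCrawlerDB query results that analyze_query_results() expects.
rows = [
    {"content": "这家餐厅的菜味道非常棒!", "id": 1},
    {"content": "The customer service was disappointing.", "id": 2},
]

analyzer = WeiboMultilingualSentimentAnalyzer()
if analyzer.initialize():
    report = analyzer.analyze_query_results(rows, text_field="content", min_confidence=0.5)
    stats = report["sentiment_analysis"]
    print(stats["summary"])
    print(stats["sentiment_distribution"])
else:
    # A failed initialize() disables the analyzer, so the same call degrades to the
    # passthrough payload built by _build_passthrough_analysis().
    print(analyzer.analyze_query_results(rows)["sentiment_analysis"]["summary"])

# The module-level helper wraps the same flow for a plain string or a list of strings.
single = analyze_sentiment("今天天气真好,心情特别棒!")
print(single.sentiment_label, single.confidence)
```
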
@@ -19,7 +19,7 @@ from .nodes import (
    ReportFormattingNode
)
from .state import State
from .tools import TavilyNewsAgency, TavilyResponse
from .tools import TavilyNewsAgency, TavilyResponse, SearchResult, ImageResult
from .utils import Settings, format_search_results_for_prompt
from loguru import logger

@@ -97,6 +97,121 @@ class DeepSearchAgent:
        except ValueError:
            return False

    def _normalize_search_response(self, search_response: Any) -> Optional[TavilyResponse]:
        """
        规范化搜索响应,处理不同类型的返回值

        此方法解决了当搜索API返回dict而不是TavilyResponse对象时的类型错误问题。
        同时提供错误恢复机制,确保系统在异常情况下仍能继续运行。

        Args:
            search_response: 可能是TavilyResponse对象、dict或None

        Returns:
            规范化后的TavilyResponse对象,如果无法规范化则返回None
        """
        if search_response is None:
            return None

        # 如果已经是TavilyResponse对象,直接返回
        if isinstance(search_response, TavilyResponse):
            return search_response

        # 如果是dict,尝试转换为TavilyResponse对象
        if isinstance(search_response, dict):
            try:
                # 从dict中提取结果
                results = []
                if 'results' in search_response:
                    for item in search_response['results']:
                        if isinstance(item, dict):
                            results.append(SearchResult(
                                title=item.get('title', ''),
                                url=item.get('url', ''),
                                content=item.get('content', ''),
                                score=item.get('score'),
                                raw_content=item.get('raw_content'),
                                published_date=item.get('published_date')
                            ))
                        elif hasattr(item, '__dict__'):
                            # 如果已经是SearchResult对象
                            results.append(item)

                images = []
                if 'images' in search_response:
                    for item in search_response['images']:
                        if isinstance(item, dict):
                            images.append(ImageResult(
                                url=item.get('url', ''),
                                description=item.get('description')
                            ))
                        elif hasattr(item, '__dict__'):
                            images.append(item)

                return TavilyResponse(
                    query=search_response.get('query', ''),
                    answer=search_response.get('answer'),
                    results=results,
                    images=images,
                    response_time=search_response.get('response_time')
                )
            except Exception as e:
                logger.warning(f"无法规范化搜索响应为TavilyResponse对象: {str(e)}")
                logger.debug(f"原始响应类型: {type(search_response)}, 内容: {search_response}")
                return None

        # 其他类型,记录警告并返回None
        logger.warning(f"未知的搜索响应类型: {type(search_response)}")
        return None

    def _validate_and_filter_search_results(self, results: List[SearchResult], query: str) -> List[SearchResult]:
        """
        验证和过滤搜索结果,防止幻觉问题

        此方法实现了搜索结果验证机制,通过以下方式防止AI生成虚假引用:
        1. 验证URL有效性
        2. 检查内容相关性
        3. 过滤空结果和无效数据

        Args:
            results: 原始搜索结果列表
            query: 搜索查询,用于相关性验证

        Returns:
            验证和过滤后的搜索结果列表
        """
        if not results:
            return []

        validated_results = []
        query_lower = query.lower()
        query_keywords = set(query_lower.split())

        for result in results:
            # 跳过空结果
            if not result or not result.title:
                continue

            # 验证URL格式
            if result.url and not (result.url.startswith('http://') or result.url.startswith('https://')):
                logger.debug(f"跳过无效URL: {result.url}")
                continue

            # 基本相关性检查:至少包含一个查询关键词
            title_lower = result.title.lower()
            content_lower = (result.content or '').lower()

            # 检查标题或内容中是否包含查询关键词
            has_relevance = any(keyword in title_lower or keyword in content_lower
                                for keyword in query_keywords if len(keyword) > 2)

            if has_relevance or len(query_keywords) == 0:
                validated_results.append(result)
            else:
                logger.debug(f"过滤低相关性结果: {result.title[:50]}...")

        return validated_results

    def execute_search_tool(self, tool_name: str, query: str, **kwargs) -> TavilyResponse:
        """
        执行指定的搜索工具

@@ -260,12 +375,20 @@ class DeepSearchAgent:

        search_response = self.execute_search_tool(search_tool, search_query, **search_kwargs)

        # 规范化搜索响应,处理dict类型返回值
        normalized_response = self._normalize_search_response(search_response)

        # 转换为兼容格式
        search_results = []
        if search_response and search_response.results:
        if normalized_response and normalized_response.results:
            # 验证和过滤搜索结果,防止幻觉问题
            validated_results = self._validate_and_filter_search_results(
                normalized_response.results, search_query
            )

            # 每种搜索工具都有其特定的结果数量,这里取前10个作为上限
            max_results = min(len(search_response.results), 10)
            for result in search_response.results[:max_results]:
            max_results = min(len(validated_results), 10)
            for result in validated_results[:max_results]:
                search_results.append({
                    'title': result.title,
                    'url': result.url,

@@ -351,12 +474,20 @@ class DeepSearchAgent:

        search_response = self.execute_search_tool(search_tool, search_query, **search_kwargs)

        # 规范化搜索响应,处理dict类型返回值
        normalized_response = self._normalize_search_response(search_response)

        # 转换为兼容格式
        search_results = []
        if search_response and search_response.results:
        if normalized_response and normalized_response.results:
            # 验证和过滤搜索结果,防止幻觉问题
            validated_results = self._validate_and_filter_search_results(
                normalized_response.results, search_query
            )

            # 每种搜索工具都有其特定的结果数量,这里取前10个作为上限
            max_results = min(len(search_response.results), 10)
            for result in search_response.results[:max_results]:
            max_results = min(len(validated_results), 10)
            for result in validated_results[:max_results]:
                search_results.append({
                    'title': result.title,
                    'url': result.url,
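A rough sketch of what the two new helpers buy, assuming an already-constructed DeepSearchAgent instance (its constructor is not part of this diff); the private methods are called directly here only for illustration.

```python
# Assumes `agent` is an existing DeepSearchAgent instance; its construction is outside this diff.
raw = {
    "query": "量子计算 进展",
    "results": [
        {"title": "量子计算新突破", "url": "https://example.com/a", "content": "……", "score": 0.91},
        {"title": "Unrelated cooking blog", "url": "ftp://bad-scheme", "content": ""},
    ],
    "images": [],
    "response_time": 1.2,
}

# A plain dict payload is coerced into a TavilyResponse instead of failing with attribute errors downstream.
normalized = agent._normalize_search_response(raw)

if normalized is not None:
    # Results with a non-http(s) URL or no keyword overlap with the query are dropped.
    kept = agent._validate_and_filter_search_results(normalized.results, "量子计算 进展")
    for r in kept:
        print(r.title, r.url)
```
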
@@ -19,7 +19,7 @@
[](https://hub.docker.com/)


[English](./README-EN.md) | [中文文档](./README.md)
English | [中文文档](./README.md)

</div>


@@ -20,7 +20,7 @@



[English](./README-EN.md) | [中文文档](./README.md)
[English](./README-EN.md) | 中文文档

</div>

app.py (3)

@@ -554,7 +554,8 @@ def read_process_output(process, app_name):
            })

        except Exception as e:
            logger.exception(f"Error reading output for {app_name}: {e}")
            error_msg = f"Error reading output for {app_name}: {e}"
            logger.exception(error_msg)
            write_log_to_file(app_name, f"[{datetime.now().strftime('%H:%M:%S')}] {error_msg}")
            break
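The app.py hunk above mirrors the reader-thread error into write_log_to_file() in addition to the logger. That helper is defined elsewhere in app.py and is not shown in this compare; the sketch below is only an assumption about its shape, inferred from the call site (an app name plus an already-timestamped line) and from the logs/ directory the Dockerfile creates.

```python
from pathlib import Path

LOG_DIR = Path("logs")  # assumed location; the Dockerfile above pre-creates a logs/ directory


def write_log_to_file(app_name: str, line: str) -> None:
    """Append one already-timestamped line to a per-app log file (assumed behavior)."""
    LOG_DIR.mkdir(parents=True, exist_ok=True)
    with open(LOG_DIR / f"{app_name}.log", "a", encoding="utf-8") as fh:
        fh.write(line.rstrip("\n") + "\n")
```
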