Compare commits

14 Commits

| Author | SHA1 | Date |
|---|---|---|
| | a7180c8259 | |
| | 47d11ae529 | |
| | dd92099951 | |
| | 8a76e128c4 | |
| | 7eb60ea1d4 | |
| | a2a52f0b83 | |
| | 4d358c9e3b | |
| | db6bcffb5d | |
| | e87a737808 | |
| | 1ae9d7d8e2 | |
| | d83e00d087 | |
| | 6ac843b87e | |
| | b340db3415 | |
| | 17afef4abb | |
Dockerfile-CN (new file, 79 lines)
@@ -0,0 +1,79 @@
FROM python:3.11-slim

SHELL ["/bin/bash", "-o", "pipefail", "-c"]

# Satisfy the Chinese Mainland Internet environment
ENV PIP_INDEX_URL=https://mirrors.aliyun.com/pypi/simple/ \
    PIP_TRUSTED_HOST=mirrors.aliyun.com

RUN sed -i 's/deb.debian.org/mirrors.ustc.edu.cn/g' /etc/apt/sources.list.d/debian.sources

# Prevent Python from writing .pyc files, buffer stdout/stderr, and pin common tooling paths
ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    PIP_NO_CACHE_DIR=1 \
    PATH="/root/.local/bin:${PATH}" \
    PLAYWRIGHT_BROWSERS_PATH=/ms-playwright

# Install system dependencies required by the scientific Python stack, Playwright, and Streamlit
RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    curl \
    git \
    libgl1 \
    libglib2.0-0 \
    libgtk-3-0 \
    libpango-1.0-0 \
    libpangocairo-1.0-0 \
    libatk1.0-0 \
    libatk-bridge2.0-0 \
    libxcb1 \
    libxcomposite1 \
    libxdamage1 \
    libxext6 \
    libxfixes3 \
    libxi6 \
    libxtst6 \
    libnss3 \
    libxrandr2 \
    libxkbcommon0 \
    libasound2 \
    libx11-xcb1 \
    libxshmfence1 \
    libgbm1 \
    ffmpeg \
    tar \
    && apt-get clean && rm -rf /var/lib/apt/lists/*

# Install a pinned uv release and expose it on PATH
# Use a GitHub accelerated source to download the uv package (avoids direct access to astral.sh)
RUN curl -LsSf "https://wget.la/https://github.com/astral-sh/uv/releases/download/0.9.7/uv-x86_64-unknown-linux-musl.tar.gz" \
    | tar -xz -C /usr/local/bin --strip-components=1 && \
    uv --version

# Set the uv pip mirror for the Chinese Mainland Internet environment
ENV UV_INDEX_URL="https://mirrors.aliyun.com/pypi/simple/" \
    UV_TRUSTED_HOST="mirrors.aliyun.com"

WORKDIR /app

# Install Python dependencies first to leverage Docker layer caching
COPY requirements.txt ./
RUN uv pip install --system -r requirements.txt

# Install Playwright browser binaries (system deps already handled above)
RUN python -m playwright install chromium

# Seed .env from the example template
COPY .env.example .env

# Copy application source
COPY . .

# Ensure runtime directories exist even if ignored in build context
RUN mkdir -p /ms-playwright logs final_reports insight_engine_streamlit_reports media_engine_streamlit_reports query_engine_streamlit_reports

EXPOSE 5000 8501 8502 8503

# Default command launches the Flask orchestrator which starts Streamlit agents
CMD ["python", "app.py"]

@@ -11,13 +11,16 @@ import re

try:
    import torch

    TORCH_AVAILABLE = True
    torch.classes.__path__ = []
except ImportError:
    torch = None  # type: ignore
    TORCH_AVAILABLE = False

try:
    from transformers import AutoTokenizer, AutoModelForSequenceClassification

    TRANSFORMERS_AVAILABLE = True
except ImportError:
    AutoTokenizer = None  # type: ignore

@@ -28,6 +31,7 @@ except ImportError:

# INFO:若想跳过情感分析,可手动切换此开关为False
SENTIMENT_ANALYSIS_ENABLED = True


def _describe_missing_dependencies() -> str:
    missing = []
    if not TORCH_AVAILABLE:

@@ -36,14 +40,21 @@ def _describe_missing_dependencies() -> str:
        missing.append("Transformers")
    return " / ".join(missing)


# 添加项目根目录到路径,以便导入WeiboMultilingualSentiment
project_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
weibo_sentiment_path = os.path.join(project_root, "SentimentAnalysisModel", "WeiboMultilingualSentiment")
project_root = os.path.dirname(
    os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
)
weibo_sentiment_path = os.path.join(
    project_root, "SentimentAnalysisModel", "WeiboMultilingualSentiment"
)
sys.path.append(weibo_sentiment_path)


@dataclass
class SentimentResult:
    """情感分析结果数据类"""

    text: str
    sentiment_label: str
    confidence: float

@@ -53,9 +64,10 @@ class SentimentResult:
    analysis_performed: bool = True


@dataclass
@dataclass
class BatchSentimentResult:
    """批量情感分析结果数据类"""

    results: List[SentimentResult]
    total_processed: int
    success_count: int

@@ -69,7 +81,7 @@ class WeiboMultilingualSentimentAnalyzer:
    多语言情感分析器
    封装WeiboMultilingualSentiment模型,为AI Agent提供情感分析功能
    """

    def __init__(self):
        """初始化情感分析器"""
        self.model = None

@@ -78,14 +90,14 @@ class WeiboMultilingualSentimentAnalyzer:
        self.is_initialized = False
        self.is_disabled = False
        self.disable_reason: Optional[str] = None

        # 情感标签映射(5级分类)
        self.sentiment_map = {
            0: "非常负面",
            1: "负面",
            2: "中性",
            3: "正面",
            4: "非常正面"
            0: "非常负面",
            1: "负面",
            2: "中性",
            3: "正面",
            4: "非常正面",
        }

        if not SENTIMENT_ANALYSIS_ENABLED:

@@ -96,9 +108,13 @@ class WeiboMultilingualSentimentAnalyzer:

        if self.is_disabled:
            reason = self.disable_reason or "Sentiment analysis disabled."
            print(f"WeiboMultilingualSentimentAnalyzer initialized but disabled: {reason}")
            print(
                f"WeiboMultilingualSentimentAnalyzer initialized but disabled: {reason}"
            )
        else:
            print("WeiboMultilingualSentimentAnalyzer 已创建,调用 initialize() 来加载模型")
            print(
                "WeiboMultilingualSentimentAnalyzer 已创建,调用 initialize() 来加载模型"
            )

    def disable(self, reason: Optional[str] = None, drop_state: bool = False) -> None:
        """Disable sentiment analysis, optionally clearing loaded resources."""

@@ -127,17 +143,22 @@ class WeiboMultilingualSentimentAnalyzer:
        """Select the best available torch device."""
        if not TORCH_AVAILABLE:
            return None
        assert torch is not None
        if torch.cuda.is_available():
            return torch.device("cuda")
        mps_backend = getattr(torch.backends, "mps", None)
        if mps_backend and getattr(mps_backend, "is_available", lambda: False)() and getattr(mps_backend, "is_built", lambda: False)():
        if (
            mps_backend
            and getattr(mps_backend, "is_available", lambda: False)()
            and getattr(mps_backend, "is_built", lambda: False)()
        ):
            return torch.device("mps")
        return torch.device("cpu")

    def initialize(self) -> bool:
        """
        初始化模型和分词器

        Returns:
            是否初始化成功
        """

@@ -155,31 +176,37 @@ class WeiboMultilingualSentimentAnalyzer:
        if self.is_initialized:
            print("模型已经初始化,无需重复加载")
            return True

        try:
            print("正在加载多语言情感分析模型...")

            assert AutoTokenizer is not None
            assert AutoModelForSequenceClassification is not None

            # 使用多语言情感分析模型
            model_name = "tabularisai/multilingual-sentiment-analysis"
            local_model_path = os.path.join(weibo_sentiment_path, "model")

            # 检查本地是否已有模型
            if os.path.exists(local_model_path):
                print("从本地加载模型...")
                self.tokenizer = AutoTokenizer.from_pretrained(local_model_path)
                self.model = AutoModelForSequenceClassification.from_pretrained(local_model_path)
                self.model = AutoModelForSequenceClassification.from_pretrained(
                    local_model_path
                )
            else:
                print("首次使用,正在下载模型到本地...")
                # 下载并保存到本地
                self.tokenizer = AutoTokenizer.from_pretrained(model_name)
                self.model = AutoModelForSequenceClassification.from_pretrained(model_name)

                self.model = AutoModelForSequenceClassification.from_pretrained(
                    model_name
                )

                # 保存到本地
                os.makedirs(local_model_path, exist_ok=True)
                self.tokenizer.save_pretrained(local_model_path)
                self.model.save_pretrained(local_model_path)
                print(f"模型已保存到: {local_model_path}")

            # 设置设备
            device = self._select_device()
            if device is None:

@@ -198,46 +225,46 @@ class WeiboMultilingualSentimentAnalyzer:
                print("检测到 Apple MPS 设备,已使用 MPS 进行推理。")
            else:
                print("未检测到 GPU,自动使用 CPU 进行推理。")

            print(f"模型加载成功! 使用设备: {self.device}")
            print("支持语言: 中文、英文、西班牙文、阿拉伯文、日文、韩文等22种语言")
            print("情感等级: 非常负面、负面、中性、正面、非常正面")

            return True

        except Exception as e:
            error_message = f"模型加载失败: {e}"
            print(error_message)
            print("请检查网络连接或模型文件")
            self.disable(error_message, drop_state=True)
            return False

    def _preprocess_text(self, text: str) -> str:
        """
        文本预处理

        Args:
            text: 输入文本

        Returns:
            处理后的文本
        """
        # 基本文本清理
        if not text or not text.strip():
            return ""

        # 去除多余空格
        text = re.sub(r'\s+', ' ', text.strip())

        text = re.sub(r"\s+", " ", text.strip())

        return text

    def analyze_single_text(self, text: str) -> SentimentResult:
        """
        对单个文本进行情感分析

        Args:
            text: 要分析的文本

        Returns:
            SentimentResult对象
        """

@@ -249,7 +276,7 @@ class WeiboMultilingualSentimentAnalyzer:
                probability_distribution={},
                success=False,
                error_message=self.disable_reason or "情感分析功能已禁用",
                analysis_performed=False
                analysis_performed=False,
            )

        if not self.is_initialized:

@@ -260,7 +287,7 @@ class WeiboMultilingualSentimentAnalyzer:
                probability_distribution={},
                success=False,
                error_message="模型未初始化,请先调用initialize() 方法",
                analysis_performed=False
                analysis_performed=False,
            )

        try:

@@ -275,27 +302,29 @@ class WeiboMultilingualSentimentAnalyzer:
                    probability_distribution={},
                    success=False,
                    error_message="输入文本为空或无效内容",
                    analysis_performed=False
                    analysis_performed=False,
                )

            assert self.tokenizer is not None
            # 分词编码
            inputs = self.tokenizer(
                processed_text,
                max_length=512,
                padding=True,
                truncation=True,
                return_tensors='pt'
                return_tensors="pt",
            )

            # 转移到设备
            inputs = {k: v.to(self.device) for k, v in inputs.items()}

            # 预测
            assert torch is not None
            assert self.model is not None
            with torch.no_grad():
                outputs = self.model(**inputs)
                logits = outputs.logits
                probabilities = torch.softmax(logits, dim=1)
                prediction = torch.argmax(probabilities, dim=1).item()
                prediction = int(torch.argmax(probabilities, dim=1).item())

            # 构建结果
            confidence = probabilities[0][prediction].item()

@@ -311,7 +340,7 @@ class WeiboMultilingualSentimentAnalyzer:
                sentiment_label=label,
                confidence=confidence,
                probability_distribution=prob_dist,
                success=True
                success=True,
            )

        except Exception as e:

@@ -322,17 +351,19 @@ class WeiboMultilingualSentimentAnalyzer:
                probability_distribution={},
                success=False,
                error_message=f"预测时发生错误: {str(e)}",
                analysis_performed=False
                analysis_performed=False,
            )

    def analyze_batch(self, texts: List[str], show_progress: bool = True) -> BatchSentimentResult:
    def analyze_batch(
        self, texts: List[str], show_progress: bool = True
    ) -> BatchSentimentResult:
        """
        批量情感分析

        Args:
            texts: 文本列表
            show_progress: 是否显示进度

        Returns:
            BatchSentimentResult对象
        """

@@ -343,9 +374,9 @@ class WeiboMultilingualSentimentAnalyzer:
                success_count=0,
                failed_count=0,
                average_confidence=0.0,
                analysis_performed=not self.is_disabled and self.is_initialized
                analysis_performed=not self.is_disabled and self.is_initialized,
            )

        if self.is_disabled or not self.is_initialized:
            passthrough_results = [
                SentimentResult(

@@ -355,7 +386,7 @@ class WeiboMultilingualSentimentAnalyzer:
                    probability_distribution={},
                    success=False,
                    error_message=self.disable_reason or "情感分析功能不可用",
                    analysis_performed=False
                    analysis_performed=False,
                )
                for text in texts
            ]

@@ -365,42 +396,44 @@ class WeiboMultilingualSentimentAnalyzer:
                success_count=0,
                failed_count=len(texts),
                average_confidence=0.0,
                analysis_performed=False
                analysis_performed=False,
            )

        results = []
        success_count = 0
        total_confidence = 0.0

        for i, text in enumerate(texts):
            if show_progress and len(texts) > 1:
                print(f"处理进度: {i+1}/{len(texts)}")

                print(f"处理进度: {i + 1}/{len(texts)}")

            result = self.analyze_single_text(text)
            results.append(result)

            if result.success:
                success_count += 1
                total_confidence += result.confidence

        average_confidence = total_confidence / success_count if success_count > 0 else 0.0

        average_confidence = (
            total_confidence / success_count if success_count > 0 else 0.0
        )
        failed_count = len(texts) - success_count

        return BatchSentimentResult(
            results=results,
            total_processed=len(texts),
            success_count=success_count,
            failed_count=failed_count,
            average_confidence=average_confidence,
            analysis_performed=True
            analysis_performed=True,
        )

    def _build_passthrough_analysis(
        self,
        original_data: List[Dict[str, Any]],
        reason: str,
        texts: Optional[List[str]] = None,
        results: Optional[List[SentimentResult]] = None
        results: Optional[List[SentimentResult]] = None,
    ) -> Dict[str, Any]:
        """
        构建在情感分析不可用时的透传结果

@@ -416,33 +449,36 @@ class WeiboMultilingualSentimentAnalyzer:
                "sentiment_distribution": {},
                "high_confidence_results": [],
                "summary": f"情感分析未执行:{reason}",
                "original_texts": original_data
                "original_texts": original_data,
            }
        }

        if texts is not None:
            response["sentiment_analysis"]["passthrough_texts"] = texts

        if results is not None:
            response["sentiment_analysis"]["results"] = [
                result.__dict__ if isinstance(result, SentimentResult) else result
                for result in results
            ]

        return response

    def analyze_query_results(self, query_results: List[Dict[str, Any]],
                              text_field: str = "content",
                              min_confidence: float = 0.5) -> Dict[str, Any]:

    def analyze_query_results(
        self,
        query_results: List[Dict[str, Any]],
        text_field: str = "content",
        min_confidence: float = 0.5,
    ) -> Dict[str, Any]:
        """
        对查询结果进行情感分析
        专门用于分析从MediaCrawlerDB返回的查询结果

        Args:
            query_results: 查询结果列表,每个元素包含文本内容
            text_field: 文本内容字段名,默认为"content"
            min_confidence: 最小置信度阈值

        Returns:
            包含情感分析结果的字典
        """

@@ -452,14 +488,14 @@ class WeiboMultilingualSentimentAnalyzer:
                    "total_analyzed": 0,
                    "sentiment_distribution": {},
                    "high_confidence_results": [],
                    "summary": "没有内容需要分析"
                    "summary": "没有内容需要分析",
                }
            }

        # 提取文本内容
        texts_to_analyze = []
        original_data = []

        for item in query_results:
            # 尝试多个可能的文本字段
            text_content = ""

@@ -467,49 +503,52 @@ class WeiboMultilingualSentimentAnalyzer:
                if field in item and item[field]:
                    text_content = str(item[field])
                    break

            if text_content.strip():
                texts_to_analyze.append(text_content)
                original_data.append(item)

        if not texts_to_analyze:
            return {
                "sentiment_analysis": {
                    "total_analyzed": 0,
                    "sentiment_distribution": {},
                    "high_confidence_results": [],
                    "summary": "查询结果中没有找到可分析的文本内容"
                    "summary": "查询结果中没有找到可分析的文本内容",
                }
            }

        if self.is_disabled:
            return self._build_passthrough_analysis(
                original_data=original_data,
                reason=self.disable_reason or "情感分析模型不可用",
                texts=texts_to_analyze
                texts=texts_to_analyze,
            )

        # 执行批量情感分析
        print(f"正在对{len(texts_to_analyze)}条内容进行情感分析...")
        batch_result = self.analyze_batch(texts_to_analyze, show_progress=True)

        if not batch_result.analysis_performed:
            reason = self.disable_reason or "情感分析功能不可用"
            if batch_result.results:
                candidate_error = next((r.error_message for r in batch_result.results if r.error_message), None)
                candidate_error = next(
                    (r.error_message for r in batch_result.results if r.error_message),
                    None,
                )
                if candidate_error:
                    reason = candidate_error
            return self._build_passthrough_analysis(
                original_data=original_data,
                reason=reason,
                texts=texts_to_analyze,
                results=batch_result.results
                results=batch_result.results,
            )

        # 统计情感分布
        sentiment_distribution = {}
        high_confidence_results = []

        for result, original_item in zip(batch_result.results, original_data):
            if result.success:
                # 统计情感分布

@@ -517,24 +556,28 @@ class WeiboMultilingualSentimentAnalyzer:
                if sentiment not in sentiment_distribution:
                    sentiment_distribution[sentiment] = 0
                sentiment_distribution[sentiment] += 1

                # 收集高置信度结果
                if result.confidence >= min_confidence:
                    high_confidence_results.append({
                        "original_data": original_item,
                        "sentiment": result.sentiment_label,
                        "confidence": result.confidence,
                        "text_preview": result.text[:100] + "..." if len(result.text) > 100 else result.text
                    })

                    high_confidence_results.append(
                        {
                            "original_data": original_item,
                            "sentiment": result.sentiment_label,
                            "confidence": result.confidence,
                            "text_preview": result.text[:100] + "..."
                            if len(result.text) > 100
                            else result.text,
                        }
                    )

        # 生成情感分析摘要
        total_analyzed = batch_result.success_count
        if total_analyzed > 0:
            dominant_sentiment = max(sentiment_distribution.items(), key=lambda x: x[1])
            sentiment_summary = f"共分析{total_analyzed}条内容,主要情感倾向为'{dominant_sentiment[0]}'({dominant_sentiment[1]}条,占{dominant_sentiment[1]/total_analyzed*100:.1f}%)"
            sentiment_summary = f"共分析{total_analyzed}条内容,主要情感倾向为'{dominant_sentiment[0]}'({dominant_sentiment[1]}条,占{dominant_sentiment[1] / total_analyzed * 100:.1f}%)"
        else:
            sentiment_summary = "情感分析失败"

        return {
            "sentiment_analysis": {
                "total_analyzed": total_analyzed,

@@ -542,28 +585,46 @@ class WeiboMultilingualSentimentAnalyzer:
                "average_confidence": round(batch_result.average_confidence, 4),
                "sentiment_distribution": sentiment_distribution,
                "high_confidence_results": high_confidence_results,  # 返回所有高置信度结果,不做限制
                "summary": sentiment_summary
                "summary": sentiment_summary,
            }
        }

    def get_model_info(self) -> Dict[str, Any]:
        """
        获取模型信息

        Returns:
            模型信息字典
        """
        return {
            "model_name": "tabularisai/multilingual-sentiment-analysis",
            "supported_languages": [
                "中文", "英文", "西班牙文", "阿拉伯文", "日文", "韩文",
                "德文", "法文", "意大利文", "葡萄牙文", "俄文", "荷兰文",
                "波兰文", "土耳其文", "丹麦文", "希腊文", "芬兰文",
                "瑞典文", "挪威文", "匈牙利文", "捷克文", "保加利亚文"
                "中文",
                "英文",
                "西班牙文",
                "阿拉伯文",
                "日文",
                "韩文",
                "德文",
                "法文",
                "意大利文",
                "葡萄牙文",
                "俄文",
                "荷兰文",
                "波兰文",
                "土耳其文",
                "丹麦文",
                "希腊文",
                "芬兰文",
                "瑞典文",
                "挪威文",
                "匈牙利文",
                "捷克文",
                "保加利亚文",
            ],
            "sentiment_levels": list(self.sentiment_map.values()),
            "is_initialized": self.is_initialized,
            "device": str(self.device) if self.device else "未设置"
            "device": str(self.device) if self.device else "未设置",
        }

@@ -576,20 +637,23 @@ def enable_sentiment_analysis() -> bool:
    return multilingual_sentiment_analyzer.enable()


def disable_sentiment_analysis(reason: Optional[str] = None, drop_state: bool = False) -> None:
def disable_sentiment_analysis(
    reason: Optional[str] = None, drop_state: bool = False
) -> None:
    """Public helper to disable sentiment analysis at runtime."""
    multilingual_sentiment_analyzer.disable(reason=reason, drop_state=drop_state)


def analyze_sentiment(text_or_texts: Union[str, List[str]],
                      initialize_if_needed: bool = True) -> Union[SentimentResult, BatchSentimentResult]:
def analyze_sentiment(
    text_or_texts: Union[str, List[str]], initialize_if_needed: bool = True
) -> Union[SentimentResult, BatchSentimentResult]:
    """
    便捷的情感分析函数

    Args:
        text_or_texts: 单个文本或文本列表
        initialize_if_needed: 如果模型未初始化,是否自动初始化

    Returns:
        SentimentResult或BatchSentimentResult
    """

@@ -599,7 +663,7 @@ def analyze_sentiment(text_or_texts: Union[str, List[str]],
        and not multilingual_sentiment_analyzer.is_disabled
    ):
        multilingual_sentiment_analyzer.initialize()

    if isinstance(text_or_texts, str):
        return multilingual_sentiment_analyzer.analyze_single_text(text_or_texts)
    else:

@@ -610,24 +674,30 @@ def analyze_sentiment(text_or_texts: Union[str, List[str]],
if __name__ == "__main__":
    # 测试代码
    analyzer = WeiboMultilingualSentimentAnalyzer()

    if analyzer.initialize():
        # 测试单个文本
        result = analyzer.analyze_single_text("今天天气真好,心情特别棒!")
        print(f"单个文本分析: {result.sentiment_label} (置信度: {result.confidence:.4f})")

        print(
            f"单个文本分析: {result.sentiment_label} (置信度: {result.confidence:.4f})"
        )

        # 测试批量文本
        test_texts = [
            "这家餐厅的菜味道非常棒!",
            "服务态度太差了,很失望",
            "I absolutely love this product!",
            "The customer service was disappointing."
            "The customer service was disappointing.",
        ]

        batch_result = analyzer.analyze_batch(test_texts)
        print(f"\n批量分析: 成功 {batch_result.success_count}/{batch_result.total_processed}")

        print(
            f"\n批量分析: 成功 {batch_result.success_count}/{batch_result.total_processed}"
        )

        for result in batch_result.results:
            print(f"'{result.text[:30]}...' -> {result.sentiment_label} ({result.confidence:.4f})")
            print(
                f"'{result.text[:30]}...' -> {result.sentiment_label} ({result.confidence:.4f})"
            )
    else:
        print("模型初始化失败,无法进行测试")
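For orientation, here is a minimal usage sketch of the analyzer refactored above. The compare view does not show this module's file name, so the import path below is an assumption; the input rows merely mimic the MediaCrawlerDB-style records that analyze_query_results() expects.

```python
# Hypothetical import path -- the compare view does not show this module's file name.
from multilingual_sentiment import WeiboMultilingualSentimentAnalyzer, analyze_sentiment

# Rows shaped like the MediaCrawlerDB query results that analyze_query_results() expects.
rows = [
    {"content": "这家餐厅的菜味道非常棒!", "id": 1},
    {"content": "The customer service was disappointing.", "id": 2},
]

analyzer = WeiboMultilingualSentimentAnalyzer()
if analyzer.initialize():
    report = analyzer.analyze_query_results(rows, text_field="content", min_confidence=0.5)
    stats = report["sentiment_analysis"]
    print(stats["summary"])
    print(stats["sentiment_distribution"])
else:
    # A failed initialize() disables the analyzer, so the same call degrades to the
    # passthrough payload built by _build_passthrough_analysis().
    print(analyzer.analyze_query_results(rows)["sentiment_analysis"]["summary"])

# The module-level helper wraps the same flow for a plain string or a list of strings.
single = analyze_sentiment("今天天气真好,心情特别棒!")
print(single.sentiment_label, single.confidence)
```
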
@@ -19,7 +19,7 @@ from .nodes import (
    ReportFormattingNode
)
from .state import State
from .tools import TavilyNewsAgency, TavilyResponse
from .tools import TavilyNewsAgency, TavilyResponse, SearchResult, ImageResult
from .utils import Settings, format_search_results_for_prompt
from loguru import logger

@@ -97,6 +97,121 @@ class DeepSearchAgent:
        except ValueError:
            return False

    def _normalize_search_response(self, search_response: Any) -> Optional[TavilyResponse]:
        """
        规范化搜索响应,处理不同类型的返回值

        此方法解决了当搜索API返回dict而不是TavilyResponse对象时的类型错误问题。
        同时提供错误恢复机制,确保系统在异常情况下仍能继续运行。

        Args:
            search_response: 可能是TavilyResponse对象、dict或None

        Returns:
            规范化后的TavilyResponse对象,如果无法规范化则返回None
        """
        if search_response is None:
            return None

        # 如果已经是TavilyResponse对象,直接返回
        if isinstance(search_response, TavilyResponse):
            return search_response

        # 如果是dict,尝试转换为TavilyResponse对象
        if isinstance(search_response, dict):
            try:
                # 从dict中提取结果
                results = []
                if 'results' in search_response:
                    for item in search_response['results']:
                        if isinstance(item, dict):
                            results.append(SearchResult(
                                title=item.get('title', ''),
                                url=item.get('url', ''),
                                content=item.get('content', ''),
                                score=item.get('score'),
                                raw_content=item.get('raw_content'),
                                published_date=item.get('published_date')
                            ))
                        elif hasattr(item, '__dict__'):
                            # 如果已经是SearchResult对象
                            results.append(item)

                images = []
                if 'images' in search_response:
                    for item in search_response['images']:
                        if isinstance(item, dict):
                            images.append(ImageResult(
                                url=item.get('url', ''),
                                description=item.get('description')
                            ))
                        elif hasattr(item, '__dict__'):
                            images.append(item)

                return TavilyResponse(
                    query=search_response.get('query', ''),
                    answer=search_response.get('answer'),
                    results=results,
                    images=images,
                    response_time=search_response.get('response_time')
                )
            except Exception as e:
                logger.warning(f"无法规范化搜索响应为TavilyResponse对象: {str(e)}")
                logger.debug(f"原始响应类型: {type(search_response)}, 内容: {search_response}")
                return None

        # 其他类型,记录警告并返回None
        logger.warning(f"未知的搜索响应类型: {type(search_response)}")
        return None

    def _validate_and_filter_search_results(self, results: List[SearchResult], query: str) -> List[SearchResult]:
        """
        验证和过滤搜索结果,防止幻觉问题

        此方法实现了搜索结果验证机制,通过以下方式防止AI生成虚假引用:
        1. 验证URL有效性
        2. 检查内容相关性
        3. 过滤空结果和无效数据

        Args:
            results: 原始搜索结果列表
            query: 搜索查询,用于相关性验证

        Returns:
            验证和过滤后的搜索结果列表
        """
        if not results:
            return []

        validated_results = []
        query_lower = query.lower()
        query_keywords = set(query_lower.split())

        for result in results:
            # 跳过空结果
            if not result or not result.title:
                continue

            # 验证URL格式
            if result.url and not (result.url.startswith('http://') or result.url.startswith('https://')):
                logger.debug(f"跳过无效URL: {result.url}")
                continue

            # 基本相关性检查:至少包含一个查询关键词
            title_lower = result.title.lower()
            content_lower = (result.content or '').lower()

            # 检查标题或内容中是否包含查询关键词
            has_relevance = any(keyword in title_lower or keyword in content_lower
                                for keyword in query_keywords if len(keyword) > 2)

            if has_relevance or len(query_keywords) == 0:
                validated_results.append(result)
            else:
                logger.debug(f"过滤低相关性结果: {result.title[:50]}...")

        return validated_results

    def execute_search_tool(self, tool_name: str, query: str, **kwargs) -> TavilyResponse:
        """
        执行指定的搜索工具

@@ -260,12 +375,20 @@ class DeepSearchAgent:

        search_response = self.execute_search_tool(search_tool, search_query, **search_kwargs)

        # 规范化搜索响应,处理dict类型返回值
        normalized_response = self._normalize_search_response(search_response)

        # 转换为兼容格式
        search_results = []
        if search_response and search_response.results:
        if normalized_response and normalized_response.results:
            # 验证和过滤搜索结果,防止幻觉问题
            validated_results = self._validate_and_filter_search_results(
                normalized_response.results, search_query
            )

            # 每种搜索工具都有其特定的结果数量,这里取前10个作为上限
            max_results = min(len(search_response.results), 10)
            for result in search_response.results[:max_results]:
            max_results = min(len(validated_results), 10)
            for result in validated_results[:max_results]:
                search_results.append({
                    'title': result.title,
                    'url': result.url,

@@ -351,12 +474,20 @@ class DeepSearchAgent:

        search_response = self.execute_search_tool(search_tool, search_query, **search_kwargs)

        # 规范化搜索响应,处理dict类型返回值
        normalized_response = self._normalize_search_response(search_response)

        # 转换为兼容格式
        search_results = []
        if search_response and search_response.results:
        if normalized_response and normalized_response.results:
            # 验证和过滤搜索结果,防止幻觉问题
            validated_results = self._validate_and_filter_search_results(
                normalized_response.results, search_query
            )

            # 每种搜索工具都有其特定的结果数量,这里取前10个作为上限
            max_results = min(len(search_response.results), 10)
            for result in search_response.results[:max_results]:
            max_results = min(len(validated_results), 10)
            for result in validated_results[:max_results]:
                search_results.append({
                    'title': result.title,
                    'url': result.url,
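A rough sketch of what the two new helpers buy, assuming an already-constructed DeepSearchAgent instance (its constructor is not part of this diff); the private methods are called directly here only for illustration.

```python
# Assumes `agent` is an existing DeepSearchAgent instance; its construction is outside this diff.
raw = {
    "query": "量子计算 进展",
    "results": [
        {"title": "量子计算新突破", "url": "https://example.com/a", "content": "……", "score": 0.91},
        {"title": "Unrelated cooking blog", "url": "ftp://bad-scheme", "content": ""},
    ],
    "images": [],
    "response_time": 1.2,
}

# A plain dict payload is coerced into a TavilyResponse instead of failing with attribute errors downstream.
normalized = agent._normalize_search_response(raw)

if normalized is not None:
    # Results with a non-http(s) URL or no keyword overlap with the query are dropped.
    kept = agent._validate_and_filter_search_results(normalized.results, "量子计算 进展")
    for r in kept:
        print(r.title, r.url)
```
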
@@ -19,7 +19,7 @@
[](https://hub.docker.com/)


[English](./README-EN.md) | [中文文档](./README.md)
English | [中文文档](./README.md)

</div>


@@ -20,7 +20,7 @@



[English](./README-EN.md) | [中文文档](./README.md)
[English](./README-EN.md) | 中文文档

</div>

app.py (3)

@@ -554,7 +554,8 @@ def read_process_output(process, app_name):
            })

        except Exception as e:
            logger.exception(f"Error reading output for {app_name}: {e}")
            error_msg = f"Error reading output for {app_name}: {e}"
            logger.exception(error_msg)
            write_log_to_file(app_name, f"[{datetime.now().strftime('%H:%M:%S')}] {error_msg}")
            break
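The app.py hunk above mirrors the reader-thread error into write_log_to_file() in addition to the logger. That helper is defined elsewhere in app.py and is not shown in this compare; the sketch below is only an assumption about its shape, inferred from the call site (an app name plus an already-timestamped line) and from the logs/ directory the Dockerfile creates.

```python
from pathlib import Path

LOG_DIR = Path("logs")  # assumed location; the Dockerfile above pre-creates a logs/ directory


def write_log_to_file(app_name: str, line: str) -> None:
    """Append one already-timestamped line to a per-app log file (assumed behavior)."""
    LOG_DIR.mkdir(parents=True, exist_ok=True)
    with open(LOG_DIR / f"{app_name}.log", "a", encoding="utf-8") as fh:
        fh.write(line.rstrip("\n") + "\n")
```
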