Optimize the Handling of Low Word Counts

This commit is contained in:
马一丁
2025-11-15 17:46:42 +08:00
parent cab812e261
commit a12ac4234d
2 changed files with 114 additions and 9 deletions

View File

@@ -10,6 +10,7 @@ Report Agent主类。
import json
import os
from copy import deepcopy
from pathlib import Path
from uuid import uuid4
from datetime import datetime
@@ -174,6 +175,8 @@ class ReportAgent:
- 章节存储、IR装订、渲染器等产出链路
- 状态管理、日志、输入输出校验与持久化。
"""
_CONTENT_SPARSE_MIN_ATTEMPTS = 3
_CONTENT_SPARSE_WARNING_TEXT = "本章LLM生成的内容字数可能过低必要时可以尝试重新运行程序。"
def __init__(self, config: Optional[Settings] = None):
"""
@@ -466,7 +469,9 @@ class ReportAgent:
emit('stage', {'stage': 'storage_ready', 'run_dir': str(run_dir)})
chapters = []
chapter_max_attempts = max(1, self.config.CHAPTER_JSON_MAX_ATTEMPTS)
chapter_max_attempts = max(
self._CONTENT_SPARSE_MIN_ATTEMPTS, self.config.CHAPTER_JSON_MAX_ATTEMPTS
)
for section in sections:
logger.info(f"生成章节: {section.title}")
emit('chapter_status', {
@@ -492,6 +497,9 @@ class ReportAgent:
chapter_payload: Dict[str, Any] | None = None
attempt = 1
best_sparse_candidate: Dict[str, Any] | None = None
best_sparse_score = -1
fallback_used = False
while attempt <= chapter_max_attempts:
try:
chapter_payload = self.chapter_generation_node.run(
@@ -506,6 +514,19 @@ class ReportAgent:
"content_sparse" if isinstance(structured_error, ChapterContentError) else "json_parse"
)
readable_label = "内容密度异常" if error_kind == "content_sparse" else "JSON解析失败"
if isinstance(structured_error, ChapterContentError):
candidate = getattr(structured_error, "chapter_payload", None)
candidate_score = getattr(structured_error, "body_characters", 0) or 0
if isinstance(candidate, dict) and candidate_score >= 0:
if candidate_score > best_sparse_score:
best_sparse_candidate = deepcopy(candidate)
best_sparse_score = candidate_score
will_fallback = (
isinstance(structured_error, ChapterContentError)
and attempt >= chapter_max_attempts
and attempt >= self._CONTENT_SPARSE_MIN_ATTEMPTS
and best_sparse_candidate is not None
)
logger.warning(
"章节 {title} {label}(第 {attempt}/{total} 次尝试): {error}",
title=section.title,
@@ -514,14 +535,27 @@ class ReportAgent:
total=chapter_max_attempts,
error=structured_error,
)
emit('chapter_status', {
status_value = 'retrying' if attempt < chapter_max_attempts or will_fallback else 'error'
status_payload = {
'chapterId': section.chapter_id,
'title': section.title,
'status': 'retrying' if attempt < chapter_max_attempts else 'error',
'status': status_value,
'attempt': attempt,
'error': str(structured_error),
'reason': error_kind,
})
}
if will_fallback:
status_payload['warning'] = 'content_sparse_fallback_pending'
emit('chapter_status', status_payload)
if will_fallback:
logger.warning(
"章节 {title} 达到最大尝试次数,保留字数最多(约 {score} 字)的版本作为兜底输出",
title=section.title,
score=best_sparse_score,
)
chapter_payload = self._finalize_sparse_chapter(best_sparse_candidate)
fallback_used = True
break
if attempt >= chapter_max_attempts:
raise
attempt += 1
@@ -553,12 +587,16 @@ class ReportAgent:
f"{section.title} 章节JSON在 {chapter_max_attempts} 次尝试后仍无法解析"
)
chapters.append(chapter_payload)
emit('chapter_status', {
completion_status = {
'chapterId': section.chapter_id,
'title': section.title,
'status': 'completed',
'attempt': attempt,
})
}
if fallback_used:
completion_status['warning'] = 'content_sparse_fallback'
completion_status['warningMessage'] = self._CONTENT_SPARSE_WARNING_TEXT
emit('chapter_status', completion_status)
document_ir = self.document_composer.build_document(
report_id,
@@ -779,6 +817,48 @@ class ReportAgent:
]
return any(keyword in normalized for keyword in keywords)
def _finalize_sparse_chapter(self, chapter: Optional[Dict[str, Any]]) -> Dict[str, Any]:
    """
    Build the fallback chapter returned when only sparse content is available.

    The incoming payload is deep-copied so the caller's object is never
    touched. A falsy payload, or a copy that is not a dict, degrades to an
    empty dict. The copy is then handed to ``_ensure_sparse_warning_block``
    and returned.
    """
    source = chapter if chapter else {}
    cloned = deepcopy(source)
    fallback_chapter = cloned if isinstance(cloned, dict) else {}
    self._ensure_sparse_warning_block(fallback_chapter)
    return fallback_chapter
def _ensure_sparse_warning_block(self, chapter: Dict[str, Any]) -> None:
    """
    Make sure the chapter carries one sparse-content warning paragraph.

    The warning is an italic paragraph whose text is
    ``self._CONTENT_SPARSE_WARNING_TEXT`` and whose ``meta.role`` is
    ``"content-sparse-warning"``. Placement rules:

    - ``blocks`` is a non-empty list: insert right after the first
      ``heading`` block, or at index 0 when no heading exists.
    - ``blocks`` is missing, empty, or not a list: replace it with a list
      holding only the warning.

    ``chapter["meta"]["contentSparseWarning"]`` is set to ``True`` in every
    case (a non-dict ``meta`` is replaced). The chapter is modified in
    place and nothing is returned.

    Calling this again on a chapter that already contains a block tagged
    with the warning role does not insert a second copy.
    """
    warning_role = "content-sparse-warning"
    warning_block = {
        "type": "paragraph",
        "inlines": [
            {
                "text": self._CONTENT_SPARSE_WARNING_TEXT,
                "marks": [{"type": "italic"}],
            }
        ],
        "meta": {"role": warning_role},
    }

    blocks = chapter.get("blocks")
    if isinstance(blocks, list) and blocks:
        # Idempotence guard: a previous call may already have added the warning.
        already_present = any(
            isinstance(block, dict)
            and isinstance(block.get("meta"), dict)
            and block["meta"].get("role") == warning_role
            for block in blocks
        )
        if not already_present:
            first_heading = next(
                (
                    idx
                    for idx, block in enumerate(blocks)
                    if isinstance(block, dict) and block.get("type") == "heading"
                ),
                None,
            )
            # No heading at all -> the warning leads the chapter.
            insert_at = 0 if first_heading is None else first_heading + 1
            blocks.insert(insert_at, warning_block)
    else:
        chapter["blocks"] = [warning_block]

    meta = chapter.get("meta")
    if isinstance(meta, dict):
        meta["contentSparseWarning"] = True
    else:
        chapter["meta"] = {"contentSparseWarning": True}
def _stringify(self, value: Any) -> str:
"""
安全地将对象转成字符串。

View File

@@ -55,6 +55,20 @@ class ChapterContentError(ValueError):
当LLM仅输出标题或正文不足以支撑一章时触发驱动重试以保证报告质量。
"""
def __init__(
    self,
    message: str,
    chapter: Optional[Dict[str, Any]] = None,
    body_characters: int = 0,
    narrative_characters: int = 0,
    non_heading_blocks: int = 0,
):
    """
    Create the error and attach the chapter payload plus its size metrics.

    Args:
        message: Human-readable description, forwarded to the base exception.
        chapter: Chapter payload associated with the failure; stored by
            reference (not copied) as ``chapter_payload``.
        body_characters: Body character count reported by the raiser.
        narrative_characters: Narrative character count reported by the raiser.
        non_heading_blocks: Number of non-heading blocks reported by the raiser.

    Each count is coerced with ``int()``; falsy values (``None``, ``0``)
    are stored as ``0``.
    """

    def _as_count(value: Any) -> int:
        # Falsy inputs collapse to 0, everything else goes through int().
        return int(value) if value else 0

    super().__init__(message)
    self.chapter_payload: Optional[Dict[str, Any]] = chapter
    self.body_characters: int = _as_count(body_characters)
    self.narrative_characters: int = _as_count(narrative_characters)
    self.non_heading_blocks: int = _as_count(non_heading_blocks)
class ChapterGenerationNode(BaseNode):
"""
@@ -897,7 +911,13 @@ class ChapterGenerationNode(BaseNode):
"""
blocks = chapter.get("blocks")
if not isinstance(blocks, list) or not blocks:
raise ChapterContentError("章节缺少正文区块,无法输出内容")
raise ChapterContentError(
"章节缺少正文区块,无法输出内容",
chapter=chapter,
body_characters=0,
narrative_characters=0,
non_heading_blocks=0,
)
non_heading_blocks = [
block
@@ -905,16 +925,21 @@ class ChapterGenerationNode(BaseNode):
if isinstance(block, dict)
and block.get("type") not in {"heading", "divider", "toc"}
]
valid_block_count = len(non_heading_blocks)
body_characters = self._count_body_characters(blocks)
narrative_characters = self._count_narrative_characters(blocks)
if (
len(non_heading_blocks) < self._MIN_NON_HEADING_BLOCKS
valid_block_count < self._MIN_NON_HEADING_BLOCKS
or body_characters < self._MIN_BODY_CHARACTERS
or narrative_characters < self._MIN_NARRATIVE_CHARACTERS
):
raise ChapterContentError(
f"{chapter.get('title') or '该章节'} 正文不足:有效区块 {len(non_heading_blocks)} 个,估算字符数 {body_characters},叙述性字符数 {narrative_characters}"
f"{chapter.get('title') or '该章节'} 正文不足:有效区块 {valid_block_count} 个,估算字符数 {body_characters},叙述性字符数 {narrative_characters}",
chapter=chapter,
body_characters=body_characters,
narrative_characters=narrative_characters,
non_heading_blocks=valid_block_count,
)
def _count_body_characters(self, blocks: Any) -> int: