Optimize the Handling of Low Word Counts

2025-11-15 17:46:42 +08:00
parent cab812e261
commit a12ac4234d
2 changed files with 114 additions and 9 deletions
--- a/ReportEngine/agent.py
+++ b/ReportEngine/agent.py
@@ -10,6 +10,7 @@ Report Agent主类。

 import json
 import os
+from copy import deepcopy
 from pathlib import Path
 from uuid import uuid4
 from datetime import datetime
@@ -174,6 +175,8 @@ class ReportAgent:
    - 章节存储、IR装订、渲染器等产出链路；
    - 状态管理、日志、输入输出校验与持久化。
    """
+    _CONTENT_SPARSE_MIN_ATTEMPTS = 3
+    _CONTENT_SPARSE_WARNING_TEXT = "本章LLM生成的内容字数可能过低，必要时可以尝试重新运行程序。"
    
    def __init__(self, config: Optional[Settings] = None):
        """
@@ -466,7 +469,9 @@ class ReportAgent:
            emit('stage', {'stage': 'storage_ready', 'run_dir': str(run_dir)})

            chapters = []
-            chapter_max_attempts = max(1, self.config.CHAPTER_JSON_MAX_ATTEMPTS)
+            chapter_max_attempts = max(
+                self._CONTENT_SPARSE_MIN_ATTEMPTS, self.config.CHAPTER_JSON_MAX_ATTEMPTS
+            )
            for section in sections:
                logger.info(f"生成章节: {section.title}")
                emit('chapter_status', {
@@ -492,6 +497,9 @@ class ReportAgent:

                chapter_payload: Dict[str, Any] | None = None
                attempt = 1
+                best_sparse_candidate: Dict[str, Any] | None = None
+                best_sparse_score = -1
+                fallback_used = False
                while attempt <= chapter_max_attempts:
                    try:
                        chapter_payload = self.chapter_generation_node.run(
@@ -506,6 +514,19 @@ class ReportAgent:
                            "content_sparse" if isinstance(structured_error, ChapterContentError) else "json_parse"
                        )
                        readable_label = "内容密度异常" if error_kind == "content_sparse" else "JSON解析失败"
+                        if isinstance(structured_error, ChapterContentError):
+                            candidate = getattr(structured_error, "chapter_payload", None)
+                            candidate_score = getattr(structured_error, "body_characters", 0) or 0
+                            if isinstance(candidate, dict) and candidate_score >= 0:
+                                if candidate_score > best_sparse_score:
+                                    best_sparse_candidate = deepcopy(candidate)
+                                    best_sparse_score = candidate_score
+                        will_fallback = (
+                            isinstance(structured_error, ChapterContentError)
+                            and attempt >= chapter_max_attempts
+                            and attempt >= self._CONTENT_SPARSE_MIN_ATTEMPTS
+                            and best_sparse_candidate is not None
+                        )
                        logger.warning(
                            "章节 {title} {label}（第 {attempt}/{total} 次尝试）: {error}",
                            title=section.title,
@@ -514,14 +535,27 @@ class ReportAgent:
                            total=chapter_max_attempts,
                            error=structured_error,
                        )
-                        emit('chapter_status', {
+                        status_value = 'retrying' if attempt < chapter_max_attempts or will_fallback else 'error'
+                        status_payload = {
                            'chapterId': section.chapter_id,
                            'title': section.title,
-                            'status': 'retrying' if attempt < chapter_max_attempts else 'error',
+                            'status': status_value,
                            'attempt': attempt,
                            'error': str(structured_error),
                            'reason': error_kind,
-                        })
+                        }
+                        if will_fallback:
+                            status_payload['warning'] = 'content_sparse_fallback_pending'
+                        emit('chapter_status', status_payload)
+                        if will_fallback:
+                            logger.warning(
+                                "章节 {title} 达到最大尝试次数，保留字数最多（约 {score} 字）的版本作为兜底输出",
+                                title=section.title,
+                                score=best_sparse_score,
+                            )
+                            chapter_payload = self._finalize_sparse_chapter(best_sparse_candidate)
+                            fallback_used = True
+                            break
                        if attempt >= chapter_max_attempts:
                            raise
                        attempt += 1
@@ -553,12 +587,16 @@ class ReportAgent:
                        f"{section.title} 章节JSON在 {chapter_max_attempts} 次尝试后仍无法解析"
                    )
                chapters.append(chapter_payload)
-                emit('chapter_status', {
+                completion_status = {
                    'chapterId': section.chapter_id,
                    'title': section.title,
                    'status': 'completed',
                    'attempt': attempt,
-                })
+                }
+                if fallback_used:
+                    completion_status['warning'] = 'content_sparse_fallback'
+                    completion_status['warningMessage'] = self._CONTENT_SPARSE_WARNING_TEXT
+                emit('chapter_status', completion_status)

            document_ir = self.document_composer.build_document(
                report_id,
@@ -779,6 +817,48 @@ class ReportAgent:
        ]
        return any(keyword in normalized for keyword in keywords)

+    def _finalize_sparse_chapter(self, chapter: Optional[Dict[str, Any]]) -> Dict[str, Any]:
+        """
+        Build the sparse-content fallback chapter: copy the given payload and insert the warning paragraph.
+        """
+        safe_chapter = deepcopy(chapter or {})  # work on a copy; None or an empty payload becomes {}
+        if not isinstance(safe_chapter, dict):  # a truthy non-dict payload is replaced by an empty chapter
+            safe_chapter = {}
+        self._ensure_sparse_warning_block(safe_chapter)  # mutates safe_chapter in place
+        return safe_chapter
+
+    def _ensure_sparse_warning_block(self, chapter: Dict[str, Any]) -> None:
+        """
+        Insert the warning paragraph after the chapter heading and flag chapter["meta"]; mutates chapter in place.
+        """
+        warning_block = {  # italic paragraph carrying the class-level warning text
+            "type": "paragraph",
+            "inlines": [
+                {
+                    "text": self._CONTENT_SPARSE_WARNING_TEXT,
+                    "marks": [{"type": "italic"}],
+                }
+            ],
+            "meta": {"role": "content-sparse-warning"},
+        }
+        blocks = chapter.get("blocks")
+        if isinstance(blocks, list) and blocks:  # non-empty list: insert into the existing list
+            inserted = False
+            for idx, block in enumerate(blocks):
+                if isinstance(block, dict) and block.get("type") == "heading":  # first heading block wins
+                    blocks.insert(idx + 1, warning_block)
+                    inserted = True
+                    break
+            if not inserted:  # no heading block found: put the warning at the very top
+                blocks.insert(0, warning_block)
+        else:  # "blocks" missing, empty, or not a list: the warning becomes the only block
+            chapter["blocks"] = [warning_block]
+        meta = chapter.get("meta")  # NOTE(review): no check for an existing warning block; each call inserts another
+        if isinstance(meta, dict):
+            meta["contentSparseWarning"] = True
+        else:  # missing or non-dict meta is replaced outright
+            chapter["meta"] = {"contentSparseWarning": True}
+
    def _stringify(self, value: Any) -> str:
        """
        安全地将对象转成字符串。
--- a/ReportEngine/nodes/chapter_generation_node.py
+++ b/ReportEngine/nodes/chapter_generation_node.py
@@ -55,6 +55,20 @@ class ChapterContentError(ValueError):
    当LLM仅输出标题或正文不足以支撑一章时触发，驱动重试以保证报告质量。
    """

+    def __init__(
+        self,
+        message: str,  # error text, forwarded to ValueError
+        chapter: Optional[Dict[str, Any]] = None,  # chapter payload that failed the content check, if available
+        body_characters: int = 0,  # the raise sites in this diff pass _count_body_characters(blocks)
+        narrative_characters: int = 0,  # the raise sites in this diff pass _count_narrative_characters(blocks)
+        non_heading_blocks: int = 0,  # the raise sites pass the count of dict blocks that are not heading/divider/toc
+    ):
+        super().__init__(message)
+        self.chapter_payload: Optional[Dict[str, Any]] = chapter  # stored by reference, not copied
+        self.body_characters: int = int(body_characters or 0)  # None and other falsy values become 0
+        self.narrative_characters: int = int(narrative_characters or 0)
+        self.non_heading_blocks: int = int(non_heading_blocks or 0)
+

 class ChapterGenerationNode(BaseNode):
    """
@@ -897,7 +911,13 @@ class ChapterGenerationNode(BaseNode):
        """
        blocks = chapter.get("blocks")
        if not isinstance(blocks, list) or not blocks:
-            raise ChapterContentError("章节缺少正文区块，无法输出内容")
+            raise ChapterContentError(
+                "章节缺少正文区块，无法输出内容",
+                chapter=chapter,
+                body_characters=0,
+                narrative_characters=0,
+                non_heading_blocks=0,
+            )

        non_heading_blocks = [
            block
@@ -905,16 +925,21 @@ class ChapterGenerationNode(BaseNode):
            if isinstance(block, dict)
            and block.get("type") not in {"heading", "divider", "toc"}
        ]
+        valid_block_count = len(non_heading_blocks)
        body_characters = self._count_body_characters(blocks)
        narrative_characters = self._count_narrative_characters(blocks)

        if (
-            len(non_heading_blocks) < self._MIN_NON_HEADING_BLOCKS
+            valid_block_count < self._MIN_NON_HEADING_BLOCKS
            or body_characters < self._MIN_BODY_CHARACTERS
            or narrative_characters < self._MIN_NARRATIVE_CHARACTERS
        ):
            raise ChapterContentError(
-                f"{chapter.get('title') or '该章节'} 正文不足：有效区块 {len(non_heading_blocks)} 个，估算字符数 {body_characters}，叙述性字符数 {narrative_characters}"
+                f"{chapter.get('title') or '该章节'} 正文不足：有效区块 {valid_block_count} 个，估算字符数 {body_characters}，叙述性字符数 {narrative_characters}",
+                chapter=chapter,
+                body_characters=body_characters,
+                narrative_characters=narrative_characters,
+                non_heading_blocks=valid_block_count,
            )

    def _count_body_characters(self, blocks: Any) -> int: