Optimize Re-output Logic

This commit is contained in:
马一丁
2025-11-15 14:45:20 +08:00
parent bae13bf434
commit fa1ebc07ec

View File

@@ -90,7 +90,8 @@ class ChapterGenerationNode(BaseNode):
}
# 章节若仅包含标题或字符过少则视为失败强制LLM重新生成
_MIN_NON_HEADING_BLOCKS = 2
_MIN_BODY_CHARACTERS = 400
_MIN_BODY_CHARACTERS = 600
_MIN_NARRATIVE_CHARACTERS = 300
_PARAGRAPH_FRAGMENT_MAX_CHARS = 80
_PARAGRAPH_FRAGMENT_NO_TERMINATOR_MAX_CHARS = 240
_TERMINATION_PUNCTUATION = set("。!?!?;……")
@@ -659,10 +660,15 @@ class ChapterGenerationNode(BaseNode):
and block.get("type") not in {"heading", "divider", "toc"}
]
body_characters = self._count_body_characters(blocks)
narrative_characters = self._count_narrative_characters(blocks)
if len(non_heading_blocks) < self._MIN_NON_HEADING_BLOCKS or body_characters < self._MIN_BODY_CHARACTERS:
if (
len(non_heading_blocks) < self._MIN_NON_HEADING_BLOCKS
or body_characters < self._MIN_BODY_CHARACTERS
or narrative_characters < self._MIN_NARRATIVE_CHARACTERS
):
raise ChapterContentError(
f"{chapter.get('title') or '该章节'} 正文不足:有效区块 {len(non_heading_blocks)} 个,估算字符数 {body_characters}"
f"{chapter.get('title') or '该章节'} 正文不足:有效区块 {len(non_heading_blocks)} 个,估算字符数 {body_characters},叙述性字符数 {narrative_characters}"
)
def _count_body_characters(self, blocks: Any) -> int:
@@ -696,19 +702,7 @@ class ChapterGenerationNode(BaseNode):
return 0
if block_type == "paragraph":
inlines = node.get("inlines")
if isinstance(inlines, list):
total = 0
for run in inlines:
if isinstance(run, dict):
text = run.get("text")
if isinstance(text, str):
total += len(text.strip())
return total
text_value = node.get("text")
if isinstance(text_value, str):
return len(text_value.strip())
return len(self._extract_block_text(node).strip())
return self._estimate_paragraph_characters(node)
if block_type == "list":
total = 0
@@ -735,6 +729,57 @@ class ChapterGenerationNode(BaseNode):
return walk(blocks)
def _count_narrative_characters(self, blocks: Any) -> int:
"""
统计paragraph/callout/list/blockquote等叙述性结构的字符数避免被表格/图表“刷长”。
"""
def walk(node: Any) -> int:
if node is None:
return 0
if isinstance(node, list):
return sum(walk(item) for item in node)
if isinstance(node, str):
return len(node.strip())
if not isinstance(node, dict):
return 0
block_type = node.get("type")
if block_type == "paragraph":
return self._estimate_paragraph_characters(node)
if block_type == "list":
total = 0
for item in node.get("items", []):
total += walk(item)
return total
if block_type in {"callout", "blockquote"}:
return walk(node.get("blocks"))
# list项可能是匿名dict兼容性遍历
if block_type is None:
nested = node.get("blocks")
if isinstance(nested, list):
return walk(nested)
return 0
return walk(blocks)
def _estimate_paragraph_characters(self, block: Dict[str, Any]) -> int:
"""提取paragraph文本长度复用在多种统计中。"""
inlines = block.get("inlines")
if isinstance(inlines, list):
total = 0
for run in inlines:
if isinstance(run, dict):
text = run.get("text")
if isinstance(text, str):
total += len(text.strip())
return total
text_value = block.get("text")
if isinstance(text_value, str):
return len(text_value.strip())
return len(self._extract_block_text(block).strip())
def _sanitize_block_content(self, block: Dict[str, Any]):
"""根据类型做精细化修复例如清理paragraph内的非法inline mark"""
block_type = block.get("type")