Optimize JSON Parsing Compatibility

2025-11-17 17:36:58 +08:00
parent c20cc24c78
commit a5f3964a73
2 changed files with 120 additions and 3 deletions
--- a/ReportEngine/utils/json_parser.py
+++ b/ReportEngine/utils/json_parser.py
@@ -610,9 +610,34 @@ class RobustJSONParser:

        # 验证数据类型
        if not isinstance(data, dict):
-            if isinstance(data, list) and len(data) > 0 and isinstance(data[0], dict):
-                logger.warning(f"{context_name} 返回数组，自动提取第一个元素")
-                data = data[0]
+            if isinstance(data, list):
+                if len(data) > 0:
+                    # Pick the dict that shares the most keys with expected_keys (the first dict wins ties)
+                    best_match = None
+                    max_match_count = 0
+
+                    for item in data:
+                        if isinstance(item, dict):
+                            if expected_keys:
+                                # Score = number of expected keys present; a zero score still qualifies
+                                match_count = sum(1 for key in expected_keys if key in item)
+                                if best_match is None or match_count > max_match_count:
+                                    max_match_count = match_count
+                                    best_match = item
+                            elif best_match is None:
+                                best_match = item
+
+                    if best_match is not None:  # identity check: an empty dict is still a valid object
+                        logger.warning(
+                            f"{context_name} 返回数组，自动提取最佳匹配元素（匹配{max_match_count}/{len(expected_keys or [])}个键）"
+                        )
+                        data = best_match
+                    else:
+                        raise JSONParseError(
+                            f"{context_name} 返回的数组中没有有效的对象"
+                        )
+                else:
+                    raise JSONParseError(f"{context_name} 返回空数组")
            else:
                raise JSONParseError(
                    f"{context_name} 返回的不是JSON对象: {type(data).__name__}"
@@ -625,6 +650,43 @@ class RobustJSONParser:
                logger.warning(
                    f"{context_name} 缺少预期的键: {', '.join(missing_keys)}"
                )
+                # 尝试修复常见的键名变体
+                data = self._try_recover_missing_keys(data, missing_keys, context_name)
+
+        return data
+
+    def _try_recover_missing_keys(
+        self, data: Dict[str, Any], missing_keys: List[str], context_name: str
+    ) -> Dict[str, Any]:
+        """
+        Fill in missing keys by copying values stored under known alias key names.
+
+        Args:
+            data: Parsed object; updated in place when an alias is found (the alias key is kept).
+            missing_keys: Expected keys that are absent from ``data``.
+            context_name: Label prefixed to the log messages.
+
+        Returns:
+            Dict[str, Any]: The same ``data`` object, with any recovered keys added.
+        """
+        # Canonical key -> alternative spellings; the first alias present in data wins
+        key_aliases = {
+            "template_name": ["templateName", "name", "template"],
+            "selection_reason": ["selectionReason", "reason", "explanation"],
+            "title": ["reportTitle", "documentTitle"],
+            "chapters": ["chapterList", "chapterPlan", "sections"],
+            "totalWords": ["total_words", "wordCount", "totalWordCount"],
+        }
+
+        for missing_key in missing_keys:
+            if missing_key in key_aliases:
+                for alias in key_aliases[missing_key]:
+                    if alias in data:
+                        logger.info(
+                            f"{context_name} 找到键'{missing_key}'的别名'{alias}'，自动映射"
+                        )
+                        data[missing_key] = data[alias]
+                        break

        return data

--- a/ReportEngine/utils/test_json_parser.py
+++ b/ReportEngine/utils/test_json_parser.py
@@ -127,6 +127,61 @@ class TestRobustJSONParser(unittest.TestCase):
        self.assertEqual(result["name"], "test")
        self.assertEqual(result["value"], 123)

+    def test_unterminated_string_with_json_repair(self):
+        """Check that a parser with json_repair enabled returns a dict for this payload."""
+        # Build a parser with json_repair enabled and LLM repair disabled
+        parser_with_repair = RobustJSONParser(
+            enable_json_repair=True,
+            enable_llm_repair=False,
+        )
+
+        # NOTE(review): meant to mimic unescaped control chars/quotes, but this literal looks like well-formed JSON — confirm
+        json_str = """{
+  "template_name": "特定政策报告",
+  "selection_reason": "这是测试内容"
+}"""
+        result = parser_with_repair.parse(json_str, "未终止字符串测试")
+        # Parsing without raising is all that is required here
+        self.assertIsInstance(result, dict)
+        self.assertIn("template_name", result)
+
+    def test_array_with_best_match(self):
+        """Check that the best-matching element is extracted from a top-level array."""
+        json_str = """[
+  {
+    "name": "test",
+    "value": 123
+  },
+  {
+    "totalWords": 40000,
+    "globalGuidelines": ["guide1", "guide2"],
+    "chapters": []
+  }
+]"""
+        result = self.parser.parse(
+            json_str,
+            "数组最佳匹配测试",
+            expected_keys=["totalWords", "globalGuidelines", "chapters"],
+        )
+        # The second element should be chosen because it contains all 3 expected keys
+        self.assertEqual(result["totalWords"], 40000)
+        self.assertEqual(len(result["globalGuidelines"]), 2)
+
+    def test_key_alias_recovery(self):
+        """Check that missing expected keys are recovered from their camelCase aliases."""
+        json_str = """{
+  "templateName": "test_template",
+  "selectionReason": "This is a test"
+}"""
+        result = self.parser.parse(
+            json_str,
+            "键别名测试",
+            expected_keys=["template_name", "selection_reason"],
+        )
+        # templateName -> template_name and selectionReason -> selection_reason should be mapped automatically
+        self.assertEqual(result["template_name"], "test_template")
+        self.assertEqual(result["selection_reason"], "This is a test")
+
    def test_complex_real_world_case(self):
        """测试真实世界的复杂案例（类似实际错误）。"""
        # 模拟实际错误：缺少逗号、有markdown包裹、有思考内容