Optimize JSON Parsing Compatibility

This commit is contained in:
马一丁
2025-11-17 17:36:58 +08:00
parent c20cc24c78
commit a5f3964a73
2 changed files with 120 additions and 3 deletions

View File

@@ -610,9 +610,34 @@ class RobustJSONParser:
# 验证数据类型
if not isinstance(data, dict):
if isinstance(data, list) and len(data) > 0 and isinstance(data[0], dict):
logger.warning(f"{context_name} 返回数组,自动提取第一个元素")
data = data[0]
if isinstance(data, list):
if len(data) > 0:
# 尝试找到最符合期望的元素
best_match = None
max_match_count = 0
for item in data:
if isinstance(item, dict):
if expected_keys:
# 计算匹配的键数量
match_count = sum(1 for key in expected_keys if key in item)
if match_count > max_match_count:
max_match_count = match_count
best_match = item
elif best_match is None:
best_match = item
if best_match:
logger.warning(
f"{context_name} 返回数组,自动提取最佳匹配元素(匹配{max_match_count}/{len(expected_keys or [])}个键)"
)
data = best_match
else:
raise JSONParseError(
f"{context_name} 返回的数组中没有有效的对象"
)
else:
raise JSONParseError(f"{context_name} 返回空数组")
else:
raise JSONParseError(
f"{context_name} 返回的不是JSON对象: {type(data).__name__}"
@@ -625,6 +650,43 @@ class RobustJSONParser:
logger.warning(
f"{context_name} 缺少预期的键: {', '.join(missing_keys)}"
)
# 尝试修复常见的键名变体
data = self._try_recover_missing_keys(data, missing_keys, context_name)
return data
def _try_recover_missing_keys(
self, data: Dict[str, Any], missing_keys: List[str], context_name: str
) -> Dict[str, Any]:
"""
尝试从数据中恢复缺失的键,通过查找相似的键名。
参数:
data: 原始数据
missing_keys: 缺失的键列表
context_name: 上下文名称
返回:
Dict[str, Any]: 修复后的数据
"""
# 常见的键名映射
key_aliases = {
"template_name": ["templateName", "name", "template"],
"selection_reason": ["selectionReason", "reason", "explanation"],
"title": ["reportTitle", "documentTitle"],
"chapters": ["chapterList", "chapterPlan", "sections"],
"totalWords": ["total_words", "wordCount", "totalWordCount"],
}
for missing_key in missing_keys:
if missing_key in key_aliases:
for alias in key_aliases[missing_key]:
if alias in data:
logger.info(
f"{context_name} 找到键'{missing_key}'的别名'{alias}',自动映射"
)
data[missing_key] = data[alias]
break
return data

View File

@@ -127,6 +127,61 @@ class TestRobustJSONParser(unittest.TestCase):
self.assertEqual(result["name"], "test")
self.assertEqual(result["value"], 123)
def test_unterminated_string_with_json_repair(self):
    """Parse a payload with json_repair enabled and LLM repair disabled.

    NOTE(review): the payload below is already well-formed JSON, so this
    test only checks that such a parser returns a dict containing
    ``template_name``; it does not feed an unterminated string.
    """
    # Parser configured to use json_repair only.
    repair_options = {"enable_json_repair": True, "enable_llm_repair": False}
    repairing_parser = RobustJSONParser(**repair_options)

    payload = """{
"template_name": "特定政策报告",
"selection_reason": "这是测试内容"
}"""

    parsed = repairing_parser.parse(payload, "未终止字符串测试")

    # Succeeding without an exception and yielding the key is sufficient.
    self.assertIsInstance(parsed, dict)
    self.assertIn("template_name", parsed)
def test_array_with_best_match(self):
    """Check that parsing a top-level array yields its best-matching object.

    The array holds two objects; only the second one carries the three
    expected keys, so the parsed result must expose that object's values.
    """
    wanted_keys = ["totalWords", "globalGuidelines", "chapters"]
    payload = """[
{
"name": "test",
"value": 123
},
{
"totalWords": 40000,
"globalGuidelines": ["guide1", "guide2"],
"chapters": []
}
]"""

    chosen = self.parser.parse(
        payload,
        "数组最佳匹配测试",
        expected_keys=wanted_keys,
    )

    # The second element matches all three expected keys.
    self.assertEqual(chosen["totalWords"], 40000)
    self.assertEqual(len(chosen["globalGuidelines"]), 2)
def test_key_alias_recovery(self):
    """Check that camelCase keys are exposed under the expected snake_case names.

    The payload only contains ``templateName`` and ``selectionReason``;
    the parsed result must offer the same values as ``template_name`` and
    ``selection_reason``.
    """
    wanted_keys = ["template_name", "selection_reason"]
    payload = """{
"templateName": "test_template",
"selectionReason": "This is a test"
}"""

    recovered = self.parser.parse(
        payload,
        "键别名测试",
        expected_keys=wanted_keys,
    )

    # templateName -> template_name, selectionReason -> selection_reason
    self.assertEqual(recovered["template_name"], "test_template")
    self.assertEqual(recovered["selection_reason"], "This is a test")
def test_complex_real_world_case(self):
"""测试真实世界的复杂案例(类似实际错误)。"""
# 模拟实际错误缺少逗号、有markdown包裹、有思考内容