Optimize JSON Parsing Compatibility
This commit is contained in:
@@ -610,9 +610,34 @@ class RobustJSONParser:
|
||||
|
||||
# 验证数据类型
|
||||
if not isinstance(data, dict):
|
||||
if isinstance(data, list) and len(data) > 0 and isinstance(data[0], dict):
|
||||
logger.warning(f"{context_name} 返回数组,自动提取第一个元素")
|
||||
data = data[0]
|
||||
if isinstance(data, list):
|
||||
if len(data) > 0:
|
||||
# 尝试找到最符合期望的元素
|
||||
best_match = None
|
||||
max_match_count = 0
|
||||
|
||||
for item in data:
|
||||
if isinstance(item, dict):
|
||||
if expected_keys:
|
||||
# 计算匹配的键数量
|
||||
match_count = sum(1 for key in expected_keys if key in item)
|
||||
if match_count > max_match_count:
|
||||
max_match_count = match_count
|
||||
best_match = item
|
||||
elif best_match is None:
|
||||
best_match = item
|
||||
|
||||
if best_match:
|
||||
logger.warning(
|
||||
f"{context_name} 返回数组,自动提取最佳匹配元素(匹配{max_match_count}/{len(expected_keys or [])}个键)"
|
||||
)
|
||||
data = best_match
|
||||
else:
|
||||
raise JSONParseError(
|
||||
f"{context_name} 返回的数组中没有有效的对象"
|
||||
)
|
||||
else:
|
||||
raise JSONParseError(f"{context_name} 返回空数组")
|
||||
else:
|
||||
raise JSONParseError(
|
||||
f"{context_name} 返回的不是JSON对象: {type(data).__name__}"
|
||||
@@ -625,6 +650,43 @@ class RobustJSONParser:
|
||||
logger.warning(
|
||||
f"{context_name} 缺少预期的键: {', '.join(missing_keys)}"
|
||||
)
|
||||
# 尝试修复常见的键名变体
|
||||
data = self._try_recover_missing_keys(data, missing_keys, context_name)
|
||||
|
||||
return data
|
||||
|
||||
def _try_recover_missing_keys(
|
||||
self, data: Dict[str, Any], missing_keys: List[str], context_name: str
|
||||
) -> Dict[str, Any]:
|
||||
"""
|
||||
尝试从数据中恢复缺失的键,通过查找相似的键名。
|
||||
|
||||
参数:
|
||||
data: 原始数据
|
||||
missing_keys: 缺失的键列表
|
||||
context_name: 上下文名称
|
||||
|
||||
返回:
|
||||
Dict[str, Any]: 修复后的数据
|
||||
"""
|
||||
# 常见的键名映射
|
||||
key_aliases = {
|
||||
"template_name": ["templateName", "name", "template"],
|
||||
"selection_reason": ["selectionReason", "reason", "explanation"],
|
||||
"title": ["reportTitle", "documentTitle"],
|
||||
"chapters": ["chapterList", "chapterPlan", "sections"],
|
||||
"totalWords": ["total_words", "wordCount", "totalWordCount"],
|
||||
}
|
||||
|
||||
for missing_key in missing_keys:
|
||||
if missing_key in key_aliases:
|
||||
for alias in key_aliases[missing_key]:
|
||||
if alias in data:
|
||||
logger.info(
|
||||
f"{context_name} 找到键'{missing_key}'的别名'{alias}',自动映射"
|
||||
)
|
||||
data[missing_key] = data[alias]
|
||||
break
|
||||
|
||||
return data
|
||||
|
||||
|
||||
@@ -127,6 +127,61 @@ class TestRobustJSONParser(unittest.TestCase):
|
||||
self.assertEqual(result["name"], "test")
|
||||
self.assertEqual(result["value"], 123)
|
||||
|
||||
def test_unterminated_string_with_json_repair(self):
    """Parse a payload with a parser that has json_repair enabled."""
    # Dedicated parser instance: json_repair on, LLM-based repair off.
    repair_parser = RobustJSONParser(
        enable_llm_repair=False,
        enable_json_repair=True,
    )

    # Intended to mimic a string with unescaped control characters or quotes.
    # NOTE(review): as shown here the payload looks like well-formed JSON,
    # so the repair path may not actually be exercised — confirm.
    payload = """{
        "template_name": "特定政策报告",
        "selection_reason": "这是测试内容"
    }"""

    parsed = repair_parser.parse(payload, "未终止字符串测试")

    # Success criterion: parsing completes without raising and yields a dict.
    self.assertIsInstance(parsed, dict)
    self.assertIn("template_name", parsed)
|
||||
|
||||
def test_array_with_best_match(self):
    """Check that the best-matching element is picked out of a JSON array."""
    wanted_keys = ["totalWords", "globalGuidelines", "chapters"]
    payload = """[
        {
            "name": "test",
            "value": 123
        },
        {
            "totalWords": 40000,
            "globalGuidelines": ["guide1", "guide2"],
            "chapters": []
        }
    ]"""

    picked = self.parser.parse(
        payload,
        "数组最佳匹配测试",
        expected_keys=wanted_keys,
    )

    # The second element carries all three expected keys, so it should win.
    self.assertEqual(picked["totalWords"], 40000)
    self.assertEqual(len(picked["globalGuidelines"]), 2)
|
||||
|
||||
def test_key_alias_recovery(self):
    """Check that camelCase aliases are mapped back to the expected keys."""
    wanted_keys = ["template_name", "selection_reason"]
    payload = """{
        "templateName": "test_template",
        "selectionReason": "This is a test"
    }"""

    recovered = self.parser.parse(payload, "键别名测试", expected_keys=wanted_keys)

    # templateName -> template_name and selectionReason -> selection_reason
    # should both be available under the expected spelling.
    self.assertEqual(recovered["template_name"], "test_template")
    self.assertEqual(recovered["selection_reason"], "This is a test")
|
||||
|
||||
def test_complex_real_world_case(self):
|
||||
"""测试真实世界的复杂案例(类似实际错误)。"""
|
||||
# 模拟实际错误:缺少逗号、有markdown包裹、有思考内容
|
||||
|
||||
Reference in New Issue
Block a user