Quellcode durchsuchen

fix: 增强 NER 服务 JSON 解析容错性

1. 检测 API 响应是否被截断 (finish_reason=length)
2. 改进 markdown 代码块移除逻辑
3. 新增方法3(单独提取 entities 数组)和方法4(从截断的 JSON 中逐个提取完整实体对象)
4. 增加更详细的调试日志

处理场景:
- API 返回被 max_tokens 截断的不完整 JSON
- markdown 代码块格式变化
- 部分实体数据完整但整体 JSON 不完整
何文松 vor 1 Monat
Ursprung
Commit
9b6791e35e
1 geänderte Dateien mit 47 neuen und 11 gelöschten Zeilen
  1. 47 11
      python-services/ner-service/app/services/deepseek_service.py

+ 47 - 11
python-services/ner-service/app/services/deepseek_service.py

@@ -195,8 +195,16 @@ class DeepSeekService:
                     # OpenAI 格式响应
                     choices = result.get("choices", [])
                     if choices:
-                        message = choices[0].get("message", {})
-                        return message.get("content", "")
+                        choice = choices[0]
+                        message = choice.get("message", {})
+                        content = message.get("content", "")
+                        
+                        # 检查是否因为 max_tokens 被截断
+                        finish_reason = choice.get("finish_reason", "")
+                        if finish_reason == "length":
+                            logger.warning(f"API 响应被截断 (finish_reason=length), 考虑增加 max_tokens 或减小分块大小")
+                        
+                        return content
                     return None
                     
             except httpx.TimeoutException:
@@ -221,8 +229,9 @@ class DeepSeekService:
         entities = []
         
         try:
-            # 移除 markdown code block 标记
-            response = re.sub(r'```json\s*', '', response)
+            # 移除 markdown code block 标记(支持多行模式)
+            response = re.sub(r'```json\s*\n?', '', response, flags=re.IGNORECASE)
+            response = re.sub(r'\n?```\s*$', '', response)
             response = re.sub(r'```\s*', '', response)
             response = response.strip()
             
@@ -230,20 +239,47 @@ class DeepSeekService:
             data = None
             try:
                 data = json.loads(response)
-            except json.JSONDecodeError:
-                pass
+            except json.JSONDecodeError as e:
+                logger.debug(f"直接解析 JSON 失败: {e}")
             
-            # 方法2:查找 JSON 对象
+            # 方法2:查找 JSON 对象(使用更宽松的正则)
             if not data or "entities" not in data:
-                json_match = re.search(r'\{\s*"entities"\s*:\s*\[[\s\S]*\]\s*\}', response)
+                # 非贪婪匹配包含 "entities" 的 JSON 对象:从 { 开始,到第一个后跟 } 的 ] 为止
+                json_match = re.search(r'\{[^{}]*"entities"\s*:\s*\[[\s\S]*?\]\s*\}', response)
                 if json_match:
                     try:
                         data = json.loads(json_match.group())
-                    except json.JSONDecodeError:
-                        pass
+                    except json.JSONDecodeError as e:
+                        logger.debug(f"正则匹配 JSON 解析失败: {e}")
             
+            # 方法3:尝试提取 entities 数组
             if not data or "entities" not in data:
-                logger.warning(f"未找到有效的 entities JSON, response={response[:300]}...")
+                array_match = re.search(r'"entities"\s*:\s*(\[[\s\S]*\])', response)
+                if array_match:
+                    try:
+                        entity_list = json.loads(array_match.group(1))
+                        data = {"entities": entity_list}
+                    except json.JSONDecodeError as e:
+                        logger.debug(f"提取 entities 数组失败: {e}")
+            
+            # 方法4:处理被截断的 JSON,尝试逐个解析完整的实体对象
+            if not data or "entities" not in data:
+                logger.debug("尝试从截断的 JSON 中提取完整实体...")
+                entity_pattern = r'\{\s*"name"\s*:\s*"([^"]+)"\s*,\s*"type"\s*:\s*"([^"]+)"\s*,\s*"charStart"\s*:\s*(\d+)\s*,\s*"charEnd"\s*:\s*(\d+)\s*\}'
+                matches = re.findall(entity_pattern, response)
+                if matches:
+                    data = {"entities": []}
+                    for match in matches:
+                        data["entities"].append({
+                            "name": match[0],
+                            "type": match[1],
+                            "charStart": int(match[2]),
+                            "charEnd": int(match[3])
+                        })
+                    logger.info(f"从截断 JSON 中恢复了 {len(matches)} 个实体")
+            
+            if not data or "entities" not in data:
+                logger.warning(f"未找到有效的 entities JSON, response_length={len(response)}, response_preview={response[:500]}...")
                 return entities
             
             entity_list = data.get("entities", [])