vor 1 Monat · 9b6791e35e
--- a/python-services/ner-service/app/services/deepseek_service.py
+++ b/python-services/ner-service/app/services/deepseek_service.py
@@ -195,8 +195,16 @@ class DeepSeekService:
 
				                     # OpenAI 格式响应
			
 
				                     choices = result.get("choices", [])
			
 
				                     if choices:
			
 
				-                        message = choices[0].get("message", {})
			
 
				-                        return message.get("content", "")
			
 
				+                        choice = choices[0]
			
 
				+                        message = choice.get("message", {})
			
 
				+                        content = message.get("content", "")
			
 
				+                        
			
 
				+                        # 检查是否因为 max_tokens 被截断
			
 
				+                        finish_reason = choice.get("finish_reason", "")
			
 
				+                        if finish_reason == "length":
			
 
				+                            logger.warning(f"API 响应被截断 (finish_reason=length), 考虑增加 max_tokens 或减小分块大小")
			
 
				+                        
			
 
				+                        return content
			
 
				                     return None
			
 
				                     
			
 
				             except httpx.TimeoutException:
			
@@ -221,8 +229,9 @@ class DeepSeekService:
 
				         entities = []
			
 
				         
			
 
				         try:
			
 
				-            # 移除 markdown code block 标记
			
 
				-            response = re.sub(r'```json\s*', '', response)
			
 
				+            # 移除 markdown code block 标记（支持多行模式）
			
 
				+            response = re.sub(r'```json\s*\n?', '', response, flags=re.IGNORECASE)
			
 
				+            response = re.sub(r'\n?```\s*$', '', response)
			
 
				             response = re.sub(r'```\s*', '', response)
			
 
				             response = response.strip()
			
 
				             
			
@@ -230,20 +239,47 @@ class DeepSeekService:
 
				             data = None
			
 
				             try:
			
 
				                 data = json.loads(response)
			
 
				-            except json.JSONDecodeError:
			
 
				-                pass
			
 
				+            except json.JSONDecodeError as e:
			
 
				+                logger.debug(f"直接解析 JSON 失败: {e}")
			
 
				             
			
 
				-            # 方法2：查找 JSON 对象
			
 
				+            # 方法2：查找 JSON 对象（使用更宽松的正则）
			
 
				             if not data or "entities" not in data:
			
 
				-                json_match = re.search(r'\{\s*"entities"\s*:\s*\[[\s\S]*\]\s*\}', response)
			
 
				+                # 尝试匹配从 { 开始到最后一个 } 的内容
			
 
				+                json_match = re.search(r'\{[^{}]*"entities"\s*:\s*\[[\s\S]*?\]\s*\}', response)
			
 
				                 if json_match:
			
 
				                     try:
			
 
				                         data = json.loads(json_match.group())
			
 
				-                    except json.JSONDecodeError:
			
 
				-                        pass
			
 
				+                    except json.JSONDecodeError as e:
			
 
				+                        logger.debug(f"正则匹配 JSON 解析失败: {e}")
			
 
				             
			
 
				+            # 方法3：尝试提取 entities 数组
			
 
				             if not data or "entities" not in data:
			
 
				-                logger.warning(f"未找到有效的 entities JSON, response={response[:300]}...")
			
 
				+                array_match = re.search(r'"entities"\s*:\s*(\[[\s\S]*\])', response)
			
 
				+                if array_match:
			
 
				+                    try:
			
 
				+                        entity_list = json.loads(array_match.group(1))
			
 
				+                        data = {"entities": entity_list}
			
 
				+                    except json.JSONDecodeError as e:
			
 
				+                        logger.debug(f"提取 entities 数组失败: {e}")
			
 
				+            
			
 
				+            # 方法4：处理被截断的 JSON，尝试逐个解析完整的实体对象
			
 
				+            if not data or "entities" not in data:
			
 
				+                logger.debug("尝试从截断的 JSON 中提取完整实体...")
			
 
				+                entity_pattern = r'\{\s*"name"\s*:\s*"([^"]+)"\s*,\s*"type"\s*:\s*"([^"]+)"\s*,\s*"charStart"\s*:\s*(\d+)\s*,\s*"charEnd"\s*:\s*(\d+)\s*\}'
			
 
				+                matches = re.findall(entity_pattern, response)
			
 
				+                if matches:
			
 
				+                    data = {"entities": []}
			
 
				+                    for match in matches:
			
 
				+                        data["entities"].append({
			
 
				+                            "name": match[0],
			
 
				+                            "type": match[1],
			
 
				+                            "charStart": int(match[2]),
			
 
				+                            "charEnd": int(match[3])
			
 
				+                        })
			
 
				+                    logger.info(f"从截断 JSON 中恢复了 {len(matches)} 个实体")
			
 
				+            
			
 
				+            if not data or "entities" not in data:
			
 
				+                logger.warning(f"未找到有效的 entities JSON, response_length={len(response)}, response_preview={response[:500]}...")
			
 
				                 return entities
			
 
				             
			
 
				             entity_list = data.get("entities", [])