|
|
@@ -1,293 +1,26 @@
|
|
|
"""
|
|
|
-要素提取器:混合NER+LLM策略
|
|
|
+要素提取器:使用NER和LLM从文档中提取实体
|
|
|
|
|
|
-从解析后的文档内容中提取要素值,输出前端渲染所需的elements和values。
|
|
|
+支持分章节提取和实体去重。
|
|
|
"""
|
|
|
|
|
|
-import re
|
|
|
import json
|
|
|
-from typing import Dict, List, Any, Optional, Tuple
|
|
|
+import asyncio
|
|
|
+from typing import Dict, List, Any, Optional
|
|
|
from loguru import logger
|
|
|
|
|
|
-# DOCX解析由Java后端完成,这里只处理纯文本
|
|
|
-
|
|
|
-
|
|
|
-# ============================================================
|
|
|
-# NER规则定义
|
|
|
-# ============================================================
|
|
|
-
|
|
|
-NER_RULES = {
|
|
|
- # 日期类
|
|
|
- "project.workStartAt": {
|
|
|
- "patterns": [
|
|
|
- r'评审日期[::]\s*(\d{4}年\d{1,2}月\d{1,2}日)至',
|
|
|
- r'(\d{4}年\d{1,2}月\d{1,2}日)至\d{4}年',
|
|
|
- r'评审(?:开始)?日期[::]\s*(\d{4}年\d{1,2}月\d{1,2}日)',
|
|
|
- ],
|
|
|
- "type": "DATE",
|
|
|
- "element_name": "评审开始日期",
|
|
|
- "element_type": "text",
|
|
|
- "namespace": "project"
|
|
|
- },
|
|
|
- "project.workEndAt": {
|
|
|
- "patterns": [
|
|
|
- r'至(\d{4}年\d{1,2}月\d{1,2}日)',
|
|
|
- r'评审(?:结束)?日期[::]\s*(\d{4}年\d{1,2}月\d{1,2}日)',
|
|
|
- ],
|
|
|
- "type": "DATE",
|
|
|
- "element_name": "评审结束日期",
|
|
|
- "element_type": "text",
|
|
|
- "namespace": "project"
|
|
|
- },
|
|
|
-
|
|
|
- # 得分类
|
|
|
- "project.resultScore": {
|
|
|
- "patterns": [
|
|
|
- r'评审得分[::]\s*(\d+\.?\d*)\s*分',
|
|
|
- r'得分[::]\s*(\d+\.?\d*)\s*分',
|
|
|
- r'(\d+\.?\d*)分\s*级别',
|
|
|
- ],
|
|
|
- "type": "SCORE",
|
|
|
- "element_name": "评审得分",
|
|
|
- "element_type": "text",
|
|
|
- "namespace": "project",
|
|
|
- "post_process": "append_unit" # 添加"分"单位
|
|
|
- },
|
|
|
-
|
|
|
- # 级别类
|
|
|
- "project.resultLevel": {
|
|
|
- "patterns": [
|
|
|
- r'级别[::]\s*(一级|二级|三级)',
|
|
|
- r'评审(?:结论)?级别[::]\s*(一级|二级|三级)',
|
|
|
- r'(一级|二级|三级)\s*(?:企业)?(?:证书)?',
|
|
|
- ],
|
|
|
- "type": "LEVEL",
|
|
|
- "element_name": "评审结论级别",
|
|
|
- "element_type": "text",
|
|
|
- "namespace": "project"
|
|
|
- },
|
|
|
-
|
|
|
- # 编号类
|
|
|
- "basicInfo.projectCode": {
|
|
|
- "patterns": [
|
|
|
- r'项目编号[::]\s*([A-Z]+-\d+-\d+)',
|
|
|
- r'编号[::]\s*([A-Z0-9\-]+)',
|
|
|
- ],
|
|
|
- "type": "CODE",
|
|
|
- "element_name": "项目编号",
|
|
|
- "element_type": "text",
|
|
|
- "namespace": "basicInfo"
|
|
|
- },
|
|
|
- "basicInfo.reviewObjectCertificateCode": {
|
|
|
- "patterns": [
|
|
|
- r'证书编号[::]\s*(ZGDIDBOY-\d+)',
|
|
|
- r'证书编号[((]([^))]+)[))]',
|
|
|
- r'证书编号[::]\s*([A-Z0-9\-]+)',
|
|
|
- ],
|
|
|
- "type": "CODE",
|
|
|
- "element_name": "证书编号",
|
|
|
- "element_type": "text",
|
|
|
- "namespace": "basicInfo"
|
|
|
- },
|
|
|
-
|
|
|
- # 机构类
|
|
|
- "project.reviewObject": {
|
|
|
- "patterns": [
|
|
|
- r'评审对象[::]\s*([^\n]{10,60}(?:公司|集团|院|所))',
|
|
|
- r'对([^\n]{10,60}(?:公司|集团|院|所))进行.*?(?:评审|复审)',
|
|
|
- ],
|
|
|
- "type": "ORG",
|
|
|
- "element_name": "评审对象",
|
|
|
- "element_type": "text",
|
|
|
- "namespace": "project"
|
|
|
- },
|
|
|
- "project.reviewObjectAlias": {
|
|
|
- "patterns": [
|
|
|
- r'以下简称[「『"""]([^」』""]{2,10})[」』"""]',
|
|
|
- r'简称[「『"""]([^」』""]{2,10})[」』"""]',
|
|
|
- r'(以下简称"([^"]{2,10})")',
|
|
|
- ],
|
|
|
- "type": "ALIAS",
|
|
|
- "element_name": "评审对象简称",
|
|
|
- "element_type": "text",
|
|
|
- "namespace": "project"
|
|
|
- },
|
|
|
-}
|
|
|
-
|
|
|
-# ============================================================
|
|
|
-# LLM提取配置
|
|
|
-# ============================================================
|
|
|
-
|
|
|
-LLM_SUMMARY_ELEMENTS = [
|
|
|
- {
|
|
|
- "element_key": "project.target",
|
|
|
- "element_name": "目标",
|
|
|
- "element_type": "paragraph",
|
|
|
- "namespace": "project",
|
|
|
- "source_keywords": ["目标", "5.1.1"],
|
|
|
- "prompt": "请根据以下评审意见,总结企业的安全生产目标情况(100-200字):\n{text}"
|
|
|
- },
|
|
|
- {
|
|
|
- "element_key": "project.duty",
|
|
|
- "element_name": "职责",
|
|
|
- "element_type": "paragraph",
|
|
|
- "namespace": "project",
|
|
|
- "source_keywords": ["职责", "5.1.2"],
|
|
|
- "prompt": "请根据以下评审意见,总结企业的安全生产职责落实情况(100-200字):\n{text}"
|
|
|
- },
|
|
|
- {
|
|
|
- "element_key": "project.fullParticipation",
|
|
|
- "element_name": "全员参与",
|
|
|
- "element_type": "paragraph",
|
|
|
- "namespace": "project",
|
|
|
- "source_keywords": ["全员参与", "5.1.3"],
|
|
|
- "prompt": "请根据以下评审意见,总结企业的全员参与情况(100-200字):\n{text}"
|
|
|
- },
|
|
|
- {
|
|
|
- "element_key": "project.safetyInvestment",
|
|
|
- "element_name": "安全投入",
|
|
|
- "element_type": "paragraph",
|
|
|
- "namespace": "project",
|
|
|
- "source_keywords": ["安全投入", "安全生产费用", "5.1.4"],
|
|
|
- "prompt": "请根据以下评审意见,总结企业的安全投入情况(100-200字):\n{text}"
|
|
|
- },
|
|
|
- {
|
|
|
- "element_key": "project.safetyCulture",
|
|
|
- "element_name": "安全文化",
|
|
|
- "element_type": "paragraph",
|
|
|
- "namespace": "project",
|
|
|
- "source_keywords": ["安全文化", "5.1.5"],
|
|
|
- "prompt": "请根据以下评审意见,总结企业的安全文化建设情况(100-200字):\n{text}"
|
|
|
- },
|
|
|
- {
|
|
|
- "element_key": "project.systematicManagement",
|
|
|
- "element_name": "体系化管理",
|
|
|
- "element_type": "paragraph",
|
|
|
- "namespace": "project",
|
|
|
- "source_keywords": ["制度化管理", "体系化", "5.2"],
|
|
|
- "prompt": "请根据以下评审意见,总结企业的体系化管理情况(100-200字):\n{text}"
|
|
|
- },
|
|
|
- {
|
|
|
- "element_key": "project.employeeTraining",
|
|
|
- "element_name": "人员教育培训",
|
|
|
- "element_type": "paragraph",
|
|
|
- "namespace": "project",
|
|
|
- "source_keywords": ["教育培训", "5.3"],
|
|
|
- "prompt": "请根据以下评审意见,总结企业的人员教育培训情况(100-200字):\n{text}"
|
|
|
- },
|
|
|
- {
|
|
|
- "element_key": "project.assetManagement",
|
|
|
- "element_name": "设备设施管理",
|
|
|
- "element_type": "paragraph",
|
|
|
- "namespace": "project",
|
|
|
- "source_keywords": ["设备设施", "5.4.1"],
|
|
|
- "prompt": "请根据以下评审意见,总结企业的设备设施管理情况(100-200字):\n{text}"
|
|
|
- },
|
|
|
- {
|
|
|
- "element_key": "project.jobSafety",
|
|
|
- "element_name": "作业安全",
|
|
|
- "element_type": "paragraph",
|
|
|
- "namespace": "project",
|
|
|
- "source_keywords": ["作业安全", "5.4.2.1"],
|
|
|
- "prompt": "请根据以下评审意见,总结企业的作业安全情况(100-200字):\n{text}"
|
|
|
- },
|
|
|
- {
|
|
|
- "element_key": "project.riskAssessment",
|
|
|
- "element_name": "风险辨识与评价",
|
|
|
- "element_type": "paragraph",
|
|
|
- "namespace": "project",
|
|
|
- "source_keywords": ["风险辨识", "风险评价", "5.5.1"],
|
|
|
- "prompt": "请根据以下评审意见,总结企业的风险辨识与评价情况(100-200字):\n{text}"
|
|
|
- },
|
|
|
- {
|
|
|
- "element_key": "project.hazardInspection",
|
|
|
- "element_name": "隐患排查",
|
|
|
- "element_type": "paragraph",
|
|
|
- "namespace": "project",
|
|
|
- "source_keywords": ["隐患排查", "5.5.3"],
|
|
|
- "prompt": "请根据以下评审意见,总结企业的隐患排查情况(100-200字):\n{text}"
|
|
|
- },
|
|
|
- {
|
|
|
- "element_key": "project.emergencyResponse",
|
|
|
- "element_name": "应急救援",
|
|
|
- "element_type": "paragraph",
|
|
|
- "namespace": "project",
|
|
|
- "source_keywords": ["应急救援", "应急管理", "5.6"],
|
|
|
- "prompt": "请根据以下评审意见,总结企业的应急救援情况(100-200字):\n{text}"
|
|
|
- },
|
|
|
- {
|
|
|
- "element_key": "project.incidentManagement",
|
|
|
- "element_name": "事故管理",
|
|
|
- "element_type": "paragraph",
|
|
|
- "namespace": "project",
|
|
|
- "source_keywords": ["事故管理", "5.7"],
|
|
|
- "prompt": "请根据以下评审意见,总结企业的事故管理情况(100-200字):\n{text}"
|
|
|
- },
|
|
|
- {
|
|
|
- "element_key": "project.continuousImprovement",
|
|
|
- "element_name": "持续改进",
|
|
|
- "element_type": "paragraph",
|
|
|
- "namespace": "project",
|
|
|
- "source_keywords": ["持续改进", "5.8"],
|
|
|
- "prompt": "请根据以下评审意见,总结企业的持续改进情况(100-200字):\n{text}"
|
|
|
- },
|
|
|
- {
|
|
|
- "element_key": "project.reviewObjectSelfAssessmentProcess",
|
|
|
- "element_name": "自评过程",
|
|
|
- "element_type": "paragraph",
|
|
|
- "namespace": "project",
|
|
|
- "source_keywords": ["自评", "自查"],
|
|
|
- "prompt": "请根据以下内容,总结企业的自评过程(150-250字):\n{text}"
|
|
|
- },
|
|
|
- {
|
|
|
- "element_key": "project.safetyHighlight",
|
|
|
- "element_name": "安全生产管理亮点",
|
|
|
- "element_type": "paragraph",
|
|
|
- "namespace": "project",
|
|
|
- "source_keywords": ["亮点", "特色", "优秀"],
|
|
|
- "prompt": "请根据以下内容,提炼企业的安全生产管理亮点(100-200字):\n{text}"
|
|
|
- },
|
|
|
-]
|
|
|
-
|
|
|
-LLM_TABLE_ELEMENTS = [
|
|
|
- {
|
|
|
- "element_key": "+SPSRRReviewProject",
|
|
|
- "element_name": "现场复审项目",
|
|
|
- "element_type": "table",
|
|
|
- "namespace": "spsrr",
|
|
|
- "table_keywords": ["项目名称", "简称", "类型"],
|
|
|
- "prompt": """请从以下表格中提取复审项目列表,以JSON数组格式返回:
|
|
|
-[{{"name": "项目名称", "alias": "简称", "type": "单位/在建项目", "order": 1}}]
|
|
|
-
|
|
|
-表格内容:
|
|
|
-{text}
|
|
|
-
|
|
|
-只返回JSON数组,不要其他内容。"""
|
|
|
- },
|
|
|
- {
|
|
|
- "element_key": "+SPSRRReviewer",
|
|
|
- "element_name": "现场复审人员",
|
|
|
- "element_type": "table",
|
|
|
- "namespace": "spsrr",
|
|
|
- "table_keywords": ["姓名", "专业", "分工"],
|
|
|
- "prompt": """请从以下表格中提取评审人员列表,以JSON数组格式返回:
|
|
|
-[{{"name": "姓名", "specialty": "专业分工"}}]
|
|
|
-
|
|
|
-表格内容:
|
|
|
-{text}
|
|
|
-
|
|
|
-只返回JSON数组,不要其他内容。"""
|
|
|
- },
|
|
|
-]
|
|
|
+from .ner_service import ner_service
|
|
|
|
|
|
|
|
|
class ElementExtractor:
|
|
|
- """要素提取器"""
|
|
|
+ """
|
|
|
+ 要素提取器
|
|
|
+
|
|
|
+ 使用NER服务识别文档中的实体,可选使用LLM进行智能提取。
|
|
|
+ 不预定义要素结构,返回动态识别的实体。
|
|
|
+ """
|
|
|
|
|
|
    def __init__(self):
        """Create the extractor with no DeepSeek client attached (``_deepseek_service`` starts as None).

        NOTE(review): the attribute is presumably filled in lazily by the
        ``deepseek_service`` property, whose body is outside this view -- confirm.
        """
|
|
|
-        self.ner_rules = NER_RULES
|
|
|
-        self.llm_summary_config = LLM_SUMMARY_ELEMENTS
|
|
|
-        self.llm_table_config = LLM_TABLE_ELEMENTS
|
|
|
        self._deepseek_service = None
|
|
|
|
|
|
@property
|
|
|
@@ -309,248 +42,358 @@ class ElementExtractor:
|
|
|
use_llm: bool = True
|
|
|
) -> Dict[str, Any]:
|
|
|
"""
|
|
|
- 从纯文本中提取所有要素(主接口)
|
|
|
+ 从纯文本中提取实体(主接口)
|
|
|
|
|
|
Args:
|
|
|
- text: Java后端解析的纯文本
|
|
|
+ text: 文档纯文本
|
|
|
attachment_id: 附件ID
|
|
|
- use_llm: 是否使用LLM提取(总结型要素)
|
|
|
+ use_llm: 是否使用LLM提取
|
|
|
|
|
|
Returns:
|
|
|
{
|
|
|
- "elements": [...],
|
|
|
- "values": [...],
|
|
|
+ "entities": [...], # NER识别的实体列表
|
|
|
+ "llm_extractions": [...], # LLM提取的内容(可选)
|
|
|
"statistics": {...}
|
|
|
}
|
|
|
"""
|
|
|
- logger.info(f"开始提取要素: attachment_id={attachment_id}, "
|
|
|
+ logger.info(f"开始提取实体: attachment_id={attachment_id}, "
|
|
|
f"text_length={len(text)}, use_llm={use_llm}")
|
|
|
|
|
|
- # 1. NER规则提取
|
|
|
- ner_values = self._extract_by_ner(text, attachment_id)
|
|
|
- logger.info(f"NER提取完成: {len(ner_values)} 个要素")
|
|
|
+ # 1. 使用NER服务提取实体
|
|
|
+ ner_entities = await self._extract_by_ner(text)
|
|
|
+ logger.info(f"NER提取完成: {len(ner_entities)} 个实体")
|
|
|
|
|
|
- # 2. LLM提取(可选)
|
|
|
- llm_values = {}
|
|
|
+ # 2. LLM智能提取(可选)
|
|
|
+ llm_extractions = []
|
|
|
if use_llm and self.deepseek_service:
|
|
|
- llm_values = await self._extract_by_llm(text, attachment_id)
|
|
|
- logger.info(f"LLM提取完成: {len(llm_values)} 个要素")
|
|
|
-
|
|
|
- # 4. 合并结果
|
|
|
- all_values = {**ner_values, **llm_values}
|
|
|
-
|
|
|
- # 5. 生成输出
|
|
|
- elements, values = self._build_output(all_values, attachment_id)
|
|
|
+ llm_extractions = await self._extract_by_llm(text)
|
|
|
+ logger.info(f"LLM提取完成: {len(llm_extractions)} 个内容")
|
|
|
|
|
|
return {
|
|
|
- "elements": elements,
|
|
|
- "values": values,
|
|
|
+ "entities": ner_entities,
|
|
|
+ "llm_extractions": llm_extractions,
|
|
|
"statistics": {
|
|
|
- "total_elements": len(elements),
|
|
|
- "filled_values": len([v for v in values if v.get("isFilled")]),
|
|
|
- "ner_extracted": len(ner_values),
|
|
|
- "llm_extracted": len(llm_values),
|
|
|
+ "ner_entity_count": len(ner_entities),
|
|
|
+ "llm_extraction_count": len(llm_extractions),
|
|
|
+ "text_length": len(text)
|
|
|
}
|
|
|
}
|
|
|
|
|
|
- def _extract_by_ner(
|
|
|
- self,
|
|
|
- text: str,
|
|
|
- attachment_id: int
|
|
|
- ) -> Dict[str, Dict]:
|
|
|
- """NER规则提取"""
|
|
|
- results = {}
|
|
|
-
|
|
|
- for element_key, rule in self.ner_rules.items():
|
|
|
- for pattern in rule['patterns']:
|
|
|
- try:
|
|
|
- match = re.search(pattern, text)
|
|
|
- if match:
|
|
|
- value = match.group(1).strip()
|
|
|
-
|
|
|
- # 后处理
|
|
|
- if rule.get('post_process') == 'append_unit':
|
|
|
- if not value.endswith('分'):
|
|
|
- value = value + '分'
|
|
|
-
|
|
|
- results[element_key] = {
|
|
|
- 'value': value,
|
|
|
- 'confidence': 0.95,
|
|
|
- 'source': 'ner',
|
|
|
- 'position': {
|
|
|
- 'charStart': match.start(1),
|
|
|
- 'charEnd': match.end(1),
|
|
|
- 'line': text[:match.start()].count('\n') + 1
|
|
|
- },
|
|
|
- 'element_name': rule['element_name'],
|
|
|
- 'element_type': rule['element_type'],
|
|
|
- 'namespace': rule['namespace']
|
|
|
- }
|
|
|
- break
|
|
|
- except Exception as e:
|
|
|
- logger.warning(f"NER规则匹配失败: {element_key}, pattern={pattern}, error={e}")
|
|
|
+ async def _extract_by_ner(self, text: str) -> List[Dict]:
|
|
|
+ """
|
|
|
+ 使用NER服务提取实体
|
|
|
|
|
|
- return results
|
|
|
+ 返回实体列表,每个实体包含:
|
|
|
+ - text: 实体文本
|
|
|
+ - type: 实体类型(DATE, ORG, PERSON, NUMBER, CODE等)
|
|
|
+ - label: 实体标签
|
|
|
+ - confidence: 置信度
|
|
|
+ - position: 位置信息
|
|
|
+ """
|
|
|
+ try:
|
|
|
+ # 调用现有的NER服务,返回EntityInfo对象列表
|
|
|
+ entities = await ner_service.extract_entities(text)
|
|
|
+
|
|
|
+ # 格式化输出(EntityInfo是Pydantic模型,使用属性访问)
|
|
|
+ result = []
|
|
|
+ for entity in entities:
|
|
|
+ result.append({
|
|
|
+ "text": entity.name,
|
|
|
+ "type": entity.type,
|
|
|
+ "label": entity.type,
|
|
|
+ "confidence": entity.confidence,
|
|
|
+ "position": {
|
|
|
+ "start": entity.position.char_start if entity.position else 0,
|
|
|
+ "end": entity.position.char_end if entity.position else 0
|
|
|
+ }
|
|
|
+ })
|
|
|
+
|
|
|
+ return result
|
|
|
+ except Exception as e:
|
|
|
+ logger.error(f"NER提取失败: {e}")
|
|
|
+ return []
|
|
|
|
|
|
- async def _extract_by_llm(
|
|
|
- self,
|
|
|
- text: str,
|
|
|
- attachment_id: int
|
|
|
- ) -> Dict[str, Dict]:
|
|
|
- """LLM智能提取(总结型要素)"""
|
|
|
- results = {}
|
|
|
+ async def _extract_by_llm(self, text: str) -> List[Dict]:
|
|
|
+ """
|
|
|
+ 使用LLM智能提取关键信息
|
|
|
|
|
|
+ 让LLM自动识别文档中的重要信息,不预设要提取什么。
|
|
|
+ """
|
|
|
if not self.deepseek_service:
|
|
|
- return results
|
|
|
+ return []
|
|
|
|
|
|
- # 提取总结型要素
|
|
|
- for config in self.llm_summary_config:
|
|
|
- element_key = config['element_key']
|
|
|
+ try:
|
|
|
+ # 截取文档前部分进行分析
|
|
|
+ sample_text = text[:8000] if len(text) > 8000 else text
|
|
|
|
|
|
- # 查找相关文本
|
|
|
- relevant_text = self._find_relevant_text(text, config['source_keywords'])
|
|
|
+ prompt = f"""请分析以下文档,提取其中的关键信息。
|
|
|
+
|
|
|
+要求:
|
|
|
+1. 识别文档类型(如:报告、合同、通知等)
|
|
|
+2. 提取关键实体(如:组织名称、日期、金额、编号等)
|
|
|
+3. 提取关键数据(如:得分、级别、数量等)
|
|
|
+4. 以JSON格式返回
|
|
|
+
|
|
|
+返回格式:
|
|
|
+{{
|
|
|
+ "document_type": "文档类型",
|
|
|
+ "key_entities": [
|
|
|
+ {{"name": "实体名称", "type": "实体类型", "value": "实体值"}}
|
|
|
+ ],
|
|
|
+ "key_data": [
|
|
|
+ {{"name": "数据名称", "value": "数据值", "unit": "单位"}}
|
|
|
+ ],
|
|
|
+ "summary": "文档摘要(50字以内)"
|
|
|
+}}
|
|
|
+
|
|
|
+文档内容:
|
|
|
+{sample_text}
|
|
|
+
|
|
|
+只返回JSON,不要其他内容。"""
|
|
|
+
|
|
|
+ response = await self.deepseek_service.chat(prompt)
|
|
|
|
|
|
- if relevant_text and len(relevant_text) > 50:
|
|
|
- prompt = config['prompt'].format(text=relevant_text[:3000])
|
|
|
+ if response:
|
|
|
+ # 尝试解析JSON
|
|
|
try:
|
|
|
- response = await self.deepseek_service.chat(prompt)
|
|
|
- if response and len(response.strip()) > 20:
|
|
|
- results[element_key] = {
|
|
|
- 'value': response.strip(),
|
|
|
- 'confidence': 0.85,
|
|
|
- 'source': 'llm',
|
|
|
- 'element_name': config['element_name'],
|
|
|
- 'element_type': config['element_type'],
|
|
|
- 'namespace': config['namespace']
|
|
|
- }
|
|
|
- except Exception as e:
|
|
|
- logger.error(f"LLM提取失败: {element_key}, error={e}")
|
|
|
+ # 清理响应,提取JSON部分
|
|
|
+ json_str = response.strip()
|
|
|
+ if json_str.startswith("```"):
|
|
|
+ json_str = json_str.split("```")[1]
|
|
|
+ if json_str.startswith("json"):
|
|
|
+ json_str = json_str[4:]
|
|
|
+
|
|
|
+ data = json.loads(json_str)
|
|
|
+
|
|
|
+ extractions = []
|
|
|
+
|
|
|
+ # 文档类型
|
|
|
+ if data.get("document_type"):
|
|
|
+ extractions.append({
|
|
|
+ "name": "文档类型",
|
|
|
+ "value": data["document_type"],
|
|
|
+ "source": "llm"
|
|
|
+ })
|
|
|
+
|
|
|
+ # 关键实体
|
|
|
+ for entity in data.get("key_entities", []):
|
|
|
+ extractions.append({
|
|
|
+ "name": entity.get("name", ""),
|
|
|
+ "type": entity.get("type", ""),
|
|
|
+ "value": entity.get("value", ""),
|
|
|
+ "source": "llm"
|
|
|
+ })
|
|
|
+
|
|
|
+ # 关键数据
|
|
|
+ for item in data.get("key_data", []):
|
|
|
+ value = item.get("value", "")
|
|
|
+ if item.get("unit"):
|
|
|
+ value = f"{value}{item['unit']}"
|
|
|
+ extractions.append({
|
|
|
+ "name": item.get("name", ""),
|
|
|
+ "value": value,
|
|
|
+ "source": "llm"
|
|
|
+ })
|
|
|
+
|
|
|
+ # 摘要
|
|
|
+ if data.get("summary"):
|
|
|
+ extractions.append({
|
|
|
+ "name": "文档摘要",
|
|
|
+ "value": data["summary"],
|
|
|
+ "source": "llm"
|
|
|
+ })
|
|
|
+
|
|
|
+ return extractions
|
|
|
+
|
|
|
+ except json.JSONDecodeError:
|
|
|
+ logger.warning(f"LLM返回的不是有效JSON: {response[:200]}")
|
|
|
+ return []
|
|
|
+
|
|
|
+ return []
|
|
|
+
|
|
|
+ except Exception as e:
|
|
|
+ logger.error(f"LLM提取失败: {e}")
|
|
|
+ return []
|
|
|
+
|
|
|
+
|
|
|
+ async def extract_from_chapters(
|
|
|
+ self,
|
|
|
+ chapters: List[Dict],
|
|
|
+ attachment_id: int = 0,
|
|
|
+ use_llm: bool = True,
|
|
|
+ parallel: bool = True
|
|
|
+ ) -> Dict[str, Any]:
|
|
|
+ """
|
|
|
+ 分章节提取实体,最后去重合并
|
|
|
|
|
|
- # 表格型要素暂时跳过(需要Java后端提供表格结构)
|
|
|
- # TODO: 后续可以通过Java后端传递表格数据
|
|
|
+ Args:
|
|
|
+ chapters: 章节列表,每个章节包含 {chapter_id, title, text}
|
|
|
+ attachment_id: 附件ID
|
|
|
+ use_llm: 是否使用LLM提取
|
|
|
+ parallel: 是否并行处理章节
|
|
|
+
|
|
|
+ Returns:
|
|
|
+ {
|
|
|
+ "entities": [...], # 去重后的实体列表
|
|
|
+ "chapter_entities": {...}, # 按章节分组的实体
|
|
|
+ "llm_extractions": [...],
|
|
|
+ "statistics": {...}
|
|
|
+ }
|
|
|
+ """
|
|
|
+ logger.info(f"开始分章节提取: {len(chapters)} 个章节, parallel={parallel}")
|
|
|
|
|
|
- return results
|
|
|
-
|
|
|
- def _find_relevant_text(self, text: str, keywords: List[str]) -> str:
|
|
|
- """根据关键词查找相关文本段落"""
|
|
|
- lines = text.split('\n')
|
|
|
- relevant_lines = []
|
|
|
- capturing = False
|
|
|
- capture_count = 0
|
|
|
+ chapter_results = {}
|
|
|
+ all_entities = []
|
|
|
+ all_llm_extractions = []
|
|
|
|
|
|
- for line in lines:
|
|
|
- # 检查是否包含关键词
|
|
|
- if any(kw in line for kw in keywords):
|
|
|
- capturing = True
|
|
|
- capture_count = 0
|
|
|
+ if parallel and len(chapters) > 1:
|
|
|
+ # 并行处理章节
|
|
|
+ tasks = []
|
|
|
+ for chapter in chapters:
|
|
|
+ task = self._extract_chapter(chapter, attachment_id, use_llm)
|
|
|
+ tasks.append(task)
|
|
|
|
|
|
- if capturing:
|
|
|
- relevant_lines.append(line)
|
|
|
- capture_count += 1
|
|
|
- # 最多取30行
|
|
|
- if capture_count > 30:
|
|
|
- capturing = False
|
|
|
+ results = await asyncio.gather(*tasks, return_exceptions=True)
|
|
|
+
|
|
|
+ for chapter, result in zip(chapters, results):
|
|
|
+ if isinstance(result, Exception):
|
|
|
+ logger.error(f"章节 {chapter['chapter_id']} 提取失败: {result}")
|
|
|
+ continue
|
|
|
+ chapter_results[chapter['chapter_id']] = result
|
|
|
+ else:
|
|
|
+ # 串行处理章节
|
|
|
+ for chapter in chapters:
|
|
|
+ try:
|
|
|
+ result = await self._extract_chapter(chapter, attachment_id, use_llm)
|
|
|
+ chapter_results[chapter['chapter_id']] = result
|
|
|
+ except Exception as e:
|
|
|
+ logger.error(f"章节 {chapter['chapter_id']} 提取失败: {e}")
|
|
|
|
|
|
- return '\n'.join(relevant_lines)
|
|
|
-
|
|
|
- def _find_relevant_table(self, tables: List[Dict], keywords: List[str]) -> Optional[Dict]:
|
|
|
- """根据关键词查找相关表格"""
|
|
|
- for table_info in tables:
|
|
|
- table = table_info['table']
|
|
|
- if table.get('data') and len(table['data']) > 0:
|
|
|
- # 检查表头
|
|
|
- header_row = table['data'][0]
|
|
|
- header_texts = [cell.get('text', '') for cell in header_row]
|
|
|
- header_str = ' '.join(header_texts)
|
|
|
-
|
|
|
- # 检查是否包含关键词
|
|
|
- match_count = sum(1 for kw in keywords if kw in header_str)
|
|
|
- if match_count >= 2:
|
|
|
- return table
|
|
|
+ # 合并所有章节的实体
|
|
|
+ for chapter_id, result in chapter_results.items():
|
|
|
+ for entity in result.get('entities', []):
|
|
|
+ entity['chapter_id'] = chapter_id
|
|
|
+ all_entities.append(entity)
|
|
|
+ all_llm_extractions.extend(result.get('llm_extractions', []))
|
|
|
|
|
|
- return None
|
|
|
-
|
|
|
- def _table_to_text(self, table: Dict) -> str:
|
|
|
- """将表格转为文本"""
|
|
|
- lines = []
|
|
|
- for row in table.get('data', []):
|
|
|
- cells = [cell.get('text', '') for cell in row]
|
|
|
- lines.append(' | '.join(cells))
|
|
|
- return '\n'.join(lines)
|
|
|
+ # 去重
|
|
|
+ unique_entities = self._deduplicate_entities(all_entities)
|
|
|
+ unique_llm = self._deduplicate_llm_extractions(all_llm_extractions)
|
|
|
+
|
|
|
+ logger.info(f"分章节提取完成: 原始 {len(all_entities)} 个实体, 去重后 {len(unique_entities)} 个")
|
|
|
+
|
|
|
+ return {
|
|
|
+ "entities": unique_entities,
|
|
|
+ "chapter_entities": chapter_results,
|
|
|
+ "llm_extractions": unique_llm,
|
|
|
+ "statistics": {
|
|
|
+ "chapter_count": len(chapters),
|
|
|
+ "total_entities_before_dedup": len(all_entities),
|
|
|
+ "unique_entity_count": len(unique_entities),
|
|
|
+ "llm_extraction_count": len(unique_llm)
|
|
|
+ }
|
|
|
+ }
|
|
|
|
|
|
- def _build_output(
|
|
|
+ async def _extract_chapter(
|
|
|
self,
|
|
|
- extracted_values: Dict[str, Dict],
|
|
|
- attachment_id: int
|
|
|
- ) -> Tuple[List[Dict], List[Dict]]:
|
|
|
- """构建输出的elements和values"""
|
|
|
+ chapter: Dict,
|
|
|
+ attachment_id: int,
|
|
|
+ use_llm: bool
|
|
|
+ ) -> Dict[str, Any]:
|
|
|
+ """提取单个章节的实体"""
|
|
|
+ chapter_id = chapter.get('chapter_id', 'unknown')
|
|
|
+ title = chapter.get('title', '')
|
|
|
+ text = chapter.get('text', '')
|
|
|
|
|
|
- # 合并所有要素定义
|
|
|
- all_element_defs = {}
|
|
|
+ if not text or len(text.strip()) < 10:
|
|
|
+ return {"entities": [], "llm_extractions": []}
|
|
|
|
|
|
- # 从NER规则获取
|
|
|
- for key, rule in self.ner_rules.items():
|
|
|
- all_element_defs[key] = {
|
|
|
- 'element_name': rule['element_name'],
|
|
|
- 'element_type': rule['element_type'],
|
|
|
- 'namespace': rule['namespace']
|
|
|
- }
|
|
|
+ logger.debug(f"提取章节 {chapter_id}: {title[:30]}... (长度: {len(text)})")
|
|
|
|
|
|
- # 从LLM配置获取
|
|
|
- for config in self.llm_summary_config:
|
|
|
- all_element_defs[config['element_key']] = {
|
|
|
- 'element_name': config['element_name'],
|
|
|
- 'element_type': config['element_type'],
|
|
|
- 'namespace': config['namespace']
|
|
|
- }
|
|
|
+ # NER提取
|
|
|
+ entities = await self._extract_by_ner(text)
|
|
|
|
|
|
- for config in self.llm_table_config:
|
|
|
- all_element_defs[config['element_key']] = {
|
|
|
- 'element_name': config['element_name'],
|
|
|
- 'element_type': config['element_type'],
|
|
|
- 'namespace': config['namespace']
|
|
|
- }
|
|
|
+ # 为每个实体添加章节信息
|
|
|
+ for entity in entities:
|
|
|
+ entity['chapter_id'] = chapter_id
|
|
|
+ entity['chapter_title'] = title
|
|
|
|
|
|
- elements = []
|
|
|
- values = []
|
|
|
+ # LLM提取(可选)
|
|
|
+ llm_extractions = []
|
|
|
+ if use_llm and self.deepseek_service:
|
|
|
+ llm_extractions = await self._extract_by_llm(text)
|
|
|
+ for item in llm_extractions:
|
|
|
+ item['chapter_id'] = chapter_id
|
|
|
+ item['chapter_title'] = title
|
|
|
|
|
|
- for i, (element_key, elem_def) in enumerate(all_element_defs.items()):
|
|
|
- element = {
|
|
|
- "id": 700 + i,
|
|
|
- "elementKey": element_key,
|
|
|
- "elementName": elem_def['element_name'],
|
|
|
- "elementType": elem_def['element_type'],
|
|
|
- "namespace": elem_def['namespace'],
|
|
|
- "sortOrder": i
|
|
|
- }
|
|
|
- elements.append(element)
|
|
|
-
|
|
|
- # 查找提取的值
|
|
|
- extracted = extracted_values.get(element_key)
|
|
|
- if extracted:
|
|
|
- value = {
|
|
|
- "valueId": 800 + i,
|
|
|
- "elementKey": element_key,
|
|
|
- "valueText": extracted['value'],
|
|
|
- "isFilled": True,
|
|
|
- "fillSource": "ai" if extracted['source'] == 'llm' else "rule",
|
|
|
- "confidence": extracted.get('confidence', 0.8),
|
|
|
- "sourceAttachmentId": attachment_id
|
|
|
- }
|
|
|
- if 'position' in extracted:
|
|
|
- value['extractPosition'] = extracted['position']
|
|
|
+ return {
|
|
|
+ "entities": entities,
|
|
|
+ "llm_extractions": llm_extractions
|
|
|
+ }
|
|
|
+
|
|
|
+ def _deduplicate_entities(self, entities: List[Dict]) -> List[Dict]:
|
|
|
+ """
|
|
|
+ 实体去重
|
|
|
+
|
|
|
+ 去重规则:
|
|
|
+ 1. 相同类型+相同文本 -> 保留第一个出现的
|
|
|
+ 2. 包含关系 -> 保留更长的实体
|
|
|
+ """
|
|
|
+ if not entities:
|
|
|
+ return []
|
|
|
+
|
|
|
+ # 按 (type, text) 去重
|
|
|
+ seen = {}
|
|
|
+ for entity in entities:
|
|
|
+ key = (entity.get('type', ''), entity.get('text', ''))
|
|
|
+ if key not in seen:
|
|
|
+ seen[key] = entity
|
|
|
else:
|
|
|
- value = {
|
|
|
- "valueId": 800 + i,
|
|
|
- "elementKey": element_key,
|
|
|
- "valueText": "",
|
|
|
- "isFilled": False,
|
|
|
- "fillSource": "default"
|
|
|
- }
|
|
|
+ # 保留置信度更高的
|
|
|
+ if entity.get('confidence', 0) > seen[key].get('confidence', 0):
|
|
|
+ seen[key] = entity
|
|
|
+
|
|
|
+ unique = list(seen.values())
|
|
|
+
|
|
|
+ # 处理包含关系(可选,较复杂)
|
|
|
+ # 例如:"中国电建集团" 和 "中国电建集团成都勘测设计研究院有限公司"
|
|
|
+ # 保留更长的
|
|
|
+ final = []
|
|
|
+ texts = set()
|
|
|
+
|
|
|
+ # 按文本长度降序排序
|
|
|
+ unique.sort(key=lambda x: len(x.get('text', '')), reverse=True)
|
|
|
+
|
|
|
+ for entity in unique:
|
|
|
+ text = entity.get('text', '')
|
|
|
+ # 检查是否被更长的实体包含
|
|
|
+ is_substring = False
|
|
|
+ for existing_text in texts:
|
|
|
+ if text in existing_text and text != existing_text:
|
|
|
+ is_substring = True
|
|
|
+ break
|
|
|
|
|
|
- values.append(value)
|
|
|
+ if not is_substring:
|
|
|
+ final.append(entity)
|
|
|
+ texts.add(text)
|
|
|
+
|
|
|
+ # 恢复原始顺序(按位置)
|
|
|
+ final.sort(key=lambda x: x.get('position', {}).get('start', 0))
|
|
|
+
|
|
|
+ return final
|
|
|
+
|
|
|
+ def _deduplicate_llm_extractions(self, extractions: List[Dict]) -> List[Dict]:
|
|
|
+ """LLM提取结果去重"""
|
|
|
+ if not extractions:
|
|
|
+ return []
|
|
|
+
|
|
|
+ seen = {}
|
|
|
+ for item in extractions:
|
|
|
+ key = (item.get('name', ''), item.get('value', ''))
|
|
|
+ if key not in seen:
|
|
|
+ seen[key] = item
|
|
|
|
|
|
- return elements, values
|
|
|
+ return list(seen.values())
|
|
|
|
|
|
|
|
|
# 创建单例
|