"""Element extractor: hybrid NER (regex rules) + LLM strategy.

Extracts element values from already-parsed document text and produces the
``elements`` and ``values`` lists that the front end renders.
"""

import re
import json
from typing import Dict, List, Any, Optional, Tuple

try:
    from loguru import logger
except ImportError:  # keep the module importable where loguru is not installed
    import logging

    logger = logging.getLogger(__name__)

# DOCX parsing is done by the Java back end; this module only sees plain text.

# ============================================================
# NER rule definitions
# ============================================================
# Each rule lists regex patterns that are tried in order; the first pattern
# that matches wins and its capture group 1 becomes the element value.
# Punctuation classes accept both the ASCII and the full-width form because
# Chinese documents normally use full-width ":", "(", ")" and curly quotes.

NER_RULES = {
    # Dates
    "project.workStartAt": {
        "patterns": [
            r'评审日期[::]\s*(\d{4}年\d{1,2}月\d{1,2}日)至',
            r'(\d{4}年\d{1,2}月\d{1,2}日)至\d{4}年',
            r'评审(?:开始)?日期[::]\s*(\d{4}年\d{1,2}月\d{1,2}日)',
        ],
        "type": "DATE",
        "element_name": "评审开始日期",
        "element_type": "text",
        "namespace": "project"
    },
    "project.workEndAt": {
        "patterns": [
            r'至(\d{4}年\d{1,2}月\d{1,2}日)',
            r'评审(?:结束)?日期[::]\s*(\d{4}年\d{1,2}月\d{1,2}日)',
        ],
        "type": "DATE",
        "element_name": "评审结束日期",
        "element_type": "text",
        "namespace": "project"
    },

    # Scores
    "project.resultScore": {
        "patterns": [
            r'评审得分[::]\s*(\d+\.?\d*)\s*分',
            r'得分[::]\s*(\d+\.?\d*)\s*分',
            r'(\d+\.?\d*)分\s*级别',
        ],
        "type": "SCORE",
        "element_name": "评审得分",
        "element_type": "text",
        "namespace": "project",
        "post_process": "append_unit"  # append the "分" unit to the value
    },

    # Levels
    "project.resultLevel": {
        "patterns": [
            r'级别[::]\s*(一级|二级|三级)',
            r'评审(?:结论)?级别[::]\s*(一级|二级|三级)',
            r'(一级|二级|三级)\s*(?:企业)?(?:证书)?',
        ],
        "type": "LEVEL",
        "element_name": "评审结论级别",
        "element_type": "text",
        "namespace": "project"
    },

    # Codes
    "basicInfo.projectCode": {
        "patterns": [
            r'项目编号[::]\s*([A-Z]+-\d+-\d+)',
            # The lookbehind keeps "证书编号" (certificate number) from being
            # picked up as the project code by this generic fallback.
            r'(?<!证书)编号[::]\s*([A-Z0-9\-]+)',
        ],
        "type": "CODE",
        "element_name": "项目编号",
        "element_type": "text",
        "namespace": "basicInfo"
    },
    "basicInfo.reviewObjectCertificateCode": {
        "patterns": [
            r'证书编号[::]\s*(ZGDIDBOY-\d+)',
            r'证书编号[((]([^))]+)[))]',
            r'证书编号[::]\s*([A-Z0-9\-]+)',
        ],
        "type": "CODE",
        "element_name": "证书编号",
        "element_type": "text",
        "namespace": "basicInfo"
    },

    # Organisations
    "project.reviewObject": {
        "patterns": [
            r'评审对象[::]\s*([^\n]{10,60}(?:公司|集团|院|所))',
            r'对([^\n]{10,60}(?:公司|集团|院|所))进行.*?(?:评审|复审)',
        ],
        "type": "ORG",
        "element_name": "评审对象",
        "element_type": "text",
        "namespace": "project"
    },
    "project.reviewObjectAlias": {
        "patterns": [
            r'以下简称[「『"“”]([^」』"“”]{2,10})[」』"“”]',
            r'简称[「『"“”]([^」』"“”]{2,10})[」』"“”]',
            # Parentheses are matched literally so that group 1 is the alias
            # itself, not the whole "以下简称…" wrapper.
            r'[((]以下简称["“”]([^"“”]{2,10})["“”][))]',
        ],
        "type": "ALIAS",
        "element_name": "评审对象简称",
        "element_type": "text",
        "namespace": "project"
    },
}


# ============================================================
# LLM extraction configuration
# ============================================================
# Summary-type elements: text near any of ``source_keywords`` is collected
# and substituted for ``{text}`` in ``prompt``.

LLM_SUMMARY_ELEMENTS = [
    {
        "element_key": "project.target",
        "element_name": "目标",
        "element_type": "paragraph",
        "namespace": "project",
        "source_keywords": ["目标", "5.1.1"],
        "prompt": "请根据以下评审意见,总结企业的安全生产目标情况(100-200字):\n{text}"
    },
    {
        "element_key": "project.duty",
        "element_name": "职责",
        "element_type": "paragraph",
        "namespace": "project",
        "source_keywords": ["职责", "5.1.2"],
        "prompt": "请根据以下评审意见,总结企业的安全生产职责落实情况(100-200字):\n{text}"
    },
    {
        "element_key": "project.fullParticipation",
        "element_name": "全员参与",
        "element_type": "paragraph",
        "namespace": "project",
        "source_keywords": ["全员参与", "5.1.3"],
        "prompt": "请根据以下评审意见,总结企业的全员参与情况(100-200字):\n{text}"
    },
    {
        "element_key": "project.safetyInvestment",
        "element_name": "安全投入",
        "element_type": "paragraph",
        "namespace": "project",
        "source_keywords": ["安全投入", "安全生产费用", "5.1.4"],
        "prompt": "请根据以下评审意见,总结企业的安全投入情况(100-200字):\n{text}"
    },
    {
        "element_key": "project.safetyCulture",
        "element_name": "安全文化",
        "element_type": "paragraph",
        "namespace": "project",
        "source_keywords": ["安全文化", "5.1.5"],
        "prompt": "请根据以下评审意见,总结企业的安全文化建设情况(100-200字):\n{text}"
    },
    {
        "element_key": "project.systematicManagement",
        "element_name": "体系化管理",
        "element_type": "paragraph",
        "namespace": "project",
        "source_keywords": ["制度化管理", "体系化", "5.2"],
        "prompt": "请根据以下评审意见,总结企业的体系化管理情况(100-200字):\n{text}"
    },
    {
        "element_key": "project.employeeTraining",
        "element_name": "人员教育培训",
        "element_type": "paragraph",
        "namespace": "project",
        "source_keywords": ["教育培训", "5.3"],
        "prompt": "请根据以下评审意见,总结企业的人员教育培训情况(100-200字):\n{text}"
    },
    {
        "element_key": "project.assetManagement",
        "element_name": "设备设施管理",
        "element_type": "paragraph",
        "namespace": "project",
        "source_keywords": ["设备设施", "5.4.1"],
        "prompt": "请根据以下评审意见,总结企业的设备设施管理情况(100-200字):\n{text}"
    },
    {
        "element_key": "project.jobSafety",
        "element_name": "作业安全",
        "element_type": "paragraph",
        "namespace": "project",
        "source_keywords": ["作业安全", "5.4.2.1"],
        "prompt": "请根据以下评审意见,总结企业的作业安全情况(100-200字):\n{text}"
    },
    {
        "element_key": "project.riskAssessment",
        "element_name": "风险辨识与评价",
        "element_type": "paragraph",
        "namespace": "project",
        "source_keywords": ["风险辨识", "风险评价", "5.5.1"],
        "prompt": "请根据以下评审意见,总结企业的风险辨识与评价情况(100-200字):\n{text}"
    },
    {
        "element_key": "project.hazardInspection",
        "element_name": "隐患排查",
        "element_type": "paragraph",
        "namespace": "project",
        "source_keywords": ["隐患排查", "5.5.3"],
        "prompt": "请根据以下评审意见,总结企业的隐患排查情况(100-200字):\n{text}"
    },
    {
        "element_key": "project.emergencyResponse",
        "element_name": "应急救援",
        "element_type": "paragraph",
        "namespace": "project",
        "source_keywords": ["应急救援", "应急管理", "5.6"],
        "prompt": "请根据以下评审意见,总结企业的应急救援情况(100-200字):\n{text}"
    },
    {
        "element_key": "project.incidentManagement",
        "element_name": "事故管理",
        "element_type": "paragraph",
        "namespace": "project",
        "source_keywords": ["事故管理", "5.7"],
        "prompt": "请根据以下评审意见,总结企业的事故管理情况(100-200字):\n{text}"
    },
    {
        "element_key": "project.continuousImprovement",
        "element_name": "持续改进",
        "element_type": "paragraph",
        "namespace": "project",
        "source_keywords": ["持续改进", "5.8"],
        "prompt": "请根据以下评审意见,总结企业的持续改进情况(100-200字):\n{text}"
    },
    {
        "element_key": "project.reviewObjectSelfAssessmentProcess",
        "element_name": "自评过程",
        "element_type": "paragraph",
        "namespace": "project",
        "source_keywords": ["自评", "自查"],
        "prompt": "请根据以下内容,总结企业的自评过程(150-250字):\n{text}"
    },
    {
        "element_key": "project.safetyHighlight",
        "element_name": "安全生产管理亮点",
        "element_type": "paragraph",
        "namespace": "project",
        "source_keywords": ["亮点", "特色", "优秀"],
        "prompt": "请根据以下内容,提炼企业的安全生产管理亮点(100-200字):\n{text}"
    },
]

# Table-type elements. They are only used to declare the element definitions
# for now: table extraction itself is skipped in ``_extract_by_llm``.
LLM_TABLE_ELEMENTS = [
    {
        "element_key": "+SPSRRReviewProject",
        "element_name": "现场复审项目",
        "element_type": "table",
        "namespace": "spsrr",
        "table_keywords": ["项目名称", "简称", "类型"],
        "prompt": """请从以下表格中提取复审项目列表,以JSON数组格式返回:
[{{"name": "项目名称", "alias": "简称", "type": "单位/在建项目", "order": 1}}]
表格内容:
{text}
只返回JSON数组,不要其他内容。"""
    },
    {
        "element_key": "+SPSRRReviewer",
        "element_name": "现场复审人员",
        "element_type": "table",
        "namespace": "spsrr",
        "table_keywords": ["姓名", "专业", "分工"],
        "prompt": """请从以下表格中提取评审人员列表,以JSON数组格式返回:
[{{"name": "姓名", "specialty": "专业分工"}}]
表格内容:
{text}
只返回JSON数组,不要其他内容。"""
    },
]


class ElementExtractor:
    """Extract report elements from plain text with regex rules and an LLM."""

    # Base offsets of the synthetic ids emitted by ``_build_output``.
    ELEMENT_ID_BASE = 700
    VALUE_ID_BASE = 800

    # Thresholds used by ``_extract_by_llm``.
    MIN_CONTEXT_CHARS = 50     # context must be longer than this to call the LLM
    MAX_PROMPT_CHARS = 3000    # context is truncated to this many characters
    MIN_RESPONSE_CHARS = 20    # answers not longer than this are discarded

    def __init__(self):
        self.ner_rules = NER_RULES
        self.llm_summary_config = LLM_SUMMARY_ELEMENTS
        self.llm_table_config = LLM_TABLE_ELEMENTS
        self._deepseek_service = None
        # Remember a failed import so it is neither retried nor re-logged on
        # every access of ``deepseek_service``.
        self._deepseek_import_failed = False

    @property
    def deepseek_service(self):
        """Return the lazily imported DeepSeek service, or ``None``.

        The import is attempted at most once; if it fails a warning is logged
        and ``None`` is returned from then on.
        """
        already_failed = getattr(self, '_deepseek_import_failed', False)
        if self._deepseek_service is None and not already_failed:
            try:
                from .deepseek_service import deepseek_service
            except ImportError:
                logger.warning("DeepSeek服务未配置,LLM提取将跳过")
                self._deepseek_import_failed = True
            else:
                self._deepseek_service = deepseek_service
        return self._deepseek_service

    async def extract_from_text(
        self,
        text: str,
        attachment_id: int = 0,
        use_llm: bool = True
    ) -> Dict[str, Any]:
        """Extract every element from plain text (main entry point).

        Args:
            text: Plain text of the document; ``None`` is treated as empty.
            attachment_id: Id of the attachment the text came from.
            use_llm: Whether to also run LLM extraction (summary elements).

        Returns:
            A dict with the keys ``"elements"``, ``"values"`` and
            ``"statistics"``.
        """
        text = text or ""
        logger.info(f"开始提取要素: attachment_id={attachment_id}, "
                    f"text_length={len(text)}, use_llm={use_llm}")

        # 1. Rule-based (regex) extraction.
        ner_values = self._extract_by_ner(text, attachment_id)
        logger.info(f"NER提取完成: {len(ner_values)} 个要素")

        # 2. Optional LLM extraction.
        llm_values: Dict[str, Dict] = {}
        if use_llm and self.deepseek_service:
            llm_values = await self._extract_by_llm(text, attachment_id)
            logger.info(f"LLM提取完成: {len(llm_values)} 个要素")

        # 3. Merge; on a key collision the LLM result overrides the rule one.
        all_values = {**ner_values, **llm_values}

        # 4. Build the output structures.
        elements, values = self._build_output(all_values, attachment_id)

        return {
            "elements": elements,
            "values": values,
            "statistics": {
                "total_elements": len(elements),
                "filled_values": sum(1 for v in values if v.get("isFilled")),
                "ner_extracted": len(ner_values),
                "llm_extracted": len(llm_values),
            }
        }

    def _extract_by_ner(
        self,
        text: str,
        attachment_id: int
    ) -> Dict[str, Dict]:
        """Run the regex rules and return the first hit of every element.

        Args:
            text: Text to search.
            attachment_id: Unused here; kept for a uniform signature.

        Returns:
            Mapping of element key to a dict holding the value, a fixed
            confidence, the source tag ``'ner'``, the position of the value
            in ``text`` and the element definition fields.
        """
        results: Dict[str, Dict] = {}
        if not text:
            return results

        for element_key, rule in self.ner_rules.items():
            for pattern in rule['patterns']:
                try:
                    match = re.search(pattern, text)
                    captured = match.group(1) if match else None
                except (re.error, IndexError) as e:
                    # Invalid pattern, or a pattern without capture group 1.
                    logger.warning(f"NER规则匹配失败: {element_key}, pattern={pattern}, error={e}")
                    continue
                if captured is None:
                    continue

                value = captured.strip()
                # Offsets describe the stripped value, so that
                # text[charStart:charEnd] equals the value before any unit
                # is appended.
                char_start = match.start(1) + len(captured) - len(captured.lstrip())
                char_end = char_start + len(value)

                # Post-processing.
                if rule.get('post_process') == 'append_unit' and not value.endswith('分'):
                    value = value + '分'

                results[element_key] = {
                    'value': value,
                    'confidence': 0.95,
                    'source': 'ner',
                    'position': {
                        'charStart': char_start,
                        'charEnd': char_end,
                        # 1-based line of the value itself, not of the label
                        # that precedes it.
                        'line': text.count('\n', 0, char_start) + 1
                    },
                    'element_name': rule['element_name'],
                    'element_type': rule['element_type'],
                    'namespace': rule['namespace']
                }
                break

        return results

    async def _extract_by_llm(
        self,
        text: str,
        attachment_id: int
    ) -> Dict[str, Dict]:
        """Ask the LLM to summarise every configured summary element.

        Elements whose relevant context is too short, whose answer is too
        short, or whose request raises are skipped.

        Args:
            text: Full document text.
            attachment_id: Unused here; kept for a uniform signature.

        Returns:
            Mapping of element key to a dict holding the value, a fixed
            confidence, the source tag ``'llm'`` and the definition fields.
        """
        results: Dict[str, Dict] = {}

        service = self.deepseek_service
        if not service:
            return results

        # Summary-type elements.
        for config in self.llm_summary_config:
            element_key = config['element_key']

            relevant_text = self._find_relevant_text(text, config['source_keywords'])
            if not relevant_text or len(relevant_text) <= self.MIN_CONTEXT_CHARS:
                continue

            prompt = config['prompt'].format(text=relevant_text[:self.MAX_PROMPT_CHARS])
            try:
                response = await service.chat(prompt)
                if response and len(response.strip()) > self.MIN_RESPONSE_CHARS:
                    results[element_key] = {
                        'value': response.strip(),
                        'confidence': 0.85,
                        'source': 'llm',
                        'element_name': config['element_name'],
                        'element_type': config['element_type'],
                        'namespace': config['namespace']
                    }
            except Exception as e:
                # Best effort: a failure of one element must not abort the
                # remaining ones.
                logger.error(f"LLM提取失败: {element_key}, error={e}")

        # Table-type elements are skipped for now (they need the table
        # structure from the Java back end).
        # TODO: pass table data through from the Java back end.

        return results

    def _find_relevant_text(
        self,
        text: str,
        keywords: List[str],
        max_lines: int = 30
    ) -> str:
        """Collect the lines around every keyword occurrence.

        Capturing starts on a line containing any keyword and continues for
        at most ``max_lines`` lines, that line included; a further keyword
        line restarts the window.

        Args:
            text: Text to scan line by line.
            keywords: Substrings that start (or restart) a capture window.
            max_lines: Maximum number of lines captured per window.

        Returns:
            The captured lines joined with newlines, or ``""`` if none.
        """
        relevant_lines: List[str] = []
        remaining = 0  # lines still to capture in the current window

        for line in text.split('\n'):
            if any(kw in line for kw in keywords):
                remaining = max_lines
            if remaining > 0:
                relevant_lines.append(line)
                remaining -= 1

        return '\n'.join(relevant_lines)

    def _find_relevant_table(
        self,
        tables: List[Dict],
        keywords: List[str],
        min_matches: int = 2
    ) -> Optional[Dict]:
        """Return the first table whose header row matches enough keywords.

        Args:
            tables: Items of the form ``{'table': {'data': [[cell, ...]]}}``
                where a cell is a dict that may have a ``'text'`` key; items
                without a table or without rows are ignored.
            keywords: Keywords looked up in the joined header text.
            min_matches: Minimum number of keywords that must be present.

        Returns:
            The matching table dict, or ``None``.
        """
        for table_info in tables:
            table = table_info.get('table') or {}
            rows = table.get('data')
            if not rows:
                continue

            # Only the first row (the header) is inspected.
            header_str = ' '.join(cell.get('text', '') for cell in rows[0])
            match_count = sum(1 for kw in keywords if kw in header_str)
            if match_count >= min_matches:
                return table

        return None

    def _table_to_text(self, table: Dict) -> str:
        """Render a table as text: cells joined by `` | ``, one row per line."""
        return '\n'.join(
            ' | '.join(cell.get('text', '') for cell in row)
            for row in table.get('data', [])
        )

    @staticmethod
    def _element_definition(config: Dict[str, Any]) -> Dict[str, Any]:
        """Pick the element definition fields out of a rule/config dict."""
        return {
            'element_name': config['element_name'],
            'element_type': config['element_type'],
            'namespace': config['namespace']
        }

    def _build_output(
        self,
        extracted_values: Dict[str, Dict],
        attachment_id: int
    ) -> Tuple[List[Dict], List[Dict]]:
        """Build the ``elements`` and ``values`` lists.

        Every element known from the NER rules and the LLM configurations is
        emitted, in that order; elements without an extracted value get an
        empty, unfilled value entry.

        Args:
            extracted_values: Extraction results keyed by element key.
            attachment_id: Stored as ``sourceAttachmentId`` on filled values.

        Returns:
            ``(elements, values)``; both lists have one entry per element.
        """
        # Merge all element definitions, keeping the declaration order.
        all_element_defs: Dict[str, Dict] = {
            key: self._element_definition(rule)
            for key, rule in self.ner_rules.items()
        }
        for config in (*self.llm_summary_config, *self.llm_table_config):
            all_element_defs[config['element_key']] = self._element_definition(config)

        elements: List[Dict] = []
        values: List[Dict] = []

        for i, (element_key, elem_def) in enumerate(all_element_defs.items()):
            elements.append({
                "id": self.ELEMENT_ID_BASE + i,
                "elementKey": element_key,
                "elementName": elem_def['element_name'],
                "elementType": elem_def['element_type'],
                "namespace": elem_def['namespace'],
                "sortOrder": i
            })

            extracted = extracted_values.get(element_key)
            if extracted:
                value = {
                    "valueId": self.VALUE_ID_BASE + i,
                    "elementKey": element_key,
                    "valueText": extracted['value'],
                    "isFilled": True,
                    "fillSource": "ai" if extracted['source'] == 'llm' else "rule",
                    "confidence": extracted.get('confidence', 0.8),
                    "sourceAttachmentId": attachment_id
                }
                if 'position' in extracted:
                    value['extractPosition'] = extracted['position']
            else:
                value = {
                    "valueId": self.VALUE_ID_BASE + i,
                    "elementKey": element_key,
                    "valueText": "",
                    "isFilled": False,
                    "fillSource": "default"
                }
            values.append(value)

        return elements, values


# Module-level singleton.
element_extractor = ElementExtractor()