"""
Element extractor: hybrid NER + LLM strategy.

Extracts element values from parsed document content and outputs the
``elements`` and ``values`` required for front-end rendering.
"""
import json
import re
from typing import Any, Dict, List, Optional, Tuple

from loguru import logger

# DOCX parsing is done by the Java backend; this module only handles plain text.
# ============================================================
# NER rule definitions
# ============================================================
# Maps an element key to its extraction rule:
#   patterns      ordered regexes; the first one that matches wins and its
#                 capture group 1 is taken as the value
#   type          coarse entity category of the rule
#   element_name / element_type / namespace
#                 metadata copied into the output element definition
#   post_process  optional hook name ("append_unit" appends the unit "分")
#
# Punctuation classes accept both the ASCII and the full-width / curly form.
# The non-ASCII variants are written as \uXXXX escapes (understood by ``re``)
# so they cannot be silently folded to ASCII by an editor or a text pipeline:
#   \uFF1A full-width colon, \uFF08 / \uFF09 full-width parentheses,
#   \u201C / \u201D curly double quotes.
NER_RULES = {
    # Dates
    "project.workStartAt": {
        "patterns": [
            r'评审日期[:\uFF1A]\s*(\d{4}年\d{1,2}月\d{1,2}日)至',
            r'(\d{4}年\d{1,2}月\d{1,2}日)至\d{4}年',
            r'评审(?:开始)?日期[:\uFF1A]\s*(\d{4}年\d{1,2}月\d{1,2}日)',
        ],
        "type": "DATE",
        "element_name": "评审开始日期",
        "element_type": "text",
        "namespace": "project"
    },
    "project.workEndAt": {
        "patterns": [
            r'至(\d{4}年\d{1,2}月\d{1,2}日)',
            r'评审(?:结束)?日期[:\uFF1A]\s*(\d{4}年\d{1,2}月\d{1,2}日)',
        ],
        "type": "DATE",
        "element_name": "评审结束日期",
        "element_type": "text",
        "namespace": "project"
    },

    # Scores
    "project.resultScore": {
        "patterns": [
            r'评审得分[:\uFF1A]\s*(\d+\.?\d*)\s*分',
            r'得分[:\uFF1A]\s*(\d+\.?\d*)\s*分',
            r'(\d+\.?\d*)分\s*级别',
        ],
        "type": "SCORE",
        "element_name": "评审得分",
        "element_type": "text",
        "namespace": "project",
        "post_process": "append_unit"  # append the unit "分" to the number
    },

    # Levels
    "project.resultLevel": {
        "patterns": [
            r'级别[:\uFF1A]\s*(一级|二级|三级)',
            r'评审(?:结论)?级别[:\uFF1A]\s*(一级|二级|三级)',
            r'(一级|二级|三级)\s*(?:企业)?(?:证书)?',
        ],
        "type": "LEVEL",
        "element_name": "评审结论级别",
        "element_type": "text",
        "namespace": "project"
    },

    # Codes
    "basicInfo.projectCode": {
        "patterns": [
            r'项目编号[:\uFF1A]\s*([A-Z]+-\d+-\d+)',
            # Generic fallback; the lookbehind keeps it from picking up the
            # certificate number ("证书编号") as the project code.
            r'(?<!证书)编号[:\uFF1A]\s*([A-Z0-9\-]+)',
        ],
        "type": "CODE",
        "element_name": "项目编号",
        "element_type": "text",
        "namespace": "basicInfo"
    },
    "basicInfo.reviewObjectCertificateCode": {
        "patterns": [
            r'证书编号[:\uFF1A]\s*(ZGDIDBOY-\d+)',
            r'证书编号[(\uFF08]([^)\uFF09]+)[)\uFF09]',
            r'证书编号[:\uFF1A]\s*([A-Z0-9\-]+)',
        ],
        "type": "CODE",
        "element_name": "证书编号",
        "element_type": "text",
        "namespace": "basicInfo"
    },

    # Organisations
    "project.reviewObject": {
        "patterns": [
            r'评审对象[:\uFF1A]\s*([^\n]{10,60}(?:公司|集团|院|所))',
            r'对([^\n]{10,60}(?:公司|集团|院|所))进行.*?(?:评审|复审)',
        ],
        "type": "ORG",
        "element_name": "评审对象",
        "element_type": "text",
        "namespace": "project"
    },
    "project.reviewObjectAlias": {
        "patterns": [
            r'以下简称[「『"\u201C\u201D]([^」』"\u201C\u201D]{2,10})[」』"\u201C\u201D]',
            r'简称[「『"\u201C\u201D]([^」』"\u201C\u201D]{2,10})[」』"\u201C\u201D]',
            # The parentheses are literal text here, so group 1 is the alias
            # itself and not the whole "(以下简称...)" phrase.
            r'[(\uFF08]以下简称["\u201C\u201D]([^"\u201C\u201D]{2,10})["\u201C\u201D][)\uFF09]',
        ],
        "type": "ALIAS",
        "element_name": "评审对象简称",
        "element_type": "text",
        "namespace": "project"
    },
}
# ============================================================
# LLM extraction configuration
# ============================================================
# Summary-type elements produced by the LLM. For each entry:
#   source_keywords  substrings used to locate the relevant lines of the text
#   prompt           str.format template; "{text}" receives the located text
LLM_SUMMARY_ELEMENTS = [
    {
        "element_key": "project.target",
        "element_name": "目标",
        "element_type": "paragraph",
        "namespace": "project",
        "source_keywords": ["目标", "5.1.1"],
        "prompt": "请根据以下评审意见,总结企业的安全生产目标情况(100-200字):\n{text}"
    },
    {
        "element_key": "project.duty",
        "element_name": "职责",
        "element_type": "paragraph",
        "namespace": "project",
        "source_keywords": ["职责", "5.1.2"],
        "prompt": "请根据以下评审意见,总结企业的安全生产职责落实情况(100-200字):\n{text}"
    },
    {
        "element_key": "project.fullParticipation",
        "element_name": "全员参与",
        "element_type": "paragraph",
        "namespace": "project",
        "source_keywords": ["全员参与", "5.1.3"],
        "prompt": "请根据以下评审意见,总结企业的全员参与情况(100-200字):\n{text}"
    },
    {
        "element_key": "project.safetyInvestment",
        "element_name": "安全投入",
        "element_type": "paragraph",
        "namespace": "project",
        "source_keywords": ["安全投入", "安全生产费用", "5.1.4"],
        "prompt": "请根据以下评审意见,总结企业的安全投入情况(100-200字):\n{text}"
    },
    {
        "element_key": "project.safetyCulture",
        "element_name": "安全文化",
        "element_type": "paragraph",
        "namespace": "project",
        "source_keywords": ["安全文化", "5.1.5"],
        "prompt": "请根据以下评审意见,总结企业的安全文化建设情况(100-200字):\n{text}"
    },
    {
        "element_key": "project.systematicManagement",
        "element_name": "体系化管理",
        "element_type": "paragraph",
        "namespace": "project",
        "source_keywords": ["制度化管理", "体系化", "5.2"],
        "prompt": "请根据以下评审意见,总结企业的体系化管理情况(100-200字):\n{text}"
    },
    {
        "element_key": "project.employeeTraining",
        "element_name": "人员教育培训",
        "element_type": "paragraph",
        "namespace": "project",
        "source_keywords": ["教育培训", "5.3"],
        "prompt": "请根据以下评审意见,总结企业的人员教育培训情况(100-200字):\n{text}"
    },
    {
        "element_key": "project.assetManagement",
        "element_name": "设备设施管理",
        "element_type": "paragraph",
        "namespace": "project",
        "source_keywords": ["设备设施", "5.4.1"],
        "prompt": "请根据以下评审意见,总结企业的设备设施管理情况(100-200字):\n{text}"
    },
    {
        "element_key": "project.jobSafety",
        "element_name": "作业安全",
        "element_type": "paragraph",
        "namespace": "project",
        "source_keywords": ["作业安全", "5.4.2.1"],
        "prompt": "请根据以下评审意见,总结企业的作业安全情况(100-200字):\n{text}"
    },
    {
        "element_key": "project.riskAssessment",
        "element_name": "风险辨识与评价",
        "element_type": "paragraph",
        "namespace": "project",
        "source_keywords": ["风险辨识", "风险评价", "5.5.1"],
        "prompt": "请根据以下评审意见,总结企业的风险辨识与评价情况(100-200字):\n{text}"
    },
    {
        "element_key": "project.hazardInspection",
        "element_name": "隐患排查",
        "element_type": "paragraph",
        "namespace": "project",
        "source_keywords": ["隐患排查", "5.5.3"],
        "prompt": "请根据以下评审意见,总结企业的隐患排查情况(100-200字):\n{text}"
    },
    {
        "element_key": "project.emergencyResponse",
        "element_name": "应急救援",
        "element_type": "paragraph",
        "namespace": "project",
        "source_keywords": ["应急救援", "应急管理", "5.6"],
        "prompt": "请根据以下评审意见,总结企业的应急救援情况(100-200字):\n{text}"
    },
    {
        "element_key": "project.incidentManagement",
        "element_name": "事故管理",
        "element_type": "paragraph",
        "namespace": "project",
        "source_keywords": ["事故管理", "5.7"],
        "prompt": "请根据以下评审意见,总结企业的事故管理情况(100-200字):\n{text}"
    },
    {
        "element_key": "project.continuousImprovement",
        "element_name": "持续改进",
        "element_type": "paragraph",
        "namespace": "project",
        "source_keywords": ["持续改进", "5.8"],
        "prompt": "请根据以下评审意见,总结企业的持续改进情况(100-200字):\n{text}"
    },
    {
        "element_key": "project.reviewObjectSelfAssessmentProcess",
        "element_name": "自评过程",
        "element_type": "paragraph",
        "namespace": "project",
        "source_keywords": ["自评", "自查"],
        "prompt": "请根据以下内容,总结企业的自评过程(150-250字):\n{text}"
    },
    {
        "element_key": "project.safetyHighlight",
        "element_name": "安全生产管理亮点",
        "element_type": "paragraph",
        "namespace": "project",
        "source_keywords": ["亮点", "特色", "优秀"],
        "prompt": "请根据以下内容,提炼企业的安全生产管理亮点(100-200字):\n{text}"
    },
]
# Table-type elements produced by the LLM. For each entry:
#   table_keywords  header texts used to recognise the matching table
#   prompt          str.format template; "{text}" receives the table text and
#                   "{{" / "}}" render as the literal braces of the JSON sample
LLM_TABLE_ELEMENTS = [
    {
        "element_key": "+SPSRRReviewProject",
        "element_name": "现场复审项目",
        "element_type": "table",
        "namespace": "spsrr",
        "table_keywords": ["项目名称", "简称", "类型"],
        "prompt": (
            '请从以下表格中提取复审项目列表,以JSON数组格式返回:\n'
            '[{{"name": "项目名称", "alias": "简称", "type": "单位/在建项目", "order": 1}}]\n'
            '表格内容:\n'
            '{text}\n'
            '只返回JSON数组,不要其他内容。'
        )
    },
    {
        "element_key": "+SPSRRReviewer",
        "element_name": "现场复审人员",
        "element_type": "table",
        "namespace": "spsrr",
        "table_keywords": ["姓名", "专业", "分工"],
        "prompt": (
            '请从以下表格中提取评审人员列表,以JSON数组格式返回:\n'
            '[{{"name": "姓名", "specialty": "专业分工"}}]\n'
            '表格内容:\n'
            '{text}\n'
            '只返回JSON数组,不要其他内容。'
        )
    },
]
class ElementExtractor:
    """Extract report elements from plain text with regex rules and an LLM.

    Regex ("NER") rules fill the short factual elements (dates, score, level,
    codes, organisation names). The optional DeepSeek service writes the
    summary paragraphs. The result is converted into the ``elements`` /
    ``values`` lists consumed by the front end.
    """

    # Base offsets of the synthetic ids emitted by ``_build_output``.
    _ELEMENT_ID_BASE = 700
    _VALUE_ID_BASE = 800
    # Located context must be longer than this to be worth an LLM call.
    _MIN_CONTEXT_CHARS = 50
    # Maximum number of context characters inserted into one prompt.
    _MAX_PROMPT_CHARS = 3000
    # LLM answers must be longer than this to be accepted.
    _MIN_RESPONSE_CHARS = 20

    def __init__(self):
        self.ner_rules = NER_RULES
        self.llm_summary_config = LLM_SUMMARY_ELEMENTS
        self.llm_table_config = LLM_TABLE_ELEMENTS
        self._deepseek_service = None
        # Set once the import failed, so it is not retried (and the warning
        # not repeated) on every property access.
        self._deepseek_import_failed = False

    @property
    def deepseek_service(self):
        """Lazily imported DeepSeek service, or ``None`` when unavailable."""
        if self._deepseek_service is None and not getattr(
            self, "_deepseek_import_failed", False
        ):
            try:
                from .deepseek_service import deepseek_service
            except ImportError:
                logger.warning("DeepSeek服务未配置,LLM提取将跳过")
                self._deepseek_import_failed = True
            else:
                self._deepseek_service = deepseek_service
        return self._deepseek_service

    async def extract_from_text(
        self,
        text: str,
        attachment_id: int = 0,
        use_llm: bool = True
    ) -> Dict[str, Any]:
        """Extract all elements from plain text (main entry point).

        Args:
            text: Plain text parsed by the Java backend; ``None`` is treated
                as an empty string.
            attachment_id: Id of the source attachment, copied into the
                filled values.
            use_llm: Whether to run the LLM extraction of summary elements.

        Returns:
            ``{"elements": [...], "values": [...], "statistics": {...}}``
        """
        text = text or ""
        logger.info(f"开始提取要素: attachment_id={attachment_id}, "
                    f"text_length={len(text)}, use_llm={use_llm}")

        # 1. Rule-based extraction.
        ner_values = self._extract_by_ner(text, attachment_id)
        logger.info(f"NER提取完成: {len(ner_values)} 个要素")

        # 2. LLM extraction (optional, skipped when no service is available).
        llm_values: Dict[str, Dict] = {}
        if use_llm and self.deepseek_service:
            llm_values = await self._extract_by_llm(text, attachment_id)
            logger.info(f"LLM提取完成: {len(llm_values)} 个要素")

        # 3. Merge; an LLM value wins if both sources produced the same key.
        all_values = {**ner_values, **llm_values}

        # 4. Build the output structures.
        elements, values = self._build_output(all_values, attachment_id)

        return {
            "elements": elements,
            "values": values,
            "statistics": {
                "total_elements": len(elements),
                "filled_values": sum(1 for v in values if v.get("isFilled")),
                "ner_extracted": len(ner_values),
                "llm_extracted": len(llm_values),
            }
        }

    def _extract_by_ner(
        self,
        text: str,
        attachment_id: int
    ) -> Dict[str, Dict]:
        """Apply the regex rules; the first matching pattern of a rule wins.

        Returns:
            Mapping of element key to a dict with ``value``, ``confidence``,
            ``source``, ``position`` and the element metadata. ``position``
            describes the stripped capture group 1 (before any unit is
            appended): character offsets into ``text`` and its 1-based line.
        """
        results: Dict[str, Dict] = {}

        for element_key, rule in self.ner_rules.items():
            for pattern in rule['patterns']:
                # Best effort: a broken rule is logged and skipped so that it
                # cannot abort the extraction of the other elements.
                try:
                    match = re.search(pattern, text)
                    if not match or match.group(1) is None:
                        continue

                    raw = match.group(1)
                    value = raw.strip()
                    if not value:
                        continue

                    # Offsets of the stripped value, not of the raw capture.
                    char_start = match.start(1) + (len(raw) - len(raw.lstrip()))
                    char_end = char_start + len(value)

                    if rule.get('post_process') == 'append_unit' and not value.endswith('分'):
                        value += '分'

                    results[element_key] = {
                        'value': value,
                        'confidence': 0.95,
                        'source': 'ner',
                        'position': {
                            'charStart': char_start,
                            'charEnd': char_end,
                            # Line of the value itself; the label may sit on
                            # an earlier line when \s* spans a newline.
                            'line': text.count('\n', 0, char_start) + 1
                        },
                        'element_name': rule['element_name'],
                        'element_type': rule['element_type'],
                        'namespace': rule['namespace']
                    }
                    break
                except Exception as e:
                    logger.warning(f"NER规则匹配失败: {element_key}, pattern={pattern}, error={e}")

        return results

    async def _extract_by_llm(
        self,
        text: str,
        attachment_id: int
    ) -> Dict[str, Dict]:
        """Generate the summary-type elements with the LLM.

        For every configured element the relevant lines are located by
        keyword, inserted into the prompt template and sent to the service.
        Failed calls are logged and skipped. Table-type elements are not
        extracted here.
        """
        results: Dict[str, Dict] = {}

        service = self.deepseek_service
        if not service:
            return results

        for config in self.llm_summary_config:
            element_key = config['element_key']

            relevant_text = self._find_relevant_text(text, config['source_keywords'])
            if len(relevant_text) <= self._MIN_CONTEXT_CHARS:
                continue

            prompt = config['prompt'].format(text=relevant_text[:self._MAX_PROMPT_CHARS])
            try:
                response = await service.chat(prompt)
                answer = response.strip() if response else ""
            except Exception as e:
                logger.error(f"LLM提取失败: {element_key}, error={e}")
                continue

            if len(answer) > self._MIN_RESPONSE_CHARS:
                results[element_key] = {
                    'value': answer,
                    'confidence': 0.85,
                    'source': 'llm',
                    'element_name': config['element_name'],
                    'element_type': config['element_type'],
                    'namespace': config['namespace']
                }

        # Table-type elements are skipped for now (the table structure would
        # have to be provided by the Java backend).
        # TODO: accept table data passed in by the Java backend.

        return results

    def _find_relevant_text(
        self,
        text: str,
        keywords: List[str],
        max_lines: int = 30
    ) -> str:
        """Collect the lines around keyword hits.

        Every line containing one of ``keywords`` opens (or restarts) a
        window of at most ``max_lines`` lines, the hit line included. All
        collected lines are returned joined by newlines, in document order.
        Empty keywords are ignored because they would match every line.
        """
        active_keywords = [kw for kw in keywords if kw]
        collected: List[str] = []
        remaining = 0

        for line in text.split('\n'):
            if any(kw in line for kw in active_keywords):
                remaining = max_lines
            if remaining > 0:
                collected.append(line)
                remaining -= 1

        return '\n'.join(collected)

    def _find_relevant_table(
        self,
        tables: List[Dict],
        keywords: List[str],
        min_matches: int = 2
    ) -> Optional[Dict]:
        """Return the first table whose header row matches the keywords.

        Args:
            tables: Items of the form ``{"table": {"data": [[cell, ...], ...]}}``
                where a cell is a dict with a ``text`` entry.
            keywords: Header texts to look for.
            min_matches: Number of keywords that must occur in the header.

        Returns:
            The matching table dict, or ``None`` when no table qualifies.
        """
        for table_info in tables or []:
            table = table_info.get('table') or {}
            rows = table.get('data')
            if not rows:
                continue

            # The first row is treated as the header.
            header_str = ' '.join((cell.get('text') or '') for cell in rows[0])
            match_count = sum(1 for kw in keywords if kw in header_str)
            if match_count >= min_matches:
                return table

        return None

    def _table_to_text(self, table: Dict) -> str:
        """Render a table as text: cells joined by `` | ``, one row per line."""
        return '\n'.join(
            ' | '.join((cell.get('text') or '') for cell in row)
            for row in table.get('data') or []
        )

    @staticmethod
    def _element_def(source: Dict) -> Dict[str, Any]:
        """Pick the element metadata fields out of a rule or config dict."""
        return {
            'element_name': source['element_name'],
            'element_type': source['element_type'],
            'namespace': source['namespace']
        }

    def _collect_element_defs(self) -> Dict[str, Dict[str, Any]]:
        """Merge the element definitions of the NER rules and LLM configs."""
        element_defs: Dict[str, Dict[str, Any]] = {}
        for key, rule in self.ner_rules.items():
            element_defs[key] = self._element_def(rule)
        for config in (*self.llm_summary_config, *self.llm_table_config):
            element_defs[config['element_key']] = self._element_def(config)
        return element_defs

    def _build_output(
        self,
        extracted_values: Dict[str, Dict],
        attachment_id: int
    ) -> Tuple[List[Dict], List[Dict]]:
        """Build the ``elements`` and ``values`` lists for the front end.

        One element and one value are emitted per configured element key, in
        configuration order. Keys without an extracted value get an unfilled
        placeholder value.
        """
        elements: List[Dict] = []
        values: List[Dict] = []

        for i, (element_key, elem_def) in enumerate(self._collect_element_defs().items()):
            elements.append({
                "id": self._ELEMENT_ID_BASE + i,
                "elementKey": element_key,
                "elementName": elem_def['element_name'],
                "elementType": elem_def['element_type'],
                "namespace": elem_def['namespace'],
                "sortOrder": i
            })

            extracted = extracted_values.get(element_key)
            if extracted:
                value = {
                    "valueId": self._VALUE_ID_BASE + i,
                    "elementKey": element_key,
                    "valueText": extracted['value'],
                    "isFilled": True,
                    "fillSource": "ai" if extracted['source'] == 'llm' else "rule",
                    "confidence": extracted.get('confidence', 0.8),
                    "sourceAttachmentId": attachment_id
                }
                if 'position' in extracted:
                    value['extractPosition'] = extracted['position']
            else:
                value = {
                    "valueId": self._VALUE_ID_BASE + i,
                    "elementKey": element_key,
                    "valueText": "",
                    "isFilled": False,
                    "fillSource": "default"
                }

            values.append(value)

        return elements, values
# Module-level singleton shared by importers of this module.
element_extractor = ElementExtractor()