element_extractor.py 20 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557
  1. """
  2. 要素提取器:混合NER+LLM策略
  3. 从解析后的文档内容中提取要素值,输出前端渲染所需的elements和values。
  4. """
  5. import re
  6. import json
  7. from typing import Dict, List, Any, Optional, Tuple
  8. from loguru import logger
  9. # DOCX解析由Java后端完成,这里只处理纯文本
  10. # ============================================================
  11. # NER规则定义
  12. # ============================================================
  13. NER_RULES = {
  14. # 日期类
  15. "project.workStartAt": {
  16. "patterns": [
  17. r'评审日期[::]\s*(\d{4}年\d{1,2}月\d{1,2}日)至',
  18. r'(\d{4}年\d{1,2}月\d{1,2}日)至\d{4}年',
  19. r'评审(?:开始)?日期[::]\s*(\d{4}年\d{1,2}月\d{1,2}日)',
  20. ],
  21. "type": "DATE",
  22. "element_name": "评审开始日期",
  23. "element_type": "text",
  24. "namespace": "project"
  25. },
  26. "project.workEndAt": {
  27. "patterns": [
  28. r'至(\d{4}年\d{1,2}月\d{1,2}日)',
  29. r'评审(?:结束)?日期[::]\s*(\d{4}年\d{1,2}月\d{1,2}日)',
  30. ],
  31. "type": "DATE",
  32. "element_name": "评审结束日期",
  33. "element_type": "text",
  34. "namespace": "project"
  35. },
  36. # 得分类
  37. "project.resultScore": {
  38. "patterns": [
  39. r'评审得分[::]\s*(\d+\.?\d*)\s*分',
  40. r'得分[::]\s*(\d+\.?\d*)\s*分',
  41. r'(\d+\.?\d*)分\s*级别',
  42. ],
  43. "type": "SCORE",
  44. "element_name": "评审得分",
  45. "element_type": "text",
  46. "namespace": "project",
  47. "post_process": "append_unit" # 添加"分"单位
  48. },
  49. # 级别类
  50. "project.resultLevel": {
  51. "patterns": [
  52. r'级别[::]\s*(一级|二级|三级)',
  53. r'评审(?:结论)?级别[::]\s*(一级|二级|三级)',
  54. r'(一级|二级|三级)\s*(?:企业)?(?:证书)?',
  55. ],
  56. "type": "LEVEL",
  57. "element_name": "评审结论级别",
  58. "element_type": "text",
  59. "namespace": "project"
  60. },
  61. # 编号类
  62. "basicInfo.projectCode": {
  63. "patterns": [
  64. r'项目编号[::]\s*([A-Z]+-\d+-\d+)',
  65. r'编号[::]\s*([A-Z0-9\-]+)',
  66. ],
  67. "type": "CODE",
  68. "element_name": "项目编号",
  69. "element_type": "text",
  70. "namespace": "basicInfo"
  71. },
  72. "basicInfo.reviewObjectCertificateCode": {
  73. "patterns": [
  74. r'证书编号[::]\s*(ZGDIDBOY-\d+)',
  75. r'证书编号[((]([^))]+)[))]',
  76. r'证书编号[::]\s*([A-Z0-9\-]+)',
  77. ],
  78. "type": "CODE",
  79. "element_name": "证书编号",
  80. "element_type": "text",
  81. "namespace": "basicInfo"
  82. },
  83. # 机构类
  84. "project.reviewObject": {
  85. "patterns": [
  86. r'评审对象[::]\s*([^\n]{10,60}(?:公司|集团|院|所))',
  87. r'对([^\n]{10,60}(?:公司|集团|院|所))进行.*?(?:评审|复审)',
  88. ],
  89. "type": "ORG",
  90. "element_name": "评审对象",
  91. "element_type": "text",
  92. "namespace": "project"
  93. },
  94. "project.reviewObjectAlias": {
  95. "patterns": [
  96. r'以下简称[「『"""]([^」』""]{2,10})[」』"""]',
  97. r'简称[「『"""]([^」』""]{2,10})[」』"""]',
  98. r'(以下简称"([^"]{2,10})")',
  99. ],
  100. "type": "ALIAS",
  101. "element_name": "评审对象简称",
  102. "element_type": "text",
  103. "namespace": "project"
  104. },
  105. }
  106. # ============================================================
  107. # LLM提取配置
  108. # ============================================================
  109. LLM_SUMMARY_ELEMENTS = [
  110. {
  111. "element_key": "project.target",
  112. "element_name": "目标",
  113. "element_type": "paragraph",
  114. "namespace": "project",
  115. "source_keywords": ["目标", "5.1.1"],
  116. "prompt": "请根据以下评审意见,总结企业的安全生产目标情况(100-200字):\n{text}"
  117. },
  118. {
  119. "element_key": "project.duty",
  120. "element_name": "职责",
  121. "element_type": "paragraph",
  122. "namespace": "project",
  123. "source_keywords": ["职责", "5.1.2"],
  124. "prompt": "请根据以下评审意见,总结企业的安全生产职责落实情况(100-200字):\n{text}"
  125. },
  126. {
  127. "element_key": "project.fullParticipation",
  128. "element_name": "全员参与",
  129. "element_type": "paragraph",
  130. "namespace": "project",
  131. "source_keywords": ["全员参与", "5.1.3"],
  132. "prompt": "请根据以下评审意见,总结企业的全员参与情况(100-200字):\n{text}"
  133. },
  134. {
  135. "element_key": "project.safetyInvestment",
  136. "element_name": "安全投入",
  137. "element_type": "paragraph",
  138. "namespace": "project",
  139. "source_keywords": ["安全投入", "安全生产费用", "5.1.4"],
  140. "prompt": "请根据以下评审意见,总结企业的安全投入情况(100-200字):\n{text}"
  141. },
  142. {
  143. "element_key": "project.safetyCulture",
  144. "element_name": "安全文化",
  145. "element_type": "paragraph",
  146. "namespace": "project",
  147. "source_keywords": ["安全文化", "5.1.5"],
  148. "prompt": "请根据以下评审意见,总结企业的安全文化建设情况(100-200字):\n{text}"
  149. },
  150. {
  151. "element_key": "project.systematicManagement",
  152. "element_name": "体系化管理",
  153. "element_type": "paragraph",
  154. "namespace": "project",
  155. "source_keywords": ["制度化管理", "体系化", "5.2"],
  156. "prompt": "请根据以下评审意见,总结企业的体系化管理情况(100-200字):\n{text}"
  157. },
  158. {
  159. "element_key": "project.employeeTraining",
  160. "element_name": "人员教育培训",
  161. "element_type": "paragraph",
  162. "namespace": "project",
  163. "source_keywords": ["教育培训", "5.3"],
  164. "prompt": "请根据以下评审意见,总结企业的人员教育培训情况(100-200字):\n{text}"
  165. },
  166. {
  167. "element_key": "project.assetManagement",
  168. "element_name": "设备设施管理",
  169. "element_type": "paragraph",
  170. "namespace": "project",
  171. "source_keywords": ["设备设施", "5.4.1"],
  172. "prompt": "请根据以下评审意见,总结企业的设备设施管理情况(100-200字):\n{text}"
  173. },
  174. {
  175. "element_key": "project.jobSafety",
  176. "element_name": "作业安全",
  177. "element_type": "paragraph",
  178. "namespace": "project",
  179. "source_keywords": ["作业安全", "5.4.2.1"],
  180. "prompt": "请根据以下评审意见,总结企业的作业安全情况(100-200字):\n{text}"
  181. },
  182. {
  183. "element_key": "project.riskAssessment",
  184. "element_name": "风险辨识与评价",
  185. "element_type": "paragraph",
  186. "namespace": "project",
  187. "source_keywords": ["风险辨识", "风险评价", "5.5.1"],
  188. "prompt": "请根据以下评审意见,总结企业的风险辨识与评价情况(100-200字):\n{text}"
  189. },
  190. {
  191. "element_key": "project.hazardInspection",
  192. "element_name": "隐患排查",
  193. "element_type": "paragraph",
  194. "namespace": "project",
  195. "source_keywords": ["隐患排查", "5.5.3"],
  196. "prompt": "请根据以下评审意见,总结企业的隐患排查情况(100-200字):\n{text}"
  197. },
  198. {
  199. "element_key": "project.emergencyResponse",
  200. "element_name": "应急救援",
  201. "element_type": "paragraph",
  202. "namespace": "project",
  203. "source_keywords": ["应急救援", "应急管理", "5.6"],
  204. "prompt": "请根据以下评审意见,总结企业的应急救援情况(100-200字):\n{text}"
  205. },
  206. {
  207. "element_key": "project.incidentManagement",
  208. "element_name": "事故管理",
  209. "element_type": "paragraph",
  210. "namespace": "project",
  211. "source_keywords": ["事故管理", "5.7"],
  212. "prompt": "请根据以下评审意见,总结企业的事故管理情况(100-200字):\n{text}"
  213. },
  214. {
  215. "element_key": "project.continuousImprovement",
  216. "element_name": "持续改进",
  217. "element_type": "paragraph",
  218. "namespace": "project",
  219. "source_keywords": ["持续改进", "5.8"],
  220. "prompt": "请根据以下评审意见,总结企业的持续改进情况(100-200字):\n{text}"
  221. },
  222. {
  223. "element_key": "project.reviewObjectSelfAssessmentProcess",
  224. "element_name": "自评过程",
  225. "element_type": "paragraph",
  226. "namespace": "project",
  227. "source_keywords": ["自评", "自查"],
  228. "prompt": "请根据以下内容,总结企业的自评过程(150-250字):\n{text}"
  229. },
  230. {
  231. "element_key": "project.safetyHighlight",
  232. "element_name": "安全生产管理亮点",
  233. "element_type": "paragraph",
  234. "namespace": "project",
  235. "source_keywords": ["亮点", "特色", "优秀"],
  236. "prompt": "请根据以下内容,提炼企业的安全生产管理亮点(100-200字):\n{text}"
  237. },
  238. ]
  239. LLM_TABLE_ELEMENTS = [
  240. {
  241. "element_key": "+SPSRRReviewProject",
  242. "element_name": "现场复审项目",
  243. "element_type": "table",
  244. "namespace": "spsrr",
  245. "table_keywords": ["项目名称", "简称", "类型"],
  246. "prompt": """请从以下表格中提取复审项目列表,以JSON数组格式返回:
  247. [{{"name": "项目名称", "alias": "简称", "type": "单位/在建项目", "order": 1}}]
  248. 表格内容:
  249. {text}
  250. 只返回JSON数组,不要其他内容。"""
  251. },
  252. {
  253. "element_key": "+SPSRRReviewer",
  254. "element_name": "现场复审人员",
  255. "element_type": "table",
  256. "namespace": "spsrr",
  257. "table_keywords": ["姓名", "专业", "分工"],
  258. "prompt": """请从以下表格中提取评审人员列表,以JSON数组格式返回:
  259. [{{"name": "姓名", "specialty": "专业分工"}}]
  260. 表格内容:
  261. {text}
  262. 只返回JSON数组,不要其他内容。"""
  263. },
  264. ]
  265. class ElementExtractor:
  266. """要素提取器"""
  267. def __init__(self):
  268. self.ner_rules = NER_RULES
  269. self.llm_summary_config = LLM_SUMMARY_ELEMENTS
  270. self.llm_table_config = LLM_TABLE_ELEMENTS
  271. self._deepseek_service = None
  272. @property
  273. def deepseek_service(self):
  274. """延迟加载deepseek服务"""
  275. if self._deepseek_service is None:
  276. try:
  277. from .deepseek_service import deepseek_service
  278. self._deepseek_service = deepseek_service
  279. except ImportError:
  280. logger.warning("DeepSeek服务未配置,LLM提取将跳过")
  281. self._deepseek_service = None
  282. return self._deepseek_service
  283. async def extract_from_text(
  284. self,
  285. text: str,
  286. attachment_id: int = 0,
  287. use_llm: bool = True
  288. ) -> Dict[str, Any]:
  289. """
  290. 从纯文本中提取所有要素(主接口)
  291. Args:
  292. text: Java后端解析的纯文本
  293. attachment_id: 附件ID
  294. use_llm: 是否使用LLM提取(总结型要素)
  295. Returns:
  296. {
  297. "elements": [...],
  298. "values": [...],
  299. "statistics": {...}
  300. }
  301. """
  302. logger.info(f"开始提取要素: attachment_id={attachment_id}, "
  303. f"text_length={len(text)}, use_llm={use_llm}")
  304. # 1. NER规则提取
  305. ner_values = self._extract_by_ner(text, attachment_id)
  306. logger.info(f"NER提取完成: {len(ner_values)} 个要素")
  307. # 2. LLM提取(可选)
  308. llm_values = {}
  309. if use_llm and self.deepseek_service:
  310. llm_values = await self._extract_by_llm(text, attachment_id)
  311. logger.info(f"LLM提取完成: {len(llm_values)} 个要素")
  312. # 4. 合并结果
  313. all_values = {**ner_values, **llm_values}
  314. # 5. 生成输出
  315. elements, values = self._build_output(all_values, attachment_id)
  316. return {
  317. "elements": elements,
  318. "values": values,
  319. "statistics": {
  320. "total_elements": len(elements),
  321. "filled_values": len([v for v in values if v.get("isFilled")]),
  322. "ner_extracted": len(ner_values),
  323. "llm_extracted": len(llm_values),
  324. }
  325. }
  326. def _extract_by_ner(
  327. self,
  328. text: str,
  329. attachment_id: int
  330. ) -> Dict[str, Dict]:
  331. """NER规则提取"""
  332. results = {}
  333. for element_key, rule in self.ner_rules.items():
  334. for pattern in rule['patterns']:
  335. try:
  336. match = re.search(pattern, text)
  337. if match:
  338. value = match.group(1).strip()
  339. # 后处理
  340. if rule.get('post_process') == 'append_unit':
  341. if not value.endswith('分'):
  342. value = value + '分'
  343. results[element_key] = {
  344. 'value': value,
  345. 'confidence': 0.95,
  346. 'source': 'ner',
  347. 'position': {
  348. 'charStart': match.start(1),
  349. 'charEnd': match.end(1),
  350. 'line': text[:match.start()].count('\n') + 1
  351. },
  352. 'element_name': rule['element_name'],
  353. 'element_type': rule['element_type'],
  354. 'namespace': rule['namespace']
  355. }
  356. break
  357. except Exception as e:
  358. logger.warning(f"NER规则匹配失败: {element_key}, pattern={pattern}, error={e}")
  359. return results
  360. async def _extract_by_llm(
  361. self,
  362. text: str,
  363. attachment_id: int
  364. ) -> Dict[str, Dict]:
  365. """LLM智能提取(总结型要素)"""
  366. results = {}
  367. if not self.deepseek_service:
  368. return results
  369. # 提取总结型要素
  370. for config in self.llm_summary_config:
  371. element_key = config['element_key']
  372. # 查找相关文本
  373. relevant_text = self._find_relevant_text(text, config['source_keywords'])
  374. if relevant_text and len(relevant_text) > 50:
  375. prompt = config['prompt'].format(text=relevant_text[:3000])
  376. try:
  377. response = await self.deepseek_service.chat(prompt)
  378. if response and len(response.strip()) > 20:
  379. results[element_key] = {
  380. 'value': response.strip(),
  381. 'confidence': 0.85,
  382. 'source': 'llm',
  383. 'element_name': config['element_name'],
  384. 'element_type': config['element_type'],
  385. 'namespace': config['namespace']
  386. }
  387. except Exception as e:
  388. logger.error(f"LLM提取失败: {element_key}, error={e}")
  389. # 表格型要素暂时跳过(需要Java后端提供表格结构)
  390. # TODO: 后续可以通过Java后端传递表格数据
  391. return results
  392. def _find_relevant_text(self, text: str, keywords: List[str]) -> str:
  393. """根据关键词查找相关文本段落"""
  394. lines = text.split('\n')
  395. relevant_lines = []
  396. capturing = False
  397. capture_count = 0
  398. for line in lines:
  399. # 检查是否包含关键词
  400. if any(kw in line for kw in keywords):
  401. capturing = True
  402. capture_count = 0
  403. if capturing:
  404. relevant_lines.append(line)
  405. capture_count += 1
  406. # 最多取30行
  407. if capture_count > 30:
  408. capturing = False
  409. return '\n'.join(relevant_lines)
  410. def _find_relevant_table(self, tables: List[Dict], keywords: List[str]) -> Optional[Dict]:
  411. """根据关键词查找相关表格"""
  412. for table_info in tables:
  413. table = table_info['table']
  414. if table.get('data') and len(table['data']) > 0:
  415. # 检查表头
  416. header_row = table['data'][0]
  417. header_texts = [cell.get('text', '') for cell in header_row]
  418. header_str = ' '.join(header_texts)
  419. # 检查是否包含关键词
  420. match_count = sum(1 for kw in keywords if kw in header_str)
  421. if match_count >= 2:
  422. return table
  423. return None
  424. def _table_to_text(self, table: Dict) -> str:
  425. """将表格转为文本"""
  426. lines = []
  427. for row in table.get('data', []):
  428. cells = [cell.get('text', '') for cell in row]
  429. lines.append(' | '.join(cells))
  430. return '\n'.join(lines)
  431. def _build_output(
  432. self,
  433. extracted_values: Dict[str, Dict],
  434. attachment_id: int
  435. ) -> Tuple[List[Dict], List[Dict]]:
  436. """构建输出的elements和values"""
  437. # 合并所有要素定义
  438. all_element_defs = {}
  439. # 从NER规则获取
  440. for key, rule in self.ner_rules.items():
  441. all_element_defs[key] = {
  442. 'element_name': rule['element_name'],
  443. 'element_type': rule['element_type'],
  444. 'namespace': rule['namespace']
  445. }
  446. # 从LLM配置获取
  447. for config in self.llm_summary_config:
  448. all_element_defs[config['element_key']] = {
  449. 'element_name': config['element_name'],
  450. 'element_type': config['element_type'],
  451. 'namespace': config['namespace']
  452. }
  453. for config in self.llm_table_config:
  454. all_element_defs[config['element_key']] = {
  455. 'element_name': config['element_name'],
  456. 'element_type': config['element_type'],
  457. 'namespace': config['namespace']
  458. }
  459. elements = []
  460. values = []
  461. for i, (element_key, elem_def) in enumerate(all_element_defs.items()):
  462. element = {
  463. "id": 700 + i,
  464. "elementKey": element_key,
  465. "elementName": elem_def['element_name'],
  466. "elementType": elem_def['element_type'],
  467. "namespace": elem_def['namespace'],
  468. "sortOrder": i
  469. }
  470. elements.append(element)
  471. # 查找提取的值
  472. extracted = extracted_values.get(element_key)
  473. if extracted:
  474. value = {
  475. "valueId": 800 + i,
  476. "elementKey": element_key,
  477. "valueText": extracted['value'],
  478. "isFilled": True,
  479. "fillSource": "ai" if extracted['source'] == 'llm' else "rule",
  480. "confidence": extracted.get('confidence', 0.8),
  481. "sourceAttachmentId": attachment_id
  482. }
  483. if 'position' in extracted:
  484. value['extractPosition'] = extracted['position']
  485. else:
  486. value = {
  487. "valueId": 800 + i,
  488. "elementKey": element_key,
  489. "valueText": "",
  490. "isFilled": False,
  491. "fillSource": "default"
  492. }
  493. values.append(value)
  494. return elements, values
# Create the module-level singleton instance.
element_extractor = ElementExtractor()