|
|
@@ -0,0 +1,818 @@
|
|
|
+# DOCX解析到要素提取完整流程
|
|
|
+
|
|
|
+## 一、整体数据流
|
|
|
+
|
|
|
+```
|
|
|
+┌─────────────────────────────────────────────────────────────────────┐
|
|
|
+│ 用户上传 DOCX │
|
|
|
+└─────────────────────────────┬───────────────────────────────────────┘
|
|
|
+ │
|
|
|
+ ▼
|
|
|
+┌─────────────────────────────────────────────────────────────────────┐
|
|
|
+│ Java后端 (lingyue-project) │
|
|
|
+│ ├─ AttachmentController.upload() - 上传附件 │
|
|
|
+│ ├─ DocxParseService.parseToHtml() - 解析为HTML │
|
|
|
+│ ├─ 存储 doc_content (结构化JSON) 和 parsed_text (纯文本) │
|
|
|
+│ └─ API: GET /api/v1/attachments/{id}/parsed-text │
|
|
|
+└─────────────────────────────┬───────────────────────────────────────┘
|
|
|
+ │ 获取 parsed_text
|
|
|
+ ▼
|
|
|
+┌─────────────────────────────────────────────────────────────────────┐
|
|
|
+│ Python NER服务 (ner-service) │
|
|
|
+│ ├─ 输入: parsed_text (纯文本) │
|
|
|
+│ ├─ POST /extract/from-text │
|
|
|
+│ │ │
|
|
|
+│ │ ┌─────────────────┐ ┌─────────────────────────────────────┐ │
|
|
|
+│ │ │ NER规则提取 │ │ LLM智能提取 (可选) │ │
|
|
|
+│  │  │ (8个简单要素)    │  │ (18个总结型要素)                    │  │
|
|
|
+│ │ │ ├─ 日期 │ │ ├─ 目标、职责、安全投入等 │ │
|
|
|
+│ │ │ ├─ 得分、级别 │ │ └─ 需要语义理解的内容 │ │
|
|
|
+│ │ │ └─ 编号、机构 │ │ │ │
|
|
|
+│ │ └─────────────────┘ └─────────────────────────────────────┘ │
|
|
|
+│ │ │
|
|
|
+│ └─ 输出: elements + values │
|
|
|
+└─────────────────────────────┬───────────────────────────────────────┘
|
|
|
+ │
|
|
|
+ ▼
|
|
|
+┌─────────────────────────────────────────────────────────────────────┐
|
|
|
+│ 前端渲染 │
|
|
|
+│ ├─ elements: 要素定义列表 │
|
|
|
+│ ├─ values: 要素值列表 │
|
|
|
+│ └─ doc_content: 文档渲染内容 (来自Java后端) │
|
|
|
+└─────────────────────────────────────────────────────────────────────┘
|
|
|
+```
|
|
|
+
|
|
|
+## 架构说明
|
|
|
+
|
|
|
+**职责分离**:
|
|
|
+- **Java后端**:负责DOCX解析(已有`DocxParseService`),存储和管理附件
|
|
|
+- **Python NER服务**:负责要素提取(NER规则 + LLM智能提取)
|
|
|
+- **前端**:调用两个服务,组装数据进行渲染
|
|
|
+
|
|
|
+## 二、前端所需的数据结构
|
|
|
+
|
|
|
+### 2.1 elements(要素定义)
|
|
|
+
|
|
|
+```javascript
|
|
|
+// 来自 elementApi.list(projectId)
|
|
|
+elements = [
|
|
|
+ {
|
|
|
+ id: 701,
|
|
|
+ elementKey: "project.reviewObject",
|
|
|
+ elementName: "评审对象",
|
|
|
+ elementType: "text", // text | paragraph | table | static
|
|
|
+ namespace: "project", // 分组命名空间
|
|
|
+ sortOrder: 1,
|
|
|
+ description: "被评审企业的完整名称"
|
|
|
+ },
|
|
|
+ {
|
|
|
+ id: 702,
|
|
|
+ elementKey: "project.workStartAt",
|
|
|
+ elementName: "评审开始日期",
|
|
|
+ elementType: "text",
|
|
|
+ namespace: "project",
|
|
|
+ sortOrder: 2
|
|
|
+ },
|
|
|
+ // ... 共47个要素
|
|
|
+]
|
|
|
+```
|
|
|
+
|
|
|
+### 2.2 values(要素值)
|
|
|
+
|
|
|
+```javascript
|
|
|
+// 来自 valueApi.list(projectId)
|
|
|
+values = [
|
|
|
+ {
|
|
|
+ valueId: 801,
|
|
|
+ elementKey: "project.reviewObject",
|
|
|
+ valueText: "中国电建集团成都勘测设计研究院有限公司",
|
|
|
+ isFilled: true,
|
|
|
+ fillSource: "ai", // default | manual | rule | ai
|
|
|
+ confidence: 0.95,
|
|
|
+ sourceAttachmentId: 402,
|
|
|
+ extractPosition: { charStart: 100, charEnd: 130, line: 5 }
|
|
|
+ },
|
|
|
+ {
|
|
|
+ valueId: 802,
|
|
|
+ elementKey: "project.workStartAt",
|
|
|
+ valueText: "2024年7月13日",
|
|
|
+ isFilled: true,
|
|
|
+ fillSource: "rule",
|
|
|
+ confidence: 0.98
|
|
|
+ },
|
|
|
+ // ...
|
|
|
+]
|
|
|
+```
|
|
|
+
|
|
|
+### 2.3 doc_content(文档渲染内容)
|
|
|
+
|
|
|
+```javascript
|
|
|
+// 来自 attachmentApi.getDocContent(attachmentId)
|
|
|
+doc_content = {
|
|
|
+ page: {
|
|
|
+ widthMm: 210,
|
|
|
+ heightMm: 297,
|
|
|
+ marginTopMm: 25.4,
|
|
|
+ marginBottomMm: 25.4,
|
|
|
+ marginLeftMm: 31.8,
|
|
|
+ marginRightMm: 31.8
|
|
|
+ },
|
|
|
+ blocks: [
|
|
|
+ {
|
|
|
+ id: "b0",
|
|
|
+ type: "heading1",
|
|
|
+ runs: [{ text: "1 企业概述", bold: true, fontSize: 16 }],
|
|
|
+ style: { alignment: "left" }
|
|
|
+ },
|
|
|
+ {
|
|
|
+ id: "b1",
|
|
|
+ type: "paragraph",
|
|
|
+ runs: [
|
|
|
+ { text: "中国电建集团成都勘测设计研究院有限公司", bold: true },
|
|
|
+      { text: "(以下简称“成都院”)是..." }
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ id: "b2",
|
|
|
+ type: "table",
|
|
|
+ table: {
|
|
|
+ rows: 5,
|
|
|
+ cols: 4,
|
|
|
+ data: [
|
|
|
+ [{ text: "序号" }, { text: "项目名称" }, { text: "简称" }, { text: "类型" }],
|
|
|
+ [{ text: "1" }, { text: "成都院本部" }, { text: "本部" }, { text: "单位" }],
|
|
|
+ // ...
|
|
|
+ ]
|
|
|
+ }
|
|
|
+ },
|
|
|
+ // ...
|
|
|
+ ],
|
|
|
+ totalBlocks: 350
|
|
|
+}
|
|
|
+```
|
|
|
+
|
|
|
+## 三、混合提取策略实现
|
|
|
+
|
|
|
+### 3.1 要素分类与提取方法
|
|
|
+
|
|
|
+| 类别 | 要素数 | 提取方法 | 示例 |
|
|
|
+|------|--------|----------|------|
|
|
|
+| A. 简单结构化 | 8 | NER规则 | 日期、得分、级别、编号 |
|
|
|
+| B. 实体识别 | 2 | NER+后处理 | 评审对象、简称 |
|
|
|
+| C. 总结型 | 18 | LLM | 目标、职责、安全投入等 |
|
|
|
+| D. 列表拼接 | 2 | NER+LLM | 复审范围、工作过程 |
|
|
|
+| E. 表格数据 | 7 | 表格解析+LLM | 项目列表、人员列表 |
|
|
|
+
|
|
|
+### 3.2 NER规则提取(8个要素)
|
|
|
+
|
|
|
+```python
|
|
|
+# 提取规则定义
|
|
|
+NER_RULES = {
|
|
|
+ # 日期类
|
|
|
+ "project.workStartAt": {
|
|
|
+ "patterns": [
|
|
|
+ r'评审(?:开始)?日期[::]\s*(\d{4}年\d{1,2}月\d{1,2}日)',
|
|
|
+ r'(\d{4}年\d{1,2}月\d{1,2}日)至',
|
|
|
+ ],
|
|
|
+ "type": "DATE",
|
|
|
+ "post_process": "take_first"
|
|
|
+ },
|
|
|
+ "project.workEndAt": {
|
|
|
+ "patterns": [
|
|
|
+ r'至(\d{4}年\d{1,2}月\d{1,2}日)',
|
|
|
+ r'评审(?:结束)?日期[::]\s*(\d{4}年\d{1,2}月\d{1,2}日)',
|
|
|
+ ],
|
|
|
+ "type": "DATE",
|
|
|
+ "post_process": "take_last"
|
|
|
+ },
|
|
|
+
|
|
|
+ # 得分类
|
|
|
+ "project.resultScore": {
|
|
|
+ "patterns": [
|
|
|
+ r'评审得分[::]\s*(\d+\.?\d*)\s*分',
|
|
|
+ r'得分[::]\s*(\d+\.?\d*)',
|
|
|
+ ],
|
|
|
+ "type": "SCORE",
|
|
|
+ "post_process": "to_float"
|
|
|
+ },
|
|
|
+
|
|
|
+ # 级别类
|
|
|
+ "project.resultLevel": {
|
|
|
+ "patterns": [
|
|
|
+ r'级别[::]\s*(一级|二级|三级)',
|
|
|
+ r'评审(?:结论)?级别[::]\s*(一级|二级|三级)',
|
|
|
+ ],
|
|
|
+ "type": "LEVEL"
|
|
|
+ },
|
|
|
+
|
|
|
+ # 编号类
|
|
|
+ "basicInfo.projectCode": {
|
|
|
+ "patterns": [
|
|
|
+ r'项目编号[::]\s*([A-Z]+-\d+-\d+)',
|
|
|
+ r'编号[::]\s*([A-Z0-9\-]+)',
|
|
|
+ ],
|
|
|
+ "type": "CODE"
|
|
|
+ },
|
|
|
+ "basicInfo.reviewObjectCertificateCode": {
|
|
|
+ "patterns": [
|
|
|
+ r'证书编号[::]\s*(ZGDIDBOY-\d+)',
|
|
|
+ r'证书编号[::]\s*([A-Z0-9\-]+)',
|
|
|
+ ],
|
|
|
+ "type": "CODE"
|
|
|
+ },
|
|
|
+
|
|
|
+ # 机构类
|
|
|
+ "project.reviewObject": {
|
|
|
+ "patterns": [
|
|
|
+ r'评审对象[::]\s*([^\n]{10,50}(?:公司|集团|院|所))',
|
|
|
+ ],
|
|
|
+ "type": "ORG"
|
|
|
+ },
|
|
|
+ "project.reviewObjectAlias": {
|
|
|
+ "patterns": [
|
|
|
+            r'以下简称[「『“”]([^」』“”]{2,10})[」』“”]',
|
|
|
+            r'简称[「『“”]([^」』“”]{2,10})[」』“”]',
|
|
|
+ ],
|
|
|
+ "type": "ALIAS"
|
|
|
+ },
|
|
|
+}
|
|
|
+```
|
|
|
+
|
|
|
+### 3.3 LLM提取(27个要素)
|
|
|
+
|
|
|
+```python
|
|
|
+# LLM提取配置
|
|
|
+LLM_EXTRACT_CONFIG = {
|
|
|
+ # 总结型要素(18个)
|
|
|
+ "summary_elements": [
|
|
|
+ {
|
|
|
+ "element_key": "project.target",
|
|
|
+ "element_name": "目标",
|
|
|
+ "source_pattern": r"5\.1\.1\.\d", # 评审代码匹配
|
|
|
+ "prompt_template": "请根据以下评审意见,总结企业的安全生产目标情况(100-200字):\n{text}"
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "element_key": "project.duty",
|
|
|
+ "element_name": "职责",
|
|
|
+ "source_pattern": r"5\.1\.2\.1",
|
|
|
+ "prompt_template": "请根据以下评审意见,总结企业的安全生产职责落实情况(100-200字):\n{text}"
|
|
|
+ },
|
|
|
+ # ... 其他16个
|
|
|
+ ],
|
|
|
+
|
|
|
+ # 表格提取要素(7个)
|
|
|
+ "table_elements": [
|
|
|
+ {
|
|
|
+ "element_key": "+SPSRRReviewProject",
|
|
|
+ "element_name": "现场复审项目",
|
|
|
+ "table_type": "review_project",
|
|
|
+ "columns": ["项目名称", "简称", "类型", "排序"],
|
|
|
+ "prompt_template": """
|
|
|
+请从以下文本中提取复审项目列表,以JSON数组格式返回:
|
|
|
+[{"name": "项目名称", "alias": "简称", "type": "单位/在建项目", "order": 1}]
|
|
|
+
|
|
|
+文本:{text}
|
|
|
+"""
|
|
|
+ },
|
|
|
+ # ... 其他6个
|
|
|
+ ]
|
|
|
+}
|
|
|
+```
|
|
|
+
|
|
|
+## 四、核心代码实现
|
|
|
+
|
|
|
+### 4.1 Python提取服务
|
|
|
+
|
|
|
+```python
|
|
|
+# python-services/ner-service/app/services/element_extractor.py
|
|
|
+
|
|
|
+import re
|
|
|
+import json
|
|
|
+from typing import Dict, List, Any, Optional
|
|
|
+from loguru import logger
|
|
|
+
|
|
|
+from .ner_service import ner_service
|
|
|
+from .deepseek_service import deepseek_service
|
|
|
+
|
|
|
+
|
|
|
+class ElementExtractor:
|
|
|
+ """要素提取器:混合NER+LLM策略"""
|
|
|
+
|
|
|
+ def __init__(self):
|
|
|
+ self.ner_rules = NER_RULES # 上面定义的规则
|
|
|
+ self.llm_config = LLM_EXTRACT_CONFIG
|
|
|
+
|
|
|
+ async def extract_all(
|
|
|
+ self,
|
|
|
+ doc_content: Dict,
|
|
|
+ attachment_id: int
|
|
|
+ ) -> Dict[str, Any]:
|
|
|
+ """
|
|
|
+ 从文档内容中提取所有要素
|
|
|
+
|
|
|
+ Args:
|
|
|
+ doc_content: parse_docx输出的文档结构
|
|
|
+ attachment_id: 附件ID
|
|
|
+
|
|
|
+ Returns:
|
|
|
+ {
|
|
|
+ "elements": [...], # 要素定义
|
|
|
+ "values": [...], # 要素值
|
|
|
+ "statistics": {...} # 提取统计
|
|
|
+ }
|
|
|
+ """
|
|
|
+ # 1. 将blocks转为纯文本
|
|
|
+ full_text = self._blocks_to_text(doc_content['blocks'])
|
|
|
+ tables = self._extract_tables(doc_content['blocks'])
|
|
|
+
|
|
|
+ logger.info(f"开始提取要素: attachment_id={attachment_id}, "
|
|
|
+ f"text_length={len(full_text)}, tables={len(tables)}")
|
|
|
+
|
|
|
+ # 2. NER规则提取(简单要素)
|
|
|
+ ner_values = await self._extract_by_ner(full_text, attachment_id)
|
|
|
+ logger.info(f"NER提取完成: {len(ner_values)} 个要素")
|
|
|
+
|
|
|
+ # 3. LLM提取(总结型+表格)
|
|
|
+ llm_values = await self._extract_by_llm(full_text, tables, attachment_id)
|
|
|
+ logger.info(f"LLM提取完成: {len(llm_values)} 个要素")
|
|
|
+
|
|
|
+ # 4. 合并结果
|
|
|
+ all_values = {**ner_values, **llm_values}
|
|
|
+
|
|
|
+ # 5. 生成elements和values
|
|
|
+ elements, values = self._build_output(all_values, attachment_id)
|
|
|
+
|
|
|
+ return {
|
|
|
+ "elements": elements,
|
|
|
+ "values": values,
|
|
|
+ "statistics": {
|
|
|
+ "total_elements": len(elements),
|
|
|
+ "filled_values": len([v for v in values if v.get("isFilled")]),
|
|
|
+ "ner_extracted": len(ner_values),
|
|
|
+ "llm_extracted": len(llm_values),
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ def _blocks_to_text(self, blocks: List[Dict]) -> str:
|
|
|
+ """将blocks转为纯文本"""
|
|
|
+ lines = []
|
|
|
+ for block in blocks:
|
|
|
+ if block['type'] == 'table':
|
|
|
+ # 表格转为文本
|
|
|
+ table = block.get('table', {})
|
|
|
+ for row in table.get('data', []):
|
|
|
+ cells = [cell.get('text', '') for cell in row]
|
|
|
+ lines.append(' | '.join(cells))
|
|
|
+ lines.append('') # 空行分隔
|
|
|
+ else:
|
|
|
+ # 段落
|
|
|
+ runs = block.get('runs', [])
|
|
|
+ text = ''.join(r.get('text', '') for r in runs)
|
|
|
+ lines.append(text)
|
|
|
+ return '\n'.join(lines)
|
|
|
+
|
|
|
+ def _extract_tables(self, blocks: List[Dict]) -> List[Dict]:
|
|
|
+ """提取所有表格"""
|
|
|
+ tables = []
|
|
|
+ for i, block in enumerate(blocks):
|
|
|
+ if block['type'] == 'table':
|
|
|
+ tables.append({
|
|
|
+ 'block_id': block['id'],
|
|
|
+ 'block_index': i,
|
|
|
+ 'table': block.get('table', {})
|
|
|
+ })
|
|
|
+ return tables
|
|
|
+
|
|
|
+ async def _extract_by_ner(
|
|
|
+ self,
|
|
|
+ text: str,
|
|
|
+ attachment_id: int
|
|
|
+ ) -> Dict[str, Dict]:
|
|
|
+ """NER规则提取"""
|
|
|
+ results = {}
|
|
|
+
|
|
|
+ for element_key, rule in self.ner_rules.items():
|
|
|
+ for pattern in rule['patterns']:
|
|
|
+ match = re.search(pattern, text)
|
|
|
+ if match:
|
|
|
+ value = match.group(1)
|
|
|
+
|
|
|
+ # 后处理
|
|
|
+ if rule.get('post_process') == 'to_float':
|
|
|
+ try:
|
|
|
+ value = str(float(value))
|
|
|
+ except:
|
|
|
+ pass
|
|
|
+
|
|
|
+ results[element_key] = {
|
|
|
+ 'value': value,
|
|
|
+ 'confidence': 0.95,
|
|
|
+ 'source': 'ner',
|
|
|
+ 'position': {
|
|
|
+ 'charStart': match.start(1),
|
|
|
+ 'charEnd': match.end(1),
|
|
|
+ 'line': text[:match.start()].count('\n') + 1
|
|
|
+ }
|
|
|
+ }
|
|
|
+ break # 找到第一个匹配就停止
|
|
|
+
|
|
|
+ return results
|
|
|
+
|
|
|
+ async def _extract_by_llm(
|
|
|
+ self,
|
|
|
+ text: str,
|
|
|
+ tables: List[Dict],
|
|
|
+ attachment_id: int
|
|
|
+ ) -> Dict[str, Dict]:
|
|
|
+ """LLM智能提取"""
|
|
|
+ results = {}
|
|
|
+
|
|
|
+ # 1. 提取总结型要素
|
|
|
+ for config in self.llm_config['summary_elements']:
|
|
|
+ element_key = config['element_key']
|
|
|
+
|
|
|
+ # 查找相关文本(基于评审代码)
|
|
|
+ source_pattern = config.get('source_pattern')
|
|
|
+ if source_pattern:
|
|
|
+ relevant_text = self._find_relevant_text(text, source_pattern)
|
|
|
+ else:
|
|
|
+ relevant_text = text[:5000] # 取前5000字
|
|
|
+
|
|
|
+ if relevant_text:
|
|
|
+ prompt = config['prompt_template'].format(text=relevant_text)
|
|
|
+ try:
|
|
|
+ response = await deepseek_service.chat(prompt)
|
|
|
+ results[element_key] = {
|
|
|
+ 'value': response.strip(),
|
|
|
+ 'confidence': 0.85,
|
|
|
+ 'source': 'llm'
|
|
|
+ }
|
|
|
+ except Exception as e:
|
|
|
+ logger.error(f"LLM提取失败: {element_key}, error={e}")
|
|
|
+
|
|
|
+ # 2. 提取表格型要素
|
|
|
+ for config in self.llm_config['table_elements']:
|
|
|
+ element_key = config['element_key']
|
|
|
+
|
|
|
+ # 找到相关表格
|
|
|
+ relevant_table = self._find_relevant_table(tables, config['table_type'])
|
|
|
+ if relevant_table:
|
|
|
+ prompt = config['prompt_template'].format(
|
|
|
+ text=json.dumps(relevant_table, ensure_ascii=False)
|
|
|
+ )
|
|
|
+ try:
|
|
|
+ response = await deepseek_service.chat(prompt)
|
|
|
+ # 解析JSON响应
|
|
|
+ table_data = json.loads(response)
|
|
|
+ results[element_key] = {
|
|
|
+ 'value': json.dumps(table_data, ensure_ascii=False),
|
|
|
+ 'confidence': 0.80,
|
|
|
+ 'source': 'llm',
|
|
|
+ 'is_table': True
|
|
|
+ }
|
|
|
+ except Exception as e:
|
|
|
+ logger.error(f"表格提取失败: {element_key}, error={e}")
|
|
|
+
|
|
|
+ return results
|
|
|
+
|
|
|
+ def _find_relevant_text(self, text: str, pattern: str) -> str:
|
|
|
+ """根据评审代码模式查找相关文本"""
|
|
|
+ # 简单实现:查找包含该代码的段落
|
|
|
+ lines = text.split('\n')
|
|
|
+ relevant_lines = []
|
|
|
+ capturing = False
|
|
|
+
|
|
|
+ for line in lines:
|
|
|
+ if re.search(pattern, line):
|
|
|
+ capturing = True
|
|
|
+ if capturing:
|
|
|
+ relevant_lines.append(line)
|
|
|
+ if len(relevant_lines) > 20: # 最多取20行
|
|
|
+ break
|
|
|
+
|
|
|
+ return '\n'.join(relevant_lines)
|
|
|
+
|
|
|
+ def _find_relevant_table(self, tables: List[Dict], table_type: str) -> Optional[Dict]:
|
|
|
+ """根据表格类型查找相关表格"""
|
|
|
+ # 基于表头关键词匹配
|
|
|
+ keywords = {
|
|
|
+ 'review_project': ['项目名称', '简称', '类型'],
|
|
|
+ 'reviewer': ['姓名', '专业', '分工'],
|
|
|
+ 'suggestion': ['问题', '建议', '整改'],
|
|
|
+ }
|
|
|
+
|
|
|
+ target_keywords = keywords.get(table_type, [])
|
|
|
+
|
|
|
+ for table_info in tables:
|
|
|
+ table = table_info['table']
|
|
|
+ if table.get('data') and len(table['data']) > 0:
|
|
|
+ header_row = table['data'][0]
|
|
|
+ header_texts = [cell.get('text', '') for cell in header_row]
|
|
|
+
|
|
|
+ # 检查是否包含目标关键词
|
|
|
+ match_count = sum(1 for kw in target_keywords if any(kw in h for h in header_texts))
|
|
|
+ if match_count >= 2:
|
|
|
+ return table
|
|
|
+
|
|
|
+ return None
|
|
|
+
|
|
|
+ def _build_output(
|
|
|
+ self,
|
|
|
+ extracted_values: Dict[str, Dict],
|
|
|
+ attachment_id: int
|
|
|
+ ) -> tuple:
|
|
|
+ """构建输出的elements和values"""
|
|
|
+
|
|
|
+ # 要素定义模板
|
|
|
+ ELEMENT_TEMPLATES = {
|
|
|
+ "project.reviewObject": {"name": "评审对象", "type": "text", "namespace": "project"},
|
|
|
+ "project.reviewObjectAlias": {"name": "评审对象简称", "type": "text", "namespace": "project"},
|
|
|
+ "project.workStartAt": {"name": "评审开始日期", "type": "text", "namespace": "project"},
|
|
|
+ "project.workEndAt": {"name": "评审结束日期", "type": "text", "namespace": "project"},
|
|
|
+ "project.resultScore": {"name": "评审得分", "type": "text", "namespace": "project"},
|
|
|
+ "project.resultLevel": {"name": "评审结论级别", "type": "text", "namespace": "project"},
|
|
|
+ "project.target": {"name": "目标", "type": "paragraph", "namespace": "project"},
|
|
|
+ "project.duty": {"name": "职责", "type": "paragraph", "namespace": "project"},
|
|
|
+ # ... 其他要素
|
|
|
+ }
|
|
|
+
|
|
|
+ elements = []
|
|
|
+ values = []
|
|
|
+
|
|
|
+ for i, (element_key, template) in enumerate(ELEMENT_TEMPLATES.items()):
|
|
|
+ element = {
|
|
|
+ "id": 700 + i,
|
|
|
+ "elementKey": element_key,
|
|
|
+ "elementName": template["name"],
|
|
|
+ "elementType": template["type"],
|
|
|
+ "namespace": template["namespace"],
|
|
|
+ "sortOrder": i
|
|
|
+ }
|
|
|
+ elements.append(element)
|
|
|
+
|
|
|
+ # 查找提取的值
|
|
|
+ extracted = extracted_values.get(element_key)
|
|
|
+ if extracted:
|
|
|
+ value = {
|
|
|
+ "valueId": 800 + i,
|
|
|
+ "elementKey": element_key,
|
|
|
+ "valueText": extracted['value'],
|
|
|
+ "isFilled": True,
|
|
|
+ "fillSource": "ai" if extracted['source'] == 'llm' else "rule",
|
|
|
+ "confidence": extracted.get('confidence', 0.8),
|
|
|
+ "sourceAttachmentId": attachment_id
|
|
|
+ }
|
|
|
+ if 'position' in extracted:
|
|
|
+ value['extractPosition'] = extracted['position']
|
|
|
+ else:
|
|
|
+ value = {
|
|
|
+ "valueId": 800 + i,
|
|
|
+ "elementKey": element_key,
|
|
|
+ "valueText": "",
|
|
|
+ "isFilled": False,
|
|
|
+ "fillSource": "default"
|
|
|
+ }
|
|
|
+
|
|
|
+ values.append(value)
|
|
|
+
|
|
|
+ return elements, values
|
|
|
+
|
|
|
+
|
|
|
+# 创建单例
|
|
|
+element_extractor = ElementExtractor()
|
|
|
+```
|
|
|
+
|
|
|
+### 4.2 API接口
|
|
|
+
|
|
|
+```python
|
|
|
+# python-services/ner-service/app/routers/extract.py
|
|
|
+
|
|
|
+from fastapi import APIRouter, HTTPException, UploadFile, File
|
|
|
+from pydantic import BaseModel
|
|
|
+from typing import Dict, Any, Optional
|
|
|
+import json
|
|
|
+
|
|
|
+from ..services.element_extractor import element_extractor
|
|
|
+from ..services.docx_parser import parse_docx_file
|
|
|
+
|
|
|
+router = APIRouter()
|
|
|
+
|
|
|
+
|
|
|
+class ExtractRequest(BaseModel):
|
|
|
+ """提取请求"""
|
|
|
+ doc_content: Dict[str, Any] # parse_docx输出的文档结构
|
|
|
+ attachment_id: int
|
|
|
+
|
|
|
+
|
|
|
+class ExtractResponse(BaseModel):
|
|
|
+ """提取响应"""
|
|
|
+ success: bool
|
|
|
+ elements: list
|
|
|
+ values: list
|
|
|
+ statistics: Dict[str, Any]
|
|
|
+ error: Optional[str] = None
|
|
|
+
|
|
|
+
|
|
|
+@router.post("/extract/from-content", response_model=ExtractResponse)
|
|
|
+async def extract_from_content(request: ExtractRequest):
|
|
|
+ """
|
|
|
+ 从已解析的文档内容中提取要素
|
|
|
+
|
|
|
+ 输入: doc_content (parse_docx输出)
|
|
|
+ 输出: elements + values
|
|
|
+ """
|
|
|
+ try:
|
|
|
+ result = await element_extractor.extract_all(
|
|
|
+ doc_content=request.doc_content,
|
|
|
+ attachment_id=request.attachment_id
|
|
|
+ )
|
|
|
+
|
|
|
+ return ExtractResponse(
|
|
|
+ success=True,
|
|
|
+ elements=result['elements'],
|
|
|
+ values=result['values'],
|
|
|
+ statistics=result['statistics']
|
|
|
+ )
|
|
|
+ except Exception as e:
|
|
|
+ return ExtractResponse(
|
|
|
+ success=False,
|
|
|
+ elements=[],
|
|
|
+ values=[],
|
|
|
+ statistics={},
|
|
|
+ error=str(e)
|
|
|
+ )
|
|
|
+
|
|
|
+
|
|
|
+@router.post("/extract/from-docx")
|
|
|
+async def extract_from_docx(
|
|
|
+ file: UploadFile = File(...),
|
|
|
+ attachment_id: int = 0
|
|
|
+):
|
|
|
+ """
|
|
|
+ 完整流程:上传DOCX → 解析 → 提取要素
|
|
|
+
|
|
|
+ 输入: DOCX文件
|
|
|
+ 输出: doc_content + elements + values
|
|
|
+ """
|
|
|
+ try:
|
|
|
+ # 1. 解析DOCX
|
|
|
+ content = await file.read()
|
|
|
+ doc_content = parse_docx_file(content)
|
|
|
+
|
|
|
+ # 2. 提取要素
|
|
|
+ result = await element_extractor.extract_all(
|
|
|
+ doc_content=doc_content,
|
|
|
+ attachment_id=attachment_id
|
|
|
+ )
|
|
|
+
|
|
|
+ return {
|
|
|
+ "success": True,
|
|
|
+ "doc_content": doc_content,
|
|
|
+ "elements": result['elements'],
|
|
|
+ "values": result['values'],
|
|
|
+ "statistics": result['statistics']
|
|
|
+ }
|
|
|
+ except Exception as e:
|
|
|
+ raise HTTPException(status_code=500, detail=str(e))
|
|
|
+```
|
|
|
+
|
|
|
+### 4.3 前端调用示例
|
|
|
+
|
|
|
+```javascript
|
|
|
+// frontend/vue-demo/src/api/extract.js
|
|
|
+
|
|
|
+import api from './index'
|
|
|
+
|
|
|
+export const extractApi = {
|
|
|
+ /**
|
|
|
+ * 上传DOCX并提取要素(完整流程)
|
|
|
+ */
|
|
|
+ async extractFromDocx(file, attachmentId = 0) {
|
|
|
+ const formData = new FormData()
|
|
|
+ formData.append('file', file)
|
|
|
+ formData.append('attachment_id', attachmentId)
|
|
|
+
|
|
|
+ return api.post('/extract/from-docx', formData, {
|
|
|
+ headers: { 'Content-Type': 'multipart/form-data' },
|
|
|
+ timeout: 120000 // 2分钟超时
|
|
|
+ })
|
|
|
+ },
|
|
|
+
|
|
|
+ /**
|
|
|
+ * 从已解析的内容中提取要素
|
|
|
+ */
|
|
|
+ async extractFromContent(docContent, attachmentId) {
|
|
|
+ return api.post('/extract/from-content', {
|
|
|
+ doc_content: docContent,
|
|
|
+ attachment_id: attachmentId
|
|
|
+ })
|
|
|
+ }
|
|
|
+}
|
|
|
+```
|
|
|
+
|
|
|
+```vue
|
|
|
+<!-- 使用示例 -->
|
|
|
+<script setup>
|
|
|
+import { ref } from 'vue'
|
|
|
+import { extractApi } from '@/api/extract'
|
|
|
+
|
|
|
+const loading = ref(false)
|
|
|
+const elements = ref([])
|
|
|
+const values = ref([])
|
|
|
+const docContent = ref(null)
|
|
|
+
|
|
|
+async function handleUpload(file) {
|
|
|
+ loading.value = true
|
|
|
+ try {
|
|
|
+ const result = await extractApi.extractFromDocx(file.raw)
|
|
|
+
|
|
|
+ if (result.success) {
|
|
|
+ docContent.value = result.doc_content
|
|
|
+ elements.value = result.elements
|
|
|
+ values.value = result.values
|
|
|
+
|
|
|
+ console.log('提取统计:', result.statistics)
|
|
|
+ // { total_elements: 47, filled_values: 35, ner_extracted: 8, llm_extracted: 27 }
|
|
|
+ }
|
|
|
+ } finally {
|
|
|
+ loading.value = false
|
|
|
+ }
|
|
|
+}
|
|
|
+</script>
|
|
|
+```
|
|
|
+
|
|
|
+## 五、实施步骤
|
|
|
+
|
|
|
+### 第一步:DOCX解析服务(已有)
|
|
|
+
|
|
|
+现有的 `parse_docx.py` 已经可以:
|
|
|
+- 解析段落和格式
|
|
|
+- 解析表格
|
|
|
+- 提取图片
|
|
|
+
|
|
|
+**需要改造**:
|
|
|
+- 封装为HTTP服务
|
|
|
+- 支持文件上传接口
|
|
|
+
|
|
|
+### 第二步:NER规则提取(1-2天)
|
|
|
+
|
|
|
+1. 定义8个简单要素的提取规则
|
|
|
+2. 实现规则匹配逻辑
|
|
|
+3. 测试准确率
|
|
|
+
|
|
|
+### 第三步:LLM提取集成(2-3天)
|
|
|
+
|
|
|
+1. 配置DeepSeek API
|
|
|
+2. 设计提示词模板(18个总结型 + 7个表格型)
|
|
|
+3. 实现批量调用和结果解析
|
|
|
+4. 添加错误处理和重试
|
|
|
+
|
|
|
+### 第四步:结果合并与输出(1天)
|
|
|
+
|
|
|
+1. 合并NER和LLM结果
|
|
|
+2. 生成elements和values格式
|
|
|
+3. 添加置信度评分
|
|
|
+
|
|
|
+### 第五步:前端集成(1-2天)
|
|
|
+
|
|
|
+1. 添加上传接口调用
|
|
|
+2. 更新store数据
|
|
|
+3. 触发文档渲染
|
|
|
+
|
|
|
+## 六、预期输出示例
|
|
|
+
|
|
|
+```json
|
|
|
+{
|
|
|
+ "success": true,
|
|
|
+ "doc_content": {
|
|
|
+ "page": { "widthMm": 210, "heightMm": 297 },
|
|
|
+ "blocks": [
|
|
|
+ { "id": "b0", "type": "heading1", "runs": [{"text": "1 企业概述"}] },
|
|
|
+ // ... 350个blocks
|
|
|
+ ],
|
|
|
+ "totalBlocks": 350
|
|
|
+ },
|
|
|
+ "elements": [
|
|
|
+ { "id": 701, "elementKey": "project.reviewObject", "elementName": "评审对象", "elementType": "text" },
|
|
|
+ { "id": 702, "elementKey": "project.workStartAt", "elementName": "评审开始日期", "elementType": "text" },
|
|
|
+ // ... 47个要素
|
|
|
+ ],
|
|
|
+ "values": [
|
|
|
+ { "valueId": 801, "elementKey": "project.reviewObject", "valueText": "中国电建集团成都勘测设计研究院有限公司", "isFilled": true, "fillSource": "rule", "confidence": 0.95 },
|
|
|
+ { "valueId": 802, "elementKey": "project.workStartAt", "valueText": "2024年7月13日", "isFilled": true, "fillSource": "rule", "confidence": 0.98 },
|
|
|
+    { "valueId": 803, "elementKey": "project.target", "valueText": "成都院制定并发布《QHSE“十四五”规划》...", "isFilled": true, "fillSource": "ai", "confidence": 0.85 },
|
|
|
+ // ... 47个值
|
|
|
+ ],
|
|
|
+ "statistics": {
|
|
|
+ "total_elements": 47,
|
|
|
+ "filled_values": 35,
|
|
|
+ "ner_extracted": 8,
|
|
|
+ "llm_extracted": 27
|
|
|
+ }
|
|
|
+}
|
|
|
+```
|
|
|
+
|
|
|
+---
|
|
|
+
|
|
|
+**文档版本**:v1.0
|
|
|
+**创建时间**:2024-03-04
|