┌─────────────────────────────────────────────────────────────────────┐
│ 用户上传 DOCX │
└─────────────────────────────┬───────────────────────────────────────┘
│
▼
┌─────────────────────────────────────────────────────────────────────┐
│ Java后端 (lingyue-project) │
│ ├─ AttachmentController.upload() - 上传附件 │
│ ├─ DocxParseService.parseToHtml() - 解析为HTML │
│ ├─ 存储 doc_content (结构化JSON) 和 parsed_text (纯文本) │
│ └─ API: GET /api/v1/attachments/{id}/parsed-text │
└─────────────────────────────┬───────────────────────────────────────┘
│ 获取 parsed_text
▼
┌─────────────────────────────────────────────────────────────────────┐
│ Python NER服务 (ner-service) │
│ ├─ 输入: parsed_text (纯文本) │
│ ├─ POST /extract/from-text │
│ │ │
│ │ ┌─────────────────┐ ┌─────────────────────────────────────┐ │
│ │ │ NER规则提取 │ │ LLM智能提取 (可选) │ │
│ │ │ (8个简单要素) │ │ (18个总结型要素) │ │
│ │ │ ├─ 日期 │ │ ├─ 目标、职责、安全投入等 │ │
│ │ │ ├─ 得分、级别 │ │ └─ 需要语义理解的内容 │ │
│ │ │ └─ 编号、机构 │ │ │ │
│ │ └─────────────────┘ └─────────────────────────────────────┘ │
│ │ │
│ └─ 输出: elements + values │
└─────────────────────────────┬───────────────────────────────────────┘
│
▼
┌─────────────────────────────────────────────────────────────────────┐
│ 前端渲染 │
│ ├─ elements: 要素定义列表 │
│ ├─ values: 要素值列表 │
│ └─ doc_content: 文档渲染内容 (来自Java后端) │
└─────────────────────────────────────────────────────────────────────┘
职责分离:
Java后端(DocxParseService)负责解析 DOCX、存储和管理附件;Python NER服务负责要素提取。
// 来自 elementApi.list(projectId)
elements = [
{
id: 701,
elementKey: "project.reviewObject",
elementName: "评审对象",
elementType: "text", // text | paragraph | table | static
namespace: "project", // grouping namespace
sortOrder: 1,
description: "被评审企业的完整名称"
},
{
id: 702,
elementKey: "project.workStartAt",
elementName: "评审开始日期",
elementType: "text",
namespace: "project",
sortOrder: 2
},
// ... 47 elements in total
]
// From valueApi.list(projectId) — one value record per element key.
values = [
{
valueId: 801,
elementKey: "project.reviewObject",
valueText: "中国电建集团成都勘测设计研究院有限公司",
isFilled: true,
fillSource: "ai", // default | manual | rule | ai
confidence: 0.95,
sourceAttachmentId: 402,
extractPosition: { charStart: 100, charEnd: 130, line: 5 }
},
{
valueId: 802,
elementKey: "project.workStartAt",
valueText: "2024年7月13日",
isFilled: true,
fillSource: "rule",
confidence: 0.98
},
// ...
]
// From attachmentApi.getDocContent(attachmentId) — structured document for rendering.
doc_content = {
// Page geometry in millimetres.
page: {
widthMm: 210,
heightMm: 297,
marginTopMm: 25.4,
marginBottomMm: 25.4,
marginLeftMm: 31.8,
marginRightMm: 31.8
},
// Ordered content blocks: headings, paragraphs (runs of styled text), tables.
blocks: [
{
id: "b0",
type: "heading1",
runs: [{ text: "1 企业概述", bold: true, fontSize: 16 }],
style: { alignment: "left" }
},
{
id: "b1",
type: "paragraph",
runs: [
{ text: "中国电建集团成都勘测设计研究院有限公司", bold: true },
{ text: "(以下简称"成都院")是..." }
]
},
{
id: "b2",
type: "table",
table: {
rows: 5,
cols: 4,
data: [
[{ text: "序号" }, { text: "项目名称" }, { text: "简称" }, { text: "类型" }],
[{ text: "1" }, { text: "成都院本部" }, { text: "本部" }, { text: "单位" }],
// ...
]
}
},
// ...
],
totalBlocks: 350
}
| 类别 | 要素数 | 提取方法 | 示例 |
|---|---|---|---|
| A. 简单结构化 | 8 | NER规则 | 日期、得分、级别、编号 |
| B. 实体识别 | 2 | NER+后处理 | 评审对象、简称 |
| C. 总结型 | 18 | LLM | 目标、职责、安全投入等 |
| D. 列表拼接 | 2 | NER+LLM | 复审范围、工作过程 |
| E. 表格数据 | 7 | 表格解析+LLM | 项目列表、人员列表 |
# NER extraction rules: element key -> { patterns, type, post_process }.
# Each regex must expose the target value in capture group 1; patterns are
# tried in order.  post_process: take_first / take_last pick the occurrence,
# to_float normalizes a numeric string.
NER_RULES = {
# Dates
"project.workStartAt": {
"patterns": [
r'评审(?:开始)?日期[::]\s*(\d{4}年\d{1,2}月\d{1,2}日)',
r'(\d{4}年\d{1,2}月\d{1,2}日)至',
],
"type": "DATE",
"post_process": "take_first"
},
"project.workEndAt": {
"patterns": [
r'至(\d{4}年\d{1,2}月\d{1,2}日)',
r'评审(?:结束)?日期[::]\s*(\d{4}年\d{1,2}月\d{1,2}日)',
],
"type": "DATE",
"post_process": "take_last"
},
# Scores
"project.resultScore": {
"patterns": [
r'评审得分[::]\s*(\d+\.?\d*)\s*分',
r'得分[::]\s*(\d+\.?\d*)',
],
"type": "SCORE",
"post_process": "to_float"
},
# Levels
"project.resultLevel": {
"patterns": [
r'级别[::]\s*(一级|二级|三级)',
r'评审(?:结论)?级别[::]\s*(一级|二级|三级)',
],
"type": "LEVEL"
},
# Codes
"basicInfo.projectCode": {
"patterns": [
r'项目编号[::]\s*([A-Z]+-\d+-\d+)',
r'编号[::]\s*([A-Z0-9\-]+)',
],
"type": "CODE"
},
"basicInfo.reviewObjectCertificateCode": {
"patterns": [
r'证书编号[::]\s*(ZGDIDBOY-\d+)',
r'证书编号[::]\s*([A-Z0-9\-]+)',
],
"type": "CODE"
},
# Organizations
"project.reviewObject": {
"patterns": [
r'评审对象[::]\s*([^\n]{10,50}(?:公司|集团|院|所))',
],
"type": "ORG"
},
"project.reviewObjectAlias": {
"patterns": [
r'以下简称[「『""]([^」』""]{2,10})[」』""]',
r'简称[「『""]([^」』""]{2,10})[」』""]',
],
"type": "ALIAS"
},
}
# LLM extraction configuration: summary-style elements are prompted from a
# relevant text slice (located via source_pattern); table elements are
# prompted from a matched table and expect a JSON-array answer.
LLM_EXTRACT_CONFIG = {
# Summary elements (18 total)
"summary_elements": [
{
"element_key": "project.target",
"element_name": "目标",
"source_pattern": r"5\.1\.1\.\d", # review-code pattern locating the section
"prompt_template": "请根据以下评审意见,总结企业的安全生产目标情况(100-200字):\n{text}"
},
{
"element_key": "project.duty",
"element_name": "职责",
"source_pattern": r"5\.1\.2\.1",
"prompt_template": "请根据以下评审意见,总结企业的安全生产职责落实情况(100-200字):\n{text}"
},
# ... remaining 16 entries
],
# Table elements (7 total)
"table_elements": [
{
"element_key": "+SPSRRReviewProject",
"element_name": "现场复审项目",
"table_type": "review_project",
"columns": ["项目名称", "简称", "类型", "排序"],
"prompt_template": """
请从以下文本中提取复审项目列表,以JSON数组格式返回:
[{"name": "项目名称", "alias": "简称", "type": "单位/在建项目", "order": 1}]
文本:{text}
"""
},
# ... remaining 6 entries
]
}
# python-services/ner-service/app/services/element_extractor.py
import re
import json
from typing import Dict, List, Any, Optional
from loguru import logger
from .ner_service import ner_service
from .deepseek_service import deepseek_service
class ElementExtractor:
    """Element extractor combining regex (NER) rules with LLM prompts.

    Simple structured fields (dates, scores, codes, org names) are matched
    with the regex rules in NER_RULES; summary-style and table fields are
    produced by an LLM driven by LLM_EXTRACT_CONFIG.
    """

    def __init__(self):
        self.ner_rules = NER_RULES            # regex rules defined above
        self.llm_config = LLM_EXTRACT_CONFIG  # LLM prompt configuration

    async def extract_all(
        self,
        doc_content: Dict,
        attachment_id: int
    ) -> Dict[str, Any]:
        """
        Extract every element from a parsed document.

        Args:
            doc_content: document structure produced by parse_docx
            attachment_id: attachment id recorded on each extracted value

        Returns:
            {
                "elements": [...],   # element definitions
                "values": [...],     # element values
                "statistics": {...}  # extraction statistics
            }
        """
        # 1. Flatten blocks to plain text and collect tables separately.
        blocks = doc_content.get('blocks', [])  # tolerate a missing key
        full_text = self._blocks_to_text(blocks)
        tables = self._extract_tables(blocks)
        logger.info(f"开始提取要素: attachment_id={attachment_id}, "
                    f"text_length={len(full_text)}, tables={len(tables)}")
        # 2. Regex-rule extraction (simple fields).
        ner_values = await self._extract_by_ner(full_text, attachment_id)
        logger.info(f"NER提取完成: {len(ner_values)} 个要素")
        # 3. LLM extraction (summaries + tables).
        llm_values = await self._extract_by_llm(full_text, tables, attachment_id)
        logger.info(f"LLM提取完成: {len(llm_values)} 个要素")
        # 4. Merge; LLM results win on key collisions (later dict wins).
        all_values = {**ner_values, **llm_values}
        # 5. Build the elements/values payload.
        elements, values = self._build_output(all_values, attachment_id)
        return {
            "elements": elements,
            "values": values,
            "statistics": {
                "total_elements": len(elements),
                "filled_values": len([v for v in values if v.get("isFilled")]),
                "ner_extracted": len(ner_values),
                "llm_extracted": len(llm_values),
            }
        }

    def _blocks_to_text(self, blocks: List[Dict]) -> str:
        """Flatten document blocks into plain text.

        Table rows become "cell | cell | ..." lines followed by a blank
        separator line; paragraphs are the concatenation of their runs.
        """
        lines = []
        for block in blocks:
            if block['type'] == 'table':
                table = block.get('table', {})
                for row in table.get('data', []):
                    cells = [cell.get('text', '') for cell in row]
                    lines.append(' | '.join(cells))
                lines.append('')  # blank line after each table
            else:
                runs = block.get('runs', [])
                lines.append(''.join(r.get('text', '') for r in runs))
        return '\n'.join(lines)

    def _extract_tables(self, blocks: List[Dict]) -> List[Dict]:
        """Collect every table block together with its id and position."""
        tables = []
        for i, block in enumerate(blocks):
            if block['type'] == 'table':
                tables.append({
                    'block_id': block['id'],
                    'block_index': i,
                    'table': block.get('table', {})
                })
        return tables

    async def _extract_by_ner(
        self,
        text: str,
        attachment_id: int
    ) -> Dict[str, Dict]:
        """Regex-rule extraction for the simple structured elements.

        For each element the first matching pattern wins.  The rule's
        ``post_process`` controls which occurrence is kept ('take_last'
        keeps the final match; anything else keeps the first) and the
        optional normalization ('to_float').
        """
        results = {}
        for element_key, rule in self.ner_rules.items():
            post = rule.get('post_process')
            for pattern in rule['patterns']:
                matches = list(re.finditer(pattern, text))
                if not matches:
                    continue
                # Honour the declared occurrence preference.  Previously
                # 'take_last' was silently ignored and the first match
                # was always returned.
                match = matches[-1] if post == 'take_last' else matches[0]
                value = match.group(1)
                if post == 'to_float':
                    try:
                        value = str(float(value))
                    except (TypeError, ValueError):
                        pass  # keep the raw text when it is not numeric
                results[element_key] = {
                    'value': value,
                    'confidence': 0.95,
                    'source': 'ner',
                    'position': {
                        'charStart': match.start(1),
                        'charEnd': match.end(1),
                        'line': text[:match.start()].count('\n') + 1
                    }
                }
                break  # stop at the first pattern that matched
        return results

    async def _extract_by_llm(
        self,
        text: str,
        tables: List[Dict],
        attachment_id: int
    ) -> Dict[str, Dict]:
        """LLM extraction for summary-style and table elements.

        Failures for one element are logged and skipped so a single bad
        prompt/response does not abort the whole extraction.
        """
        results = {}
        # 1. Summary elements: prompt from the relevant text slice.
        for config in self.llm_config['summary_elements']:
            element_key = config['element_key']
            source_pattern = config.get('source_pattern')
            if source_pattern:
                relevant_text = self._find_relevant_text(text, source_pattern)
            else:
                relevant_text = text[:5000]  # fall back to the first 5000 chars
            if relevant_text:
                prompt = config['prompt_template'].format(text=relevant_text)
                try:
                    response = await deepseek_service.chat(prompt)
                    results[element_key] = {
                        'value': response.strip(),
                        'confidence': 0.85,
                        'source': 'llm'
                    }
                except Exception as e:
                    logger.error(f"LLM提取失败: {element_key}, error={e}")
        # 2. Table elements: prompt from the matched table, expect JSON back.
        for config in self.llm_config['table_elements']:
            element_key = config['element_key']
            relevant_table = self._find_relevant_table(tables, config['table_type'])
            if relevant_table:
                prompt = config['prompt_template'].format(
                    text=json.dumps(relevant_table, ensure_ascii=False)
                )
                try:
                    response = await deepseek_service.chat(prompt)
                    # A non-JSON response raises and is logged below.
                    table_data = json.loads(response)
                    results[element_key] = {
                        'value': json.dumps(table_data, ensure_ascii=False),
                        'confidence': 0.80,
                        'source': 'llm',
                        'is_table': True
                    }
                except Exception as e:
                    logger.error(f"表格提取失败: {element_key}, error={e}")
        return results

    def _find_relevant_text(self, text: str, pattern: str) -> str:
        """Return up to 21 lines starting at the first line matching *pattern*."""
        lines = text.split('\n')
        relevant_lines = []
        capturing = False
        for line in lines:
            if re.search(pattern, line):
                capturing = True
            if capturing:
                relevant_lines.append(line)
                if len(relevant_lines) > 20:  # cap the slice at ~20 lines
                    break
        return '\n'.join(relevant_lines)

    def _find_relevant_table(self, tables: List[Dict], table_type: str) -> Optional[Dict]:
        """Find the first table whose header matches the type's keywords.

        A table qualifies when at least two of the type's keywords appear
        in its header row.  Returns the inner 'table' dict, or None.
        """
        keywords = {
            'review_project': ['项目名称', '简称', '类型'],
            'reviewer': ['姓名', '专业', '分工'],
            'suggestion': ['问题', '建议', '整改'],
        }
        target_keywords = keywords.get(table_type, [])
        for table_info in tables:
            table = table_info['table']
            if table.get('data') and len(table['data']) > 0:
                header_row = table['data'][0]
                header_texts = [cell.get('text', '') for cell in header_row]
                match_count = sum(1 for kw in target_keywords if any(kw in h for h in header_texts))
                if match_count >= 2:
                    return table
        return None

    def _build_output(
        self,
        extracted_values: Dict[str, Dict],
        attachment_id: int
    ) -> tuple:
        """Build the (elements, values) lists returned to the caller.

        Every templated element gets a value record; elements with no
        extraction result get an empty, unfilled placeholder.
        """
        # Element definition templates (key -> display metadata).
        ELEMENT_TEMPLATES = {
            "project.reviewObject": {"name": "评审对象", "type": "text", "namespace": "project"},
            "project.reviewObjectAlias": {"name": "评审对象简称", "type": "text", "namespace": "project"},
            "project.workStartAt": {"name": "评审开始日期", "type": "text", "namespace": "project"},
            "project.workEndAt": {"name": "评审结束日期", "type": "text", "namespace": "project"},
            "project.resultScore": {"name": "评审得分", "type": "text", "namespace": "project"},
            "project.resultLevel": {"name": "评审结论级别", "type": "text", "namespace": "project"},
            "project.target": {"name": "目标", "type": "paragraph", "namespace": "project"},
            "project.duty": {"name": "职责", "type": "paragraph", "namespace": "project"},
            # ... remaining elements
        }
        elements = []
        values = []
        for i, (element_key, template) in enumerate(ELEMENT_TEMPLATES.items()):
            elements.append({
                "id": 700 + i,
                "elementKey": element_key,
                "elementName": template["name"],
                "elementType": template["type"],
                "namespace": template["namespace"],
                "sortOrder": i
            })
            extracted = extracted_values.get(element_key)
            if extracted:
                value = {
                    "valueId": 800 + i,
                    "elementKey": element_key,
                    "valueText": extracted['value'],
                    "isFilled": True,
                    # LLM results are labelled "ai"; regex results "rule".
                    "fillSource": "ai" if extracted['source'] == 'llm' else "rule",
                    "confidence": extracted.get('confidence', 0.8),
                    "sourceAttachmentId": attachment_id
                }
                if 'position' in extracted:
                    value['extractPosition'] = extracted['position']
            else:
                value = {
                    "valueId": 800 + i,
                    "elementKey": element_key,
                    "valueText": "",
                    "isFilled": False,
                    "fillSource": "default"
                }
            values.append(value)
        return elements, values
# Module-level singleton shared by the FastAPI routers.
element_extractor = ElementExtractor()
# python-services/ner-service/app/routers/extract.py
from fastapi import APIRouter, HTTPException, UploadFile, File
from pydantic import BaseModel
from typing import Dict, Any, Optional
import json
from ..services.element_extractor import element_extractor
from ..services.docx_parser import parse_docx_file
# Router exposing the element-extraction endpoints.
router = APIRouter()
class ExtractRequest(BaseModel):
    """Request body for POST /extract/from-content."""
    doc_content: Dict[str, Any]  # document structure produced by parse_docx
    attachment_id: int
class ExtractResponse(BaseModel):
    """Response body for POST /extract/from-content."""
    success: bool
    elements: list
    values: list
    statistics: Dict[str, Any]
    error: Optional[str] = None  # populated only when success is False
@router.post("/extract/from-content", response_model=ExtractResponse)
async def extract_from_content(request: ExtractRequest):
    """
    Extract elements from an already-parsed document structure.

    Input: doc_content (output of parse_docx)
    Output: elements + values
    """
    try:
        extracted = await element_extractor.extract_all(
            doc_content=request.doc_content,
            attachment_id=request.attachment_id,
        )
        return ExtractResponse(
            success=True,
            elements=extracted["elements"],
            values=extracted["values"],
            statistics=extracted["statistics"],
        )
    except Exception as exc:
        # Report the failure in-band rather than raising an HTTP error.
        return ExtractResponse(
            success=False,
            elements=[],
            values=[],
            statistics={},
            error=str(exc),
        )
@router.post("/extract/from-docx")
async def extract_from_docx(
    file: UploadFile = File(...),
    attachment_id: int = 0
):
    """
    Full pipeline: upload DOCX -> parse -> extract elements.

    Input: a DOCX file
    Output: doc_content + elements + values
    """
    try:
        # Step 1: parse the uploaded DOCX into a structured document.
        raw = await file.read()
        parsed = parse_docx_file(raw)
        # Step 2: run the hybrid NER/LLM extraction on it.
        extracted = await element_extractor.extract_all(
            doc_content=parsed,
            attachment_id=attachment_id,
        )
        return {
            "success": True,
            "doc_content": parsed,
            "elements": extracted["elements"],
            "values": extracted["values"],
            "statistics": extracted["statistics"],
        }
    except Exception as exc:
        raise HTTPException(status_code=500, detail=str(exc))
// frontend/vue-demo/src/api/extract.js
import api from './index'
export const extractApi = {
  /**
   * Upload a DOCX file and run the full extraction pipeline.
   * @param {File} file - raw DOCX file
   * @param {number} attachmentId - attachment id (defaults to 0)
   */
  async extractFromDocx(file, attachmentId = 0) {
    const form = new FormData()
    form.append('file', file)
    form.append('attachment_id', attachmentId)
    const options = {
      headers: { 'Content-Type': 'multipart/form-data' },
      timeout: 120000 // parsing + LLM calls can take up to 2 minutes
    }
    return api.post('/extract/from-docx', form, options)
  },

  /**
   * Extract elements from an already-parsed document structure.
   * @param {object} docContent - output of parse_docx
   * @param {number} attachmentId
   */
  async extractFromContent(docContent, attachmentId) {
    const payload = {
      doc_content: docContent,
      attachment_id: attachmentId
    }
    return api.post('/extract/from-content', payload)
  }
}
<!-- Usage example -->
<script setup>
import { ref } from 'vue'
import { extractApi } from '@/api/extract'
const loading = ref(false)
const elements = ref([])
const values = ref([])
const docContent = ref(null)
// Upload a DOCX and populate docContent/elements/values from the response.
async function handleUpload(file) {
loading.value = true
try {
const result = await extractApi.extractFromDocx(file.raw)
if (result.success) {
docContent.value = result.doc_content
elements.value = result.elements
values.value = result.values
console.log('提取统计:', result.statistics)
// e.g. { total_elements: 47, filled_values: 35, ner_extracted: 8, llm_extracted: 27 }
}
} finally {
loading.value = false
}
}
</script>
现有的 parse_docx.py 已经可以将 DOCX 解析为结构化的 doc_content。
需要改造:在解析的基础上增加要素提取,使接口返回如下结构:
{
"success": true,
"doc_content": {
"page": { "widthMm": 210, "heightMm": 297 },
"blocks": [
{ "id": "b0", "type": "heading1", "runs": [{"text": "1 企业概述"}] },
// ... 350个blocks
],
"totalBlocks": 350
},
"elements": [
{ "id": 701, "elementKey": "project.reviewObject", "elementName": "评审对象", "elementType": "text" },
{ "id": 702, "elementKey": "project.workStartAt", "elementName": "评审开始日期", "elementType": "text" },
// ... 47个要素
],
"values": [
{ "valueId": 801, "elementKey": "project.reviewObject", "valueText": "中国电建集团成都勘测设计研究院有限公司", "isFilled": true, "fillSource": "rule", "confidence": 0.95 },
{ "valueId": 802, "elementKey": "project.workStartAt", "valueText": "2024年7月13日", "isFilled": true, "fillSource": "rule", "confidence": 0.98 },
{ "valueId": 803, "elementKey": "project.target", "valueText": "成都院制定并发布《QHSE"十四五"规划》...", "isFilled": true, "fillSource": "ai", "confidence": 0.85 },
// ... 47个值
],
"statistics": {
"total_elements": 47,
"filled_values": 35,
"ner_extracted": 8,
"llm_extracted": 27
}
}
文档版本:v1.0
创建时间:2024-03-04