|
@@ -1,6 +1,8 @@
|
|
|
package com.lingyue.graph.listener;
|
|
package com.lingyue.graph.listener;
|
|
|
|
|
|
|
|
import com.lingyue.common.event.DocumentParsedEvent;
|
|
import com.lingyue.common.event.DocumentParsedEvent;
|
|
|
|
|
+import com.lingyue.document.entity.Document;
|
|
|
|
|
+import com.lingyue.document.repository.DocumentRepository;
|
|
|
import com.lingyue.graph.service.GraphNerService;
|
|
import com.lingyue.graph.service.GraphNerService;
|
|
|
import com.lingyue.graph.service.NerToBlockService;
|
|
import com.lingyue.graph.service.NerToBlockService;
|
|
|
import com.lingyue.graph.service.NerToBlockService.TextElementDTO;
|
|
import com.lingyue.graph.service.NerToBlockService.TextElementDTO;
|
|
@@ -17,12 +19,15 @@ import java.util.*;
|
|
|
|
|
|
|
|
/**
|
|
/**
|
|
|
* 文档解析完成事件监听器
|
|
* 文档解析完成事件监听器
|
|
|
- * 监听文档解析完成事件,自动触发 NER 提取并保存到图数据库
|
|
|
|
|
|
|
+ * 监听文档解析完成事件,自动触发后续处理流程:
|
|
|
|
|
+ * 1. 结构化解析(Word 文档 -> 段落/图片/表格)
|
|
|
|
|
+ * 2. NER 实体提取(文本 -> 实体/关系)
|
|
|
*
|
|
*
|
|
|
- * 2026-01-21 更新:增加将 NER 结果转换为 TextElement 的能力
|
|
|
|
|
|
|
+ * 所有步骤同时支持手动触发 API,可单独重新生成
|
|
|
*
|
|
*
|
|
|
* @author lingyue
|
|
* @author lingyue
|
|
|
* @since 2026-01-19
|
|
* @since 2026-01-19
|
|
|
|
|
+ * @updated 2026-01-21 增加自动结构化解析
|
|
|
*/
|
|
*/
|
|
|
@Slf4j
|
|
@Slf4j
|
|
|
@Component
|
|
@Component
|
|
@@ -32,9 +37,16 @@ public class DocumentParsedEventListener {
|
|
|
private final GraphNerService graphNerService;
|
|
private final GraphNerService graphNerService;
|
|
|
private final NerToBlockService nerToBlockService;
|
|
private final NerToBlockService nerToBlockService;
|
|
|
private final RestTemplate restTemplate;
|
|
private final RestTemplate restTemplate;
|
|
|
|
|
+ private final DocumentRepository documentRepository;
|
|
|
|
|
|
|
|
@Value("${ner.auto-extract.enabled:true}")
|
|
@Value("${ner.auto-extract.enabled:true}")
|
|
|
private boolean nerAutoExtractEnabled;
|
|
private boolean nerAutoExtractEnabled;
|
|
|
|
|
+
|
|
|
|
|
+ @Value("${parse.structured.auto-extract.enabled:true}")
|
|
|
|
|
+ private boolean structuredAutoExtractEnabled;
|
|
|
|
|
+
|
|
|
|
|
+ @Value("${server.port:5232}")
|
|
|
|
|
+ private int serverPort;
|
|
|
|
|
|
|
|
@Value("${ner.python-service.url:http://localhost:8001}")
|
|
@Value("${ner.python-service.url:http://localhost:8001}")
|
|
|
private String nerServiceUrl;
|
|
private String nerServiceUrl;
|
|
@@ -50,21 +62,80 @@ public class DocumentParsedEventListener {
|
|
|
|
|
|
|
|
/**
|
|
/**
|
|
|
* 处理文档解析完成事件
|
|
* 处理文档解析完成事件
|
|
|
- * 异步执行 NER 提取,不阻塞主流程
|
|
|
|
|
|
|
+ * 异步执行后续处理流程,不阻塞主流程
|
|
|
|
|
+ *
|
|
|
|
|
+ * 处理顺序:
|
|
|
|
|
+ * 1. 结构化解析(Word 文档提取段落/图片/表格)
|
|
|
|
|
+ * 2. NER 实体提取(文本提取实体/关系)
|
|
|
*/
|
|
*/
|
|
|
@Async
|
|
@Async
|
|
|
@EventListener
|
|
@EventListener
|
|
|
    public void handleDocumentParsedEvent(DocumentParsedEvent event) {
|
|
    public void handleDocumentParsedEvent(DocumentParsedEvent event) {
|
|
|
-        if (!nerAutoExtractEnabled) {
|
|
|
|
|
-            log.debug("NER 自动提取已禁用,跳过: documentId={}", event.getDocumentId());
|
|
|
|
|
-            return;
|
|
|
|
|
-        }
|
|
|
|
|
-
|
|
|
|
|
        String documentId = event.getDocumentId();
|
|
        String documentId = event.getDocumentId();
|
|
|
        String userId = event.getUserId();
|
|
        String userId = event.getUserId();
|
|
|
-
|
|
|
|
|
-        log.info("收到文档解析完成事件,开始 NER 提取: documentId={}, userId={}", documentId, userId);
|
|
|
|
|
-
|
|
|
|
|
|
|
+
|
|
|
|
|
+        log.info("收到文档解析完成事件: documentId={}, userId={}", documentId, userId);
|
|
|
|
|
+
|
|
|
|
|
+        long totalStartTime = System.currentTimeMillis();
|
|
|
|
|
+
|
|
|
|
|
+        // Step 1: structured extraction; the callee skips non-Word documents and swallows its own failures
|
|
|
|
|
+        if (structuredAutoExtractEnabled) {
|
|
|
|
|
+            triggerStructuredExtraction(documentId);
|
|
|
|
|
+        }
|
|
|
|
|
+
|
|
|
|
|
+        // Step 2: NER entity extraction, run only when ner.auto-extract.enabled is true (the default)
|
|
|
|
|
+        if (nerAutoExtractEnabled) {
|
|
|
|
|
+            triggerNerExtraction(documentId, userId);
|
|
|
|
|
+        }
|
|
|
|
|
+
|
|
|
|
|
+        long totalTime = System.currentTimeMillis() - totalStartTime;
|
|
|
|
|
+        log.info("文档后处理完成: documentId={}, totalTime={}ms", documentId, totalTime);
|
|
|
|
|
+    }
|
|
|
|
|
+
|
|
|
|
|
+    /**
+     * Triggers structured extraction (paragraphs, images, tables) for a parsed document.
+     *
+     * <p>Only documents whose type equals {@code "word"} (case-insensitive) are processed; a
+     * missing document or any other type is skipped with a log line. The extraction is started
+     * by issuing an HTTP GET to {@code /parse/structured/<documentId>} on
+     * {@code localhost:<serverPort>}.
+     *
+     * <p>Best-effort: every failure is logged and swallowed here so that the steps that run
+     * after this one (NER extraction) are not affected.
+     *
+     * @param documentId id of the document to process
+     */
+    private void triggerStructuredExtraction(String documentId) {
+        try {
+            // Structured extraction only applies to Word documents.
+            Document document = documentRepository.selectById(documentId);
+            if (document == null) {
+                log.warn("文档不存在,跳过结构化解析: documentId={}", documentId);
+                return;
+            }
+
+            String docType = document.getType();
+            if (!"word".equalsIgnoreCase(docType)) {
+                log.debug("非 Word 文档,跳过结构化解析: documentId={}, type={}", documentId, docType);
+                return;
+            }
+
+            log.info("开始自动结构化解析: documentId={}", documentId);
+            long startTime = System.currentTimeMillis();
+
+            // Pass documentId as a URI template variable instead of concatenating it, so that
+            // RestTemplate percent-encodes it and an id containing '/', '?' or '#' cannot
+            // change the path or query of the request.
+            // NOTE(review): assumes this service is reachable on localhost:serverPort with no
+            // servlet context path - confirm against the deployment configuration.
+            String urlTemplate = "http://localhost:" + serverPort + "/parse/structured/{documentId}";
+
+            // The response body is never read, so a wildcard avoids the raw ResponseEntity<Map>.
+            ResponseEntity<?> response = restTemplate.getForEntity(urlTemplate, Map.class, documentId);
+
+            if (response.getStatusCode().is2xxSuccessful()) {
+                long time = System.currentTimeMillis() - startTime;
+                log.info("结构化解析完成: documentId={}, time={}ms", documentId, time);
+            } else {
+                // NOTE(review): with RestTemplate's default error handler 4xx/5xx responses are
+                // raised as exceptions and land in the catch below - confirm how the injected
+                // RestTemplate is configured.
+                log.warn("结构化解析失败: documentId={}, status={}", documentId, response.getStatusCode());
+            }
+
+        } catch (Exception e) {
+            // Deliberately not rethrown: a failure here must not block the following steps.
+            // The exception is passed as the last argument so SLF4J logs the stack trace too.
+            log.error("自动结构化解析异常: documentId={}, error={}", documentId, e.getMessage(), e);
+        }
+    }
|
|
|
|
|
+
|
|
|
|
|
+ /**
|
|
|
|
|
+ * 触发 NER 实体提取
|
|
|
|
|
+ */
|
|
|
|
|
+ private void triggerNerExtraction(String documentId, String userId) {
|
|
|
long startTime = System.currentTimeMillis();
|
|
long startTime = System.currentTimeMillis();
|
|
|
|
|
|
|
|
try {
|
|
try {
|
|
@@ -79,6 +150,8 @@ public class DocumentParsedEventListener {
|
|
|
log.warn("文档文本为空,跳过 NER: documentId={}", documentId);
|
|
log.warn("文档文本为空,跳过 NER: documentId={}", documentId);
|
|
|
return;
|
|
return;
|
|
|
}
|
|
}
|
|
|
|
|
+
|
|
|
|
|
+ log.info("开始自动 NER 提取: documentId={}", documentId);
|
|
|
|
|
|
|
|
// 2. 调用 Python NER 服务(根据配置选择异步轮询或同步 API)
|
|
// 2. 调用 Python NER 服务(根据配置选择异步轮询或同步 API)
|
|
|
Map<String, Object> nerResponse;
|
|
Map<String, Object> nerResponse;
|
|
@@ -106,19 +179,15 @@ public class DocumentParsedEventListener {
|
|
|
|
|
|
|
|
// 5. 将 NER 结果转换为 TextElement 格式(用于结构化文档)
|
|
// 5. 将 NER 结果转换为 TextElement 格式(用于结构化文档)
|
|
|
List<TextElementDTO> textElements = nerToBlockService.convertToTextElements(text, entities);
|
|
List<TextElementDTO> textElements = nerToBlockService.convertToTextElements(text, entities);
|
|
|
- log.info("NER 结果已转换为 TextElement: documentId={}, elementCount={}",
|
|
|
|
|
|
|
+ log.debug("NER 结果已转换为 TextElement: documentId={}, elementCount={}",
|
|
|
documentId, textElements.size());
|
|
documentId, textElements.size());
|
|
|
-
|
|
|
|
|
- // TODO: 将 textElements 保存到 DocumentBlock 表
|
|
|
|
|
- // 这需要调用 document-service 的 API 或通过事件通知
|
|
|
|
|
|
|
|
|
|
long processingTime = System.currentTimeMillis() - startTime;
|
|
long processingTime = System.currentTimeMillis() - startTime;
|
|
|
|
|
|
|
|
- log.info("NER 自动提取完成: documentId={}, entityCount={}, relationCount={}, textElements={}, time={}ms",
|
|
|
|
|
|
|
+ log.info("NER 自动提取完成: documentId={}, entityCount={}, relationCount={}, time={}ms",
|
|
|
documentId,
|
|
documentId,
|
|
|
entities != null ? entities.size() : 0,
|
|
entities != null ? entities.size() : 0,
|
|
|
relationCount,
|
|
relationCount,
|
|
|
- textElements.size(),
|
|
|
|
|
processingTime);
|
|
processingTime);
|
|
|
|
|
|
|
|
} catch (Exception e) {
|
|
} catch (Exception e) {
|