|
|
@@ -93,6 +93,10 @@ public class ParseService {
|
|
|
task.setStatus("processing");
|
|
|
task.setCurrentStep("parsing");
|
|
|
task.setProgress(10);
|
|
|
+ task.setStartedAt(new java.util.Date());
|
|
|
+ // 更新解析阶段状态
|
|
|
+ task.setParseStatus("processing");
|
|
|
+ task.setParseProgress(0);
|
|
|
saveParseTask(task);
|
|
|
|
|
|
try {
|
|
|
@@ -103,6 +107,7 @@ public class ParseService {
|
|
|
log.info("处理PDF文件: {}", sourceFilePath);
|
|
|
task.setCurrentStep("pdf_extraction");
|
|
|
task.setProgress(20);
|
|
|
+ task.setParseProgress(50);
|
|
|
saveParseTask(task);
|
|
|
|
|
|
// PDF使用分页判断逻辑
|
|
|
@@ -112,6 +117,7 @@ public class ParseService {
|
|
|
log.info("处理Word文件: {}", sourceFilePath);
|
|
|
task.setCurrentStep("word_extraction");
|
|
|
task.setProgress(20);
|
|
|
+ task.setParseProgress(50);
|
|
|
saveParseTask(task);
|
|
|
|
|
|
// Word文档直接提取文本
|
|
|
@@ -121,6 +127,7 @@ public class ParseService {
|
|
|
log.info("处理Excel文件: {}", sourceFilePath);
|
|
|
task.setCurrentStep("excel_extraction");
|
|
|
task.setProgress(20);
|
|
|
+ task.setParseProgress(50);
|
|
|
saveParseTask(task);
|
|
|
|
|
|
// Excel表格直接提取文本
|
|
|
@@ -130,6 +137,7 @@ public class ParseService {
|
|
|
log.info("处理图片文件: {}", sourceFilePath);
|
|
|
task.setCurrentStep("ocr");
|
|
|
task.setProgress(20);
|
|
|
+ task.setParseProgress(50);
|
|
|
saveParseTask(task);
|
|
|
|
|
|
// 图片使用OCR
|
|
|
@@ -139,6 +147,7 @@ public class ParseService {
|
|
|
log.info("处理其他文件类型: {}, 使用OCR", fileType);
|
|
|
task.setCurrentStep("ocr");
|
|
|
task.setProgress(20);
|
|
|
+ task.setParseProgress(50);
|
|
|
saveParseTask(task);
|
|
|
|
|
|
// 其他文件类型使用OCR
|
|
|
@@ -146,9 +155,13 @@ public class ParseService {
|
|
|
plainText = ocrResultParser.parseText(ocrResult);
|
|
|
}
|
|
|
|
|
|
+ // 文本提取完成
|
|
|
+ task.setParseStatus("completed");
|
|
|
+ task.setParseProgress(100);
|
|
|
+
|
|
|
// 3. 将纯文本写入 TXT 文件
|
|
|
task.setCurrentStep("saving");
|
|
|
- task.setProgress(80);
|
|
|
+ task.setProgress(40);
|
|
|
saveParseTask(task);
|
|
|
|
|
|
String textFilePath = buildTextFilePath(documentId);
|
|
|
@@ -160,43 +173,49 @@ public class ParseService {
|
|
|
}
|
|
|
log.info("文本已写入: {}", textFilePath);
|
|
|
|
|
|
- // 4. 版面分析
|
|
|
- task.setCurrentStep("layout_analysis");
|
|
|
- task.setProgress(85);
|
|
|
+ // 4. RAG 向量化(在 recordTextStorage 中自动执行)
|
|
|
+ task.setCurrentStep("rag");
|
|
|
+ task.setProgress(50);
|
|
|
+ task.setRagStatus("processing");
|
|
|
+ task.setRagProgress(0);
|
|
|
saveParseTask(task);
|
|
|
|
|
|
try {
|
|
|
- LayoutAnalysisService.LayoutAnalysisResult layoutResult =
|
|
|
- layoutAnalysisService.analyzeLayout(sourceFilePath, fileType, plainText);
|
|
|
- log.info("版面分析完成: 识别到 {} 个元素", layoutResult.getElementCount());
|
|
|
-
|
|
|
- // 将版面分析结果保存到任务选项(可选,用于后续图节点构建)
|
|
|
- if (task.getOptions() == null) {
|
|
|
- task.setOptions(new java.util.HashMap<>());
|
|
|
- }
|
|
|
- java.util.Map<String, Object> options = (java.util.Map<String, Object>) task.getOptions();
|
|
|
- options.put("layoutAnalysis", layoutResult);
|
|
|
+ recordTextStorage(documentId, textFilePath);
|
|
|
+ task.setRagStatus("completed");
|
|
|
+ task.setRagProgress(100);
|
|
|
} catch (Exception e) {
|
|
|
- log.warn("版面分析失败,但不影响主流程: documentId={}", documentId, e);
|
|
|
- // 版面分析失败不影响主流程,只记录警告日志
|
|
|
+ log.warn("RAG索引失败,但不影响主流程: documentId={}, filePath={}", documentId, textFilePath, e);
|
|
|
+ task.setRagStatus("failed");
|
|
|
}
|
|
|
-
|
|
|
- // 5. 记录文本存储路径到数据库
|
|
|
- task.setCurrentStep("recording");
|
|
|
- task.setProgress(90);
|
|
|
+
|
|
|
+ // 5. 结构化解析(版面分析)
|
|
|
+ task.setCurrentStep("structured");
|
|
|
+ task.setProgress(70);
|
|
|
+ task.setStructuredStatus("processing");
|
|
|
+ task.setStructuredProgress(0);
|
|
|
saveParseTask(task);
|
|
|
|
|
|
try {
|
|
|
- recordTextStorage(documentId, textFilePath);
|
|
|
+ LayoutAnalysisService.LayoutAnalysisResult layoutResult =
|
|
|
+ layoutAnalysisService.analyzeLayout(sourceFilePath, fileType, plainText);
|
|
|
+ log.info("版面分析完成: 识别到 {} 个元素", layoutResult.getElementCount());
|
|
|
+
|
|
|
+ task.setStructuredStatus("completed");
|
|
|
+ task.setStructuredProgress(100);
|
|
|
+ task.setStructuredElementCount(layoutResult.getElementCount());
|
|
|
+ task.setStructuredImageCount(layoutResult.getImageCount());
|
|
|
+ task.setStructuredTableCount(layoutResult.getTableCount());
|
|
|
} catch (Exception e) {
|
|
|
- log.warn("记录文本存储路径失败,但不影响主流程: documentId={}, filePath={}", documentId, textFilePath, e);
|
|
|
- // 记录失败不影响主流程,只记录警告日志
|
|
|
+ log.warn("版面分析失败,但不影响主流程: documentId={}", documentId, e);
|
|
|
+ task.setStructuredStatus("failed");
|
|
|
}
|
|
|
|
|
|
- // 6. 更新任务状态为完成
|
|
|
- task.setStatus("completed");
|
|
|
+ // 6. NER and graph construction are handled asynchronously by an event listener; no NER/graph status is set at this point
|
|
|
+ // 主解析流程完成,更新任务状态
|
|
|
task.setCurrentStep("completed");
|
|
|
task.setProgress(100);
|
|
|
+ task.setStatus("completed");
|
|
|
task.setCompletedAt(new java.util.Date());
|
|
|
saveParseTask(task);
|
|
|
} catch (Exception e) {
|