il y a 1 mois · 55cffb4e4d
--- a/backend/parse-service/src/main/java/com/lingyue/parse/service/DocumentIndexService.java
+++ b/backend/parse-service/src/main/java/com/lingyue/parse/service/DocumentIndexService.java
@@ -0,0 +1,195 @@
 
				+package com.lingyue.parse.service;
			
 
				+
			
 
				+import com.fasterxml.jackson.databind.ObjectMapper;
			
 
				+import lombok.Data;
			
 
				+import lombok.RequiredArgsConstructor;
			
 
				+import lombok.extern.slf4j.Slf4j;
			
 
				+import org.springframework.stereotype.Service;
			
 
				+
			
 
				+import java.nio.file.Files;
			
 
				+import java.nio.file.Path;
			
 
				+import java.nio.file.Paths;
			
 
				+import java.util.ArrayList;
			
 
				+import java.util.List;
			
 
				+
			
 
				+/**
				+ * Document index service.
				+ * Generates, persists, loads and queries position indexes that map character
				+ * offsets of extracted text to page and line numbers.
				+ *
				+ * @author lingyue
				+ * @since 2026-01-20
				+ */
				+@Slf4j
				+@Service
				+@RequiredArgsConstructor
				+public class DocumentIndexService {
				+
				+    private final ObjectMapper objectMapper;
				+
				+    /**
				+     * Generates a line index for plain text (documents without pagination,
				+     * e.g. Word/Excel).
				+     * <p>
				+     * The text is split on {@code '\n'}. For every line, {@code charStart} is the
				+     * offset of its first character and {@code charEnd} is the offset just past its
				+     * last character, i.e. where its terminating newline sits (or the end of the
				+     * text for the last line). The whole text is exposed as one virtual page.
				+     *
				+     * @param text text content; {@code null} or empty yields an empty index that is
				+     *             not written to disk
				+     * @param documentId document ID
				+     * @param indexOutputPath index output path, or {@code null} to skip writing the file
				+     * @return the document index
				+     */
				+    public DocumentIndex generateLineIndex(String text, String documentId, String indexOutputPath) {
				+        if (text == null || text.isEmpty()) {
				+            return createEmptyIndex(documentId);
				+        }
				+
				+        // limit -1 keeps trailing empty strings, so a text ending in '\n' still
				+        // reports its final (empty) line
				+        String[] lines = text.split("\n", -1);
				+        List<LineIndex> lineIndices = new ArrayList<>(lines.length);
				+        int charPos = 0;
				+        int lineNum = 1;
				+
				+        for (String line : lines) {
				+            LineIndex lineIndex = new LineIndex();
				+            lineIndex.setLine(lineNum);
				+            lineIndex.setCharStart(charPos);
				+            lineIndex.setCharEnd(charPos + line.length());
				+            lineIndices.add(lineIndex);
				+
				+            charPos += line.length() + 1; // +1 for \n
				+            lineNum++;
				+        }
				+
				+        // Build the document index (no pagination, line index only)
				+        DocumentIndex index = new DocumentIndex();
				+        index.setDocumentId(documentId);
				+        index.setTotalChars(text.length());
				+        index.setTotalLines(lines.length);
				+        index.setTotalPages(1); // non-paginated documents always count as one page
				+        index.setLines(lineIndices);
				+
				+        // Build the single virtual page spanning the whole text
				+        List<PageIndex> pages = new ArrayList<>();
				+        PageIndex singlePage = new PageIndex();
				+        singlePage.setPage(1);
				+        singlePage.setCharStart(0);
				+        singlePage.setCharEnd(text.length());
				+        singlePage.setLineStart(1);
				+        singlePage.setLineEnd(lines.length);
				+        pages.add(singlePage);
				+        index.setPages(pages);
				+
				+        // Persist the index file
				+        if (indexOutputPath != null) {
				+            saveIndexFile(index, indexOutputPath);
				+        }
				+
				+        return index;
				+    }
				+
				+    /**
				+     * Loads an index file from disk.
				+     *
				+     * @param indexFilePath path of the JSON index file
				+     * @return the parsed index, or {@code null} when the file does not exist or
				+     *         cannot be read/parsed (the failure is logged)
				+     */
				+    public DocumentIndex loadIndex(String indexFilePath) {
				+        try {
				+            Path path = Paths.get(indexFilePath);
				+            if (!Files.exists(path)) {
				+                log.warn("索引文件不存在: {}", indexFilePath);
				+                return null;
				+            }
				+            String json = Files.readString(path);
				+            return objectMapper.readValue(json, DocumentIndex.class);
				+        } catch (Exception e) {
				+            log.error("读取索引文件失败: {}", indexFilePath, e);
				+            return null;
				+        }
				+    }
				+
				+    /**
				+     * Looks up the page number and line number for a character position.
				+     * <p>
				+     * Page and line ranges are both matched inclusively on {@code charEnd}.
				+     *
				+     * @param index document index
				+     * @param charPosition character position
				+     * @return {@code [page, line]}, or {@code null} when the index (or its page
				+     *         list) is missing or no page contains the position; the line is
				+     *         {@code 1} when the index carries no line entry for the position
				+     */
				+    public int[] findPageAndLine(DocumentIndex index, int charPosition) {
				+        if (index == null || index.getPages() == null) {
				+            return null;
				+        }
				+
				+        // 1. Locate the containing page. A position outside every page is "not
				+        //    found"; previously this silently reported page 1.
				+        PageIndex containingPage = null;
				+        for (PageIndex pageIndex : index.getPages()) {
				+            if (charPosition >= pageIndex.getCharStart() && charPosition <= pageIndex.getCharEnd()) {
				+                containingPage = pageIndex;
				+                break;
				+            }
				+        }
				+        if (containingPage == null) {
				+            return null;
				+        }
				+
				+        // 2. Locate the containing line; stays at 1 when the index has no line table
				+        int line = 1;
				+        if (index.getLines() != null) {
				+            for (LineIndex lineIndex : index.getLines()) {
				+                if (charPosition >= lineIndex.getCharStart() && charPosition <= lineIndex.getCharEnd()) {
				+                    line = lineIndex.getLine();
				+                    break;
				+                }
				+            }
				+        }
				+
				+        return new int[]{containingPage.getPage(), line};
				+    }
				+
				+    /**
				+     * Creates an index with zero totals and empty page/line lists.
				+     */
				+    private DocumentIndex createEmptyIndex(String documentId) {
				+        DocumentIndex index = new DocumentIndex();
				+        index.setDocumentId(documentId);
				+        index.setTotalChars(0);
				+        index.setTotalLines(0);
				+        index.setTotalPages(0);
				+        index.setPages(new ArrayList<>());
				+        // keep lines non-null as well so callers can iterate both lists uniformly
				+        index.setLines(new ArrayList<>());
				+        return index;
				+    }
				+
				+    /**
				+     * Writes the index as pretty-printed JSON, creating parent directories as
				+     * needed. Failures are logged and not propagated.
				+     */
				+    private void saveIndexFile(DocumentIndex index, String outputPath) {
				+        try {
				+            Path path = Paths.get(outputPath);
				+            // A bare file name has no parent; createDirectories(null) would throw
				+            // NullPointerException and the index would never be written.
				+            Path parent = path.getParent();
				+            if (parent != null) {
				+                Files.createDirectories(parent);
				+            }
				+            String json = objectMapper.writerWithDefaultPrettyPrinter().writeValueAsString(index);
				+            Files.writeString(path, json);
				+            log.info("文档索引文件已保存: {}", outputPath);
				+        } catch (Exception e) {
				+            log.error("保存文档索引文件失败: {}", outputPath, e);
				+        }
				+    }
				+
				+    /**
				+     * Document index.
				+     */
				+    @Data
				+    public static class DocumentIndex {
				+        private String documentId;
				+        private List<PageIndex> pages;
				+        private List<LineIndex> lines;
				+        private int totalChars;
				+        private int totalLines;
				+        private int totalPages;
				+    }
				+
				+    /**
				+     * Page index: the character and line range covered by one page.
				+     */
				+    @Data
				+    public static class PageIndex {
				+        private int page;
				+        private int charStart;
				+        private int charEnd;
				+        private int lineStart;
				+        private int lineEnd;
				+        private boolean ocrUsed; // never set by this service, so it keeps the default false
				+    }
				+
				+    /**
				+     * Line index: the character range covered by one line.
				+     */
				+    @Data
				+    public static class LineIndex {
				+        private int line;
				+        private int charStart;
				+        private int charEnd;
				+    }
				+}
			
--- a/backend/parse-service/src/main/java/com/lingyue/parse/service/ParseService.java
+++ b/backend/parse-service/src/main/java/com/lingyue/parse/service/ParseService.java
@@ -32,6 +32,7 @@ public class ParseService {
 
				     private final ExcelTextExtractionService excelTextExtractionService;
			
 
				     private final OcrResultParser ocrResultParser;
			
 
				     private final LayoutAnalysisService layoutAnalysisService;
			
 
				+    private final DocumentIndexService documentIndexService;
			
 
				     private final FileStorageProperties fileStorageProperties;
			
 
				     // 单体应用直接注入 Service，不使用 Feign Client
			
 
				     private final com.lingyue.graph.service.TextStorageService textStorageService;
			
@@ -96,8 +97,12 @@ public class ParseService {
 
				                 task.setProgress(20);
			
 
				                 saveParseTask(task);
			
 
				                 
			
 
				-                // PDF使用分页判断逻辑
			
 
				-                plainText = pdfTextExtractionService.extractText(sourceFilePath);
			
 
				+                // PDF使用分页判断逻辑，并生成页面索引
			
 
				+                String indexFilePath = buildIndexFilePath(documentId);
			
 
				+                PdfTextExtractionService.ExtractionResult extractionResult = 
			
 
				+                    pdfTextExtractionService.extractTextWithIndex(sourceFilePath, documentId, indexFilePath);
			
 
				+                plainText = extractionResult.getText();
			
 
				+                log.info("PDF提取完成，索引文件: {}", indexFilePath);
			
 
				             } else if (fileType == FileType.WORD || fileType == FileType.WORD_OLD) {
			
 
				                 log.info("处理Word文件: {}", sourceFilePath);
			
 
				                 task.setCurrentStep("word_extraction");
			
@@ -106,6 +111,11 @@ public class ParseService {
 
				                 
			
 
				                 // Word文档直接提取文本
			
 
				                 plainText = wordTextExtractionService.extractText(sourceFilePath);
			
 
				+                
			
 
				+                // 为Word生成行索引
			
 
				+                String indexFilePath = buildIndexFilePath(documentId);
			
 
				+                documentIndexService.generateLineIndex(plainText, documentId, indexFilePath);
			
 
				+                log.info("Word提取完成，索引文件: {}", indexFilePath);
			
 
				             } else if (fileType == FileType.EXCEL || fileType == FileType.EXCEL_OLD) {
			
 
				                 log.info("处理Excel文件: {}", sourceFilePath);
			
 
				                 task.setCurrentStep("excel_extraction");
			
@@ -114,6 +124,11 @@ public class ParseService {
 
				                 
			
 
				                 // Excel表格直接提取文本
			
 
				                 plainText = excelTextExtractionService.extractText(sourceFilePath);
			
 
				+                
			
 
				+                // 为Excel生成行索引
			
 
				+                String indexFilePath = buildIndexFilePath(documentId);
			
 
				+                documentIndexService.generateLineIndex(plainText, documentId, indexFilePath);
			
 
				+                log.info("Excel提取完成，索引文件: {}", indexFilePath);
			
 
				             } else if (fileType.isImage()) {
			
 
				                 log.info("处理图片文件: {}", sourceFilePath);
			
 
				                 task.setCurrentStep("ocr");
			
@@ -267,6 +282,18 @@ public class ParseService {
 
				         );
			
 
				         return path.toString();
			
 
				     }
			
 
				+    
			
 
				+    /**
				+     * Builds the storage path of the index file for a document:
				+     * {@code <textPath>/<first two characters of the ID>/<ID>_index.json}.
				+     *
				+     * @param documentId document ID; must have at least two characters
				+     * @return the index file path as a string
				+     */
				+    private String buildIndexFilePath(String documentId) {
				+        String textRoot = fileStorageProperties.getTextPath();
				+        String shardDir = documentId.substring(0, 2);
				+        String fileName = documentId + "_index.json";
				+        return Path.of(textRoot, shardDir, fileName).toString();
				+    }
			
 
				 
			
 
				     /**
			
 
				      * 将纯文本写入 TXT 文件
			
--- a/backend/parse-service/src/main/java/com/lingyue/parse/service/PdfTextExtractionService.java
+++ b/backend/parse-service/src/main/java/com/lingyue/parse/service/PdfTextExtractionService.java
@@ -1,10 +1,11 @@
 
				 package com.lingyue.parse.service;
			
 
				 
			
 
				+import com.fasterxml.jackson.databind.ObjectMapper;
			
 
				 import com.lingyue.common.exception.ServiceException;
			
 
				+import lombok.Data;
			
 
				 import lombok.RequiredArgsConstructor;
			
 
				 import lombok.extern.slf4j.Slf4j;
			
 
				 import org.apache.pdfbox.pdmodel.PDDocument;
			
 
				-import org.apache.pdfbox.pdmodel.PDPage;
			
 
				 import org.apache.pdfbox.rendering.PDFRenderer;
			
 
				 import org.apache.pdfbox.text.PDFTextStripper;
			
 
				 import org.springframework.stereotype.Service;
			
@@ -34,6 +35,7 @@ public class PdfTextExtractionService {
 
				     
			
 
				     private final PaddleOcrClient paddleOcrClient;
			
 
				     private final OcrResultParser ocrResultParser;
			
 
				+    private final ObjectMapper objectMapper;
			
 
				     
			
 
				     /**
			
 
				      * 文本阈值：每页至少需要这么多字符才认为有文本层
			
@@ -207,6 +209,167 @@ public class PdfTextExtractionService {
 
				         return combinedText.toString();
			
 
				     }
			
 
				     
			
 
				+    /**
			
 
				+     * 提取PDF文本并生成页面索引
			
 
				+     * 
			
 
				+     * @param pdfFilePath PDF文件路径
			
 
				+     * @param documentId 文档ID
			
 
				+     * @param indexOutputPath 索引文件输出路径（如果为null则不生成索引文件）
			
 
				+     * @return 提取结果（包含文本和索引）
			
 
				+     */
			
 
				+    public ExtractionResult extractTextWithIndex(String pdfFilePath, String documentId, String indexOutputPath) {
			
 
				+        File pdfFile = new File(pdfFilePath);
			
 
				+        if (!pdfFile.exists()) {
			
 
				+            throw new ServiceException("PDF文件不存在: " + pdfFilePath);
			
 
				+        }
			
 
				+        
			
 
				+        List<PageTextResult> pageResults = new ArrayList<>();
			
 
				+        
			
 
				+        try (PDDocument document = org.apache.pdfbox.Loader.loadPDF(pdfFile)) {
			
 
				+            int totalPages = document.getNumberOfPages();
			
 
				+            log.info("开始处理PDF文件（带索引）: {}, 总页数: {}", pdfFilePath, totalPages);
			
 
				+            
			
 
				+            PDFTextStripper textStripper = new PDFTextStripper();
			
 
				+            
			
 
				+            // 逐页处理
			
 
				+            for (int pageNum = 1; pageNum <= totalPages; pageNum++) {
			
 
				+                log.debug("处理第 {} 页/共 {} 页", pageNum, totalPages);
			
 
				+                
			
 
				+                try {
			
 
				+                    textStripper.setStartPage(pageNum);
			
 
				+                    textStripper.setEndPage(pageNum);
			
 
				+                    String pageText = textStripper.getText(document);
			
 
				+                    
			
 
				+                    if (hasSufficientText(pageText)) {
			
 
				+                        log.debug("第 {} 页有文本层，直接使用，文本长度: {}", pageNum, pageText.length());
			
 
				+                        pageResults.add(new PageTextResult(pageNum, pageText, false));
			
 
				+                    } else {
			
 
				+                        log.debug("第 {} 页文本不足，使用OCR处理", pageNum);
			
 
				+                        String ocrText = extractTextByOcr(pdfFilePath, pageNum);
			
 
				+                        pageResults.add(new PageTextResult(pageNum, ocrText, true));
			
 
				+                    }
			
 
				+                } catch (Exception e) {
			
 
				+                    log.error("处理第 {} 页时出错，尝试使用OCR", pageNum, e);
			
 
				+                    try {
			
 
				+                        String ocrText = extractTextByOcr(pdfFilePath, pageNum);
			
 
				+                        pageResults.add(new PageTextResult(pageNum, ocrText, true));
			
 
				+                    } catch (Exception ocrException) {
			
 
				+                        log.error("第 {} 页OCR也失败", pageNum, ocrException);
			
 
				+                        pageResults.add(new PageTextResult(pageNum, "", true));
			
 
				+                    }
			
 
				+                }
			
 
				+            }
			
 
				+        } catch (IOException e) {
			
 
				+            log.error("读取PDF文件失败: {}", pdfFilePath, e);
			
 
				+            throw new ServiceException("读取PDF文件失败: " + e.getMessage());
			
 
				+        }
			
 
				+        
			
 
				+        // 生成带索引的合并结果
			
 
				+        return combinePageTextsWithIndex(pageResults, documentId, indexOutputPath);
			
 
				+    }
			
 
				+    
			
 
				+    /**
				+     * Merges the per-page texts into one string and records, for every page, the
				+     * character and line range it occupies in that string.
				+     * <p>
				+     * A page with non-blank text contributes a header line, its text and a
				+     * {@code "\n\n"} separator. A page with null or blank text contributes nothing
				+     * and gets an empty range ({@code charEnd == charStart - 1},
				+     * {@code lineEnd == lineStart - 1}).
				+     *
				+     * @param pageResults per-page extraction results, in page order
				+     * @param documentId document ID stored in the index
				+     * @param indexOutputPath output path of the index file ({@code null} skips writing it)
				+     * @return the combined text together with its index
				+     */
				+    private ExtractionResult combinePageTextsWithIndex(List<PageTextResult> pageResults, 
				+                                                        String documentId, 
				+                                                        String indexOutputPath) {
				+        StringBuilder merged = new StringBuilder();
				+        List<PageIndex> pageEntries = new ArrayList<>();
				+
				+        // 1-based number of the next line to be written; the current character
				+        // offset is always merged.length()
				+        int nextLine = 1;
				+
				+        for (PageTextResult page : pageResults) {
				+            int firstChar = merged.length();
				+            int firstLine = nextLine;
				+
				+            String body = page.getText();
				+            boolean hasContent = body != null && !body.trim().isEmpty();
				+            if (hasContent) {
				+                // Page header marker
				+                String sourceTag = page.isOcrUsed() ? " (OCR识别)" : " (文本层提取)";
				+                String header = "=== 第 " + page.getPageNum() + " 页" + sourceTag + " ===\n";
				+
				+                // Header, page content, then the blank-line separator
				+                merged.append(header).append(body).append("\n\n");
				+
				+                // one header line + the content lines + one blank separator line
				+                nextLine += 1 + countLines(body) + 1;
				+            }
				+
				+            // Record the page range; both ends are inclusive
				+            PageIndex entry = new PageIndex();
				+            entry.setPage(page.getPageNum());
				+            entry.setCharStart(firstChar);
				+            entry.setCharEnd(merged.length() - 1);
				+            entry.setLineStart(firstLine);
				+            entry.setLineEnd(nextLine - 1);
				+            entry.setOcrUsed(page.isOcrUsed());
				+            pageEntries.add(entry);
				+        }
				+
				+        // Assemble the document index
				+        DocumentIndex documentIndex = new DocumentIndex();
				+        documentIndex.setDocumentId(documentId);
				+        documentIndex.setPages(pageEntries);
				+        documentIndex.setTotalChars(merged.length());
				+        documentIndex.setTotalLines(nextLine - 1);
				+        documentIndex.setTotalPages(pageResults.size());
				+
				+        // Persist the index file
				+        if (indexOutputPath != null) {
				+            saveIndexFile(documentIndex, indexOutputPath);
				+        }
				+
				+        ExtractionResult extraction = new ExtractionResult();
				+        extraction.setText(merged.toString());
				+        extraction.setIndex(documentIndex);
				+        return extraction;
				+    }
			
 
				+    
			
 
				+    /**
				+     * Counts the lines of a text: {@code 0} for null or empty text, otherwise the
				+     * number of {@code '\n'} characters plus one.
				+     */
				+    private int countLines(String text) {
				+        if (text == null || text.isEmpty()) {
				+            return 0;
				+        }
				+        int newlineCount = 0;
				+        int at = text.indexOf('\n');
				+        while (at >= 0) {
				+            newlineCount++;
				+            at = text.indexOf('\n', at + 1);
				+        }
				+        return newlineCount + 1;
				+    }
			
 
				+    
			
 
				+    /**
				+     * Writes the index as pretty-printed JSON, creating parent directories as
				+     * needed. Saving is best effort: failures are logged and never propagated.
				+     *
				+     * @param index index to serialize
				+     * @param outputPath target file path
				+     */
				+    private void saveIndexFile(DocumentIndex index, String outputPath) {
				+        try {
				+            Path path = Paths.get(outputPath);
				+            // A bare file name has no parent; createDirectories(null) would throw
				+            // NullPointerException and the index would never be written.
				+            Path parent = path.getParent();
				+            if (parent != null) {
				+                Files.createDirectories(parent);
				+            }
				+            String json = objectMapper.writerWithDefaultPrettyPrinter().writeValueAsString(index);
				+            Files.writeString(path, json);
				+            log.info("页面索引文件已保存: {}", outputPath);
				+        } catch (Exception e) {
				+            log.error("保存页面索引文件失败: {}", outputPath, e);
				+            // A failed index save must not affect the main flow
				+        }
				+    }
			
 
				+    
			
 
				     /**
			
 
				      * 页面文本结果
			
 
				      */
			
@@ -233,4 +396,38 @@ public class PdfTextExtractionService {
 
				             return ocrUsed;
			
 
				         }
			
 
				     }
			
 
				+    
			
 
				+    /**
				+     * Extraction result: the combined text of all pages together with the index
				+     * describing where each page sits inside that text.
				+     */
				+    @Data
				+    public static class ExtractionResult {
				+        private String text; // combined text of all pages, including page header lines
				+        private DocumentIndex index; // index built for the combined text
				+    }
			
 
				+    
			
 
				+    /**
				+     * Document index: per-page ranges plus document-wide totals.
				+     */
				+    @Data
				+    public static class DocumentIndex {
				+        private String documentId; // ID of the indexed document
				+        private List<PageIndex> pages; // one entry per processed page, in page order
				+        private int totalChars; // length of the combined text
				+        private int totalLines; // number of lines in the combined text
				+        private int totalPages; // number of page results, blank pages included
				+    }
			
 
				+    
			
 
				+    /**
				+     * Page index: the character and line range one page occupies in the combined
				+     * text. Both ends are inclusive; a page that contributed no text has
				+     * {@code charEnd == charStart - 1} and {@code lineEnd == lineStart - 1}.
				+     */
				+    @Data
				+    public static class PageIndex {
				+        private int page; // page number taken from the page result
				+        private int charStart; // offset where the page starts (at its header line)
				+        private int charEnd; // offset of the last character belonging to the page
				+        private int lineStart; // 1-based line number of the page header line
				+        private int lineEnd; // last line of the page, including the blank separator line
				+        private boolean ocrUsed; // whether the page text came from OCR
				+    }
			
 
				 }