Browse Source

feat: 添加文档页面索引生成功能

- PdfTextExtractionService 新增 extractTextWithIndex 方法
- 生成包含每页字符位置、行号的 _index.json 索引文件
- 新增 DocumentIndexService 处理 Word/Excel 文档的行索引
- ParseService 集成索引生成,自动为所有文档类型创建索引
何文松 1 month ago
parent
commit
55cffb4e4d

+ 195 - 0
backend/parse-service/src/main/java/com/lingyue/parse/service/DocumentIndexService.java

@@ -0,0 +1,195 @@
+package com.lingyue.parse.service;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+import lombok.Data;
+import lombok.RequiredArgsConstructor;
+import lombok.extern.slf4j.Slf4j;
+import org.springframework.stereotype.Service;
+
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * 文档索引服务
+ * 负责生成和管理文档的位置索引
+ * 
+ * @author lingyue
+ * @since 2026-01-20
+ */
+@Slf4j
+@Service
+@RequiredArgsConstructor
+public class DocumentIndexService {
+    
+    private final ObjectMapper objectMapper;
+    
+    /**
+     * Builds a line index for plain text extracted from non-paginated documents (Word, Excel, ...).
+     * <p>
+     * The text is split on {@code '\n'}; every segment becomes one {@link LineIndex} whose
+     * {@code charEnd} is {@code charStart + segment length}, i.e. the offset of the terminating
+     * newline (or the text length for the last segment). The whole text is exposed as one
+     * synthetic page. For {@code null} or empty text an empty index is returned.
+     * <p>
+     * In both cases the index is written to {@code indexOutputPath} when a path is given, so an
+     * index file is produced for every processed document, including empty ones. Writing is
+     * best-effort: a failed write is logged and does not affect the returned index.
+     *
+     * @param text text content; may be {@code null} or empty
+     * @param documentId document ID
+     * @param indexOutputPath index output path; {@code null} skips writing the file
+     * @return the document index
+     */
+    public DocumentIndex generateLineIndex(String text, String documentId, String indexOutputPath) {
+        if (text == null || text.isEmpty()) {
+            DocumentIndex emptyIndex = createEmptyIndex(documentId);
+            // Persist the empty index as well; previously no file was written for empty documents.
+            if (indexOutputPath != null) {
+                saveIndexFile(emptyIndex, indexOutputPath);
+            }
+            return emptyIndex;
+        }
+
+        // limit -1 keeps trailing empty segments, so offsets stay aligned with the original text
+        String[] lines = text.split("\n", -1);
+        List<LineIndex> lineIndices = new ArrayList<>(lines.length);
+        int charPos = 0;
+        int lineNum = 1;
+
+        for (String line : lines) {
+            LineIndex lineIndex = new LineIndex();
+            lineIndex.setLine(lineNum);
+            lineIndex.setCharStart(charPos);
+            lineIndex.setCharEnd(charPos + line.length());
+            lineIndices.add(lineIndex);
+
+            charPos += line.length() + 1; // +1 for \n
+            lineNum++;
+        }
+
+        // Document-level index (no pagination, line index only)
+        DocumentIndex index = new DocumentIndex();
+        index.setDocumentId(documentId);
+        index.setTotalChars(text.length());
+        index.setTotalLines(lines.length);
+        index.setTotalPages(1); // non-paginated documents are always reported as one page
+        index.setLines(lineIndices);
+
+        // Synthetic single page covering the whole text
+        List<PageIndex> pages = new ArrayList<>();
+        PageIndex singlePage = new PageIndex();
+        singlePage.setPage(1);
+        singlePage.setCharStart(0);
+        singlePage.setCharEnd(text.length());
+        singlePage.setLineStart(1);
+        singlePage.setLineEnd(lines.length);
+        pages.add(singlePage);
+        index.setPages(pages);
+
+        // Persist the index file
+        if (indexOutputPath != null) {
+            saveIndexFile(index, indexOutputPath);
+        }
+
+        return index;
+    }
+    
+    /**
+     * Loads a previously saved index file and deserializes it.
+     *
+     * @param indexFilePath path of the index JSON file
+     * @return the parsed index, or {@code null} when the file does not exist
+     *         or cannot be read or parsed (the failure is logged)
+     */
+    public DocumentIndex loadIndex(String indexFilePath) {
+        try {
+            Path indexFile = Paths.get(indexFilePath);
+            if (Files.exists(indexFile)) {
+                String content = Files.readString(indexFile);
+                return objectMapper.readValue(content, DocumentIndex.class);
+            }
+            log.warn("索引文件不存在: {}", indexFilePath);
+            return null;
+        } catch (Exception e) {
+            log.error("读取索引文件失败: {}", indexFilePath, e);
+            return null;
+        }
+    }
+    
+    /**
+     * 根据字符位置查找页码和行号
+     * 
+     * @param index 文档索引
+     * @param charPosition 字符位置
+     * @return [页码, 行号] 或 null(未找到)
+     */
+    public int[] findPageAndLine(DocumentIndex index, int charPosition) {
+        if (index == null || index.getPages() == null) {
+            return null;
+        }
+        
+        // 1. 找到所在页
+        int page = 1;
+        for (PageIndex pageIndex : index.getPages()) {
+            if (charPosition >= pageIndex.getCharStart() && charPosition <= pageIndex.getCharEnd()) {
+                page = pageIndex.getPage();
+                break;
+            }
+        }
+        
+        // 2. 找到所在行
+        int line = 1;
+        if (index.getLines() != null) {
+            for (LineIndex lineIndex : index.getLines()) {
+                if (charPosition >= lineIndex.getCharStart() && charPosition <= lineIndex.getCharEnd()) {
+                    line = lineIndex.getLine();
+                    break;
+                }
+            }
+        }
+        
+        return new int[]{page, line};
+    }
+    
+    /**
+     * Creates the index used for documents without any text: all counters are zero and both
+     * the page list and the line list are empty (never {@code null}), so consumers can iterate
+     * them without null checks.
+     *
+     * @param documentId document ID
+     * @return an empty index for the given document
+     */
+    private DocumentIndex createEmptyIndex(String documentId) {
+        DocumentIndex index = new DocumentIndex();
+        index.setDocumentId(documentId);
+        index.setTotalChars(0);
+        index.setTotalLines(0);
+        index.setTotalPages(0);
+        index.setPages(new ArrayList<>());
+        index.setLines(new ArrayList<>());
+        return index;
+    }
+    
+    /**
+     * Serializes the index as pretty-printed JSON and writes it to {@code outputPath},
+     * creating missing parent directories first. Best-effort: any failure is logged and
+     * swallowed so that index persistence never aborts the caller.
+     *
+     * @param index index to persist
+     * @param outputPath target file path
+     */
+    private void saveIndexFile(DocumentIndex index, String outputPath) {
+        try {
+            Path path = Paths.get(outputPath);
+            // getParent() is null for a bare file name; createDirectories(null) would throw
+            // and the index would silently not be written.
+            Path parent = path.getParent();
+            if (parent != null) {
+                Files.createDirectories(parent);
+            }
+            String json = objectMapper.writerWithDefaultPrettyPrinter().writeValueAsString(index);
+            Files.writeString(path, json);
+            log.info("文档索引文件已保存: {}", outputPath);
+        } catch (Exception e) {
+            log.error("保存文档索引文件失败: {}", outputPath, e);
+        }
+    }
+    
+    /**
+     * Document index: the top-level structure that is serialized to and read from the
+     * index JSON file. Getters and setters are generated by Lombok's {@code @Data}.
+     */
+    @Data
+    public static class DocumentIndex {
+        private String documentId; // ID of the indexed document
+        private List<PageIndex> pages; // page entries; generateLineIndex stores one synthetic page
+        private List<LineIndex> lines; // line entries as produced by generateLineIndex
+        private int totalChars; // length of the indexed text (0 for an empty index)
+        private int totalLines; // number of '\n'-separated segments (0 for an empty index)
+        private int totalPages; // 1 for non-paginated text, 0 for an empty index
+    }
+    
+    /**
+     * Page index: the character span and line span that one page occupies in the
+     * indexed text. Getters and setters are generated by Lombok's {@code @Data}.
+     */
+    @Data
+    public static class PageIndex {
+        private int page; // page number; generateLineIndex uses 1 for its single page
+        private int charStart; // offset of the page's first character
+        private int charEnd; // end offset; findPageAndLine compares it inclusively
+        private int lineStart; // first line of the page (1-based in generateLineIndex)
+        private int lineEnd; // last line of the page
+        private boolean ocrUsed; // not assigned anywhere in this service; stays false unless read from JSON
+    }
+    
+    /**
+     * Line index: the character span of one line of the indexed text.
+     * Getters and setters are generated by Lombok's {@code @Data}.
+     */
+    @Data
+    public static class LineIndex {
+        private int line; // line number, starting at 1 in generateLineIndex
+        private int charStart; // offset of the line's first character
+        private int charEnd; // charStart + line length, i.e. the offset just past the line's last character
+    }
+}

+ 29 - 2
backend/parse-service/src/main/java/com/lingyue/parse/service/ParseService.java

@@ -32,6 +32,7 @@ public class ParseService {
     private final ExcelTextExtractionService excelTextExtractionService;
     private final OcrResultParser ocrResultParser;
     private final LayoutAnalysisService layoutAnalysisService;
+    private final DocumentIndexService documentIndexService;
     private final FileStorageProperties fileStorageProperties;
     // 单体应用直接注入 Service,不使用 Feign Client
     private final com.lingyue.graph.service.TextStorageService textStorageService;
@@ -96,8 +97,12 @@ public class ParseService {
                 task.setProgress(20);
                 saveParseTask(task);
                 
-                // PDF使用分页判断逻辑
-                plainText = pdfTextExtractionService.extractText(sourceFilePath);
+                // PDF使用分页判断逻辑,并生成页面索引
+                String indexFilePath = buildIndexFilePath(documentId);
+                PdfTextExtractionService.ExtractionResult extractionResult = 
+                    pdfTextExtractionService.extractTextWithIndex(sourceFilePath, documentId, indexFilePath);
+                plainText = extractionResult.getText();
+                log.info("PDF提取完成,索引文件: {}", indexFilePath);
             } else if (fileType == FileType.WORD || fileType == FileType.WORD_OLD) {
                 log.info("处理Word文件: {}", sourceFilePath);
                 task.setCurrentStep("word_extraction");
@@ -106,6 +111,11 @@ public class ParseService {
                 
                 // Word文档直接提取文本
                 plainText = wordTextExtractionService.extractText(sourceFilePath);
+                
+                // 为Word生成行索引
+                String indexFilePath = buildIndexFilePath(documentId);
+                documentIndexService.generateLineIndex(plainText, documentId, indexFilePath);
+                log.info("Word提取完成,索引文件: {}", indexFilePath);
             } else if (fileType == FileType.EXCEL || fileType == FileType.EXCEL_OLD) {
                 log.info("处理Excel文件: {}", sourceFilePath);
                 task.setCurrentStep("excel_extraction");
@@ -114,6 +124,11 @@ public class ParseService {
                 
                 // Excel表格直接提取文本
                 plainText = excelTextExtractionService.extractText(sourceFilePath);
+                
+                // 为Excel生成行索引
+                String indexFilePath = buildIndexFilePath(documentId);
+                documentIndexService.generateLineIndex(plainText, documentId, indexFilePath);
+                log.info("Excel提取完成,索引文件: {}", indexFilePath);
             } else if (fileType.isImage()) {
                 log.info("处理图片文件: {}", sourceFilePath);
                 task.setCurrentStep("ocr");
@@ -267,6 +282,18 @@ public class ParseService {
         );
         return path.toString();
     }
+    
+    /**
+     * Builds the storage path of a document's index file:
+     * {@code <textPath>/<first two characters of documentId>/<documentId>_index.json}.
+     *
+     * @param documentId document ID; its first two characters name the sub-directory,
+     *                   so it must be at least two characters long
+     * @return the index file path as a string
+     */
+    private String buildIndexFilePath(String documentId) {
+        String baseDir = fileStorageProperties.getTextPath();
+        String subDir = documentId.substring(0, 2);
+        String fileName = documentId + "_index.json";
+        return Path.of(baseDir, subDir, fileName).toString();
+    }
 
     /**
      * 将纯文本写入 TXT 文件

+ 198 - 1
backend/parse-service/src/main/java/com/lingyue/parse/service/PdfTextExtractionService.java

@@ -1,10 +1,11 @@
 package com.lingyue.parse.service;
 
+import com.fasterxml.jackson.databind.ObjectMapper;
 import com.lingyue.common.exception.ServiceException;
+import lombok.Data;
 import lombok.RequiredArgsConstructor;
 import lombok.extern.slf4j.Slf4j;
 import org.apache.pdfbox.pdmodel.PDDocument;
-import org.apache.pdfbox.pdmodel.PDPage;
 import org.apache.pdfbox.rendering.PDFRenderer;
 import org.apache.pdfbox.text.PDFTextStripper;
 import org.springframework.stereotype.Service;
@@ -34,6 +35,7 @@ public class PdfTextExtractionService {
     
     private final PaddleOcrClient paddleOcrClient;
     private final OcrResultParser ocrResultParser;
+    private final ObjectMapper objectMapper;
     
     /**
      * 文本阈值:每页至少需要这么多字符才认为有文本层
@@ -207,6 +209,167 @@ public class PdfTextExtractionService {
         return combinedText.toString();
     }
     
+    /**
+     * Extracts the text of a PDF page by page and builds a page index for the combined text.
+     * <p>
+     * Each page is first read through {@code PDFTextStripper}. If {@code hasSufficientText}
+     * rejects the result, the page is passed to {@code extractTextByOcr} instead. If anything
+     * in that per-page attempt throws, OCR is tried once more; if that also fails the page is
+     * recorded with empty text and marked as OCR-processed. The collected page results are
+     * then merged by {@code combinePageTextsWithIndex}.
+     * 
+     * @param pdfFilePath path of the PDF file
+     * @param documentId document ID
+     * @param indexOutputPath output path of the index file (no index file is written when null)
+     * @return extraction result (combined text plus index)
+     * @throws ServiceException if the file does not exist or loading the PDF raises an IOException
+     */
+    public ExtractionResult extractTextWithIndex(String pdfFilePath, String documentId, String indexOutputPath) {
+        File pdfFile = new File(pdfFilePath);
+        if (!pdfFile.exists()) {
+            throw new ServiceException("PDF文件不存在: " + pdfFilePath);
+        }
+        
+        List<PageTextResult> pageResults = new ArrayList<>();
+        
+        try (PDDocument document = org.apache.pdfbox.Loader.loadPDF(pdfFile)) {
+            int totalPages = document.getNumberOfPages();
+            log.info("开始处理PDF文件(带索引): {}, 总页数: {}", pdfFilePath, totalPages);
+            
+            PDFTextStripper textStripper = new PDFTextStripper();
+            
+            // Process the document page by page (page numbers are 1-based)
+            for (int pageNum = 1; pageNum <= totalPages; pageNum++) {
+                log.debug("处理第 {} 页/共 {} 页", pageNum, totalPages);
+                
+                try {
+                    textStripper.setStartPage(pageNum); // restrict the stripper to this single page
+                    textStripper.setEndPage(pageNum);
+                    String pageText = textStripper.getText(document);
+                    
+                    if (hasSufficientText(pageText)) {
+                        log.debug("第 {} 页有文本层,直接使用,文本长度: {}", pageNum, pageText.length());
+                        pageResults.add(new PageTextResult(pageNum, pageText, false));
+                    } else {
+                        log.debug("第 {} 页文本不足,使用OCR处理", pageNum);
+                        String ocrText = extractTextByOcr(pdfFilePath, pageNum);
+                        pageResults.add(new PageTextResult(pageNum, ocrText, true));
+                    }
+                } catch (Exception e) {
+                    log.error("处理第 {} 页时出错,尝试使用OCR", pageNum, e);
+                    try {
+                        String ocrText = extractTextByOcr(pdfFilePath, pageNum);
+                        pageResults.add(new PageTextResult(pageNum, ocrText, true));
+                    } catch (Exception ocrException) {
+                        log.error("第 {} 页OCR也失败", pageNum, ocrException);
+                        pageResults.add(new PageTextResult(pageNum, "", true)); // keep the page in the index with empty text
+                    }
+                }
+            }
+        } catch (IOException e) {
+            log.error("读取PDF文件失败: {}", pdfFilePath, e);
+            throw new ServiceException("读取PDF文件失败: " + e.getMessage());
+        }
+        
+        // Merge the page results and build the index
+        return combinePageTextsWithIndex(pageResults, documentId, indexOutputPath);
+    }
+    
+    /**
+     * Joins the per-page texts into one string and builds the matching page index.
+     * <p>
+     * Every page with non-blank text contributes a header line, the page text and a trailing
+     * {@code "\n\n"}. Pages whose text is {@code null} or blank contribute nothing, so their
+     * index entry has {@code charEnd == charStart - 1} and {@code lineEnd == lineStart - 1}.
+     * {@code charEnd} and {@code lineEnd} are stored as inclusive positions.
+     *
+     * @param pageResults per-page extraction results, in output order
+     * @param documentId document ID recorded in the index
+     * @param indexOutputPath where to write the index file; {@code null} skips writing
+     * @return the combined text together with its index
+     */
+    private ExtractionResult combinePageTextsWithIndex(List<PageTextResult> pageResults,
+                                                        String documentId,
+                                                        String indexOutputPath) {
+        StringBuilder merged = new StringBuilder();
+        List<PageIndex> indexEntries = new ArrayList<>();
+
+        int charCursor = 0; // offset at which the next page starts
+        int lineCursor = 1; // 1-based line number at which the next page starts
+
+        for (PageTextResult page : pageResults) {
+            PageIndex entry = new PageIndex();
+            entry.setPage(page.getPageNum());
+            entry.setCharStart(charCursor);
+            entry.setLineStart(lineCursor);
+            entry.setOcrUsed(page.isOcrUsed());
+
+            String body = page.getText();
+            if (body != null && !body.trim().isEmpty()) {
+                // Page header naming the page number and how its text was obtained
+                String origin = page.isOcrUsed() ? " (OCR识别)" : " (文本层提取)";
+                String banner = "=== 第 " + page.getPageNum() + " 页" + origin + " ===\n";
+                merged.append(banner).append(body).append("\n\n");
+
+                // header + page text + the two separator newlines
+                charCursor += banner.length() + body.length() + 2;
+                // one header line + the page's lines + one blank separator line
+                lineCursor += countLines(body) + 2;
+            }
+
+            entry.setCharEnd(charCursor - 1);
+            entry.setLineEnd(lineCursor - 1);
+            indexEntries.add(entry);
+        }
+
+        // Document-level index
+        DocumentIndex documentIndex = new DocumentIndex();
+        documentIndex.setDocumentId(documentId);
+        documentIndex.setPages(indexEntries);
+        documentIndex.setTotalChars(charCursor);
+        documentIndex.setTotalLines(lineCursor - 1);
+        documentIndex.setTotalPages(pageResults.size());
+
+        // Persist the index file when a target path was supplied
+        if (indexOutputPath != null) {
+            saveIndexFile(documentIndex, indexOutputPath);
+        }
+
+        ExtractionResult extraction = new ExtractionResult();
+        extraction.setText(merged.toString());
+        extraction.setIndex(documentIndex);
+        return extraction;
+    }
+    
+    /**
+     * Counts the lines of a text: 0 for {@code null} or empty text,
+     * otherwise the number of {@code '\n'} characters plus one.
+     *
+     * @param text text to inspect; may be {@code null}
+     * @return the line count
+     */
+    private int countLines(String text) {
+        if (text == null || text.isEmpty()) {
+            return 0;
+        }
+        int count = 1;
+        int at = text.indexOf('\n');
+        while (at >= 0) {
+            count++;
+            at = text.indexOf('\n', at + 1);
+        }
+        return count;
+    }
+    
+    /**
+     * Serializes the index as pretty-printed JSON and writes it to {@code outputPath},
+     * creating missing parent directories first. Best-effort: any failure is logged and
+     * swallowed.
+     *
+     * @param index index to persist
+     * @param outputPath target file path
+     */
+    private void saveIndexFile(DocumentIndex index, String outputPath) {
+        try {
+            Path path = Paths.get(outputPath);
+            // getParent() is null for a bare file name; createDirectories(null) would throw
+            // and the index would silently not be written.
+            Path parent = path.getParent();
+            if (parent != null) {
+                Files.createDirectories(parent);
+            }
+            String json = objectMapper.writerWithDefaultPrettyPrinter().writeValueAsString(index);
+            Files.writeString(path, json);
+            log.info("页面索引文件已保存: {}", outputPath);
+        } catch (Exception e) {
+            log.error("保存页面索引文件失败: {}", outputPath, e);
+            // A failed index write must not break the main extraction flow
+        }
+    }
+    
     /**
      * 页面文本结果
      */
@@ -233,4 +396,38 @@ public class PdfTextExtractionService {
             return ocrUsed;
         }
     }
+    
+    /**
+     * Extraction result: the combined text together with the index describing it.
+     * Getters and setters are generated by Lombok's {@code @Data}.
+     */
+    @Data
+    public static class ExtractionResult {
+        private String text; // combined text of all pages, including the per-page header lines
+        private DocumentIndex index; // page index whose offsets refer to {@code text}
+    }
+    
+    /**
+     * Document index: the top-level structure serialized to the index JSON file.
+     * Getters and setters are generated by Lombok's {@code @Data}.
+     */
+    @Data
+    public static class DocumentIndex {
+        private String documentId; // ID of the indexed document
+        private List<PageIndex> pages; // one entry per processed page, including pages without text
+        private int totalChars; // length of the combined text
+        private int totalLines; // number of the last line counted in the combined text
+        private int totalPages; // number of page results that were merged
+    }
+    
+    /**
+     * Page index: the character span and line span that one PDF page occupies in the
+     * combined text. Getters and setters are generated by Lombok's {@code @Data}.
+     */
+    @Data
+    public static class PageIndex {
+        private int page; // page number taken from the page result
+        private int charStart; // offset at which the page's header line begins
+        private int charEnd; // inclusive offset of the page's last character; charStart - 1 for a page without text
+        private int lineStart; // 1-based line number of the page's header line
+        private int lineEnd; // inclusive last line of the page; lineStart - 1 for a page without text
+        private boolean ocrUsed; // whether the page result was marked as OCR-processed
+    }
 }