Ver código fonte

feat: PDF 索引增加行信息,统一索引格式

- PdfTextExtractionService 生成索引时同时记录每行的字符位置
- PDF 和 Word 文档索引格式统一:都包含 pages 和 lines 数组
- 支持精确的行号定位
何文松 1 mês atrás
pai
commit
57b8c92865

+ 36 - 22
backend/parse-service/src/main/java/com/lingyue/parse/service/PdfTextExtractionService.java

@@ -276,6 +276,7 @@ public class PdfTextExtractionService {
                                                         String indexOutputPath) {
         StringBuilder combinedText = new StringBuilder();
         List<PageIndex> pageIndices = new ArrayList<>();
+        List<LineIndex> lineIndices = new ArrayList<>();
         
         int currentCharPos = 0;
         int currentLine = 1;
@@ -294,18 +295,35 @@ public class PdfTextExtractionService {
                 }
                 pageHeader += " ===\n";
                 
+                // 记录页头行的索引
+                LineIndex headerLineIndex = new LineIndex();
+                headerLineIndex.setLine(currentLine);
+                headerLineIndex.setCharStart(currentCharPos);
+                headerLineIndex.setCharEnd(currentCharPos + pageHeader.length() - 2); // -2 去掉 \n
+                lineIndices.add(headerLineIndex);
+                
                 combinedText.append(pageHeader);
                 currentCharPos += pageHeader.length();
                 currentLine++;
                 
-                // 添加页面内容
+                // 添加页面内容并记录每行索引
                 String pageText = result.getText();
-                combinedText.append(pageText).append("\n\n");
+                String[] lines = pageText.split("\n", -1);
+                for (String line : lines) {
+                    LineIndex lineIndex = new LineIndex();
+                    lineIndex.setLine(currentLine);
+                    lineIndex.setCharStart(currentCharPos);
+                    lineIndex.setCharEnd(currentCharPos + line.length());
+                    lineIndices.add(lineIndex);
+                    
+                    currentCharPos += line.length() + 1; // +1 for \n
+                    currentLine++;
+                }
                 
-                // 计算行数
-                int pageLines = countLines(pageText);
-                currentLine += pageLines + 1; // +1 for the empty line separator
-                currentCharPos += pageText.length() + 2; // +2 for "\n\n"
+                // 添加额外的空行分隔符
+                combinedText.append(pageText).append("\n\n");
+                currentCharPos++; // 额外的 \n
+                currentLine++;
             }
             
             // 创建页面索引
@@ -323,6 +341,7 @@ public class PdfTextExtractionService {
         DocumentIndex documentIndex = new DocumentIndex();
         documentIndex.setDocumentId(documentId);
         documentIndex.setPages(pageIndices);
+        documentIndex.setLines(lineIndices);
         documentIndex.setTotalChars(currentCharPos);
         documentIndex.setTotalLines(currentLine - 1);
         documentIndex.setTotalPages(pageResults.size());
@@ -338,22 +357,6 @@ public class PdfTextExtractionService {
         return result;
     }
     
-    /**
-     * Counts the lines in a text: 0 for null or empty input, otherwise the number of '\n' characters plus one.
-     */
-    private int countLines(String text) {
-        if (text == null || text.isEmpty()) {
-            return 0;
-        }
-        int lines = 1;
-        for (char c : text.toCharArray()) {
-            if (c == '\n') {
-                lines++;
-            }
-        }
-        return lines;
-    }
-    
     /**
      * 保存索引文件
      */
@@ -413,11 +416,22 @@ public class PdfTextExtractionService {
     public static class DocumentIndex {
         private String documentId; // identifier of the indexed document
         private List<PageIndex> pages; // page index entries collected while combining the pages
+        private List<LineIndex> lines; // line index entries: page-header lines and page content lines
         private int totalChars; // final value of the running character position over the combined text
         private int totalLines; // final value of the running line counter minus one
         private int totalPages; // number of page results that were combined
     }
     
+    /**
+     * Line index: the position of a single line within the combined document text.
+     */
+    @Data
+    public static class LineIndex {
+        private int line; // line number in the combined text; the running counter starts at 1
+        private int charStart; // character offset at which the line begins
+        private int charEnd; // end offset; content lines use charStart + line length (newline excluded). NOTE(review): page-header lines are recorded with header length - 2, which differs by one; confirm the intended convention
+    }
+    
     /**
      * 页面索引
      */