Эх сурвалжийг харах

fix: 支持 Word 文档分页符识别

- DocumentIndexService 检测分页符 (\f) 并生成多页索引
- Word 文档现在可以正确识别页码
何文松 1 сар өмнө
parent
commit
45a42771cc

+ 119 - 1
backend/parse-service/src/main/java/com/lingyue/parse/service/DocumentIndexService.java

@@ -27,7 +27,13 @@ public class DocumentIndexService {
     private final ObjectMapper objectMapper;
     
     /**
-     * 为纯文本生成行索引(Word/Excel等无分页文档)
+     * 分页符字符(Form Feed)
+     */
+    private static final char FORM_FEED = '\f';
+    
+    /**
+     * 为纯文本生成行索引(Word/Excel等文档)
+     * 如果文本中包含分页符(\f),则根据分页符生成页面索引
      * 
      * @param text 文本内容
      * @param documentId 文档ID
@@ -39,6 +45,118 @@ public class DocumentIndexService {
             return createEmptyIndex(documentId);
         }
         
+        // 检查是否包含分页符
+        boolean hasPageBreaks = text.indexOf(FORM_FEED) >= 0;
+        
+        if (hasPageBreaks) {
+            log.info("检测到分页符,生成分页索引: documentId={}", documentId);
+            return generateIndexWithPageBreaks(text, documentId, indexOutputPath);
+        } else {
+            log.debug("无分页符,生成单页索引: documentId={}", documentId);
+            return generateSinglePageIndex(text, documentId, indexOutputPath);
+        }
+    }
+    
+    /**
+     * Builds a multi-page index for text that contains form-feed page breaks.
+     *
+     * <p>The text is split into physical lines on {@code \n}. Every form feed found in a
+     * physical line closes both the current indexed line and the current page, so a single
+     * physical line may contribute several indexed lines and several pages. All offsets are
+     * character positions in {@code text}; end offsets are exclusive and never include the
+     * form feed or the newline itself.
+     *
+     * <p>Line numbering rules:
+     * <ul>
+     *   <li>a segment that ends at a form feed is always indexed, even when it is empty, so
+     *       every line number referenced by a page has a matching line entry;</li>
+     *   <li>the remainder after the last form feed of a physical line is indexed only when
+     *       it is non-empty;</li>
+     *   <li>a physical line without any form feed is always indexed, even when empty.</li>
+     * </ul>
+     *
+     * <p>No trailing page is emitted when the text ends directly with a form feed.
+     *
+     * @param text            text to index; the visible caller only passes text that
+     *                        contains at least one form feed
+     * @param documentId      identifier stored in the resulting index and used in log output
+     * @param indexOutputPath where the index is saved via {@code saveIndexFile}; when
+     *                        {@code null} nothing is saved
+     * @return the populated document index
+     */
+    private DocumentIndex generateIndexWithPageBreaks(String text, String documentId, String indexOutputPath) {
+        List<PageIndex> pageIndices = new ArrayList<>();
+        List<LineIndex> lineIndices = new ArrayList<>();
+
+        int charPos = 0;        // absolute offset of the current physical line
+        int lineNum = 1;        // number the next indexed line will receive
+        int pageNum = 1;
+        int pageCharStart = 0;
+        int pageLineStart = 1;
+
+        // limit -1 keeps trailing empty strings so offsets stay aligned with the text
+        for (String line : text.split("\n", -1)) {
+            int segmentStart = 0; // offset inside this physical line of the pending segment
+            int ffIndex = line.indexOf(FORM_FEED);
+
+            // Handle every page break in the line, not just the first one.
+            while (ffIndex >= 0) {
+                int breakPos = charPos + ffIndex;
+
+                // The segment in front of the page break becomes its own indexed line.
+                LineIndex lineIndex = new LineIndex();
+                lineIndex.setLine(lineNum);
+                lineIndex.setCharStart(charPos + segmentStart);
+                lineIndex.setCharEnd(breakPos);
+                lineIndices.add(lineIndex);
+
+                // Close the current page at the page break.
+                PageIndex pageIndex = new PageIndex();
+                pageIndex.setPage(pageNum);
+                pageIndex.setCharStart(pageCharStart);
+                pageIndex.setCharEnd(breakPos);
+                pageIndex.setLineStart(pageLineStart);
+                pageIndex.setLineEnd(lineNum);
+                pageIndices.add(pageIndex);
+
+                // Open the next page right after the form feed.
+                pageNum++;
+                pageCharStart = breakPos + 1;
+                lineNum++;
+                pageLineStart = lineNum;
+
+                segmentStart = ffIndex + 1;
+                ffIndex = line.indexOf(FORM_FEED, segmentStart);
+            }
+
+            // A line without page breaks is always indexed; after a page break only a
+            // non-empty remainder counts as a line.
+            if (segmentStart == 0 || segmentStart < line.length()) {
+                LineIndex lineIndex = new LineIndex();
+                lineIndex.setLine(lineNum);
+                lineIndex.setCharStart(charPos + segmentStart);
+                lineIndex.setCharEnd(charPos + line.length());
+                lineIndices.add(lineIndex);
+                lineNum++;
+            }
+
+            charPos += line.length() + 1; // +1 for the '\n' removed by split
+        }
+
+        // Emit the final page unless the text ended exactly on a page break.
+        if (pageCharStart < text.length()) {
+            PageIndex lastPage = new PageIndex();
+            lastPage.setPage(pageNum);
+            lastPage.setCharStart(pageCharStart);
+            lastPage.setCharEnd(text.length());
+            lastPage.setLineStart(pageLineStart);
+            lastPage.setLineEnd(lineNum - 1);
+            pageIndices.add(lastPage);
+        }
+
+        // Assemble the document index.
+        DocumentIndex index = new DocumentIndex();
+        index.setDocumentId(documentId);
+        index.setTotalChars(text.length());
+        index.setTotalLines(lineNum - 1);
+        index.setTotalPages(pageIndices.size());
+        index.setPages(pageIndices);
+        index.setLines(lineIndices);
+
+        log.info("分页索引生成完成: documentId={}, pages={}, lines={}", 
+                documentId, pageIndices.size(), lineIndices.size());
+
+        // Save the index file when a target path was supplied.
+        if (indexOutputPath != null) {
+            saveIndexFile(index, indexOutputPath);
+        }
+
+        return index;
+    }
+    
+    /**
+     * 生成单页索引(无分页符的文档)
+     */
+    private DocumentIndex generateSinglePageIndex(String text, String documentId, String indexOutputPath) {
         List<LineIndex> lineIndices = new ArrayList<>();
         int charPos = 0;
         int lineNum = 1;