1 month ago · ffd96053c3
--- a/backend/parse-service/src/main/java/com/lingyue/parse/service/WordTextExtractionService.java
+++ b/backend/parse-service/src/main/java/com/lingyue/parse/service/WordTextExtractionService.java
@@ -4,8 +4,8 @@ import com.lingyue.common.exception.ServiceException;
 
															 import lombok.extern.slf4j.Slf4j;
														
 
															 import org.apache.poi.hwpf.HWPFDocument;
														
 
															 import org.apache.poi.hwpf.extractor.WordExtractor;
														
 
															-import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
														
 
															-import org.apache.poi.xwpf.usermodel.XWPFDocument;
														
 
															+import org.apache.poi.xwpf.usermodel.*;
														
 
															+import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTR;
														
 
															 import org.springframework.stereotype.Service;
														
 
															 import java.io.File;
														
@@ -15,6 +15,7 @@ import java.io.IOException;
 
															 /**
														
 
															  * Word文档文本提取服务
														
 
															  * 支持.docx和.doc格式
														
 
															+ * 在分页符位置插入 \f 字符，便于后续识别页码
														
 
															  * 
														
 
															  * @author lingyue
														
 
															  * @since 2026-01-14
														
@@ -23,11 +24,16 @@ import java.io.IOException;
 
															 @Service
														
 
															 public class WordTextExtractionService {
														
 
															+    /**
														
 
															+     * 分页符字符
														
 
															+     */
														
 
															+    private static final char PAGE_BREAK = '\f';
														
 
															+    
														
 
															     /**
														
 
															      * 提取Word文档文本
														
 
															      * 
														
 
															      * @param wordFilePath Word文件路径
														
 
															-     * @return 提取的文本内容
														
 
															+     * @return 提取的文本内容（分页符位置插入 \f）
														
 
															      */
														
 
															     public String extractText(String wordFilePath) {
														
 
															         File wordFile = new File(wordFilePath);
														
@@ -39,7 +45,7 @@ public class WordTextExtractionService {
 
															         try {
														
 
															             if (fileName.endsWith(".docx")) {
														
 
															-                return extractFromDocx(wordFilePath);
														
 
															+                return extractFromDocxWithPageBreaks(wordFilePath);
														
 
															             } else if (fileName.endsWith(".doc")) {
														
 
															                 return extractFromDoc(wordFilePath);
														
 
															             } else {
														
@@ -52,23 +58,88 @@ public class WordTextExtractionService {
 
															     }
														
 
															     /**
															-     * 从.docx文件提取文本
															+     * Extracts the text of a .docx file, writing a form feed (\f) at every page break.
															+     *
															+     * <p>Top-level paragraphs and tables are visited in document order. A form feed is
															+     * written for each hard page break found in a run, and after each paragraph that
															+     * ends a section when the following section starts on a new page. Every paragraph
															+     * is terminated with a newline; each table cell is followed by a tab and each table
															+     * row by a newline.
															+     *
															+     * @param filePath path of the .docx file to read
															+     * @return the extracted text; empty when the document has no paragraphs or tables
															+     * @throws IOException if the file cannot be opened or read as a .docx document
															      */
															-    private String extractFromDocx(String filePath) throws IOException {
															-        log.info("提取.docx文件文本: {}", filePath);
															+    private String extractFromDocxWithPageBreaks(String filePath) throws IOException {
															+        log.info("提取.docx文件文本（含分页符）: {}", filePath);
															+
															+        StringBuilder sb = new StringBuilder();
															+        int pageBreakCount = 0;
															         try (FileInputStream fis = new FileInputStream(filePath);
															-             XWPFDocument document = new XWPFDocument(fis);
															-             XWPFWordExtractor extractor = new XWPFWordExtractor(document)) {
															+             XWPFDocument document = new XWPFDocument(fis)) {
															-            String text = extractor.getText();
															-            log.debug("提取到文本长度: {}", text != null ? text.length() : 0);
															-            return text != null ? text : "";
															+            // Indexed loop: a section-ending paragraph needs to look ahead from its position.
															+            var elements = document.getBodyElements();
															+            for (int i = 0; i < elements.size(); i++) {
															+                IBodyElement element = elements.get(i);
															+                if (element instanceof XWPFParagraph) {
															+                    XWPFParagraph paragraph = (XWPFParagraph) element;
															+
															+                    for (XWPFRun run : paragraph.getRuns()) {
															+                        CTR ctr = run.getCTR();
															+                        if (ctr == null) {
															+                            // Without the backing XML there are no breaks or text to read.
															+                            continue;
															+                        }
															+
															+                        // Hard page breaks: <w:br w:type="page"/>.
															+                        for (var br : ctr.getBrList()) {
															+                            if (br.getType() == org.openxmlformats.schemas.wordprocessingml.x2006.main.STBrType.PAGE) {
															+                                sb.append(PAGE_BREAK);
															+                                pageBreakCount++;
															+                            }
															+                        }
															+
															+                        // A run can hold several <w:t> elements; run.getText(0) would return
															+                        // only the first one and silently drop the rest.
															+                        for (var textNode : ctr.getTList()) {
															+                            String text = textNode.getStringValue();
															+                            if (text != null) {
															+                                sb.append(text);
															+                            }
															+                        }
															+                    }
															+
															+                    // A paragraph carrying <w:sectPr> is the last paragraph of its section.
															+                    // Whether a page break follows depends on how the NEXT section starts.
															+                    var ctp = paragraph.getCTP();
															+                    boolean endsSection = ctp != null
															+                            && ctp.getPPr() != null
															+                            && ctp.getPPr().getSectPr() != null;
															+                    if (endsSection && nextSectionStartsOnNewPage(document, i + 1)) {
															+                        sb.append(PAGE_BREAK);
															+                        pageBreakCount++;
															+                    }
															+
															+                    sb.append("\n");
															+
															+                } else if (element instanceof XWPFTable) {
															+                    XWPFTable table = (XWPFTable) element;
															+                    for (XWPFTableRow row : table.getRows()) {
															+                        for (XWPFTableCell cell : row.getTableCells()) {
															+                            sb.append(cell.getText()).append("\t");
															+                        }
															+                        sb.append("\n");
															+                    }
															+                }
															+            }
															         }
															+
															+        String result = sb.toString();
															+        log.info("提取完成: 文本长度={}, 分页符数量={}", result.length(), pageBreakCount);
															+        return result;
															     }
															+
															+    /**
															+     * Tells whether the section that begins at or after {@code fromIndex} starts on a new page.
															+     *
															+     * <p>In WordprocessingML the {@code w:type} of a {@code sectPr} describes how that
															+     * section is placed relative to the previous one, so the break after a section-ending
															+     * paragraph is typed by the following section's properties: the next paragraph-level
															+     * {@code sectPr} in the body, or the body-level {@code sectPr} for the last section.
															+     * An omitted type (or value) means {@code nextPage}.
															+     *
															+     * @param document  the document being extracted
															+     * @param fromIndex index into the body elements of the first element of the next section
															+     * @return {@code true} for nextPage, oddPage, evenPage or an unspecified type
															+     */
															+    private static boolean nextSectionStartsOnNewPage(XWPFDocument document, int fromIndex) {
															+        // Default to the body-level properties, which belong to the last section.
															+        var sectPr = document.getDocument().getBody().getSectPr();
															+        var elements = document.getBodyElements();
															+        for (int i = fromIndex; i < elements.size(); i++) {
															+            if (elements.get(i) instanceof XWPFParagraph) {
															+                var ctp = ((XWPFParagraph) elements.get(i)).getCTP();
															+                if (ctp != null && ctp.getPPr() != null && ctp.getPPr().getSectPr() != null) {
															+                    sectPr = ctp.getPPr().getSectPr();
															+                    break;
															+                }
															+            }
															+        }
															+
															+        if (sectPr == null || sectPr.getType() == null || sectPr.getType().getVal() == null) {
															+            return true;
															+        }
															+        String type = sectPr.getType().getVal().toString();
															+        return "nextPage".equals(type) || "oddPage".equals(type) || "evenPage".equals(type);
															+    }
														
 
															     /**
														
 
															      * 从.doc文件提取文本
														
 
															+     * .doc 格式的分页符通常会被 WordExtractor 保留为 \f
														
 
															      */
														
 
															     private String extractFromDoc(String filePath) throws IOException {
														
 
															         log.info("提取.doc文件文本: {}", filePath);
														
@@ -78,7 +149,14 @@ public class WordTextExtractionService {
 
															              WordExtractor extractor = new WordExtractor(document)) {
														
 
															             String text = extractor.getText();
														
 
															-            log.debug("提取到文本长度: {}", text != null ? text.length() : 0);
														
 
															+            int pageBreakCount = 0;
														
 
															+            if (text != null) {
														
 
															+                for (char c : text.toCharArray()) {
														
 
															+                    if (c == PAGE_BREAK) pageBreakCount++;
														
 
															+                }
														
 
															+            }
														
 
															+            log.info("提取完成: 文本长度={}, 分页符数量={}", 
														
 
															+                    text != null ? text.length() : 0, pageBreakCount);
														
 
															             return text != null ? text : "";
														
 
															         }
														
 
															     }