1 ماه پیش · ffd96053c3
--- a/backend/parse-service/src/main/java/com/lingyue/parse/service/WordTextExtractionService.java
+++ b/backend/parse-service/src/main/java/com/lingyue/parse/service/WordTextExtractionService.java
@@ -4,8 +4,8 @@ import com.lingyue.common.exception.ServiceException;
 
				 import lombok.extern.slf4j.Slf4j;
			
 
				 import org.apache.poi.hwpf.HWPFDocument;
			
 
				 import org.apache.poi.hwpf.extractor.WordExtractor;
			
 
				-import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
			
 
				-import org.apache.poi.xwpf.usermodel.XWPFDocument;
			
 
				+import org.apache.poi.xwpf.usermodel.*;
			
 
				+import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTR;
			
 
				 import org.springframework.stereotype.Service;
			
 
				 
			
 
				 import java.io.File;
			
@@ -15,6 +15,7 @@ import java.io.IOException;
 
				 /**
			
 
				  * Word文档文本提取服务
			
 
				  * 支持.docx和.doc格式
			
 
				+ * 在分页符位置插入 \f 字符，便于后续识别页码
			
 
				  * 
			
 
				  * @author lingyue
			
 
				  * @since 2026-01-14
			
@@ -23,11 +24,16 @@ import java.io.IOException;
 
				 @Service
			
 
				 public class WordTextExtractionService {
			
 
				     
			
 
				+    /**
			
 
				+     * Page-break marker character (form feed)
			
 
				+     */
			
 
				+    private static final char PAGE_BREAK = '\f';
			
 
				+    
			
 
				     /**
			
 
				      * 提取Word文档文本
			
 
				      * 
			
 
				      * @param wordFilePath Word文件路径
			
 
				-     * @return 提取的文本内容
			
 
				+     * @return 提取的文本内容（分页符位置插入 \f）
			
 
				      */
			
 
				     public String extractText(String wordFilePath) {
			
 
				         File wordFile = new File(wordFilePath);
			
@@ -39,7 +45,7 @@ public class WordTextExtractionService {
 
				         
			
 
				         try {
			
 
				             if (fileName.endsWith(".docx")) {
			
 
				-                return extractFromDocx(wordFilePath);
			
 
				+                return extractFromDocxWithPageBreaks(wordFilePath);
			
 
				             } else if (fileName.endsWith(".doc")) {
			
 
				                 return extractFromDoc(wordFilePath);
			
 
				             } else {
			
@@ -52,23 +58,88 @@ public class WordTextExtractionService {
 
				     }
			
 
				     
			
 
				     /**
			
 
				-     * Extracts text from a .docx file
			
 
				+     * Extracts text from a .docx file, writing the PAGE_BREAK character wherever a page break is detected
			
 
				      */
			
 
				-    private String extractFromDocx(String filePath) throws IOException {
			
 
				-        log.info("提取.docx文件文本: {}", filePath);
			
 
				+    private String extractFromDocxWithPageBreaks(String filePath) throws IOException {
			
 
				+        log.info("提取.docx文件文本（含分页符）: {}", filePath);
			
 
				+        
			
 
				+        StringBuilder sb = new StringBuilder();
			
 
				+        int pageBreakCount = 0;
			
 
				         
			
 
				         try (FileInputStream fis = new FileInputStream(filePath);
			
 
				-             XWPFDocument document = new XWPFDocument(fis);
			
 
				-             XWPFWordExtractor extractor = new XWPFWordExtractor(document)) {
			
 
				+             XWPFDocument document = new XWPFDocument(fis)) {
			
 
				             
			
 
				-            String text = extractor.getText();
			
 
				-            log.debug("提取到文本长度: {}", text != null ? text.length() : 0);
			
 
				-            return text != null ? text : "";
			
 
				+            // Walk every top-level body element of the document, in the order getBodyElements() returns them
			
 
				+            for (IBodyElement element : document.getBodyElements()) {
			
 
				+                if (element instanceof XWPFParagraph) {
			
 
				+                    XWPFParagraph paragraph = (XWPFParagraph) element;
			
 
				+                    
			
 
				+                    // Scan the paragraph's runs for page breaks and text
			
 
				+                    for (XWPFRun run : paragraph.getRuns()) {
			
 
				+                        // Check whether this run carries a page break; markers are appended before the run's text, wherever the break sits inside the run
			
 
				+                        CTR ctr = run.getCTR();
			
 
				+                        if (ctr != null) {
			
 
				+                            // Look for hard page breaks (br elements whose type is PAGE); one marker is written per such element
			
 
				+                            if (ctr.getBrList() != null) {
			
 
				+                                for (org.openxmlformats.schemas.wordprocessingml.x2006.main.CTBr br : ctr.getBrList()) {
			
 
				+                                    if (br.getType() == org.openxmlformats.schemas.wordprocessingml.x2006.main.STBrType.PAGE) {
			
 
				+                                        sb.append(PAGE_BREAK);
			
 
				+                                        pageBreakCount++;
			
 
				+                                    }
			
 
				+                                }
			
 
				+                            }
			
 
				+                        }
			
 
				+                        
			
 
				+                        // Append the run's text. NOTE(review): getText(0) presumably returns only the run's first text element; confirm that later text elements, tabs and line breaks in the same run are not lost
			
 
				+                        String text = run.getText(0);
			
 
				+                        if (text != null) {
			
 
				+                            sb.append(text);
			
 
				+                        }
			
 
				+                    }
			
 
				+                    
			
 
				+                    // Check for a page break after the paragraph (via the paragraph properties)
			
 
				+                    if (paragraph.getCTP() != null && paragraph.getCTP().getPPr() != null) {
			
 
				+                        var pPr = paragraph.getCTP().getPPr();
			
 
				+                        // Check for pagination caused by a section break. NOTE(review): a sectPr with no explicit type is skipped below, yet the OOXML default section type is believed to be nextPage; verify
			
 
				+                        if (pPr.getSectPr() != null) {
			
 
				+                            var sectPr = pPr.getSectPr();
			
 
				+                            if (sectPr.getType() != null) {
			
 
				+                                String type = sectPr.getType().getVal().toString();
			
 
				+                                if ("nextPage".equals(type) || "oddPage".equals(type) || "evenPage".equals(type)) {
			
 
				+                                    sb.append(PAGE_BREAK);
			
 
				+                                    pageBreakCount++;
			
 
				+                                }
			
 
				+                            }
			
 
				+                        }
			
 
				+                    }
			
 
				+                    
			
 
				+                    sb.append("\n");
			
 
				+                    
			
 
				+                } else if (element instanceof XWPFTable) { // tables: each cell's text followed by a tab, each row by a newline; no page-break scan in this branch
			
 
				+                    XWPFTable table = (XWPFTable) element;
			
 
				+                    for (XWPFTableRow row : table.getRows()) {
			
 
				+                        for (XWPFTableCell cell : row.getTableCells()) {
			
 
				+                            sb.append(cell.getText()).append("\t");
			
 
				+                        }
			
 
				+                        sb.append("\n");
			
 
				+                    }
			
 
				+                }
			
 
				+            }
			
 
				+            
			
 
				+            // Check the section properties at the end of the document body
			
 
				+            if (document.getDocument().getBody().getSectPr() != null) {
			
 
				+                // Intentionally empty: the trailing section break needs no extra handling
			
 
				+            }
			
 
				         }
			
 
				+        
			
 
				+        String result = sb.toString();
			
 
				+        log.info("提取完成: 文本长度={}, 分页符数量={}", result.length(), pageBreakCount);
			
 
				+        return result;
			
 
				     }
			
 
				     
			
 
				     /**
			
 
				      * 从.doc文件提取文本
			
 
				+     * .doc 格式的分页符通常会被 WordExtractor 保留为 \f
			
 
				      */
			
 
				     private String extractFromDoc(String filePath) throws IOException {
			
 
				         log.info("提取.doc文件文本: {}", filePath);
			
@@ -78,7 +149,14 @@ public class WordTextExtractionService {
 
				              WordExtractor extractor = new WordExtractor(document)) {
			
 
				             
			
 
				             String text = extractor.getText();
			
 
				-            log.debug("提取到文本长度: {}", text != null ? text.length() : 0);
			
 
				+            int pageBreakCount = 0;
			
 
				+            if (text != null) {
			
 
				+                for (char c : text.toCharArray()) {
			
 
				+                    if (c == PAGE_BREAK) pageBreakCount++;
			
 
				+                }
			
 
				+            }
			
 
				+            log.info("提取完成: 文本长度={}, 分页符数量={}", 
			
 
				+                    text != null ? text.length() : 0, pageBreakCount);
			
 
				             return text != null ? text : "";
			
 
				         }
			
 
				     }