|
@@ -4,8 +4,8 @@ import com.lingyue.common.exception.ServiceException;
|
|
|
import lombok.extern.slf4j.Slf4j;
|
|
import lombok.extern.slf4j.Slf4j;
|
|
|
import org.apache.poi.hwpf.HWPFDocument;
|
|
import org.apache.poi.hwpf.HWPFDocument;
|
|
|
import org.apache.poi.hwpf.extractor.WordExtractor;
|
|
import org.apache.poi.hwpf.extractor.WordExtractor;
|
|
|
-import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
|
|
|
|
|
-import org.apache.poi.xwpf.usermodel.XWPFDocument;
|
|
|
|
|
|
|
+import org.apache.poi.xwpf.usermodel.*;
|
|
|
|
|
+import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTR;
|
|
|
import org.springframework.stereotype.Service;
|
|
import org.springframework.stereotype.Service;
|
|
|
|
|
|
|
|
import java.io.File;
|
|
import java.io.File;
|
|
@@ -15,6 +15,7 @@ import java.io.IOException;
|
|
|
/**
|
|
/**
|
|
|
* Word文档文本提取服务
|
|
* Word文档文本提取服务
|
|
|
* 支持.docx和.doc格式
|
|
* 支持.docx和.doc格式
|
|
|
|
|
+ * 在分页符位置插入 \f 字符,便于后续识别页码
|
|
|
*
|
|
*
|
|
|
* @author lingyue
|
|
* @author lingyue
|
|
|
* @since 2026-01-14
|
|
* @since 2026-01-14
|
|
@@ -23,11 +24,16 @@ import java.io.IOException;
|
|
|
@Service
|
|
@Service
|
|
|
public class WordTextExtractionService {
|
|
public class WordTextExtractionService {
|
|
|
|
|
|
|
|
|
|
+ /**
|
|
|
|
|
+ * 分页符字符
|
|
|
|
|
+ */
|
|
|
|
|
+ private static final char PAGE_BREAK = '\f';
|
|
|
|
|
+
|
|
|
/**
|
|
/**
|
|
|
* 提取Word文档文本
|
|
* 提取Word文档文本
|
|
|
*
|
|
*
|
|
|
* @param wordFilePath Word文件路径
|
|
* @param wordFilePath Word文件路径
|
|
|
- * @return 提取的文本内容
|
|
|
|
|
|
|
+ * @return 提取的文本内容(分页符位置插入 \f)
|
|
|
*/
|
|
*/
|
|
|
public String extractText(String wordFilePath) {
|
|
public String extractText(String wordFilePath) {
|
|
|
File wordFile = new File(wordFilePath);
|
|
File wordFile = new File(wordFilePath);
|
|
@@ -39,7 +45,7 @@ public class WordTextExtractionService {
|
|
|
|
|
|
|
|
try {
|
|
try {
|
|
|
if (fileName.endsWith(".docx")) {
|
|
if (fileName.endsWith(".docx")) {
|
|
|
- return extractFromDocx(wordFilePath);
|
|
|
|
|
|
|
+ return extractFromDocxWithPageBreaks(wordFilePath);
|
|
|
} else if (fileName.endsWith(".doc")) {
|
|
} else if (fileName.endsWith(".doc")) {
|
|
|
return extractFromDoc(wordFilePath);
|
|
return extractFromDoc(wordFilePath);
|
|
|
} else {
|
|
} else {
|
|
@@ -52,23 +58,88 @@ public class WordTextExtractionService {
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
/**
|
|
|
- * 从.docx文件提取文本
|
|
|
|
|
|
|
+ * 从.docx文件提取文本,保留分页符
|
|
|
*/
|
|
*/
|
|
|
- private String extractFromDocx(String filePath) throws IOException {
|
|
|
|
|
- log.info("提取.docx文件文本: {}", filePath);
|
|
|
|
|
|
|
+ private String extractFromDocxWithPageBreaks(String filePath) throws IOException {
|
|
|
|
|
+ log.info("提取.docx文件文本(含分页符): {}", filePath);
|
|
|
|
|
+
|
|
|
|
|
+ StringBuilder sb = new StringBuilder();
|
|
|
|
|
+ int pageBreakCount = 0;
|
|
|
|
|
|
|
|
try (FileInputStream fis = new FileInputStream(filePath);
|
|
try (FileInputStream fis = new FileInputStream(filePath);
|
|
|
- XWPFDocument document = new XWPFDocument(fis);
|
|
|
|
|
- XWPFWordExtractor extractor = new XWPFWordExtractor(document)) {
|
|
|
|
|
|
|
+ XWPFDocument document = new XWPFDocument(fis)) {
|
|
|
|
|
|
|
|
- String text = extractor.getText();
|
|
|
|
|
- log.debug("提取到文本长度: {}", text != null ? text.length() : 0);
|
|
|
|
|
- return text != null ? text : "";
|
|
|
|
|
|
|
+ // 遍历文档主体的所有元素
|
|
|
|
|
+ for (IBodyElement element : document.getBodyElements()) {
|
|
|
|
|
+ if (element instanceof XWPFParagraph) {
|
|
|
|
|
+ XWPFParagraph paragraph = (XWPFParagraph) element;
|
|
|
|
|
+
|
|
|
|
|
+ // 检查段落中的分页符
|
|
|
|
|
+ for (XWPFRun run : paragraph.getRuns()) {
|
|
|
|
|
+ // 检查 run 中是否有分页符
|
|
|
|
|
+ CTR ctr = run.getCTR();
|
|
|
|
|
+ if (ctr != null) {
|
|
|
|
|
+ // 检查是否有硬分页符 (page break)
|
|
|
|
|
+ if (ctr.getBrList() != null) {
|
|
|
|
|
+ for (org.openxmlformats.schemas.wordprocessingml.x2006.main.CTBr br : ctr.getBrList()) {
|
|
|
|
|
+ if (br.getType() == org.openxmlformats.schemas.wordprocessingml.x2006.main.STBrType.PAGE) {
|
|
|
|
|
+ sb.append(PAGE_BREAK);
|
|
|
|
|
+ pageBreakCount++;
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ // 添加文本内容
|
|
|
|
|
+ String text = run.getText(0);
|
|
|
|
|
+ if (text != null) {
|
|
|
|
|
+ sb.append(text);
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ // 检查段落后是否有分页符(通过段落属性)
|
|
|
|
|
+ if (paragraph.getCTP() != null && paragraph.getCTP().getPPr() != null) {
|
|
|
|
|
+ var pPr = paragraph.getCTP().getPPr();
|
|
|
|
|
+ // 检查分节符带来的分页
|
|
|
|
|
+ if (pPr.getSectPr() != null) {
|
|
|
|
|
+ var sectPr = pPr.getSectPr();
|
|
|
|
|
+ if (sectPr.getType() != null) {
|
|
|
|
|
+ String type = sectPr.getType().getVal().toString();
|
|
|
|
|
+ if ("nextPage".equals(type) || "oddPage".equals(type) || "evenPage".equals(type)) {
|
|
|
|
|
+ sb.append(PAGE_BREAK);
|
|
|
|
|
+ pageBreakCount++;
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ sb.append("\n");
|
|
|
|
|
+
|
|
|
|
|
+ } else if (element instanceof XWPFTable) {
|
|
|
|
|
+ XWPFTable table = (XWPFTable) element;
|
|
|
|
|
+ for (XWPFTableRow row : table.getRows()) {
|
|
|
|
|
+ for (XWPFTableCell cell : row.getTableCells()) {
|
|
|
|
|
+ sb.append(cell.getText()).append("\t");
|
|
|
|
|
+ }
|
|
|
|
|
+ sb.append("\n");
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ // 检查文档末尾的分节符
|
|
|
|
|
+ if (document.getDocument().getBody().getSectPr() != null) {
|
|
|
|
|
+ // 文档末尾的分节符不需要额外处理
|
|
|
|
|
+ }
|
|
|
}
|
|
}
|
|
|
|
|
+
|
|
|
|
|
+ String result = sb.toString();
|
|
|
|
|
+ log.info("提取完成: 文本长度={}, 分页符数量={}", result.length(), pageBreakCount);
|
|
|
|
|
+ return result;
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
/**
|
|
|
* 从.doc文件提取文本
|
|
* 从.doc文件提取文本
|
|
|
|
|
+ * .doc 格式的分页符通常会被 WordExtractor 保留为 \f
|
|
|
*/
|
|
*/
|
|
|
private String extractFromDoc(String filePath) throws IOException {
|
|
private String extractFromDoc(String filePath) throws IOException {
|
|
|
log.info("提取.doc文件文本: {}", filePath);
|
|
log.info("提取.doc文件文本: {}", filePath);
|
|
@@ -78,7 +149,14 @@ public class WordTextExtractionService {
|
|
|
WordExtractor extractor = new WordExtractor(document)) {
|
|
WordExtractor extractor = new WordExtractor(document)) {
|
|
|
|
|
|
|
|
String text = extractor.getText();
|
|
String text = extractor.getText();
|
|
|
- log.debug("提取到文本长度: {}", text != null ? text.length() : 0);
|
|
|
|
|
|
|
+ int pageBreakCount = 0;
|
|
|
|
|
+ if (text != null) {
|
|
|
|
|
+ for (char c : text.toCharArray()) {
|
|
|
|
|
+ if (c == PAGE_BREAK) pageBreakCount++;
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
|
|
+ log.info("提取完成: 文本长度={}, 分页符数量={}",
|
|
|
|
|
+ text != null ? text.length() : 0, pageBreakCount);
|
|
|
return text != null ? text : "";
|
|
return text != null ? text : "";
|
|
|
}
|
|
}
|
|
|
}
|
|
}
|