|
@@ -4,13 +4,14 @@ import com.lingyue.common.exception.ServiceException;
|
|
|
import lombok.extern.slf4j.Slf4j;
|
|
import lombok.extern.slf4j.Slf4j;
|
|
|
import org.apache.poi.hwpf.HWPFDocument;
|
|
import org.apache.poi.hwpf.HWPFDocument;
|
|
|
import org.apache.poi.hwpf.extractor.WordExtractor;
|
|
import org.apache.poi.hwpf.extractor.WordExtractor;
|
|
|
-import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
|
|
|
|
|
import org.apache.poi.xwpf.usermodel.XWPFDocument;
|
|
import org.apache.poi.xwpf.usermodel.XWPFDocument;
|
|
|
|
|
+import org.apache.poi.xwpf.usermodel.XWPFParagraph;
|
|
|
import org.springframework.stereotype.Service;
|
|
import org.springframework.stereotype.Service;
|
|
|
|
|
|
|
|
import java.io.File;
|
|
import java.io.File;
|
|
|
import java.io.FileInputStream;
|
|
import java.io.FileInputStream;
|
|
|
import java.io.IOException;
|
|
import java.io.IOException;
|
|
|
|
|
+import java.util.regex.Pattern;
|
|
|
|
|
|
|
|
/**
|
|
/**
|
|
|
* Word文档文本提取服务
|
|
* Word文档文本提取服务
|
|
@@ -23,6 +24,14 @@ import java.io.IOException;
|
|
|
@Service
|
|
@Service
|
|
|
public class WordTextExtractionService {
|
|
public class WordTextExtractionService {
|
|
|
|
|
|
|
|
|
|
+ // 匹配 Word 批注格式: "Comment by xxx: ..." 或 "批注 [xxx]: ..."
|
|
|
|
|
+ private static final Pattern COMMENT_PATTERN = Pattern.compile(
|
|
|
|
|
+ "(?m)^\\s*Comment by [^:]+:.*$|" + // English format
|
|
|
|
|
+ "(?m)^\\s*批注\\s*\\[[^\\]]+\\]:.*$|" + // Chinese format
|
|
|
|
|
+ "\\[Comment by [^\\]]+\\]|" + // Inline comment marker
|
|
|
|
|
+ "\\[批注[^\\]]*\\]" // Inline Chinese comment
|
|
|
|
|
+ );
|
|
|
|
|
+
|
|
|
/**
|
|
/**
|
|
|
* 提取Word文档文本
|
|
* 提取Word文档文本
|
|
|
*
|
|
*
|
|
@@ -52,18 +61,34 @@ public class WordTextExtractionService {
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
/**
|
|
|
- * 从.docx文件提取文本
|
|
|
|
|
|
|
+ * 从.docx文件提取文本(不包含批注)
|
|
|
|
|
+ *
|
|
|
|
|
+ * 注意:XWPFWordExtractor.getText() 会包含批注内容,
|
|
|
|
|
+ * 所以我们直接遍历段落提取文本,跳过批注。
|
|
|
*/
|
|
*/
|
|
|
private String extractFromDocx(String filePath) throws IOException {
|
|
private String extractFromDocx(String filePath) throws IOException {
|
|
|
log.info("提取.docx文件文本: {}", filePath);
|
|
log.info("提取.docx文件文本: {}", filePath);
|
|
|
|
|
|
|
|
try (FileInputStream fis = new FileInputStream(filePath);
|
|
try (FileInputStream fis = new FileInputStream(filePath);
|
|
|
- XWPFDocument document = new XWPFDocument(fis);
|
|
|
|
|
- XWPFWordExtractor extractor = new XWPFWordExtractor(document)) {
|
|
|
|
|
|
|
+ XWPFDocument document = new XWPFDocument(fis)) {
|
|
|
|
|
|
|
|
- String text = extractor.getText();
|
|
|
|
|
- log.debug("提取到文本长度: {}", text != null ? text.length() : 0);
|
|
|
|
|
- return text != null ? text : "";
|
|
|
|
|
|
|
+ StringBuilder textBuilder = new StringBuilder();
|
|
|
|
|
+
|
|
|
|
|
+ // 遍历所有段落,只提取正文文本(不包含批注)
|
|
|
|
|
+ for (XWPFParagraph paragraph : document.getParagraphs()) {
|
|
|
|
|
+ String paragraphText = paragraph.getText();
|
|
|
|
|
+ if (paragraphText != null && !paragraphText.isEmpty()) {
|
|
|
|
|
+ textBuilder.append(paragraphText).append("\n");
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ String text = textBuilder.toString();
|
|
|
|
|
+
|
|
|
|
|
+ // 过滤掉可能残留的批注文本
|
|
|
|
|
+ text = removeCommentText(text);
|
|
|
|
|
+
|
|
|
|
|
+ log.debug("提取到文本长度: {}", text.length());
|
|
|
|
|
+ return text;
|
|
|
}
|
|
}
|
|
|
}
|
|
}
|
|
|
|
|
|
|
@@ -78,8 +103,34 @@ public class WordTextExtractionService {
|
|
|
WordExtractor extractor = new WordExtractor(document)) {
|
|
WordExtractor extractor = new WordExtractor(document)) {
|
|
|
|
|
|
|
|
String text = extractor.getText();
|
|
String text = extractor.getText();
|
|
|
|
|
+
|
|
|
|
|
+ // 过滤批注
|
|
|
|
|
+ if (text != null) {
|
|
|
|
|
+ text = removeCommentText(text);
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
log.debug("提取到文本长度: {}", text != null ? text.length() : 0);
|
|
log.debug("提取到文本长度: {}", text != null ? text.length() : 0);
|
|
|
return text != null ? text : "";
|
|
return text != null ? text : "";
|
|
|
}
|
|
}
|
|
|
}
|
|
}
|
|
|
|
|
+
|
|
|
|
|
+ /**
|
|
|
|
|
+ * 移除文本中的批注内容
|
|
|
|
|
+ */
|
|
|
|
|
+ private String removeCommentText(String text) {
|
|
|
|
|
+ if (text == null || text.isEmpty()) {
|
|
|
|
|
+ return text;
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ String cleaned = COMMENT_PATTERN.matcher(text).replaceAll("");
|
|
|
|
|
+
|
|
|
|
|
+ // 清理多余的空行
|
|
|
|
|
+ cleaned = cleaned.replaceAll("\\n{3,}", "\n\n");
|
|
|
|
|
+
|
|
|
|
|
+ if (cleaned.length() < text.length()) {
|
|
|
|
|
+ log.debug("移除批注文本: 原长度={}, 新长度={}", text.length(), cleaned.length());
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ return cleaned.trim();
|
|
|
|
|
+ }
|
|
|
}
|
|
}
|