|
@@ -116,7 +116,12 @@ public class WordStructuredExtractionService {
|
|
|
String cleanedText = text.trim();
|
|
String cleanedText = text.trim();
|
|
|
Map<String, Object> style = extractParagraphStyle(paragraph);
|
|
Map<String, Object> style = extractParagraphStyle(paragraph);
|
|
|
|
|
|
|
|
- if ("toc_item".equals(paragraphType) || isTocEntry(cleanedText)) {
|
|
|
|
|
|
|
+ // 检测目录项:通过域代码或段落格式
|
|
|
|
|
+ boolean isToc = "toc_item".equals(paragraphType) ||
|
|
|
|
|
+ isTocEntry(cleanedText) ||
|
|
|
|
|
+ isTocParagraph(paragraph, cleanedText);
|
|
|
|
|
+
|
|
|
|
|
+ if (isToc) {
|
|
|
String[] tocParts = cleanTocTextWithPage(cleanedText);
|
|
String[] tocParts = cleanTocTextWithPage(cleanedText);
|
|
|
cleanedText = tocParts[0];
|
|
cleanedText = tocParts[0];
|
|
|
if (!tocParts[1].isEmpty()) {
|
|
if (!tocParts[1].isEmpty()) {
|
|
@@ -127,8 +132,10 @@ public class WordStructuredExtractionService {
|
|
|
|
|
|
|
|
textElement.setContent(cleanedText);
|
|
textElement.setContent(cleanedText);
|
|
|
textElement.setStyle(style);
|
|
textElement.setStyle(style);
|
|
|
- // 逐 Run 提取格式(目录项也提取,但文本会被清理)
|
|
|
|
|
- textElement.setRuns(extractTextRunsWithClean(paragraph.getRuns(), "toc_item".equals(textElement.getType())));
|
|
|
|
|
|
|
+ // 逐 Run 提取格式(目录项不提取 runs,直接使用清理后的 content)
|
|
|
|
|
+ if (!"toc_item".equals(textElement.getType())) {
|
|
|
|
|
+ textElement.setRuns(extractTextRuns(paragraph.getRuns()));
|
|
|
|
|
+ }
|
|
|
|
|
|
|
|
elements.add(textElement);
|
|
elements.add(textElement);
|
|
|
fullText.append(cleanedText).append("\n");
|
|
fullText.append(cleanedText).append("\n");
|
|
@@ -341,6 +348,33 @@ public class WordStructuredExtractionService {
|
|
|
text.contains("_Toc");
|
|
text.contains("_Toc");
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
|
|
+ /**
|
|
|
|
|
+ * 检测段落是否为目录项(通过段落属性和内容格式)
|
|
|
|
|
+ * 用于处理域代码已被 Word 渲染为纯文本的情况
|
|
|
|
|
+ */
|
|
|
|
|
+ private boolean isTocParagraph(XWPFParagraph paragraph, String text) {
|
|
|
|
|
+ if (text == null || text.isEmpty()) return false;
|
|
|
|
|
+
|
|
|
|
|
+ // 检查段落样式是否为 TOC 相关
|
|
|
|
|
+ String styleName = paragraph.getStyle();
|
|
|
|
|
+ if (styleName != null) {
|
|
|
|
|
+ String lowerStyle = styleName.toLowerCase();
|
|
|
|
|
+ if (lowerStyle.contains("toc") || lowerStyle.contains("目录")) {
|
|
|
|
|
+ return true;
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ // 检查内容格式:章节号 + 标题 + 页码
|
|
|
|
|
+ // 如: "1 企业概述 1", "1.1 企业简介 1", "2.3.4 某某内容 15"
|
|
|
|
|
+ String trimmed = text.trim();
|
|
|
|
|
+ // 匹配: 数字[.数字]* 中文标题 数字
|
|
|
|
|
+ if (trimmed.matches("^\\d+(\\.\\d+)*\\s+[\\u4e00-\\u9fa5].+\\s+\\d+$")) {
|
|
|
|
|
+ return true;
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ return false;
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
/**
|
|
/**
|
|
|
* 清理目录文本,移除域代码,返回 [标题, 页码]
|
|
* 清理目录文本,移除域代码,返回 [标题, 页码]
|
|
|
*/
|
|
*/
|