|
|
@@ -333,9 +333,11 @@ public class WordStructuredExtractionService {
|
|
|
private boolean isTocEntry(String text) {
|
|
|
if (text == null) return false;
|
|
|
// Field-code markers characteristic of a table of contents (TOC)
|
|
|
+ // Detect TOC markers (in a Java string literal, \\ denotes a single backslash)
|
|
|
return text.contains("HYPERLINK") ||
|
|
|
text.contains("PAGEREF") ||
|
|
|
- text.contains("TOC \\o") ||
|
|
|
+ text.contains("TOC \\") || // TOC \o, TOC \h, etc.
|
|
|
+ text.contains("TOC\\") || // variant with no space before the backslash
|
|
|
text.contains("_Toc");
|
|
|
}
|
|
|
|
|
|
@@ -347,29 +349,29 @@ public class WordStructuredExtractionService {
|
|
|
|
|
|
String result = text;
|
|
|
|
|
|
- // 移除 TOC 域代码头
|
|
|
- result = result.replaceAll("TOC\\s+\\\\[ohzu]\\s+\"[^\"]*\"\\s*", "");
|
|
|
- result = result.replaceAll("TOC\\s+\\\\[ohzu]\\s*", "");
|
|
|
+ // 移除 TOC 域代码头(支持多种格式)
|
|
|
+ // 格式: TOC \o "1-2" \h \z \u
|
|
|
+ result = result.replaceAll("TOC\\s*(\\\\[ohzu]\\s*(\"[^\"]*\")?\\s*)+", "");
|
|
|
|
|
|
// 移除 HYPERLINK 域代码
|
|
|
// 格式: HYPERLINK \l "_Toc176869144" 实际文本
|
|
|
result = result.replaceAll("HYPERLINK\\s+\\\\l\\s+\"[^\"]*\"\\s*", "");
|
|
|
result = result.replaceAll("HYPERLINK\\s+\"[^\"]*\"\\s*", "");
|
|
|
+ result = result.replaceAll("HYPERLINK\\s*", "");
|
|
|
|
|
|
// 提取页码(在 PAGEREF 后面)
|
|
|
String pageNum = "";
|
|
|
- java.util.regex.Pattern pagePattern = java.util.regex.Pattern.compile("PAGEREF\\s+[^\\s]+\\s+\\\\h\\s*(\\d+)");
|
|
|
+ java.util.regex.Pattern pagePattern = java.util.regex.Pattern.compile("PAGEREF\\s+\\S+\\s*(\\\\h)?\\s*(\\d+)");
|
|
|
java.util.regex.Matcher pageMatcher = pagePattern.matcher(result);
|
|
|
if (pageMatcher.find()) {
|
|
|
- pageNum = pageMatcher.group(1);
|
|
|
+ pageNum = pageMatcher.group(2);
|
|
|
}
|
|
|
|
|
|
// 移除 PAGEREF 域代码
|
|
|
// 格式: PAGEREF _Toc176869144 \h 1
|
|
|
- result = result.replaceAll("PAGEREF\\s+[^\\s]+\\s+\\\\h\\s*\\d*", "");
|
|
|
- result = result.replaceAll("PAGEREF\\s+[^\\s]+\\s*", "");
|
|
|
+ result = result.replaceAll("PAGEREF\\s+\\S+\\s*(\\\\h)?\\s*\\d*", "");
|
|
|
|
|
|
- // 移除其他常见域代码标记
|
|
|
+ // 移除其他常见域代码标记(如 \l \o \h \z \u)
|
|
|
result = result.replaceAll("\\\\[lohzu]\\s*", "");
|
|
|
|
|
|
// 清理多余空格
|