|
|
@@ -112,15 +112,21 @@ public class WordStructuredExtractionService {
|
|
|
String paragraphType = detectParagraphType(paragraph, text);
|
|
|
textElement.setType(paragraphType);
|
|
|
|
|
|
- // 如果是目录项,清理域代码
|
|
|
+ // 如果是目录项,清理域代码并提取页码
|
|
|
String cleanedText = text.trim();
|
|
|
+ Map<String, Object> style = extractParagraphStyle(paragraph);
|
|
|
+
|
|
|
if ("toc_item".equals(paragraphType) || isTocEntry(cleanedText)) {
|
|
|
- cleanedText = cleanTocText(cleanedText);
|
|
|
+ String[] tocParts = cleanTocTextWithPage(cleanedText);
|
|
|
+ cleanedText = tocParts[0];
|
|
|
+ if (!tocParts[1].isEmpty()) {
|
|
|
+ style.put("tocPageNum", tocParts[1]);
|
|
|
+ }
|
|
|
textElement.setType("toc_item");
|
|
|
}
|
|
|
|
|
|
textElement.setContent(cleanedText);
|
|
|
- textElement.setStyle(extractParagraphStyle(paragraph));
|
|
|
+ textElement.setStyle(style);
|
|
|
// 逐 Run 提取格式(目录项也提取,但文本会被清理)
|
|
|
textElement.setRuns(extractTextRunsWithClean(paragraph.getRuns(), "toc_item".equals(textElement.getType())));
|
|
|
|
|
|
@@ -334,10 +340,10 @@ public class WordStructuredExtractionService {
|
|
|
}
|
|
|
|
|
|
/**
|
|
|
- * 清理目录文本,移除域代码
|
|
|
+ * 清理目录文本,移除域代码,返回 [标题, 页码]
|
|
|
*/
|
|
|
- private String cleanTocText(String text) {
|
|
|
- if (text == null) return "";
|
|
|
+ private String[] cleanTocTextWithPage(String text) {
|
|
|
+ if (text == null) return new String[]{"", ""};
|
|
|
|
|
|
String result = text;
|
|
|
|
|
|
@@ -350,9 +356,17 @@ public class WordStructuredExtractionService {
|
|
|
result = result.replaceAll("HYPERLINK\\s+\\\\l\\s+\"[^\"]*\"\\s*", "");
|
|
|
result = result.replaceAll("HYPERLINK\\s+\"[^\"]*\"\\s*", "");
|
|
|
|
|
|
+ // 提取页码(在 PAGEREF 后面)
|
|
|
+ String pageNum = "";
|
|
|
+ java.util.regex.Pattern pagePattern = java.util.regex.Pattern.compile("PAGEREF\\s+[^\\s]+\\s+\\\\h\\s*(\\d+)");
|
|
|
+ java.util.regex.Matcher pageMatcher = pagePattern.matcher(result);
|
|
|
+ if (pageMatcher.find()) {
|
|
|
+ pageNum = pageMatcher.group(1);
|
|
|
+ }
|
|
|
+
|
|
|
// 移除 PAGEREF 域代码
|
|
|
// 格式: PAGEREF _Toc176869144 \h 1
|
|
|
- result = result.replaceAll("PAGEREF\\s+[^\\s]+\\s+\\\\h\\s*", "");
|
|
|
+ result = result.replaceAll("PAGEREF\\s+[^\\s]+\\s+\\\\h\\s*\\d*", "");
|
|
|
result = result.replaceAll("PAGEREF\\s+[^\\s]+\\s*", "");
|
|
|
|
|
|
// 移除其他常见域代码标记
|
|
|
@@ -361,7 +375,23 @@ public class WordStructuredExtractionService {
|
|
|
// 清理多余空格
|
|
|
result = result.replaceAll("\\s+", " ").trim();
|
|
|
|
|
|
- return result;
|
|
|
+ // 如果页码为空,尝试从末尾提取数字(有些目录格式是 "标题 页码")
|
|
|
+ if (pageNum.isEmpty() && !result.isEmpty()) {
|
|
|
+ java.util.regex.Matcher endNumMatcher = java.util.regex.Pattern.compile("\\s+(\\d+)$").matcher(result);
|
|
|
+ if (endNumMatcher.find()) {
|
|
|
+ pageNum = endNumMatcher.group(1);
|
|
|
+ result = result.substring(0, endNumMatcher.start()).trim();
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ return new String[]{result, pageNum};
|
|
|
+ }
|
|
|
+
|
|
|
+ /**
|
|
|
+ * Cleans TOC text by removing field codes (simplified variant: returns only the title, i.e. element 0 of the array returned by cleanTocTextWithPage).
|
|
|
+ */
|
|
|
+ private String cleanTocText(String text) {
|
|
|
+ return cleanTocTextWithPage(text)[0];
|
|
|
}
|
|
|
|
|
|
/**
|