|
|
@@ -108,14 +108,24 @@ public class WordStructuredExtractionService {
|
|
|
if (text != null && !text.trim().isEmpty()) {
|
|
|
ContentElement textElement = new ContentElement();
|
|
|
textElement.setIndex(elementIndex++);
|
|
|
- textElement.setType(detectParagraphType(paragraph, text));
|
|
|
- textElement.setContent(text.trim());
|
|
|
+
|
|
|
+ String paragraphType = detectParagraphType(paragraph, text);
|
|
|
+ textElement.setType(paragraphType);
|
|
|
+
|
|
|
+ // 如果是目录项,清理域代码
|
|
|
+ String cleanedText = text.trim();
|
|
|
+ if ("toc_item".equals(paragraphType) || isTocEntry(cleanedText)) {
|
|
|
+ cleanedText = cleanTocText(cleanedText);
|
|
|
+ textElement.setType("toc_item");
|
|
|
+ }
|
|
|
+
|
|
|
+ textElement.setContent(cleanedText);
|
|
|
textElement.setStyle(extractParagraphStyle(paragraph));
|
|
|
- // 逐 Run 提取格式
|
|
|
- textElement.setRuns(extractTextRuns(paragraph.getRuns()));
|
|
|
+ // 逐 Run 提取格式(目录项也提取,但文本会被清理)
|
|
|
+ textElement.setRuns(extractTextRunsWithClean(paragraph.getRuns(), "toc_item".equals(textElement.getType())));
|
|
|
|
|
|
elements.add(textElement);
|
|
|
- fullText.append(text).append("\n");
|
|
|
+ fullText.append(cleanedText).append("\n");
|
|
|
}
|
|
|
|
|
|
} else if (bodyElement instanceof XWPFTable) {
|
|
|
@@ -286,8 +296,13 @@ public class WordStructuredExtractionService {
|
|
|
}
|
|
|
}
|
|
|
|
|
|
- // 基于内容检测
|
|
|
+ // 检测目录项(包含 HYPERLINK 或 PAGEREF 域代码)
|
|
|
String trimmed = text.trim();
|
|
|
+ if (isTocEntry(trimmed)) {
|
|
|
+ return "toc_item";
|
|
|
+ }
|
|
|
+
|
|
|
+ // 基于内容检测
|
|
|
if (trimmed.length() < 100) {
|
|
|
if (trimmed.matches("^[一二三四五六七八九十]+[、.].*") ||
|
|
|
trimmed.matches("^第[一二三四五六七八九十]+[章节部分].*") ||
|
|
|
@@ -306,6 +321,49 @@ public class WordStructuredExtractionService {
|
|
|
return "paragraph";
|
|
|
}
|
|
|
|
|
|
+ /**
|
|
|
+ * Heuristically decides whether a piece of text looks like a table-of-contents entry.
+ * The check is a plain, case-sensitive substring search for the markers
+ * HYPERLINK, PAGEREF, "TOC \o" and _Toc.
+ * NOTE(review): ordinary text that merely contains one of these substrings also matches - confirm this is acceptable.
+ *
+ * @param text the text to inspect; may be null
+ * @return true if any marker substring is present; false otherwise, including for null input
|
|
|
+ */
|
|
|
+ private boolean isTocEntry(String text) {
|
|
|
+ if (text == null) return false;
|
|
|
+ // Substrings characteristic of TOC field codes
|
|
|
+ return text.contains("HYPERLINK") ||
|
|
|
+ text.contains("PAGEREF") ||
|
|
|
+ text.contains("TOC \\o") ||
|
|
|
+ text.contains("_Toc");
|
|
|
+ }
|
|
|
+
|
|
|
+ /**
|
|
|
+ * Removes Word field-code fragments from table-of-contents text using a fixed sequence of regex replacements.
+ * Applied in order: TOC headers (with or without a quoted argument), HYPERLINK codes with a quoted
+ * target (with or without the l switch), PAGEREF codes (with or without the h switch), and any
+ * remaining backslash followed by one of the letters l, o, h, z, u. Finally, whitespace runs are
+ * collapsed to a single space and the result is trimmed.
+ * NOTE(review): the last switch-stripping pass matches anywhere in the text, not only inside field codes - confirm this is intended.
+ *
+ * @param text the raw text; may be null
+ * @return the cleaned text, or an empty string when text is null
|
|
|
+ */
|
|
|
+ private String cleanTocText(String text) {
|
|
|
+ if (text == null) return "";
|
|
|
+
|
|
|
+ String result = text;
|
|
|
+
|
|
|
+ // Remove the TOC field-code header (first with a quoted argument, then the bare switch form)
|
|
|
+ result = result.replaceAll("TOC\\s+\\\\[ohzu]\\s+\"[^\"]*\"\\s*", "");
|
|
|
+ result = result.replaceAll("TOC\\s+\\\\[ohzu]\\s*", "");
|
|
|
+
|
|
|
+ // Remove HYPERLINK field codes
|
|
|
+ // Format: HYPERLINK \l "_Toc176869144" actual text
|
|
|
+ result = result.replaceAll("HYPERLINK\\s+\\\\l\\s+\"[^\"]*\"\\s*", "");
|
|
|
+ result = result.replaceAll("HYPERLINK\\s+\"[^\"]*\"\\s*", "");
|
|
|
+
|
|
|
+ // Remove PAGEREF field codes
|
|
|
+ // Format: PAGEREF _Toc176869144 \h 1
|
|
|
+ result = result.replaceAll("PAGEREF\\s+[^\\s]+\\s+\\\\h\\s*", "");
|
|
|
+ result = result.replaceAll("PAGEREF\\s+[^\\s]+\\s*", "");
|
|
|
+
|
|
|
+ // Remove other common field-code switch markers left over after the passes above
|
|
|
+ result = result.replaceAll("\\\\[lohzu]\\s*", "");
|
|
|
+
|
|
|
+ // Collapse redundant whitespace and trim the ends
|
|
|
+ result = result.replaceAll("\\s+", " ").trim();
|
|
|
+
|
|
|
+ return result;
|
|
|
+ }
|
|
|
+
|
|
|
/**
|
|
|
* 提取段落样式
|
|
|
*/
|
|
|
@@ -430,19 +488,32 @@ public class WordStructuredExtractionService {
|
|
|
* 提取段落中所有 Run 的文本和样式
|
|
|
*/
|
|
|
private List<TextRun> extractTextRuns(List<XWPFRun> xwpfRuns) {
|
|
|
+ return extractTextRunsWithClean(xwpfRuns, false); // delegate with cleanToc=false, so the TOC field-code cleanup step is skipped
|
|
|
+ }
|
|
|
+
|
|
|
+ /**
|
|
|
+ * 提取段落中所有 Run 的文本和样式(可选清理目录域代码)
|
|
|
+ */
|
|
|
+ private List<TextRun> extractTextRunsWithClean(List<XWPFRun> xwpfRuns, boolean cleanToc) {
|
|
|
if (xwpfRuns == null || xwpfRuns.isEmpty()) {
|
|
|
return null;
|
|
|
}
|
|
|
|
|
|
List<TextRun> runs = new ArrayList<>();
|
|
|
for (XWPFRun xwpfRun : xwpfRuns) {
|
|
|
- // 使用 toString() 而不是 text(),因为 toString() 会包含换行符
|
|
|
- // 或者手动处理 Run 内容
|
|
|
String text = extractRunText(xwpfRun);
|
|
|
if (text == null || text.isEmpty()) {
|
|
|
continue;
|
|
|
}
|
|
|
|
|
|
+ // 如果需要清理目录域代码
|
|
|
+ if (cleanToc) {
|
|
|
+ text = cleanTocText(text);
|
|
|
+ if (text.isEmpty()) {
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
TextRun run = new TextRun();
|
|
|
run.setText(text);
|
|
|
|