瀏覽代碼

feat: 支持Word目录(TOC)解析和渲染

后端改进 (WordStructuredExtractionService):
1. 新增 isTocEntry() - 检测目录项(包含 HYPERLINK/PAGEREF 等域代码)
2. 新增 cleanTocText() - 清理目录域代码,移除:
   - TOC \o "1-2" 等域代码头
   - HYPERLINK \l "_Toc..." 链接代码
   - PAGEREF _Toc... \h 页码引用代码
3. 修改 detectParagraphType() 识别 toc_item 类型
4. 新增 extractTextRunsWithClean() 支持清理目录文本的 runs

前端改进 (Editor.vue):
1. wrapWithParagraphTag 新增 toc/toc_item 类型渲染
2. 新增目录样式:
   - .doc-toc-title 目录标题居中加粗
   - .doc-toc-item 目录项带虚线下划线、悬停效果
何文松 1 月之前
父節點
當前提交
253342802b

+ 79 - 8
backend/parse-service/src/main/java/com/lingyue/parse/service/WordStructuredExtractionService.java

@@ -108,14 +108,24 @@ public class WordStructuredExtractionService {
                     if (text != null && !text.trim().isEmpty()) {
                         ContentElement textElement = new ContentElement();
                         textElement.setIndex(elementIndex++);
-                        textElement.setType(detectParagraphType(paragraph, text));
-                        textElement.setContent(text.trim());
+                        
+                        String paragraphType = detectParagraphType(paragraph, text);
+                        textElement.setType(paragraphType);
+                        
+                        // 如果是目录项,清理域代码
+                        String cleanedText = text.trim();
+                        if ("toc_item".equals(paragraphType) || isTocEntry(cleanedText)) {
+                            cleanedText = cleanTocText(cleanedText);
+                            textElement.setType("toc_item");
+                        }
+                        
+                        textElement.setContent(cleanedText);
                         textElement.setStyle(extractParagraphStyle(paragraph));
-                        // 逐 Run 提取格式
-                        textElement.setRuns(extractTextRuns(paragraph.getRuns()));
+                        // 逐 Run 提取格式(目录项也提取,但文本会被清理)
+                        textElement.setRuns(extractTextRunsWithClean(paragraph.getRuns(), "toc_item".equals(textElement.getType())));
                         
                         elements.add(textElement);
-                        fullText.append(text).append("\n");
+                        fullText.append(cleanedText).append("\n");
                     }
                     
                 } else if (bodyElement instanceof XWPFTable) {
@@ -286,8 +296,13 @@ public class WordStructuredExtractionService {
             }
         }
         
-        // 基于内容检测
+        // 检测目录项(包含 HYPERLINK 或 PAGEREF 域代码)
         String trimmed = text.trim();
+        if (isTocEntry(trimmed)) {
+            return "toc_item";
+        }
+        
+        // 基于内容检测
         if (trimmed.length() < 100) {
             if (trimmed.matches("^[一二三四五六七八九十]+[、.].*") ||
                 trimmed.matches("^第[一二三四五六七八九十]+[章节部分].*") ||
@@ -306,6 +321,49 @@ public class WordStructuredExtractionService {
         return "paragraph";
     }
     
+    /**
+     * Reports whether the given text looks like a Word table-of-contents (TOC) entry.
+     * <p>
+     * The check is a plain, case-sensitive substring test for the field-code markers
+     * {@code HYPERLINK}, {@code PAGEREF}, {@code TOC \o} and {@code _Toc}.
+     * NOTE(review): because these are bare substring checks, ordinary body text that
+     * happens to contain one of these markers is also reported as a TOC entry -
+     * confirm this is acceptable for the documents being parsed.
+     *
+     * @param text paragraph text to inspect; may be {@code null}
+     * @return {@code true} if any marker is present; {@code false} otherwise,
+     *         including when {@code text} is {@code null}
+     */
+    private boolean isTocEntry(String text) {
+        if (text == null) return false;
+        // Markers characteristic of TOC field codes
+        return text.contains("HYPERLINK") || 
+               text.contains("PAGEREF") || 
+               text.contains("TOC \\o") ||
+               text.contains("_Toc");
+    }
+    
+    /**
+     * Strips Word field-code fragments from table-of-contents text.
+     * <p>
+     * The replacements run in a fixed order: TOC headers, then HYPERLINK codes, then
+     * PAGEREF codes, then any leftover single-letter switches, and finally whitespace
+     * is collapsed to single spaces and the result is trimmed. Within each group the
+     * more specific pattern (with a quoted argument or a trailing switch) is applied
+     * before the looser one.
+     * NOTE(review): the leftover-switch pass removes a backslash followed by one of
+     * the letters l, o, h, z or u anywhere in the string, not only next to a field
+     * code - confirm genuine text never contains such sequences.
+     *
+     * @param text raw text possibly containing field codes; may be {@code null}
+     * @return the cleaned text, or an empty string when {@code text} is {@code null}
+     */
+    private String cleanTocText(String text) {
+        if (text == null) return "";
+        
+        String result = text;
+        
+        // Remove the TOC field-code header (with a quoted argument first, then the bare switch form)
+        result = result.replaceAll("TOC\\s+\\\\[ohzu]\\s+\"[^\"]*\"\\s*", "");
+        result = result.replaceAll("TOC\\s+\\\\[ohzu]\\s*", "");
+        
+        // Remove HYPERLINK field codes
+        // Format: HYPERLINK \l "_Toc176869144" actual text
+        result = result.replaceAll("HYPERLINK\\s+\\\\l\\s+\"[^\"]*\"\\s*", "");
+        result = result.replaceAll("HYPERLINK\\s+\"[^\"]*\"\\s*", "");
+        
+        // Remove PAGEREF field codes (whatever follows the code, e.g. the page number, is kept)
+        // Format: PAGEREF _Toc176869144 \h 1
+        result = result.replaceAll("PAGEREF\\s+[^\\s]+\\s+\\\\h\\s*", "");
+        result = result.replaceAll("PAGEREF\\s+[^\\s]+\\s*", "");
+        
+        // Remove any other leftover field-code switches
+        result = result.replaceAll("\\\\[lohzu]\\s*", "");
+        
+        // Collapse runs of whitespace into single spaces and trim the ends
+        result = result.replaceAll("\\s+", " ").trim();
+        
+        return result;
+    }
+    
     /**
      * 提取段落样式
      */
@@ -430,19 +488,32 @@ public class WordStructuredExtractionService {
      * 提取段落中所有 Run 的文本和样式
      */
     private List<TextRun> extractTextRuns(List<XWPFRun> xwpfRuns) {
+        return extractTextRunsWithClean(xwpfRuns, false); // delegate with TOC field-code cleanup disabled
+    }
+    
+    /**
+     * 提取段落中所有 Run 的文本和样式(可选清理目录域代码)
+     */
+    private List<TextRun> extractTextRunsWithClean(List<XWPFRun> xwpfRuns, boolean cleanToc) {
         if (xwpfRuns == null || xwpfRuns.isEmpty()) {
             return null;
         }
         
         List<TextRun> runs = new ArrayList<>();
         for (XWPFRun xwpfRun : xwpfRuns) {
-            // 使用 toString() 而不是 text(),因为 toString() 会包含换行符
-            // 或者手动处理 Run 内容
             String text = extractRunText(xwpfRun);
             if (text == null || text.isEmpty()) {
                 continue;
             }
             
+            // 如果需要清理目录域代码
+            if (cleanToc) {
+                text = cleanTocText(text);
+                if (text.isEmpty()) {
+                    continue;
+                }
+            }
+            
             TextRun run = new TextRun();
             run.setText(text);
             

+ 34 - 0
frontend/vue-demo/src/views/Editor.vue

@@ -934,6 +934,10 @@ function wrapWithParagraphTag(content, type, style) {
       return `<h3${styleAttr}>${content}</h3>`
     case 'heading':
       return `<h2${styleAttr}>${content}</h2>`
+    case 'toc':
+      return `<div class="doc-toc-title"${styleAttr}>${content}</div>`
+    case 'toc_item':
+      return `<div class="doc-toc-item"${styleAttr}>${content}</div>`
     case 'bullet':
     case 'list_item':
       return `<div class="doc-list-item bullet"${styleAttr}>${content}</div>`
@@ -1480,6 +1484,36 @@ onUnmounted(() => {
       }
     }
     
+    // Table-of-contents styles: centered bold title, and dotted-border entries with a hover highlight
+    :deep(.doc-toc-title) {
+      font-size: 18pt;
+      font-weight: bold;
+      text-align: center;
+      margin: 20px 0 16px;
+    }
+    
+    :deep(.doc-toc-item) {
+      display: flex;
+      justify-content: space-between;
+      align-items: baseline;
+      padding: 4px 0;
+      border-bottom: 1px dotted #ccc;
+      cursor: pointer;
+      transition: background-color 0.2s;
+      
+      &:hover {
+        background-color: #f5f5f5;
+      }
+      
+      // Page-number style (if any), read from data-page. NOTE(review): the toc_item markup in this commit does not set data-page - confirm
+      &::after {
+        content: attr(data-page);
+        flex-shrink: 0;
+        margin-left: 8px;
+        color: #666;
+      }
+    }
+    
     // 列表项样式
     :deep(.doc-list-item) {
       position: relative;