1 month ago · 253342802b
--- a/backend/parse-service/src/main/java/com/lingyue/parse/service/WordStructuredExtractionService.java
+++ b/backend/parse-service/src/main/java/com/lingyue/parse/service/WordStructuredExtractionService.java
@@ -108,14 +108,24 @@ public class WordStructuredExtractionService {
 
				                     if (text != null && !text.trim().isEmpty()) {
			
 
				                         ContentElement textElement = new ContentElement();
			
 
				                         textElement.setIndex(elementIndex++);
			
 
				-                        textElement.setType(detectParagraphType(paragraph, text));
			
 
				-                        textElement.setContent(text.trim());
			
 
				+                        
			
 
				+                        String paragraphType = detectParagraphType(paragraph, text);
			
 
				+                        textElement.setType(paragraphType);
			
 
				+                        
			
 
				+                        // 如果是目录项，清理域代码
			
 
				+                        String cleanedText = text.trim();
			
 
				+                        if ("toc_item".equals(paragraphType) || isTocEntry(cleanedText)) {
			
 
				+                            cleanedText = cleanTocText(cleanedText);
			
 
				+                            textElement.setType("toc_item");
			
 
				+                        }
			
 
				+                        
			
 
				+                        textElement.setContent(cleanedText);
			
 
				                         textElement.setStyle(extractParagraphStyle(paragraph));
			
 
				-                        // 逐 Run 提取格式
			
 
				-                        textElement.setRuns(extractTextRuns(paragraph.getRuns()));
			
 
				+                        // 逐 Run 提取格式（目录项也提取，但文本会被清理）
			
 
				+                        textElement.setRuns(extractTextRunsWithClean(paragraph.getRuns(), "toc_item".equals(textElement.getType())));
			
 
				                         
			
 
				                         elements.add(textElement);
			
 
				-                        fullText.append(text).append("\n");
			
 
				+                        fullText.append(cleanedText).append("\n");
			
 
				                     }
			
 
				                     
			
 
				                 } else if (bodyElement instanceof XWPFTable) {
			
@@ -286,8 +296,13 @@ public class WordStructuredExtractionService {
 
				             }
			
 
				         }
			
 
				         
			
 
				-        // 基于内容检测
			
 
				+        // 检测目录项（包含 HYPERLINK 或 PAGEREF 域代码）
			
 
				         String trimmed = text.trim();
			
 
				+        if (isTocEntry(trimmed)) {
			
 
				+            return "toc_item";
			
 
				+        }
			
 
				+        
			
 
				+        // 基于内容检测
			
 
				         if (trimmed.length() < 100) {
			
 
				             if (trimmed.matches("^[一二三四五六七八九十]+[、.].*") ||
			
 
				                 trimmed.matches("^第[一二三四五六七八九十]+[章节部分].*") ||
			
@@ -306,6 +321,49 @@ public class WordStructuredExtractionService {
 
				         return "paragraph";
			
 
				     }
			
 
				     
			
 
				+    /**
			
 
				+     * 检测是否为目录项
			
 
				+     */
			
 
				+    private boolean isTocEntry(String text) {
			
 
				+        if (text == null) return false;
			
 
				+        // 目录域代码特征
			
 
				+        return text.contains("HYPERLINK") || 
			
 
				+               text.contains("PAGEREF") || 
			
 
				+               text.contains("TOC \\o") ||
			
 
				+               text.contains("_Toc");
			
 
				+    }
			
 
				+    
			
 
				+    /**
			
 
				+     * 清理目录文本，移除域代码
			
 
				+     */
			
 
				+    private String cleanTocText(String text) {
			
 
				+        if (text == null) return "";
			
 
				+        
			
 
				+        String result = text;
			
 
				+        
			
 
				+        // 移除 TOC 域代码头
			
 
				+        result = result.replaceAll("TOC\\s+\\\\[ohzu]\\s+\"[^\"]*\"\\s*", "");
			
 
				+        result = result.replaceAll("TOC\\s+\\\\[ohzu]\\s*", "");
			
 
				+        
			
 
				+        // 移除 HYPERLINK 域代码
			
 
				+        // 格式: HYPERLINK \l "_Toc176869144" 实际文本
			
 
				+        result = result.replaceAll("HYPERLINK\\s+\\\\l\\s+\"[^\"]*\"\\s*", "");
			
 
				+        result = result.replaceAll("HYPERLINK\\s+\"[^\"]*\"\\s*", "");
			
 
				+        
			
 
				+        // 移除 PAGEREF 域代码
			
 
				+        // 格式: PAGEREF _Toc176869144 \h 1
			
 
				+        result = result.replaceAll("PAGEREF\\s+[^\\s]+\\s+\\\\h\\s*", "");
			
 
				+        result = result.replaceAll("PAGEREF\\s+[^\\s]+\\s*", "");
			
 
				+        
			
 
				+        // 移除其他常见域代码标记
			
 
				+        result = result.replaceAll("\\\\[lohzu]\\s*", "");
			
 
				+        
			
 
				+        // 清理多余空格
			
 
				+        result = result.replaceAll("\\s+", " ").trim();
			
 
				+        
			
 
				+        return result;
			
 
				+    }
			
 
				+    
			
 
				     /**
			
 
				      * 提取段落样式
			
 
				      */
			
@@ -430,19 +488,32 @@ public class WordStructuredExtractionService {
 
				      * 提取段落中所有 Run 的文本和样式
			
 
				      */
			
 
				     private List<TextRun> extractTextRuns(List<XWPFRun> xwpfRuns) {
			
 
				+        return extractTextRunsWithClean(xwpfRuns, false);
			
 
				+    }
			
 
				+    
			
 
				+    /**
			
 
				+     * 提取段落中所有 Run 的文本和样式（可选清理目录域代码）
			
 
				+     */
			
 
				+    private List<TextRun> extractTextRunsWithClean(List<XWPFRun> xwpfRuns, boolean cleanToc) {
			
 
				         if (xwpfRuns == null || xwpfRuns.isEmpty()) {
			
 
				             return null;
			
 
				         }
			
 
				         
			
 
				         List<TextRun> runs = new ArrayList<>();
			
 
				         for (XWPFRun xwpfRun : xwpfRuns) {
			
 
				-            // 使用 toString() 而不是 text()，因为 toString() 会包含换行符
			
 
				-            // 或者手动处理 Run 内容
			
 
				             String text = extractRunText(xwpfRun);
			
 
				             if (text == null || text.isEmpty()) {
			
 
				                 continue;
			
 
				             }
			
 
				             
			
 
				+            // 如果需要清理目录域代码
			
 
				+            if (cleanToc) {
			
 
				+                text = cleanTocText(text);
			
 
				+                if (text.isEmpty()) {
			
 
				+                    continue;
			
 
				+                }
			
 
				+            }
			
 
				+            
			
 
				             TextRun run = new TextRun();
			
 
				             run.setText(text);
			
 
				             
			
--- a/frontend/vue-demo/src/views/Editor.vue
+++ b/frontend/vue-demo/src/views/Editor.vue
@@ -934,6 +934,10 @@ function wrapWithParagraphTag(content, type, style) {
 
				       return `<h3${styleAttr}>${content}</h3>`
			
 
				     case 'heading':
			
 
				       return `<h2${styleAttr}>${content}</h2>`
			
 
				+    case 'toc':
			
 
				+      return `<div class="doc-toc-title"${styleAttr}>${content}</div>`
			
 
				+    case 'toc_item':
			
 
				+      return `<div class="doc-toc-item"${styleAttr}>${content}</div>`
			
 
				     case 'bullet':
			
 
				     case 'list_item':
			
 
				       return `<div class="doc-list-item bullet"${styleAttr}>${content}</div>`
			
@@ -1480,6 +1484,36 @@ onUnmounted(() => {
 
				       }
			
 
				     }
			
 
				     
			
 
				+    // 目录样式
			
 
				+    :deep(.doc-toc-title) {
			
 
				+      font-size: 18pt;
			
 
				+      font-weight: bold;
			
 
				+      text-align: center;
			
 
				+      margin: 20px 0 16px;
			
 
				+    }
			
 
				+    
			
 
				+    :deep(.doc-toc-item) {
			
 
				+      display: flex;
			
 
				+      justify-content: space-between;
			
 
				+      align-items: baseline;
			
 
				+      padding: 4px 0;
			
 
				+      border-bottom: 1px dotted #ccc;
			
 
				+      cursor: pointer;
			
 
				+      transition: background-color 0.2s;
			
 
				+      
			
 
				+      &:hover {
			
 
				+        background-color: #f5f5f5;
			
 
				+      }
			
 
				+      
			
 
				+      // 页码样式（如果有的话）
			
 
				+      &::after {
			
 
				+        content: attr(data-page);
			
 
				+        flex-shrink: 0;
			
 
				+        margin-left: 8px;
			
 
				+        color: #666;
			
 
				+      }
			
 
				+    }
			
 
				+    
			
 
				     // 列表项样式
			
 
				     :deep(.doc-list-item) {
			
 
				       position: relative;