4 tygodni temu · 9d425e82c4
--- a/backend/parse-service/src/main/java/com/lingyue/parse/service/WordStructuredExtractionService.java
+++ b/backend/parse-service/src/main/java/com/lingyue/parse/service/WordStructuredExtractionService.java
@@ -112,15 +112,21 @@ public class WordStructuredExtractionService {
 
				                         String paragraphType = detectParagraphType(paragraph, text);
			
 
				                         textElement.setType(paragraphType);
			
 
				                         
			
 
				-                        // 如果是目录项，清理域代码
			
 
				+                        // 如果是目录项，清理域代码并提取页码
			
 
				                         String cleanedText = text.trim();
			
 
				+                        Map<String, Object> style = extractParagraphStyle(paragraph);
			
 
				+                        
			
 
				                         if ("toc_item".equals(paragraphType) || isTocEntry(cleanedText)) {
			
 
				-                            cleanedText = cleanTocText(cleanedText);
			
 
				+                            String[] tocParts = cleanTocTextWithPage(cleanedText);
			
 
				+                            cleanedText = tocParts[0];
			
 
				+                            if (!tocParts[1].isEmpty()) {
			
 
				+                                style.put("tocPageNum", tocParts[1]);
			
 
				+                            }
			
 
				                             textElement.setType("toc_item");
			
 
				                         }
			
 
				                         
			
 
				                         textElement.setContent(cleanedText);
			
 
				-                        textElement.setStyle(extractParagraphStyle(paragraph));
			
 
				+                        textElement.setStyle(style);
			
 
				                         // 逐 Run 提取格式（目录项也提取，但文本会被清理）
			
 
				                         textElement.setRuns(extractTextRunsWithClean(paragraph.getRuns(), "toc_item".equals(textElement.getType())));
			
 
				                         
			
@@ -334,10 +340,10 @@ public class WordStructuredExtractionService {
 
				     }
			
 
				     
			
 
				     /**
			
 
				-     * 清理目录文本，移除域代码
			
 
				+     * 清理目录文本，移除域代码，返回 [标题, 页码]
			
 
				      */
			
 
				-    private String cleanTocText(String text) {
			
 
				-        if (text == null) return "";
			
 
				+    private String[] cleanTocTextWithPage(String text) {
			
 
				+        if (text == null) return new String[]{"", ""};
			
 
				         
			
 
				         String result = text;
			
 
				         
			
@@ -350,9 +356,17 @@ public class WordStructuredExtractionService {
 
				         result = result.replaceAll("HYPERLINK\\s+\\\\l\\s+\"[^\"]*\"\\s*", "");
			
 
				         result = result.replaceAll("HYPERLINK\\s+\"[^\"]*\"\\s*", "");
			
 
				         
			
 
				+        // 提取页码（在 PAGEREF 后面）
			
 
				+        String pageNum = "";
			
 
				+        java.util.regex.Pattern pagePattern = java.util.regex.Pattern.compile("PAGEREF\\s+[^\\s]+\\s+\\\\h\\s*(\\d+)");
			
 
				+        java.util.regex.Matcher pageMatcher = pagePattern.matcher(result);
			
 
				+        if (pageMatcher.find()) {
			
 
				+            pageNum = pageMatcher.group(1);
			
 
				+        }
			
 
				+        
			
 
				         // 移除 PAGEREF 域代码
			
 
				         // 格式: PAGEREF _Toc176869144 \h 1
			
 
				-        result = result.replaceAll("PAGEREF\\s+[^\\s]+\\s+\\\\h\\s*", "");
			
 
				+        result = result.replaceAll("PAGEREF\\s+[^\\s]+\\s+\\\\h\\s*\\d*", "");
			
 
				         result = result.replaceAll("PAGEREF\\s+[^\\s]+\\s*", "");
			
 
				         
			
 
				         // 移除其他常见域代码标记
			
@@ -361,7 +375,23 @@ public class WordStructuredExtractionService {
 
				         // 清理多余空格
			
 
				         result = result.replaceAll("\\s+", " ").trim();
			
 
				         
			
 
				-        return result;
			
 
				+        // 如果页码为空，尝试从末尾提取数字（有些目录格式是 "标题 页码"）
			
 
				+        if (pageNum.isEmpty() && !result.isEmpty()) {
			
 
				+            java.util.regex.Matcher endNumMatcher = java.util.regex.Pattern.compile("\\s+(\\d+)$").matcher(result);
			
 
				+            if (endNumMatcher.find()) {
			
 
				+                pageNum = endNumMatcher.group(1);
			
 
				+                result = result.substring(0, endNumMatcher.start()).trim();
			
 
				+            }
			
 
				+        }
			
 
				+        
			
 
				+        return new String[]{result, pageNum};
			
 
				+    }
			
 
				+    
			
 
				+    /**
			
 
				+     * Simplified TOC cleaner: delegates to cleanTocTextWithPage and returns only element 0 of its result (the cleaned title), discarding the page number.
			
 
				+     */
			
 
				+    private String cleanTocText(String text) {
			
 
				+        return cleanTocTextWithPage(text)[0];
			
 
				     }
			
 
				     
			
 
				     /**
			
--- a/frontend/vue-demo/src/views/Editor.vue
+++ b/frontend/vue-demo/src/views/Editor.vue
@@ -925,6 +925,19 @@ function wrapWithParagraphTag(content, type, style) {
 
				   
			
 
				   const styleAttr = styleAttrs.length > 0 ? ` style="${styleAttrs.join(';')}"` : ''
			
 
				   
			
 
				+  // 目录项特殊处理
			
 
				+  if (type === 'toc_item') {
			
 
				+    const pageNum = style?.tocPageNum || ''
			
 
				+    // 计算缩进级别（根据章节号判断）
			
 
				+    let level = 0
			
 
				+    const levelMatch = content.match(/^(\d+(?:\.\d+)*)/)
			
 
				+    if (levelMatch) {
			
 
				+      level = (levelMatch[1].match(/\./g) || []).length
			
 
				+    }
			
 
				+    const indentStyle = level > 0 ? ` style="padding-left:${level * 20}px"` : ''
			
 
				+    return `<div class="doc-toc-item"${indentStyle}><span class="toc-title">${content}</span><span class="toc-dots"></span><span class="toc-page">${pageNum}</span></div>`
			
 
				+  }
			
 
				+  
			
 
				   switch (type) {
			
 
				     case 'heading1':
			
 
				       return `<h1${styleAttr}>${content}</h1>`
			
@@ -936,8 +949,6 @@ function wrapWithParagraphTag(content, type, style) {
 
				       return `<h2${styleAttr}>${content}</h2>`
			
 
				     case 'toc':
			
 
				       return `<div class="doc-toc-title"${styleAttr}>${content}</div>`
			
 
				-    case 'toc_item':
			
 
				-      return `<div class="doc-toc-item"${styleAttr}>${content}</div>`
			
 
				     case 'bullet':
			
 
				     case 'list_item':
			
 
				       return `<div class="doc-list-item bullet"${styleAttr}>${content}</div>`
			
@@ -1494,10 +1505,9 @@ onUnmounted(() => {
 
				     
			
 
				     :deep(.doc-toc-item) {
			
 
				       display: flex;
			
 
				-      justify-content: space-between;
			
 
				       align-items: baseline;
			
 
				-      padding: 4px 0;
			
 
				-      border-bottom: 1px dotted #ccc;
			
 
				+      padding: 6px 0;
			
 
				+      line-height: 1.6;
			
 
				       cursor: pointer;
			
 
				       transition: background-color 0.2s;
			
 
				       
			
@@ -1505,12 +1515,24 @@ onUnmounted(() => {
 
				         background-color: #f5f5f5;
			
 
				       }
			
 
				       
			
 
				-      // 页码样式（如果有的话）
			
 
				-      &::after {
			
 
				-        content: attr(data-page);
			
 
				+      .toc-title {
			
 
				+        flex-shrink: 0;
			
 
				+        white-space: nowrap;
			
 
				+      }
			
 
				+      
			
 
				+      .toc-dots {
			
 
				+        flex: 1;
			
 
				+        border-bottom: 1px dotted #999;
			
 
				+        margin: 0 8px;
			
 
				+        min-width: 20px;
			
 
				+        height: 0.6em;
			
 
				+      }
			
 
				+      
			
 
				+      .toc-page {
			
 
				         flex-shrink: 0;
			
 
				-        margin-left: 8px;
			
 
				         color: #666;
			
 
				+        min-width: 20px;
			
 
				+        text-align: right;
			
 
				       }
			
 
				     }