فهرست منبع

fix: 修复Word批注解析和图片展示问题

1. Word解析时过滤批注内容(Comment by xxx: ...)
   - 修改WordTextExtractionService,不再使用XWPFWordExtractor
   - 直接遍历段落提取文本,跳过批注
   - 添加正则表达式过滤残留的批注文本

2. 图片展示问题修复
   - StructuredDocumentDTO添加images字段
   - StructuredDocumentService从document_elements获取图片信息
   - 前端Editor.vue添加renderStructuredDocument函数
   - 将blocks和images按index排序合并渲染

3. 任务中心当前阶段显示中文名称(之前已提交)
何文松 1 ماه پیش
والد
کامیت
8684762da5

+ 32 - 0
backend/document-service/src/main/java/com/lingyue/document/dto/StructuredDocumentDTO.java

@@ -44,6 +44,9 @@ public class StructuredDocumentDTO {
     @Schema(description = "内容块列表")
     private List<BlockDTO> blocks;
     
+    @Schema(description = "图片列表(从 document_elements 中提取)")
+    private List<ImageDTO> images;
+    
     @Schema(description = "实体统计")
     private EntityStats entityStats;
     
@@ -88,6 +91,35 @@ public class StructuredDocumentDTO {
         private String markedHtml;
     }
     
+    /**
+     * Image DTO: describes one image of a structured document.
+     *
+     * Plain data holder; the Lombok annotations below generate the accessors,
+     * the builder and the no-args / all-args constructors. The {@code @Schema}
+     * descriptions are API documentation strings and are kept verbatim.
+     */
+    @Data
+    @Builder
+    @NoArgsConstructor
+    @AllArgsConstructor
+    @Schema(description = "图片信息")
+    public static class ImageDTO {
+        
+        // Ordering index of the image within the document
+        @Schema(description = "图片在文档中的顺序索引")
+        private Integer index;
+        
+        // URL the image can be fetched from
+        @Schema(description = "图片访问 URL")
+        private String url;
+        
+        // Description / alternative text of the image
+        @Schema(description = "图片描述/替代文本")
+        private String alt;
+        
+        // Image width in pixels
+        @Schema(description = "图片宽度(像素)")
+        private Integer width;
+        
+        // Image height in pixels
+        @Schema(description = "图片高度(像素)")
+        private Integer height;
+        
+        // Image format (value set is not defined here)
+        @Schema(description = "图片格式")
+        private String format;
+    }
+    
     /**
      * 实体统计
      */

+ 24 - 0
backend/document-service/src/main/java/com/lingyue/document/service/StructuredDocumentService.java

@@ -6,6 +6,7 @@ import com.lingyue.document.dto.StructuredDocumentDTO.*;
 import com.lingyue.document.entity.Document;
 import com.lingyue.document.entity.DocumentBlock;
 import com.lingyue.document.entity.DocumentBlock.TextElement;
+import com.lingyue.document.entity.DocumentElement;
 import com.lingyue.document.repository.DocumentBlockRepository;
 import com.lingyue.document.repository.DocumentRepository;
 import lombok.RequiredArgsConstructor;
@@ -34,6 +35,7 @@ public class StructuredDocumentService {
     
     private final DocumentRepository documentRepository;
     private final DocumentBlockRepository blockRepository;
+    private final DocumentElementService documentElementService;
     
     /**
      * 获取结构化文档(用于编辑器渲染)
@@ -56,17 +58,39 @@ public class StructuredDocumentService {
         // 4. 统计实体
         EntityStats stats = buildEntityStats(blocks);
         
+        // 5. 获取图片列表
+        List<ImageDTO> images = buildImageList(documentId);
+        
         return StructuredDocumentDTO.builder()
                 .documentId(documentId)
                 .revision(1) // TODO: 实现版本控制
                 .title(document.getName())
                 .status(document.getStatus())
                 .blocks(blockDTOs)
+                .images(images)
                 .entityStats(stats)
                 .updatedAt(document.getUpdateTime())
                 .build();
     }
     
+    /**
+     * Builds the image list of a document from the image elements returned by
+     * {@code documentElementService} (per the original note, these come from
+     * the document_elements table).
+     *
+     * @param documentId ID of the document whose images are requested
+     * @return one {@link ImageDTO} per image element, in the order the service returned them
+     */
+    private List<ImageDTO> buildImageList(String documentId) {
+        return documentElementService.getImagesByDocumentId(documentId)
+                .stream()
+                .map(this::toImageDTO)
+                .collect(Collectors.toList());
+    }
+
+    /**
+     * Copies the image-related attributes of one document element into an {@link ImageDTO}.
+     */
+    private ImageDTO toImageDTO(DocumentElement element) {
+        return ImageDTO.builder()
+                .index(element.getElementIndex())
+                .url(element.getImageUrl())
+                .alt(element.getImageAlt())
+                .width(element.getImageWidth())
+                .height(element.getImageHeight())
+                .format(element.getImageFormat())
+                .build();
+    }
+    
     /**
      * 构建块 DTO
      */

+ 58 - 7
backend/parse-service/src/main/java/com/lingyue/parse/service/WordTextExtractionService.java

@@ -4,13 +4,14 @@ import com.lingyue.common.exception.ServiceException;
 import lombok.extern.slf4j.Slf4j;
 import org.apache.poi.hwpf.HWPFDocument;
 import org.apache.poi.hwpf.extractor.WordExtractor;
-import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
 import org.apache.poi.xwpf.usermodel.XWPFDocument;
+import org.apache.poi.xwpf.usermodel.XWPFParagraph;
 import org.springframework.stereotype.Service;
 
 import java.io.File;
 import java.io.FileInputStream;
 import java.io.IOException;
+import java.util.regex.Pattern;
 
 /**
  * Word文档文本提取服务
@@ -23,6 +24,14 @@ import java.io.IOException;
 @Service
 public class WordTextExtractionService {
     
+    // Matches Word review-comment text: "Comment by xxx: ..." or "批注 [xxx]: ...",
+    // plus inline markers such as "[Comment by xxx]" and "[批注...]".
+    // The author / marker parts exclude \r and \n so that a single match can never
+    // run across line breaks and swallow unrelated body text up to a later ':' or ']'.
+    private static final Pattern COMMENT_PATTERN = Pattern.compile(
+            "(?m)^\\s*Comment by [^:\\r\\n]+:.*$|" +          // English format
+            "(?m)^\\s*批注\\s*\\[[^\\]\\r\\n]+\\]:.*$|" +     // Chinese format
+            "\\[Comment by [^\\]\\r\\n]+\\]|" +                // Inline comment marker
+            "\\[批注[^\\]\\r\\n]*\\]"                          // Inline Chinese comment
+    );
+    
     /**
      * 提取Word文档文本
      * 
@@ -52,18 +61,34 @@ public class WordTextExtractionService {
     }
     
     /**
-     * 从.docx文件提取文本
+     * Extracts the text of a .docx file without review comments.
+     * 
+     * Per the original note, XWPFWordExtractor.getText() includes comment content,
+     * so the document body is walked directly instead. Body elements are visited in
+     * document order and both paragraphs and tables are extracted; iterating
+     * getParagraphs() alone would silently drop all table text.
+     * 
+     * @param filePath path of the .docx file to read
+     * @return extracted text after comment filtering, never null
+     * @throws IOException if the file cannot be opened or parsed
      */
     private String extractFromDocx(String filePath) throws IOException {
         log.info("提取.docx文件文本: {}", filePath);
         
         try (FileInputStream fis = new FileInputStream(filePath);
-             XWPFDocument document = new XWPFDocument(fis);
-             XWPFWordExtractor extractor = new XWPFWordExtractor(document)) {
+             XWPFDocument document = new XWPFDocument(fis)) {
             
-            String text = extractor.getText();
-            log.debug("提取到文本长度: {}", text != null ? text.length() : 0);
-            return text != null ? text : "";
+            StringBuilder textBuilder = new StringBuilder();
+            
+            // Walk top-level body elements in document order (paragraphs and tables)
+            for (org.apache.poi.xwpf.usermodel.IBodyElement element : document.getBodyElements()) {
+                if (element instanceof XWPFParagraph) {
+                    String paragraphText = ((XWPFParagraph) element).getText();
+                    if (paragraphText != null && !paragraphText.isEmpty()) {
+                        textBuilder.append(paragraphText).append("\n");
+                    }
+                } else if (element instanceof org.apache.poi.xwpf.usermodel.XWPFTable) {
+                    String tableText = ((org.apache.poi.xwpf.usermodel.XWPFTable) element).getText();
+                    if (tableText != null && !tableText.isEmpty()) {
+                        textBuilder.append(tableText);
+                        // Keep the following element on its own line
+                        if (!tableText.endsWith("\n")) {
+                            textBuilder.append("\n");
+                        }
+                    }
+                }
+            }
+            
+            String text = textBuilder.toString();
+            
+            // Filter out any comment text that may still remain
+            text = removeCommentText(text);
+            
+            log.debug("提取到文本长度: {}", text.length());
+            return text;
         }
     }
     
@@ -78,8 +103,34 @@ public class WordTextExtractionService {
              WordExtractor extractor = new WordExtractor(document)) {
             
             String text = extractor.getText();
+            
+            // 过滤批注
+            if (text != null) {
+                text = removeCommentText(text);
+            }
+            
             log.debug("提取到文本长度: {}", text != null ? text.length() : 0);
             return text != null ? text : "";
         }
     }
+    
+    /**
+     * 移除文本中的批注内容
+     */
+    private String removeCommentText(String text) {
+        if (text == null || text.isEmpty()) {
+            return text;
+        }
+        
+        String cleaned = COMMENT_PATTERN.matcher(text).replaceAll("");
+        
+        // 清理多余的空行
+        cleaned = cleaned.replaceAll("\\n{3,}", "\n\n");
+        
+        if (cleaned.length() < text.length()) {
+            log.debug("移除批注文本: 原长度={}, 新长度={}", text.length(), cleaned.length());
+        }
+        
+        return cleaned.trim();
+    }
 }

+ 45 - 8
frontend/vue-demo/src/views/Editor.vue

@@ -365,12 +365,9 @@ async function fetchTemplateData() {
     if (baseDocumentId) {
       try {
         const structuredDoc = await documentApi.getStructured(baseDocumentId)
-        // 将结构化文档的 blocks 转换为 HTML 内容
+        // 将结构化文档的 blocks 和 images 合并渲染
         if (structuredDoc && structuredDoc.blocks && structuredDoc.blocks.length > 0) {
-          // 优先使用 markedHtml(带实体标注),其次使用 html
-          documentContent.value = structuredDoc.blocks
-            .map(block => block.markedHtml || block.html || block.plainText || '')
-            .join('')
+          documentContent.value = renderStructuredDocument(structuredDoc)
         } else {
           documentContent.value = emptyPlaceholder
         }
@@ -427,6 +424,48 @@ const emptyPlaceholder = `
   </div>
 `
 
+/**
+ * 渲染结构化文档(合并 blocks 和 images)
+ * 根据 index 排序,将图片插入到正确的位置
+ */
+function renderStructuredDocument(structuredDoc) {
+  const blocks = structuredDoc.blocks || []
+  const images = structuredDoc.images || []
+  
+  // 如果没有图片,直接渲染 blocks
+  if (images.length === 0) {
+    return blocks
+      .map(block => block.markedHtml || block.html || block.plainText || '')
+      .join('')
+  }
+  
+  // 将 blocks 和 images 合并,按 index 排序
+  const allElements = [
+    ...blocks.map(block => ({
+      type: 'block',
+      index: block.index,
+      html: block.markedHtml || block.html || block.plainText || ''
+    })),
+    ...images.map(img => ({
+      type: 'image',
+      index: img.index,
+      html: `<div class="doc-image" style="text-align: center; margin: 16px 0;">
+        <img src="${img.url}" alt="${img.alt || '图片'}" 
+             style="max-width: 100%; height: auto;"
+             ${img.width ? `width="${img.width}"` : ''}
+             ${img.height ? `height="${img.height}"` : ''} />
+        ${img.alt ? `<p class="image-caption" style="color: #666; font-size: 12px; margin-top: 8px;">${img.alt}</p>` : ''}
+      </div>`
+    }))
+  ]
+  
+  // 按 index 排序
+  allElements.sort((a, b) => (a.index || 0) - (b.index || 0))
+  
+  // 合并 HTML
+  return allElements.map(el => el.html).join('')
+}
+
 // 计算属性
 const groupedVariables = computed(() => {
   const groups = {}
@@ -464,9 +503,7 @@ async function handleRegenerateBlocks() {
     // 重新加载文档内容
     const structuredDoc = await documentApi.getStructured(baseDocumentId)
     if (structuredDoc && structuredDoc.blocks && structuredDoc.blocks.length > 0) {
-      documentContent.value = structuredDoc.blocks
-        .map(block => block.markedHtml || block.html || block.plainText || '')
-        .join('')
+      documentContent.value = renderStructuredDocument(structuredDoc)
     }
   } catch (error) {
     console.error('重新生成失败:', error)