Przeglądaj źródła

feat: 添加 Word 文档结构化解析,支持图片和表格提取

新增功能:
1. WordStructuredExtractionService - 结构化提取 Word 文档
   - 按顺序提取段落、图片、表格
   - 保持原始文档排版结构
   - 提取段落样式(字体、对齐、缩进等)
   - 自动保存图片到服务器

2. DocumentElement 实体和存储
   - 支持存储段落、图片、表格等不同类型元素
   - 图片信息:URL、尺寸、格式
   - 表格信息:行列数据、合并单元格

3. 新增 API 接口
   - GET /api/v1/documents/{id}/elements - 获取文档结构化内容
   - GET /api/v1/documents/{id}/images - 获取文档图片列表
   - GET /api/v1/documents/{id}/tables - 获取文档表格列表
   - GET /api/v1/files/images/{documentId}/{imageName} - 访问图片文件
   - GET /parse/structured/{documentId} - 实时解析获取结构化内容

4. 数据库迁移
   - document_elements 表存储结构化元素
   - documents 表新增 structured_status, image_count, table_count 字段
何文松 1 miesiąc temu
rodzic
commit
6b553689a2

+ 62 - 0
backend/document-service/src/main/java/com/lingyue/document/controller/DocumentController.java

@@ -3,7 +3,9 @@ package com.lingyue.document.controller;
 import com.baomidou.mybatisplus.core.metadata.IPage;
 import com.baomidou.mybatisplus.core.metadata.IPage;
 import com.baomidou.mybatisplus.extension.plugins.pagination.Page;
 import com.baomidou.mybatisplus.extension.plugins.pagination.Page;
 import com.lingyue.document.entity.Document;
 import com.lingyue.document.entity.Document;
+import com.lingyue.document.entity.DocumentElement;
 import com.lingyue.document.service.DocumentService;
 import com.lingyue.document.service.DocumentService;
+import com.lingyue.document.service.DocumentElementService;
 import com.lingyue.common.domain.AjaxResult;
 import com.lingyue.common.domain.AjaxResult;
 import io.swagger.v3.oas.annotations.Operation;
 import io.swagger.v3.oas.annotations.Operation;
 import io.swagger.v3.oas.annotations.Parameter;
 import io.swagger.v3.oas.annotations.Parameter;
@@ -13,6 +15,9 @@ import lombok.RequiredArgsConstructor;
 import lombok.extern.slf4j.Slf4j;
 import lombok.extern.slf4j.Slf4j;
 import org.springframework.web.bind.annotation.*;
 import org.springframework.web.bind.annotation.*;
 
 
+import java.util.List;
+import java.util.Map;
+
 /**
 /**
  * 文档控制器
  * 文档控制器
  * 提供文档的 CRUD 操作
  * 提供文档的 CRUD 操作
@@ -28,6 +33,7 @@ import org.springframework.web.bind.annotation.*;
 public class DocumentController {
 public class DocumentController {
     
     
     private final DocumentService documentService;
     private final DocumentService documentService;
+    private final DocumentElementService documentElementService;
     
     
     /**
     /**
      * 获取文档列表(分页)
      * 获取文档列表(分页)
@@ -142,6 +148,55 @@ public class DocumentController {
         }
         }
     }
     }
     
     
+    /**
+     * Returns the structured content of a document: paragraphs, images and tables,
+     * in the order reported by the element service (sorted by element index).
+     *
+     * @param documentId ID of the document whose elements are requested
+     * @return a success result wrapping a {@link DocumentElementsResponse}, or an error
+     *         result when no elements are stored for the document
+     */
+    @GetMapping("/{documentId}/elements")
+    @Operation(summary = "获取文档结构化内容", description = "获取文档的段落、图片、表格等结构化元素,保持原始排版顺序")
+    public AjaxResult<?> getDocumentElements(
+            @Parameter(description = "文档ID", required = true)
+            @PathVariable String documentId) {
+
+        final List<DocumentElement> found = documentElementService.getElementsByDocumentId(documentId);
+        if (!found.isEmpty()) {
+            // Statistics are only queried once we know there is something to report.
+            DocumentElementsResponse payload = new DocumentElementsResponse();
+            payload.setDocumentId(documentId);
+            payload.setElements(found);
+            payload.setStats(documentElementService.getElementStats(documentId));
+            return AjaxResult.success(payload);
+        }
+
+        // An empty list means either "not parsed yet" or "parsed but empty".
+        return AjaxResult.error("文档尚未进行结构化解析或无内容");
+    }
+    
+    /**
+     * Lists every image element stored for the given document.
+     *
+     * @param documentId ID of the document
+     * @return a success result wrapping the image elements (possibly an empty list)
+     */
+    @GetMapping("/{documentId}/images")
+    @Operation(summary = "获取文档图片列表", description = "获取文档中所有图片的信息和URL")
+    public AjaxResult<?> getDocumentImages(
+            @Parameter(description = "文档ID", required = true)
+            @PathVariable String documentId) {
+
+        return AjaxResult.success(documentElementService.getImagesByDocumentId(documentId));
+    }
+    
+    /**
+     * Lists every table element stored for the given document.
+     *
+     * @param documentId ID of the document
+     * @return a success result wrapping the table elements (possibly an empty list)
+     */
+    @GetMapping("/{documentId}/tables")
+    @Operation(summary = "获取文档表格列表", description = "获取文档中所有表格的数据")
+    public AjaxResult<?> getDocumentTables(
+            @Parameter(description = "文档ID", required = true)
+            @PathVariable String documentId) {
+
+        return AjaxResult.success(documentElementService.getTablesByDocumentId(documentId));
+    }
+    
     // ==================== 响应 DTO ====================
     // ==================== 响应 DTO ====================
     
     
     @Data
     @Data
@@ -160,4 +215,11 @@ public class DocumentController {
         private java.util.Date startedAt;
         private java.util.Date startedAt;
         private java.util.Date completedAt;
         private java.util.Date completedAt;
     }
     }
+    
+    /**
+     * Response body of the structured-content endpoint: the document ID, its
+     * elements, and the statistics map produced by the element service.
+     */
+    @Data
+    public static class DocumentElementsResponse {
+        private String documentId;
+        private List<DocumentElement> elements;
+        private Map<String, Object> stats;
+    }
 }
 }

+ 112 - 0
backend/document-service/src/main/java/com/lingyue/document/entity/DocumentElement.java

@@ -0,0 +1,112 @@
+package com.lingyue.document.entity;
+
+import com.baomidou.mybatisplus.annotation.TableField;
+import com.baomidou.mybatisplus.annotation.TableName;
+import com.baomidou.mybatisplus.extension.handlers.JacksonTypeHandler;
+import com.lingyue.common.domain.entity.SimpleModel;
+import lombok.Data;
+import lombok.EqualsAndHashCode;
+
+import java.util.List;
+import java.util.Map;
+
+/**
+ * Structured document element entity.
+ * Stores the paragraphs, images, tables, etc. extracted from Word/PDF files.
+ * Mapped to the {@code document_elements} table; the JSON-valued fields use
+ * {@link JacksonTypeHandler}.
+ *
+ * @author lingyue
+ * @since 2026-01-21
+ */
+@Data
+@EqualsAndHashCode(callSuper = true)
+@TableName(value = "document_elements", autoResultMap = true)
+public class DocumentElement extends SimpleModel {
+
+    /**
+     * Document ID.
+     */
+    private String documentId;
+
+    /**
+     * Sequential index of the element within the document.
+     */
+    private Integer elementIndex;
+
+    /**
+     * Element type: paragraph/heading/heading1-9/list_item/image/table/title/toc
+     */
+    private String elementType;
+
+    /**
+     * Text content (for text-type elements).
+     */
+    private String content;
+
+    /**
+     * Style information (JSON).
+     * {alignment, fontSize, fontFamily, bold, italic, underline, color, indentLeft, indentFirstLine, lineSpacing}
+     */
+    @TableField(typeHandler = JacksonTypeHandler.class)
+    private Map<String, Object> style;
+
+    // ========== Image fields ==========
+
+    /**
+     * URL under which the image can be accessed.
+     */
+    private String imageUrl;
+
+    /**
+     * Storage path of the image.
+     */
+    private String imagePath;
+
+    /**
+     * Image description / alternative text.
+     */
+    private String imageAlt;
+
+    /**
+     * Image width (pixels).
+     */
+    private Integer imageWidth;
+
+    /**
+     * Image height (pixels).
+     */
+    private Integer imageHeight;
+
+    /**
+     * Image format: png/jpg/gif/etc
+     */
+    private String imageFormat;
+
+    // ========== Table fields ==========
+
+    /**
+     * Table index.
+     */
+    private Integer tableIndex;
+
+    /**
+     * Table data (JSON).
+     * [[{row, col, text, colSpan, rowSpan, merged}, ...], ...]
+     */
+    @TableField(typeHandler = JacksonTypeHandler.class)
+    private List<List<Map<String, Object>>> tableData;
+
+    /**
+     * Number of table rows.
+     */
+    private Integer tableRowCount;
+
+    /**
+     * Number of table columns.
+     */
+    private Integer tableColCount;
+
+    /**
+     * Table text (used for searching).
+     */
+    private String tableText;
+}

+ 60 - 0
backend/document-service/src/main/java/com/lingyue/document/repository/DocumentElementRepository.java

@@ -0,0 +1,60 @@
+package com.lingyue.document.repository;
+
+import com.baomidou.mybatisplus.core.mapper.BaseMapper;
+import com.lingyue.document.entity.DocumentElement;
+import org.apache.ibatis.annotations.*;
+
+import java.util.List;
+
+/**
+ * MyBatis mapper for {@link DocumentElement} rows in the {@code document_elements} table.
+ *
+ * <p>The entity declares {@code autoResultMap = true} together with
+ * {@code JacksonTypeHandler} fields ({@code style}, {@code tableData}). MyBatis-Plus only
+ * applies that generated result map to its built-in methods; hand-written {@code @Select}
+ * statements must reference it explicitly, otherwise the JSON columns are not
+ * deserialized. Every query returning entities therefore names the generated map
+ * ({@code mybatis-plus_} + entity simple name) via {@code @ResultMap}.
+ *
+ * @author lingyue
+ * @since 2026-01-21
+ */
+@Mapper
+public interface DocumentElementRepository extends BaseMapper<DocumentElement> {
+
+    /**
+     * Finds all elements of a document, ordered by element index.
+     *
+     * @param documentId document ID
+     * @return the document's elements in document order
+     */
+    @Select("SELECT * FROM document_elements WHERE document_id = #{documentId} ORDER BY element_index")
+    @ResultMap("mybatis-plus_DocumentElement")
+    List<DocumentElement> findByDocumentId(@Param("documentId") String documentId);
+
+    /**
+     * Finds the elements of a document that have the given type, ordered by element index.
+     *
+     * @param documentId  document ID
+     * @param elementType element type to match exactly
+     * @return matching elements in document order
+     */
+    @Select("SELECT * FROM document_elements WHERE document_id = #{documentId} AND element_type = #{elementType} ORDER BY element_index")
+    @ResultMap("mybatis-plus_DocumentElement")
+    List<DocumentElement> findByDocumentIdAndType(@Param("documentId") String documentId,
+                                                   @Param("elementType") String elementType);
+
+    /**
+     * Finds all image elements ({@code element_type = 'image'}) of a document.
+     *
+     * @param documentId document ID
+     * @return image elements in document order
+     */
+    @Select("SELECT * FROM document_elements WHERE document_id = #{documentId} AND element_type = 'image' ORDER BY element_index")
+    @ResultMap("mybatis-plus_DocumentElement")
+    List<DocumentElement> findImagesByDocumentId(@Param("documentId") String documentId);
+
+    /**
+     * Finds all table elements ({@code element_type = 'table'}) of a document.
+     *
+     * @param documentId document ID
+     * @return table elements in document order
+     */
+    @Select("SELECT * FROM document_elements WHERE document_id = #{documentId} AND element_type = 'table' ORDER BY element_index")
+    @ResultMap("mybatis-plus_DocumentElement")
+    List<DocumentElement> findTablesByDocumentId(@Param("documentId") String documentId);
+
+    /**
+     * Deletes every element belonging to a document.
+     *
+     * @param documentId document ID
+     * @return number of rows deleted
+     */
+    @Delete("DELETE FROM document_elements WHERE document_id = #{documentId}")
+    int deleteByDocumentId(@Param("documentId") String documentId);
+
+    /**
+     * Counts the elements of a document.
+     *
+     * @param documentId document ID
+     * @return total number of elements
+     */
+    @Select("SELECT COUNT(*) FROM document_elements WHERE document_id = #{documentId}")
+    int countByDocumentId(@Param("documentId") String documentId);
+
+    /**
+     * Counts the elements of a document grouped by type.
+     *
+     * @param documentId document ID
+     * @return one map per type with the columns {@code element_type} and {@code count}
+     */
+    @Select("SELECT element_type, COUNT(*) as count FROM document_elements WHERE document_id = #{documentId} GROUP BY element_type")
+    List<java.util.Map<String, Object>> countByDocumentIdGroupByType(@Param("documentId") String documentId);
+}

+ 140 - 0
backend/document-service/src/main/java/com/lingyue/document/service/DocumentElementService.java

@@ -0,0 +1,140 @@
+package com.lingyue.document.service;
+
+import com.lingyue.document.entity.DocumentElement;
+import com.lingyue.document.repository.DocumentElementRepository;
+import lombok.RequiredArgsConstructor;
+import lombok.extern.slf4j.Slf4j;
+import org.springframework.stereotype.Service;
+import org.springframework.transaction.annotation.Transactional;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * Document element service.
+ * Manages the structured content of a document (paragraphs, images, tables).
+ *
+ * @author lingyue
+ * @since 2026-01-21
+ */
+@Slf4j
+@Service
+@RequiredArgsConstructor
+public class DocumentElementService {
+
+    private final DocumentElementRepository elementRepository;
+
+    /**
+     * Returns all structured elements of a document, in document order.
+     *
+     * @param documentId document ID
+     * @return the elements as returned by the repository
+     */
+    public List<DocumentElement> getElementsByDocumentId(String documentId) {
+        return elementRepository.findByDocumentId(documentId);
+    }
+
+    /**
+     * Returns all image elements of a document.
+     *
+     * @param documentId document ID
+     * @return the image elements as returned by the repository
+     */
+    public List<DocumentElement> getImagesByDocumentId(String documentId) {
+        return elementRepository.findImagesByDocumentId(documentId);
+    }
+
+    /**
+     * Returns all table elements of a document.
+     *
+     * @param documentId document ID
+     * @return the table elements as returned by the repository
+     */
+    public List<DocumentElement> getTablesByDocumentId(String documentId) {
+        return elementRepository.findTablesByDocumentId(documentId);
+    }
+
+    /**
+     * Replaces the stored structured elements of a document.
+     * Existing rows are deleted first, then each element is inserted individually,
+     * all within one transaction.
+     *
+     * @param documentId document ID
+     * @param elements   element list (from the parse service)
+     */
+    @Transactional
+    public void saveElements(String documentId, List<Map<String, Object>> elements) {
+        log.info("保存文档结构化元素: documentId={}, count={}", documentId, elements.size());
+
+        // Remove the previous extraction result before writing the new one.
+        elementRepository.deleteByDocumentId(documentId);
+
+        for (Map<String, Object> element : elements) {
+            DocumentElement entity = convertToEntity(documentId, element);
+            elementRepository.insert(entity);
+        }
+
+        log.info("文档结构化元素保存完成: documentId={}", documentId);
+    }
+
+    /**
+     * Converts one parsed element map into an entity.
+     * Numeric values are accepted as any {@link Number} because the concrete boxed type
+     * of a deserialized map value (Integer, Long, Double, ...) depends on the producer.
+     *
+     * @param documentId document ID to assign
+     * @param element    parsed element map
+     * @return a new entity with a freshly generated ID
+     */
+    private DocumentElement convertToEntity(String documentId, Map<String, Object> element) {
+        DocumentElement entity = new DocumentElement();
+        entity.setId(java.util.UUID.randomUUID().toString().replace("-", ""));
+        entity.setDocumentId(documentId);
+        entity.setElementIndex(toInteger(element.get("index")));
+        entity.setElementType((String) element.get("type"));
+        entity.setContent((String) element.get("content"));
+
+        // Style
+        @SuppressWarnings("unchecked")
+        Map<String, Object> style = (Map<String, Object>) element.get("style");
+        entity.setStyle(style);
+
+        // Image fields
+        entity.setImageUrl((String) element.get("imageUrl"));
+        entity.setImagePath((String) element.get("imagePath"));
+        entity.setImageAlt((String) element.get("imageAlt"));
+        entity.setImageWidth(toInteger(element.get("imageWidth")));
+        entity.setImageHeight(toInteger(element.get("imageHeight")));
+        entity.setImageFormat((String) element.get("imageFormat"));
+
+        // Table fields
+        entity.setTableIndex(toInteger(element.get("tableIndex")));
+        entity.setTableRowCount(toInteger(element.get("tableRowCount")));
+        entity.setTableColCount(toInteger(element.get("tableColCount")));
+        entity.setTableText((String) element.get("tableText"));
+
+        // Table data
+        @SuppressWarnings("unchecked")
+        List<List<Map<String, Object>>> tableRows = (List<List<Map<String, Object>>>) element.get("tableRows");
+        if (tableRows != null) {
+            entity.setTableData(tableRows);
+        }
+
+        return entity;
+    }
+
+    /**
+     * Converts a loosely typed numeric value to {@code Integer}.
+     *
+     * @param value {@code null} or any {@link Number}
+     * @return {@code null} for {@code null}, otherwise the value narrowed to an int
+     * @throws IllegalArgumentException if the value is not numeric
+     */
+    private static Integer toInteger(Object value) {
+        if (value == null) {
+            return null;
+        }
+        if (value instanceof Number) {
+            return ((Number) value).intValue();
+        }
+        throw new IllegalArgumentException(
+                "Expected a numeric value but got " + value.getClass().getName() + ": " + value);
+    }
+
+    /**
+     * Deletes every element of a document.
+     *
+     * @param documentId document ID
+     */
+    @Transactional
+    public void deleteByDocumentId(String documentId) {
+        elementRepository.deleteByDocumentId(documentId);
+    }
+
+    /**
+     * Builds element statistics for a document.
+     *
+     * @param documentId document ID
+     * @return a map with {@code total} (int) and {@code byType} (type name to count)
+     */
+    public Map<String, Object> getElementStats(String documentId) {
+        Map<String, Object> stats = new HashMap<>();
+
+        int total = elementRepository.countByDocumentId(documentId);
+        stats.put("total", total);
+
+        List<Map<String, Object>> typeStats = elementRepository.countByDocumentIdGroupByType(documentId);
+        Map<String, Integer> byType = new HashMap<>();
+        for (Map<String, Object> typeStat : typeStats) {
+            String type = (String) typeStat.get("element_type");
+            // COUNT(*) comes back as Long, Integer or BigInteger/BigDecimal depending on the
+            // JDBC driver, so read it through Number instead of casting to Long.
+            Number count = (Number) typeStat.get("count");
+            byType.put(type, count.intValue());
+        }
+        stats.put("byType", byType);
+
+        return stats;
+    }
+}

+ 109 - 0
backend/parse-service/src/main/java/com/lingyue/parse/controller/ImageController.java

@@ -0,0 +1,109 @@
+package com.lingyue.parse.controller;
+
+import io.swagger.v3.oas.annotations.Operation;
+import io.swagger.v3.oas.annotations.Parameter;
+import io.swagger.v3.oas.annotations.tags.Tag;
+import lombok.extern.slf4j.Slf4j;
+import org.springframework.beans.factory.annotation.Value;
+import org.springframework.core.io.FileSystemResource;
+import org.springframework.core.io.Resource;
+import org.springframework.http.HttpHeaders;
+import org.springframework.http.MediaType;
+import org.springframework.http.ResponseEntity;
+import org.springframework.web.bind.annotation.*;
+
+import java.io.File;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+
+/**
+ * 图片访问控制器
+ * 
+ * @author lingyue
+ * @since 2026-01-21
+ */
+@Slf4j
+@RestController
+@RequestMapping("/api/v1/files")
+@Tag(name = "文件访问", description = "图片和文件访问接口")
+public class ImageController {
+    
+    @Value("${file.storage.base-path:/data/lingyue}")
+    private String basePath;
+    
+    /**
+     * 获取文档中的图片
+     */
+    @GetMapping("/images/{documentId}/{imageName}")
+    @Operation(summary = "获取文档图片", description = "根据文档ID和图片名称获取图片")
+    public ResponseEntity<Resource> getImage(
+            @Parameter(description = "文档ID", required = true)
+            @PathVariable String documentId,
+            @Parameter(description = "图片文件名", required = true)
+            @PathVariable String imageName) {
+        
+        try {
+            // 安全检查:防止路径遍历攻击
+            if (documentId.contains("..") || imageName.contains("..")) {
+                log.warn("非法的路径访问尝试: documentId={}, imageName={}", documentId, imageName);
+                return ResponseEntity.badRequest().build();
+            }
+            
+            Path imagePath = Paths.get(basePath, "images", documentId, imageName);
+            File imageFile = imagePath.toFile();
+            
+            if (!imageFile.exists()) {
+                log.warn("图片不存在: {}", imagePath);
+                return ResponseEntity.notFound().build();
+            }
+            
+            // 检测内容类型
+            String contentType = Files.probeContentType(imagePath);
+            if (contentType == null) {
+                // 根据扩展名推断
+                String ext = imageName.substring(imageName.lastIndexOf('.') + 1).toLowerCase();
+                switch (ext) {
+                    case "png":
+                        contentType = "image/png";
+                        break;
+                    case "jpg":
+                    case "jpeg":
+                        contentType = "image/jpeg";
+                        break;
+                    case "gif":
+                        contentType = "image/gif";
+                        break;
+                    case "bmp":
+                        contentType = "image/bmp";
+                        break;
+                    case "webp":
+                        contentType = "image/webp";
+                        break;
+                    case "svg":
+                        contentType = "image/svg+xml";
+                        break;
+                    case "emf":
+                        contentType = "image/x-emf";
+                        break;
+                    case "wmf":
+                        contentType = "image/x-wmf";
+                        break;
+                    default:
+                        contentType = "application/octet-stream";
+                }
+            }
+            
+            Resource resource = new FileSystemResource(imageFile);
+            
+            return ResponseEntity.ok()
+                    .contentType(MediaType.parseMediaType(contentType))
+                    .header(HttpHeaders.CACHE_CONTROL, "max-age=86400") // 缓存1天
+                    .body(resource);
+            
+        } catch (Exception e) {
+            log.error("获取图片失败: documentId={}, imageName={}", documentId, imageName, e);
+            return ResponseEntity.internalServerError().build();
+        }
+    }
+}

+ 19 - 0
backend/parse-service/src/main/java/com/lingyue/parse/controller/ParseController.java

@@ -3,6 +3,7 @@ package com.lingyue.parse.controller;
 import com.lingyue.common.domain.AjaxResult;
 import com.lingyue.common.domain.AjaxResult;
 import com.lingyue.parse.service.ParseTaskCenterService;
 import com.lingyue.parse.service.ParseTaskCenterService;
 import com.lingyue.parse.service.ParseTaskExecutor;
 import com.lingyue.parse.service.ParseTaskExecutor;
+import com.lingyue.parse.service.WordStructuredExtractionService;
 import com.lingyue.parse.vo.ParseTaskCenterVO;
 import com.lingyue.parse.vo.ParseTaskCenterVO;
 import io.swagger.v3.oas.annotations.Operation;
 import io.swagger.v3.oas.annotations.Operation;
 import io.swagger.v3.oas.annotations.Parameter;
 import io.swagger.v3.oas.annotations.Parameter;
@@ -26,6 +27,7 @@ public class ParseController {
 
 
     private final ParseTaskExecutor parseTaskExecutor;
     private final ParseTaskExecutor parseTaskExecutor;
     private final ParseTaskCenterService taskCenterService;
     private final ParseTaskCenterService taskCenterService;
+    private final WordStructuredExtractionService wordStructuredExtractionService;
 
 
     /**
     /**
      * 启动解析
      * 启动解析
@@ -57,5 +59,22 @@ public class ParseController {
         }
         }
         return AjaxResult.success(detail);
         return AjaxResult.success(detail);
     }
     }
+    
+    /**
+     * Extracts the structured content of a Word document on demand.
+     * The result carries the paragraphs, images and tables produced by
+     * {@code WordStructuredExtractionService#extractStructured}.
+     *
+     * <p>NOTE(review): {@code filePath} is taken from the request and handed to the
+     * extraction service without validation in this method — confirm that access to
+     * this endpoint is restricted or that the path is checked elsewhere.
+     *
+     * @param documentId document ID
+     * @param filePath   path of the Word file to parse
+     * @return a success result wrapping the extraction result
+     */
+    @GetMapping("/structured/{documentId}")
+    @Operation(summary = "获取Word文档结构化内容", description = "提取Word文档的段落、图片、表格,保持原始排版结构")
+    public AjaxResult<?> getStructuredContent(
+            @Parameter(description = "文档ID", required = true)
+            @PathVariable String documentId,
+            @Parameter(description = "Word文件路径", required = true)
+            @RequestParam("filePath") String filePath) {
+
+        return AjaxResult.success(wordStructuredExtractionService.extractStructured(filePath, documentId));
+    }
 }
 }
 
 

+ 414 - 0
backend/parse-service/src/main/java/com/lingyue/parse/service/WordStructuredExtractionService.java

@@ -0,0 +1,414 @@
+package com.lingyue.parse.service;
+
+import com.lingyue.common.exception.ServiceException;
+import lombok.Data;
+import lombok.RequiredArgsConstructor;
+import lombok.extern.slf4j.Slf4j;
+import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
+import org.apache.poi.util.Units;
+import org.apache.poi.xwpf.usermodel.*;
+import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTDrawing;
+import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTR;
+import org.springframework.beans.factory.annotation.Value;
+import org.springframework.stereotype.Service;
+
+import java.io.*;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.*;
+
+/**
+ * Word 文档结构化提取服务
+ * 按顺序提取段落、图片、表格,保持原始排版结构
+ * 
+ * @author lingyue
+ * @since 2026-01-21
+ */
+@Slf4j
+@Service
+@RequiredArgsConstructor
+public class WordStructuredExtractionService {
+    
+    @Value("${file.storage.base-path:/data/lingyue}")
+    private String basePath;
+    
+    /**
+     * 提取 Word 文档结构化内容
+     * 
+     * @param wordFilePath Word 文件路径
+     * @param documentId   文档 ID(用于存储图片)
+     * @return 结构化内容列表
+     */
+    public WordStructuredResult extractStructured(String wordFilePath, String documentId) {
+        File wordFile = new File(wordFilePath);
+        if (!wordFile.exists()) {
+            throw new ServiceException("Word文件不存在: " + wordFilePath);
+        }
+        
+        String fileName = wordFile.getName().toLowerCase();
+        if (!fileName.endsWith(".docx")) {
+            throw new ServiceException("仅支持 .docx 格式的结构化提取: " + fileName);
+        }
+        
+        try {
+            return extractFromDocx(wordFilePath, documentId);
+        } catch (IOException e) {
+            log.error("提取Word文档结构化内容失败: {}", wordFilePath, e);
+            throw new ServiceException("提取Word文档结构化内容失败: " + e.getMessage());
+        }
+    }
+    
+    /**
+     * Extracts structured content from a .docx file.
+     * Body elements are visited in document order. For each paragraph, its embedded
+     * pictures are emitted first, followed by the paragraph text (if non-blank).
+     * Tables are emitted as single elements.
+     *
+     * <p>Element and image indices are only consumed by elements that are actually
+     * emitted, so a picture that fails to extract leaves no gap in the element order
+     * and is not counted in {@code imageCount}.
+     *
+     * @param filePath   path of the .docx file
+     * @param documentId document ID; images are stored under {@code <basePath>/images/<documentId>}
+     * @return the extraction result with elements, full text and counters
+     * @throws IOException if the image directory cannot be created or the file cannot be read
+     */
+    private WordStructuredResult extractFromDocx(String filePath, String documentId) throws IOException {
+        log.info("开始提取 .docx 结构化内容: {}", filePath);
+
+        WordStructuredResult result = new WordStructuredResult();
+        result.setDocumentId(documentId);
+        List<ContentElement> elements = new ArrayList<>();
+        StringBuilder fullText = new StringBuilder();
+
+        // Create the image storage directory.
+        String imageDir = basePath + "/images/" + documentId;
+        Files.createDirectories(Paths.get(imageDir));
+
+        try (FileInputStream fis = new FileInputStream(filePath);
+             XWPFDocument document = new XWPFDocument(fis)) {
+
+            int elementIndex = 0;
+            int imageIndex = 0;
+            int tableIndex = 0;
+
+            // Walk all body elements, preserving their order.
+            for (IBodyElement bodyElement : document.getBodyElements()) {
+                if (bodyElement instanceof XWPFParagraph) {
+                    XWPFParagraph paragraph = (XWPFParagraph) bodyElement;
+
+                    // Pictures embedded in the paragraph's runs.
+                    for (XWPFRun run : paragraph.getRuns()) {
+                        for (XWPFPicture picture : run.getEmbeddedPictures()) {
+                            ContentElement imgElement =
+                                    extractImage(picture, imageDir, documentId, imageIndex, elementIndex);
+                            if (imgElement != null) {
+                                // Advance the counters only for pictures that were extracted.
+                                imageIndex++;
+                                elementIndex++;
+                                elements.add(imgElement);
+                                fullText.append("[图片: ").append(imgElement.getImageAlt()).append("]\n");
+                            }
+                        }
+                    }
+
+                    // Paragraph text.
+                    String text = paragraph.getText();
+                    if (text != null && !text.trim().isEmpty()) {
+                        ContentElement textElement = new ContentElement();
+                        textElement.setIndex(elementIndex++);
+                        textElement.setType(detectParagraphType(paragraph, text));
+                        textElement.setContent(text.trim());
+                        textElement.setStyle(extractParagraphStyle(paragraph));
+
+                        elements.add(textElement);
+                        fullText.append(text).append("\n");
+                    }
+
+                } else if (bodyElement instanceof XWPFTable) {
+                    XWPFTable table = (XWPFTable) bodyElement;
+
+                    ContentElement tableElement = extractTable(table, tableIndex, elementIndex);
+                    tableIndex++;
+                    elementIndex++;
+                    elements.add(tableElement);
+                    // tableIndex was just incremented, so the label is 1-based.
+                    fullText.append("[表格 ").append(tableIndex).append("]\n");
+                    fullText.append(tableElement.getTableText()).append("\n");
+                }
+            }
+
+            result.setElements(elements);
+            result.setFullText(fullText.toString());
+            result.setImageCount(imageIndex);
+            result.setTableCount(tableIndex);
+            result.setTotalElements(elements.size());
+
+            log.info("结构化提取完成: elements={}, images={}, tables={}",
+                    elements.size(), imageIndex, tableIndex);
+        }
+
+        return result;
+    }
+    
+    /**
+     * Extracts one embedded picture: writes its bytes to the image directory and
+     * describes it as a content element.
+     *
+     * @param picture      the embedded picture
+     * @param imageDir     directory the image file is written to
+     * @param documentId   document ID, used to build the image URL
+     * @param imageIndex   zero-based image number, used in the file name
+     * @param elementIndex index assigned to the resulting element
+     * @return the image element, or {@code null} when the picture has no data or
+     *         extraction fails (failures are logged, not rethrown)
+     */
+    private ContentElement extractImage(XWPFPicture picture, String imageDir,
+                                        String documentId, int imageIndex, int elementIndex) {
+        try {
+            XWPFPictureData pictureData = picture.getPictureData();
+            if (pictureData == null) {
+                return null;
+            }
+
+            // Fall back to a neutral extension so the name never ends with a bare dot.
+            String extension = pictureData.suggestFileExtension();
+            if (extension == null || extension.isEmpty()) {
+                extension = "bin";
+            }
+            String imageName = String.format("image_%03d.%s", imageIndex, extension);
+            String imagePath = imageDir + "/" + imageName;
+
+            // Save the image.
+            try (FileOutputStream fos = new FileOutputStream(imagePath)) {
+                fos.write(pictureData.getData());
+            }
+
+            // Image dimensions; they stay 0 (reported as null) when unavailable.
+            int width = 0;
+            int height = 0;
+            try {
+                // Extents are stored in EMU; convert to pixels.
+                width = (int) (picture.getCTPicture().getSpPr().getXfrm().getExt().getCx() / Units.EMU_PER_PIXEL);
+                height = (int) (picture.getCTPicture().getSpPr().getXfrm().getExt().getCy() / Units.EMU_PER_PIXEL);
+            } catch (Exception e) {
+                log.debug("无法获取图片尺寸: {}", e.getMessage());
+            }
+
+            // Use the picture description as alt text unless it is missing or blank.
+            String description = picture.getDescription();
+            String altText = (description != null && !description.trim().isEmpty())
+                    ? description
+                    : "图片 " + (imageIndex + 1);
+
+            ContentElement element = new ContentElement();
+            element.setIndex(elementIndex);
+            element.setType("image");
+            element.setContent(null);
+            element.setImageUrl("/api/v1/files/images/" + documentId + "/" + imageName);
+            element.setImagePath(imagePath);
+            element.setImageAlt(altText);
+            element.setImageWidth(width > 0 ? width : null);
+            element.setImageHeight(height > 0 ? height : null);
+            element.setImageFormat(extension);
+
+            log.debug("提取图片: index={}, path={}, size={}x{}", imageIndex, imagePath, width, height);
+            return element;
+
+        } catch (Exception e) {
+            // Best effort: a broken picture must not abort the whole document, but keep
+            // the stack trace so the failure can be diagnosed.
+            log.warn("提取图片失败: {}", e.getMessage(), e);
+            return null;
+        }
+    }
+    
+    /**
+     * 提取表格
+     */
+    private ContentElement extractTable(XWPFTable table, int tableIndex, int elementIndex) {
+        ContentElement element = new ContentElement();
+        element.setIndex(elementIndex);
+        element.setType("table");
+        element.setTableIndex(tableIndex);
+        
+        List<List<TableCell>> rows = new ArrayList<>();
+        StringBuilder tableText = new StringBuilder();
+        
+        int rowCount = table.getNumberOfRows();
+        int colCount = 0;
+        
+        for (int i = 0; i < rowCount; i++) {
+            XWPFTableRow row = table.getRow(i);
+            List<XWPFTableCell> cells = row.getTableCells();
+            List<TableCell> rowData = new ArrayList<>();
+            
+            colCount = Math.max(colCount, cells.size());
+            
+            for (int j = 0; j < cells.size(); j++) {
+                XWPFTableCell cell = cells.get(j);
+                TableCell cellData = new TableCell();
+                cellData.setRow(i);
+                cellData.setCol(j);
+                cellData.setText(cell.getText());
+                
+                // 提取单元格样式
+                try {
+                    if (cell.getCTTc().getTcPr() != null) {
+                        if (cell.getCTTc().getTcPr().getGridSpan() != null) {
+                            cellData.setColSpan(cell.getCTTc().getTcPr().getGridSpan().getVal().intValue());
+                        }
+                        if (cell.getCTTc().getTcPr().getVMerge() != null) {
+                            cellData.setMerged(true);
+                        }
+                    }
+                } catch (Exception e) {
+                    // 忽略样式提取错误
+                }
+                
+                rowData.add(cellData);
+                tableText.append(cell.getText());
+                if (j < cells.size() - 1) {
+                    tableText.append("\t");
+                }
+            }
+            
+            rows.add(rowData);
+            tableText.append("\n");
+        }
+        
+        element.setTableRows(rows);
+        element.setTableRowCount(rowCount);
+        element.setTableColCount(colCount);
+        element.setTableText(tableText.toString().trim());
+        
+        log.debug("提取表格: index={}, rows={}, cols={}", tableIndex, rowCount, colCount);
+        return element;
+    }
+    
+    /**
+     * Classifies a paragraph as heading, title, toc, heading1-9, list_item or paragraph.
+     * Checks run in this order: style identifier, outline level, heading-like text
+     * patterns (only for text shorter than 100 characters), list-like text patterns.
+     *
+     * @param paragraph the paragraph to classify
+     * @param text      the paragraph text
+     * @return the detected element type
+     */
+    private String detectParagraphType(XWPFParagraph paragraph, String text) {
+        // Style identifier of the paragraph.
+        // NOTE(review): getStyle() presumably returns the style ID rather than its display
+        // name, so localized templates may not match these substrings — confirm.
+        String styleName = paragraph.getStyle();
+        if (styleName != null) {
+            String lowerStyle = styleName.toLowerCase();
+            if (lowerStyle.contains("heading") || lowerStyle.contains("标题")) {
+                return "heading";
+            }
+            if (lowerStyle.contains("title")) {
+                return "title";
+            }
+            if (lowerStyle.contains("toc")) {
+                return "toc";
+            }
+        }
+
+        // Outline level: values 0-8 map to heading1-heading9. Value 9 means
+        // "no outline level" (body text) and must not be reported as "heading10".
+        if (paragraph.getCTP().getPPr() != null &&
+            paragraph.getCTP().getPPr().getOutlineLvl() != null) {
+            int level = paragraph.getCTP().getPPr().getOutlineLvl().getVal().intValue();
+            if (level >= 0 && level <= 8) {
+                return "heading" + (level + 1);
+            }
+        }
+
+        // Content-based heading detection, limited to short lines.
+        String trimmed = text.trim();
+        if (trimmed.length() < 100) {
+            if (trimmed.matches("^[一二三四五六七八九十]+[、.].*") ||
+                trimmed.matches("^第[一二三四五六七八九十]+[章节部分].*") ||
+                trimmed.matches("^\\d+\\.\\d*\\s+.*")) {
+                return "heading";
+            }
+        }
+
+        // List item detection: bullet characters, numbered items, lettered items.
+        if (trimmed.matches("^[•·\\-\\*]\\s+.*") ||
+            trimmed.matches("^\\d+[.、)]\\s+.*") ||
+            trimmed.matches("^[a-zA-Z][.)]\\s+.*")) {
+            return "list_item";
+        }
+
+        return "paragraph";
+    }
+    
+    /**
+     * Collects the visible style attributes of a paragraph into a map.
+     *
+     * <p>Possible keys: {@code alignment}, {@code indentLeft}, {@code indentFirstLine},
+     * {@code lineSpacing}, and, taken from the first run only, {@code fontFamily},
+     * {@code fontSize}, {@code bold}, {@code italic}, {@code underline}, {@code color}.
+     * Extraction is best-effort: if reading an attribute throws, the attributes gathered
+     * so far are still returned.
+     *
+     * @param paragraph paragraph to inspect
+     * @return the collected style attributes, or {@code null} when none were found
+     */
+    private Map<String, Object> extractParagraphStyle(XWPFParagraph paragraph) {
+        Map<String, Object> style = new HashMap<>();
+
+        try {
+            // Alignment. Locale.ROOT keeps the stored value stable regardless of the JVM
+            // default locale (e.g. "RIGHT" would become "rıght" under a Turkish locale).
+            ParagraphAlignment alignment = paragraph.getAlignment();
+            if (alignment != null) {
+                style.put("alignment", alignment.name().toLowerCase(java.util.Locale.ROOT));
+            }
+
+            // Indentation; only positive values are recorded.
+            int indentLeft = paragraph.getIndentationLeft();
+            if (indentLeft > 0) {
+                style.put("indentLeft", indentLeft);
+            }
+            int indentFirstLine = paragraph.getIndentationFirstLine();
+            if (indentFirstLine > 0) {
+                style.put("indentFirstLine", indentFirstLine);
+            }
+
+            // Line spacing.
+            if (paragraph.getSpacingBetween() > 0) {
+                style.put("lineSpacing", paragraph.getSpacingBetween());
+            }
+
+            // Font attributes are sampled from the first run only.
+            List<XWPFRun> runs = paragraph.getRuns();
+            if (!runs.isEmpty()) {
+                XWPFRun firstRun = runs.get(0);
+                if (firstRun.getFontFamily() != null) {
+                    style.put("fontFamily", firstRun.getFontFamily());
+                }
+                // Read once instead of three times.
+                Double fontSize = firstRun.getFontSizeAsDouble();
+                if (fontSize != null && fontSize > 0) {
+                    style.put("fontSize", fontSize);
+                }
+                if (firstRun.isBold()) {
+                    style.put("bold", true);
+                }
+                if (firstRun.isItalic()) {
+                    style.put("italic", true);
+                }
+                if (firstRun.getUnderline() != UnderlinePatterns.NONE) {
+                    style.put("underline", true);
+                }
+                if (firstRun.getColor() != null) {
+                    style.put("color", firstRun.getColor());
+                }
+            }
+        } catch (Exception e) {
+            // Style extraction must never fail the whole parse; keep the stack trace
+            // available at debug level for diagnosis.
+            log.debug("提取段落样式失败: {}", e.getMessage(), e);
+        }
+
+        return style.isEmpty() ? null : style;
+    }
+    
+    /**
+     * Result of a structured extraction: the ordered content elements plus summary fields.
+     */
+    @Data
+    public static class WordStructuredResult {
+        private String documentId;            // identifier of the document this result belongs to
+        private List<ContentElement> elements; // extracted content elements
+        private String fullText;              // presumably the document's plain text - TODO confirm against the producer
+        private int imageCount;               // image count (presumably the number of image elements - confirm)
+        private int tableCount;               // table count (presumably the number of table elements - confirm)
+        private int totalElements;            // total element count (presumably elements.size() - confirm)
+    }
+    
+    /**
+     * A single content element (text, image or table); fields of the other kinds stay unset.
+     */
+    @Data
+    public static class ContentElement {
+        private int index;                    // order index of the element within the document
+        private String type;                  // paragraph/heading/heading1-9/list_item/image/table/title/toc
+        private String content;               // text content (text-type elements only)
+        private Map<String, Object> style;    // style information
+        
+        // Image-related fields
+        private String imageUrl;              // URL for accessing the image
+        private String imagePath;             // storage path of the image
+        private String imageAlt;              // image description
+        private Integer imageWidth;           // image width (pixels)
+        private Integer imageHeight;          // image height (pixels)
+        private String imageFormat;           // image format
+        
+        // Table-related fields
+        private Integer tableIndex;           // table index
+        private List<List<TableCell>> tableRows; // table data, one inner list per row
+        private Integer tableRowCount;        // number of rows
+        private Integer tableColCount;        // number of columns
+        private String tableText;             // table text (used for search)
+    }
+    
+    /**
+     * A single table cell: its position, text and merge information.
+     */
+    @Data
+    public static class TableCell {
+        private int row;                      // row position (presumably zero-based - confirm against the producer)
+        private int col;                      // column position (presumably zero-based - confirm against the producer)
+        private String text;                  // cell text
+        private Integer colSpan;              // number of columns spanned (column merge)
+        private Integer rowSpan;              // number of rows spanned (row merge)
+        private boolean merged;               // whether the cell is a merged cell
+    }
+}

+ 57 - 0
database/migrations/V2026_01_21_02__add_document_elements.sql

@@ -0,0 +1,57 @@
+-- Structured document element table.
+-- Stores the structured content (paragraphs, images, tables) extracted from Word/PDF and similar documents.
+
+CREATE TABLE IF NOT EXISTS document_elements (
+    id               VARCHAR(64)  PRIMARY KEY,
+    document_id      VARCHAR(64)  NOT NULL,
+    element_index    INTEGER      NOT NULL,   -- position of the element within the document
+    element_type     VARCHAR(32)  NOT NULL,   -- paragraph/heading/heading1-9/list_item/image/table/title/toc
+    content          TEXT,                    -- text content (text-type elements)
+    style            JSONB,                   -- style information (font, alignment, indentation, ...)
+
+    -- Image columns
+    image_url        VARCHAR(500),            -- URL for accessing the image
+    image_path       VARCHAR(500),            -- storage path of the image
+    image_alt        VARCHAR(255),            -- image description
+    image_width      INTEGER,                 -- image width (pixels)
+    image_height     INTEGER,                 -- image height (pixels)
+    image_format     VARCHAR(16),             -- image format
+
+    -- Table columns
+    table_index      INTEGER,                 -- table index
+    table_data       JSONB,                   -- table data (row/column content)
+    table_row_count  INTEGER,                 -- number of rows
+    table_col_count  INTEGER,                 -- number of columns
+    table_text       TEXT,                    -- table text (used for search)
+
+    -- Audit columns
+    create_by        VARCHAR(64),
+    create_by_name   VARCHAR(128),
+    create_time      TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+    update_by        VARCHAR(64),
+    update_by_name   VARCHAR(128),
+    update_time      TIMESTAMP DEFAULT CURRENT_TIMESTAMP
+);
+
+-- Table and column comments for document_elements
+COMMENT ON TABLE document_elements IS '文档结构化元素表';
+COMMENT ON COLUMN document_elements.element_index IS '元素在文档中的顺序索引';
+COMMENT ON COLUMN document_elements.element_type IS '元素类型:paragraph/heading/image/table等';
+COMMENT ON COLUMN document_elements.style IS '样式信息JSON:{alignment, fontSize, bold, color等}';
+COMMENT ON COLUMN document_elements.table_data IS '表格数据JSON:[[{row,col,text,colSpan},...],...]';
+
+-- Indexes: ordered retrieval per document, lookup by document, lookup by element type
+CREATE INDEX IF NOT EXISTS idx_document_elements_order ON document_elements (document_id, element_index);
+CREATE INDEX IF NOT EXISTS idx_document_elements_document_id ON document_elements (document_id);
+CREATE INDEX IF NOT EXISTS idx_document_elements_type ON document_elements (element_type);
+
+-- Extend the documents table with the structured-parsing status and counters
+ALTER TABLE documents ADD COLUMN IF NOT EXISTS structured_status VARCHAR(20) DEFAULT 'pending';
+COMMENT ON COLUMN documents.structured_status IS '结构化解析状态:pending/completed/failed';
+
+ALTER TABLE documents ADD COLUMN IF NOT EXISTS image_count INTEGER DEFAULT 0;
+COMMENT ON COLUMN documents.image_count IS '文档中的图片数量';
+
+ALTER TABLE documents ADD COLUMN IF NOT EXISTS table_count INTEGER DEFAULT 0;
+COMMENT ON COLUMN documents.table_count IS '文档中的表格数量';
+
+ALTER TABLE documents ADD COLUMN IF NOT EXISTS element_count INTEGER DEFAULT 0;
+COMMENT ON COLUMN documents.element_count IS '文档中的元素总数';