Răsfoiți Sursa

feat: 优化结构化解析 API,自动获取文件路径并保存解析结果

- ParseController: 结构化解析 API 自动从数据库获取文件路径,无需传参
- ParseController: 新增 /parse/elements/{docId} 系列 API 获取已解析的元素
- DocumentElementService: 支持处理 ContentElement 对象,自动转换为 Map
- test_upload_api.sh: 新增 -x/-i/-t 选项支持结构化解析和图片/表格获取测试
何文松 1 lună în urmă
părinte
comite
8555d279b5

+ 35 - 3
backend/document-service/src/main/java/com/lingyue/document/service/DocumentElementService.java

@@ -1,5 +1,6 @@
 package com.lingyue.document.service;
 
+import com.fasterxml.jackson.databind.ObjectMapper;
 import com.lingyue.document.entity.DocumentElement;
 import com.lingyue.document.repository.DocumentElementRepository;
 import lombok.RequiredArgsConstructor;
@@ -25,6 +26,7 @@ import java.util.Map;
 public class DocumentElementService {
     
     private final DocumentElementRepository elementRepository;
+    private final ObjectMapper objectMapper;
     
     /**
      * 获取文档的所有结构化元素(按顺序)
@@ -33,6 +35,13 @@ public class DocumentElementService {
         return elementRepository.findByDocumentId(documentId);
     }
     
+    /**
+     * Returns all structured elements of a document (alias method: delegates to getElementsByDocumentId).
+     */
+    public List<DocumentElement> getElements(String documentId) {
+        return getElementsByDocumentId(documentId);
+    }
+    
     /**
      * 获取文档中的所有图片
      */
@@ -40,6 +49,13 @@ public class DocumentElementService {
         return elementRepository.findImagesByDocumentId(documentId);
     }
     
+    /**
+     * Returns all images of a document (alias method: delegates to getImagesByDocumentId).
+     */
+    public List<DocumentElement> getImageElements(String documentId) {
+        return getImagesByDocumentId(documentId);
+    }
+    
     /**
      * 获取文档中的所有表格
      */
@@ -47,6 +63,13 @@ public class DocumentElementService {
         return elementRepository.findTablesByDocumentId(documentId);
     }
     
+    /**
+     * Returns all tables of a document (alias method: delegates to getTablesByDocumentId).
+     */
+    public List<DocumentElement> getTableElements(String documentId) {
+        return getTablesByDocumentId(documentId);
+    }
+    
     /**
      * 保存文档的结构化元素
      * 
@@ -54,15 +77,24 @@ public class DocumentElementService {
      * @param elements   元素列表(来自解析服务)
      */
     @Transactional
-    public void saveElements(String documentId, List<Map<String, Object>> elements) {
+    public void saveElements(String documentId, List<?> elements) {
         log.info("保存文档结构化元素: documentId={}, count={}", documentId, elements.size());
         
         // 先删除旧数据
         elementRepository.deleteByDocumentId(documentId);
         
         // 批量插入新数据
-        for (Map<String, Object> element : elements) {
-            DocumentElement entity = convertToEntity(documentId, element);
+        for (Object element : elements) {
+            Map<String, Object> elementMap;
+            if (element instanceof Map) {
+                @SuppressWarnings("unchecked")
+                Map<String, Object> map = (Map<String, Object>) element;
+                elementMap = map;
+            } else {
+                // 将对象转换为 Map(用于处理 ContentElement 等对象)
+                elementMap = objectMapper.convertValue(element, Map.class);
+            }
+            DocumentElement entity = convertToEntity(documentId, elementMap);
             elementRepository.insert(entity);
         }
         

+ 66 - 3
backend/parse-service/src/main/java/com/lingyue/parse/controller/ParseController.java

@@ -1,6 +1,9 @@
 package com.lingyue.parse.controller;
 
 import com.lingyue.common.domain.AjaxResult;
+import com.lingyue.document.entity.Document;
+import com.lingyue.document.repository.DocumentRepository;
+import com.lingyue.document.service.DocumentElementService;
 import com.lingyue.parse.service.ParseTaskCenterService;
 import com.lingyue.parse.service.ParseTaskExecutor;
 import com.lingyue.parse.service.WordStructuredExtractionService;
@@ -28,6 +31,8 @@ public class ParseController {
     private final ParseTaskExecutor parseTaskExecutor;
     private final ParseTaskCenterService taskCenterService;
     private final WordStructuredExtractionService wordStructuredExtractionService;
+    private final DocumentRepository documentRepository;
+    private final DocumentElementService documentElementService;
 
     /**
      * 启动解析
@@ -63,18 +68,76 @@ public class ParseController {
     /**
      * Returns the structured content of a Word document,
      * including the order and position information of its paragraphs, images and tables.
+     * The file path is read from the document's database record, and the extracted elements are handed to documentElementService.saveElements (the document_elements table, per the original comment).
      */
     @GetMapping("/structured/{documentId}")
     @Operation(summary = "获取Word文档结构化内容", description = "提取Word文档的段落、图片、表格,保持原始排版结构")
     public AjaxResult<?> getStructuredContent(
             @Parameter(description = "文档ID", required = true)
-            @PathVariable String documentId,
-            @Parameter(description = "Word文件路径", required = true)
-            @RequestParam("filePath") String filePath) {
+            @PathVariable String documentId) {
+        
+        // Load the document record from the database; an unknown ID yields an error result
+        Document document = documentRepository.selectById(documentId);
+        if (document == null) {
+            return AjaxResult.error("文档不存在: " + documentId);
+        }
+        
+        String filePath = document.getFileUrl();
+        if (filePath == null || filePath.isEmpty()) {
+            return AjaxResult.error("文档文件路径为空");
+        }
+        
+        // Only documents whose type equals "word" (case-insensitive) are accepted
+        if (!"word".equalsIgnoreCase(document.getType())) {
+            return AjaxResult.error("仅支持 Word 文档的结构化解析,当前文档类型: " + document.getType());
+        }
         
+        // Extract the structured content
         WordStructuredExtractionService.WordStructuredResult result = 
                 wordStructuredExtractionService.extractStructured(filePath, documentId);
+        
+        // Persist the extracted elements
+        try {
+            documentElementService.saveElements(documentId, result.getElements());
+        } catch (Exception e) {
+            // A save failure must not affect the returned result (e.g. the table is missing or another database problem occurred).
+            // NOTE(review): the original comment says the failure is "only logged", but nothing is logged here and the exception is silently dropped; consider logging it.
+        }
+        
         return AjaxResult.success(result);
     }
+    
+    /**
+     * Returns the structured elements already saved for a document.
+     * Wraps the result of documentElementService.getElements in a success response.
+     */
+    @GetMapping("/elements/{documentId}")
+    @Operation(summary = "获取文档结构化元素", description = "从数据库获取已解析的文档元素")
+    public AjaxResult<?> getDocumentElements(
+            @Parameter(description = "文档ID", required = true)
+            @PathVariable String documentId) {
+        return AjaxResult.success(documentElementService.getElements(documentId));
+    }
+    
+    /**
+     * Returns the image elements of a document.
+     * Wraps the result of documentElementService.getImageElements in a success response.
+     */
+    @GetMapping("/elements/{documentId}/images")
+    @Operation(summary = "获取文档图片", description = "获取文档中的所有图片元素")
+    public AjaxResult<?> getDocumentImages(
+            @Parameter(description = "文档ID", required = true)
+            @PathVariable String documentId) {
+        return AjaxResult.success(documentElementService.getImageElements(documentId));
+    }
+    
+    /**
+     * Returns the table elements of a document.
+     * Wraps the result of documentElementService.getTableElements in a success response.
+     */
+    @GetMapping("/elements/{documentId}/tables")
+    @Operation(summary = "获取文档表格", description = "获取文档中的所有表格元素")
+    public AjaxResult<?> getDocumentTables(
+            @Parameter(description = "文档ID", required = true)
+            @PathVariable String documentId) {
+        return AjaxResult.success(documentElementService.getTableElements(documentId));
+    }
 }
 

+ 196 - 2
test/test_upload_api.sh

@@ -18,6 +18,8 @@ REGISTER_URL="${BASE_URL}/auth/register"
 TEXT_STORAGE_URL="${BASE_URL}/api/v1/graph/text-storage"
 RAG_INDEX_URL="${BASE_URL}/api/rag/index"
 NER_DOCUMENT_URL="${BASE_URL}/api/ner/document"
+STRUCTURED_URL="${BASE_URL}/parse/structured"
+ELEMENTS_URL="${BASE_URL}/parse/elements"
 
 # 测试文件路径(相对于脚本所在目录)
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
@@ -455,11 +457,141 @@ test_ner_extraction() {
     fi
 }
 
+# Structured extraction test (paragraphs, images and tables).
+# Sends GET ${STRUCTURED_URL}/<document id>, prints the HTTP status and, when jq
+# is available, the element/image/table counts plus one line per image and table.
+# Arguments: $1 - document ID
+# Returns:   0 when the HTTP status is 200, 1 otherwise
+# NOTE(review): success is judged by the HTTP status only; if the service reports
+# application-level errors inside a 200 body they are shown as success - confirm.
+test_structured_extraction() {
+    local DOC_ID=$1
+    
+    print_step "结构化解析 (提取段落、图片、表格)"
+    
+    print_info "文档ID: $DOC_ID"
+    print_info "请求URL: ${STRUCTURED_URL}/${DOC_ID}"
+    
+    # -w appends the status code on its own last line so it can be split from the body
+    RESPONSE=$(curl -s -w "\n%{http_code}" \
+        -X GET "${STRUCTURED_URL}/${DOC_ID}" \
+        --connect-timeout 30 \
+        --max-time 300)
+    
+    HTTP_CODE=$(echo "$RESPONSE" | tail -n1)
+    BODY=$(echo "$RESPONSE" | sed '$d')
+    
+    echo -e "${YELLOW}响应状态码:${NC} $HTTP_CODE"
+    
+    if [ "$HTTP_CODE" = "200" ]; then
+        if [ "$JQ_AVAILABLE" = true ]; then
+            # Fall back to 0 when jq cannot parse the body, so the numeric
+            # comparisons below never receive an empty string
+            TOTAL=$(echo "$BODY" | jq -r '.data.totalElements // 0' 2>/dev/null || echo 0)
+            IMAGE_COUNT=$(echo "$BODY" | jq -r '.data.imageCount // 0' 2>/dev/null || echo 0)
+            TABLE_COUNT=$(echo "$BODY" | jq -r '.data.tableCount // 0' 2>/dev/null || echo 0)
+            print_success "结构化解析成功!"
+            print_info "总元素: $TOTAL, 图片: $IMAGE_COUNT, 表格: $TABLE_COUNT"
+            
+            # Show the image list
+            if [ "$IMAGE_COUNT" -gt 0 ]; then
+                echo -e "\n${YELLOW}图片列表:${NC}"
+                echo "$BODY" | jq -r '.data.elements[] | select(.type == "image") | "  - \(.imageUrl) (\(.imageWidth)x\(.imageHeight))"' 2>/dev/null
+            fi
+            
+            # Show the table summary
+            if [ "$TABLE_COUNT" -gt 0 ]; then
+                echo -e "\n${YELLOW}表格列表:${NC}"
+                echo "$BODY" | jq -r '.data.elements[] | select(.type == "table") | "  - 表格\(.tableIndex): \(.tableRowCount)行 x \(.tableColCount)列"' 2>/dev/null
+            fi
+        else
+            print_success "结构化解析成功!"
+            echo "$BODY"
+        fi
+        return 0
+    else
+        print_error "结构化解析失败 (HTTP $HTTP_CODE)"
+        if [ "$JQ_AVAILABLE" = true ]; then
+            # Pretty-print when the body is JSON, otherwise print it raw
+            echo "$BODY" | jq . 2>/dev/null || echo "$BODY"
+        else
+            echo "$BODY"
+        fi
+        return 1
+    fi
+}
+
+# Fetch the image list of a document.
+# Sends GET ${ELEMENTS_URL}/<document id>/images, prints the HTTP status and,
+# when jq is available, the number of entries in .data plus one line per image.
+# Arguments: $1 - document ID
+# Returns:   0 when the HTTP status is 200, 1 otherwise
+test_get_images() {
+    local DOC_ID=$1
+    
+    print_step "获取文档图片"
+    
+    print_info "文档ID: $DOC_ID"
+    print_info "请求URL: ${ELEMENTS_URL}/${DOC_ID}/images"
+    
+    # -w appends the status code on its own last line so it can be split from the body
+    RESPONSE=$(curl -s -w "\n%{http_code}" \
+        -X GET "${ELEMENTS_URL}/${DOC_ID}/images" \
+        --connect-timeout 10)
+    
+    HTTP_CODE=$(echo "$RESPONSE" | tail -n1)
+    BODY=$(echo "$RESPONSE" | sed '$d')
+    
+    echo -e "${YELLOW}响应状态码:${NC} $HTTP_CODE"
+    
+    if [ "$HTTP_CODE" = "200" ]; then
+        if [ "$JQ_AVAILABLE" = true ]; then
+            # Fall back to 0 when jq cannot parse the body, so the numeric
+            # comparison below never receives an empty string
+            COUNT=$(echo "$BODY" | jq -r '.data | length' 2>/dev/null || echo 0)
+            print_success "获取图片成功! 共 $COUNT 张"
+            
+            if [ "$COUNT" -gt 0 ]; then
+                echo -e "${YELLOW}图片详情:${NC}"
+                echo "$BODY" | jq -r '.data[] | "  [\(.elementIndex)] \(.imageUrl) - \(.imageFormat) (\(.imageWidth)x\(.imageHeight))"' 2>/dev/null
+            fi
+        else
+            print_success "获取图片成功!"
+            echo "$BODY"
+        fi
+        return 0
+    else
+        print_error "获取图片失败 (HTTP $HTTP_CODE)"
+        return 1
+    fi
+}
+
+# Fetch the table list of a document.
+# Sends GET ${ELEMENTS_URL}/<document id>/tables, prints the HTTP status and,
+# when jq is available, the number of entries in .data plus one line per table.
+# Arguments: $1 - document ID
+# Returns:   0 when the HTTP status is 200, 1 otherwise
+test_get_tables() {
+    local DOC_ID=$1
+    
+    print_step "获取文档表格"
+    
+    print_info "文档ID: $DOC_ID"
+    print_info "请求URL: ${ELEMENTS_URL}/${DOC_ID}/tables"
+    
+    # -w appends the status code on its own last line so it can be split from the body
+    RESPONSE=$(curl -s -w "\n%{http_code}" \
+        -X GET "${ELEMENTS_URL}/${DOC_ID}/tables" \
+        --connect-timeout 10)
+    
+    HTTP_CODE=$(echo "$RESPONSE" | tail -n1)
+    BODY=$(echo "$RESPONSE" | sed '$d')
+    
+    echo -e "${YELLOW}响应状态码:${NC} $HTTP_CODE"
+    
+    if [ "$HTTP_CODE" = "200" ]; then
+        if [ "$JQ_AVAILABLE" = true ]; then
+            # Fall back to 0 when jq cannot parse the body, so the numeric
+            # comparison below never receives an empty string
+            COUNT=$(echo "$BODY" | jq -r '.data | length' 2>/dev/null || echo 0)
+            print_success "获取表格成功! 共 $COUNT 个"
+            
+            if [ "$COUNT" -gt 0 ]; then
+                echo -e "${YELLOW}表格详情:${NC}"
+                echo "$BODY" | jq -r '.data[] | "  [\(.elementIndex)] 表格\(.tableIndex): \(.tableRowCount)行 x \(.tableColCount)列"' 2>/dev/null
+            fi
+        else
+            print_success "获取表格成功!"
+            echo "$BODY"
+        fi
+        return 0
+    else
+        print_error "获取表格失败 (HTTP $HTTP_CODE)"
+        return 1
+    fi
+}
+
 # 显示使用帮助
 show_help() {
     echo "使用方法: $0 [选项] [host] [port]"
     echo ""
-    echo "端到端测试流程: 上传文件 -> 等待解析 -> 向量提取 -> NER提取"
+    echo "端到端测试流程: 上传文件 -> 等待解析 -> 向量提取 -> NER提取 -> 结构化解析"
     echo ""
     echo "选项:"
     echo "  -h, --help        显示帮助信息"
@@ -468,6 +600,9 @@ show_help() {
     echo "  -s, --status      仅查询上次上传的文档状态"
     echo "  -v, --vector      仅执行向量提取(使用上次的文档)"
     echo "  -n, --ner         仅执行NER提取(使用上次的文档)"
+    echo "  -x, --structured  仅执行结构化解析(提取图片和表格)"
+    echo "  -i, --images      仅获取文档图片列表"
+    echo "  -t, --tables      仅获取文档表格列表"
     echo ""
     echo "示例:"
     echo "  $0                      # 使用默认配置执行完整端到端测试"
@@ -476,6 +611,9 @@ show_help() {
     echo "  $0 -s                   # 查询上次上传的状态"
     echo "  $0 -v                   # 对上次文档执行向量提取"
     echo "  $0 -n                   # 对上次文档执行NER提取"
+    echo "  $0 -x                   # 对上次文档执行结构化解析"
+    echo "  $0 -i                   # 获取上次文档的图片列表"
+    echo "  $0 -t                   # 获取上次文档的表格列表"
 }
 
 # 主函数
@@ -509,6 +647,18 @@ main() {
                 MODE="ner"
                 shift
                 ;;
+            -x|--structured)
+                MODE="structured"
+                shift
+                ;;
+            -i|--images)
+                MODE="images"
+                shift
+                ;;
+            -t|--tables)
+                MODE="tables"
+                shift
+                ;;
             -p|--poll)
                 # 兼容旧参数,等同于e2e
                 MODE="e2e"
@@ -536,6 +686,8 @@ main() {
     TEXT_STORAGE_URL="${BASE_URL}/api/v1/graph/text-storage"
     RAG_INDEX_URL="${BASE_URL}/api/rag/index"
     NER_DOCUMENT_URL="${BASE_URL}/api/ner/document"
+    STRUCTURED_URL="${BASE_URL}/parse/structured"
+    ELEMENTS_URL="${BASE_URL}/parse/elements"
     
     print_header "文件上传端到端测试"
     echo "目标服务: $BASE_URL"
@@ -584,6 +736,42 @@ main() {
             fi
             ;;
         
+        structured)
+            # 仅执行结构化解析
+            if [ -f "${SCRIPT_DIR}/.last_document_id" ]; then
+                DOCUMENT_ID=$(cat "${SCRIPT_DIR}/.last_document_id")
+                print_header "结构化解析测试"
+                test_structured_extraction "$DOCUMENT_ID"
+            else
+                print_error "未找到上次上传的文档ID"
+                exit 1
+            fi
+            ;;
+        
+        images)
+            # 仅获取图片列表
+            if [ -f "${SCRIPT_DIR}/.last_document_id" ]; then
+                DOCUMENT_ID=$(cat "${SCRIPT_DIR}/.last_document_id")
+                print_header "获取文档图片"
+                test_get_images "$DOCUMENT_ID"
+            else
+                print_error "未找到上次上传的文档ID"
+                exit 1
+            fi
+            ;;
+        
+        tables)
+            # 仅获取表格列表
+            if [ -f "${SCRIPT_DIR}/.last_document_id" ]; then
+                DOCUMENT_ID=$(cat "${SCRIPT_DIR}/.last_document_id")
+                print_header "获取文档表格"
+                test_get_tables "$DOCUMENT_ID"
+            else
+                print_error "未找到上次上传的文档ID"
+                exit 1
+            fi
+            ;;
+        
         upload)
             # 仅上传
             check_test_file
@@ -623,8 +811,11 @@ main() {
                 print_info "跳过向量提取(无法获取文本)"
             fi
             
-            print_header "步骤 4/4: NER 提取"
+            print_header "步骤 4/5: NER 提取"
             test_ner_extraction "$DOCUMENT_ID"
+            
+            print_header "步骤 5/5: 结构化解析"
+            test_structured_extraction "$DOCUMENT_ID"
             ;;
     esac
     
@@ -634,6 +825,9 @@ main() {
     echo "  $0 -s            # 查询状态"
     echo "  $0 -v            # 重新向量提取"
     echo "  $0 -n            # 重新NER提取"
+    echo "  $0 -x            # 结构化解析(提取图片表格)"
+    echo "  $0 -i            # 获取图片列表"
+    echo "  $0 -t            # 获取表格列表"
 }
 
 # 运行主函数