浏览代码

feat: 重构上传测试脚本为端到端测试流程

- 新增完整端到端测试流程: 上传 -> 解析等待 -> 向量提取 -> NER提取
- 添加向量提取 (RAG 索引) 测试接口调用
- 添加 NER 文档级实体识别测试接口调用
- 新增 -v/--vector 和 -n/--ner 单独测试选项
- 优化轮询状态显示为单行刷新模式
- 保持 -p/--poll 旧参数兼容性
何文松 1 月之前
父节点
当前提交
01ac07c2b3
共有 1 个文件被更改,包括 320 次插入和 55 次删除
  1. 320 55
      test/test_upload_api.sh

+ 320 - 55
test/test_upload_api.sh

@@ -1,8 +1,9 @@
 #!/bin/bash
 
 # ============================================
-# 文件上传接口测试脚本
+# 文件上传端到端测试脚本
 # ============================================
+# 测试流程: 上传 -> 解析等待 -> 向量提取 -> NER提取
 # 使用方法: ./test_upload_api.sh [host] [port]
 # 示例: ./test_upload_api.sh localhost 5232
 # ============================================
@@ -14,6 +15,9 @@ BASE_URL="http://${HOST}:${PORT}"
 UPLOAD_URL="${BASE_URL}/api/v1/parse/upload"
 STATUS_URL="${BASE_URL}/parse/status"
 REGISTER_URL="${BASE_URL}/auth/register"
+TEXT_STORAGE_URL="${BASE_URL}/api/v1/graph/text-storage"
+RAG_INDEX_URL="${BASE_URL}/api/rag/index"
+NER_DOCUMENT_URL="${BASE_URL}/api/ner/document"
 
 # 测试文件路径(相对于脚本所在目录)
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
@@ -25,12 +29,15 @@ TEST_USERNAME="testuser_${TIMESTAMP}"
 TEST_EMAIL="testuser_${TIMESTAMP}@test.com"
 TEST_PASSWORD="Test123456!"
 USER_ID=""
+DOCUMENT_ID=""
+DOCUMENT_TEXT=""
 
 # 颜色定义
 RED='\033[0;31m'
 GREEN='\033[0;32m'
 YELLOW='\033[1;33m'
 BLUE='\033[0;34m'
+CYAN='\033[0;36m'
 NC='\033[0m' # No Color
 
 # 输出函数
@@ -40,6 +47,10 @@ print_header() {
     echo -e "${BLUE}============================================${NC}"
 }
 
+print_step() {  # Print "$1" as a cyan ">>> " step banner, preceded by a blank line
+    echo -e "\n${CYAN}>>> $1${NC}"
+}
+
 print_success() {
     echo -e "${GREEN}✓ $1${NC}"
 }
@@ -158,7 +169,7 @@ register_test_user() {
 
 # 测试文件上传
 test_upload() {
-    print_header "测试文件上传接口"
+    print_step "文件上传"
     
     print_info "上传URL: $UPLOAD_URL"
     print_info "用户ID: $USER_ID"
@@ -190,34 +201,31 @@ test_upload() {
     if [ "$HTTP_CODE" = "200" ]; then
         print_success "文件上传成功!"
         
-        # 提取documentId用于后续状态查询
+        # 提取documentId用于后续操作
         if [ "$JQ_AVAILABLE" = true ]; then
             DOCUMENT_ID=$(echo "$BODY" | jq -r '.data.documentId // .documentId // empty' 2>/dev/null)
             if [ -n "$DOCUMENT_ID" ] && [ "$DOCUMENT_ID" != "null" ]; then
                 print_info "文档ID: $DOCUMENT_ID"
                 echo "$DOCUMENT_ID" > "${SCRIPT_DIR}/.last_document_id"
-                
-                # 查询解析状态
-                test_parse_status "$DOCUMENT_ID"
+            else
+                print_error "无法从响应中获取文档ID"
+                return 1
             fi
         fi
+        return 0
     else
         print_error "文件上传失败 (HTTP $HTTP_CODE)"
+        return 1
     fi
 }
 
-# 测试解析状态查询
+# 测试解析状态查询(单次)
 test_parse_status() {
     local DOC_ID=$1
     
-    print_header "查询解析状态"
-    
     print_info "文档ID: $DOC_ID"
     print_info "状态URL: ${STATUS_URL}/${DOC_ID}"
     
-    # 等待一会儿让解析任务开始
-    sleep 2
-    
     RESPONSE=$(curl -s -w "\n%{http_code}" \
         -X GET "${STATUS_URL}/${DOC_ID}" \
         --connect-timeout 10)
@@ -244,25 +252,24 @@ test_parse_status() {
 # 轮询解析状态直到完成
 poll_parse_status() {
     local DOC_ID=$1
-    local MAX_ATTEMPTS=${2:-30}
-    local INTERVAL=${3:-5}
+    local MAX_ATTEMPTS=${2:-60}
+    local INTERVAL=${3:-3}
     
-    print_header "轮询解析状态 (最多${MAX_ATTEMPTS}次, 间隔${INTERVAL}秒)"
+    print_step "轮询解析状态 (最多${MAX_ATTEMPTS}次, 间隔${INTERVAL}秒)"
     
     for ((i=1; i<=MAX_ATTEMPTS; i++)); do
-        print_info "第 $i 次查询..."
-        
         RESPONSE=$(curl -s "${STATUS_URL}/${DOC_ID}" --connect-timeout 10)
         
         if [ "$JQ_AVAILABLE" = true ]; then
             STATUS=$(echo "$RESPONSE" | jq -r '.data.parseStatus // .parseStatus // empty' 2>/dev/null)
-            echo "当前状态: $STATUS"
+            echo -ne "\r第 $i 次查询... 状态: $STATUS    "
             
             if [ "$STATUS" = "2" ] || [ "$STATUS" = "COMPLETED" ]; then
+                echo ""
                 print_success "解析完成!"
-                echo "$RESPONSE" | jq .
                 return 0
             elif [ "$STATUS" = "3" ] || [ "$STATUS" = "FAILED" ]; then
+                echo ""
                 print_error "解析失败!"
                 echo "$RESPONSE" | jq .
                 return 1
@@ -274,30 +281,196 @@ poll_parse_status() {
         sleep $INTERVAL
     done
     
-    print_error "轮询超时"
+    echo ""
+    print_error "轮询超时,解析未完成"
     return 1
 }
 
+# Fetch the text-storage record for document ID $1 and load the file at its .data.filePath into the global DOCUMENT_TEXT; returns 1 on a non-200 response, a missing path, or a missing file. NOTE: on HTTP 200 with JQ_AVAILABLE != true no branch runs, so DOCUMENT_TEXT is left untouched and there is no explicit return.
+get_document_text() {
+    local DOC_ID=$1
+    
+    print_step "获取文档解析文本"
+    
+    print_info "文档ID: $DOC_ID"
+    print_info "请求URL: ${TEXT_STORAGE_URL}/${DOC_ID}"
+    
+    RESPONSE=$(curl -s -w "\n%{http_code}" \
+        -X GET "${TEXT_STORAGE_URL}/${DOC_ID}" \
+        --connect-timeout 10)
+    
+    HTTP_CODE=$(echo "$RESPONSE" | tail -n1)
+    BODY=$(echo "$RESPONSE" | sed '$d')
+    
+    echo -e "${YELLOW}响应状态码:${NC} $HTTP_CODE"
+    
+    if [ "$HTTP_CODE" = "200" ]; then
+        if [ "$JQ_AVAILABLE" = true ]; then
+            # Extract the stored file path from the JSON response
+            FILE_PATH=$(echo "$BODY" | jq -r '.data.filePath // empty' 2>/dev/null)
+            if [ -n "$FILE_PATH" ] && [ "$FILE_PATH" != "null" ]; then
+                print_success "获取文本存储记录成功!"
+                print_info "文件路径: $FILE_PATH"
+                
+                # Read the file contents; the path is checked and read on the machine running this script
+                if [ -f "$FILE_PATH" ]; then
+                    DOCUMENT_TEXT=$(cat "$FILE_PATH" 2>/dev/null)
+                    TEXT_LENGTH=${#DOCUMENT_TEXT}
+                    print_success "读取文本成功 (长度: $TEXT_LENGTH 字符)"
+                    
+                    # Show the first 200 characters as a preview
+                    echo -e "${YELLOW}文本预览:${NC}"
+                    echo "${DOCUMENT_TEXT:0:200}..."
+                    return 0
+                else
+                    print_error "文件不存在: $FILE_PATH"
+                    return 1
+                fi
+            else
+                print_error "响应中无文件路径"
+                echo "$BODY" | jq . 2>/dev/null
+                return 1
+            fi
+        fi
+    else
+        print_error "获取文本存储失败 (HTTP $HTTP_CODE)"
+        echo "$BODY"
+        return 1
+    fi
+}
+
+# Vector extraction (RAG indexing): POST document ID $1 and text $2 as JSON to RAG_INDEX_URL, print the response, and return 0 on HTTP 200 or 1 otherwise.
+test_vector_extraction() {
+    local DOC_ID=$1
+    local TEXT=$2
+    
+    print_step "向量提取 (RAG 索引)"
+    
+    print_info "文档ID: $DOC_ID"
+    print_info "文本长度: ${#TEXT} 字符"
+    print_info "请求URL: $RAG_INDEX_URL"
+    
+    # Build the request JSON (special characters in the text must be escaped); jq does the escaping when available
+    if [ "$JQ_AVAILABLE" = true ]; then
+        REQUEST_BODY=$(jq -n \
+            --arg docId "$DOC_ID" \
+            --arg text "$TEXT" \
+            '{documentId: $docId, text: $text}')
+    else
+        # Minimal fallback escaping: backslashes and double quotes are escaped, newlines become spaces; nothing else is handled
+        ESCAPED_TEXT=$(echo "$TEXT" | sed 's/\\/\\\\/g' | sed 's/"/\\"/g' | tr '\n' ' ')
+        REQUEST_BODY="{\"documentId\":\"${DOC_ID}\",\"text\":\"${ESCAPED_TEXT}\"}"
+    fi
+    
+    RESPONSE=$(curl -s -w "\n%{http_code}" \
+        -X POST "$RAG_INDEX_URL" \
+        -H "Content-Type: application/json" \
+        -d "$REQUEST_BODY" \
+        --connect-timeout 30 \
+        --max-time 300)
+    
+    HTTP_CODE=$(echo "$RESPONSE" | tail -n1)
+    BODY=$(echo "$RESPONSE" | sed '$d')
+    
+    echo -e "${YELLOW}响应状态码:${NC} $HTTP_CODE"
+    echo -e "${YELLOW}响应内容:${NC}"
+    
+    if [ "$JQ_AVAILABLE" = true ]; then
+        echo "$BODY" | jq . 2>/dev/null || echo "$BODY"
+    else
+        echo "$BODY"
+    fi
+    
+    if [ "$HTTP_CODE" = "200" ]; then
+        if [ "$JQ_AVAILABLE" = true ]; then
+            CHUNK_COUNT=$(echo "$BODY" | jq -r '.data.chunkCount // empty' 2>/dev/null)
+            if [ -n "$CHUNK_COUNT" ] && [ "$CHUNK_COUNT" != "null" ]; then
+                print_success "向量提取成功! 生成 $CHUNK_COUNT 个分块"
+            else
+                print_success "向量提取成功!"
+            fi
+        else
+            print_success "向量提取成功!"
+        fi
+        return 0
+    else
+        print_error "向量提取失败 (HTTP $HTTP_CODE)"
+        return 1
+    fi
+}
+
+# NER extraction: POST (no request body) to NER_DOCUMENT_URL/<document ID $1>, print the response plus .data.entityCount / .data.relationCount when jq is available, and return 0 on HTTP 200 or 1 otherwise.
+test_ner_extraction() {
+    local DOC_ID=$1
+    
+    print_step "NER 提取 (命名实体识别)"
+    
+    print_info "文档ID: $DOC_ID"
+    print_info "请求URL: ${NER_DOCUMENT_URL}/${DOC_ID}"
+    
+    RESPONSE=$(curl -s -w "\n%{http_code}" \
+        -X POST "${NER_DOCUMENT_URL}/${DOC_ID}" \
+        -H "Content-Type: application/json" \
+        --connect-timeout 30 \
+        --max-time 300)
+    
+    HTTP_CODE=$(echo "$RESPONSE" | tail -n1)
+    BODY=$(echo "$RESPONSE" | sed '$d')
+    
+    echo -e "${YELLOW}响应状态码:${NC} $HTTP_CODE"
+    echo -e "${YELLOW}响应内容:${NC}"
+    
+    if [ "$JQ_AVAILABLE" = true ]; then
+        echo "$BODY" | jq . 2>/dev/null || echo "$BODY"
+    else
+        echo "$BODY"
+    fi
+    
+    if [ "$HTTP_CODE" = "200" ]; then
+        if [ "$JQ_AVAILABLE" = true ]; then
+            ENTITY_COUNT=$(echo "$BODY" | jq -r '.data.entityCount // empty' 2>/dev/null)
+            RELATION_COUNT=$(echo "$BODY" | jq -r '.data.relationCount // empty' 2>/dev/null)
+            if [ -n "$ENTITY_COUNT" ] && [ "$ENTITY_COUNT" != "null" ]; then
+                print_success "NER 提取成功! 实体: $ENTITY_COUNT, 关系: $RELATION_COUNT"
+            else
+                print_success "NER 提取成功!"
+            fi
+        else
+            print_success "NER 提取成功!"
+        fi
+        return 0
+    else
+        print_error "NER 提取失败 (HTTP $HTTP_CODE)"
+        return 1
+    fi
+}
+
 # 显示使用帮助
 show_help() {
     echo "使用方法: $0 [选项] [host] [port]"
     echo ""
+    echo "端到端测试流程: 上传文件 -> 等待解析 -> 向量提取 -> NER提取"
+    echo ""
     echo "选项:"
-    echo "  -h, --help      显示帮助信息"
-    echo "  -p, --poll      上传后轮询解析状态直到完成"
-    echo "  -s, --status    仅查询上次上传的文档状态"
+    echo "  -h, --help        显示帮助信息"
+    echo "  -e, --e2e         执行完整端到端测试 (默认)"
+    echo "  -u, --upload-only 仅执行上传测试"
+    echo "  -s, --status      仅查询上次上传的文档状态"
+    echo "  -v, --vector      仅执行向量提取(使用上次的文档)"
+    echo "  -n, --ner         仅执行NER提取(使用上次的文档)"
     echo ""
     echo "示例:"
-    echo "  $0                      # 使用默认配置 (localhost:5232)"
+    echo "  $0                      # 使用默认配置执行完整端到端测试"
     echo "  $0 192.168.1.100 5232   # 指定服务器地址"
-    echo "  $0 -p                   # 上传并轮询状态"
+    echo "  $0 -u                   # 仅上传文件"
     echo "  $0 -s                   # 查询上次上传的状态"
+    echo "  $0 -v                   # 对上次文档执行向量提取"
+    echo "  $0 -n                   # 对上次文档执行NER提取"
 }
 
 # 主函数
 main() {
-    local POLL_STATUS=false
-    local STATUS_ONLY=false
+    local MODE="e2e"  # 默认执行完整端到端测试
     
     # 解析参数
     while [[ $# -gt 0 ]]; do
@@ -306,19 +479,39 @@ main() {
                 show_help
                 exit 0
                 ;;
-            -p|--poll)
-                POLL_STATUS=true
+            -e|--e2e)
+                MODE="e2e"
+                shift
+                ;;
+            -u|--upload-only)
+                MODE="upload"
                 shift
                 ;;
             -s|--status)
-                STATUS_ONLY=true
+                MODE="status"
+                shift
+                ;;
+            -v|--vector)
+                MODE="vector"
+                shift
+                ;;
+            -n|--ner)
+                MODE="ner"
+                shift
+                ;;
+            -p|--poll)
+                # 兼容旧参数,等同于e2e
+                MODE="e2e"
                 shift
                 ;;
             *)
-                if [[ -z "$HOST_ARG" ]]; then
-                    HOST=$1
-                elif [[ -z "$PORT_ARG" ]]; then
-                    PORT=$1
+                if [[ ! "$1" =~ ^- ]]; then
+                    if [[ -z "$HOST_SET" ]]; then
+                        HOST=$1
+                        HOST_SET=true
+                    else
+                        PORT=$1
+                    fi
                 fi
                 shift
                 ;;
@@ -330,35 +523,107 @@ main() {
     UPLOAD_URL="${BASE_URL}/api/v1/parse/upload"
     STATUS_URL="${BASE_URL}/parse/status"
     REGISTER_URL="${BASE_URL}/auth/register"
+    TEXT_STORAGE_URL="${BASE_URL}/api/v1/graph/text-storage"
+    RAG_INDEX_URL="${BASE_URL}/api/rag/index"
+    NER_DOCUMENT_URL="${BASE_URL}/api/ner/document"
     
-    print_header "文件上传接口测试"
+    print_header "文件上传端到端测试"
     echo "目标服务: $BASE_URL"
+    echo "测试模式: $MODE"
     echo "时间: $(date '+%Y-%m-%d %H:%M:%S')"
     
     check_dependencies
     
-    if [ "$STATUS_ONLY" = true ]; then
-        if [ -f "${SCRIPT_DIR}/.last_document_id" ]; then
-            DOCUMENT_ID=$(cat "${SCRIPT_DIR}/.last_document_id")
-            test_parse_status "$DOCUMENT_ID"
-        else
-            print_error "未找到上次上传的文档ID"
-            exit 1
-        fi
-        exit 0
-    fi
-    
-    check_test_file
-    check_service
-    register_test_user
-    test_upload
-    
-    if [ "$POLL_STATUS" = true ] && [ -f "${SCRIPT_DIR}/.last_document_id" ]; then
-        DOCUMENT_ID=$(cat "${SCRIPT_DIR}/.last_document_id")
-        poll_parse_status "$DOCUMENT_ID"
-    fi
+    # 根据模式执行不同操作
+    case $MODE in
+        status)
+            # 仅查询状态
+            if [ -f "${SCRIPT_DIR}/.last_document_id" ]; then
+                DOCUMENT_ID=$(cat "${SCRIPT_DIR}/.last_document_id")
+                print_header "查询解析状态"
+                test_parse_status "$DOCUMENT_ID"
+            else
+                print_error "未找到上次上传的文档ID"
+                exit 1
+            fi
+            ;;
+        
+        vector)
+            # 仅执行向量提取
+            if [ -f "${SCRIPT_DIR}/.last_document_id" ]; then
+                DOCUMENT_ID=$(cat "${SCRIPT_DIR}/.last_document_id")
+                print_header "向量提取测试"
+                if get_document_text "$DOCUMENT_ID"; then
+                    test_vector_extraction "$DOCUMENT_ID" "$DOCUMENT_TEXT"
+                fi
+            else
+                print_error "未找到上次上传的文档ID"
+                exit 1
+            fi
+            ;;
+        
+        ner)
+            # 仅执行NER提取
+            if [ -f "${SCRIPT_DIR}/.last_document_id" ]; then
+                DOCUMENT_ID=$(cat "${SCRIPT_DIR}/.last_document_id")
+                print_header "NER 提取测试"
+                test_ner_extraction "$DOCUMENT_ID"
+            else
+                print_error "未找到上次上传的文档ID"
+                exit 1
+            fi
+            ;;
+        
+        upload)
+            # 仅上传
+            check_test_file
+            check_service
+            register_test_user
+            test_upload
+            ;;
+        
+        e2e)
+            # 完整端到端测试
+            check_test_file
+            check_service
+            register_test_user
+            
+            print_header "步骤 1/4: 文件上传"
+            test_upload
+            
+            if [ -z "$DOCUMENT_ID" ] && [ -f "${SCRIPT_DIR}/.last_document_id" ]; then
+                DOCUMENT_ID=$(cat "${SCRIPT_DIR}/.last_document_id")
+            fi
+            
+            if [ -z "$DOCUMENT_ID" ]; then
+                print_error "无法获取文档ID,终止测试"
+                exit 1
+            fi
+            
+            print_header "步骤 2/4: 等待解析完成"
+            if ! poll_parse_status "$DOCUMENT_ID" 60 3; then
+                print_error "解析未完成,终止测试"
+                exit 1
+            fi
+            
+            print_header "步骤 3/4: 向量提取"
+            if get_document_text "$DOCUMENT_ID"; then
+                test_vector_extraction "$DOCUMENT_ID" "$DOCUMENT_TEXT"
+            else
+                print_info "跳过向量提取(无法获取文本)"
+            fi
+            
+            print_header "步骤 4/4: NER 提取"
+            test_ner_extraction "$DOCUMENT_ID"
+            ;;
+    esac
     
     print_header "测试完成"
+    echo -e "${GREEN}文档ID: $DOCUMENT_ID${NC}"
+    echo "可使用以下命令进行后续操作:"
+    echo "  $0 -s            # 查询状态"
+    echo "  $0 -v            # 重新向量提取"
+    echo "  $0 -n            # 重新NER提取"
 }
 
 # 运行主函数