#!/bin/bash
# ============================================
# End-to-end test script for the file upload API
# ============================================
# Flow:    upload -> wait for parsing -> vector extraction -> NER extraction
# Usage:   ./test_upload_api.sh [options] [host] [port]
# Example: ./test_upload_api.sh localhost 5232
# ============================================

# Connection defaults. They are deliberately NOT taken from "$1"/"$2" here:
# the first arguments may be option flags (e.g. "-u"), which would otherwise
# end up as the host name. main() assigns positional host/port overrides.
HOST="localhost"
PORT="5232"

#######################################
# (Re)build every endpoint URL from the current HOST and PORT.
# Globals: HOST, PORT (read); BASE_URL, UPLOAD_URL, STATUS_URL, REGISTER_URL,
#          TEXT_STORAGE_URL, RAG_INDEX_URL, NER_DOCUMENT_URL (written)
#######################################
configure_urls() {
  BASE_URL="http://${HOST}:${PORT}"
  UPLOAD_URL="${BASE_URL}/api/v1/parse/upload"
  STATUS_URL="${BASE_URL}/parse/status"
  REGISTER_URL="${BASE_URL}/auth/register"
  TEXT_STORAGE_URL="${BASE_URL}/api/v1/graph/text-storage"
  RAG_INDEX_URL="${BASE_URL}/api/rag/index"
  NER_DOCUMENT_URL="${BASE_URL}/api/ner/document"
}
configure_urls

# Test file path (relative to the directory containing this script)
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TEST_FILE="${SCRIPT_DIR}/test.docx"

# Test user details (the timestamp keeps each run's user unique)
TIMESTAMP=$(date +%s)
TEST_USERNAME="testuser_${TIMESTAMP}"
TEST_EMAIL="testuser_${TIMESTAMP}@test.com"
TEST_PASSWORD="Test123456!"

# State shared between the test steps
USER_ID=""
DOCUMENT_ID=""
DOCUMENT_TEXT=""
JQ_AVAILABLE=false

# Color definitions
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
CYAN='\033[0;36m'
NC='\033[0m' # No Color

# ---------- Output helpers ----------

# Print a boxed section header.
print_header() {
  echo -e "\n${BLUE}============================================${NC}"
  echo -e "${BLUE}$1${NC}"
  echo -e "${BLUE}============================================${NC}"
}

# Print a step title.
print_step() {
  echo -e "\n${CYAN}>>> $1${NC}"
}

# Print a success message.
print_success() {
  echo -e "${GREEN}✓ $1${NC}"
}

# Print an error message.
print_error() {
  echo -e "${RED}✗ $1${NC}"
}

# Print an informational message.
print_info() {
  echo -e "${YELLOW}➤ $1${NC}"
}

#######################################
# Print a response body, pretty-printed through jq when jq is available and
# the body parses as JSON; otherwise print it unchanged.
# Globals:   JQ_AVAILABLE (read)
# Arguments: $1 - response body
#######################################
print_json() {
  if [ "$JQ_AVAILABLE" = true ]; then
    printf '%s\n' "$1" | jq . 2>/dev/null || printf '%s\n' "$1"
  else
    printf '%s\n' "$1"
  fi
}

#######################################
# Load the document ID saved by the previous upload into DOCUMENT_ID.
# Exits the script with status 1 when no usable ID has been saved.
# Globals: SCRIPT_DIR (read), DOCUMENT_ID (written)
#######################################
load_last_document_id() {
  local id_file="${SCRIPT_DIR}/.last_document_id"
  if [ -f "$id_file" ]; then
    DOCUMENT_ID=$(cat "$id_file")
  fi
  if [ -z "$DOCUMENT_ID" ]; then
    print_error "未找到上次上传的文档ID"
    exit 1
  fi
}

# ---------- Pre-flight checks ----------

#######################################
# Verify required tools. curl is mandatory; jq is optional but is needed to
# extract IDs and status fields from JSON responses.
# Globals: JQ_AVAILABLE (written)
#######################################
check_dependencies() {
  print_header "检查依赖"

  if ! command -v curl > /dev/null 2>&1; then
    print_error "curl 未安装"
    exit 1
  fi
  print_success "curl 已安装"

  if command -v jq > /dev/null 2>&1; then
    print_success "jq 已安装"
    JQ_AVAILABLE=true
  else
    print_info "jq 未安装,JSON格式化将不可用"
    JQ_AVAILABLE=false
  fi
}

#######################################
# Ensure the test file exists and report its size. Exits 1 if it is missing.
# Globals: TEST_FILE (read)
#######################################
check_test_file() {
  local file_size

  print_header "检查测试文件"

  if [ ! -f "$TEST_FILE" ]; then
    print_error "测试文件不存在: $TEST_FILE"
    exit 1
  fi

  # GNU stat first, BSD/macOS stat as the fallback.
  file_size=$(stat -c%s "$TEST_FILE" 2>/dev/null || stat -f%z "$TEST_FILE" 2>/dev/null)
  print_success "测试文件存在: $TEST_FILE"
  print_info "文件大小: $file_size bytes"
}

#######################################
# Probe the service health endpoint. Exits 1 only when no connection can be
# made (curl reports HTTP code 000); any other status lets the test continue.
# Globals: BASE_URL (read)
#######################################
check_service() {
  local http_code

  print_header "检查服务状态"
  print_info "测试服务: $BASE_URL"

  http_code=$(curl -s -o /dev/null -w "%{http_code}" --connect-timeout 5 \
    "${BASE_URL}/actuator/health" 2>/dev/null)

  if [ "$http_code" = "200" ]; then
    print_success "服务正常运行 (HTTP $http_code)"
  elif [ "$http_code" = "000" ]; then
    print_error "无法连接到服务 $BASE_URL"
    print_info "请确保 parse-service 正在运行"
    exit 1
  else
    print_info "健康检查返回 HTTP $http_code,继续测试..."
  fi
}

# ---------- Test steps ----------

#######################################
# Register a fresh test user and remember its ID.
# Exits 1 when registration fails, or when jq is available but no user ID
# can be found in the response.
# Globals: REGISTER_URL, TEST_USERNAME, TEST_EMAIL, TEST_PASSWORD, SCRIPT_DIR,
#          JQ_AVAILABLE (read); USER_ID (written)
# Outputs: writes the user ID to ${SCRIPT_DIR}/.last_user_id
#######################################
register_test_user() {
  local payload response http_code body

  print_header "注册测试用户"
  print_info "用户名: $TEST_USERNAME"
  print_info "邮箱: $TEST_EMAIL"
  print_info "注册URL: $REGISTER_URL"

  payload="{\"username\":\"${TEST_USERNAME}\",\"email\":\"${TEST_EMAIL}\",\"password\":\"${TEST_PASSWORD}\",\"confirmPassword\":\"${TEST_PASSWORD}\"}"

  # -w appends the status code on its own final line so it can be split off.
  response=$(curl -s -w "\n%{http_code}" \
    -X POST "$REGISTER_URL" \
    -H "Content-Type: application/json" \
    -d "$payload" \
    --connect-timeout 10)

  http_code=$(printf '%s\n' "$response" | tail -n1)
  body=$(printf '%s\n' "$response" | sed '$d')

  echo -e "\n${YELLOW}响应状态码:${NC} $http_code"
  print_json "$body"

  if [ "$http_code" != "200" ] && [ "$http_code" != "201" ]; then
    print_error "用户注册失败 (HTTP $http_code)"
    print_info "响应: $body"
    exit 1
  fi

  print_success "用户注册成功!"

  # Extract the user ID (only possible with jq).
  if [ "$JQ_AVAILABLE" = true ]; then
    USER_ID=$(printf '%s\n' "$body" \
      | jq -r '.data.user.id // .data.userId // .userId // empty' 2>/dev/null)
    if [ -z "$USER_ID" ] || [ "$USER_ID" = "null" ]; then
      # Try other possible field names.
      USER_ID=$(printf '%s\n' "$body" | jq -r '.data.id // .id // empty' 2>/dev/null)
    fi

    if [ -n "$USER_ID" ] && [ "$USER_ID" != "null" ]; then
      print_info "用户ID: $USER_ID"
      echo "$USER_ID" > "${SCRIPT_DIR}/.last_user_id"
    else
      print_error "无法从响应中获取用户ID"
      echo "响应内容: $body"
      exit 1
    fi
  fi
}

#######################################
# Upload the test file as multipart form data.
# Globals: UPLOAD_URL, USER_ID, TEST_FILE, SCRIPT_DIR, JQ_AVAILABLE (read);
#          DOCUMENT_ID (written)
# Outputs: writes the document ID to ${SCRIPT_DIR}/.last_document_id
# Returns: 0 on HTTP 200 (and, when jq is available, a document ID was found);
#          1 otherwise
#######################################
test_upload() {
  local response http_code body

  print_step "文件上传"
  print_info "上传URL: $UPLOAD_URL"
  print_info "用户ID: $USER_ID"
  print_info "文件: $TEST_FILE"

  echo -e "\n发送请求..."

  # No explicit Content-Type header: with -F curl generates
  # "multipart/form-data; boundary=..." itself, and a hand-written header
  # risks sending the request without the boundary parameter.
  response=$(curl -s -w "\n%{http_code}" \
    -X POST "$UPLOAD_URL" \
    -F "file=@${TEST_FILE}" \
    -F "userId=${USER_ID}" \
    --connect-timeout 10 \
    --max-time 300)

  http_code=$(printf '%s\n' "$response" | tail -n1)
  body=$(printf '%s\n' "$response" | sed '$d')

  echo -e "\n${YELLOW}响应状态码:${NC} $http_code"
  echo -e "${YELLOW}响应内容:${NC}"
  print_json "$body"

  if [ "$http_code" != "200" ]; then
    print_error "文件上传失败 (HTTP $http_code)"
    return 1
  fi

  print_success "文件上传成功!"

  # Extract documentId for the follow-up steps (only possible with jq).
  if [ "$JQ_AVAILABLE" = true ]; then
    DOCUMENT_ID=$(printf '%s\n' "$body" \
      | jq -r '.data.documentId // .documentId // empty' 2>/dev/null)
    if [ -n "$DOCUMENT_ID" ] && [ "$DOCUMENT_ID" != "null" ]; then
      print_info "文档ID: $DOCUMENT_ID"
      echo "$DOCUMENT_ID" > "${SCRIPT_DIR}/.last_document_id"
    else
      print_error "无法从响应中获取文档ID"
      return 1
    fi
  fi
  return 0
}

#######################################
# Query the parse status of a document once.
# Globals:   STATUS_URL, JQ_AVAILABLE (read)
# Arguments: $1 - document ID
# Returns:   0 on HTTP 200, 1 otherwise
#######################################
test_parse_status() {
  local doc_id=$1
  local response http_code body status progress current_step

  print_info "文档ID: $doc_id"
  print_info "状态URL: ${STATUS_URL}/${doc_id}"

  response=$(curl -s -w "\n%{http_code}" \
    -X GET "${STATUS_URL}/${doc_id}" \
    --connect-timeout 10)

  http_code=$(printf '%s\n' "$response" | tail -n1)
  body=$(printf '%s\n' "$response" | sed '$d')

  echo -e "\n${YELLOW}响应状态码:${NC} $http_code"
  echo -e "${YELLOW}响应内容:${NC}"
  print_json "$body"

  # Show the key status fields.
  if [ "$JQ_AVAILABLE" = true ] && [ "$http_code" = "200" ]; then
    status=$(printf '%s\n' "$body" | jq -r '.data.status // empty' 2>/dev/null)
    progress=$(printf '%s\n' "$body" | jq -r '.data.progress // 0' 2>/dev/null)
    current_step=$(printf '%s\n' "$body" | jq -r '.data.currentStep // empty' 2>/dev/null)
    print_info "状态: $status, 进度: ${progress}%, 当前步骤: $current_step"
  fi

  if [ "$http_code" = "200" ]; then
    print_success "状态查询成功!"
    return 0
  fi
  print_error "状态查询失败 (HTTP $http_code)"
  return 1
}

#######################################
# Poll the parse status until the document is completed or failed.
# Without jq the raw response is printed each round and completion cannot be
# detected, so the loop runs until the attempt limit is reached.
# Globals:   STATUS_URL, JQ_AVAILABLE (read)
# Arguments: $1 - document ID
#            $2 - maximum number of attempts (default 60)
#            $3 - seconds to sleep between attempts (default 3)
# Returns:   0 when parsing completed; 1 on failure or timeout
#######################################
poll_parse_status() {
  local doc_id=$1
  local max_attempts=${2:-60}
  local interval=${3:-3}
  local attempt response status progress

  print_step "轮询解析状态 (最多${max_attempts}次, 间隔${interval}秒)"

  for ((attempt = 1; attempt <= max_attempts; attempt++)); do
    # --max-time bounds each request so one hung connection cannot stall the
    # whole polling loop.
    response=$(curl -s "${STATUS_URL}/${doc_id}" --connect-timeout 10 --max-time 30)

    if [ "$JQ_AVAILABLE" = true ]; then
      # The field is "status"; values seen: pending/processing/completed/failed
      status=$(printf '%s\n' "$response" | jq -r '.data.status // .status // empty' 2>/dev/null)
      progress=$(printf '%s\n' "$response" | jq -r '.data.progress // .progress // 0' 2>/dev/null)

      echo -ne "\r第 $attempt 次查询... 状态: $status, 进度: ${progress}% "

      case "$status" in
        completed | COMPLETED)
          echo ""
          print_success "解析完成!"
          return 0
          ;;
        failed | FAILED)
          echo ""
          print_error "解析失败!"
          printf '%s\n' "$response" | jq .
          return 1
          ;;
      esac
    else
      printf '%s\n' "$response"
    fi

    sleep "$interval"
  done

  echo ""
  print_error "轮询超时,解析未完成"
  return 1
}

#######################################
# Fetch the text-storage record of a document and read the parsed text from
# the file path it reports. The path is read from the local filesystem, so
# this only succeeds when that path is visible from where the script runs.
# Globals:   TEXT_STORAGE_URL, JQ_AVAILABLE (read); DOCUMENT_TEXT (written)
# Arguments: $1 - document ID
# Returns:   0 when the text was loaded; 1 otherwise (including missing jq)
#######################################
get_document_text() {
  local doc_id=$1
  local response http_code body file_path

  print_step "获取文档解析文本"
  print_info "文档ID: $doc_id"
  print_info "请求URL: ${TEXT_STORAGE_URL}/${doc_id}"

  response=$(curl -s -w "\n%{http_code}" \
    -X GET "${TEXT_STORAGE_URL}/${doc_id}" \
    --connect-timeout 10)

  http_code=$(printf '%s\n' "$response" | tail -n1)
  body=$(printf '%s\n' "$response" | sed '$d')

  echo -e "${YELLOW}响应状态码:${NC} $http_code"

  if [ "$http_code" != "200" ]; then
    print_error "获取文本存储失败 (HTTP $http_code)"
    printf '%s\n' "$body"
    return 1
  fi

  # Without jq the file path cannot be extracted; report that instead of
  # silently returning success with an empty DOCUMENT_TEXT.
  if [ "$JQ_AVAILABLE" != true ]; then
    print_error "jq 未安装,无法解析文件路径"
    return 1
  fi

  file_path=$(printf '%s\n' "$body" | jq -r '.data.filePath // empty' 2>/dev/null)
  if [ -z "$file_path" ] || [ "$file_path" = "null" ]; then
    print_error "响应中无文件路径"
    printf '%s\n' "$body" | jq . 2>/dev/null
    return 1
  fi

  print_success "获取文本存储记录成功!"
  print_info "文件路径: $file_path"

  if [ ! -f "$file_path" ]; then
    print_error "文件不存在: $file_path"
    return 1
  fi

  # Read the file content; an unreadable file is an error, not empty text.
  if ! DOCUMENT_TEXT=$(cat "$file_path" 2>/dev/null); then
    print_error "无法读取文件: $file_path"
    return 1
  fi
  print_success "读取文本成功 (长度: ${#DOCUMENT_TEXT} 字符)"

  # Show the first 200 characters.
  echo -e "${YELLOW}文本预览:${NC}"
  echo "${DOCUMENT_TEXT:0:200}..."
  return 0
}

#######################################
# Send the document text to the RAG index endpoint (vector extraction).
# Globals:   RAG_INDEX_URL, JQ_AVAILABLE (read)
# Arguments: $1 - document ID
#            $2 - document text
# Returns:   0 on HTTP 200, 1 otherwise
#######################################
test_vector_extraction() {
  local doc_id=$1
  local text=$2
  local request_body escaped_text response http_code body chunk_count

  print_step "向量提取 (RAG 索引)"
  print_info "文档ID: $doc_id"
  print_info "文本长度: ${#text} 字符"
  print_info "请求URL: $RAG_INDEX_URL"

  # Build the request JSON. The text goes to jq (and later to curl) through
  # stdin rather than as a command-line argument, because a single argument
  # is size-limited by the OS and a large document would fail with
  # "Argument list too long".
  if [ "$JQ_AVAILABLE" = true ]; then
    if ! request_body=$(printf '%s' "$text" \
      | jq -Rs --arg docId "$doc_id" '{documentId: $docId, text: .}'); then
      print_error "构建请求JSON失败"
      return 1
    fi
  else
    # Simple escaping: backslashes, double quotes, newlines -> spaces.
    escaped_text=$(printf '%s\n' "$text" | sed 's/\\/\\\\/g' | sed 's/"/\\"/g' | tr '\n' ' ')
    request_body="{\"documentId\":\"${doc_id}\",\"text\":\"${escaped_text}\"}"
  fi

  response=$(printf '%s' "$request_body" | curl -s -w "\n%{http_code}" \
    -X POST "$RAG_INDEX_URL" \
    -H "Content-Type: application/json" \
    --data-binary @- \
    --connect-timeout 30 \
    --max-time 300)

  http_code=$(printf '%s\n' "$response" | tail -n1)
  body=$(printf '%s\n' "$response" | sed '$d')

  echo -e "${YELLOW}响应状态码:${NC} $http_code"
  echo -e "${YELLOW}响应内容:${NC}"
  print_json "$body"

  if [ "$http_code" != "200" ]; then
    print_error "向量提取失败 (HTTP $http_code)"
    return 1
  fi

  chunk_count=""
  if [ "$JQ_AVAILABLE" = true ]; then
    chunk_count=$(printf '%s\n' "$body" | jq -r '.data.chunkCount // empty' 2>/dev/null)
  fi
  if [ -n "$chunk_count" ] && [ "$chunk_count" != "null" ]; then
    print_success "向量提取成功! 生成 $chunk_count 个分块"
  else
    print_success "向量提取成功!"
  fi
  return 0
}

#######################################
# Trigger NER (named entity recognition) extraction for a document.
# Globals:   NER_DOCUMENT_URL, JQ_AVAILABLE (read)
# Arguments: $1 - document ID
# Returns:   0 on HTTP 200, 1 otherwise
#######################################
test_ner_extraction() {
  local doc_id=$1
  local response http_code body entity_count relation_count

  print_step "NER 提取 (命名实体识别)"
  print_info "文档ID: $doc_id"
  print_info "请求URL: ${NER_DOCUMENT_URL}/${doc_id}"

  response=$(curl -s -w "\n%{http_code}" \
    -X POST "${NER_DOCUMENT_URL}/${doc_id}" \
    -H "Content-Type: application/json" \
    --connect-timeout 30 \
    --max-time 300)

  http_code=$(printf '%s\n' "$response" | tail -n1)
  body=$(printf '%s\n' "$response" | sed '$d')

  echo -e "${YELLOW}响应状态码:${NC} $http_code"
  echo -e "${YELLOW}响应内容:${NC}"
  print_json "$body"

  if [ "$http_code" != "200" ]; then
    print_error "NER 提取失败 (HTTP $http_code)"
    return 1
  fi

  entity_count=""
  relation_count=""
  if [ "$JQ_AVAILABLE" = true ]; then
    entity_count=$(printf '%s\n' "$body" | jq -r '.data.entityCount // empty' 2>/dev/null)
    relation_count=$(printf '%s\n' "$body" | jq -r '.data.relationCount // empty' 2>/dev/null)
  fi
  if [ -n "$entity_count" ] && [ "$entity_count" != "null" ]; then
    print_success "NER 提取成功! 实体: $entity_count, 关系: $relation_count"
  else
    print_success "NER 提取成功!"
  fi
  return 0
}

# Show usage help.
show_help() {
  echo "使用方法: $0 [选项] [host] [port]"
  echo ""
  echo "端到端测试流程: 上传文件 -> 等待解析 -> 向量提取 -> NER提取"
  echo ""
  echo "选项:"
  echo " -h, --help 显示帮助信息"
  echo " -e, --e2e 执行完整端到端测试 (默认)"
  echo " -u, --upload-only 仅执行上传测试"
  echo " -s, --status 仅查询上次上传的文档状态"
  echo " -v, --vector 仅执行向量提取(使用上次的文档)"
  echo " -n, --ner 仅执行NER提取(使用上次的文档)"
  echo ""
  echo "示例:"
  echo " $0 # 使用默认配置执行完整端到端测试"
  echo " $0 192.168.1.100 5232 # 指定服务器地址"
  echo " $0 -u # 仅上传文件"
  echo " $0 -s # 查询上次上传的状态"
  echo " $0 -v # 对上次文档执行向量提取"
  echo " $0 -n # 对上次文档执行NER提取"
}

#######################################
# Entry point: parse arguments, then run the selected test mode.
# Arguments: [options] [host] [port] - see show_help
# Returns:   0 when every executed step succeeded, 1 when any step failed
#            (fatal problems exit the script with status 1 directly)
#######################################
main() {
  local mode="e2e" # default: full end-to-end test
  local host_set=""
  local rc=0

  # Parse arguments: flags select the mode; the first non-flag argument is
  # the host and a later one is the port.
  while [[ $# -gt 0 ]]; do
    case "$1" in
      -h | --help)
        show_help
        exit 0
        ;;
      -e | --e2e)
        mode="e2e"
        ;;
      -u | --upload-only)
        mode="upload"
        ;;
      -s | --status)
        mode="status"
        ;;
      -v | --vector)
        mode="vector"
        ;;
      -n | --ner)
        mode="ner"
        ;;
      -p | --poll)
        # Kept for compatibility with the old flag; same as e2e.
        mode="e2e"
        ;;
      -*)
        # Unknown options are ignored, but no longer silently.
        print_info "忽略未知选项: $1"
        ;;
      *)
        if [[ -z "$host_set" ]]; then
          HOST=$1
          host_set=true
        else
          PORT=$1
        fi
        ;;
    esac
    shift
  done

  # Refresh the URLs now that HOST/PORT are final.
  configure_urls

  print_header "文件上传端到端测试"
  echo "目标服务: $BASE_URL"
  echo "测试模式: $mode"
  echo "时间: $(date '+%Y-%m-%d %H:%M:%S')"

  check_dependencies

  # Run the steps for the selected mode.
  case "$mode" in
    status)
      # Status query only
      load_last_document_id
      print_header "查询解析状态"
      test_parse_status "$DOCUMENT_ID" || rc=1
      ;;

    vector)
      # Vector extraction only
      load_last_document_id
      print_header "向量提取测试"
      if get_document_text "$DOCUMENT_ID"; then
        test_vector_extraction "$DOCUMENT_ID" "$DOCUMENT_TEXT" || rc=1
      else
        rc=1
      fi
      ;;

    ner)
      # NER extraction only
      load_last_document_id
      print_header "NER 提取测试"
      test_ner_extraction "$DOCUMENT_ID" || rc=1
      ;;

    upload)
      # Upload only
      check_test_file
      check_service
      register_test_user
      test_upload || rc=1
      ;;

    e2e)
      # Full end-to-end test
      check_test_file
      check_service
      register_test_user

      print_header "步骤 1/4: 文件上传"
      # Stop here if this run's upload produced no document ID. Falling back
      # to a saved .last_document_id would test a document from an earlier
      # run instead of the file just uploaded.
      if ! test_upload || [ -z "$DOCUMENT_ID" ]; then
        print_error "无法获取文档ID,终止测试"
        exit 1
      fi

      print_header "步骤 2/4: 等待解析完成"
      if ! poll_parse_status "$DOCUMENT_ID" 60 3; then
        print_error "解析未完成,终止测试"
        exit 1
      fi

      print_header "步骤 3/4: 向量提取"
      if get_document_text "$DOCUMENT_ID"; then
        test_vector_extraction "$DOCUMENT_ID" "$DOCUMENT_TEXT" || rc=1
      else
        # Best effort: the text file may not be reachable from this machine.
        print_info "跳过向量提取(无法获取文本)"
      fi

      print_header "步骤 4/4: NER 提取"
      test_ner_extraction "$DOCUMENT_ID" || rc=1
      ;;
  esac

  print_header "测试完成"
  echo -e "${GREEN}文档ID: $DOCUMENT_ID${NC}"
  echo "可使用以下命令进行后续操作:"
  echo " $0 -s # 查询状态"
  echo " $0 -v # 重新向量提取"
  echo " $0 -n # 重新NER提取"

  if [ "$rc" -ne 0 ]; then
    print_error "部分测试步骤失败"
  fi
  return "$rc"
}

# Run the main function; the script's exit status is main's return value.
main "$@"