#!/bin/bash
# ============================================
# End-to-end test script for the file upload API
# ============================================
# Flow:    upload -> wait for parsing -> automatic processing
#          (vectors / NER / structured) -> data source operations
# Usage:   ./test_upload_api.sh [options] [host] [port]
# Example: ./test_upload_api.sh localhost 5232
# ============================================

# ---------- Configuration ----------
# Defaults only. Host and port are taken from positional arguments inside
# main(); seeding them from $1/$2 here would turn an option such as "-u"
# into the host name.
HOST="localhost"
PORT="5232"

# Derive every endpoint URL from the current HOST/PORT values.
build_urls() {
  BASE_URL="http://${HOST}:${PORT}"
  UPLOAD_URL="${BASE_URL}/api/v1/parse/upload"
  STATUS_URL="${BASE_URL}/parse/status"
  REGISTER_URL="${BASE_URL}/auth/register"
  TEXT_STORAGE_URL="${BASE_URL}/api/v1/graph/text-storage"
  RAG_INDEX_URL="${BASE_URL}/api/rag/index"
  NER_DOCUMENT_URL="${BASE_URL}/api/ner/document"
  STRUCTURED_URL="${BASE_URL}/parse/structured"
  ELEMENTS_URL="${BASE_URL}/parse/elements"
  DATASOURCE_URL="${BASE_URL}/api/v1/datasource"
  GRAPH_URL="${BASE_URL}/api/graph"
}
build_urls

# Test file path (relative to the directory containing this script)
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TEST_FILE="${SCRIPT_DIR}/test.docx"

# Test user information
TIMESTAMP=$(date +%s)
TEST_USERNAME="testuser_${TIMESTAMP}"
TEST_EMAIL="testuser_${TIMESTAMP}@test.com"
TEST_PASSWORD="Test123456!"
USER_ID=""
DOCUMENT_ID=""
DOCUMENT_TEXT=""
JQ_AVAILABLE=false   # set for real by check_dependencies

# Colors
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
CYAN='\033[0;36m'
NC='\033[0m' # No Color

# ---------- Output helpers ----------
print_header() {
  echo -e "\n${BLUE}============================================${NC}"
  echo -e "${BLUE}$1${NC}"
  echo -e "${BLUE}============================================${NC}"
}

print_step() {
  echo -e "\n${CYAN}>>> $1${NC}"
}

print_success() {
  echo -e "${GREEN}✓ $1${NC}"
}

print_error() {
  echo -e "${RED}✗ $1${NC}"
}

print_info() {
  echo -e "${YELLOW}➤ $1${NC}"
}

# ---------- Shared helpers ----------

# Print JSON text: pretty-printed through jq when available, raw otherwise
# (also raw when jq cannot parse it).
print_json() {
  local json=$1
  if [ "$JQ_AVAILABLE" = true ]; then
    printf '%s\n' "$json" | jq . 2>/dev/null || printf '%s\n' "$json"
  else
    printf '%s\n' "$json"
  fi
}

# Apply jq filter $2 to JSON text $1 and print the raw result.
# Prints nothing when jq fails. Callers must check JQ_AVAILABLE first.
json_field() {
  printf '%s' "$1" | jq -r "$2" 2>/dev/null
}

# Run curl with the given arguments and split its output.
# Sets globals: RESPONSE (raw output), BODY (response body), HTTP_CODE.
# curl's -w appends "\n<status>" so the status is always the last line.
http_request() {
  RESPONSE=$(curl -s -w "\n%{http_code}" "$@")
  HTTP_CODE=${RESPONSE##*$'\n'}
  BODY=${RESPONSE%$'\n'*}
}

# Succeed only when $1 is a decimal integer greater than zero. Guards the
# numeric comparisons against empty/"null" values from a failed jq call.
is_positive() {
  [[ "$1" =~ ^[0-9]+$ ]] && ((10#$1 > 0))
}

# Load the document ID saved by the last upload into DOCUMENT_ID.
# Exits with status 1 (printing $1, or a default message) when none is saved.
load_last_document_id() {
  local id_file="${SCRIPT_DIR}/.last_document_id"
  if [ ! -f "$id_file" ]; then
    print_error "${1:-未找到上次上传的文档ID}"
    exit 1
  fi
  DOCUMENT_ID=$(cat "$id_file")
}

# ---------- Pre-flight checks ----------

# Verify curl is installed (mandatory) and record whether jq is (optional).
check_dependencies() {
  print_header "检查依赖"

  if ! command -v curl &> /dev/null; then
    print_error "curl 未安装"
    exit 1
  fi
  print_success "curl 已安装"

  if ! command -v jq &> /dev/null; then
    print_info "jq 未安装,JSON格式化将不可用"
    JQ_AVAILABLE=false
  else
    print_success "jq 已安装"
    JQ_AVAILABLE=true
  fi
}

# Verify the test document exists and report its size.
check_test_file() {
  local FILE_SIZE
  print_header "检查测试文件"

  if [ ! -f "$TEST_FILE" ]; then
    print_error "测试文件不存在: $TEST_FILE"
    exit 1
  fi

  # GNU stat first, BSD/macOS stat as fallback.
  FILE_SIZE=$(stat -c%s "$TEST_FILE" 2>/dev/null || stat -f%z "$TEST_FILE" 2>/dev/null)
  print_success "测试文件存在: $TEST_FILE"
  print_info "文件大小: $FILE_SIZE bytes"
}

# Probe the health endpoint; abort only when the service is unreachable.
check_service() {
  print_header "检查服务状态"
  print_info "测试服务: $BASE_URL"

  HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" --connect-timeout 5 \
    "${BASE_URL}/actuator/health" 2>/dev/null)

  if [ "$HTTP_CODE" = "200" ]; then
    print_success "服务正常运行 (HTTP $HTTP_CODE)"
  elif [ "$HTTP_CODE" = "000" ]; then
    # curl reports 000 when no HTTP response was received at all.
    print_error "无法连接到服务 $BASE_URL"
    print_info "请确保 parse-service 正在运行"
    exit 1
  else
    print_info "健康检查返回 HTTP $HTTP_CODE,继续测试..."
  fi
}

# ---------- User / upload / parsing ----------

# Register a fresh test user and store its ID in USER_ID and .last_user_id.
# Exits on HTTP failure, or when jq is available but no ID can be extracted.
# Without jq the ID cannot be parsed and USER_ID stays empty.
register_test_user() {
  print_header "注册测试用户"
  print_info "用户名: $TEST_USERNAME"
  print_info "邮箱: $TEST_EMAIL"
  print_info "注册URL: $REGISTER_URL"

  http_request \
    -X POST "$REGISTER_URL" \
    -H "Content-Type: application/json" \
    -d "{\"username\":\"${TEST_USERNAME}\",\"email\":\"${TEST_EMAIL}\",\"password\":\"${TEST_PASSWORD}\",\"confirmPassword\":\"${TEST_PASSWORD}\"}" \
    --connect-timeout 10

  echo -e "\n${YELLOW}响应状态码:${NC} $HTTP_CODE"
  print_json "$BODY"

  if [ "$HTTP_CODE" != "200" ] && [ "$HTTP_CODE" != "201" ]; then
    print_error "用户注册失败 (HTTP $HTTP_CODE)"
    print_info "响应: $BODY"
    exit 1
  fi

  print_success "用户注册成功!"
  if [ "$JQ_AVAILABLE" != true ]; then
    return 0
  fi

  # Try every response layout the script knows about, most specific first.
  USER_ID=$(json_field "$BODY" \
    '.data.user.id // .data.userId // .userId // .data.id // .id // empty')
  if [ -n "$USER_ID" ] && [ "$USER_ID" != "null" ]; then
    print_info "用户ID: $USER_ID"
    echo "$USER_ID" > "${SCRIPT_DIR}/.last_user_id"
  else
    print_error "无法从响应中获取用户ID"
    echo "响应内容: $BODY"
    exit 1
  fi
}

# Upload TEST_FILE on behalf of USER_ID.
# On success stores the ID in DOCUMENT_ID and .last_document_id (jq only).
# Returns 0 on HTTP 200 (and, with jq, a parsable document ID), 1 otherwise.
test_upload() {
  print_step "文件上传"
  print_info "上传URL: $UPLOAD_URL"
  print_info "用户ID: $USER_ID"
  print_info "文件: $TEST_FILE"

  echo -e "\n发送请求..."

  # No explicit Content-Type header: -F makes curl send multipart/form-data
  # with the correct boundary on its own.
  http_request \
    -X POST "$UPLOAD_URL" \
    -F "file=@${TEST_FILE}" \
    -F "userId=${USER_ID}" \
    --connect-timeout 10 \
    --max-time 300

  echo -e "\n${YELLOW}响应状态码:${NC} $HTTP_CODE"
  echo -e "${YELLOW}响应内容:${NC}"
  print_json "$BODY"

  if [ "$HTTP_CODE" != "200" ]; then
    print_error "文件上传失败 (HTTP $HTTP_CODE)"
    return 1
  fi

  print_success "文件上传成功!"
  if [ "$JQ_AVAILABLE" = true ]; then
    # Keep the document ID for the follow-up steps.
    DOCUMENT_ID=$(json_field "$BODY" '.data.documentId // .documentId // empty')
    if [ -n "$DOCUMENT_ID" ] && [ "$DOCUMENT_ID" != "null" ]; then
      print_info "文档ID: $DOCUMENT_ID"
      echo "$DOCUMENT_ID" > "${SCRIPT_DIR}/.last_document_id"
    else
      print_error "无法从响应中获取文档ID"
      return 1
    fi
  fi
  return 0
}

# Query the parse status of document $1 once and print it.
# Returns 0 on HTTP 200, 1 otherwise.
test_parse_status() {
  local DOC_ID=$1
  local STATUS PROGRESS CURRENT_STEP

  print_info "文档ID: $DOC_ID"
  print_info "状态URL: ${STATUS_URL}/${DOC_ID}"

  http_request -X GET "${STATUS_URL}/${DOC_ID}" --connect-timeout 10

  echo -e "\n${YELLOW}响应状态码:${NC} $HTTP_CODE"
  echo -e "${YELLOW}响应内容:${NC}"
  print_json "$BODY"

  if [ "$HTTP_CODE" != "200" ]; then
    print_error "状态查询失败 (HTTP $HTTP_CODE)"
    return 1
  fi

  if [ "$JQ_AVAILABLE" = true ]; then
    # Summarise the key status fields.
    STATUS=$(json_field "$BODY" '.data.status // empty')
    PROGRESS=$(json_field "$BODY" '.data.progress // 0')
    CURRENT_STEP=$(json_field "$BODY" '.data.currentStep // empty')
    print_info "状态: $STATUS, 进度: ${PROGRESS}%, 当前步骤: $CURRENT_STEP"
  fi
  print_success "状态查询成功!"
  return 0
}

# Poll the parse status of document $1 until it completes or fails.
# $2 - maximum number of attempts (default 60)
# $3 - seconds between attempts (default 3)
# Returns 0 when parsing completed, 1 on failure or timeout.
poll_parse_status() {
  local DOC_ID=$1
  local MAX_ATTEMPTS=${2:-60}
  local INTERVAL=${3:-3}
  local i STATUS PROGRESS

  print_step "轮询解析状态 (最多${MAX_ATTEMPTS}次, 间隔${INTERVAL}秒)"

  for ((i = 1; i <= MAX_ATTEMPTS; i++)); do
    RESPONSE=$(curl -s "${STATUS_URL}/${DOC_ID}" --connect-timeout 10)

    if [ "$JQ_AVAILABLE" = true ]; then
      # The field is "status"; values: pending/processing/completed/failed
      STATUS=$(json_field "$RESPONSE" '.data.status // .status // empty')
      PROGRESS=$(json_field "$RESPONSE" '.data.progress // .progress // 0')
      echo -ne "\r第 $i 次查询... 状态: $STATUS, 进度: ${PROGRESS}% "
    else
      echo "$RESPONSE"
      # Without jq, fall back to a loose text match for a terminal status so
      # the loop can still finish before the timeout.
      STATUS=$(printf '%s' "$RESPONSE" \
        | grep -oiE '"status"[[:space:]]*:[[:space:]]*"(completed|failed)"' \
        | head -n1 \
        | grep -oiE 'completed|failed' \
        | tr '[:upper:]' '[:lower:]')
    fi

    case "$STATUS" in
      completed | COMPLETED)
        echo ""
        print_success "解析完成!"
        return 0
        ;;
      failed | FAILED)
        echo ""
        print_error "解析失败!"
        print_json "$RESPONSE"
        return 1
        ;;
    esac

    sleep "$INTERVAL"
  done

  echo ""
  print_error "轮询超时,解析未完成"
  return 1
}

# Fetch the text-storage record of document $1 and load the parsed text into
# DOCUMENT_TEXT. The record's filePath is read from the local filesystem, so
# this presumably only works when the script runs on the service host (or on
# shared storage) - NOTE(review): confirm.
# Returns 0 when the text was loaded, 1 otherwise.
get_document_text() {
  local DOC_ID=$1
  local FILE_PATH TEXT_LENGTH

  print_step "获取文档解析文本"
  print_info "文档ID: $DOC_ID"
  print_info "请求URL: ${TEXT_STORAGE_URL}/${DOC_ID}"

  http_request -X GET "${TEXT_STORAGE_URL}/${DOC_ID}" --connect-timeout 10

  echo -e "${YELLOW}响应状态码:${NC} $HTTP_CODE"

  if [ "$HTTP_CODE" != "200" ]; then
    print_error "获取文本存储失败 (HTTP $HTTP_CODE)"
    echo "$BODY"
    return 1
  fi

  if [ "$JQ_AVAILABLE" != true ]; then
    # The file path cannot be extracted without jq; report failure rather
    # than letting callers continue with an empty text.
    print_error "响应中无文件路径"
    echo "$BODY"
    return 1
  fi

  FILE_PATH=$(json_field "$BODY" '.data.filePath // empty')
  if [ -z "$FILE_PATH" ] || [ "$FILE_PATH" = "null" ]; then
    print_error "响应中无文件路径"
    print_json "$BODY"
    return 1
  fi

  print_success "获取文本存储记录成功!"
  print_info "文件路径: $FILE_PATH"

  if [ ! -f "$FILE_PATH" ]; then
    print_error "文件不存在: $FILE_PATH"
    return 1
  fi

  DOCUMENT_TEXT=$(cat "$FILE_PATH" 2>/dev/null)
  TEXT_LENGTH=${#DOCUMENT_TEXT}
  print_success "读取文本成功 (长度: $TEXT_LENGTH 字符)"

  # Preview the first 200 characters.
  echo -e "${YELLOW}文本预览:${NC}"
  echo "${DOCUMENT_TEXT:0:200}..."
  return 0
}

# ---------- Processing steps ----------

# Vector extraction (RAG indexing) of text $2 for document $1.
# Returns 0 on HTTP 200, 1 otherwise.
test_vector_extraction() {
  local DOC_ID=$1
  local TEXT=$2
  local ESCAPED_TEXT CHUNK_COUNT

  print_step "向量提取 (RAG 索引)"
  print_info "文档ID: $DOC_ID"
  print_info "文本长度: ${#TEXT} 字符"
  print_info "请求URL: $RAG_INDEX_URL"

  # Build the request JSON; the text needs its special characters escaped.
  if [ "$JQ_AVAILABLE" = true ]; then
    REQUEST_BODY=$(jq -n \
      --arg docId "$DOC_ID" \
      --arg text "$TEXT" \
      '{documentId: $docId, text: $text}')
  else
    # Minimal escaping: backslashes and quotes are escaped; newlines, CRs and
    # tabs become spaces because raw control characters are invalid in JSON.
    ESCAPED_TEXT=$(printf '%s' "$TEXT" \
      | sed -e 's/\\/\\\\/g' -e 's/"/\\"/g' \
      | tr '\n\r\t' '   ')
    REQUEST_BODY="{\"documentId\":\"${DOC_ID}\",\"text\":\"${ESCAPED_TEXT}\"}"
  fi

  http_request \
    -X POST "$RAG_INDEX_URL" \
    -H "Content-Type: application/json" \
    -d "$REQUEST_BODY" \
    --connect-timeout 30 \
    --max-time 300

  echo -e "${YELLOW}响应状态码:${NC} $HTTP_CODE"
  echo -e "${YELLOW}响应内容:${NC}"
  print_json "$BODY"

  if [ "$HTTP_CODE" != "200" ]; then
    print_error "向量提取失败 (HTTP $HTTP_CODE)"
    return 1
  fi

  CHUNK_COUNT=""
  if [ "$JQ_AVAILABLE" = true ]; then
    CHUNK_COUNT=$(json_field "$BODY" '.data.chunkCount // empty')
  fi
  if [ -n "$CHUNK_COUNT" ] && [ "$CHUNK_COUNT" != "null" ]; then
    print_success "向量提取成功! 生成 $CHUNK_COUNT 个分块"
  else
    print_success "向量提取成功!"
  fi
  return 0
}

# NER (named entity recognition) extraction for document $1.
# Returns 0 on HTTP 200, 1 otherwise.
test_ner_extraction() {
  local DOC_ID=$1
  local ENTITY_COUNT="" RELATION_COUNT=""

  print_step "NER 提取 (命名实体识别)"
  print_info "文档ID: $DOC_ID"
  print_info "请求URL: ${NER_DOCUMENT_URL}/${DOC_ID}"

  http_request \
    -X POST "${NER_DOCUMENT_URL}/${DOC_ID}" \
    -H "Content-Type: application/json" \
    --connect-timeout 30 \
    --max-time 300

  echo -e "${YELLOW}响应状态码:${NC} $HTTP_CODE"
  echo -e "${YELLOW}响应内容:${NC}"
  print_json "$BODY"

  if [ "$HTTP_CODE" != "200" ]; then
    print_error "NER 提取失败 (HTTP $HTTP_CODE)"
    return 1
  fi

  if [ "$JQ_AVAILABLE" = true ]; then
    ENTITY_COUNT=$(json_field "$BODY" '.data.entityCount // empty')
    RELATION_COUNT=$(json_field "$BODY" '.data.relationCount // empty')
  fi
  if [ -n "$ENTITY_COUNT" ] && [ "$ENTITY_COUNT" != "null" ]; then
    print_success "NER 提取成功! 实体: $ENTITY_COUNT, 关系: $RELATION_COUNT"
  else
    print_success "NER 提取成功!"
  fi
  return 0
}

# Structured parsing (paragraphs, images, tables) for document $1.
# Returns 0 on HTTP 200, 1 otherwise.
test_structured_extraction() {
  local DOC_ID=$1
  local TOTAL IMAGE_COUNT TABLE_COUNT

  print_step "结构化解析 (提取段落、图片、表格)"
  print_info "文档ID: $DOC_ID"
  print_info "请求URL: ${STRUCTURED_URL}/${DOC_ID}"

  http_request \
    -X GET "${STRUCTURED_URL}/${DOC_ID}" \
    --connect-timeout 30 \
    --max-time 300

  echo -e "${YELLOW}响应状态码:${NC} $HTTP_CODE"

  if [ "$HTTP_CODE" != "200" ]; then
    print_error "结构化解析失败 (HTTP $HTTP_CODE)"
    print_json "$BODY"
    return 1
  fi

  print_success "结构化解析成功!"
  if [ "$JQ_AVAILABLE" != true ]; then
    echo "$BODY"
    return 0
  fi

  TOTAL=$(json_field "$BODY" '.data.totalElements // 0')
  IMAGE_COUNT=$(json_field "$BODY" '.data.imageCount // 0')
  TABLE_COUNT=$(json_field "$BODY" '.data.tableCount // 0')
  print_info "总元素: $TOTAL, 图片: $IMAGE_COUNT, 表格: $TABLE_COUNT"

  # Image list
  if is_positive "$IMAGE_COUNT"; then
    echo -e "\n${YELLOW}图片列表:${NC}"
    json_field "$BODY" \
      '.data.elements[] | select(.type == "image") | " - \(.imageUrl) (\(.imageWidth)x\(.imageHeight))"'
  fi

  # Table summary
  if is_positive "$TABLE_COUNT"; then
    echo -e "\n${YELLOW}表格列表:${NC}"
    json_field "$BODY" \
      '.data.elements[] | select(.type == "table") | " - 表格\(.tableIndex): \(.tableRowCount)行 x \(.tableColCount)列"'
  fi
  return 0
}

# List the images extracted from document $1.
# Returns 0 on HTTP 200, 1 otherwise.
test_get_images() {
  local DOC_ID=$1
  local COUNT

  print_step "获取文档图片"
  print_info "文档ID: $DOC_ID"
  print_info "请求URL: ${ELEMENTS_URL}/${DOC_ID}/images"

  http_request -X GET "${ELEMENTS_URL}/${DOC_ID}/images" --connect-timeout 10

  echo -e "${YELLOW}响应状态码:${NC} $HTTP_CODE"

  if [ "$HTTP_CODE" != "200" ]; then
    print_error "获取图片失败 (HTTP $HTTP_CODE)"
    return 1
  fi

  if [ "$JQ_AVAILABLE" = true ]; then
    COUNT=$(json_field "$BODY" '.data | length')
    print_success "获取图片成功! 共 $COUNT 张"
    if is_positive "$COUNT"; then
      echo -e "${YELLOW}图片详情:${NC}"
      json_field "$BODY" \
        '.data[] | " [\(.elementIndex)] \(.imageUrl) - \(.imageFormat) (\(.imageWidth)x\(.imageHeight))"'
    fi
  else
    print_success "获取图片成功!"
    echo "$BODY"
  fi
  return 0
}

# List the tables extracted from document $1.
# Returns 0 on HTTP 200, 1 otherwise.
test_get_tables() {
  local DOC_ID=$1
  local COUNT

  print_step "获取文档表格"
  print_info "文档ID: $DOC_ID"
  print_info "请求URL: ${ELEMENTS_URL}/${DOC_ID}/tables"

  http_request -X GET "${ELEMENTS_URL}/${DOC_ID}/tables" --connect-timeout 10

  echo -e "${YELLOW}响应状态码:${NC} $HTTP_CODE"

  if [ "$HTTP_CODE" != "200" ]; then
    print_error "获取表格失败 (HTTP $HTTP_CODE)"
    return 1
  fi

  if [ "$JQ_AVAILABLE" = true ]; then
    COUNT=$(json_field "$BODY" '.data | length')
    print_success "获取表格成功! 共 $COUNT 个"
    if is_positive "$COUNT"; then
      echo -e "${YELLOW}表格详情:${NC}"
      json_field "$BODY" \
        '.data[] | " [\(.elementIndex)] 表格\(.tableIndex): \(.tableRowCount)行 x \(.tableColCount)列"'
    fi
  else
    print_success "获取表格成功!"
    echo "$BODY"
  fi
  return 0
}

# ============================================
# Data source test functions
# ============================================

# List the GraphNodes of document $1 and remember the first node ID in
# .last_node_id for later bind tests.
# Returns 0 on HTTP 200, 1 otherwise.
test_get_graph_nodes() {
  local DOC_ID=$1
  local COUNT FIRST_NODE_ID

  print_step "获取文档 GraphNode 列表"
  print_info "文档ID: $DOC_ID"
  print_info "请求URL: ${GRAPH_URL}/documents/${DOC_ID}/nodes"

  http_request -X GET "${GRAPH_URL}/documents/${DOC_ID}/nodes" --connect-timeout 10

  echo -e "${YELLOW}响应状态码:${NC} $HTTP_CODE"

  if [ "$HTTP_CODE" != "200" ]; then
    print_error "获取 GraphNode 失败 (HTTP $HTTP_CODE)"
    return 1
  fi

  if [ "$JQ_AVAILABLE" != true ]; then
    print_success "获取 GraphNode 成功!"
    echo "$BODY"
    return 0
  fi

  COUNT=$(json_field "$BODY" '.data | length')
  print_success "获取 GraphNode 成功! 共 $COUNT 个"
  if is_positive "$COUNT"; then
    echo -e "${YELLOW}节点列表 (前10个):${NC}"
    json_field "$BODY" '.data[:10][] | " [\(.id)] \(.nodeType): \(.name)"'

    # Save the first node ID for the follow-up tests.
    FIRST_NODE_ID=$(json_field "$BODY" '.data[0].id // empty')
    if [ -n "$FIRST_NODE_ID" ] && [ "$FIRST_NODE_ID" != "null" ]; then
      echo "$FIRST_NODE_ID" > "${SCRIPT_DIR}/.last_node_id"
      print_info "已保存第一个节点ID: $FIRST_NODE_ID"
    fi
  fi
  return 0
}

# List the data sources of document $1.
# Returns 0 on HTTP 200, 1 otherwise.
test_get_datasources() {
  local DOC_ID=$1
  local COUNT

  print_step "获取文档数据源列表"
  print_info "文档ID: $DOC_ID"
  print_info "请求URL: ${DATASOURCE_URL}/document/${DOC_ID}"

  http_request -X GET "${DATASOURCE_URL}/document/${DOC_ID}" --connect-timeout 10

  echo -e "${YELLOW}响应状态码:${NC} $HTTP_CODE"

  if [ "$HTTP_CODE" != "200" ]; then
    print_error "获取数据源失败 (HTTP $HTTP_CODE)"
    return 1
  fi

  if [ "$JQ_AVAILABLE" = true ]; then
    COUNT=$(json_field "$BODY" '.data | length')
    print_success "获取数据源成功! 共 $COUNT 个"
    if is_positive "$COUNT"; then
      echo -e "${YELLOW}数据源列表:${NC}"
      json_field "$BODY" \
        '.data[] | " [\(.id)] \(.name) (\(.type)) - 值类型: \(.valueType), 聚合: \(.aggregateType)"'
    fi
  else
    print_success "获取数据源成功!"
    echo "$BODY"
  fi
  return 0
}

# Create a data source.
# $1 - document ID          $2 - name           $3 - type
# $4 - value type (default "text")   $5 - aggregate type (default "first")
# On success (with jq) the new ID is saved to .last_datasource_id.
# Returns 0 on HTTP 200/201, 1 otherwise.
#
# NOTE(review): the original request-building code for this function was
# lost. The endpoint (POST to DATASOURCE_URL), the payload field names and
# the ID filter below are reconstructed from the function parameters and
# from the field names used by test_get_datasources - confirm against the
# API before relying on them.
test_create_datasource() {
  local DOC_ID=$1
  local NAME=$2
  local TYPE=$3
  local VALUE_TYPE=${4:-text}
  local AGGREGATE_TYPE=${5:-first}
  local NEW_DS_ID

  print_step "创建数据源"
  print_info "文档ID: $DOC_ID"
  print_info "名称: $NAME"
  print_info "类型: $TYPE"
  print_info "值类型: $VALUE_TYPE"
  print_info "聚合方式: $AGGREGATE_TYPE"

  # Use the user registered by the last run, if any.
  local ACTUAL_USER_ID="default-user"
  if [ -f "${SCRIPT_DIR}/.last_user_id" ]; then
    ACTUAL_USER_ID=$(cat "${SCRIPT_DIR}/.last_user_id")
  fi

  if [ "$JQ_AVAILABLE" = true ]; then
    # jq takes care of escaping quotes etc. in the user-supplied name.
    REQUEST_BODY=$(jq -n \
      --arg documentId "$DOC_ID" \
      --arg userId "$ACTUAL_USER_ID" \
      --arg name "$NAME" \
      --arg type "$TYPE" \
      --arg valueType "$VALUE_TYPE" \
      --arg aggregateType "$AGGREGATE_TYPE" \
      '{documentId: $documentId, userId: $userId, name: $name, type: $type,
        valueType: $valueType, aggregateType: $aggregateType}')
  else
    REQUEST_BODY="{\"documentId\":\"${DOC_ID}\",\"userId\":\"${ACTUAL_USER_ID}\",\"name\":\"${NAME}\",\"type\":\"${TYPE}\",\"valueType\":\"${VALUE_TYPE}\",\"aggregateType\":\"${AGGREGATE_TYPE}\"}"
  fi

  http_request \
    -X POST "$DATASOURCE_URL" \
    -H "Content-Type: application/json" \
    -d "$REQUEST_BODY" \
    --connect-timeout 10

  echo -e "${YELLOW}响应状态码:${NC} $HTTP_CODE"

  if [ "$HTTP_CODE" != "200" ] && [ "$HTTP_CODE" != "201" ]; then
    print_error "创建数据源失败 (HTTP $HTTP_CODE)"
    print_json "$BODY"
    return 1
  fi

  print_success "创建数据源成功!"
  if [ "$JQ_AVAILABLE" = true ]; then
    NEW_DS_ID=$(json_field "$BODY" '.data.id // .id // empty')
    print_info "数据源ID: $NEW_DS_ID"
    # Never overwrite the saved ID with an empty value.
    if [ -n "$NEW_DS_ID" ] && [ "$NEW_DS_ID" != "null" ]; then
      echo "$NEW_DS_ID" > "${SCRIPT_DIR}/.last_datasource_id"
    fi
    json_field "$BODY" '.data'
  else
    echo "$BODY"
  fi
  return 0
}

# Bind nodes to a data source.
# $1 - data source ID
# $2 - node type: graph_node or document_element
# $3 - comma-separated list of node IDs
# $4 - mode: replace/append/remove (default "append")
# Returns 0 on HTTP 200, 1 otherwise.
#
# NOTE(review): the original request code for this function was lost. The
# endpoint path (overridable through DATASOURCE_BIND_PATH), the HTTP method,
# the "refs"/"mode" payload keys and the success message are reconstructed -
# confirm against the API.
test_bind_nodes_to_datasource() {
  local DS_ID=$1
  local NODE_TYPE=$2
  local NODE_IDS=$3
  local MODE=${4:-append}
  local BIND_URL="${DATASOURCE_URL}/${DS_ID}/${DATASOURCE_BIND_PATH:-bind}"
  local IDS id

  print_step "绑定节点到数据源"
  print_info "数据源ID: $DS_ID"
  print_info "节点类型: $NODE_TYPE"
  print_info "节点IDs: $NODE_IDS"
  print_info "模式: $MODE"

  # Build the refs array: one {"type", "id"} object per node ID.
  local REFS_ARRAY="["
  local FIRST=true
  IFS=',' read -ra IDS <<< "$NODE_IDS"
  for id in "${IDS[@]}"; do
    if [ "$FIRST" = true ]; then
      FIRST=false
    else
      REFS_ARRAY+=","
    fi
    REFS_ARRAY+="{\"type\":\"$NODE_TYPE\",\"id\":\"$id\"}"
  done
  REFS_ARRAY+="]"

  REQUEST_BODY="{\"refs\":${REFS_ARRAY},\"mode\":\"${MODE}\"}"

  http_request \
    -X POST "$BIND_URL" \
    -H "Content-Type: application/json" \
    -d "$REQUEST_BODY" \
    --connect-timeout 10

  echo -e "${YELLOW}响应状态码:${NC} $HTTP_CODE"

  if [ "$HTTP_CODE" != "200" ]; then
    print_error "绑定节点失败 (HTTP $HTTP_CODE)"
    print_json "$BODY"
    return 1
  fi

  print_success "绑定节点成功!"
  if [ "$JQ_AVAILABLE" = true ]; then
    json_field "$BODY" '.data'
  else
    echo "$BODY"
  fi
  return 0
}

# Fetch the current value of data source $1.
# Returns 0 on HTTP 200, 1 otherwise.
test_get_datasource_value() {
  local DS_ID=$1

  print_step "获取数据源值"
  print_info "数据源ID: $DS_ID"
  print_info "请求URL: ${DATASOURCE_URL}/${DS_ID}/value"

  http_request -X GET "${DATASOURCE_URL}/${DS_ID}/value" --connect-timeout 10

  echo -e "${YELLOW}响应状态码:${NC} $HTTP_CODE"

  if [ "$HTTP_CODE" != "200" ]; then
    print_error "获取数据源值失败 (HTTP $HTTP_CODE)"
    print_json "$BODY"
    return 1
  fi

  print_success "获取数据源值成功!"
  if [ "$JQ_AVAILABLE" = true ]; then
    json_field "$BODY" '.data'
  else
    echo "$BODY"
  fi
  return 0
}

# Full data source flow for document $1:
# list nodes -> list data sources -> create one -> bind a node -> read value.
test_datasource_flow() {
  local DOC_ID=$1
  local DS_ID NODE_ID

  print_header "数据源完整测试流程"

  # 1. List GraphNodes (also records the first node ID)
  test_get_graph_nodes "$DOC_ID"

  # 2. List existing data sources
  test_get_datasources "$DOC_ID"

  # 3. Create a test data source
  print_step "创建测试数据源"
  if test_create_datasource "$DOC_ID" "测试数据源_$(date +%s)" "entity" "text" "concat"; then
    DS_ID=$(cat "${SCRIPT_DIR}/.last_datasource_id" 2>/dev/null)

    # 4. Bind a node when one is known
    if [ -f "${SCRIPT_DIR}/.last_node_id" ]; then
      NODE_ID=$(cat "${SCRIPT_DIR}/.last_node_id")
      test_bind_nodes_to_datasource "$DS_ID" "graph_node" "$NODE_ID" "append"

      # 5. Read the data source value
      test_get_datasource_value "$DS_ID"
    else
      print_info "没有可用的节点ID,跳过绑定测试"
    fi
  fi
}

# Print the usage help.
show_help() {
  cat <<EOF
使用方法: $0 [选项] [host] [port]

端到端测试流程: 上传文件 -> 等待解析 -> 自动处理 -> 数据源操作

选项:
  -h, --help          显示帮助信息
  -e, --e2e           执行完整端到端测试 (默认)
  -u, --upload-only   仅执行上传测试
  -s, --status        仅查询上次上传的文档状态
  -v, --vector        仅执行向量提取(使用上次的文档)
  -n, --ner           仅执行NER提取(使用上次的文档)
  -x, --structured    仅执行结构化解析(提取图片和表格)
  -i, --images        仅获取文档图片列表
  -t, --tables        仅获取文档表格列表
  -g, --nodes         获取文档的 GraphNode 列表
  -d, --datasource    获取文档的数据源列表
  --ds-create         创建数据源 (需要 --name 和 --type)
  --ds-bind           绑定节点到数据源
  --ds-value          获取数据源的值
  --ds-flow           执行数据源完整测试流程

数据源相关参数:
  --name NAME         数据源名称
  --type TYPE         数据源类型 (entity/paragraph/image/table)
  --value-type TYPE   值类型 (text/image/table/mixed)
  --aggregate TYPE    聚合方式 (first/last/concat/sum/avg/list)
  --ds-id ID          数据源ID
  --node-type TYPE    节点类型 (graph_node/document_element)
  --node-ids IDS      节点ID列表 (逗号分隔)

示例:
  $0                        # 完整端到端测试
  $0 192.168.1.100 5232     # 指定服务器地址
  $0 -u                     # 仅上传文件
  $0 -s                     # 查询上次上传的状态
  $0 -g                     # 获取文档的 GraphNode 列表
  $0 -d                     # 获取文档的数据源列表
  $0 --ds-flow              # 执行数据源完整测试流程
  $0 --ds-create --name '报告编号' --type entity
  $0 --ds-bind --ds-id xxx --node-type graph_node --node-ids 'id1,id2'
  $0 --ds-value --ds-id xxx
EOF
}

# Entry point: parse the command line, then run the selected test mode.
# Returns non-zero when the selected test step failed.
main() {
  local MODE="e2e" # full end-to-end test by default
  local HOST_SET=""
  local rc=0

  # Data source parameters
  local DS_NAME=""
  local DS_TYPE=""
  local DS_VALUE_TYPE="text"
  local DS_AGGREGATE="first"
  local DS_ID=""
  local NODE_TYPE="graph_node"
  local NODE_IDS=""

  # Parse arguments
  while [[ $# -gt 0 ]]; do
    case $1 in
      -h | --help)
        show_help
        exit 0
        ;;
      -e | --e2e | -p | --poll) MODE="e2e" ;; # -p kept for compatibility
      -u | --upload-only) MODE="upload" ;;
      -s | --status) MODE="status" ;;
      -v | --vector) MODE="vector" ;;
      -n | --ner) MODE="ner" ;;
      -x | --structured) MODE="structured" ;;
      -i | --images) MODE="images" ;;
      -t | --tables) MODE="tables" ;;
      -g | --nodes) MODE="nodes" ;;
      -d | --datasource) MODE="datasource" ;;
      --ds-create) MODE="ds-create" ;;
      --ds-bind) MODE="ds-bind" ;;
      --ds-value) MODE="ds-value" ;;
      --ds-flow) MODE="ds-flow" ;;
      --name | --type | --value-type | --aggregate | --ds-id | --node-type | --node-ids)
        # A failing "shift 2" shifts nothing, which would loop forever when
        # the value is missing - so reject that case explicitly.
        if [[ $# -lt 2 ]]; then
          print_error "选项 $1 缺少参数值"
          exit 1
        fi
        case $1 in
          --name) DS_NAME=$2 ;;
          --type) DS_TYPE=$2 ;;
          --value-type) DS_VALUE_TYPE=$2 ;;
          --aggregate) DS_AGGREGATE=$2 ;;
          --ds-id) DS_ID=$2 ;;
          --node-type) NODE_TYPE=$2 ;;
          --node-ids) NODE_IDS=$2 ;;
        esac
        shift
        ;;
      -*)
        print_info "忽略未知选项: $1"
        ;;
      *)
        # First positional argument is the host, any later one the port.
        if [[ -z "$HOST_SET" ]]; then
          HOST=$1
          HOST_SET=true
        else
          PORT=$1
        fi
        ;;
    esac
    shift
  done

  # Refresh the URLs for the final host/port.
  build_urls

  print_header "文件上传端到端测试"
  echo "目标服务: $BASE_URL"
  echo "测试模式: $MODE"
  echo "时间: $(date '+%Y-%m-%d %H:%M:%S')"

  check_dependencies

  # Run the selected mode
  case $MODE in
    status)
      load_last_document_id
      print_header "查询解析状态"
      test_parse_status "$DOCUMENT_ID" || rc=$?
      ;;

    vector)
      load_last_document_id
      print_header "向量提取测试"
      if get_document_text "$DOCUMENT_ID"; then
        test_vector_extraction "$DOCUMENT_ID" "$DOCUMENT_TEXT" || rc=$?
      else
        rc=1
      fi
      ;;

    ner)
      load_last_document_id
      print_header "NER 提取测试"
      test_ner_extraction "$DOCUMENT_ID" || rc=$?
      ;;

    structured)
      load_last_document_id
      print_header "结构化解析测试"
      test_structured_extraction "$DOCUMENT_ID" || rc=$?
      ;;

    images)
      load_last_document_id
      print_header "获取文档图片"
      test_get_images "$DOCUMENT_ID" || rc=$?
      ;;

    tables)
      load_last_document_id
      print_header "获取文档表格"
      test_get_tables "$DOCUMENT_ID" || rc=$?
      ;;

    nodes)
      load_last_document_id
      print_header "获取文档 GraphNode"
      test_get_graph_nodes "$DOCUMENT_ID" || rc=$?
      ;;

    datasource)
      load_last_document_id
      print_header "获取文档数据源"
      test_get_datasources "$DOCUMENT_ID" || rc=$?
      ;;

    ds-create)
      if [ -z "$DS_NAME" ]; then
        print_error "请指定数据源名称 (--name)"
        exit 1
      fi
      if [ -z "$DS_TYPE" ]; then
        print_error "请指定数据源类型 (--type)"
        exit 1
      fi
      load_last_document_id
      print_header "创建数据源"
      test_create_datasource "$DOCUMENT_ID" "$DS_NAME" "$DS_TYPE" \
        "$DS_VALUE_TYPE" "$DS_AGGREGATE" || rc=$?
      ;;

    ds-bind)
      # Fall back to the IDs remembered from earlier runs.
      if [ -z "$DS_ID" ] && [ -f "${SCRIPT_DIR}/.last_datasource_id" ]; then
        DS_ID=$(cat "${SCRIPT_DIR}/.last_datasource_id")
      fi
      if [ -z "$DS_ID" ]; then
        print_error "请指定数据源ID (--ds-id)"
        exit 1
      fi
      if [ -z "$NODE_IDS" ] && [ -f "${SCRIPT_DIR}/.last_node_id" ]; then
        NODE_IDS=$(cat "${SCRIPT_DIR}/.last_node_id")
      fi
      if [ -z "$NODE_IDS" ]; then
        print_error "请指定节点ID (--node-ids)"
        exit 1
      fi
      print_header "绑定节点到数据源"
      test_bind_nodes_to_datasource "$DS_ID" "$NODE_TYPE" "$NODE_IDS" "append" || rc=$?
      ;;

    ds-value)
      if [ -z "$DS_ID" ] && [ -f "${SCRIPT_DIR}/.last_datasource_id" ]; then
        DS_ID=$(cat "${SCRIPT_DIR}/.last_datasource_id")
      fi
      if [ -z "$DS_ID" ]; then
        print_error "请指定数据源ID (--ds-id)"
        exit 1
      fi
      print_header "获取数据源值"
      test_get_datasource_value "$DS_ID" || rc=$?
      ;;

    ds-flow)
      load_last_document_id "未找到上次上传的文档ID,请先上传文档"
      test_datasource_flow "$DOCUMENT_ID" || rc=$?
      ;;

    upload)
      check_test_file
      check_service
      register_test_user
      test_upload || rc=$?
      ;;

    e2e)
      # Note: the upload automatically triggers RAG vectorisation,
      # structured parsing and NER extraction on the server.
      check_test_file
      check_service
      register_test_user

      print_header "步骤 1/3: 文件上传"
      print_info "上传后将自动触发: RAG向量化 + 结构化解析 + NER提取"
      # A failed upload must stop the run; otherwise the steps below would
      # silently operate on the document of a previous run.
      if ! test_upload; then
        print_error "上传失败,终止测试"
        exit 1
      fi

      if [ -z "$DOCUMENT_ID" ] && [ -f "${SCRIPT_DIR}/.last_document_id" ]; then
        # Only reachable without jq (the response could not be parsed):
        # best effort using the ID saved by an earlier run.
        DOCUMENT_ID=$(cat "${SCRIPT_DIR}/.last_document_id")
        print_info "使用上次保存的文档ID: $DOCUMENT_ID"
      fi

      if [ -z "$DOCUMENT_ID" ]; then
        print_error "无法获取文档ID,终止测试"
        exit 1
      fi

      print_header "步骤 2/3: 等待解析完成"
      if ! poll_parse_status "$DOCUMENT_ID" 60 3; then
        print_error "解析未完成,终止测试"
        exit 1
      fi

      # Wait for the automatic processing (RAG + structured parsing + NER).
      # NER may take several minutes (about 4-5), so only wait briefly and
      # show the initial results.
      print_header "步骤 3/3: 等待后台自动处理并查看结果"
      print_info "后台正在自动执行: RAG向量化、结构化解析、NER提取"
      print_info "NER 提取可能需要几分钟,可稍后使用 -g 查看 GraphNode 列表"
      print_info "等待 10 秒后查看初始结果..."
      sleep 10

      # Structured parsing results (usually finished quickly)
      print_step "查看结构化解析结果"
      test_get_images "$DOCUMENT_ID" || rc=$?
      test_get_tables "$DOCUMENT_ID" || rc=$?

      # GraphNodes (may still be empty while NER is running)
      print_step "查看 GraphNode 列表 (NER 结果)"
      test_get_graph_nodes "$DOCUMENT_ID" || rc=$?

      print_info ""
      print_info "提示: NER 提取需要几分钟,可稍后运行 '$0 -g' 查看完整结果"
      ;;
  esac

  print_header "测试完成"
  if [ -n "$DOCUMENT_ID" ]; then
    echo -e "${GREEN}文档ID: $DOCUMENT_ID${NC}"
  fi
  if [ -f "${SCRIPT_DIR}/.last_datasource_id" ]; then
    echo -e "${GREEN}最后数据源ID: $(cat "${SCRIPT_DIR}/.last_datasource_id")${NC}"
  fi

  cat <<EOF

可使用以下命令进行后续操作:

  === 文档处理 ===
  $0 -s          # 查询解析状态
  $0 -v          # 重新向量提取
  $0 -n          # 重新NER提取
  $0 -x          # 结构化解析
  $0 -i          # 获取图片列表
  $0 -t          # 获取表格列表

  === 数据源操作 ===
  $0 -g          # 获取 GraphNode 列表
  $0 -d          # 获取数据源列表
  $0 --ds-flow   # 执行数据源完整测试
  $0 --ds-create --name '名称' --type entity
  $0 --ds-bind --node-ids 'id1,id2'
  $0 --ds-value
EOF

  return "$rc"
}

# Run the entry point
main "$@"