#!/bin/bash
# test_upload_api.sh
  2. # ============================================
  3. # 文件上传端到端测试脚本
  4. # ============================================
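# Test flow: upload -> wait for parsing -> vector extraction -> NER extraction -> structured parsing
# Usage: ./test_upload_api.sh [options] [host] [port]   (run with --help for the option list)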
  7. # 示例: ./test_upload_api.sh localhost 5232
  8. # ============================================
  9. # 配置参数
  10. HOST=${1:-localhost}
  11. PORT=${2:-5232}
  12. BASE_URL="http://${HOST}:${PORT}"
  13. UPLOAD_URL="${BASE_URL}/api/v1/parse/upload"
  14. STATUS_URL="${BASE_URL}/parse/status"
  15. REGISTER_URL="${BASE_URL}/auth/register"
  16. TEXT_STORAGE_URL="${BASE_URL}/api/v1/graph/text-storage"
  17. RAG_INDEX_URL="${BASE_URL}/api/rag/index"
  18. NER_DOCUMENT_URL="${BASE_URL}/api/ner/document"
  19. STRUCTURED_URL="${BASE_URL}/parse/structured"
  20. ELEMENTS_URL="${BASE_URL}/parse/elements"
  21. # 测试文件路径(相对于脚本所在目录)
  22. SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
  23. TEST_FILE="${SCRIPT_DIR}/test.docx"
  24. # 测试用户信息
  25. TIMESTAMP=$(date +%s)
  26. TEST_USERNAME="testuser_${TIMESTAMP}"
  27. TEST_EMAIL="testuser_${TIMESTAMP}@test.com"
  28. TEST_PASSWORD="Test123456!"
  29. USER_ID=""
  30. DOCUMENT_ID=""
  31. DOCUMENT_TEXT=""
  32. # 颜色定义
  33. RED='\033[0;31m'
  34. GREEN='\033[0;32m'
  35. YELLOW='\033[1;33m'
  36. BLUE='\033[0;34m'
  37. CYAN='\033[0;36m'
  38. NC='\033[0m' # No Color
  39. # 输出函数
  40. print_header() {
  41. echo -e "\n${BLUE}============================================${NC}"
  42. echo -e "${BLUE}$1${NC}"
  43. echo -e "${BLUE}============================================${NC}"
  44. }
  45. print_step() {
  46. echo -e "\n${CYAN}>>> $1${NC}"
  47. }
  48. print_success() {
  49. echo -e "${GREEN}✓ $1${NC}"
  50. }
  51. print_error() {
  52. echo -e "${RED}✗ $1${NC}"
  53. }
  54. print_info() {
  55. echo -e "${YELLOW}➤ $1${NC}"
  56. }
  57. # 检查依赖
  58. check_dependencies() {
  59. print_header "检查依赖"
  60. if ! command -v curl &> /dev/null; then
  61. print_error "curl 未安装"
  62. exit 1
  63. fi
  64. print_success "curl 已安装"
  65. if ! command -v jq &> /dev/null; then
  66. print_info "jq 未安装,JSON格式化将不可用"
  67. JQ_AVAILABLE=false
  68. else
  69. print_success "jq 已安装"
  70. JQ_AVAILABLE=true
  71. fi
  72. }
  73. # 检查测试文件
  74. check_test_file() {
  75. print_header "检查测试文件"
  76. if [ ! -f "$TEST_FILE" ]; then
  77. print_error "测试文件不存在: $TEST_FILE"
  78. exit 1
  79. fi
  80. FILE_SIZE=$(stat -c%s "$TEST_FILE" 2>/dev/null || stat -f%z "$TEST_FILE" 2>/dev/null)
  81. print_success "测试文件存在: $TEST_FILE"
  82. print_info "文件大小: $FILE_SIZE bytes"
  83. }
  84. # 检查服务是否可用
  85. check_service() {
  86. print_header "检查服务状态"
  87. print_info "测试服务: $BASE_URL"
  88. HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" --connect-timeout 5 "${BASE_URL}/actuator/health" 2>/dev/null)
  89. if [ "$HTTP_CODE" = "200" ]; then
  90. print_success "服务正常运行 (HTTP $HTTP_CODE)"
  91. elif [ "$HTTP_CODE" = "000" ]; then
  92. print_error "无法连接到服务 $BASE_URL"
  93. print_info "请确保 parse-service 正在运行"
  94. exit 1
  95. else
  96. print_info "健康检查返回 HTTP $HTTP_CODE,继续测试..."
  97. fi
  98. }
  99. # 注册测试用户
  100. register_test_user() {
  101. print_header "注册测试用户"
  102. print_info "用户名: $TEST_USERNAME"
  103. print_info "邮箱: $TEST_EMAIL"
  104. print_info "注册URL: $REGISTER_URL"
  105. RESPONSE=$(curl -s -w "\n%{http_code}" \
  106. -X POST "$REGISTER_URL" \
  107. -H "Content-Type: application/json" \
  108. -d "{\"username\":\"${TEST_USERNAME}\",\"email\":\"${TEST_EMAIL}\",\"password\":\"${TEST_PASSWORD}\",\"confirmPassword\":\"${TEST_PASSWORD}\"}" \
  109. --connect-timeout 10)
  110. HTTP_CODE=$(echo "$RESPONSE" | tail -n1)
  111. BODY=$(echo "$RESPONSE" | sed '$d')
  112. echo -e "\n${YELLOW}响应状态码:${NC} $HTTP_CODE"
  113. if [ "$JQ_AVAILABLE" = true ]; then
  114. echo "$BODY" | jq . 2>/dev/null || echo "$BODY"
  115. else
  116. echo "$BODY"
  117. fi
  118. # 解析用户ID
  119. if [ "$HTTP_CODE" = "200" ] || [ "$HTTP_CODE" = "201" ]; then
  120. print_success "用户注册成功!"
  121. if [ "$JQ_AVAILABLE" = true ]; then
  122. USER_ID=$(echo "$BODY" | jq -r '.data.user.id // .data.userId // .userId // empty' 2>/dev/null)
  123. if [ -z "$USER_ID" ] || [ "$USER_ID" = "null" ]; then
  124. # 尝试其他可能的字段
  125. USER_ID=$(echo "$BODY" | jq -r '.data.id // .id // empty' 2>/dev/null)
  126. fi
  127. if [ -n "$USER_ID" ] && [ "$USER_ID" != "null" ]; then
  128. print_info "用户ID: $USER_ID"
  129. echo "$USER_ID" > "${SCRIPT_DIR}/.last_user_id"
  130. else
  131. print_error "无法从响应中获取用户ID"
  132. echo "响应内容: $BODY"
  133. exit 1
  134. fi
  135. fi
  136. else
  137. print_error "用户注册失败 (HTTP $HTTP_CODE)"
  138. print_info "响应: $BODY"
  139. exit 1
  140. fi
  141. }
  142. # 测试文件上传
  143. test_upload() {
  144. print_step "文件上传"
  145. print_info "上传URL: $UPLOAD_URL"
  146. print_info "用户ID: $USER_ID"
  147. print_info "文件: $TEST_FILE"
  148. echo -e "\n发送请求..."
  149. RESPONSE=$(curl -s -w "\n%{http_code}" \
  150. -X POST "$UPLOAD_URL" \
  151. -H "Content-Type: multipart/form-data" \
  152. -F "file=@${TEST_FILE}" \
  153. -F "userId=${USER_ID}" \
  154. --connect-timeout 10 \
  155. --max-time 300)
  156. HTTP_CODE=$(echo "$RESPONSE" | tail -n1)
  157. BODY=$(echo "$RESPONSE" | sed '$d')
  158. echo -e "\n${YELLOW}响应状态码:${NC} $HTTP_CODE"
  159. echo -e "${YELLOW}响应内容:${NC}"
  160. if [ "$JQ_AVAILABLE" = true ]; then
  161. echo "$BODY" | jq . 2>/dev/null || echo "$BODY"
  162. else
  163. echo "$BODY"
  164. fi
  165. # 解析响应
  166. if [ "$HTTP_CODE" = "200" ]; then
  167. print_success "文件上传成功!"
  168. # 提取documentId用于后续操作
  169. if [ "$JQ_AVAILABLE" = true ]; then
  170. DOCUMENT_ID=$(echo "$BODY" | jq -r '.data.documentId // .documentId // empty' 2>/dev/null)
  171. if [ -n "$DOCUMENT_ID" ] && [ "$DOCUMENT_ID" != "null" ]; then
  172. print_info "文档ID: $DOCUMENT_ID"
  173. echo "$DOCUMENT_ID" > "${SCRIPT_DIR}/.last_document_id"
  174. else
  175. print_error "无法从响应中获取文档ID"
  176. return 1
  177. fi
  178. fi
  179. return 0
  180. else
  181. print_error "文件上传失败 (HTTP $HTTP_CODE)"
  182. return 1
  183. fi
  184. }
  185. # 测试解析状态查询(单次)
  186. test_parse_status() {
  187. local DOC_ID=$1
  188. print_info "文档ID: $DOC_ID"
  189. print_info "状态URL: ${STATUS_URL}/${DOC_ID}"
  190. RESPONSE=$(curl -s -w "\n%{http_code}" \
  191. -X GET "${STATUS_URL}/${DOC_ID}" \
  192. --connect-timeout 10)
  193. HTTP_CODE=$(echo "$RESPONSE" | tail -n1)
  194. BODY=$(echo "$RESPONSE" | sed '$d')
  195. echo -e "\n${YELLOW}响应状态码:${NC} $HTTP_CODE"
  196. echo -e "${YELLOW}响应内容:${NC}"
  197. if [ "$JQ_AVAILABLE" = true ]; then
  198. echo "$BODY" | jq . 2>/dev/null || echo "$BODY"
  199. # 显示关键状态信息
  200. if [ "$HTTP_CODE" = "200" ]; then
  201. STATUS=$(echo "$BODY" | jq -r '.data.status // empty' 2>/dev/null)
  202. PROGRESS=$(echo "$BODY" | jq -r '.data.progress // 0' 2>/dev/null)
  203. CURRENT_STEP=$(echo "$BODY" | jq -r '.data.currentStep // empty' 2>/dev/null)
  204. print_info "状态: $STATUS, 进度: ${PROGRESS}%, 当前步骤: $CURRENT_STEP"
  205. fi
  206. else
  207. echo "$BODY"
  208. fi
  209. if [ "$HTTP_CODE" = "200" ]; then
  210. print_success "状态查询成功!"
  211. else
  212. print_error "状态查询失败 (HTTP $HTTP_CODE)"
  213. fi
  214. }
  215. # 轮询解析状态直到完成
  216. poll_parse_status() {
  217. local DOC_ID=$1
  218. local MAX_ATTEMPTS=${2:-60}
  219. local INTERVAL=${3:-3}
  220. print_step "轮询解析状态 (最多${MAX_ATTEMPTS}次, 间隔${INTERVAL}秒)"
  221. for ((i=1; i<=MAX_ATTEMPTS; i++)); do
  222. RESPONSE=$(curl -s "${STATUS_URL}/${DOC_ID}" --connect-timeout 10)
  223. if [ "$JQ_AVAILABLE" = true ]; then
  224. # 状态字段为 status,值为: pending/processing/completed/failed
  225. STATUS=$(echo "$RESPONSE" | jq -r '.data.status // .status // empty' 2>/dev/null)
  226. PROGRESS=$(echo "$RESPONSE" | jq -r '.data.progress // .progress // 0' 2>/dev/null)
  227. echo -ne "\r第 $i 次查询... 状态: $STATUS, 进度: ${PROGRESS}% "
  228. if [ "$STATUS" = "completed" ] || [ "$STATUS" = "COMPLETED" ]; then
  229. echo ""
  230. print_success "解析完成!"
  231. return 0
  232. elif [ "$STATUS" = "failed" ] || [ "$STATUS" = "FAILED" ]; then
  233. echo ""
  234. print_error "解析失败!"
  235. echo "$RESPONSE" | jq .
  236. return 1
  237. fi
  238. else
  239. echo "$RESPONSE"
  240. fi
  241. sleep $INTERVAL
  242. done
  243. echo ""
  244. print_error "轮询超时,解析未完成"
  245. return 1
  246. }
  247. # 获取解析后的文本内容
  248. get_document_text() {
  249. local DOC_ID=$1
  250. print_step "获取文档解析文本"
  251. print_info "文档ID: $DOC_ID"
  252. print_info "请求URL: ${TEXT_STORAGE_URL}/${DOC_ID}"
  253. RESPONSE=$(curl -s -w "\n%{http_code}" \
  254. -X GET "${TEXT_STORAGE_URL}/${DOC_ID}" \
  255. --connect-timeout 10)
  256. HTTP_CODE=$(echo "$RESPONSE" | tail -n1)
  257. BODY=$(echo "$RESPONSE" | sed '$d')
  258. echo -e "${YELLOW}响应状态码:${NC} $HTTP_CODE"
  259. if [ "$HTTP_CODE" = "200" ]; then
  260. if [ "$JQ_AVAILABLE" = true ]; then
  261. # 获取文件路径
  262. FILE_PATH=$(echo "$BODY" | jq -r '.data.filePath // empty' 2>/dev/null)
  263. if [ -n "$FILE_PATH" ] && [ "$FILE_PATH" != "null" ]; then
  264. print_success "获取文本存储记录成功!"
  265. print_info "文件路径: $FILE_PATH"
  266. # 读取文件内容
  267. if [ -f "$FILE_PATH" ]; then
  268. DOCUMENT_TEXT=$(cat "$FILE_PATH" 2>/dev/null)
  269. TEXT_LENGTH=${#DOCUMENT_TEXT}
  270. print_success "读取文本成功 (长度: $TEXT_LENGTH 字符)"
  271. # 显示前200个字符
  272. echo -e "${YELLOW}文本预览:${NC}"
  273. echo "${DOCUMENT_TEXT:0:200}..."
  274. return 0
  275. else
  276. print_error "文件不存在: $FILE_PATH"
  277. return 1
  278. fi
  279. else
  280. print_error "响应中无文件路径"
  281. echo "$BODY" | jq . 2>/dev/null
  282. return 1
  283. fi
  284. fi
  285. else
  286. print_error "获取文本存储失败 (HTTP $HTTP_CODE)"
  287. echo "$BODY"
  288. return 1
  289. fi
  290. }
  291. # 向量提取(RAG 索引)
  292. test_vector_extraction() {
  293. local DOC_ID=$1
  294. local TEXT=$2
  295. print_step "向量提取 (RAG 索引)"
  296. print_info "文档ID: $DOC_ID"
  297. print_info "文本长度: ${#TEXT} 字符"
  298. print_info "请求URL: $RAG_INDEX_URL"
  299. # 构建请求JSON(需要转义文本中的特殊字符)
  300. if [ "$JQ_AVAILABLE" = true ]; then
  301. REQUEST_BODY=$(jq -n \
  302. --arg docId "$DOC_ID" \
  303. --arg text "$TEXT" \
  304. '{documentId: $docId, text: $text}')
  305. else
  306. # 简单转义
  307. ESCAPED_TEXT=$(echo "$TEXT" | sed 's/\\/\\\\/g' | sed 's/"/\\"/g' | tr '\n' ' ')
  308. REQUEST_BODY="{\"documentId\":\"${DOC_ID}\",\"text\":\"${ESCAPED_TEXT}\"}"
  309. fi
  310. RESPONSE=$(curl -s -w "\n%{http_code}" \
  311. -X POST "$RAG_INDEX_URL" \
  312. -H "Content-Type: application/json" \
  313. -d "$REQUEST_BODY" \
  314. --connect-timeout 30 \
  315. --max-time 300)
  316. HTTP_CODE=$(echo "$RESPONSE" | tail -n1)
  317. BODY=$(echo "$RESPONSE" | sed '$d')
  318. echo -e "${YELLOW}响应状态码:${NC} $HTTP_CODE"
  319. echo -e "${YELLOW}响应内容:${NC}"
  320. if [ "$JQ_AVAILABLE" = true ]; then
  321. echo "$BODY" | jq . 2>/dev/null || echo "$BODY"
  322. else
  323. echo "$BODY"
  324. fi
  325. if [ "$HTTP_CODE" = "200" ]; then
  326. if [ "$JQ_AVAILABLE" = true ]; then
  327. CHUNK_COUNT=$(echo "$BODY" | jq -r '.data.chunkCount // empty' 2>/dev/null)
  328. if [ -n "$CHUNK_COUNT" ] && [ "$CHUNK_COUNT" != "null" ]; then
  329. print_success "向量提取成功! 生成 $CHUNK_COUNT 个分块"
  330. else
  331. print_success "向量提取成功!"
  332. fi
  333. else
  334. print_success "向量提取成功!"
  335. fi
  336. return 0
  337. else
  338. print_error "向量提取失败 (HTTP $HTTP_CODE)"
  339. return 1
  340. fi
  341. }
  342. # NER 提取
  343. test_ner_extraction() {
  344. local DOC_ID=$1
  345. print_step "NER 提取 (命名实体识别)"
  346. print_info "文档ID: $DOC_ID"
  347. print_info "请求URL: ${NER_DOCUMENT_URL}/${DOC_ID}"
  348. RESPONSE=$(curl -s -w "\n%{http_code}" \
  349. -X POST "${NER_DOCUMENT_URL}/${DOC_ID}" \
  350. -H "Content-Type: application/json" \
  351. --connect-timeout 30 \
  352. --max-time 300)
  353. HTTP_CODE=$(echo "$RESPONSE" | tail -n1)
  354. BODY=$(echo "$RESPONSE" | sed '$d')
  355. echo -e "${YELLOW}响应状态码:${NC} $HTTP_CODE"
  356. echo -e "${YELLOW}响应内容:${NC}"
  357. if [ "$JQ_AVAILABLE" = true ]; then
  358. echo "$BODY" | jq . 2>/dev/null || echo "$BODY"
  359. else
  360. echo "$BODY"
  361. fi
  362. if [ "$HTTP_CODE" = "200" ]; then
  363. if [ "$JQ_AVAILABLE" = true ]; then
  364. ENTITY_COUNT=$(echo "$BODY" | jq -r '.data.entityCount // empty' 2>/dev/null)
  365. RELATION_COUNT=$(echo "$BODY" | jq -r '.data.relationCount // empty' 2>/dev/null)
  366. if [ -n "$ENTITY_COUNT" ] && [ "$ENTITY_COUNT" != "null" ]; then
  367. print_success "NER 提取成功! 实体: $ENTITY_COUNT, 关系: $RELATION_COUNT"
  368. else
  369. print_success "NER 提取成功!"
  370. fi
  371. else
  372. print_success "NER 提取成功!"
  373. fi
  374. return 0
  375. else
  376. print_error "NER 提取失败 (HTTP $HTTP_CODE)"
  377. return 1
  378. fi
  379. }
  380. # 结构化解析(提取图片和表格)
  381. test_structured_extraction() {
  382. local DOC_ID=$1
  383. print_step "结构化解析 (提取段落、图片、表格)"
  384. print_info "文档ID: $DOC_ID"
  385. print_info "请求URL: ${STRUCTURED_URL}/${DOC_ID}"
  386. RESPONSE=$(curl -s -w "\n%{http_code}" \
  387. -X GET "${STRUCTURED_URL}/${DOC_ID}" \
  388. --connect-timeout 30 \
  389. --max-time 300)
  390. HTTP_CODE=$(echo "$RESPONSE" | tail -n1)
  391. BODY=$(echo "$RESPONSE" | sed '$d')
  392. echo -e "${YELLOW}响应状态码:${NC} $HTTP_CODE"
  393. if [ "$HTTP_CODE" = "200" ]; then
  394. if [ "$JQ_AVAILABLE" = true ]; then
  395. TOTAL=$(echo "$BODY" | jq -r '.data.totalElements // 0' 2>/dev/null)
  396. IMAGE_COUNT=$(echo "$BODY" | jq -r '.data.imageCount // 0' 2>/dev/null)
  397. TABLE_COUNT=$(echo "$BODY" | jq -r '.data.tableCount // 0' 2>/dev/null)
  398. print_success "结构化解析成功!"
  399. print_info "总元素: $TOTAL, 图片: $IMAGE_COUNT, 表格: $TABLE_COUNT"
  400. # 显示图片列表
  401. if [ "$IMAGE_COUNT" -gt 0 ]; then
  402. echo -e "\n${YELLOW}图片列表:${NC}"
  403. echo "$BODY" | jq -r '.data.elements[] | select(.type == "image") | " - \(.imageUrl) (\(.imageWidth)x\(.imageHeight))"' 2>/dev/null
  404. fi
  405. # 显示表格摘要
  406. if [ "$TABLE_COUNT" -gt 0 ]; then
  407. echo -e "\n${YELLOW}表格列表:${NC}"
  408. echo "$BODY" | jq -r '.data.elements[] | select(.type == "table") | " - 表格\(.tableIndex): \(.tableRowCount)行 x \(.tableColCount)列"' 2>/dev/null
  409. fi
  410. else
  411. print_success "结构化解析成功!"
  412. echo "$BODY"
  413. fi
  414. return 0
  415. else
  416. print_error "结构化解析失败 (HTTP $HTTP_CODE)"
  417. if [ "$JQ_AVAILABLE" = true ]; then
  418. echo "$BODY" | jq . 2>/dev/null || echo "$BODY"
  419. else
  420. echo "$BODY"
  421. fi
  422. return 1
  423. fi
  424. }
  425. # 获取图片列表
  426. test_get_images() {
  427. local DOC_ID=$1
  428. print_step "获取文档图片"
  429. print_info "文档ID: $DOC_ID"
  430. print_info "请求URL: ${ELEMENTS_URL}/${DOC_ID}/images"
  431. RESPONSE=$(curl -s -w "\n%{http_code}" \
  432. -X GET "${ELEMENTS_URL}/${DOC_ID}/images" \
  433. --connect-timeout 10)
  434. HTTP_CODE=$(echo "$RESPONSE" | tail -n1)
  435. BODY=$(echo "$RESPONSE" | sed '$d')
  436. echo -e "${YELLOW}响应状态码:${NC} $HTTP_CODE"
  437. if [ "$HTTP_CODE" = "200" ]; then
  438. if [ "$JQ_AVAILABLE" = true ]; then
  439. COUNT=$(echo "$BODY" | jq -r '.data | length' 2>/dev/null)
  440. print_success "获取图片成功! 共 $COUNT 张"
  441. if [ "$COUNT" -gt 0 ]; then
  442. echo -e "${YELLOW}图片详情:${NC}"
  443. echo "$BODY" | jq -r '.data[] | " [\(.elementIndex)] \(.imageUrl) - \(.imageFormat) (\(.imageWidth)x\(.imageHeight))"' 2>/dev/null
  444. fi
  445. else
  446. print_success "获取图片成功!"
  447. echo "$BODY"
  448. fi
  449. return 0
  450. else
  451. print_error "获取图片失败 (HTTP $HTTP_CODE)"
  452. return 1
  453. fi
  454. }
  455. # 获取表格列表
  456. test_get_tables() {
  457. local DOC_ID=$1
  458. print_step "获取文档表格"
  459. print_info "文档ID: $DOC_ID"
  460. print_info "请求URL: ${ELEMENTS_URL}/${DOC_ID}/tables"
  461. RESPONSE=$(curl -s -w "\n%{http_code}" \
  462. -X GET "${ELEMENTS_URL}/${DOC_ID}/tables" \
  463. --connect-timeout 10)
  464. HTTP_CODE=$(echo "$RESPONSE" | tail -n1)
  465. BODY=$(echo "$RESPONSE" | sed '$d')
  466. echo -e "${YELLOW}响应状态码:${NC} $HTTP_CODE"
  467. if [ "$HTTP_CODE" = "200" ]; then
  468. if [ "$JQ_AVAILABLE" = true ]; then
  469. COUNT=$(echo "$BODY" | jq -r '.data | length' 2>/dev/null)
  470. print_success "获取表格成功! 共 $COUNT 个"
  471. if [ "$COUNT" -gt 0 ]; then
  472. echo -e "${YELLOW}表格详情:${NC}"
  473. echo "$BODY" | jq -r '.data[] | " [\(.elementIndex)] 表格\(.tableIndex): \(.tableRowCount)行 x \(.tableColCount)列"' 2>/dev/null
  474. fi
  475. else
  476. print_success "获取表格成功!"
  477. echo "$BODY"
  478. fi
  479. return 0
  480. else
  481. print_error "获取表格失败 (HTTP $HTTP_CODE)"
  482. return 1
  483. fi
  484. }
  485. # 显示使用帮助
  486. show_help() {
  487. echo "使用方法: $0 [选项] [host] [port]"
  488. echo ""
  489. echo "端到端测试流程: 上传文件 -> 等待解析 -> 向量提取 -> NER提取 -> 结构化解析"
  490. echo ""
  491. echo "选项:"
  492. echo " -h, --help 显示帮助信息"
  493. echo " -e, --e2e 执行完整端到端测试 (默认)"
  494. echo " -u, --upload-only 仅执行上传测试"
  495. echo " -s, --status 仅查询上次上传的文档状态"
  496. echo " -v, --vector 仅执行向量提取(使用上次的文档)"
  497. echo " -n, --ner 仅执行NER提取(使用上次的文档)"
  498. echo " -x, --structured 仅执行结构化解析(提取图片和表格)"
  499. echo " -i, --images 仅获取文档图片列表"
  500. echo " -t, --tables 仅获取文档表格列表"
  501. echo ""
  502. echo "示例:"
  503. echo " $0 # 使用默认配置执行完整端到端测试"
  504. echo " $0 192.168.1.100 5232 # 指定服务器地址"
  505. echo " $0 -u # 仅上传文件"
  506. echo " $0 -s # 查询上次上传的状态"
  507. echo " $0 -v # 对上次文档执行向量提取"
  508. echo " $0 -n # 对上次文档执行NER提取"
  509. echo " $0 -x # 对上次文档执行结构化解析"
  510. echo " $0 -i # 获取上次文档的图片列表"
  511. echo " $0 -t # 获取上次文档的表格列表"
  512. }
  513. # 主函数
  514. main() {
  515. local MODE="e2e" # 默认执行完整端到端测试
  516. # 解析参数
  517. while [[ $# -gt 0 ]]; do
  518. case $1 in
  519. -h|--help)
  520. show_help
  521. exit 0
  522. ;;
  523. -e|--e2e)
  524. MODE="e2e"
  525. shift
  526. ;;
  527. -u|--upload-only)
  528. MODE="upload"
  529. shift
  530. ;;
  531. -s|--status)
  532. MODE="status"
  533. shift
  534. ;;
  535. -v|--vector)
  536. MODE="vector"
  537. shift
  538. ;;
  539. -n|--ner)
  540. MODE="ner"
  541. shift
  542. ;;
  543. -x|--structured)
  544. MODE="structured"
  545. shift
  546. ;;
  547. -i|--images)
  548. MODE="images"
  549. shift
  550. ;;
  551. -t|--tables)
  552. MODE="tables"
  553. shift
  554. ;;
  555. -p|--poll)
  556. # 兼容旧参数,等同于e2e
  557. MODE="e2e"
  558. shift
  559. ;;
  560. *)
  561. if [[ ! "$1" =~ ^- ]]; then
  562. if [[ -z "$HOST_SET" ]]; then
  563. HOST=$1
  564. HOST_SET=true
  565. else
  566. PORT=$1
  567. fi
  568. fi
  569. shift
  570. ;;
  571. esac
  572. done
  573. # 更新URL
  574. BASE_URL="http://${HOST}:${PORT}"
  575. UPLOAD_URL="${BASE_URL}/api/v1/parse/upload"
  576. STATUS_URL="${BASE_URL}/parse/status"
  577. REGISTER_URL="${BASE_URL}/auth/register"
  578. TEXT_STORAGE_URL="${BASE_URL}/api/v1/graph/text-storage"
  579. RAG_INDEX_URL="${BASE_URL}/api/rag/index"
  580. NER_DOCUMENT_URL="${BASE_URL}/api/ner/document"
  581. STRUCTURED_URL="${BASE_URL}/parse/structured"
  582. ELEMENTS_URL="${BASE_URL}/parse/elements"
  583. print_header "文件上传端到端测试"
  584. echo "目标服务: $BASE_URL"
  585. echo "测试模式: $MODE"
  586. echo "时间: $(date '+%Y-%m-%d %H:%M:%S')"
  587. check_dependencies
  588. # 根据模式执行不同操作
  589. case $MODE in
  590. status)
  591. # 仅查询状态
  592. if [ -f "${SCRIPT_DIR}/.last_document_id" ]; then
  593. DOCUMENT_ID=$(cat "${SCRIPT_DIR}/.last_document_id")
  594. print_header "查询解析状态"
  595. test_parse_status "$DOCUMENT_ID"
  596. else
  597. print_error "未找到上次上传的文档ID"
  598. exit 1
  599. fi
  600. ;;
  601. vector)
  602. # 仅执行向量提取
  603. if [ -f "${SCRIPT_DIR}/.last_document_id" ]; then
  604. DOCUMENT_ID=$(cat "${SCRIPT_DIR}/.last_document_id")
  605. print_header "向量提取测试"
  606. if get_document_text "$DOCUMENT_ID"; then
  607. test_vector_extraction "$DOCUMENT_ID" "$DOCUMENT_TEXT"
  608. fi
  609. else
  610. print_error "未找到上次上传的文档ID"
  611. exit 1
  612. fi
  613. ;;
  614. ner)
  615. # 仅执行NER提取
  616. if [ -f "${SCRIPT_DIR}/.last_document_id" ]; then
  617. DOCUMENT_ID=$(cat "${SCRIPT_DIR}/.last_document_id")
  618. print_header "NER 提取测试"
  619. test_ner_extraction "$DOCUMENT_ID"
  620. else
  621. print_error "未找到上次上传的文档ID"
  622. exit 1
  623. fi
  624. ;;
  625. structured)
  626. # 仅执行结构化解析
  627. if [ -f "${SCRIPT_DIR}/.last_document_id" ]; then
  628. DOCUMENT_ID=$(cat "${SCRIPT_DIR}/.last_document_id")
  629. print_header "结构化解析测试"
  630. test_structured_extraction "$DOCUMENT_ID"
  631. else
  632. print_error "未找到上次上传的文档ID"
  633. exit 1
  634. fi
  635. ;;
  636. images)
  637. # 仅获取图片列表
  638. if [ -f "${SCRIPT_DIR}/.last_document_id" ]; then
  639. DOCUMENT_ID=$(cat "${SCRIPT_DIR}/.last_document_id")
  640. print_header "获取文档图片"
  641. test_get_images "$DOCUMENT_ID"
  642. else
  643. print_error "未找到上次上传的文档ID"
  644. exit 1
  645. fi
  646. ;;
  647. tables)
  648. # 仅获取表格列表
  649. if [ -f "${SCRIPT_DIR}/.last_document_id" ]; then
  650. DOCUMENT_ID=$(cat "${SCRIPT_DIR}/.last_document_id")
  651. print_header "获取文档表格"
  652. test_get_tables "$DOCUMENT_ID"
  653. else
  654. print_error "未找到上次上传的文档ID"
  655. exit 1
  656. fi
  657. ;;
  658. upload)
  659. # 仅上传
  660. check_test_file
  661. check_service
  662. register_test_user
  663. test_upload
  664. ;;
  665. e2e)
  666. # 完整端到端测试
  667. check_test_file
  668. check_service
  669. register_test_user
  670. print_header "步骤 1/4: 文件上传"
  671. test_upload
  672. if [ -z "$DOCUMENT_ID" ] && [ -f "${SCRIPT_DIR}/.last_document_id" ]; then
  673. DOCUMENT_ID=$(cat "${SCRIPT_DIR}/.last_document_id")
  674. fi
  675. if [ -z "$DOCUMENT_ID" ]; then
  676. print_error "无法获取文档ID,终止测试"
  677. exit 1
  678. fi
  679. print_header "步骤 2/4: 等待解析完成"
  680. if ! poll_parse_status "$DOCUMENT_ID" 60 3; then
  681. print_error "解析未完成,终止测试"
  682. exit 1
  683. fi
  684. print_header "步骤 3/4: 向量提取"
  685. if get_document_text "$DOCUMENT_ID"; then
  686. test_vector_extraction "$DOCUMENT_ID" "$DOCUMENT_TEXT"
  687. else
  688. print_info "跳过向量提取(无法获取文本)"
  689. fi
  690. print_header "步骤 4/5: NER 提取"
  691. test_ner_extraction "$DOCUMENT_ID"
  692. print_header "步骤 5/5: 结构化解析"
  693. test_structured_extraction "$DOCUMENT_ID"
  694. ;;
  695. esac
  696. print_header "测试完成"
  697. echo -e "${GREEN}文档ID: $DOCUMENT_ID${NC}"
  698. echo "可使用以下命令进行后续操作:"
  699. echo " $0 -s # 查询状态"
  700. echo " $0 -v # 重新向量提取"
  701. echo " $0 -n # 重新NER提取"
  702. echo " $0 -x # 结构化解析(提取图片表格)"
  703. echo " $0 -i # 获取图片列表"
  704. echo " $0 -t # 获取表格列表"
  705. }
  706. # 运行主函数
  707. main "$@"