|
|
@@ -18,6 +18,8 @@ REGISTER_URL="${BASE_URL}/auth/register"
|
|
|
TEXT_STORAGE_URL="${BASE_URL}/api/v1/graph/text-storage"
|
|
|
RAG_INDEX_URL="${BASE_URL}/api/rag/index"
|
|
|
NER_DOCUMENT_URL="${BASE_URL}/api/ner/document"
|
|
|
+STRUCTURED_URL="${BASE_URL}/parse/structured"
|
|
|
+ELEMENTS_URL="${BASE_URL}/parse/elements"
|
|
|
|
|
|
# 测试文件路径(相对于脚本所在目录)
|
|
|
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
|
|
@@ -455,11 +457,141 @@ test_ner_extraction() {
|
|
|
fi
|
|
|
}
|
|
|
|
|
|
#######################################
# Structured parsing: request the structured parse result of a document and
# print a summary of its elements, images and tables.
# Globals:
#   STRUCTURED_URL (read) - endpoint prefix; the document ID is appended
#   JQ_AVAILABLE   (read) - the string "true" enables jq-based summaries
#   YELLOW, NC     (read) - printed around labels (presumably ANSI colours)
#   RESPONSE, HTTP_CODE, BODY, TOTAL, IMAGE_COUNT, TABLE_COUNT (written)
# Arguments:
#   $1 - document ID
# Outputs:
#   Request info, the HTTP status code and either a summary or the raw body.
# Returns:
#   0 if the HTTP status code is 200, 1 otherwise.
#######################################
test_structured_extraction() {
    local DOC_ID=$1

    print_step "结构化解析 (提取段落、图片、表格)"

    print_info "文档ID: $DOC_ID"
    print_info "请求URL: ${STRUCTURED_URL}/${DOC_ID}"

    # -w appends the status code on its own final line so the body and the
    # status can be split apart below.
    RESPONSE=$(curl -s -w "\n%{http_code}" \
        -X GET "${STRUCTURED_URL}/${DOC_ID}" \
        --connect-timeout 30 \
        --max-time 300)

    HTTP_CODE=$(printf '%s\n' "$RESPONSE" | tail -n1)
    BODY=$(printf '%s\n' "$RESPONSE" | sed '$d')

    echo -e "${YELLOW}响应状态码:${NC} $HTTP_CODE"

    if [ "$HTTP_CODE" = "200" ]; then
        if [ "$JQ_AVAILABLE" = true ]; then
            TOTAL=$(printf '%s\n' "$BODY" | jq -r '.data.totalElements // 0' 2>/dev/null)
            IMAGE_COUNT=$(printf '%s\n' "$BODY" | jq -r '.data.imageCount // 0' 2>/dev/null)
            TABLE_COUNT=$(printf '%s\n' "$BODY" | jq -r '.data.tableCount // 0' 2>/dev/null)

            # jq prints nothing when the body is not valid JSON; fall back to 0
            # so the numeric comparisons below never see an empty or
            # non-integer operand.
            case "$TOTAL" in '' | *[!0-9]*) TOTAL=0 ;; esac
            case "$IMAGE_COUNT" in '' | *[!0-9]*) IMAGE_COUNT=0 ;; esac
            case "$TABLE_COUNT" in '' | *[!0-9]*) TABLE_COUNT=0 ;; esac

            print_success "结构化解析成功!"
            print_info "总元素: $TOTAL, 图片: $IMAGE_COUNT, 表格: $TABLE_COUNT"

            # Show the image list
            if [ "$IMAGE_COUNT" -gt 0 ]; then
                echo -e "\n${YELLOW}图片列表:${NC}"
                printf '%s\n' "$BODY" | jq -r '.data.elements[] | select(.type == "image") | " - \(.imageUrl) (\(.imageWidth)x\(.imageHeight))"' 2>/dev/null
            fi

            # Show the table summary
            if [ "$TABLE_COUNT" -gt 0 ]; then
                echo -e "\n${YELLOW}表格列表:${NC}"
                printf '%s\n' "$BODY" | jq -r '.data.elements[] | select(.type == "table") | " - 表格\(.tableIndex): \(.tableRowCount)行 x \(.tableColCount)列"' 2>/dev/null
            fi
        else
            print_success "结构化解析成功!"
            printf '%s\n' "$BODY"
        fi
        return 0
    else
        print_error "结构化解析失败 (HTTP $HTTP_CODE)"
        if [ "$JQ_AVAILABLE" = true ]; then
            # Pretty-print when possible, otherwise show the body verbatim.
            printf '%s\n' "$BODY" | jq . 2>/dev/null || printf '%s\n' "$BODY"
        else
            printf '%s\n' "$BODY"
        fi
        return 1
    fi
}
|
|
|
+
|
|
|
#######################################
# Fetch the image list of a document and print one line per image.
# Globals:
#   ELEMENTS_URL (read) - endpoint prefix; "/<id>/images" is appended
#   JQ_AVAILABLE (read) - the string "true" enables jq-based output
#   YELLOW, NC   (read) - printed around labels (presumably ANSI colours)
#   RESPONSE, HTTP_CODE, BODY, COUNT (written)
# Arguments:
#   $1 - document ID
# Outputs:
#   Request info, the HTTP status code and the image details or raw body.
# Returns:
#   0 if the HTTP status code is 200, 1 otherwise.
#######################################
test_get_images() {
    local DOC_ID=$1

    print_step "获取文档图片"

    print_info "文档ID: $DOC_ID"
    print_info "请求URL: ${ELEMENTS_URL}/${DOC_ID}/images"

    # -w appends the status code on its own final line.
    # NOTE(review): only the connect phase is bounded here; there is no
    # --max-time, so a stalled response is not timed out.
    RESPONSE=$(curl -s -w "\n%{http_code}" \
        -X GET "${ELEMENTS_URL}/${DOC_ID}/images" \
        --connect-timeout 10)

    HTTP_CODE=$(printf '%s\n' "$RESPONSE" | tail -n1)
    BODY=$(printf '%s\n' "$RESPONSE" | sed '$d')

    echo -e "${YELLOW}响应状态码:${NC} $HTTP_CODE"

    if [ "$HTTP_CODE" = "200" ]; then
        if [ "$JQ_AVAILABLE" = true ]; then
            COUNT=$(printf '%s\n' "$BODY" | jq -r '.data | length' 2>/dev/null)
            # jq prints nothing on unparseable input; use 0 so the numeric
            # comparison below always has an integer operand.
            case "$COUNT" in '' | *[!0-9]*) COUNT=0 ;; esac
            print_success "获取图片成功! 共 $COUNT 张"

            if [ "$COUNT" -gt 0 ]; then
                echo -e "${YELLOW}图片详情:${NC}"
                printf '%s\n' "$BODY" | jq -r '.data[] | " [\(.elementIndex)] \(.imageUrl) - \(.imageFormat) (\(.imageWidth)x\(.imageHeight))"' 2>/dev/null
            fi
        else
            print_success "获取图片成功!"
            printf '%s\n' "$BODY"
        fi
        return 0
    else
        print_error "获取图片失败 (HTTP $HTTP_CODE)"
        # Show the server's answer so a failure can be diagnosed, as
        # test_structured_extraction does.
        if [ "$JQ_AVAILABLE" = true ]; then
            printf '%s\n' "$BODY" | jq . 2>/dev/null || printf '%s\n' "$BODY"
        else
            printf '%s\n' "$BODY"
        fi
        return 1
    fi
}
|
|
|
+
|
|
|
#######################################
# Fetch the table list of a document and print one line per table.
# Globals:
#   ELEMENTS_URL (read) - endpoint prefix; "/<id>/tables" is appended
#   JQ_AVAILABLE (read) - the string "true" enables jq-based output
#   YELLOW, NC   (read) - printed around labels (presumably ANSI colours)
#   RESPONSE, HTTP_CODE, BODY, COUNT (written)
# Arguments:
#   $1 - document ID
# Outputs:
#   Request info, the HTTP status code and the table details or raw body.
# Returns:
#   0 if the HTTP status code is 200, 1 otherwise.
#######################################
test_get_tables() {
    local DOC_ID=$1

    print_step "获取文档表格"

    print_info "文档ID: $DOC_ID"
    print_info "请求URL: ${ELEMENTS_URL}/${DOC_ID}/tables"

    # -w appends the status code on its own final line.
    # NOTE(review): only the connect phase is bounded here; there is no
    # --max-time, so a stalled response is not timed out.
    RESPONSE=$(curl -s -w "\n%{http_code}" \
        -X GET "${ELEMENTS_URL}/${DOC_ID}/tables" \
        --connect-timeout 10)

    HTTP_CODE=$(printf '%s\n' "$RESPONSE" | tail -n1)
    BODY=$(printf '%s\n' "$RESPONSE" | sed '$d')

    echo -e "${YELLOW}响应状态码:${NC} $HTTP_CODE"

    if [ "$HTTP_CODE" = "200" ]; then
        if [ "$JQ_AVAILABLE" = true ]; then
            COUNT=$(printf '%s\n' "$BODY" | jq -r '.data | length' 2>/dev/null)
            # jq prints nothing on unparseable input; use 0 so the numeric
            # comparison below always has an integer operand.
            case "$COUNT" in '' | *[!0-9]*) COUNT=0 ;; esac
            print_success "获取表格成功! 共 $COUNT 个"

            if [ "$COUNT" -gt 0 ]; then
                echo -e "${YELLOW}表格详情:${NC}"
                printf '%s\n' "$BODY" | jq -r '.data[] | " [\(.elementIndex)] 表格\(.tableIndex): \(.tableRowCount)行 x \(.tableColCount)列"' 2>/dev/null
            fi
        else
            print_success "获取表格成功!"
            printf '%s\n' "$BODY"
        fi
        return 0
    else
        print_error "获取表格失败 (HTTP $HTTP_CODE)"
        # Show the server's answer so a failure can be diagnosed, as
        # test_structured_extraction does.
        if [ "$JQ_AVAILABLE" = true ]; then
            printf '%s\n' "$BODY" | jq . 2>/dev/null || printf '%s\n' "$BODY"
        else
            printf '%s\n' "$BODY"
        fi
        return 1
    fi
}
|
|
|
+
|
|
|
# 显示使用帮助
|
|
|
show_help() {
|
|
|
echo "使用方法: $0 [选项] [host] [port]"
|
|
|
echo ""
|
|
|
- echo "端到端测试流程: 上传文件 -> 等待解析 -> 向量提取 -> NER提取"
|
|
|
+ echo "端到端测试流程: 上传文件 -> 等待解析 -> 向量提取 -> NER提取 -> 结构化解析"
|
|
|
echo ""
|
|
|
echo "选项:"
|
|
|
echo " -h, --help 显示帮助信息"
|
|
|
@@ -468,6 +600,9 @@ show_help() {
|
|
|
echo " -s, --status 仅查询上次上传的文档状态"
|
|
|
echo " -v, --vector 仅执行向量提取(使用上次的文档)"
|
|
|
echo " -n, --ner 仅执行NER提取(使用上次的文档)"
|
|
|
+ echo " -x, --structured 仅执行结构化解析(提取图片和表格)"
|
|
|
+ echo " -i, --images 仅获取文档图片列表"
|
|
|
+ echo " -t, --tables 仅获取文档表格列表"
|
|
|
echo ""
|
|
|
echo "示例:"
|
|
|
echo " $0 # 使用默认配置执行完整端到端测试"
|
|
|
@@ -476,6 +611,9 @@ show_help() {
|
|
|
echo " $0 -s # 查询上次上传的状态"
|
|
|
echo " $0 -v # 对上次文档执行向量提取"
|
|
|
echo " $0 -n # 对上次文档执行NER提取"
|
|
|
+ echo " $0 -x # 对上次文档执行结构化解析"
|
|
|
+ echo " $0 -i # 获取上次文档的图片列表"
|
|
|
+ echo " $0 -t # 获取上次文档的表格列表"
|
|
|
}
|
|
|
|
|
|
# 主函数
|
|
|
@@ -509,6 +647,18 @@ main() {
|
|
|
MODE="ner"
|
|
|
shift
|
|
|
;;
|
|
|
+ -x|--structured)
|
|
|
+ MODE="structured"
|
|
|
+ shift
|
|
|
+ ;;
|
|
|
+ -i|--images)
|
|
|
+ MODE="images"
|
|
|
+ shift
|
|
|
+ ;;
|
|
|
+ -t|--tables)
|
|
|
+ MODE="tables"
|
|
|
+ shift
|
|
|
+ ;;
|
|
|
-p|--poll)
|
|
|
# 兼容旧参数,等同于e2e
|
|
|
MODE="e2e"
|
|
|
@@ -536,6 +686,8 @@ main() {
|
|
|
TEXT_STORAGE_URL="${BASE_URL}/api/v1/graph/text-storage"
|
|
|
RAG_INDEX_URL="${BASE_URL}/api/rag/index"
|
|
|
NER_DOCUMENT_URL="${BASE_URL}/api/ner/document"
|
|
|
+ STRUCTURED_URL="${BASE_URL}/parse/structured"
|
|
|
+ ELEMENTS_URL="${BASE_URL}/parse/elements"
|
|
|
|
|
|
print_header "文件上传端到端测试"
|
|
|
echo "目标服务: $BASE_URL"
|
|
|
@@ -584,6 +736,42 @@ main() {
|
|
|
fi
|
|
|
;;
|
|
|
|
|
|
+ structured)
|
|
|
+ # 仅执行结构化解析
|
|
|
+ if [ -f "${SCRIPT_DIR}/.last_document_id" ]; then
|
|
|
+ DOCUMENT_ID=$(cat "${SCRIPT_DIR}/.last_document_id")
|
|
|
+ print_header "结构化解析测试"
|
|
|
+ test_structured_extraction "$DOCUMENT_ID"
|
|
|
+ else
|
|
|
+ print_error "未找到上次上传的文档ID"
|
|
|
+ exit 1
|
|
|
+ fi
|
|
|
+ ;;
|
|
|
+
|
|
|
+ images)
|
|
|
+ # 仅获取图片列表
|
|
|
+ if [ -f "${SCRIPT_DIR}/.last_document_id" ]; then
|
|
|
+ DOCUMENT_ID=$(cat "${SCRIPT_DIR}/.last_document_id")
|
|
|
+ print_header "获取文档图片"
|
|
|
+ test_get_images "$DOCUMENT_ID"
|
|
|
+ else
|
|
|
+ print_error "未找到上次上传的文档ID"
|
|
|
+ exit 1
|
|
|
+ fi
|
|
|
+ ;;
|
|
|
+
|
|
|
+ tables)
|
|
|
+ # 仅获取表格列表
|
|
|
+ if [ -f "${SCRIPT_DIR}/.last_document_id" ]; then
|
|
|
+ DOCUMENT_ID=$(cat "${SCRIPT_DIR}/.last_document_id")
|
|
|
+ print_header "获取文档表格"
|
|
|
+ test_get_tables "$DOCUMENT_ID"
|
|
|
+ else
|
|
|
+ print_error "未找到上次上传的文档ID"
|
|
|
+ exit 1
|
|
|
+ fi
|
|
|
+ ;;
|
|
|
+
|
|
|
upload)
|
|
|
# 仅上传
|
|
|
check_test_file
|
|
|
@@ -623,8 +811,11 @@ main() {
|
|
|
print_info "跳过向量提取(无法获取文本)"
|
|
|
fi
|
|
|
|
|
|
- print_header "步骤 4/4: NER 提取"
|
|
|
+ print_header "步骤 4/5: NER 提取"
|
|
|
test_ner_extraction "$DOCUMENT_ID"
|
|
|
+
|
|
|
+ print_header "步骤 5/5: 结构化解析"
|
|
|
+ test_structured_extraction "$DOCUMENT_ID"
|
|
|
;;
|
|
|
esac
|
|
|
|
|
|
@@ -634,6 +825,9 @@ main() {
|
|
|
echo " $0 -s # 查询状态"
|
|
|
echo " $0 -v # 重新向量提取"
|
|
|
echo " $0 -n # 重新NER提取"
|
|
|
+ echo " $0 -x # 结构化解析(提取图片表格)"
|
|
|
+ echo " $0 -i # 获取图片列表"
|
|
|
+ echo " $0 -t # 获取表格列表"
|
|
|
}
|
|
|
|
|
|
# 运行主函数
|