test_api.py 43 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179
  1. #!/usr/bin/env python3
  2. # -*- coding: utf-8 -*-
  3. """
  4. PDF Converter API 测试脚本
  5. 测试新增的投资类型:
  6. - fsApproval: 可研批复
  7. - fsReview: 可研评审
  8. - pdApproval: 初设批复
  9. - safetyFsApproval: 安评可研批复
  10. 以及现有类型:
  11. - settlementReport: 结算报告
  12. - designReview: 初设评审
  13. """
  14. import os
  15. import sys
  16. import json
  17. import time
  18. import base64
  19. import requests
  20. from pathlib import Path
  21. from typing import Optional, Dict, Any, List
# API configuration. The base URL is taken from the PDF_CONVERTER_API_URL
# environment variable; when that is unset the literal address below is used.
# NOTE(review): the original comment described the default as the local
# machine on port 4214, but the fallback is a fixed remote address -- confirm
# which one is intended.
API_BASE_URL = os.getenv("PDF_CONVERTER_API_URL", "http://47.108.80.98:4214")
# Test file configuration: fixtures are looked up in a "test" directory that
# sits next to this script.
TEST_DIR = Path(__file__).parent / "test"
# Test cases: file name -> (document type, remove watermark, keep only table attachments)
# Accepted value shapes:
# "file name": ("type", remove_watermark, table_only) - full form
# "file name": ("type", remove_watermark) - compatibility form, table_only defaults to True
# "file name": "type" - legacy form, remove_watermark False, table_only True
TEST_CASES = {
    # Newly added investment types
    # NOTE(review): the original trailing comment on the next entry said
    # "watermark removal + table attachments only", but the third flag is
    # False -- confirm which is intended.
    "鄂电司发展〔2024〕124号 国网湖北省电力有限公司关于襄阳连云220千伏输变电工程可行性研究报告的批复.pdf": ("safetyFsApproval", True,False), # watermark removal enabled; table_only is False
    "2-(可研批复)晋电发展〔2017〕831号+国网山西省电力公司关于临汾古县、晋城周村220kV输变电等工程可行性研究报告的批复.pdf.pdf": "fsApproval",
    "1-(可研评审)晋电经研规划〔2017〕187号(盖章)国网山西经研院关于山西晋城周村220kV输变电工程可行性研究报告的评审意见.pdf": "fsReview",
    "5-(初设批复)晋电建设〔2019〕566号 国网山西省电力公司关于晋城周村220kV输变电工程初步设计的批复 .pdf": "pdApproval",
    # Existing types
    "9-(结算报告)山西晋城周村220kV输变电工程结算审计报告.pdf": "settlementReport",
    "4-(初设评审)中电联电力建设技术经济咨询中心技经〔2019〕201号关于山西周村220kV输变电工程初步设计的评审意见.pdf": "designReview",
    # Final account report
    "10-(决算报告)盖章页-山西晋城周村220kV输变电工程竣工决算审核报告(中瑞诚鉴字(2021)第002040号).pdf": "finalAccount",
}
# pdf_to_markdown test cases: the first file name listed in TEST_CASES
PDF2MD_TEST_CASES = [list(TEST_CASES.keys())[0]]
# OCR test cases: PDF paths relative to TEST_DIR (each page is rendered to an
# image and sent to /ocr)
OCR_TEST_CASES = [
    "007/3、附件2:核准批复.pdf",
    "007/5、附件7:检测报告.pdf",
]
  50. def print_header(title: str):
  51. """打印标题"""
  52. print("\n" + "=" * 60)
  53. print(f" {title}")
  54. print("=" * 60)
  55. def print_result(success: bool, message: str):
  56. """打印结果"""
  57. status = "✅ 成功" if success else "❌ 失败"
  58. print(f" {status}: {message}")
  59. def check_health() -> bool:
  60. """检查 API 健康状态"""
  61. print_header("检查 API 健康状态")
  62. try:
  63. response = requests.get(f"{API_BASE_URL}/health", timeout=10)
  64. if response.status_code == 200:
  65. print_result(True, f"API 正常运行 - {response.json()}")
  66. return True
  67. else:
  68. print_result(False, f"状态码: {response.status_code}")
  69. return False
  70. except requests.exceptions.RequestException as e:
  71. print_result(False, f"连接失败: {e}")
  72. return False
  73. def upload_file(file_path: Path, document_type: str, remove_watermark: bool = False, table_only: bool = True) -> Optional[str]:
  74. """上传文件并获取任务 ID
  75. Args:
  76. file_path: 文件路径
  77. document_type: 文档类型
  78. remove_watermark: 是否去水印
  79. table_only: 是否只保留表格附件
  80. """
  81. print(f"\n 📤 上传文件: {file_path.name}")
  82. print(f" 类型: {document_type}")
  83. if remove_watermark:
  84. print(f" 去水印: 是")
  85. if table_only:
  86. print(f" 只保留表格: 是")
  87. try:
  88. with open(file_path, "rb") as f:
  89. files = {"file": (file_path.name, f, "application/pdf")}
  90. # 使用 data 发送表单参数,参数名是 type(不是 document_type)
  91. data = {"type": document_type}
  92. # 添加去水印参数
  93. if remove_watermark:
  94. data["remove_watermark"] = "true"
  95. data["watermark_light_threshold"] = "200"
  96. data["watermark_saturation_threshold"] = "30"
  97. # 添加只保留表格参数
  98. data["table_only"] = "true" if table_only else "false"
  99. response = requests.post(
  100. f"{API_BASE_URL}/convert",
  101. files=files,
  102. data=data,
  103. timeout=60
  104. )
  105. if response.status_code == 200:
  106. result = response.json()
  107. task_id = result.get("task_id")
  108. print(f" 任务 ID: {task_id}")
  109. return task_id
  110. else:
  111. print_result(False, f"上传失败: {response.status_code} - {response.text}")
  112. return None
  113. except Exception as e:
  114. print_result(False, f"上传异常: {e}")
  115. return None
  116. def poll_task_status(task_id: str, max_wait: int = 300) -> Optional[Dict[str, Any]]:
  117. """轮询任务状态"""
  118. print(f" ⏳ 等待任务完成...")
  119. start_time = time.time()
  120. poll_interval = 5 # 轮询间隔(秒)
  121. while time.time() - start_time < max_wait:
  122. try:
  123. response = requests.get(f"{API_BASE_URL}/task/{task_id}", timeout=10)
  124. if response.status_code == 200:
  125. result = response.json()
  126. status = result.get("status")
  127. if status == "completed":
  128. elapsed = time.time() - start_time
  129. print(f" 完成! 耗时: {elapsed:.1f}s")
  130. return result
  131. elif status == "failed":
  132. error = result.get("error", "未知错误")
  133. print_result(False, f"任务失败: {error}")
  134. return None
  135. else:
  136. # 仍在处理中
  137. elapsed = time.time() - start_time
  138. print(f" 处理中... ({elapsed:.0f}s)", end="\r")
  139. else:
  140. print_result(False, f"查询状态失败: {response.status_code}")
  141. return None
  142. except Exception as e:
  143. print_result(False, f"查询异常: {e}")
  144. return None
  145. time.sleep(poll_interval)
  146. print_result(False, f"超时: 超过 {max_wait} 秒")
  147. return None
  148. def get_json_result(task_id: str) -> Optional[Dict[str, Any]]:
  149. """获取 JSON 结果"""
  150. try:
  151. response = requests.get(f"{API_BASE_URL}/task/{task_id}/json", timeout=30)
  152. if response.status_code == 200:
  153. return response.json()
  154. else:
  155. print_result(False, f"获取 JSON 失败: {response.status_code}")
  156. return None
  157. except Exception as e:
  158. print_result(False, f"获取 JSON 异常: {e}")
  159. return None
  160. def validate_result(result: Dict[str, Any], expected_type: str) -> bool:
  161. """验证结果"""
  162. document_type = result.get("document_type")
  163. data = result.get("data")
  164. # 检查文档类型
  165. if document_type != expected_type:
  166. print_result(False, f"文档类型不匹配: 期望 {expected_type}, 实际 {document_type}")
  167. return False
  168. # 检查数据是否为空
  169. if not data:
  170. print_result(False, "数据为空")
  171. return False
  172. # 对于投资类型,检查嵌套结构
  173. if expected_type in ["fsApproval", "fsReview", "pdApproval", "safetyFsApproval"]:
  174. # 检查是否是新格式(包含 projectInfo)
  175. project_info = None
  176. if isinstance(data, dict) and "data" in data:
  177. # 新格式:{"projectInfo": {...}, "data": [...]}
  178. project_info = data.get("projectInfo")
  179. data = data["data"]
  180. if project_info:
  181. print(f"\n 📋 项目信息:")
  182. print(f" 工程名称: {project_info.get('projectName', '')}")
  183. print(f" 项目单位: {project_info.get('projectUnit', '')}")
  184. print(f" 设计单位: {project_info.get('designUnit', '')}")
  185. # 验证数据格式
  186. if not isinstance(data, list):
  187. print_result(False, f"数据格式错误: 期望 list, 实际 {type(data).__name__}")
  188. return False
  189. if len(data) == 0:
  190. print_result(False, "投资数据列表为空")
  191. return False
  192. # 检查第一项的结构
  193. first_item = data[0]
  194. required_fields = ["name", "Level", "staticInvestment", "dynamicInvestment", "items"]
  195. missing_fields = [f for f in required_fields if f not in first_item]
  196. if missing_fields:
  197. print_result(False, f"缺少字段: {missing_fields}")
  198. return False
  199. print_result(True, f"解析到 {len(data)} 个大类")
  200. # 打印摘要
  201. for item in data:
  202. name = item.get("name", "")
  203. static = item.get("staticInvestment", 0)
  204. dynamic = item.get("dynamicInvestment", 0)
  205. sub_items = len(item.get("items", []))
  206. print(f" - {name}: 静态={static}, 动态={dynamic}, 子项={sub_items}")
  207. # 对于结算报告
  208. elif expected_type == "settlementReport":
  209. if isinstance(data, list):
  210. print_result(True, f"解析到 {len(data)} 条记录")
  211. else:
  212. print_result(True, f"解析完成")
  213. # 对于初设评审
  214. elif expected_type == "designReview":
  215. if isinstance(data, list):
  216. print_result(True, f"解析到 {len(data)} 条记录")
  217. else:
  218. print_result(True, f"解析完成")
  219. return True
  220. def test_single_file(file_path: Path, document_type: str, remove_watermark: bool = False, table_only: bool = True) -> bool:
  221. """测试单个文件
  222. Args:
  223. file_path: 文件路径
  224. document_type: 文档类型
  225. remove_watermark: 是否去水印
  226. table_only: 是否只保留表格附件
  227. """
  228. print_header(f"测试: {document_type}")
  229. print(f" 文件: {file_path.name}")
  230. if remove_watermark:
  231. print(f" 去水印: 是")
  232. if table_only:
  233. print(f" 只保留表格: 是")
  234. # 1. 上传文件
  235. task_id = upload_file(file_path, document_type, remove_watermark, table_only)
  236. if not task_id:
  237. return False
  238. # 2. 等待任务完成
  239. task_result = poll_task_status(task_id)
  240. if not task_result:
  241. return False
  242. # 3. 获取 JSON 结果
  243. json_result = get_json_result(task_id)
  244. if not json_result:
  245. return False
  246. # 4. 验证结果
  247. is_valid = validate_result(json_result, document_type)
  248. # 5. 保存结果到文件
  249. output_dir = Path(__file__).parent / "test_results"
  250. output_dir.mkdir(exist_ok=True)
  251. output_file = output_dir / f"{document_type}_result.json"
  252. with open(output_file, "w", encoding="utf-8") as f:
  253. json.dump(json_result, f, ensure_ascii=False, indent=2)
  254. print(f" 💾 结果已保存: {output_file}")
  255. return is_valid
  256. def run_all_tests():
  257. """运行所有测试"""
  258. print_header("PDF Converter API 测试")
  259. print(f" API 地址: {API_BASE_URL}")
  260. print(f" 测试目录: {TEST_DIR}")
  261. # 检查测试目录
  262. if not TEST_DIR.exists():
  263. print_result(False, f"测试目录不存在: {TEST_DIR}")
  264. return
  265. # 检查 API 健康状态
  266. if not check_health():
  267. print("\n❌ API 不可用,终止测试")
  268. return
  269. # 统计结果
  270. total = 0
  271. passed = 0
  272. failed = 0
  273. skipped = 0
  274. # 运行每个测试用例
  275. for filename, config in TEST_CASES.items():
  276. # 解析配置格式
  277. if isinstance(config, tuple):
  278. if len(config) >= 3:
  279. document_type, remove_watermark, table_only = config[:3]
  280. elif len(config) == 2:
  281. document_type, remove_watermark = config
  282. table_only = True # 默认只保留表格
  283. else:
  284. document_type = config[0]
  285. remove_watermark = False
  286. table_only = True
  287. else:
  288. document_type = config
  289. remove_watermark = False
  290. table_only = True
  291. file_path = TEST_DIR / filename
  292. if not file_path.exists():
  293. print_header(f"跳过: {document_type}")
  294. print_result(False, f"文件不存在: {filename}")
  295. skipped += 1
  296. continue
  297. total += 1
  298. try:
  299. if test_single_file(file_path, document_type, remove_watermark, table_only):
  300. passed += 1
  301. else:
  302. failed += 1
  303. except Exception as e:
  304. print_result(False, f"测试异常: {e}")
  305. failed += 1
  306. # 打印总结
  307. print_header("测试总结")
  308. print(f" 总计: {total}")
  309. print(f" ✅ 通过: {passed}")
  310. print(f" ❌ 失败: {failed}")
  311. print(f" ⏭️ 跳过: {skipped}")
  312. if failed == 0 and skipped == 0:
  313. print("\n🎉 所有测试通过!")
  314. elif failed > 0:
  315. print(f"\n⚠️ 有 {failed} 个测试失败")
  316. def test_single(document_type: str):
  317. """测试单个类型"""
  318. print_header(f"单项测试: {document_type}")
  319. # 检查 API
  320. if not check_health():
  321. print("\n❌ API 不可用")
  322. return
  323. # 查找对应的文件
  324. for filename, config in TEST_CASES.items():
  325. # 解析配置格式
  326. if isinstance(config, tuple):
  327. if len(config) >= 3:
  328. dtype, remove_watermark, table_only = config[:3]
  329. elif len(config) == 2:
  330. dtype, remove_watermark = config
  331. table_only = True
  332. else:
  333. dtype = config[0]
  334. remove_watermark = False
  335. table_only = True
  336. else:
  337. dtype = config
  338. remove_watermark = False
  339. table_only = True
  340. if dtype == document_type:
  341. file_path = TEST_DIR / filename
  342. if file_path.exists():
  343. test_single_file(file_path, document_type, remove_watermark, table_only)
  344. return
  345. else:
  346. print_result(False, f"文件不存在: {filename}")
  347. return
  348. print_result(False, f"未找到类型 {document_type} 的测试文件")
  349. def test_ocr(
  350. image_path: Optional[str] = None,
  351. remove_watermark: bool = False,
  352. light_threshold: int = 200,
  353. saturation_threshold: int = 30,
  354. crop_header_footer: bool = False,
  355. header_ratio: float = 0.05,
  356. footer_ratio: float = 0.05,
  357. auto_detect_header_footer: bool = False
  358. ) -> bool:
  359. """
  360. 测试 OCR 接口
  361. Args:
  362. image_path: 图片路径或包含base64数据的txt文件路径,默认使用 test/image.png
  363. 支持格式:
  364. - 图片文件:.png, .jpg, .jpeg
  365. - txt文件:包含base64编码的图片数据(可带data:image/xxx;base64,前缀)
  366. remove_watermark: 是否去除水印
  367. light_threshold: 水印亮度阈值(0-255),默认200
  368. saturation_threshold: 水印饱和度阈值(0-255),默认30
  369. crop_header_footer: 是否裁剪页眉页脚
  370. header_ratio: 页眉裁剪比例(0-1),默认0.05
  371. footer_ratio: 页脚裁剪比例(0-1),默认0.05
  372. auto_detect_header_footer: 是否自动检测页眉页脚边界
  373. Returns:
  374. 是否测试成功
  375. """
  376. print_header("测试 OCR 接口")
  377. # 检查 API
  378. if not check_health():
  379. print("\n❌ API 不可用")
  380. return False
  381. # 确定图片路径
  382. if image_path is None:
  383. image_path = TEST_DIR / "image.png"
  384. else:
  385. image_path = Path(image_path)
  386. print(f" 📷 文件路径: {image_path}")
  387. if not image_path.exists():
  388. print_result(False, f"文件不存在: {image_path}")
  389. return False
  390. suffix = image_path.suffix.lower()
  391. # 判断是 txt 文件还是图片文件
  392. if suffix == ".txt":
  393. # 从 txt 文件读取 base64 数据
  394. print(f" 📄 文件类型: txt (base64 数据)")
  395. try:
  396. with open(image_path, "r", encoding="utf-8") as f:
  397. image_base64 = f.read().strip()
  398. # 解析 data URI,提取格式和 base64 数据
  399. if image_base64.startswith("data:"):
  400. # 格式: data:image/png;base64,xxxxx
  401. if "," in image_base64:
  402. header, image_base64 = image_base64.split(",", 1)
  403. # 从 header 中提取图片格式
  404. if "image/png" in header:
  405. image_format = "png"
  406. elif "image/jpeg" in header or "image/jpg" in header:
  407. image_format = "jpeg"
  408. else:
  409. image_format = "png" # 默认
  410. print(f" 🖼️ 图片格式 (从data URI解析): {image_format}")
  411. else:
  412. image_format = "png"
  413. print(f" 🖼️ 图片格式 (默认): {image_format}")
  414. else:
  415. image_format = "png"
  416. print(f" 🖼️ 图片格式 (默认): {image_format}")
  417. print(f" 🔤 Base64长度: {len(image_base64)} 字符")
  418. except Exception as e:
  419. print_result(False, f"读取txt文件失败: {e}")
  420. return False
  421. else:
  422. # 读取图片文件并转为 base64
  423. print(f" 📄 文件类型: 图片文件")
  424. try:
  425. with open(image_path, "rb") as f:
  426. image_data = f.read()
  427. image_base64 = base64.b64encode(image_data).decode("utf-8")
  428. print(f" 📦 图片大小: {len(image_data)} bytes")
  429. print(f" 🔤 Base64长度: {len(image_base64)} 字符")
  430. except Exception as e:
  431. print_result(False, f"读取图片失败: {e}")
  432. return False
  433. # 确定图片格式
  434. format_map = {".png": "png", ".jpg": "jpeg", ".jpeg": "jpeg"}
  435. image_format = format_map.get(suffix, "png")
  436. print(f" 🖼️ 图片格式: {image_format}")
  437. # 调用 OCR 接口
  438. print(f"\n 📤 调用 OCR 接口...")
  439. # 构建请求参数
  440. request_data = {
  441. "image_base64": image_base64,
  442. "image_format": image_format
  443. }
  444. if crop_header_footer:
  445. request_data["crop_header_footer"] = True
  446. if auto_detect_header_footer:
  447. request_data["auto_detect_header_footer"] = True
  448. print(f" ✂️ 裁剪页眉页脚: 自动检测模式")
  449. else:
  450. request_data["header_ratio"] = header_ratio
  451. request_data["footer_ratio"] = footer_ratio
  452. print(f" ✂️ 裁剪页眉页脚: 是 (顶部={header_ratio*100:.0f}%, 底部={footer_ratio*100:.0f}%)")
  453. if remove_watermark:
  454. request_data["remove_watermark"] = True
  455. request_data["watermark_light_threshold"] = light_threshold
  456. request_data["watermark_saturation_threshold"] = saturation_threshold
  457. print(f" 🔧 去水印: 是 (亮度阈值={light_threshold}, 饱和度阈值={saturation_threshold})")
  458. try:
  459. start_time = time.time()
  460. response = requests.post(
  461. f"{API_BASE_URL}/ocr",
  462. json=request_data,
  463. timeout=120
  464. )
  465. elapsed = time.time() - start_time
  466. if response.status_code == 200:
  467. result = response.json()
  468. print_result(True, f"OCR 识别成功 (耗时: {elapsed:.2f}s)")
  469. # 显示识别结果(支持两种返回格式)
  470. # 格式1: {"texts": [...], "gpu_info": {...}}
  471. # 格式2: {"code": 0, "data": {"texts": [...]}, "gpu_info": {...}}
  472. if "data" in result and isinstance(result.get("data"), dict):
  473. texts: List[str] = result.get("data", {}).get("texts", [])
  474. else:
  475. texts: List[str] = result.get("texts", [])
  476. gpu_info = result.get("gpu_info", {})
  477. print(f"\n 📝 识别结果 ({len(texts)} 个文本块):")
  478. for i, text in enumerate(texts[:10]): # 最多显示前10个
  479. # 截断长文本
  480. display_text = text[:50] + "..." if len(text) > 50 else text
  481. print(f" [{i+1}] {display_text}")
  482. if len(texts) > 10:
  483. print(f" ... 还有 {len(texts) - 10} 个文本块")
  484. # 显示 GPU 信息
  485. if gpu_info:
  486. print(f"\n 💻 GPU 监控信息:")
  487. gpu_util = gpu_info.get('gpu_utilization', gpu_info.get('gpu_util_avg', 'N/A'))
  488. if isinstance(gpu_util, float):
  489. gpu_util = f"{gpu_util:.1f}"
  490. print(f" GPU利用率: {gpu_util}%")
  491. mem_used = gpu_info.get('gpu_memory_used_max', gpu_info.get('memory_used_max', 'N/A'))
  492. if isinstance(mem_used, (int, float)):
  493. mem_used = f"{mem_used / (1024**2):.0f}" # 转为 MB
  494. print(f" 显存使用峰值: {mem_used} MB")
  495. gpu_name = gpu_info.get('gpu_name', 'N/A')
  496. print(f" GPU型号: {gpu_name}")
  497. # 保存完整结果
  498. output_dir = Path(__file__).parent / "test_results"
  499. output_dir.mkdir(exist_ok=True)
  500. output_file = output_dir / "ocr_result.json"
  501. with open(output_file, "w", encoding="utf-8") as f:
  502. json.dump(result, f, ensure_ascii=False, indent=2)
  503. print(f"\n 💾 结果已保存: {output_file}")
  504. return True
  505. else:
  506. print_result(False, f"OCR 失败: {response.status_code} - {response.text}")
  507. return False
  508. except requests.exceptions.Timeout:
  509. print_result(False, "OCR 请求超时")
  510. return False
  511. except Exception as e:
  512. print_result(False, f"OCR 异常: {e}")
  513. return False
  514. # ---------------------------------------------------------------------------
  515. # PDF 文件 OCR 测试(提取每页为图片后调用 /ocr)
  516. # ---------------------------------------------------------------------------
  517. def test_ocr_pdf(
  518. pdf_path: str,
  519. remove_watermark: bool = False,
  520. light_threshold: int = 200,
  521. saturation_threshold: int = 30,
  522. crop_header_footer: bool = False,
  523. header_ratio: float = 0.05,
  524. footer_ratio: float = 0.05,
  525. auto_detect_header_footer: bool = False,
  526. max_pages: int = 0,
  527. ) -> bool:
  528. """
  529. 测试 PDF 文件的 OCR:提取每页为图片后调用 /ocr 接口
  530. Args:
  531. pdf_path: PDF 文件路径
  532. remove_watermark: 是否去除水印
  533. light_threshold: 水印亮度阈值
  534. saturation_threshold: 水印饱和度阈值
  535. crop_header_footer: 是否裁剪页眉页脚
  536. header_ratio: 页眉裁剪比例
  537. footer_ratio: 页脚裁剪比例
  538. auto_detect_header_footer: 是否自动检测页眉页脚边界
  539. max_pages: 最大处理页数,0 表示全部
  540. Returns:
  541. 是否测试成功
  542. """
  543. try:
  544. import fitz # PyMuPDF
  545. except ImportError:
  546. print_result(False, "PyMuPDF 未安装,无法提取 PDF 页面。请安装: pip install pymupdf")
  547. return False
  548. fp = Path(pdf_path)
  549. if not fp.exists():
  550. print_result(False, f"文件不存在: {fp}")
  551. return False
  552. print(f" 📄 PDF 文件: {fp.name}")
  553. try:
  554. doc = fitz.open(str(fp))
  555. total_pages = len(doc)
  556. print(f" 📃 总页数: {total_pages}")
  557. pages_to_process = total_pages if max_pages == 0 else min(max_pages, total_pages)
  558. print(f" 🔄 处理页数: {pages_to_process}")
  559. all_texts = []
  560. success_count = 0
  561. for page_idx in range(pages_to_process):
  562. page = doc[page_idx]
  563. # 渲染页面为图片 (DPI=150)
  564. mat = fitz.Matrix(150 / 72, 150 / 72)
  565. pix = page.get_pixmap(matrix=mat)
  566. img_data = pix.tobytes("png")
  567. image_base64 = base64.b64encode(img_data).decode("utf-8")
  568. print(f"\n 📄 页 {page_idx + 1}/{pages_to_process}")
  569. # 构建请求参数
  570. request_data = {
  571. "image_base64": image_base64,
  572. "image_format": "png"
  573. }
  574. if crop_header_footer:
  575. request_data["crop_header_footer"] = True
  576. if auto_detect_header_footer:
  577. request_data["auto_detect_header_footer"] = True
  578. else:
  579. request_data["header_ratio"] = header_ratio
  580. request_data["footer_ratio"] = footer_ratio
  581. if remove_watermark:
  582. request_data["remove_watermark"] = True
  583. request_data["watermark_light_threshold"] = light_threshold
  584. request_data["watermark_saturation_threshold"] = saturation_threshold
  585. try:
  586. start_time = time.time()
  587. response = requests.post(
  588. f"{API_BASE_URL}/ocr",
  589. json=request_data,
  590. timeout=120
  591. )
  592. elapsed = time.time() - start_time
  593. if response.status_code == 200:
  594. result = response.json()
  595. # 提取文本
  596. if "data" in result and isinstance(result.get("data"), dict):
  597. texts = result.get("data", {}).get("texts", [])
  598. else:
  599. texts = result.get("texts", [])
  600. all_texts.extend(texts)
  601. success_count += 1
  602. print(f" ✅ OCR 成功 ({elapsed:.2f}s), 识别 {len(texts)} 个文本块")
  603. # 显示前3个文本块
  604. for i, text in enumerate(texts[:3]):
  605. display = text[:40] + "..." if len(text) > 40 else text
  606. print(f" [{i+1}] {display}")
  607. if len(texts) > 3:
  608. print(f" ... 还有 {len(texts) - 3} 个")
  609. else:
  610. print(f" ❌ OCR 失败: {response.status_code}")
  611. except Exception as e:
  612. print(f" ❌ OCR 异常: {e}")
  613. doc.close()
  614. # 保存结果
  615. output_dir = Path(__file__).parent / "test_results"
  616. output_dir.mkdir(exist_ok=True)
  617. output_file = output_dir / f"ocr_pdf_{fp.stem}.json"
  618. with open(output_file, "w", encoding="utf-8") as f:
  619. json.dump({"file": str(fp), "pages": pages_to_process, "texts": all_texts}, f, ensure_ascii=False, indent=2)
  620. print(f"\n 💾 结果已保存: {output_file}")
  621. print(f" 📊 汇总: {success_count}/{pages_to_process} 页成功, 共 {len(all_texts)} 个文本块")
  622. return success_count == pages_to_process
  623. except Exception as e:
  624. print_result(False, f"处理 PDF 异常: {e}")
  625. return False
  626. def run_ocr_tests(
  627. remove_watermark: bool = False,
  628. crop_header_footer: bool = False,
  629. max_pages: int = 0,
  630. ) -> bool:
  631. """运行 OCR_TEST_CASES 中所有 PDF 文件的 OCR 测试"""
  632. print_header("测试 OCR 接口 (PDF 文件)")
  633. # 检查 API
  634. if not check_health():
  635. print("\n❌ API 不可用")
  636. return False
  637. total = len(OCR_TEST_CASES)
  638. passed = 0
  639. failed = 0
  640. for idx, rel_path in enumerate(OCR_TEST_CASES, 1):
  641. fp = TEST_DIR / rel_path
  642. print(f"\n{'='*60}")
  643. print(f" [{idx}/{total}] {rel_path}")
  644. print(f"{'='*60}")
  645. if not fp.exists():
  646. print_result(False, f"文件不存在: {fp}")
  647. failed += 1
  648. continue
  649. if test_ocr_pdf(
  650. str(fp),
  651. remove_watermark=remove_watermark,
  652. crop_header_footer=crop_header_footer,
  653. max_pages=max_pages,
  654. ):
  655. passed += 1
  656. else:
  657. failed += 1
  658. # 汇总
  659. print_header("OCR 测试汇总")
  660. print(f" 总计: {total}")
  661. print(f" ✅ 通过: {passed}")
  662. print(f" ❌ 失败: {failed}")
  663. if failed == 0:
  664. print("\n🎉 所有 OCR 测试通过!")
  665. return failed == 0
  666. # ---------------------------------------------------------------------------
  667. # /pdf_to_markdown 接口测试
  668. # ---------------------------------------------------------------------------
  669. def upload_pdf_to_markdown(
  670. file_path: Path,
  671. backend: Optional[str] = None,
  672. remove_watermark: bool = False,
  673. crop_header_footer: bool = False,
  674. return_images: bool = False,
  675. ) -> Optional[str]:
  676. """上传文件到 /pdf_to_markdown 并返回 task_id"""
  677. print(f"\n 📤 上传文件: {file_path.name}")
  678. try:
  679. with open(file_path, "rb") as f:
  680. mime = "application/pdf" if file_path.suffix.lower() == ".pdf" else "image/*"
  681. files = {"file": (file_path.name, f, mime)}
  682. data: Dict[str, Any] = {}
  683. if backend:
  684. data["backend"] = backend
  685. if remove_watermark:
  686. data["remove_watermark"] = "true"
  687. if crop_header_footer:
  688. data["crop_header_footer"] = "true"
  689. if return_images:
  690. data["return_images"] = "true"
  691. response = requests.post(
  692. f"{API_BASE_URL}/pdf_to_markdown",
  693. files=files,
  694. data=data,
  695. timeout=60,
  696. )
  697. if response.status_code == 200:
  698. result = response.json()
  699. task_id = result.get("task_id")
  700. print(f" 任务 ID: {task_id}")
  701. return task_id
  702. else:
  703. print_result(False, f"上传失败: {response.status_code} - {response.text[:300]}")
  704. return None
  705. except Exception as e:
  706. print_result(False, f"上传异常: {e}")
  707. return None
  708. def download_markdown(task_id: str) -> Optional[str]:
  709. """从 /task/{task_id}/json 获取 markdown 文本"""
  710. try:
  711. response = requests.get(f"{API_BASE_URL}/task/{task_id}/json", timeout=30)
  712. if response.status_code == 200:
  713. data = response.json()
  714. return data.get("markdown", "")
  715. else:
  716. print_result(False, f"获取 Markdown 失败: {response.status_code}")
  717. return None
  718. except Exception as e:
  719. print_result(False, f"获取 Markdown 异常: {e}")
  720. return None
  721. def download_markdown_file(task_id: str, save_path: Path) -> bool:
  722. """从 /download/{task_id}/markdown 下载 .md 文件"""
  723. try:
  724. response = requests.get(f"{API_BASE_URL}/download/{task_id}/markdown", timeout=30)
  725. if response.status_code == 200:
  726. save_path.parent.mkdir(parents=True, exist_ok=True)
  727. save_path.write_bytes(response.content)
  728. print(f" 💾 Markdown 文件已保存: {save_path}")
  729. return True
  730. else:
  731. print_result(False, f"下载 Markdown 文件失败: {response.status_code}")
  732. return False
  733. except Exception as e:
  734. print_result(False, f"下载 Markdown 文件异常: {e}")
  735. return False
  736. def test_pdf_to_markdown(
  737. file_path: Optional[str] = None,
  738. backend: Optional[str] = None,
  739. remove_watermark: bool = False,
  740. crop_header_footer: bool = False,
  741. return_images: bool = False,
  742. max_wait: int = 600,
  743. ) -> bool:
  744. """测试 /pdf_to_markdown 接口
  745. Args:
  746. file_path: 要测试的文件路径,默认使用 TEST_DIR 下第一个 PDF
  747. backend: MinerU backend,留空使用服务端默认
  748. remove_watermark: 是否去水印
  749. crop_header_footer: 是否裁剪页眉页脚
  750. return_images: 是否返回图片
  751. max_wait: 最大等待秒数
  752. """
  753. print_header("测试 /pdf_to_markdown 接口")
  754. # 检查 API
  755. if not check_health():
  756. print("\n❌ API 不可用")
  757. return False
  758. # 确定测试文件列表
  759. if file_path:
  760. files_to_test = [Path(file_path)]
  761. else:
  762. # 遍历 PDF2MD_TEST_CASES 中所有文件
  763. files_to_test = []
  764. for fname in PDF2MD_TEST_CASES:
  765. fp = TEST_DIR / fname
  766. if fp.exists():
  767. files_to_test.append(fp)
  768. else:
  769. print(f" ⚠️ 跳过不存在的文件: {fname}")
  770. if not files_to_test:
  771. print_result(False, f"TEST_DIR ({TEST_DIR}) 中没有可用的测试文件")
  772. return False
  773. total = len(files_to_test)
  774. passed = 0
  775. failed = 0
  776. for idx, fp in enumerate(files_to_test, 1):
  777. print(f"\n{'='*60}")
  778. print(f" [{idx}/{total}] {fp.name}")
  779. print(f"{'='*60}")
  780. if not fp.exists():
  781. print_result(False, f"文件不存在: {fp}")
  782. failed += 1
  783. continue
  784. print(f" 📄 文件: {fp.name} ({fp.stat().st_size / 1024:.1f} KB)")
  785. if backend:
  786. print(f" 🔧 Backend: {backend}")
  787. if remove_watermark:
  788. print(f" 🔧 去水印: 是")
  789. if crop_header_footer:
  790. print(f" 🔧 裁剪页眉页脚: 是")
  791. if return_images:
  792. print(f" 🔧 返回图片: 是")
  793. # 1. 上传
  794. task_id = upload_pdf_to_markdown(fp, backend, remove_watermark, crop_header_footer, return_images)
  795. if not task_id:
  796. failed += 1
  797. continue
  798. # 2. 轮询
  799. task_result = poll_task_status(task_id, max_wait=max_wait)
  800. if not task_result:
  801. failed += 1
  802. continue
  803. # 3. 获取 Markdown 文本
  804. md_text = download_markdown(task_id)
  805. # 4. 下载 .md 文件
  806. output_dir = Path(__file__).parent / "test_results"
  807. output_dir.mkdir(exist_ok=True)
  808. md_file = output_dir / f"pdf2md_{fp.stem}.md"
  809. download_markdown_file(task_id, md_file)
  810. # 5. 下载 ZIP(如果 return_images)
  811. if return_images:
  812. try:
  813. zip_resp = requests.get(f"{API_BASE_URL}/download/{task_id}/zip", timeout=60)
  814. if zip_resp.status_code == 200:
  815. zip_file = output_dir / f"pdf2md_{fp.stem}.zip"
  816. zip_file.write_bytes(zip_resp.content)
  817. print(f" 💾 ZIP 文件已保存: {zip_file} ({len(zip_resp.content) / 1024:.1f} KB)")
  818. else:
  819. print_result(False, f"下载 ZIP 失败: {zip_resp.status_code}")
  820. except Exception as e:
  821. print_result(False, f"下载 ZIP 异常: {e}")
  822. # 6. 输出摘要
  823. if md_text:
  824. lines = md_text.strip().split("\n")
  825. print(f"\n 📝 Markdown 结果: {len(md_text)} 字符, {len(lines)} 行")
  826. print(f" --- 前 10 行 ---")
  827. for line in lines[:10]:
  828. display = line[:80] + "..." if len(line) > 80 else line
  829. print(f" {display}")
  830. if len(lines) > 10:
  831. print(f" ... 还有 {len(lines) - 10} 行")
  832. print_result(True, "PDF 转 Markdown 成功")
  833. passed += 1
  834. else:
  835. print_result(False, "未获取到 Markdown 内容")
  836. failed += 1
  837. # 打印汇总
  838. print_header("pdf_to_markdown 测试汇总")
  839. print(f" 总计: {total}")
  840. print(f" ✅ 通过: {passed}")
  841. print(f" ❌ 失败: {failed}")
  842. if failed == 0:
  843. print("\n🎉 所有 pdf_to_markdown 测试通过!")
  844. return failed == 0
  845. if __name__ == "__main__":
  846. if len(sys.argv) > 1:
  847. # 测试指定类型
  848. doc_type = sys.argv[1]
  849. if doc_type in ["--help", "-h"]:
  850. print("用法:")
  851. print(" python test_api.py # 运行所有 /convert 测试")
  852. print(" python test_api.py <type> # 测试指定文档类型")
  853. print(" python test_api.py ocr # 测试 OCR 接口(图片)")
  854. print(" python test_api.py ocr <path> [--nowm] [--crop]")
  855. print(" python test_api.py ocrpdf # 测试 OCR 接口(PDF 文件,遍历 OCR_TEST_CASES)")
  856. print(" python test_api.py ocrpdf <path> [--nowm] [--crop] [--pages=N]")
  857. print(" python test_api.py pdf2md # 测试 /pdf_to_markdown(默认文件)")
  858. print(" python test_api.py pdf2md <path> [--backend=X] [--nowm] [--crop] [--images]")
  859. print("\n可用类型:")
  860. for dtype in set(v if isinstance(v, str) else v[0] for v in TEST_CASES.values()):
  861. print(f" - {dtype}")
  862. print(" - ocr (OCR 图片识别)")
  863. print(" - ocrpdf (OCR PDF 文件,提取每页调用 /ocr)")
  864. print(" - pdf2md (PDF/图片转 Markdown)")
  865. print("\nOCR 去水印参数:")
  866. print(" --nowm 启用去水印")
  867. print(" --light=N 亮度阈值(0-255,默认200)")
  868. print(" --sat=N 饱和度阈值(0-255,默认30)")
  869. print("\nOCR 裁剪页眉页脚参数:")
  870. print(" --crop 启用裁剪页眉页脚(固定比例模式)")
  871. print(" --crop-auto 启用裁剪页眉页脚(自动检测模式)")
  872. print(" --header=N 页眉裁剪比例(0-1,默认0.05表示5%)")
  873. print(" --footer=N 页脚裁剪比例(0-1,默认0.05表示5%)")
  874. print("\npdf2md 参数:")
  875. print(" --backend=X 指定 MinerU backend")
  876. print(" --nowm 启用去水印")
  877. print(" --crop 启用裁剪页眉页脚")
  878. print(" --images 返回图片(可下载 ZIP)")
  879. print(" --wait=N 最大等待秒数(默认600)")
  880. print("\nocrpdf 参数:")
  881. print(" --nowm 启用去水印")
  882. print(" --crop 启用裁剪页眉页脚")
  883. print(" --pages=N 最大处理页数(0=全部)")
  884. elif doc_type == "ocr":
  885. # 解析 OCR 参数
  886. image_path = None
  887. remove_watermark = False
  888. light_threshold = 200
  889. saturation_threshold = 30
  890. crop_header_footer = False
  891. header_ratio = 0.05
  892. footer_ratio = 0.05
  893. auto_detect_header_footer = False
  894. for arg in sys.argv[2:]:
  895. if arg == "--nowm":
  896. remove_watermark = True
  897. elif arg == "--crop":
  898. crop_header_footer = True
  899. elif arg == "--crop-auto":
  900. crop_header_footer = True
  901. auto_detect_header_footer = True
  902. elif arg.startswith("--light="):
  903. try:
  904. light_threshold = int(arg.split("=")[1])
  905. except ValueError:
  906. print(f"警告: 无效的亮度阈值 {arg},使用默认值 200")
  907. elif arg.startswith("--sat="):
  908. try:
  909. saturation_threshold = int(arg.split("=")[1])
  910. except ValueError:
  911. print(f"警告: 无效的饱和度阈值 {arg},使用默认值 30")
  912. elif arg.startswith("--header="):
  913. try:
  914. header_ratio = float(arg.split("=")[1])
  915. except ValueError:
  916. print(f"警告: 无效的页眉比例 {arg},使用默认值 0.05")
  917. elif arg.startswith("--footer="):
  918. try:
  919. footer_ratio = float(arg.split("=")[1])
  920. except ValueError:
  921. print(f"警告: 无效的页脚比例 {arg},使用默认值 0.05")
  922. elif not arg.startswith("--"):
  923. image_path = arg
  924. test_ocr(
  925. image_path,
  926. remove_watermark,
  927. light_threshold,
  928. saturation_threshold,
  929. crop_header_footer,
  930. header_ratio,
  931. footer_ratio,
  932. auto_detect_header_footer
  933. )
  934. elif doc_type == "ocrpdf":
  935. # 解析 ocrpdf 参数
  936. ocrpdf_file = None
  937. ocrpdf_nowm = False
  938. ocrpdf_crop = False
  939. ocrpdf_pages = 0
  940. for arg in sys.argv[2:]:
  941. if arg == "--nowm":
  942. ocrpdf_nowm = True
  943. elif arg == "--crop":
  944. ocrpdf_crop = True
  945. elif arg.startswith("--pages="):
  946. try:
  947. ocrpdf_pages = int(arg.split("=", 1)[1])
  948. except ValueError:
  949. print(f"警告: 无效的页数 {arg},使用默认值 0(全部)")
  950. elif not arg.startswith("--"):
  951. ocrpdf_file = arg
  952. if ocrpdf_file:
  953. # 测试单个 PDF 文件
  954. print_header("测试 OCR 接口 (PDF 文件)")
  955. if not check_health():
  956. print("\n❌ API 不可用")
  957. else:
  958. test_ocr_pdf(
  959. ocrpdf_file,
  960. remove_watermark=ocrpdf_nowm,
  961. crop_header_footer=ocrpdf_crop,
  962. max_pages=ocrpdf_pages,
  963. )
  964. else:
  965. # 遍历 OCR_TEST_CASES
  966. run_ocr_tests(
  967. remove_watermark=ocrpdf_nowm,
  968. crop_header_footer=ocrpdf_crop,
  969. max_pages=ocrpdf_pages,
  970. )
  971. elif doc_type == "pdf2md":
  972. # 解析 pdf2md 参数
  973. pdf2md_file = None
  974. pdf2md_backend = None
  975. pdf2md_nowm = False
  976. pdf2md_crop = False
  977. pdf2md_images = False
  978. pdf2md_wait = 600
  979. for arg in sys.argv[2:]:
  980. if arg == "--nowm":
  981. pdf2md_nowm = True
  982. elif arg == "--crop":
  983. pdf2md_crop = True
  984. elif arg == "--images":
  985. pdf2md_images = True
  986. elif arg.startswith("--backend="):
  987. pdf2md_backend = arg.split("=", 1)[1]
  988. elif arg.startswith("--wait="):
  989. try:
  990. pdf2md_wait = int(arg.split("=", 1)[1])
  991. except ValueError:
  992. print(f"警告: 无效的等待时间 {arg},使用默认值 600")
  993. elif not arg.startswith("--"):
  994. pdf2md_file = arg
  995. test_pdf_to_markdown(
  996. file_path=pdf2md_file,
  997. backend=pdf2md_backend,
  998. remove_watermark=pdf2md_nowm,
  999. crop_header_footer=pdf2md_crop,
  1000. return_images=pdf2md_images,
  1001. max_wait=pdf2md_wait,
  1002. )
  1003. else:
  1004. test_single(doc_type)
  1005. else:
  1006. # 运行所有测试
  1007. run_all_tests()