main.py 57 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406
  1. # Copyright (c) Opendatalab. All rights reserved.
  2. """
  3. PDF转换工具 FastAPI 版本 v2 - 使用外部API接口
  4. """
  5. import asyncio
  6. import os
  7. import shutil
  8. import tempfile
  9. import uuid
  10. import base64
  11. from datetime import datetime
  12. import json
  13. import zipfile
  14. from pathlib import Path
  15. from typing import Optional, List
  16. from urllib.parse import quote
  17. from fastapi import FastAPI, File, UploadFile, Form, HTTPException
  18. from fastapi.responses import FileResponse, JSONResponse, Response
  19. from fastapi.middleware.cors import CORSMiddleware
  20. from pydantic import BaseModel
  21. from typing_extensions import Annotated, Literal
  22. from ..processor.converter import convert_to_markdown, convert_pdf_to_markdown_only
  23. from ..utils.logging_config import get_logger
  24. from ..utils.pdf_watermark_remover import remove_watermark_from_pdf, crop_header_footer_from_pdf
  25. # 尝试导入配置
  26. try:
  27. from ..config import (
  28. DEFAULT_MODEL_NAME, DEFAULT_GPU_MEMORY_UTILIZATION, DEFAULT_DPI, DEFAULT_MAX_PAGES,
  29. DEFAULT_API_URL, DEFAULT_BACKEND, DEFAULT_PARSE_METHOD, DEFAULT_START_PAGE_ID,
  30. DEFAULT_END_PAGE_ID, DEFAULT_LANGUAGE, DEFAULT_RESPONSE_FORMAT_ZIP,
  31. DEFAULT_RETURN_MIDDLE_JSON, DEFAULT_RETURN_MODEL_OUTPUT, DEFAULT_RETURN_MD,
  32. DEFAULT_RETURN_IMAGES, DEFAULT_RETURN_CONTENT_LIST, DEFAULT_SERVER_URL,
  33. LOG_DIR, LOG_LEVEL
  34. )
  35. except ImportError:
  36. # 如果导入失败,使用硬编码的默认值(不推荐,正常情况下应该能导入)
  37. DEFAULT_MODEL_NAME = "OpenDataLab/MinerU2.5-2509-1.2B"
  38. DEFAULT_GPU_MEMORY_UTILIZATION = 0.9
  39. DEFAULT_DPI = 200
  40. DEFAULT_MAX_PAGES = 10
  41. DEFAULT_API_URL = "http://127.0.0.1:5282"
  42. DEFAULT_BACKEND = "vlm-vllm-async-engine"
  43. DEFAULT_PARSE_METHOD = "auto"
  44. DEFAULT_START_PAGE_ID = 0
  45. DEFAULT_END_PAGE_ID = 99999
  46. DEFAULT_LANGUAGE = "ch"
  47. DEFAULT_RESPONSE_FORMAT_ZIP = True
  48. DEFAULT_RETURN_MIDDLE_JSON = False
  49. DEFAULT_RETURN_MODEL_OUTPUT = True
  50. DEFAULT_RETURN_MD = True
  51. DEFAULT_RETURN_IMAGES = True
  52. DEFAULT_RETURN_CONTENT_LIST = False
  53. DEFAULT_SERVER_URL = "string"
  54. LOG_DIR = "./logs"
  55. LOG_LEVEL = "INFO"
  56. # 初始化日志
  57. # v2 使用简化的日志配置,从 v1 复用或使用 loguru
  58. try:
  59. # 尝试导入 v1 的日志初始化函数
  60. import sys
  61. from pathlib import Path
  62. v1_path = Path(__file__).parent.parent.parent / "pdf_converter"
  63. if str(v1_path.parent) not in sys.path:
  64. sys.path.insert(0, str(v1_path.parent))
  65. from pdf_converter.utils.logging_config import init_logging
  66. init_logging(
  67. log_dir=LOG_DIR,
  68. log_level=LOG_LEVEL,
  69. log_to_file=True,
  70. log_to_console=True
  71. )
  72. except Exception:
  73. # 如果无法导入,直接使用 get_logger(会使用 loguru 后备)
  74. pass
  75. # 获取日志记录器
  76. logger = get_logger("pdf_converter_v2.api")
  77. # MinerU 服务管理器(延迟导入,避免循环依赖)
  78. _mineru_manager = None
  79. def get_mineru_manager():
  80. """获取 MinerU 服务管理器"""
  81. global _mineru_manager
  82. if _mineru_manager is None:
  83. from ..utils.mineru_service_manager import get_mineru_manager as _get_manager
  84. _mineru_manager = _get_manager()
  85. return _mineru_manager
  86. app = FastAPI(
  87. title="PDF转换工具API v2",
  88. description="将PDF转换为Markdown和JSON格式(使用外部API)",
  89. version="2.0.0"
  90. )
  91. # 添加验证错误处理器,记录详细错误信息
  92. from fastapi.exceptions import RequestValidationError
  93. from starlette.requests import Request
  94. @app.exception_handler(RequestValidationError)
  95. async def validation_exception_handler(request: Request, exc: RequestValidationError):
  96. """捕获 422 验证错误并记录详细信息"""
  97. logger.error(f"[验证错误] URL: {request.url}")
  98. logger.error(f"[验证错误] Method: {request.method}")
  99. logger.error(f"[验证错误] Headers: {dict(request.headers)}")
  100. logger.error(f"[验证错误] 错误详情: {exc.errors()}")
  101. return JSONResponse(
  102. status_code=422,
  103. content={"detail": exc.errors(), "body": str(exc.body) if hasattr(exc, 'body') else None}
  104. )
  105. # 配置CORS
  106. app.add_middleware(
  107. CORSMiddleware,
  108. allow_origins=["*"], # 生产环境应限制为特定域名
  109. allow_credentials=True,
  110. allow_methods=["*"],
  111. allow_headers=["*"],
  112. )
  113. # 存储任务状态
  114. task_status = {}
  115. # MinerU 定时管理器暂时禁用,保持原有逻辑
  116. # @app.on_event("startup")
  117. # async def startup_event():
  118. # """应用启动时初始化"""
  119. # logger.info("[启动] 正在初始化 MinerU 服务管理器...")
  120. # try:
  121. # manager = get_mineru_manager()
  122. # manager.start_monitor()
  123. # logger.info("[启动] MinerU 服务管理器初始化完成")
  124. # except Exception as e:
  125. # logger.warning(f"[启动] MinerU 服务管理器初始化失败(非致命): {e}")
  126. # @app.on_event("shutdown")
  127. # async def shutdown_event():
  128. # """应用关闭时清理"""
  129. # logger.info("[关闭] 正在停止 MinerU 服务监控...")
  130. # try:
  131. # manager = get_mineru_manager()
  132. # manager.stop_monitor()
  133. # logger.info("[关闭] MinerU 服务监控已停止")
  134. # except Exception as e:
  135. # logger.warning(f"[关闭] 停止 MinerU 服务监控失败: {e}")
  136. class ConversionRequest(BaseModel):
  137. """转换请求模型(v2 精简版)"""
  138. # 新增:强制文档类型(正式全称)
  139. doc_type: Optional[str] = None
  140. class ConversionResponse(BaseModel):
  141. """转换响应模型"""
  142. task_id: str
  143. status: str
  144. message: str
  145. markdown_file: Optional[str] = None
  146. json_file: Optional[str] = None
  147. document_type: Optional[str] = None
  148. class GpuInfo(BaseModel):
  149. """GPU监控信息(基于采集数据计算得出)"""
  150. gpu_index: Optional[int] = None
  151. gpu_memory_used: Optional[int] = None # 字节,任务期间的最大显存使用量
  152. gpu_utilization: Optional[float] = None # 百分比,平均GPU利用率
  153. gpu_memory_total: Optional[int] = None # 总显存(字节)
  154. gpu_name: Optional[str] = None
  155. # 以下为可选统计字段
  156. gpu_memory_used_avg: Optional[int] = None # 平均显存使用(字节)
  157. gpu_memory_used_max: Optional[int] = None # 最大显存使用(字节)
  158. gpu_utilization_max: Optional[float] = None # 最大GPU利用率(%)
  159. system_load_avg_1min: Optional[float] = None # 平均1分钟系统负载
  160. system_load_max_1min: Optional[float] = None # 最大1分钟系统负载
  161. system_load_avg_5min: Optional[float] = None # 平均5分钟系统负载
  162. system_load_max_5min: Optional[float] = None # 最大5分钟系统负载
  163. system_load_avg_15min: Optional[float] = None # 平均15分钟系统负载
  164. system_load_max_15min: Optional[float] = None # 最大15分钟系统负载
  165. sample_count: Optional[int] = None # 采集的样本数量
  166. duration: Optional[float] = None # 监控持续时间(秒)
  167. class TaskStatus(BaseModel):
  168. """任务状态模型"""
  169. task_id: str
  170. status: str # pending, processing, completed, failed
  171. message: str
  172. progress: Optional[float] = None
  173. markdown_file: Optional[str] = None
  174. json_file: Optional[str] = None
  175. document_type: Optional[str] = None
  176. error: Optional[str] = None
  177. gpu_info: Optional[GpuInfo] = None # GPU监控信息
  178. class OCRRequest(BaseModel):
  179. """OCR识别请求模型"""
  180. image_base64: str # base64编码的图片数据
  181. image_format: Optional[str] = "png" # 图片格式:png, jpg, jpeg
  182. remove_watermark: Optional[bool] = False # 是否去除水印
  183. watermark_light_threshold: Optional[int] = 200 # 水印亮度阈值(0-255),高于此值的浅色像素可能是水印
  184. watermark_saturation_threshold: Optional[int] = 30 # 水印饱和度阈值(0-255),低于此值的低饱和度像素可能是水印
  185. crop_header_footer: Optional[bool] = False # 是否裁剪页眉页脚
  186. header_ratio: Optional[float] = 0.05 # 页眉裁剪比例(0-1),默认5%
  187. footer_ratio: Optional[float] = 0.05 # 页脚裁剪比例(0-1),默认5%
  188. class OCRResponse(BaseModel):
  189. """OCR识别响应模型"""
  190. code: int # 状态码:0表示成功,-1或其他表示错误
  191. message: str # 消息
  192. data: Optional[dict] = None # 数据,包含texts和full_text
  193. gpu_info: Optional[GpuInfo] = None # GPU监控信息
  194. class PdfToMarkdownResponse(BaseModel):
  195. """PDF 转 Markdown 同步接口响应"""
  196. markdown: str # 生成的 Markdown 全文
  197. filename: str # 建议的文件名(如 xxx.md)
  198. @app.get("/")
  199. async def root():
  200. """API根路径"""
  201. return {
  202. "name": "PDF转换工具API",
  203. "version": "2.0.0",
  204. "description": "将PDF/图片转换为Markdown和JSON格式(使用外部API)",
  205. "workflow": {
  206. "step1": "POST /convert - 上传文件,立即返回 task_id(不等待处理)",
  207. "step2": "GET /task/{task_id} - 轮询查询任务状态",
  208. "step3a": "GET /task/{task_id}/json - 任务完成后直接获取JSON数据(推荐)",
  209. "step3b": "GET /download/{task_id}/json - 任务完成后下载JSON文件",
  210. "step4": "DELETE /task/{task_id} - (可选) 删除任务清理临时文件"
  211. },
  212. "endpoints": {
  213. "POST /convert": "转换PDF/图片文件(异步,立即返回task_id)",
  214. "POST /pdf_to_markdown": "PDF/图片转 Markdown(异步,立即返回task_id,通过 task_id 查询状态并下载 .md)",
  215. "GET /task/{task_id}": "查询任务状态(轮询接口)",
  216. "GET /task/{task_id}/json": "直接获取JSON数据(返回JSON对象,不下载文件)",
  217. "GET /download/{task_id}/markdown": "下载Markdown文件",
  218. "GET /download/{task_id}/zip": "下载 md+图片 压缩包(需 POST /pdf_to_markdown 时 return_images=true)",
  219. "GET /download/{task_id}/json": "下载JSON文件",
  220. "DELETE /task/{task_id}": "删除任务及其临时文件",
  221. "GET /health": "健康检查"
  222. }
  223. }
  224. @app.get("/health")
  225. async def health_check():
  226. """健康检查"""
  227. return {"status": "healthy", "service": "pdf_converter_v2"}
  228. @app.get("/mineru/status")
  229. async def mineru_status():
  230. """获取 MinerU 服务状态"""
  231. try:
  232. manager = get_mineru_manager()
  233. return manager.get_status()
  234. except Exception as e:
  235. logger.exception(f"获取 MinerU 状态失败: {e}")
  236. return {"error": str(e)}
  237. @app.post("/mineru/start")
  238. async def mineru_start():
  239. """手动启动 MinerU 服务"""
  240. try:
  241. manager = get_mineru_manager()
  242. success = await manager.start_service()
  243. return {
  244. "success": success,
  245. "message": "服务已启动" if success else "服务启动失败",
  246. "status": manager.get_status()
  247. }
  248. except Exception as e:
  249. logger.exception(f"启动 MinerU 服务失败: {e}")
  250. return {"success": False, "error": str(e)}
  251. @app.post("/mineru/stop")
  252. async def mineru_stop():
  253. """手动停止 MinerU 服务(仅在无活跃任务时)"""
  254. try:
  255. manager = get_mineru_manager()
  256. success = await manager.stop_service()
  257. return {
  258. "success": success,
  259. "message": "服务已停止" if success else "服务停止失败(可能有活跃任务)",
  260. "status": manager.get_status()
  261. }
  262. except Exception as e:
  263. logger.exception(f"停止 MinerU 服务失败: {e}")
  264. return {"success": False, "error": str(e)}
  265. async def process_conversion_task(
  266. task_id: str,
  267. file_path: str,
  268. output_dir: str,
  269. request: ConversionRequest
  270. ):
  271. """
  272. 后台处理转换任务
  273. 注意:这个函数在响应返回给客户端之后才会执行
  274. """
  275. # 资源监控:启动后台采集线程(每0.5秒采集一次)
  276. from ..utils.resource_monitor import ResourceMonitor
  277. monitor = ResourceMonitor(interval=0.5)
  278. monitor.start()
  279. try:
  280. logger.info(f"[任务 {task_id}] 后台任务开始执行...")
  281. task_status[task_id]["status"] = "processing"
  282. task_status[task_id]["message"] = "开始处理文件..."
  283. logger.info(f"[任务 {task_id}] 开始处理: {file_path}")
  284. result = None
  285. tables_info = None
  286. # 针对投资估算类型,需要先切割附件页
  287. if request.doc_type in ("fsApproval", "fsReview", "pdApproval", "safetyFsApproval"):
  288. logger.info(f"[任务 {task_id}] 文档类型 {request.doc_type},需要先切割附件页")
  289. # 导入附件页切割函数
  290. import sys
  291. from pathlib import Path as PathLib
  292. sys.path.insert(0, str(PathLib(__file__).parent.parent))
  293. attachment_split_success = False
  294. try:
  295. from test_no import split_attachment_pages
  296. # 创建附件页输出目录
  297. attachment_dir = PathLib(output_dir) / "attachments"
  298. attachment_dir.mkdir(parents=True, exist_ok=True)
  299. # 切割附件页
  300. logger.info(f"[任务 {task_id}] 开始切割附件页,输出目录: {attachment_dir}")
  301. await asyncio.to_thread(
  302. split_attachment_pages,
  303. file_path,
  304. attachment_dir,
  305. use_ocr=True,
  306. debug=False
  307. )
  308. # 查找切割后的附件页PDF
  309. attachment_pdfs = list(attachment_dir.glob("*_附件页_*.pdf"))
  310. logger.info(f"[任务 {task_id}] 附件页目录内容: {list(attachment_dir.iterdir()) if attachment_dir.exists() else '(目录不存在)'}")
  311. if attachment_pdfs:
  312. # 使用第一个附件页PDF作为输入
  313. file_path = str(attachment_pdfs[0])
  314. logger.info(f"[任务 {task_id}] 附件页切割完成,使用文件: {file_path}")
  315. attachment_split_success = True
  316. else:
  317. logger.warning(f"[任务 {task_id}] 未找到附件页PDF文件,使用原始文件")
  318. logger.info(f"[任务 {task_id}] 提示: 如果PDF是扫描件,请确保安装了Tesseract OCR或PaddleOCR以启用文本识别")
  319. except ImportError as e:
  320. logger.error(f"[任务 {task_id}] 导入附件页切割模块失败: {e}")
  321. logger.warning(f"[任务 {task_id}] 将使用原始文件继续处理")
  322. except Exception as e:
  323. logger.exception(f"[任务 {task_id}] 附件页切割失败: {e}")
  324. logger.warning(f"[任务 {task_id}] 将使用原始文件继续处理")
  325. logger.info(f"[任务 {task_id}] 附件页切割状态: {'成功' if attachment_split_success else '失败/跳过'},使用文件: {file_path}")
  326. # 针对结算报告 / 初设评审类文档,检查是否有文本层
  327. # 如果有文本层,直接执行表格提取,不调用外部 OCR API(速度更快)
  328. if request.doc_type in ("settlementReport", "designReview"):
  329. logger.info(f"[任务 {task_id}] 文档类型 {request.doc_type},检查 PDF 文本层...")
  330. # 检查 PDF 是否有文本层
  331. from ..utils.file_utils import check_pdf_has_text_layer
  332. has_text_layer, _ = await asyncio.to_thread(check_pdf_has_text_layer, file_path)
  333. if has_text_layer:
  334. # 有文本层,直接执行表格提取,跳过外部 OCR API
  335. logger.info(f"[任务 {task_id}] PDF 有文本层,直接执行表格提取(跳过外部 OCR API)")
  336. # 延迟导入,避免启动时因 pandas/numpy 版本冲突导致服务无法启动
  337. from ..utils.table_extractor import extract_and_filter_tables_for_pdf
  338. # 在线程池中执行表格提取(因为它是同步函数,使用 to_thread 避免阻塞事件循环)
  339. def run_table_extraction_sync():
  340. try:
  341. logger.info(f"[任务 {task_id}] 开始执行表格提取函数...")
  342. logger.info(f"[任务 {task_id}] 参数: pdf_path={file_path}, output_dir={output_dir}, doc_type={request.doc_type}")
  343. result = extract_and_filter_tables_for_pdf(
  344. pdf_path=file_path,
  345. base_output_dir=output_dir,
  346. doc_type=request.doc_type, # type: ignore[arg-type]
  347. )
  348. logger.info(f"[任务 {task_id}] 表格提取函数执行完成,返回结果: {result is not None}")
  349. return result
  350. except Exception as e:
  351. logger.exception(f"[任务 {task_id}] 表格提取/筛选失败: {e}")
  352. return None
  353. # 执行表格提取
  354. tables_info = await asyncio.to_thread(run_table_extraction_sync)
  355. # 构造一个简单的 result,包含必要的字段
  356. if tables_info:
  357. # 将表格信息挂到任务状态,方便后续调试或扩展
  358. task_status[task_id]["tables"] = tables_info
  359. logger.info(
  360. f"[任务 {task_id}] 表格提取完成,筛选目录: {tables_info.get('filtered_dir')}"
  361. )
  362. # 构造 result,包含解析后的 JSON 数据
  363. result = {
  364. "markdown_file": None, # 这两个类型不需要 markdown
  365. "json_file": None, # JSON 数据直接放在 json_data 中
  366. "json_data": {
  367. "document_type": request.doc_type,
  368. "data": tables_info.get("parsed_data", {}),
  369. }
  370. }
  371. else:
  372. # 表格提取失败,返回错误
  373. logger.error(f"[任务 {task_id}] 表格提取失败,返回空结果")
  374. result = {
  375. "markdown_file": None,
  376. "json_file": None,
  377. "json_data": {
  378. "document_type": request.doc_type,
  379. "data": {},
  380. "error": "表格提取失败"
  381. }
  382. }
  383. else:
  384. # 没有文本层(扫描件),需要调用外部 OCR API
  385. logger.warning(f"[任务 {task_id}] PDF 无文本层(可能是扫描件),调用外部 OCR API")
  386. # MinerU 服务管理暂时禁用,保持原有逻辑
  387. # mineru_mgr = get_mineru_manager()
  388. # await mineru_mgr.start_service()
  389. # mineru_mgr.task_started()
  390. result = await convert_to_markdown(
  391. input_file=file_path,
  392. output_dir=output_dir,
  393. is_ocr=True, # 启用 OCR
  394. formula_enable=True,
  395. table_enable=True,
  396. language=DEFAULT_LANGUAGE,
  397. backend=DEFAULT_BACKEND,
  398. url=None,
  399. embed_images=False,
  400. output_json=True,
  401. start_page_id=DEFAULT_START_PAGE_ID,
  402. end_page_id=DEFAULT_END_PAGE_ID,
  403. parse_method=DEFAULT_PARSE_METHOD,
  404. server_url=DEFAULT_SERVER_URL,
  405. response_format_zip=DEFAULT_RESPONSE_FORMAT_ZIP,
  406. return_middle_json=DEFAULT_RETURN_MIDDLE_JSON,
  407. return_model_output=DEFAULT_RETURN_MODEL_OUTPUT,
  408. return_md=DEFAULT_RETURN_MD,
  409. return_images=DEFAULT_RETURN_IMAGES,
  410. return_content_list=DEFAULT_RETURN_CONTENT_LIST,
  411. forced_document_type=request.doc_type
  412. )
  413. else:
  414. # 其他类型(包括投资类型 fsApproval, fsReview, pdApproval 以及 noiseRec, emRec, opStatus)
  415. # 执行转换(v2 使用外部API)
  416. # v2 特有的参数通过配置或环境变量获取
  417. # MinerU 服务管理暂时禁用,保持原有逻辑
  418. # mineru_mgr = get_mineru_manager()
  419. # await mineru_mgr.start_service()
  420. # mineru_mgr.task_started()
  421. result = await convert_to_markdown(
  422. input_file=file_path,
  423. output_dir=output_dir,
  424. # v2: 去除max_pages、公式/表格等前端可调参数
  425. is_ocr=False,
  426. formula_enable=True,
  427. table_enable=True,
  428. language=DEFAULT_LANGUAGE,
  429. backend=DEFAULT_BACKEND,
  430. url=None,
  431. # v2: 固定为 False
  432. embed_images=False,
  433. output_json=True,
  434. start_page_id=DEFAULT_START_PAGE_ID,
  435. end_page_id=DEFAULT_END_PAGE_ID,
  436. parse_method=DEFAULT_PARSE_METHOD,
  437. server_url=DEFAULT_SERVER_URL,
  438. response_format_zip=DEFAULT_RESPONSE_FORMAT_ZIP,
  439. return_middle_json=DEFAULT_RETURN_MIDDLE_JSON,
  440. return_model_output=DEFAULT_RETURN_MODEL_OUTPUT,
  441. return_md=DEFAULT_RETURN_MD,
  442. return_images=DEFAULT_RETURN_IMAGES,
  443. return_content_list=DEFAULT_RETURN_CONTENT_LIST,
  444. forced_document_type=request.doc_type
  445. )
  446. # 停止监控并获取统计结果(基于采集的数据计算)
  447. monitor.stop()
  448. stats = monitor.get_statistics()
  449. if stats:
  450. task_status[task_id]["gpu_info"] = stats
  451. if result:
  452. task_status[task_id]["status"] = "completed"
  453. task_status[task_id]["message"] = "转换成功"
  454. task_status[task_id]["markdown_file"] = result.get("markdown_file")
  455. task_status[task_id]["json_file"] = result.get("json_file")
  456. # 保存JSON数据内容,以便直接返回
  457. if result.get("json_data"):
  458. json_data = result["json_data"].copy()
  459. task_status[task_id]["json_data"] = json_data
  460. task_status[task_id]["document_type"] = json_data.get("document_type")
  461. logger.info(f"[任务 {task_id}] 处理成功")
  462. else:
  463. task_status[task_id]["status"] = "failed"
  464. task_status[task_id]["message"] = "转换失败"
  465. task_status[task_id]["error"] = "转换返回None"
  466. logger.error(f"[任务 {task_id}] 转换失败")
  467. except Exception as e:
  468. # 停止监控并获取统计结果(即使异常也记录)
  469. monitor.stop()
  470. stats = monitor.get_statistics()
  471. if stats:
  472. task_status[task_id]["gpu_info"] = stats
  473. task_status[task_id]["status"] = "failed"
  474. task_status[task_id]["message"] = f"处理出错: {str(e)}"
  475. task_status[task_id]["error"] = str(e)
  476. logger.exception(f"[任务 {task_id}] 处理失败: {e}")
  477. # 注意:不再在转换完成后立即删除上传的文件
  478. # 文件将保留在临时目录中,直到用户调用 DELETE /task/{task_id} 接口时才清理
  479. # 这样可以方便用户查看上传的文件内容
  480. async def process_pdf_to_markdown_task(
  481. task_id: str,
  482. file_path: str,
  483. output_dir: str,
  484. backend: str,
  485. remove_watermark: bool,
  486. watermark_light_threshold: int,
  487. watermark_saturation_threshold: int,
  488. crop_header_footer: bool,
  489. header_ratio: float,
  490. footer_ratio: float,
  491. return_images: bool = False,
  492. ):
  493. """后台执行 PDF/图片转 Markdown(仅转 MD,无 doc_type 等)。"""
  494. try:
  495. logger.info(f"[任务 {task_id}] PDF转Markdown 后台任务开始...")
  496. task_status[task_id]["status"] = "processing"
  497. task_status[task_id]["message"] = "正在转换 PDF/图片为 Markdown..."
  498. ext = (Path(file_path).suffix or "").lower()
  499. is_pdf = ext == ".pdf"
  500. current_path = file_path
  501. if is_pdf and remove_watermark:
  502. next_path = os.path.join(os.path.dirname(output_dir), "no_watermark.pdf")
  503. ok = await asyncio.to_thread(
  504. remove_watermark_from_pdf,
  505. current_path,
  506. next_path,
  507. light_threshold=watermark_light_threshold,
  508. saturation_threshold=watermark_saturation_threshold,
  509. )
  510. if ok:
  511. current_path = next_path
  512. else:
  513. logger.warning(f"[任务 {task_id}] 去水印失败,使用原文件继续")
  514. if is_pdf and crop_header_footer:
  515. next_path = os.path.join(os.path.dirname(output_dir), "cropped.pdf")
  516. ok = await asyncio.to_thread(
  517. crop_header_footer_from_pdf,
  518. current_path,
  519. next_path,
  520. header_ratio=header_ratio,
  521. footer_ratio=footer_ratio,
  522. )
  523. if ok:
  524. current_path = next_path
  525. else:
  526. logger.warning(f"[任务 {task_id}] 页眉页脚裁剪失败,使用原文件继续")
  527. result = await convert_pdf_to_markdown_only(
  528. input_file=current_path,
  529. output_dir=output_dir,
  530. backend=backend,
  531. url=None,
  532. return_images=return_images,
  533. )
  534. if not result:
  535. task_status[task_id]["status"] = "failed"
  536. task_status[task_id]["message"] = "转换失败"
  537. task_status[task_id]["error"] = "PDF 转 Markdown 返回空"
  538. logger.error(f"[任务 {task_id}] PDF 转 Markdown 返回空")
  539. return
  540. md_content = result.get("markdown", "")
  541. filename = result.get("filename", "output.md")
  542. if not filename.endswith(".md"):
  543. filename = filename + ".md"
  544. markdown_file_path = os.path.join(output_dir, filename)
  545. with open(markdown_file_path, "w", encoding="utf-8") as f:
  546. f.write(md_content)
  547. task_status[task_id]["status"] = "completed"
  548. task_status[task_id]["message"] = "转换成功"
  549. task_status[task_id]["markdown_file"] = markdown_file_path
  550. task_status[task_id]["json_data"] = {"markdown": md_content, "filename": filename}
  551. task_status[task_id]["document_type"] = None
  552. if return_images:
  553. zip_basename = f"{datetime.now().strftime('%Y%m%d_%H%M%S')}_{task_id[:8]}.zip"
  554. zip_path = os.path.join(output_dir, zip_basename)
  555. try:
  556. with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zf:
  557. for root, _, files in os.walk(output_dir):
  558. for f in files:
  559. if f == zip_basename:
  560. continue
  561. abs_path = os.path.join(root, f)
  562. arcname = os.path.relpath(abs_path, output_dir)
  563. zf.write(abs_path, arcname)
  564. task_status[task_id]["zip_file"] = zip_path
  565. logger.info(f"[任务 {task_id}] 已打包 md+图片: {zip_path}")
  566. except Exception as e:
  567. logger.warning(f"[任务 {task_id}] 打包 zip 失败: {e}")
  568. logger.info(f"[任务 {task_id}] PDF 转 Markdown 完成: {markdown_file_path}")
  569. except Exception as e:
  570. task_status[task_id]["status"] = "failed"
  571. task_status[task_id]["message"] = f"处理出错: {str(e)}"
  572. task_status[task_id]["error"] = str(e)
  573. logger.exception(f"[任务 {task_id}] PDF 转 Markdown 失败: {e}")
  574. @app.post("/convert", response_model=ConversionResponse)
  575. async def convert_file(
  576. file: Annotated[UploadFile, File(description="上传的PDF或图片文件")],
  577. # 新增:类型参数(英文传参)含 safetyFsApproval 安评可研批复
  578. type: Annotated[
  579. Optional[Literal["noiseRec", "emRec", "opStatus", "settlementReport", "designReview", "fsApproval", "fsReview", "pdApproval", "safetyFsApproval", "finalAccount"]],
  580. Form(description="文档类型:noiseRec | emRec | opStatus | settlementReport | designReview | fsApproval | fsReview | pdApproval | safetyFsApproval | finalAccount")
  581. ] = None,
  582. ):
  583. """
  584. 转换PDF/图片文件(异步处理)
  585. 工作流程:
  586. 1. 接收文件并生成task_id
  587. 2. 立即返回task_id(不等待任何处理)
  588. 3. 后台异步执行转换任务(调用外部API)
  589. 4. 客户端使用task_id轮询状态或直接获取结果
  590. - **file**: 上传的文件(PDF或图片)
  591. - **type**: 文档类型
  592. * noiseRec - 噪声检测
  593. * emRec - 电磁检测
  594. * opStatus - 工况信息
  595. * settlementReport - 结算报告
  596. * designReview - 设计评审
  597. * fsApproval - 可研批复投资估算
  598. * fsReview - 可研评审投资估算
  599. * pdApproval - 初设批复概算投资
  600. * safetyFsApproval - 安评可研批复
  601. 注意:v2 版本内部使用外部API进行转换,v2特有的配置参数(如API URL、backend等)
  602. 通过环境变量或配置文件设置,不通过API参数传入。
  603. """
  604. # 生成任务ID
  605. task_id = str(uuid.uuid4())
  606. # 创建临时目录和输出目录
  607. temp_dir = tempfile.mkdtemp(prefix=f"pdf_converter_v2_{task_id}_")
  608. output_dir = os.path.join(temp_dir, "output")
  609. os.makedirs(output_dir, exist_ok=True)
  610. # 保存上传的文件
  611. # 不使用原始文件名,直接使用简单的固定命名,避免文件名过长问题
  612. # 先尝试从Content-Type推断扩展名
  613. content_type = file.content_type or ""
  614. extension_map = {
  615. "application/pdf": ".pdf",
  616. "image/png": ".png",
  617. "image/jpeg": ".jpg",
  618. "image/jpg": ".jpg",
  619. }
  620. ext = extension_map.get(content_type, "")
  621. # 如果没有从Content-Type获取到,尝试从原始文件名获取扩展名
  622. if not ext and file.filename:
  623. ext = Path(file.filename).suffix
  624. # 如果还是没有,使用默认扩展名
  625. if not ext:
  626. ext = ".pdf" # 默认假设是PDF
  627. # 使用简单的固定文件名
  628. file_path = os.path.join(temp_dir, f"file{ext}")
  629. try:
  630. with open(file_path, "wb") as f:
  631. content = await file.read()
  632. f.write(content)
  633. logger.info(f"[任务 {task_id}] 文件已保存: {file_path} ({len(content)} bytes)")
  634. # 如果保存后文件名仍然没有扩展名,尝试通过文件内容检测并重命名
  635. if not Path(file_path).suffix:
  636. from ..utils.paddleocr_fallback import detect_file_type
  637. detected_type = detect_file_type(file_path)
  638. if detected_type:
  639. ext_map = {
  640. "pdf": ".pdf",
  641. "png": ".png",
  642. "jpeg": ".jpg",
  643. }
  644. ext = ext_map.get(detected_type)
  645. if ext:
  646. new_file_path = os.path.join(temp_dir, f"file{ext}")
  647. os.rename(file_path, new_file_path)
  648. file_path = new_file_path
  649. logger.info(f"[任务 {task_id}] 通过文件内容检测到类型 {detected_type},重命名为: {file_path}")
  650. except Exception as e:
  651. raise HTTPException(status_code=500, detail=f"保存文件失败: {str(e)}")
  652. # 计算页数并限制:>300页直接报错;图片按1页处理
  653. try:
  654. suffix = (Path(file_path).suffix or "").lower()
  655. pages = 1
  656. if suffix == ".pdf":
  657. # 粗略统计:基于PDF标记
  658. with open(file_path, "rb") as pf:
  659. pdf_bytes = pf.read()
  660. try:
  661. pages = pdf_bytes.count(b"/Type /Page")
  662. if pages <= 0:
  663. pages = 1
  664. except Exception:
  665. pages = 1
  666. else:
  667. # 常见图片格式视为单页
  668. pages = 1
  669. if pages > 300:
  670. # 清理临时目录后报错
  671. try:
  672. shutil.rmtree(temp_dir)
  673. except Exception:
  674. pass
  675. raise HTTPException(status_code=400, detail="文件页数超过300页,拒绝处理")
  676. logger.info(f"[任务 {task_id}] 页数评估: {pages}")
  677. except HTTPException:
  678. raise
  679. except Exception as e:
  680. logger.warning(f"[任务 {task_id}] 页数评估失败,按1页处理: {e}")
  681. # 初始化任务状态
  682. task_status[task_id] = {
  683. "status": "pending",
  684. "message": "任务已创建",
  685. "progress": 0.0,
  686. "markdown_file": None,
  687. "json_file": None,
  688. "json_data": None, # 存储JSON数据内容
  689. "document_type": None,
  690. "error": None,
  691. "temp_dir": temp_dir,
  692. "output_dir": output_dir,
  693. "file_path": file_path # 保存上传文件的路径,方便查看
  694. }
  695. # 处理类型参数映射
  696. type_map = {
  697. "noiseRec": "noiseMonitoringRecord",
  698. "emRec": "electromagneticTestRecord",
  699. "opStatus": "operatingConditionInfo",
  700. # 结算报告类
  701. "settlementReport": "settlementReport",
  702. # 初设评审类
  703. "designReview": "designReview",
  704. # 投资估算类(新增)
  705. "fsApproval": "fsApproval",
  706. "fsReview": "fsReview",
  707. "pdApproval": "pdApproval",
  708. "safetyFsApproval": "safetyFsApproval",
  709. # 决算报告
  710. "finalAccount": "finalAccount",
  711. }
  712. doc_type = None
  713. if type:
  714. if type not in type_map:
  715. # 清理临时目录后报错
  716. try:
  717. shutil.rmtree(temp_dir)
  718. except Exception:
  719. pass
  720. raise HTTPException(status_code=400, detail="无效的type参数")
  721. doc_type = type_map[type]
  722. # 创建请求对象(v2 精简)
  723. request = ConversionRequest(
  724. doc_type=doc_type,
  725. )
  726. # 使用 asyncio.create_task 创建后台任务,确保立即返回
  727. task = asyncio.create_task(
  728. process_conversion_task(
  729. task_id,
  730. file_path,
  731. output_dir,
  732. request
  733. )
  734. )
  735. # 立即返回task_id,不等待任何处理
  736. logger.info(f"[任务 {task_id}] 任务已创建并添加到后台,立即返回task_id")
  737. return ConversionResponse(
  738. task_id=task_id,
  739. status="pending",
  740. message="任务已创建,正在后台处理中,请使用task_id查询状态",
  741. markdown_file=None,
  742. json_file=None,
  743. document_type=None
  744. )
# Markdown text passed as `description=` to the POST /pdf_to_markdown route
# below. It covers the call flow (submit -> poll -> fetch result), the form
# parameters, and the processing limits. The text is user-facing API
# documentation, so it is kept verbatim.
PDF_TO_MARKDOWN_DESCRIPTION = """
将 **PDF 或图片** 转为纯 Markdown 文本的异步接口。提交后立即返回 `task_id`,不等待转换完成。
## 调用流程
1. **POST 本接口**:上传文件(`multipart/form-data`),请求体需包含 `file` 及可选表单项;响应返回 `task_id`、`status: "pending"`。
2. **轮询状态**:**GET /task/{task_id}**,直到 `status` 为 `completed` 或 `failed`。
3. **获取结果**(仅当 `status == "completed"` 时):
- **GET /download/{task_id}/markdown**:下载生成的 `.md` 文件;
- **GET /task/{task_id}/json**:获取 JSON `{ "markdown": "全文", "filename": "xxx.md" }`;
- 若提交时传了 **return_images=true**:**GET /download/{task_id}/zip** 下载 Markdown + 图片压缩包。
## 参数说明
| 参数 | 类型 | 必填 | 说明 |
|------|------|------|------|
| file | file | 是 | PDF 或图片文件(如 PNG、JPG)。 |
| backend | string | 否 | 识别引擎:`mineru`(默认,MinerU file_parse)或 `paddle`(PaddleOCR doc_parser)。 |
| remove_watermark | boolean | 否 | 是否先对 PDF 去水印,默认 `false`,仅对 PDF 生效。 |
| watermark_light_threshold | integer | 否 | 去水印亮度阈值 0–255,默认 200。 |
| watermark_saturation_threshold | integer | 否 | 去水印饱和度阈值 0–255,默认 30。 |
| crop_header_footer | boolean | 否 | 是否裁剪页眉页脚,默认 `false`,仅对 PDF 生效。 |
| header_ratio | number | 否 | 页眉裁剪比例 0–1,如 0.05 表示裁掉顶部 5%,默认 0.05。 |
| footer_ratio | number | 否 | 页脚裁剪比例 0–1,默认 0.05。 |
| return_images | boolean | 否 | 是否同时拉取并保存图片;为 `true` 时完成后可下载 zip(md+图片),默认 `false`。 |
## 限制与说明
- **页数**:单文件不超过 300 页,超过将返回 400。
- **大 PDF**:超过 50 页会按 50 页一段切割后分别转换再合并 MD,以降低 MinerU 端内存占用。
- 去水印、裁剪页眉页脚仅对 **PDF** 生效,图片类型会忽略这些参数。
"""
  771. @app.post(
  772. "/pdf_to_markdown",
  773. tags=["PDF转Markdown"],
  774. summary="PDF/图片转 Markdown(异步)",
  775. description=PDF_TO_MARKDOWN_DESCRIPTION,
  776. response_model=ConversionResponse,
  777. responses={
  778. 200: {
  779. "description": "成功创建任务,返回 task_id。需用 GET /task/{task_id} 轮询,完成后通过 GET /download/{task_id}/markdown 或 GET /task/{task_id}/json 获取结果;若传了 return_images=true 还可通过 GET /download/{task_id}/zip 下载 md+图片包。",
  780. "content": {
  781. "application/json": {
  782. "example": {
  783. "task_id": "550e8400-e29b-41d4-a716-446655440000",
  784. "status": "pending",
  785. "message": "任务已创建,请使用 GET /task/{task_id} 查询状态,完成后通过 GET /download/{task_id}/markdown 或 GET /task/{task_id}/json 获取结果",
  786. "markdown_file": None,
  787. "json_file": None,
  788. "document_type": None,
  789. }
  790. }
  791. },
  792. },
  793. 400: {
  794. "description": "请求非法:例如文件页数超过 300 页。",
  795. "content": {
  796. "application/json": {"example": {"detail": "文件页数超过 300 页,拒绝处理"}}
  797. },
  798. },
  799. 500: {
  800. "description": "服务端错误:如保存上传文件失败、转换过程异常等。",
  801. "content": {
  802. "application/json": {"example": {"detail": "保存文件失败: ..."}}
  803. },
  804. },
  805. },
  806. )
  807. async def pdf_to_markdown(
  808. file: Annotated[UploadFile, File(description="上传的 PDF 或图片文件(必填)")],
  809. backend: Annotated[
  810. Optional[Literal["mineru", "paddle"]],
  811. Form(description="识别后端:mineru = MinerU file_parse(默认);paddle = PaddleOCR doc_parser"),
  812. ] = "mineru",
  813. remove_watermark: Annotated[
  814. bool,
  815. Form(description="是否先对 PDF 去水印,仅对 PDF 生效,默认 false"),
  816. ] = False,
  817. watermark_light_threshold: Annotated[
  818. int,
  819. Form(description="去水印亮度阈值 0–255,高于此值的浅色像素视为水印,默认 200"),
  820. ] = 200,
  821. watermark_saturation_threshold: Annotated[
  822. int,
  823. Form(description="去水印饱和度阈值 0–255,低于此值的低饱和度像素视为水印,默认 30"),
  824. ] = 30,
  825. crop_header_footer: Annotated[
  826. bool,
  827. Form(description="是否裁剪 PDF 页眉页脚,仅对 PDF 生效,默认 false"),
  828. ] = False,
  829. header_ratio: Annotated[
  830. float,
  831. Form(description="页眉裁剪比例 0–1,如 0.05 表示裁掉顶部 5%,默认 0.05"),
  832. ] = 0.05,
  833. footer_ratio: Annotated[
  834. float,
  835. Form(description="页脚裁剪比例 0–1,如 0.05 表示裁掉底部 5%,默认 0.05"),
  836. ] = 0.05,
  837. return_images: Annotated[
  838. bool,
  839. Form(description="是否同时拉取并保存图片;为 true 时完成后可通过 GET /download/{task_id}/zip 下载 md+图片 压缩包,默认 false"),
  840. ] = False,
  841. ):
  842. """PDF/图片转 Markdown(异步):提交后立即返回 task_id,轮询 GET /task/{task_id} 后通过 GET /download/{task_id}/markdown 或 /zip 获取结果。"""
  843. task_id = str(uuid.uuid4())
  844. content_type = file.content_type or ""
  845. ext_map = {"application/pdf": ".pdf", "image/png": ".png", "image/jpeg": ".jpg", "image/jpg": ".jpg"}
  846. ext = ext_map.get(content_type, "") or (Path(file.filename or "").suffix if file.filename else "") or ".pdf"
  847. temp_dir = tempfile.mkdtemp(prefix=f"pdf_converter_v2_{task_id}_")
  848. file_path = os.path.join(temp_dir, f"file{ext}")
  849. try:
  850. content = await file.read()
  851. with open(file_path, "wb") as f:
  852. f.write(content)
  853. except Exception as e:
  854. try:
  855. shutil.rmtree(temp_dir)
  856. except Exception:
  857. pass
  858. raise HTTPException(status_code=500, detail=f"保存文件失败: {str(e)}")
  859. pages = 1
  860. if ext.lower() == ".pdf" and content:
  861. pages = max(1, content.count(b"/Type /Page"))
  862. if pages > 300:
  863. try:
  864. shutil.rmtree(temp_dir)
  865. except Exception:
  866. pass
  867. raise HTTPException(status_code=400, detail="文件页数超过 300 页,拒绝处理")
  868. output_dir = os.path.join(temp_dir, "output")
  869. os.makedirs(output_dir, exist_ok=True)
  870. task_status[task_id] = {
  871. "status": "pending",
  872. "message": "任务已创建",
  873. "progress": 0.0,
  874. "markdown_file": None,
  875. "json_file": None,
  876. "json_data": None,
  877. "document_type": None,
  878. "error": None,
  879. "temp_dir": temp_dir,
  880. "output_dir": output_dir,
  881. "file_path": file_path,
  882. "zip_file": None,
  883. }
  884. asyncio.create_task(
  885. process_pdf_to_markdown_task(
  886. task_id=task_id,
  887. file_path=file_path,
  888. output_dir=output_dir,
  889. backend=backend or "mineru",
  890. remove_watermark=remove_watermark,
  891. watermark_light_threshold=watermark_light_threshold,
  892. watermark_saturation_threshold=watermark_saturation_threshold,
  893. crop_header_footer=crop_header_footer,
  894. header_ratio=header_ratio,
  895. footer_ratio=footer_ratio,
  896. return_images=return_images,
  897. )
  898. )
  899. logger.info(f"[任务 {task_id}] PDF 转 Markdown 任务已创建,立即返回 task_id")
  900. return ConversionResponse(
  901. task_id=task_id,
  902. status="pending",
  903. message="任务已创建,请使用 GET /task/{task_id} 查询状态,完成后通过 GET /download/{task_id}/markdown 或 GET /task/{task_id}/json 获取结果",
  904. markdown_file=None,
  905. json_file=None,
  906. document_type=None,
  907. )
  908. @app.get("/task/{task_id}", response_model=TaskStatus)
  909. async def get_task_status(task_id: str):
  910. """
  911. 查询任务状态(轮询接口)
  912. 客户端应定期调用此接口查询任务状态,直到状态变为 "completed" 或 "failed"
  913. - **task_id**: 任务ID(从 /convert 接口返回)
  914. 状态说明:
  915. - **pending**: 等待处理
  916. - **processing**: 正在处理中
  917. - **completed**: 处理完成,可以使用 /task/{task_id}/json 获取JSON数据
  918. - **failed**: 处理失败,查看 error 字段获取错误信息
  919. """
  920. if task_id not in task_status:
  921. raise HTTPException(status_code=404, detail="任务不存在")
  922. status_info = task_status[task_id]
  923. # 处理GPU信息
  924. gpu_info_model = None
  925. if "gpu_info" in status_info and status_info["gpu_info"]:
  926. gpu_info_model = GpuInfo(**status_info["gpu_info"])
  927. return TaskStatus(
  928. task_id=task_id,
  929. status=status_info["status"],
  930. message=status_info["message"],
  931. progress=status_info.get("progress"),
  932. markdown_file=status_info.get("markdown_file"),
  933. json_file=status_info.get("json_file"),
  934. document_type=status_info.get("document_type"),
  935. error=status_info.get("error"),
  936. gpu_info=gpu_info_model
  937. )
  938. @app.get("/download/{task_id}/markdown")
  939. async def download_markdown(task_id: str):
  940. """
  941. 下载Markdown文件
  942. - **task_id**: 任务ID
  943. """
  944. if task_id not in task_status:
  945. raise HTTPException(status_code=404, detail="任务不存在")
  946. status_info = task_status[task_id]
  947. if status_info["status"] != "completed":
  948. raise HTTPException(status_code=400, detail="任务尚未完成")
  949. markdown_file = status_info.get("markdown_file")
  950. if not markdown_file or not os.path.exists(markdown_file):
  951. raise HTTPException(status_code=404, detail="Markdown文件不存在")
  952. return FileResponse(
  953. markdown_file,
  954. media_type="text/markdown",
  955. filename=os.path.basename(markdown_file)
  956. )
  957. @app.get("/download/{task_id}/zip")
  958. async def download_zip(task_id: str):
  959. """
  960. 下载 Markdown + 图片 压缩包(仅当提交任务时传了 return_images=true 时存在)
  961. - **task_id**: 任务ID
  962. """
  963. if task_id not in task_status:
  964. raise HTTPException(status_code=404, detail="任务不存在")
  965. status_info = task_status[task_id]
  966. if status_info["status"] != "completed":
  967. raise HTTPException(status_code=400, detail="任务尚未完成")
  968. zip_file = status_info.get("zip_file")
  969. if not zip_file or not os.path.exists(zip_file):
  970. raise HTTPException(
  971. status_code=404,
  972. detail="未生成 zip(请在 POST /pdf_to_markdown 时传 return_images=true)",
  973. )
  974. return FileResponse(
  975. zip_file,
  976. media_type="application/zip",
  977. filename=os.path.basename(zip_file),
  978. )
  979. @app.get("/task/{task_id}/json")
  980. async def get_json(task_id: str):
  981. """
  982. 直接获取JSON数据(返回JSON内容,不下载文件)
  983. - **task_id**: 任务ID
  984. 返回:JSON格式的数据对象,包含解析后的文档内容
  985. """
  986. if task_id not in task_status:
  987. raise HTTPException(status_code=404, detail="任务不存在")
  988. status_info = task_status[task_id]
  989. if status_info["status"] == "pending" or status_info["status"] == "processing":
  990. raise HTTPException(status_code=400, detail="任务尚未完成,请稍后再试")
  991. if status_info["status"] == "failed":
  992. raise HTTPException(status_code=400, detail=f"任务失败: {status_info.get('error', '未知错误')}")
  993. json_data = status_info.get("json_data")
  994. if not json_data:
  995. # 如果没有保存JSON数据,尝试从文件读取
  996. json_file = status_info.get("json_file")
  997. if json_file and os.path.exists(json_file):
  998. try:
  999. with open(json_file, 'r', encoding='utf-8') as f:
  1000. json_data = json.load(f)
  1001. except Exception as e:
  1002. logger.error(f"[任务 {task_id}] 读取JSON文件失败: {e}")
  1003. raise HTTPException(status_code=500, detail="读取JSON文件失败")
  1004. else:
  1005. raise HTTPException(status_code=404, detail="JSON数据不存在(任务可能没有生成JSON数据)")
  1006. return JSONResponse(content=json_data)
  1007. @app.get("/download/{task_id}/json")
  1008. async def download_json(task_id: str):
  1009. """
  1010. 下载JSON文件(返回文件下载)
  1011. - **task_id**: 任务ID
  1012. """
  1013. if task_id not in task_status:
  1014. raise HTTPException(status_code=404, detail="任务不存在")
  1015. status_info = task_status[task_id]
  1016. if status_info["status"] != "completed":
  1017. raise HTTPException(status_code=400, detail="任务尚未完成")
  1018. json_file = status_info.get("json_file")
  1019. if not json_file or not os.path.exists(json_file):
  1020. raise HTTPException(status_code=404, detail="JSON文件不存在")
  1021. return FileResponse(
  1022. json_file,
  1023. media_type="application/json",
  1024. filename=os.path.basename(json_file)
  1025. )
  1026. @app.delete("/task/{task_id}")
  1027. async def delete_task(task_id: str):
  1028. """
  1029. 删除任务及其临时文件
  1030. - **task_id**: 任务ID
  1031. """
  1032. if task_id not in task_status:
  1033. raise HTTPException(status_code=404, detail="任务不存在")
  1034. status_info = task_status[task_id]
  1035. temp_dir = status_info.get("temp_dir")
  1036. # 清理临时目录
  1037. if temp_dir and os.path.exists(temp_dir):
  1038. try:
  1039. shutil.rmtree(temp_dir)
  1040. logger.info(f"[任务 {task_id}] 临时目录已清理: {temp_dir}")
  1041. except Exception as e:
  1042. logger.warning(f"[任务 {task_id}] 清理临时目录失败: {e}")
  1043. # 删除任务状态
  1044. del task_status[task_id]
  1045. return {"message": "任务已删除"}
  1046. @app.post("/ocr", response_model=OCRResponse)
  1047. async def ocr_image(request: OCRRequest):
  1048. """
  1049. 对base64编码的图片进行OCR识别
  1050. - **image_base64**: base64编码的图片数据(可以包含data:image/xxx;base64,前缀)
  1051. - **image_format**: 图片格式(png, jpg, jpeg),默认为png
  1052. - **remove_watermark**: 是否去除水印,默认为false
  1053. - **watermark_light_threshold**: 水印亮度阈值(0-255),默认200,高于此值的浅色像素可能是水印
  1054. - **watermark_saturation_threshold**: 水印饱和度阈值(0-255),默认30,低于此值的低饱和度像素可能是水印
  1055. - **crop_header_footer**: 是否裁剪页眉页脚,默认为false
  1056. - **header_ratio**: 页眉裁剪比例(0-1),默认0.05表示裁剪顶部5%
  1057. - **footer_ratio**: 页脚裁剪比例(0-1),默认0.05表示裁剪底部5%
  1058. 返回识别出的文本列表和GPU监控信息
  1059. """
  1060. temp_dir = None
  1061. image_path = None
  1062. # 资源监控:启动后台采集线程(每0.5秒采集一次)
  1063. from ..utils.resource_monitor import ResourceMonitor
  1064. monitor = ResourceMonitor(interval=0.5)
  1065. monitor.start()
  1066. try:
  1067. # 创建临时目录
  1068. temp_dir = tempfile.mkdtemp(prefix=f"pdf_converter_ocr_{uuid.uuid4()}_")
  1069. logger.info(f"[OCR] 创建临时目录: {temp_dir}")
  1070. # 解码base64数据
  1071. image_base64 = request.image_base64.strip()
  1072. # 移除可能的数据URI前缀(如 data:image/png;base64,)
  1073. if "," in image_base64:
  1074. image_base64 = image_base64.split(",")[-1]
  1075. try:
  1076. image_bytes = base64.b64decode(image_base64)
  1077. logger.info(f"[OCR] Base64解码成功,图片大小: {len(image_bytes)} bytes")
  1078. except Exception as e:
  1079. logger.error(f"[OCR] Base64解码失败: {e}")
  1080. # 停止监控并获取统计结果
  1081. monitor.stop()
  1082. stats = monitor.get_statistics()
  1083. gpu_info_model = GpuInfo(**stats) if stats else None
  1084. return OCRResponse(
  1085. code=-1,
  1086. message="无法解码base64图片数据",
  1087. data=None,
  1088. gpu_info=gpu_info_model
  1089. )
  1090. # 确定图片格式和扩展名
  1091. image_format = request.image_format.lower() if request.image_format else "png"
  1092. if image_format not in ["png", "jpg", "jpeg"]:
  1093. image_format = "png"
  1094. ext_map = {
  1095. "png": ".png",
  1096. "jpg": ".jpg",
  1097. "jpeg": ".jpg"
  1098. }
  1099. ext = ext_map.get(image_format, ".png")
  1100. # 保存图片文件
  1101. image_filename = f"ocr_image_{uuid.uuid4().hex[:8]}{ext}"
  1102. image_path = os.path.join(temp_dir, image_filename)
  1103. with open(image_path, "wb") as f:
  1104. f.write(image_bytes)
  1105. logger.info(f"[OCR] 图片已保存: {image_path}")
  1106. # 如果需要裁剪页眉页脚,先进行裁剪
  1107. if request.crop_header_footer:
  1108. try:
  1109. from ..utils.image_preprocessor import crop_header_footer, check_opencv_available
  1110. if check_opencv_available():
  1111. logger.info(f"[OCR] 开始裁剪页眉页脚,顶部比例: {request.header_ratio}, 底部比例: {request.footer_ratio}")
  1112. # 裁剪后的图片路径
  1113. cropped_image_path = os.path.join(temp_dir, f"ocr_image_cropped{ext}")
  1114. image_path = crop_header_footer(
  1115. image_path,
  1116. output_path=cropped_image_path,
  1117. header_ratio=request.header_ratio or 0.05,
  1118. footer_ratio=request.footer_ratio or 0.05
  1119. )
  1120. logger.info(f"[OCR] 裁剪页眉页脚完成: {image_path}")
  1121. else:
  1122. logger.warning("[OCR] OpenCV 未安装,跳过裁剪页眉页脚")
  1123. except Exception as e:
  1124. logger.warning(f"[OCR] 裁剪页眉页脚失败,使用原图继续: {e}")
  1125. # 如果需要去水印,进行预处理
  1126. if request.remove_watermark:
  1127. try:
  1128. from ..utils.image_preprocessor import remove_watermark, check_opencv_available
  1129. if check_opencv_available():
  1130. logger.info(f"[OCR] 开始去水印处理,亮度阈值: {request.watermark_light_threshold}, 饱和度阈值: {request.watermark_saturation_threshold}")
  1131. # 去水印后的图片路径
  1132. nowm_image_path = os.path.join(temp_dir, f"ocr_image_nowm{ext}")
  1133. image_path = remove_watermark(
  1134. image_path,
  1135. output_path=nowm_image_path,
  1136. light_threshold=request.watermark_light_threshold or 200,
  1137. saturation_threshold=request.watermark_saturation_threshold or 30,
  1138. method="hsv"
  1139. )
  1140. logger.info(f"[OCR] 去水印完成: {image_path}")
  1141. else:
  1142. logger.warning("[OCR] OpenCV 未安装,跳过去水印处理")
  1143. except Exception as e:
  1144. logger.warning(f"[OCR] 去水印处理失败,使用原图继续: {e}")
  1145. # 调用PaddleOCR进行识别(监控线程在此期间持续采集数据)
  1146. from ..utils.paddleocr_fallback import call_paddleocr_ocr
  1147. # 创建保存OCR结果的目录
  1148. ocr_save_path = os.path.join(temp_dir, "ocr_output")
  1149. os.makedirs(ocr_save_path, exist_ok=True)
  1150. logger.info(f"[OCR] 开始调用PaddleOCR识别: {image_path}")
  1151. texts, md_file_path = call_paddleocr_ocr(image_path, ocr_save_path)
  1152. # 停止监控并获取统计结果(基于采集的数据计算)
  1153. monitor.stop()
  1154. stats = monitor.get_statistics()
  1155. gpu_info_model = GpuInfo(**stats) if stats else None
  1156. if texts is None:
  1157. logger.warning("[OCR] PaddleOCR识别失败或未返回结果")
  1158. return OCRResponse(
  1159. code=-1,
  1160. message="PaddleOCR未能识别出文本内容",
  1161. data=None,
  1162. gpu_info=gpu_info_model # 即使失败也返回GPU信息
  1163. )
  1164. # 返回所有文本(已按Y坐标排序并合并,保持正确顺序)
  1165. if not texts:
  1166. texts = []
  1167. # 直接使用texts数组,按行用\n连接生成完整文本
  1168. # texts已经是按Y坐标排序并合并的,顺序正确
  1169. full_text = "\n".join(texts) if texts else ""
  1170. # 记录文件位置
  1171. logger.info(f"[OCR] 识别成功,共识别出 {len(texts)} 个文本片段,完整文本长度: {len(full_text)} 字符")
  1172. logger.info(f"[OCR] 上传的图片已保存: {image_path}")
  1173. if md_file_path:
  1174. logger.info(f"[OCR] 生成的Markdown文件已保存: {md_file_path}")
  1175. logger.info(f"[OCR] 所有文件保存在目录: {temp_dir}")
  1176. return OCRResponse(
  1177. code=0,
  1178. message=f"成功识别出 {len(texts)} 个文本片段",
  1179. data={
  1180. "texts": texts,
  1181. "full_text": full_text
  1182. },
  1183. gpu_info=gpu_info_model # 返回GPU监控信息
  1184. )
  1185. except Exception as e:
  1186. # 停止监控并获取统计结果(即使异常也记录)
  1187. monitor.stop()
  1188. stats = monitor.get_statistics()
  1189. gpu_info_model = GpuInfo(**stats) if stats else None
  1190. logger.exception(f"[OCR] 处理失败: {e}")
  1191. return OCRResponse(
  1192. code=-1,
  1193. message=f"OCR处理过程中发生错误: {str(e)}",
  1194. data=None,
  1195. gpu_info=gpu_info_model
  1196. )
  1197. # 注意:不再删除临时文件,保留上传的图片和生成的markdown文件
  1198. # 启动时的初始化
  1199. @app.on_event("startup")
  1200. async def startup_event():
  1201. """应用启动时的初始化"""
  1202. logger.info("PDF转换工具API v2 服务启动")
  1203. logger.info("可用端点: POST /convert, GET /task/{task_id}, GET /download/{task_id}/*, POST /ocr")
  1204. # 关闭时的清理
  1205. @app.on_event("shutdown")
  1206. async def shutdown_event():
  1207. """应用关闭时的清理"""
  1208. logger.info("清理临时文件和任务状态...")
  1209. # 清理所有临时目录
  1210. for task_id, status_info in list(task_status.items()):
  1211. temp_dir = status_info.get("temp_dir")
  1212. if temp_dir and os.path.exists(temp_dir):
  1213. try:
  1214. shutil.rmtree(temp_dir)
  1215. except Exception as e:
  1216. logger.warning(f"清理任务 {task_id} 的临时目录失败: {e}")
  1217. task_status.clear()
  1218. logger.info("PDF转换工具API v2 服务关闭")