main.py 58 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403
  1. # Copyright (c) Opendatalab. All rights reserved.
  2. """
  3. PDF转换工具 FastAPI 版本 v2 - 使用外部API接口
  4. """
  5. import asyncio
  6. import os
  7. import shutil
  8. import tempfile
  9. import uuid
  10. import base64
  11. from datetime import datetime
  12. import json
  13. import zipfile
  14. from pathlib import Path
  15. from typing import Optional, List
  16. from urllib.parse import quote
  17. from fastapi import FastAPI, File, UploadFile, Form, HTTPException
  18. from fastapi.responses import FileResponse, JSONResponse, Response
  19. from fastapi.middleware.cors import CORSMiddleware
  20. from pydantic import BaseModel
  21. from typing_extensions import Annotated, Literal
  22. from ..processor.converter import convert_to_markdown, convert_pdf_to_markdown_only
  23. from ..utils.logging_config import get_logger
  24. from ..utils.pdf_watermark_remover import remove_watermark_from_pdf, crop_header_footer_from_pdf
  25. # 尝试导入配置,如果不存在则使用默认值
  26. try:
  27. from ..config import (
  28. DEFAULT_MODEL_NAME, DEFAULT_GPU_MEMORY_UTILIZATION, DEFAULT_DPI, DEFAULT_MAX_PAGES,
  29. DEFAULT_API_URL, DEFAULT_BACKEND, DEFAULT_PARSE_METHOD, DEFAULT_START_PAGE_ID,
  30. DEFAULT_END_PAGE_ID, DEFAULT_LANGUAGE, DEFAULT_RESPONSE_FORMAT_ZIP,
  31. DEFAULT_RETURN_MIDDLE_JSON, DEFAULT_RETURN_MODEL_OUTPUT, DEFAULT_RETURN_MD,
  32. DEFAULT_RETURN_IMAGES, DEFAULT_RETURN_CONTENT_LIST, DEFAULT_SERVER_URL
  33. )
  34. except ImportError:
  35. # 如果配置不存在,使用默认值
  36. DEFAULT_MODEL_NAME = "OpenDataLab/MinerU2.5-2509-1.2B"
  37. DEFAULT_GPU_MEMORY_UTILIZATION = 0.9
  38. DEFAULT_DPI = 200
  39. DEFAULT_MAX_PAGES = 10
  40. DEFAULT_API_URL = os.getenv("API_URL", "http://127.0.0.1:5282")
  41. DEFAULT_BACKEND = os.getenv("BACKEND", "pipeline")
  42. DEFAULT_PARSE_METHOD = os.getenv("PARSE_METHOD", "auto")
  43. DEFAULT_START_PAGE_ID = int(os.getenv("START_PAGE_ID", "0"))
  44. DEFAULT_END_PAGE_ID = int(os.getenv("END_PAGE_ID", "99999"))
  45. DEFAULT_LANGUAGE = os.getenv("LANGUAGE", "ch")
  46. DEFAULT_RESPONSE_FORMAT_ZIP = os.getenv("RESPONSE_FORMAT_ZIP", "true").lower() == "true"
  47. DEFAULT_RETURN_MIDDLE_JSON = os.getenv("RETURN_MIDDLE_JSON", "false").lower() == "true"
  48. DEFAULT_RETURN_MODEL_OUTPUT = os.getenv("RETURN_MODEL_OUTPUT", "true").lower() == "true"
  49. DEFAULT_RETURN_MD = os.getenv("RETURN_MD", "true").lower() == "true"
  50. DEFAULT_RETURN_IMAGES = os.getenv("RETURN_IMAGES", "true").lower() == "true" # 默认启用,以便PaddleOCR备用解析可以使用
  51. DEFAULT_RETURN_CONTENT_LIST = os.getenv("RETURN_CONTENT_LIST", "false").lower() == "true"
  52. DEFAULT_SERVER_URL = os.getenv("SERVER_URL", "string")
  53. # 初始化日志
  54. # v2 使用简化的日志配置,从 v1 复用或使用 loguru
  55. try:
  56. # 尝试导入 v1 的日志初始化函数
  57. import sys
  58. from pathlib import Path
  59. v1_path = Path(__file__).parent.parent.parent / "pdf_converter"
  60. if str(v1_path.parent) not in sys.path:
  61. sys.path.insert(0, str(v1_path.parent))
  62. from pdf_converter.utils.logging_config import init_logging
  63. init_logging(
  64. log_dir=os.getenv("PDF_CONVERTER_LOG_DIR", "./logs"),
  65. log_level=os.getenv("LOG_LEVEL", "INFO"),
  66. log_to_file=True,
  67. log_to_console=True
  68. )
  69. except Exception:
  70. # 如果无法导入,直接使用 get_logger(会使用 loguru 后备)
  71. pass
  72. # 获取日志记录器
  73. logger = get_logger("pdf_converter_v2.api")
  74. # MinerU 服务管理器(延迟导入,避免循环依赖)
  75. _mineru_manager = None
  76. def get_mineru_manager():
  77. """获取 MinerU 服务管理器"""
  78. global _mineru_manager
  79. if _mineru_manager is None:
  80. from ..utils.mineru_service_manager import get_mineru_manager as _get_manager
  81. _mineru_manager = _get_manager()
  82. return _mineru_manager
  83. app = FastAPI(
  84. title="PDF转换工具API v2",
  85. description="将PDF转换为Markdown和JSON格式(使用外部API)",
  86. version="2.0.0"
  87. )
  88. # 添加验证错误处理器,记录详细错误信息
  89. from fastapi.exceptions import RequestValidationError
  90. from starlette.requests import Request
  91. @app.exception_handler(RequestValidationError)
  92. async def validation_exception_handler(request: Request, exc: RequestValidationError):
  93. """捕获 422 验证错误并记录详细信息"""
  94. logger.error(f"[验证错误] URL: {request.url}")
  95. logger.error(f"[验证错误] Method: {request.method}")
  96. logger.error(f"[验证错误] Headers: {dict(request.headers)}")
  97. logger.error(f"[验证错误] 错误详情: {exc.errors()}")
  98. return JSONResponse(
  99. status_code=422,
  100. content={"detail": exc.errors(), "body": str(exc.body) if hasattr(exc, 'body') else None}
  101. )
  102. # 配置CORS
  103. app.add_middleware(
  104. CORSMiddleware,
  105. allow_origins=["*"], # 生产环境应限制为特定域名
  106. allow_credentials=True,
  107. allow_methods=["*"],
  108. allow_headers=["*"],
  109. )
  110. # 存储任务状态
  111. task_status = {}
  112. # MinerU 定时管理器暂时禁用,保持原有逻辑
  113. # @app.on_event("startup")
  114. # async def startup_event():
  115. # """应用启动时初始化"""
  116. # logger.info("[启动] 正在初始化 MinerU 服务管理器...")
  117. # try:
  118. # manager = get_mineru_manager()
  119. # manager.start_monitor()
  120. # logger.info("[启动] MinerU 服务管理器初始化完成")
  121. # except Exception as e:
  122. # logger.warning(f"[启动] MinerU 服务管理器初始化失败(非致命): {e}")
  123. # @app.on_event("shutdown")
  124. # async def shutdown_event():
  125. # """应用关闭时清理"""
  126. # logger.info("[关闭] 正在停止 MinerU 服务监控...")
  127. # try:
  128. # manager = get_mineru_manager()
  129. # manager.stop_monitor()
  130. # logger.info("[关闭] MinerU 服务监控已停止")
  131. # except Exception as e:
  132. # logger.warning(f"[关闭] 停止 MinerU 服务监控失败: {e}")
class ConversionRequest(BaseModel):
    """Conversion request model (v2, trimmed-down version)."""
    # Added: forced document type (formal full name)
    doc_type: Optional[str] = None
class ConversionResponse(BaseModel):
    """Conversion response model."""
    task_id: str
    status: str
    message: str
    markdown_file: Optional[str] = None
    json_file: Optional[str] = None
    document_type: Optional[str] = None
class GpuInfo(BaseModel):
    """GPU monitoring information (computed from sampled data)."""
    gpu_index: Optional[int] = None
    gpu_memory_used: Optional[int] = None # bytes; peak GPU memory used during the task
    gpu_utilization: Optional[float] = None # percent; average GPU utilization
    gpu_memory_total: Optional[int] = None # total GPU memory (bytes)
    gpu_name: Optional[str] = None
    # Optional statistics fields below
    gpu_memory_used_avg: Optional[int] = None # average GPU memory used (bytes)
    gpu_memory_used_max: Optional[int] = None # maximum GPU memory used (bytes)
    gpu_utilization_max: Optional[float] = None # maximum GPU utilization (%)
    system_load_avg_1min: Optional[float] = None # average 1-minute system load
    system_load_max_1min: Optional[float] = None # maximum 1-minute system load
    system_load_avg_5min: Optional[float] = None # average 5-minute system load
    system_load_max_5min: Optional[float] = None # maximum 5-minute system load
    system_load_avg_15min: Optional[float] = None # average 15-minute system load
    system_load_max_15min: Optional[float] = None # maximum 15-minute system load
    sample_count: Optional[int] = None # number of samples collected
    duration: Optional[float] = None # monitoring duration (seconds)
class TaskStatus(BaseModel):
    """Task status model."""
    task_id: str
    status: str # pending, processing, completed, failed
    message: str
    progress: Optional[float] = None
    markdown_file: Optional[str] = None
    json_file: Optional[str] = None
    document_type: Optional[str] = None
    error: Optional[str] = None
    gpu_info: Optional[GpuInfo] = None # GPU monitoring information
class OCRRequest(BaseModel):
    """OCR recognition request model."""
    image_base64: str # base64-encoded image data
    image_format: Optional[str] = "png" # image format: png, jpg, jpeg
    remove_watermark: Optional[bool] = False # whether to remove watermarks
    watermark_light_threshold: Optional[int] = 200 # watermark brightness threshold (0-255); lighter pixels above this value may be watermark
    watermark_saturation_threshold: Optional[int] = 30 # watermark saturation threshold (0-255); low-saturation pixels below this value may be watermark
    crop_header_footer: Optional[bool] = False # whether to crop the page header and footer
    header_ratio: Optional[float] = 0.05 # header crop ratio (0-1), default 5%
    footer_ratio: Optional[float] = 0.05 # footer crop ratio (0-1), default 5%
class OCRResponse(BaseModel):
    """OCR recognition response model."""
    code: int # status code: 0 means success, -1 or anything else means error
    message: str # message
    data: Optional[dict] = None # payload, containing texts and full_text
    gpu_info: Optional[GpuInfo] = None # GPU monitoring information
class PdfToMarkdownResponse(BaseModel):
    """Response of the synchronous PDF-to-Markdown endpoint."""
    markdown: str # full generated Markdown text
    filename: str # suggested file name (e.g. xxx.md)
  195. @app.get("/")
  196. async def root():
  197. """API根路径"""
  198. return {
  199. "name": "PDF转换工具API",
  200. "version": "2.0.0",
  201. "description": "将PDF/图片转换为Markdown和JSON格式(使用外部API)",
  202. "workflow": {
  203. "step1": "POST /convert - 上传文件,立即返回 task_id(不等待处理)",
  204. "step2": "GET /task/{task_id} - 轮询查询任务状态",
  205. "step3a": "GET /task/{task_id}/json - 任务完成后直接获取JSON数据(推荐)",
  206. "step3b": "GET /download/{task_id}/json - 任务完成后下载JSON文件",
  207. "step4": "DELETE /task/{task_id} - (可选) 删除任务清理临时文件"
  208. },
  209. "endpoints": {
  210. "POST /convert": "转换PDF/图片文件(异步,立即返回task_id)",
  211. "POST /pdf_to_markdown": "PDF/图片转 Markdown(异步,立即返回task_id,通过 task_id 查询状态并下载 .md)",
  212. "GET /task/{task_id}": "查询任务状态(轮询接口)",
  213. "GET /task/{task_id}/json": "直接获取JSON数据(返回JSON对象,不下载文件)",
  214. "GET /download/{task_id}/markdown": "下载Markdown文件",
  215. "GET /download/{task_id}/zip": "下载 md+图片 压缩包(需 POST /pdf_to_markdown 时 return_images=true)",
  216. "GET /download/{task_id}/json": "下载JSON文件",
  217. "DELETE /task/{task_id}": "删除任务及其临时文件",
  218. "GET /health": "健康检查"
  219. }
  220. }
  221. @app.get("/health")
  222. async def health_check():
  223. """健康检查"""
  224. return {"status": "healthy", "service": "pdf_converter_v2"}
  225. @app.get("/mineru/status")
  226. async def mineru_status():
  227. """获取 MinerU 服务状态"""
  228. try:
  229. manager = get_mineru_manager()
  230. return manager.get_status()
  231. except Exception as e:
  232. logger.exception(f"获取 MinerU 状态失败: {e}")
  233. return {"error": str(e)}
  234. @app.post("/mineru/start")
  235. async def mineru_start():
  236. """手动启动 MinerU 服务"""
  237. try:
  238. manager = get_mineru_manager()
  239. success = await manager.start_service()
  240. return {
  241. "success": success,
  242. "message": "服务已启动" if success else "服务启动失败",
  243. "status": manager.get_status()
  244. }
  245. except Exception as e:
  246. logger.exception(f"启动 MinerU 服务失败: {e}")
  247. return {"success": False, "error": str(e)}
  248. @app.post("/mineru/stop")
  249. async def mineru_stop():
  250. """手动停止 MinerU 服务(仅在无活跃任务时)"""
  251. try:
  252. manager = get_mineru_manager()
  253. success = await manager.stop_service()
  254. return {
  255. "success": success,
  256. "message": "服务已停止" if success else "服务停止失败(可能有活跃任务)",
  257. "status": manager.get_status()
  258. }
  259. except Exception as e:
  260. logger.exception(f"停止 MinerU 服务失败: {e}")
  261. return {"success": False, "error": str(e)}
  262. async def process_conversion_task(
  263. task_id: str,
  264. file_path: str,
  265. output_dir: str,
  266. request: ConversionRequest
  267. ):
  268. """
  269. 后台处理转换任务
  270. 注意:这个函数在响应返回给客户端之后才会执行
  271. """
  272. # 资源监控:启动后台采集线程(每0.5秒采集一次)
  273. from ..utils.resource_monitor import ResourceMonitor
  274. monitor = ResourceMonitor(interval=0.5)
  275. monitor.start()
  276. try:
  277. logger.info(f"[任务 {task_id}] 后台任务开始执行...")
  278. task_status[task_id]["status"] = "processing"
  279. task_status[task_id]["message"] = "开始处理文件..."
  280. logger.info(f"[任务 {task_id}] 开始处理: {file_path}")
  281. result = None
  282. tables_info = None
  283. # 针对投资估算类型,需要先切割附件页
  284. if request.doc_type in ("fsApproval", "fsReview", "pdApproval", "safetyFsApproval"):
  285. logger.info(f"[任务 {task_id}] 文档类型 {request.doc_type},需要先切割附件页")
  286. # 导入附件页切割函数
  287. import sys
  288. from pathlib import Path as PathLib
  289. sys.path.insert(0, str(PathLib(__file__).parent.parent))
  290. attachment_split_success = False
  291. try:
  292. from test_no import split_attachment_pages
  293. # 创建附件页输出目录
  294. attachment_dir = PathLib(output_dir) / "attachments"
  295. attachment_dir.mkdir(parents=True, exist_ok=True)
  296. # 切割附件页
  297. logger.info(f"[任务 {task_id}] 开始切割附件页,输出目录: {attachment_dir}")
  298. await asyncio.to_thread(
  299. split_attachment_pages,
  300. file_path,
  301. attachment_dir,
  302. use_ocr=True,
  303. debug=False
  304. )
  305. # 查找切割后的附件页PDF
  306. attachment_pdfs = list(attachment_dir.glob("*_附件页_*.pdf"))
  307. logger.info(f"[任务 {task_id}] 附件页目录内容: {list(attachment_dir.iterdir()) if attachment_dir.exists() else '(目录不存在)'}")
  308. if attachment_pdfs:
  309. # 使用第一个附件页PDF作为输入
  310. file_path = str(attachment_pdfs[0])
  311. logger.info(f"[任务 {task_id}] 附件页切割完成,使用文件: {file_path}")
  312. attachment_split_success = True
  313. else:
  314. logger.warning(f"[任务 {task_id}] 未找到附件页PDF文件,使用原始文件")
  315. logger.info(f"[任务 {task_id}] 提示: 如果PDF是扫描件,请确保安装了Tesseract OCR或PaddleOCR以启用文本识别")
  316. except ImportError as e:
  317. logger.error(f"[任务 {task_id}] 导入附件页切割模块失败: {e}")
  318. logger.warning(f"[任务 {task_id}] 将使用原始文件继续处理")
  319. except Exception as e:
  320. logger.exception(f"[任务 {task_id}] 附件页切割失败: {e}")
  321. logger.warning(f"[任务 {task_id}] 将使用原始文件继续处理")
  322. logger.info(f"[任务 {task_id}] 附件页切割状态: {'成功' if attachment_split_success else '失败/跳过'},使用文件: {file_path}")
  323. # 针对结算报告 / 初设评审类文档,检查是否有文本层
  324. # 如果有文本层,直接执行表格提取,不调用外部 OCR API(速度更快)
  325. if request.doc_type in ("settlementReport", "designReview"):
  326. logger.info(f"[任务 {task_id}] 文档类型 {request.doc_type},检查 PDF 文本层...")
  327. # 检查 PDF 是否有文本层
  328. from ..utils.file_utils import check_pdf_has_text_layer
  329. has_text_layer, _ = await asyncio.to_thread(check_pdf_has_text_layer, file_path)
  330. if has_text_layer:
  331. # 有文本层,直接执行表格提取,跳过外部 OCR API
  332. logger.info(f"[任务 {task_id}] PDF 有文本层,直接执行表格提取(跳过外部 OCR API)")
  333. # 延迟导入,避免启动时因 pandas/numpy 版本冲突导致服务无法启动
  334. from ..utils.table_extractor import extract_and_filter_tables_for_pdf
  335. # 在线程池中执行表格提取(因为它是同步函数,使用 to_thread 避免阻塞事件循环)
  336. def run_table_extraction_sync():
  337. try:
  338. logger.info(f"[任务 {task_id}] 开始执行表格提取函数...")
  339. logger.info(f"[任务 {task_id}] 参数: pdf_path={file_path}, output_dir={output_dir}, doc_type={request.doc_type}")
  340. result = extract_and_filter_tables_for_pdf(
  341. pdf_path=file_path,
  342. base_output_dir=output_dir,
  343. doc_type=request.doc_type, # type: ignore[arg-type]
  344. )
  345. logger.info(f"[任务 {task_id}] 表格提取函数执行完成,返回结果: {result is not None}")
  346. return result
  347. except Exception as e:
  348. logger.exception(f"[任务 {task_id}] 表格提取/筛选失败: {e}")
  349. return None
  350. # 执行表格提取
  351. tables_info = await asyncio.to_thread(run_table_extraction_sync)
  352. # 构造一个简单的 result,包含必要的字段
  353. if tables_info:
  354. # 将表格信息挂到任务状态,方便后续调试或扩展
  355. task_status[task_id]["tables"] = tables_info
  356. logger.info(
  357. f"[任务 {task_id}] 表格提取完成,筛选目录: {tables_info.get('filtered_dir')}"
  358. )
  359. # 构造 result,包含解析后的 JSON 数据
  360. result = {
  361. "markdown_file": None, # 这两个类型不需要 markdown
  362. "json_file": None, # JSON 数据直接放在 json_data 中
  363. "json_data": {
  364. "document_type": request.doc_type,
  365. "data": tables_info.get("parsed_data", {}),
  366. }
  367. }
  368. else:
  369. # 表格提取失败,返回错误
  370. logger.error(f"[任务 {task_id}] 表格提取失败,返回空结果")
  371. result = {
  372. "markdown_file": None,
  373. "json_file": None,
  374. "json_data": {
  375. "document_type": request.doc_type,
  376. "data": {},
  377. "error": "表格提取失败"
  378. }
  379. }
  380. else:
  381. # 没有文本层(扫描件),需要调用外部 OCR API
  382. logger.warning(f"[任务 {task_id}] PDF 无文本层(可能是扫描件),调用外部 OCR API")
  383. # MinerU 服务管理暂时禁用,保持原有逻辑
  384. # mineru_mgr = get_mineru_manager()
  385. # await mineru_mgr.start_service()
  386. # mineru_mgr.task_started()
  387. result = await convert_to_markdown(
  388. input_file=file_path,
  389. output_dir=output_dir,
  390. is_ocr=True, # 启用 OCR
  391. formula_enable=True,
  392. table_enable=True,
  393. language=DEFAULT_LANGUAGE,
  394. backend=DEFAULT_BACKEND,
  395. url=None,
  396. embed_images=False,
  397. output_json=True,
  398. start_page_id=DEFAULT_START_PAGE_ID,
  399. end_page_id=DEFAULT_END_PAGE_ID,
  400. parse_method=DEFAULT_PARSE_METHOD,
  401. server_url=DEFAULT_SERVER_URL,
  402. response_format_zip=DEFAULT_RESPONSE_FORMAT_ZIP,
  403. return_middle_json=DEFAULT_RETURN_MIDDLE_JSON,
  404. return_model_output=DEFAULT_RETURN_MODEL_OUTPUT,
  405. return_md=DEFAULT_RETURN_MD,
  406. return_images=DEFAULT_RETURN_IMAGES,
  407. return_content_list=DEFAULT_RETURN_CONTENT_LIST,
  408. forced_document_type=request.doc_type
  409. )
  410. else:
  411. # 其他类型(包括投资类型 fsApproval, fsReview, pdApproval 以及 noiseRec, emRec, opStatus)
  412. # 执行转换(v2 使用外部API)
  413. # v2 特有的参数通过配置或环境变量获取
  414. # MinerU 服务管理暂时禁用,保持原有逻辑
  415. # mineru_mgr = get_mineru_manager()
  416. # await mineru_mgr.start_service()
  417. # mineru_mgr.task_started()
  418. result = await convert_to_markdown(
  419. input_file=file_path,
  420. output_dir=output_dir,
  421. # v2: 去除max_pages、公式/表格等前端可调参数
  422. is_ocr=False,
  423. formula_enable=True,
  424. table_enable=True,
  425. language=DEFAULT_LANGUAGE,
  426. backend=DEFAULT_BACKEND,
  427. url=None,
  428. # v2: 固定为 False
  429. embed_images=False,
  430. output_json=True,
  431. start_page_id=DEFAULT_START_PAGE_ID,
  432. end_page_id=DEFAULT_END_PAGE_ID,
  433. parse_method=DEFAULT_PARSE_METHOD,
  434. server_url=DEFAULT_SERVER_URL,
  435. response_format_zip=DEFAULT_RESPONSE_FORMAT_ZIP,
  436. return_middle_json=DEFAULT_RETURN_MIDDLE_JSON,
  437. return_model_output=DEFAULT_RETURN_MODEL_OUTPUT,
  438. return_md=DEFAULT_RETURN_MD,
  439. return_images=DEFAULT_RETURN_IMAGES,
  440. return_content_list=DEFAULT_RETURN_CONTENT_LIST,
  441. forced_document_type=request.doc_type
  442. )
  443. # 停止监控并获取统计结果(基于采集的数据计算)
  444. monitor.stop()
  445. stats = monitor.get_statistics()
  446. if stats:
  447. task_status[task_id]["gpu_info"] = stats
  448. if result:
  449. task_status[task_id]["status"] = "completed"
  450. task_status[task_id]["message"] = "转换成功"
  451. task_status[task_id]["markdown_file"] = result.get("markdown_file")
  452. task_status[task_id]["json_file"] = result.get("json_file")
  453. # 保存JSON数据内容,以便直接返回
  454. if result.get("json_data"):
  455. json_data = result["json_data"].copy()
  456. task_status[task_id]["json_data"] = json_data
  457. task_status[task_id]["document_type"] = json_data.get("document_type")
  458. logger.info(f"[任务 {task_id}] 处理成功")
  459. else:
  460. task_status[task_id]["status"] = "failed"
  461. task_status[task_id]["message"] = "转换失败"
  462. task_status[task_id]["error"] = "转换返回None"
  463. logger.error(f"[任务 {task_id}] 转换失败")
  464. except Exception as e:
  465. # 停止监控并获取统计结果(即使异常也记录)
  466. monitor.stop()
  467. stats = monitor.get_statistics()
  468. if stats:
  469. task_status[task_id]["gpu_info"] = stats
  470. task_status[task_id]["status"] = "failed"
  471. task_status[task_id]["message"] = f"处理出错: {str(e)}"
  472. task_status[task_id]["error"] = str(e)
  473. logger.exception(f"[任务 {task_id}] 处理失败: {e}")
  474. # 注意:不再在转换完成后立即删除上传的文件
  475. # 文件将保留在临时目录中,直到用户调用 DELETE /task/{task_id} 接口时才清理
  476. # 这样可以方便用户查看上传的文件内容
  477. async def process_pdf_to_markdown_task(
  478. task_id: str,
  479. file_path: str,
  480. output_dir: str,
  481. backend: str,
  482. remove_watermark: bool,
  483. watermark_light_threshold: int,
  484. watermark_saturation_threshold: int,
  485. crop_header_footer: bool,
  486. header_ratio: float,
  487. footer_ratio: float,
  488. return_images: bool = False,
  489. ):
  490. """后台执行 PDF/图片转 Markdown(仅转 MD,无 doc_type 等)。"""
  491. try:
  492. logger.info(f"[任务 {task_id}] PDF转Markdown 后台任务开始...")
  493. task_status[task_id]["status"] = "processing"
  494. task_status[task_id]["message"] = "正在转换 PDF/图片为 Markdown..."
  495. ext = (Path(file_path).suffix or "").lower()
  496. is_pdf = ext == ".pdf"
  497. current_path = file_path
  498. if is_pdf and remove_watermark:
  499. next_path = os.path.join(os.path.dirname(output_dir), "no_watermark.pdf")
  500. ok = await asyncio.to_thread(
  501. remove_watermark_from_pdf,
  502. current_path,
  503. next_path,
  504. light_threshold=watermark_light_threshold,
  505. saturation_threshold=watermark_saturation_threshold,
  506. )
  507. if ok:
  508. current_path = next_path
  509. else:
  510. logger.warning(f"[任务 {task_id}] 去水印失败,使用原文件继续")
  511. if is_pdf and crop_header_footer:
  512. next_path = os.path.join(os.path.dirname(output_dir), "cropped.pdf")
  513. ok = await asyncio.to_thread(
  514. crop_header_footer_from_pdf,
  515. current_path,
  516. next_path,
  517. header_ratio=header_ratio,
  518. footer_ratio=footer_ratio,
  519. )
  520. if ok:
  521. current_path = next_path
  522. else:
  523. logger.warning(f"[任务 {task_id}] 页眉页脚裁剪失败,使用原文件继续")
  524. result = await convert_pdf_to_markdown_only(
  525. input_file=current_path,
  526. output_dir=output_dir,
  527. backend=backend,
  528. url=None,
  529. return_images=return_images,
  530. )
  531. if not result:
  532. task_status[task_id]["status"] = "failed"
  533. task_status[task_id]["message"] = "转换失败"
  534. task_status[task_id]["error"] = "PDF 转 Markdown 返回空"
  535. logger.error(f"[任务 {task_id}] PDF 转 Markdown 返回空")
  536. return
  537. md_content = result.get("markdown", "")
  538. filename = result.get("filename", "output.md")
  539. if not filename.endswith(".md"):
  540. filename = filename + ".md"
  541. markdown_file_path = os.path.join(output_dir, filename)
  542. with open(markdown_file_path, "w", encoding="utf-8") as f:
  543. f.write(md_content)
  544. task_status[task_id]["status"] = "completed"
  545. task_status[task_id]["message"] = "转换成功"
  546. task_status[task_id]["markdown_file"] = markdown_file_path
  547. task_status[task_id]["json_data"] = {"markdown": md_content, "filename": filename}
  548. task_status[task_id]["document_type"] = None
  549. if return_images:
  550. zip_basename = f"{datetime.now().strftime('%Y%m%d_%H%M%S')}_{task_id[:8]}.zip"
  551. zip_path = os.path.join(output_dir, zip_basename)
  552. try:
  553. with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zf:
  554. for root, _, files in os.walk(output_dir):
  555. for f in files:
  556. if f == zip_basename:
  557. continue
  558. abs_path = os.path.join(root, f)
  559. arcname = os.path.relpath(abs_path, output_dir)
  560. zf.write(abs_path, arcname)
  561. task_status[task_id]["zip_file"] = zip_path
  562. logger.info(f"[任务 {task_id}] 已打包 md+图片: {zip_path}")
  563. except Exception as e:
  564. logger.warning(f"[任务 {task_id}] 打包 zip 失败: {e}")
  565. logger.info(f"[任务 {task_id}] PDF 转 Markdown 完成: {markdown_file_path}")
  566. except Exception as e:
  567. task_status[task_id]["status"] = "failed"
  568. task_status[task_id]["message"] = f"处理出错: {str(e)}"
  569. task_status[task_id]["error"] = str(e)
  570. logger.exception(f"[任务 {task_id}] PDF 转 Markdown 失败: {e}")
  571. @app.post("/convert", response_model=ConversionResponse)
  572. async def convert_file(
  573. file: Annotated[UploadFile, File(description="上传的PDF或图片文件")],
  574. # 新增:类型参数(英文传参)含 safetyFsApproval 安评可研批复
  575. type: Annotated[
  576. Optional[Literal["noiseRec", "emRec", "opStatus", "settlementReport", "designReview", "fsApproval", "fsReview", "pdApproval", "safetyFsApproval", "finalAccount"]],
  577. Form(description="文档类型:noiseRec | emRec | opStatus | settlementReport | designReview | fsApproval | fsReview | pdApproval | safetyFsApproval | finalAccount")
  578. ] = None,
  579. ):
  580. """
  581. 转换PDF/图片文件(异步处理)
  582. 工作流程:
  583. 1. 接收文件并生成task_id
  584. 2. 立即返回task_id(不等待任何处理)
  585. 3. 后台异步执行转换任务(调用外部API)
  586. 4. 客户端使用task_id轮询状态或直接获取结果
  587. - **file**: 上传的文件(PDF或图片)
  588. - **type**: 文档类型
  589. * noiseRec - 噪声检测
  590. * emRec - 电磁检测
  591. * opStatus - 工况信息
  592. * settlementReport - 结算报告
  593. * designReview - 设计评审
  594. * fsApproval - 可研批复投资估算
  595. * fsReview - 可研评审投资估算
  596. * pdApproval - 初设批复概算投资
  597. * safetyFsApproval - 安评可研批复
  598. 注意:v2 版本内部使用外部API进行转换,v2特有的配置参数(如API URL、backend等)
  599. 通过环境变量或配置文件设置,不通过API参数传入。
  600. """
  601. # 生成任务ID
  602. task_id = str(uuid.uuid4())
  603. # 创建临时目录和输出目录
  604. temp_dir = tempfile.mkdtemp(prefix=f"pdf_converter_v2_{task_id}_")
  605. output_dir = os.path.join(temp_dir, "output")
  606. os.makedirs(output_dir, exist_ok=True)
  607. # 保存上传的文件
  608. # 不使用原始文件名,直接使用简单的固定命名,避免文件名过长问题
  609. # 先尝试从Content-Type推断扩展名
  610. content_type = file.content_type or ""
  611. extension_map = {
  612. "application/pdf": ".pdf",
  613. "image/png": ".png",
  614. "image/jpeg": ".jpg",
  615. "image/jpg": ".jpg",
  616. }
  617. ext = extension_map.get(content_type, "")
  618. # 如果没有从Content-Type获取到,尝试从原始文件名获取扩展名
  619. if not ext and file.filename:
  620. ext = Path(file.filename).suffix
  621. # 如果还是没有,使用默认扩展名
  622. if not ext:
  623. ext = ".pdf" # 默认假设是PDF
  624. # 使用简单的固定文件名
  625. file_path = os.path.join(temp_dir, f"file{ext}")
  626. try:
  627. with open(file_path, "wb") as f:
  628. content = await file.read()
  629. f.write(content)
  630. logger.info(f"[任务 {task_id}] 文件已保存: {file_path} ({len(content)} bytes)")
  631. # 如果保存后文件名仍然没有扩展名,尝试通过文件内容检测并重命名
  632. if not Path(file_path).suffix:
  633. from ..utils.paddleocr_fallback import detect_file_type
  634. detected_type = detect_file_type(file_path)
  635. if detected_type:
  636. ext_map = {
  637. "pdf": ".pdf",
  638. "png": ".png",
  639. "jpeg": ".jpg",
  640. }
  641. ext = ext_map.get(detected_type)
  642. if ext:
  643. new_file_path = os.path.join(temp_dir, f"file{ext}")
  644. os.rename(file_path, new_file_path)
  645. file_path = new_file_path
  646. logger.info(f"[任务 {task_id}] 通过文件内容检测到类型 {detected_type},重命名为: {file_path}")
  647. except Exception as e:
  648. raise HTTPException(status_code=500, detail=f"保存文件失败: {str(e)}")
  649. # 计算页数并限制:>300页直接报错;图片按1页处理
  650. try:
  651. suffix = (Path(file_path).suffix or "").lower()
  652. pages = 1
  653. if suffix == ".pdf":
  654. # 粗略统计:基于PDF标记
  655. with open(file_path, "rb") as pf:
  656. pdf_bytes = pf.read()
  657. try:
  658. pages = pdf_bytes.count(b"/Type /Page")
  659. if pages <= 0:
  660. pages = 1
  661. except Exception:
  662. pages = 1
  663. else:
  664. # 常见图片格式视为单页
  665. pages = 1
  666. if pages > 300:
  667. # 清理临时目录后报错
  668. try:
  669. shutil.rmtree(temp_dir)
  670. except Exception:
  671. pass
  672. raise HTTPException(status_code=400, detail="文件页数超过300页,拒绝处理")
  673. logger.info(f"[任务 {task_id}] 页数评估: {pages}")
  674. except HTTPException:
  675. raise
  676. except Exception as e:
  677. logger.warning(f"[任务 {task_id}] 页数评估失败,按1页处理: {e}")
  678. # 初始化任务状态
  679. task_status[task_id] = {
  680. "status": "pending",
  681. "message": "任务已创建",
  682. "progress": 0.0,
  683. "markdown_file": None,
  684. "json_file": None,
  685. "json_data": None, # 存储JSON数据内容
  686. "document_type": None,
  687. "error": None,
  688. "temp_dir": temp_dir,
  689. "output_dir": output_dir,
  690. "file_path": file_path # 保存上传文件的路径,方便查看
  691. }
  692. # 处理类型参数映射
  693. type_map = {
  694. "noiseRec": "noiseMonitoringRecord",
  695. "emRec": "electromagneticTestRecord",
  696. "opStatus": "operatingConditionInfo",
  697. # 结算报告类
  698. "settlementReport": "settlementReport",
  699. # 初设评审类
  700. "designReview": "designReview",
  701. # 投资估算类(新增)
  702. "fsApproval": "fsApproval",
  703. "fsReview": "fsReview",
  704. "pdApproval": "pdApproval",
  705. "safetyFsApproval": "safetyFsApproval",
  706. # 决算报告
  707. "finalAccount": "finalAccount",
  708. }
  709. doc_type = None
  710. if type:
  711. if type not in type_map:
  712. # 清理临时目录后报错
  713. try:
  714. shutil.rmtree(temp_dir)
  715. except Exception:
  716. pass
  717. raise HTTPException(status_code=400, detail="无效的type参数")
  718. doc_type = type_map[type]
  719. # 创建请求对象(v2 精简)
  720. request = ConversionRequest(
  721. doc_type=doc_type,
  722. )
  723. # 使用 asyncio.create_task 创建后台任务,确保立即返回
  724. task = asyncio.create_task(
  725. process_conversion_task(
  726. task_id,
  727. file_path,
  728. output_dir,
  729. request
  730. )
  731. )
  732. # 立即返回task_id,不等待任何处理
  733. logger.info(f"[任务 {task_id}] 任务已创建并添加到后台,立即返回task_id")
  734. return ConversionResponse(
  735. task_id=task_id,
  736. status="pending",
  737. message="任务已创建,正在后台处理中,请使用task_id查询状态",
  738. markdown_file=None,
  739. json_file=None,
  740. document_type=None
  741. )
# Markdown text used as the OpenAPI `description` of the POST /pdf_to_markdown
# route. It documents the asynchronous call flow (submit -> poll -> download),
# the accepted form parameters, and the page-count limits of that endpoint.
# The text is user-facing and rendered as-is, so it is kept verbatim.
PDF_TO_MARKDOWN_DESCRIPTION = """
将 **PDF 或图片** 转为纯 Markdown 文本的异步接口。提交后立即返回 `task_id`,不等待转换完成。
## 调用流程
1. **POST 本接口**:上传文件(`multipart/form-data`),请求体需包含 `file` 及可选表单项;响应返回 `task_id`、`status: "pending"`。
2. **轮询状态**:**GET /task/{task_id}**,直到 `status` 为 `completed` 或 `failed`。
3. **获取结果**(仅当 `status == "completed"` 时):
- **GET /download/{task_id}/markdown**:下载生成的 `.md` 文件;
- **GET /task/{task_id}/json**:获取 JSON `{ "markdown": "全文", "filename": "xxx.md" }`;
- 若提交时传了 **return_images=true**:**GET /download/{task_id}/zip** 下载 Markdown + 图片压缩包。
## 参数说明
| 参数 | 类型 | 必填 | 说明 |
|------|------|------|------|
| file | file | 是 | PDF 或图片文件(如 PNG、JPG)。 |
| backend | string | 否 | 识别引擎:`mineru`(默认,MinerU file_parse)或 `paddle`(PaddleOCR doc_parser)。 |
| remove_watermark | boolean | 否 | 是否先对 PDF 去水印,默认 `false`,仅对 PDF 生效。 |
| watermark_light_threshold | integer | 否 | 去水印亮度阈值 0–255,默认 200。 |
| watermark_saturation_threshold | integer | 否 | 去水印饱和度阈值 0–255,默认 30。 |
| crop_header_footer | boolean | 否 | 是否裁剪页眉页脚,默认 `false`,仅对 PDF 生效。 |
| header_ratio | number | 否 | 页眉裁剪比例 0–1,如 0.05 表示裁掉顶部 5%,默认 0.05。 |
| footer_ratio | number | 否 | 页脚裁剪比例 0–1,默认 0.05。 |
| return_images | boolean | 否 | 是否同时拉取并保存图片;为 `true` 时完成后可下载 zip(md+图片),默认 `false`。 |
## 限制与说明
- **页数**:单文件不超过 300 页,超过将返回 400。
- **大 PDF**:超过 50 页会按 50 页一段切割后分别转换再合并 MD,以降低 MinerU 端内存占用。
- 去水印、裁剪页眉页脚仅对 **PDF** 生效,图片类型会忽略这些参数。
"""
  768. @app.post(
  769. "/pdf_to_markdown",
  770. tags=["PDF转Markdown"],
  771. summary="PDF/图片转 Markdown(异步)",
  772. description=PDF_TO_MARKDOWN_DESCRIPTION,
  773. response_model=ConversionResponse,
  774. responses={
  775. 200: {
  776. "description": "成功创建任务,返回 task_id。需用 GET /task/{task_id} 轮询,完成后通过 GET /download/{task_id}/markdown 或 GET /task/{task_id}/json 获取结果;若传了 return_images=true 还可通过 GET /download/{task_id}/zip 下载 md+图片包。",
  777. "content": {
  778. "application/json": {
  779. "example": {
  780. "task_id": "550e8400-e29b-41d4-a716-446655440000",
  781. "status": "pending",
  782. "message": "任务已创建,请使用 GET /task/{task_id} 查询状态,完成后通过 GET /download/{task_id}/markdown 或 GET /task/{task_id}/json 获取结果",
  783. "markdown_file": None,
  784. "json_file": None,
  785. "document_type": None,
  786. }
  787. }
  788. },
  789. },
  790. 400: {
  791. "description": "请求非法:例如文件页数超过 300 页。",
  792. "content": {
  793. "application/json": {"example": {"detail": "文件页数超过 300 页,拒绝处理"}}
  794. },
  795. },
  796. 500: {
  797. "description": "服务端错误:如保存上传文件失败、转换过程异常等。",
  798. "content": {
  799. "application/json": {"example": {"detail": "保存文件失败: ..."}}
  800. },
  801. },
  802. },
  803. )
  804. async def pdf_to_markdown(
  805. file: Annotated[UploadFile, File(description="上传的 PDF 或图片文件(必填)")],
  806. backend: Annotated[
  807. Optional[Literal["mineru", "paddle"]],
  808. Form(description="识别后端:mineru = MinerU file_parse(默认);paddle = PaddleOCR doc_parser"),
  809. ] = "mineru",
  810. remove_watermark: Annotated[
  811. bool,
  812. Form(description="是否先对 PDF 去水印,仅对 PDF 生效,默认 false"),
  813. ] = False,
  814. watermark_light_threshold: Annotated[
  815. int,
  816. Form(description="去水印亮度阈值 0–255,高于此值的浅色像素视为水印,默认 200"),
  817. ] = 200,
  818. watermark_saturation_threshold: Annotated[
  819. int,
  820. Form(description="去水印饱和度阈值 0–255,低于此值的低饱和度像素视为水印,默认 30"),
  821. ] = 30,
  822. crop_header_footer: Annotated[
  823. bool,
  824. Form(description="是否裁剪 PDF 页眉页脚,仅对 PDF 生效,默认 false"),
  825. ] = False,
  826. header_ratio: Annotated[
  827. float,
  828. Form(description="页眉裁剪比例 0–1,如 0.05 表示裁掉顶部 5%,默认 0.05"),
  829. ] = 0.05,
  830. footer_ratio: Annotated[
  831. float,
  832. Form(description="页脚裁剪比例 0–1,如 0.05 表示裁掉底部 5%,默认 0.05"),
  833. ] = 0.05,
  834. return_images: Annotated[
  835. bool,
  836. Form(description="是否同时拉取并保存图片;为 true 时完成后可通过 GET /download/{task_id}/zip 下载 md+图片 压缩包,默认 false"),
  837. ] = False,
  838. ):
  839. """PDF/图片转 Markdown(异步):提交后立即返回 task_id,轮询 GET /task/{task_id} 后通过 GET /download/{task_id}/markdown 或 /zip 获取结果。"""
  840. task_id = str(uuid.uuid4())
  841. content_type = file.content_type or ""
  842. ext_map = {"application/pdf": ".pdf", "image/png": ".png", "image/jpeg": ".jpg", "image/jpg": ".jpg"}
  843. ext = ext_map.get(content_type, "") or (Path(file.filename or "").suffix if file.filename else "") or ".pdf"
  844. temp_dir = tempfile.mkdtemp(prefix=f"pdf_converter_v2_{task_id}_")
  845. file_path = os.path.join(temp_dir, f"file{ext}")
  846. try:
  847. content = await file.read()
  848. with open(file_path, "wb") as f:
  849. f.write(content)
  850. except Exception as e:
  851. try:
  852. shutil.rmtree(temp_dir)
  853. except Exception:
  854. pass
  855. raise HTTPException(status_code=500, detail=f"保存文件失败: {str(e)}")
  856. pages = 1
  857. if ext.lower() == ".pdf" and content:
  858. pages = max(1, content.count(b"/Type /Page"))
  859. if pages > 300:
  860. try:
  861. shutil.rmtree(temp_dir)
  862. except Exception:
  863. pass
  864. raise HTTPException(status_code=400, detail="文件页数超过 300 页,拒绝处理")
  865. output_dir = os.path.join(temp_dir, "output")
  866. os.makedirs(output_dir, exist_ok=True)
  867. task_status[task_id] = {
  868. "status": "pending",
  869. "message": "任务已创建",
  870. "progress": 0.0,
  871. "markdown_file": None,
  872. "json_file": None,
  873. "json_data": None,
  874. "document_type": None,
  875. "error": None,
  876. "temp_dir": temp_dir,
  877. "output_dir": output_dir,
  878. "file_path": file_path,
  879. "zip_file": None,
  880. }
  881. asyncio.create_task(
  882. process_pdf_to_markdown_task(
  883. task_id=task_id,
  884. file_path=file_path,
  885. output_dir=output_dir,
  886. backend=backend or "mineru",
  887. remove_watermark=remove_watermark,
  888. watermark_light_threshold=watermark_light_threshold,
  889. watermark_saturation_threshold=watermark_saturation_threshold,
  890. crop_header_footer=crop_header_footer,
  891. header_ratio=header_ratio,
  892. footer_ratio=footer_ratio,
  893. return_images=return_images,
  894. )
  895. )
  896. logger.info(f"[任务 {task_id}] PDF 转 Markdown 任务已创建,立即返回 task_id")
  897. return ConversionResponse(
  898. task_id=task_id,
  899. status="pending",
  900. message="任务已创建,请使用 GET /task/{task_id} 查询状态,完成后通过 GET /download/{task_id}/markdown 或 GET /task/{task_id}/json 获取结果",
  901. markdown_file=None,
  902. json_file=None,
  903. document_type=None,
  904. )
  905. @app.get("/task/{task_id}", response_model=TaskStatus)
  906. async def get_task_status(task_id: str):
  907. """
  908. 查询任务状态(轮询接口)
  909. 客户端应定期调用此接口查询任务状态,直到状态变为 "completed" 或 "failed"
  910. - **task_id**: 任务ID(从 /convert 接口返回)
  911. 状态说明:
  912. - **pending**: 等待处理
  913. - **processing**: 正在处理中
  914. - **completed**: 处理完成,可以使用 /task/{task_id}/json 获取JSON数据
  915. - **failed**: 处理失败,查看 error 字段获取错误信息
  916. """
  917. if task_id not in task_status:
  918. raise HTTPException(status_code=404, detail="任务不存在")
  919. status_info = task_status[task_id]
  920. # 处理GPU信息
  921. gpu_info_model = None
  922. if "gpu_info" in status_info and status_info["gpu_info"]:
  923. gpu_info_model = GpuInfo(**status_info["gpu_info"])
  924. return TaskStatus(
  925. task_id=task_id,
  926. status=status_info["status"],
  927. message=status_info["message"],
  928. progress=status_info.get("progress"),
  929. markdown_file=status_info.get("markdown_file"),
  930. json_file=status_info.get("json_file"),
  931. document_type=status_info.get("document_type"),
  932. error=status_info.get("error"),
  933. gpu_info=gpu_info_model
  934. )
  935. @app.get("/download/{task_id}/markdown")
  936. async def download_markdown(task_id: str):
  937. """
  938. 下载Markdown文件
  939. - **task_id**: 任务ID
  940. """
  941. if task_id not in task_status:
  942. raise HTTPException(status_code=404, detail="任务不存在")
  943. status_info = task_status[task_id]
  944. if status_info["status"] != "completed":
  945. raise HTTPException(status_code=400, detail="任务尚未完成")
  946. markdown_file = status_info.get("markdown_file")
  947. if not markdown_file or not os.path.exists(markdown_file):
  948. raise HTTPException(status_code=404, detail="Markdown文件不存在")
  949. return FileResponse(
  950. markdown_file,
  951. media_type="text/markdown",
  952. filename=os.path.basename(markdown_file)
  953. )
  954. @app.get("/download/{task_id}/zip")
  955. async def download_zip(task_id: str):
  956. """
  957. 下载 Markdown + 图片 压缩包(仅当提交任务时传了 return_images=true 时存在)
  958. - **task_id**: 任务ID
  959. """
  960. if task_id not in task_status:
  961. raise HTTPException(status_code=404, detail="任务不存在")
  962. status_info = task_status[task_id]
  963. if status_info["status"] != "completed":
  964. raise HTTPException(status_code=400, detail="任务尚未完成")
  965. zip_file = status_info.get("zip_file")
  966. if not zip_file or not os.path.exists(zip_file):
  967. raise HTTPException(
  968. status_code=404,
  969. detail="未生成 zip(请在 POST /pdf_to_markdown 时传 return_images=true)",
  970. )
  971. return FileResponse(
  972. zip_file,
  973. media_type="application/zip",
  974. filename=os.path.basename(zip_file),
  975. )
  976. @app.get("/task/{task_id}/json")
  977. async def get_json(task_id: str):
  978. """
  979. 直接获取JSON数据(返回JSON内容,不下载文件)
  980. - **task_id**: 任务ID
  981. 返回:JSON格式的数据对象,包含解析后的文档内容
  982. """
  983. if task_id not in task_status:
  984. raise HTTPException(status_code=404, detail="任务不存在")
  985. status_info = task_status[task_id]
  986. if status_info["status"] == "pending" or status_info["status"] == "processing":
  987. raise HTTPException(status_code=400, detail="任务尚未完成,请稍后再试")
  988. if status_info["status"] == "failed":
  989. raise HTTPException(status_code=400, detail=f"任务失败: {status_info.get('error', '未知错误')}")
  990. json_data = status_info.get("json_data")
  991. if not json_data:
  992. # 如果没有保存JSON数据,尝试从文件读取
  993. json_file = status_info.get("json_file")
  994. if json_file and os.path.exists(json_file):
  995. try:
  996. with open(json_file, 'r', encoding='utf-8') as f:
  997. json_data = json.load(f)
  998. except Exception as e:
  999. logger.error(f"[任务 {task_id}] 读取JSON文件失败: {e}")
  1000. raise HTTPException(status_code=500, detail="读取JSON文件失败")
  1001. else:
  1002. raise HTTPException(status_code=404, detail="JSON数据不存在(任务可能没有生成JSON数据)")
  1003. return JSONResponse(content=json_data)
  1004. @app.get("/download/{task_id}/json")
  1005. async def download_json(task_id: str):
  1006. """
  1007. 下载JSON文件(返回文件下载)
  1008. - **task_id**: 任务ID
  1009. """
  1010. if task_id not in task_status:
  1011. raise HTTPException(status_code=404, detail="任务不存在")
  1012. status_info = task_status[task_id]
  1013. if status_info["status"] != "completed":
  1014. raise HTTPException(status_code=400, detail="任务尚未完成")
  1015. json_file = status_info.get("json_file")
  1016. if not json_file or not os.path.exists(json_file):
  1017. raise HTTPException(status_code=404, detail="JSON文件不存在")
  1018. return FileResponse(
  1019. json_file,
  1020. media_type="application/json",
  1021. filename=os.path.basename(json_file)
  1022. )
  1023. @app.delete("/task/{task_id}")
  1024. async def delete_task(task_id: str):
  1025. """
  1026. 删除任务及其临时文件
  1027. - **task_id**: 任务ID
  1028. """
  1029. if task_id not in task_status:
  1030. raise HTTPException(status_code=404, detail="任务不存在")
  1031. status_info = task_status[task_id]
  1032. temp_dir = status_info.get("temp_dir")
  1033. # 清理临时目录
  1034. if temp_dir and os.path.exists(temp_dir):
  1035. try:
  1036. shutil.rmtree(temp_dir)
  1037. logger.info(f"[任务 {task_id}] 临时目录已清理: {temp_dir}")
  1038. except Exception as e:
  1039. logger.warning(f"[任务 {task_id}] 清理临时目录失败: {e}")
  1040. # 删除任务状态
  1041. del task_status[task_id]
  1042. return {"message": "任务已删除"}
  1043. @app.post("/ocr", response_model=OCRResponse)
  1044. async def ocr_image(request: OCRRequest):
  1045. """
  1046. 对base64编码的图片进行OCR识别
  1047. - **image_base64**: base64编码的图片数据(可以包含data:image/xxx;base64,前缀)
  1048. - **image_format**: 图片格式(png, jpg, jpeg),默认为png
  1049. - **remove_watermark**: 是否去除水印,默认为false
  1050. - **watermark_light_threshold**: 水印亮度阈值(0-255),默认200,高于此值的浅色像素可能是水印
  1051. - **watermark_saturation_threshold**: 水印饱和度阈值(0-255),默认30,低于此值的低饱和度像素可能是水印
  1052. - **crop_header_footer**: 是否裁剪页眉页脚,默认为false
  1053. - **header_ratio**: 页眉裁剪比例(0-1),默认0.05表示裁剪顶部5%
  1054. - **footer_ratio**: 页脚裁剪比例(0-1),默认0.05表示裁剪底部5%
  1055. 返回识别出的文本列表和GPU监控信息
  1056. """
  1057. temp_dir = None
  1058. image_path = None
  1059. # 资源监控:启动后台采集线程(每0.5秒采集一次)
  1060. from ..utils.resource_monitor import ResourceMonitor
  1061. monitor = ResourceMonitor(interval=0.5)
  1062. monitor.start()
  1063. try:
  1064. # 创建临时目录
  1065. temp_dir = tempfile.mkdtemp(prefix=f"pdf_converter_ocr_{uuid.uuid4()}_")
  1066. logger.info(f"[OCR] 创建临时目录: {temp_dir}")
  1067. # 解码base64数据
  1068. image_base64 = request.image_base64.strip()
  1069. # 移除可能的数据URI前缀(如 data:image/png;base64,)
  1070. if "," in image_base64:
  1071. image_base64 = image_base64.split(",")[-1]
  1072. try:
  1073. image_bytes = base64.b64decode(image_base64)
  1074. logger.info(f"[OCR] Base64解码成功,图片大小: {len(image_bytes)} bytes")
  1075. except Exception as e:
  1076. logger.error(f"[OCR] Base64解码失败: {e}")
  1077. # 停止监控并获取统计结果
  1078. monitor.stop()
  1079. stats = monitor.get_statistics()
  1080. gpu_info_model = GpuInfo(**stats) if stats else None
  1081. return OCRResponse(
  1082. code=-1,
  1083. message="无法解码base64图片数据",
  1084. data=None,
  1085. gpu_info=gpu_info_model
  1086. )
  1087. # 确定图片格式和扩展名
  1088. image_format = request.image_format.lower() if request.image_format else "png"
  1089. if image_format not in ["png", "jpg", "jpeg"]:
  1090. image_format = "png"
  1091. ext_map = {
  1092. "png": ".png",
  1093. "jpg": ".jpg",
  1094. "jpeg": ".jpg"
  1095. }
  1096. ext = ext_map.get(image_format, ".png")
  1097. # 保存图片文件
  1098. image_filename = f"ocr_image_{uuid.uuid4().hex[:8]}{ext}"
  1099. image_path = os.path.join(temp_dir, image_filename)
  1100. with open(image_path, "wb") as f:
  1101. f.write(image_bytes)
  1102. logger.info(f"[OCR] 图片已保存: {image_path}")
  1103. # 如果需要裁剪页眉页脚,先进行裁剪
  1104. if request.crop_header_footer:
  1105. try:
  1106. from ..utils.image_preprocessor import crop_header_footer, check_opencv_available
  1107. if check_opencv_available():
  1108. logger.info(f"[OCR] 开始裁剪页眉页脚,顶部比例: {request.header_ratio}, 底部比例: {request.footer_ratio}")
  1109. # 裁剪后的图片路径
  1110. cropped_image_path = os.path.join(temp_dir, f"ocr_image_cropped{ext}")
  1111. image_path = crop_header_footer(
  1112. image_path,
  1113. output_path=cropped_image_path,
  1114. header_ratio=request.header_ratio or 0.05,
  1115. footer_ratio=request.footer_ratio or 0.05
  1116. )
  1117. logger.info(f"[OCR] 裁剪页眉页脚完成: {image_path}")
  1118. else:
  1119. logger.warning("[OCR] OpenCV 未安装,跳过裁剪页眉页脚")
  1120. except Exception as e:
  1121. logger.warning(f"[OCR] 裁剪页眉页脚失败,使用原图继续: {e}")
  1122. # 如果需要去水印,进行预处理
  1123. if request.remove_watermark:
  1124. try:
  1125. from ..utils.image_preprocessor import remove_watermark, check_opencv_available
  1126. if check_opencv_available():
  1127. logger.info(f"[OCR] 开始去水印处理,亮度阈值: {request.watermark_light_threshold}, 饱和度阈值: {request.watermark_saturation_threshold}")
  1128. # 去水印后的图片路径
  1129. nowm_image_path = os.path.join(temp_dir, f"ocr_image_nowm{ext}")
  1130. image_path = remove_watermark(
  1131. image_path,
  1132. output_path=nowm_image_path,
  1133. light_threshold=request.watermark_light_threshold or 200,
  1134. saturation_threshold=request.watermark_saturation_threshold or 30,
  1135. method="hsv"
  1136. )
  1137. logger.info(f"[OCR] 去水印完成: {image_path}")
  1138. else:
  1139. logger.warning("[OCR] OpenCV 未安装,跳过去水印处理")
  1140. except Exception as e:
  1141. logger.warning(f"[OCR] 去水印处理失败,使用原图继续: {e}")
  1142. # 调用PaddleOCR进行识别(监控线程在此期间持续采集数据)
  1143. from ..utils.paddleocr_fallback import call_paddleocr_ocr
  1144. # 创建保存OCR结果的目录
  1145. ocr_save_path = os.path.join(temp_dir, "ocr_output")
  1146. os.makedirs(ocr_save_path, exist_ok=True)
  1147. logger.info(f"[OCR] 开始调用PaddleOCR识别: {image_path}")
  1148. texts, md_file_path = call_paddleocr_ocr(image_path, ocr_save_path)
  1149. # 停止监控并获取统计结果(基于采集的数据计算)
  1150. monitor.stop()
  1151. stats = monitor.get_statistics()
  1152. gpu_info_model = GpuInfo(**stats) if stats else None
  1153. if texts is None:
  1154. logger.warning("[OCR] PaddleOCR识别失败或未返回结果")
  1155. return OCRResponse(
  1156. code=-1,
  1157. message="PaddleOCR未能识别出文本内容",
  1158. data=None,
  1159. gpu_info=gpu_info_model # 即使失败也返回GPU信息
  1160. )
  1161. # 返回所有文本(已按Y坐标排序并合并,保持正确顺序)
  1162. if not texts:
  1163. texts = []
  1164. # 直接使用texts数组,按行用\n连接生成完整文本
  1165. # texts已经是按Y坐标排序并合并的,顺序正确
  1166. full_text = "\n".join(texts) if texts else ""
  1167. # 记录文件位置
  1168. logger.info(f"[OCR] 识别成功,共识别出 {len(texts)} 个文本片段,完整文本长度: {len(full_text)} 字符")
  1169. logger.info(f"[OCR] 上传的图片已保存: {image_path}")
  1170. if md_file_path:
  1171. logger.info(f"[OCR] 生成的Markdown文件已保存: {md_file_path}")
  1172. logger.info(f"[OCR] 所有文件保存在目录: {temp_dir}")
  1173. return OCRResponse(
  1174. code=0,
  1175. message=f"成功识别出 {len(texts)} 个文本片段",
  1176. data={
  1177. "texts": texts,
  1178. "full_text": full_text
  1179. },
  1180. gpu_info=gpu_info_model # 返回GPU监控信息
  1181. )
  1182. except Exception as e:
  1183. # 停止监控并获取统计结果(即使异常也记录)
  1184. monitor.stop()
  1185. stats = monitor.get_statistics()
  1186. gpu_info_model = GpuInfo(**stats) if stats else None
  1187. logger.exception(f"[OCR] 处理失败: {e}")
  1188. return OCRResponse(
  1189. code=-1,
  1190. message=f"OCR处理过程中发生错误: {str(e)}",
  1191. data=None,
  1192. gpu_info=gpu_info_model
  1193. )
  1194. # 注意:不再删除临时文件,保留上传的图片和生成的markdown文件
  1195. # 启动时的初始化
  1196. @app.on_event("startup")
  1197. async def startup_event():
  1198. """应用启动时的初始化"""
  1199. logger.info("PDF转换工具API v2 服务启动")
  1200. logger.info("可用端点: POST /convert, GET /task/{task_id}, GET /download/{task_id}/*, POST /ocr")
  1201. # 关闭时的清理
  1202. @app.on_event("shutdown")
  1203. async def shutdown_event():
  1204. """应用关闭时的清理"""
  1205. logger.info("清理临时文件和任务状态...")
  1206. # 清理所有临时目录
  1207. for task_id, status_info in list(task_status.items()):
  1208. temp_dir = status_info.get("temp_dir")
  1209. if temp_dir and os.path.exists(temp_dir):
  1210. try:
  1211. shutil.rmtree(temp_dir)
  1212. except Exception as e:
  1213. logger.warning(f"清理任务 {task_id} 的临时目录失败: {e}")
  1214. task_status.clear()
  1215. logger.info("PDF转换工具API v2 服务关闭")