|
|
@@ -13,14 +13,15 @@ import base64
|
|
|
import json
|
|
|
from pathlib import Path
|
|
|
from typing import Optional, List
|
|
|
+from urllib.parse import quote
|
|
|
|
|
|
from fastapi import FastAPI, File, UploadFile, Form, HTTPException
|
|
|
-from fastapi.responses import FileResponse, JSONResponse
|
|
|
+from fastapi.responses import FileResponse, JSONResponse, Response
|
|
|
from fastapi.middleware.cors import CORSMiddleware
|
|
|
from pydantic import BaseModel
|
|
|
from typing_extensions import Annotated, Literal
|
|
|
|
|
|
-from ..processor.converter import convert_to_markdown
|
|
|
+from ..processor.converter import convert_to_markdown, convert_pdf_to_markdown_only
|
|
|
from ..utils.logging_config import get_logger
|
|
|
|
|
|
# 尝试导入配置,如果不存在则使用默认值
|
|
|
@@ -150,16 +151,6 @@ class ConversionRequest(BaseModel):
|
|
|
"""转换请求模型(v2 精简版)"""
|
|
|
# 新增:强制文档类型(正式全称)
|
|
|
doc_type: Optional[str] = None
|
|
|
- # 新增:去水印参数
|
|
|
- remove_watermark: Optional[bool] = False
|
|
|
- watermark_light_threshold: Optional[int] = 200
|
|
|
- watermark_saturation_threshold: Optional[int] = 30
|
|
|
- crop_header_footer: Optional[bool] = False
|
|
|
- header_ratio: Optional[float] = 0.05
|
|
|
- footer_ratio: Optional[float] = 0.05
|
|
|
- auto_detect_header_footer: Optional[bool] = False
|
|
|
- # 新增:附件页切割参数
|
|
|
- table_only: Optional[bool] = False # 是否只保留包含表格的附件页(默认False)
|
|
|
|
|
|
|
|
|
class ConversionResponse(BaseModel):
|
|
|
@@ -216,7 +207,6 @@ class OCRRequest(BaseModel):
|
|
|
crop_header_footer: Optional[bool] = False # 是否裁剪页眉页脚
|
|
|
header_ratio: Optional[float] = 0.05 # 页眉裁剪比例(0-1),默认5%
|
|
|
footer_ratio: Optional[float] = 0.05 # 页脚裁剪比例(0-1),默认5%
|
|
|
- auto_detect_header_footer: Optional[bool] = False # 是否自动检测页眉页脚边界
|
|
|
|
|
|
|
|
|
class OCRResponse(BaseModel):
|
|
|
@@ -227,6 +217,12 @@ class OCRResponse(BaseModel):
|
|
|
gpu_info: Optional[GpuInfo] = None # GPU监控信息
|
|
|
|
|
|
|
|
|
class PdfToMarkdownResponse(BaseModel):
    """Response body of the synchronous PDF-to-Markdown endpoint."""

    # Complete text of the generated Markdown document.
    markdown: str
    # Suggested file name for the generated document (for example ``xxx.md``).
    filename: str
|
|
|
+
|
|
|
+
|
|
|
@app.get("/")
|
|
|
async def root():
|
|
|
"""API根路径"""
|
|
|
@@ -243,6 +239,7 @@ async def root():
|
|
|
},
|
|
|
"endpoints": {
|
|
|
"POST /convert": "转换PDF/图片文件(异步,立即返回task_id)",
|
|
|
+ "POST /pdf_to_markdown": "PDF/图片转 Markdown(同步,默认返回 .md 文件下载,format=json 可返回 JSON)",
|
|
|
"GET /task/{task_id}": "查询任务状态(轮询接口)",
|
|
|
"GET /task/{task_id}/json": "直接获取JSON数据(返回JSON对象,不下载文件)",
|
|
|
"GET /download/{task_id}/markdown": "下载Markdown文件",
|
|
|
@@ -325,108 +322,11 @@ async def process_conversion_task(
|
|
|
|
|
|
logger.info(f"[任务 {task_id}] 开始处理: {file_path}")
|
|
|
|
|
|
- # 文件预处理(支持图片和PDF)
|
|
|
- from pathlib import Path as PathLib
|
|
|
- file_suffix = PathLib(file_path).suffix.lower()
|
|
|
- is_image = file_suffix in ['.png', '.jpg', '.jpeg', '.bmp', '.tiff', '.tif']
|
|
|
- is_pdf = file_suffix == '.pdf'
|
|
|
-
|
|
|
- # 图片预处理:去水印或裁剪页眉页脚
|
|
|
- if is_image and (request.remove_watermark or request.crop_header_footer):
|
|
|
- logger.info(f"[任务 {task_id}] 检测到图片文件,开始预处理...")
|
|
|
- preprocessed_path = file_path
|
|
|
-
|
|
|
- # 裁剪页眉页脚
|
|
|
- if request.crop_header_footer:
|
|
|
- try:
|
|
|
- from ..utils.image_preprocessor import crop_header_footer, check_opencv_available
|
|
|
-
|
|
|
- if check_opencv_available():
|
|
|
- if request.auto_detect_header_footer:
|
|
|
- logger.info(f"[任务 {task_id}] 开始自动检测并裁剪页眉页脚")
|
|
|
- else:
|
|
|
- logger.info(f"[任务 {task_id}] 开始裁剪页眉页脚,顶部比例: {request.header_ratio}, 底部比例: {request.footer_ratio}")
|
|
|
-
|
|
|
- # 裁剪后的图片路径
|
|
|
- cropped_path = str(PathLib(output_dir) / f"preprocessed_cropped{file_suffix}")
|
|
|
-
|
|
|
- preprocessed_path = await asyncio.to_thread(
|
|
|
- crop_header_footer,
|
|
|
- preprocessed_path,
|
|
|
- output_path=cropped_path,
|
|
|
- header_ratio=request.header_ratio or 0.05,
|
|
|
- footer_ratio=request.footer_ratio or 0.05,
|
|
|
- auto_detect=request.auto_detect_header_footer or False
|
|
|
- )
|
|
|
- logger.info(f"[任务 {task_id}] 裁剪页眉页脚完成: {preprocessed_path}")
|
|
|
- else:
|
|
|
- logger.warning(f"[任务 {task_id}] OpenCV 未安装,跳过裁剪页眉页脚")
|
|
|
- except Exception as e:
|
|
|
- logger.warning(f"[任务 {task_id}] 裁剪页眉页脚失败,使用原图继续: {e}")
|
|
|
-
|
|
|
- # 去水印
|
|
|
- if request.remove_watermark:
|
|
|
- try:
|
|
|
- from ..utils.image_preprocessor import remove_watermark, check_opencv_available
|
|
|
-
|
|
|
- if check_opencv_available():
|
|
|
- logger.info(f"[任务 {task_id}] 开始去水印处理,亮度阈值: {request.watermark_light_threshold}, 饱和度阈值: {request.watermark_saturation_threshold}")
|
|
|
-
|
|
|
- # 去水印后的图片路径
|
|
|
- nowm_path = str(PathLib(output_dir) / f"preprocessed_nowm{file_suffix}")
|
|
|
-
|
|
|
- preprocessed_path = await asyncio.to_thread(
|
|
|
- remove_watermark,
|
|
|
- preprocessed_path,
|
|
|
- output_path=nowm_path,
|
|
|
- light_threshold=request.watermark_light_threshold or 200,
|
|
|
- saturation_threshold=request.watermark_saturation_threshold or 30,
|
|
|
- method="hsv"
|
|
|
- )
|
|
|
- logger.info(f"[任务 {task_id}] 去水印完成: {preprocessed_path}")
|
|
|
- else:
|
|
|
- logger.warning(f"[任务 {task_id}] OpenCV 未安装,跳过去水印处理")
|
|
|
- except Exception as e:
|
|
|
- logger.warning(f"[任务 {task_id}] 去水印处理失败,使用原图继续: {e}")
|
|
|
-
|
|
|
- # 更新文件路径为预处理后的路径
|
|
|
- if preprocessed_path != file_path:
|
|
|
- file_path = preprocessed_path
|
|
|
- logger.info(f"[任务 {task_id}] 图片预处理完成,使用预处理后的文件: {file_path}")
|
|
|
-
|
|
|
- # PDF预处理:去水印
|
|
|
- elif is_pdf and request.remove_watermark:
|
|
|
- logger.info(f"[任务 {task_id}] 检测到PDF文件,开始去水印预处理...")
|
|
|
- try:
|
|
|
- from ..utils.pdf_watermark_remover import remove_watermark_from_pdf
|
|
|
-
|
|
|
- # 去水印后的PDF路径
|
|
|
- nowm_pdf_path = str(PathLib(output_dir) / f"preprocessed_nowm.pdf")
|
|
|
-
|
|
|
- # 执行去水印
|
|
|
- logger.info(f"[任务 {task_id}] 开始PDF去水印处理,亮度阈值: {request.watermark_light_threshold}, 饱和度阈值: {request.watermark_saturation_threshold}")
|
|
|
- success = await asyncio.to_thread(
|
|
|
- remove_watermark_from_pdf,
|
|
|
- input_pdf=file_path,
|
|
|
- output_pdf=nowm_pdf_path,
|
|
|
- light_threshold=request.watermark_light_threshold or 200,
|
|
|
- saturation_threshold=request.watermark_saturation_threshold or 30,
|
|
|
- dpi=200 # PDF转图片的DPI
|
|
|
- )
|
|
|
-
|
|
|
- if success and PathLib(nowm_pdf_path).exists():
|
|
|
- file_path = nowm_pdf_path
|
|
|
- logger.info(f"[任务 {task_id}] PDF去水印完成: {file_path}")
|
|
|
- else:
|
|
|
- logger.warning(f"[任务 {task_id}] PDF去水印失败,使用原PDF继续")
|
|
|
- except Exception as e:
|
|
|
- logger.warning(f"[任务 {task_id}] PDF去水印处理失败,使用原PDF继续: {e}")
|
|
|
-
|
|
|
result = None
|
|
|
tables_info = None
|
|
|
|
|
|
# 针对投资估算类型,需要先切割附件页
|
|
|
- if request.doc_type in ("fsApproval", "fsReview", "pdApproval", "safetyFsApproval"):
|
|
|
+ if request.doc_type in ("fsApproval", "fsReview", "pdApproval"):
|
|
|
logger.info(f"[任务 {task_id}] 文档类型 {request.doc_type},需要先切割附件页")
|
|
|
|
|
|
# 导入附件页切割函数
|
|
|
@@ -442,21 +342,18 @@ async def process_conversion_task(
|
|
|
attachment_dir = PathLib(output_dir) / "attachments"
|
|
|
attachment_dir.mkdir(parents=True, exist_ok=True)
|
|
|
|
|
|
- # 切割附件页(根据 table_only 参数决定是否过滤非表格内容)
|
|
|
- logger.info(f"[任务 {task_id}] 开始切割附件页(table_only={request.table_only}),输出目录: {attachment_dir}")
|
|
|
+ # 切割附件页
|
|
|
+ logger.info(f"[任务 {task_id}] 开始切割附件页,输出目录: {attachment_dir}")
|
|
|
await asyncio.to_thread(
|
|
|
split_attachment_pages,
|
|
|
file_path,
|
|
|
attachment_dir,
|
|
|
use_ocr=True,
|
|
|
- debug=False,
|
|
|
- table_only=request.table_only # 是否只保留包含表格的附件页
|
|
|
+ debug=False
|
|
|
)
|
|
|
|
|
|
- # 查找切割后的附件页PDF(优先使用表格附件页,其次使用普通附件页)
|
|
|
- attachment_pdfs = list(attachment_dir.glob("*_表格附件页_*.pdf"))
|
|
|
- if not attachment_pdfs:
|
|
|
- attachment_pdfs = list(attachment_dir.glob("*_附件页_*.pdf"))
|
|
|
+ # 查找切割后的附件页PDF
|
|
|
+ attachment_pdfs = list(attachment_dir.glob("*_附件页_*.pdf"))
|
|
|
logger.info(f"[任务 {task_id}] 附件页目录内容: {list(attachment_dir.iterdir()) if attachment_dir.exists() else '(目录不存在)'}")
|
|
|
|
|
|
if attachment_pdfs:
|
|
|
@@ -649,44 +546,11 @@ async def process_conversion_task(
|
|
|
@app.post("/convert", response_model=ConversionResponse)
|
|
|
async def convert_file(
|
|
|
file: Annotated[UploadFile, File(description="上传的PDF或图片文件")],
|
|
|
- # 新增:类型参数(英文传参) noiseRec | emRec | opStatus | settlementReport | designReview | fsApproval | fsReview | pdApproval | finalAccount | safetyFsApproval
|
|
|
+ # 新增:类型参数(英文传参) noiseRec | emRec | opStatus | settlementReport | designReview | fsApproval | fsReview | pdApproval | finalAccount
|
|
|
type: Annotated[
|
|
|
- Optional[Literal["noiseRec", "emRec", "opStatus", "settlementReport", "designReview", "fsApproval", "fsReview", "pdApproval", "finalAccount", "safetyFsApproval"]],
|
|
|
- Form(description="文档类型:noiseRec | emRec | opStatus | settlementReport | designReview | fsApproval | fsReview | pdApproval | finalAccount | safetyFsApproval")
|
|
|
+ Optional[Literal["noiseRec", "emRec", "opStatus", "settlementReport", "designReview", "fsApproval", "fsReview", "pdApproval", "finalAccount"]],
|
|
|
+ Form(description="文档类型:noiseRec | emRec | opStatus | settlementReport | designReview | fsApproval | fsReview | pdApproval | finalAccount")
|
|
|
] = None,
|
|
|
- # 新增:去水印参数
|
|
|
- remove_watermark: Annotated[
|
|
|
- Optional[bool],
|
|
|
- Form(description="是否去除水印,默认为false")
|
|
|
- ] = False,
|
|
|
- watermark_light_threshold: Annotated[
|
|
|
- Optional[int],
|
|
|
- Form(description="水印亮度阈值(0-255),默认200,高于此值的浅色像素可能是水印")
|
|
|
- ] = 200,
|
|
|
- watermark_saturation_threshold: Annotated[
|
|
|
- Optional[int],
|
|
|
- Form(description="水印饱和度阈值(0-255),默认30,低于此值的低饱和度像素可能是水印")
|
|
|
- ] = 30,
|
|
|
- crop_header_footer: Annotated[
|
|
|
- Optional[bool],
|
|
|
- Form(description="是否裁剪页眉页脚,默认为false")
|
|
|
- ] = False,
|
|
|
- header_ratio: Annotated[
|
|
|
- Optional[float],
|
|
|
- Form(description="页眉裁剪比例(0-1),默认0.05表示裁剪顶部5%")
|
|
|
- ] = 0.05,
|
|
|
- footer_ratio: Annotated[
|
|
|
- Optional[float],
|
|
|
- Form(description="页脚裁剪比例(0-1),默认0.05表示裁剪底部5%")
|
|
|
- ] = 0.05,
|
|
|
- auto_detect_header_footer: Annotated[
|
|
|
- Optional[bool],
|
|
|
- Form(description="是否自动检测页眉页脚边界,默认为false(启用后忽略header_ratio和footer_ratio)")
|
|
|
- ] = False,
|
|
|
- table_only: Annotated[
|
|
|
- Optional[bool],
|
|
|
- Form(description="是否只保留包含表格的附件页,默认为false")
|
|
|
- ] = False,
|
|
|
):
|
|
|
"""
|
|
|
转换PDF/图片文件(异步处理)
|
|
|
@@ -707,16 +571,6 @@ async def convert_file(
|
|
|
* fsApproval - 可研批复投资估算
|
|
|
* fsReview - 可研评审投资估算
|
|
|
* pdApproval - 初设批复概算投资
|
|
|
- * finalAccount - 决算报告
|
|
|
- * safetyFsApproval - 安评可研批复投资估算
|
|
|
- - **remove_watermark**: 是否去除水印(仅对图片有效),默认为false
|
|
|
- - **watermark_light_threshold**: 水印亮度阈值(0-255),默认200
|
|
|
- - **watermark_saturation_threshold**: 水印饱和度阈值(0-255),默认30
|
|
|
- - **crop_header_footer**: 是否裁剪页眉页脚(仅对图片有效),默认为false
|
|
|
- - **header_ratio**: 页眉裁剪比例(0-1),默认0.05
|
|
|
- - **footer_ratio**: 页脚裁剪比例(0-1),默认0.05
|
|
|
- - **auto_detect_header_footer**: 是否自动检测页眉页脚边界,默认为false
|
|
|
- - **table_only**: 是否只保留包含表格的附件页,默认为false
|
|
|
|
|
|
注意:v2 版本内部使用外部API进行转换,v2特有的配置参数(如API URL、backend等)
|
|
|
通过环境变量或配置文件设置,不通过API参数传入。
|
|
|
@@ -837,8 +691,6 @@ async def convert_file(
|
|
|
"pdApproval": "pdApproval",
|
|
|
# 决算报告
|
|
|
"finalAccount": "finalAccount",
|
|
|
- # 安评类
|
|
|
- "safetyFsApproval": "safetyFsApproval",
|
|
|
}
|
|
|
doc_type = None
|
|
|
if type:
|
|
|
@@ -854,14 +706,6 @@ async def convert_file(
|
|
|
# 创建请求对象(v2 精简)
|
|
|
request = ConversionRequest(
|
|
|
doc_type=doc_type,
|
|
|
- remove_watermark=remove_watermark,
|
|
|
- watermark_light_threshold=watermark_light_threshold,
|
|
|
- watermark_saturation_threshold=watermark_saturation_threshold,
|
|
|
- crop_header_footer=crop_header_footer,
|
|
|
- header_ratio=header_ratio,
|
|
|
- footer_ratio=footer_ratio,
|
|
|
- auto_detect_header_footer=auto_detect_header_footer,
|
|
|
- table_only=table_only,
|
|
|
)
|
|
|
|
|
|
# 使用 asyncio.create_task 创建后台任务,确保立即返回
|
|
|
@@ -886,6 +730,76 @@ async def convert_file(
|
|
|
)
|
|
|
|
|
|
|
|
|
def _count_pdf_pages(content: bytes) -> int:
    """Estimate the number of pages in a PDF from its raw bytes.

    Counts page objects (``/Type /Page``) while tolerating optional
    whitespace between the two names and skipping page-tree nodes
    (``/Type /Pages``), which a plain substring count would include.

    This is a heuristic, not a parser: page objects that are not present
    literally in the byte stream are not seen. The result is never below 1.
    """
    import re  # local import: do not rely on a module-level ``import re``

    return max(1, len(re.findall(rb"/Type\s*/Page(?![A-Za-z])", content)))


def _attachment_disposition(filename: str) -> str:
    """Build a ``Content-Disposition: attachment`` header value.

    The plain ``filename`` parameter carries an ASCII-only fallback, because
    a non-latin-1 header value cannot be encoded by the response object; the
    exact name is carried percent-encoded in ``filename*``.
    """
    fallback = "".join(
        ch if ch.isascii() and ch.isprintable() and ch not in '"\\' else "_"
        for ch in filename
    )
    encoded = quote(filename, safe="")
    return f"attachment; filename=\"{fallback}\"; filename*=UTF-8''{encoded}"


@app.post("/pdf_to_markdown")
async def pdf_to_markdown(
    file: Annotated[UploadFile, File(description="上传的 PDF 或图片文件")],
    backend: Annotated[
        Optional[Literal["mineru", "paddle"]],
        Form(description="识别后端:mineru 调用 MinerU file_parse,paddle 调用 PaddleOCR doc_parser")
    ] = "mineru",
    format: Annotated[
        Literal["file", "json"],
        Form(description="返回格式:file 直接返回 .md 文件下载(适合多页),json 返回 JSON 内嵌 markdown 字段(适合少页)")
    ] = "file",
):
    """
    Convert an uploaded PDF or image to Markdown (synchronous endpoint).

    The upload is written to a temporary directory, handed to
    ``convert_pdf_to_markdown_only`` and the temporary directory is removed
    before the handler returns.

    - **file**: the uploaded PDF or image
    - **backend**: ``mineru`` (default) or ``paddle``
    - **format**: ``file`` (default) returns the ``.md`` document as a
      download, suited to long documents; ``json`` returns
      ``{"markdown", "filename"}``, suited to short ones

    Raises ``HTTPException`` 400 for an empty upload or a PDF estimated at
    more than 20 pages, and 500 when the conversion yields no result or
    fails unexpectedly.

    Note: large or many-page files can be slow; keep documents within 20 pages.
    """
    temp_dir = None
    try:
        content_type = file.content_type or ""
        ext_map = {"application/pdf": ".pdf", "image/png": ".png", "image/jpeg": ".jpg", "image/jpg": ".jpg"}
        # Prefer the declared MIME type, then the uploaded name's suffix, then assume PDF.
        ext = ext_map.get(content_type, "") or Path(file.filename or "").suffix or ".pdf"

        content = await file.read()
        if not content:
            # Nothing to recognise; reject before touching the disk or the backend.
            raise HTTPException(status_code=400, detail="上传文件为空")

        # Page limit: only PDFs can have more than one page.
        if ext.lower() == ".pdf" and _count_pdf_pages(content) > 20:
            raise HTTPException(status_code=400, detail="文件页数超过 20 页,拒绝处理")

        temp_dir = tempfile.mkdtemp(prefix="pdf_converter_v2_pdf_to_md_")
        file_path = os.path.join(temp_dir, f"file{ext}")
        with open(file_path, "wb") as f:
            f.write(content)

        output_dir = os.path.join(temp_dir, "output")
        os.makedirs(output_dir, exist_ok=True)
        api_url = os.getenv("API_URL", "http://127.0.0.1:5282")
        result = await convert_pdf_to_markdown_only(
            input_file=file_path,
            output_dir=output_dir,
            backend=backend or "mineru",
            url=api_url,
        )
        if not result:
            raise HTTPException(status_code=500, detail="PDF 转 Markdown 失败,请查看服务端日志")

        if format == "file":
            # Return the document as a download so large text is not embedded in JSON.
            return Response(
                content=result["markdown"],
                media_type="text/markdown; charset=utf-8",
                headers={"Content-Disposition": _attachment_disposition(result["filename"])},
            )
        return PdfToMarkdownResponse(markdown=result["markdown"], filename=result["filename"])
    except HTTPException:
        # Deliberate HTTP errors pass through unchanged.
        raise
    except Exception as e:
        logger.exception(f"[pdf_to_markdown] 转换失败: {e}")
        raise HTTPException(status_code=500, detail=str(e)) from e
    finally:
        # Best-effort cleanup: a failed removal must not mask the real outcome.
        if temp_dir and os.path.isdir(temp_dir):
            try:
                shutil.rmtree(temp_dir)
            except Exception as exc:
                logger.debug(f"[pdf_to_markdown] 清理临时目录失败: {exc}")
|
|
|
+
|
|
|
+
|
|
|
@app.get("/task/{task_id}", response_model=TaskStatus)
|
|
|
async def get_task_status(task_id: str):
|
|
|
"""
|
|
|
@@ -1051,7 +965,6 @@ async def ocr_image(request: OCRRequest):
|
|
|
- **crop_header_footer**: 是否裁剪页眉页脚,默认为false
|
|
|
- **header_ratio**: 页眉裁剪比例(0-1),默认0.05表示裁剪顶部5%
|
|
|
- **footer_ratio**: 页脚裁剪比例(0-1),默认0.05表示裁剪底部5%
|
|
|
- - **auto_detect_header_footer**: 是否自动检测页眉页脚边界,默认为false(启用后忽略header_ratio和footer_ratio)
|
|
|
|
|
|
返回识别出的文本列表和GPU监控信息
|
|
|
"""
|
|
|
@@ -1117,10 +1030,7 @@ async def ocr_image(request: OCRRequest):
|
|
|
from ..utils.image_preprocessor import crop_header_footer, check_opencv_available
|
|
|
|
|
|
if check_opencv_available():
|
|
|
- if request.auto_detect_header_footer:
|
|
|
- logger.info("[OCR] 开始自动检测并裁剪页眉页脚")
|
|
|
- else:
|
|
|
- logger.info(f"[OCR] 开始裁剪页眉页脚,顶部比例: {request.header_ratio}, 底部比例: {request.footer_ratio}")
|
|
|
+ logger.info(f"[OCR] 开始裁剪页眉页脚,顶部比例: {request.header_ratio}, 底部比例: {request.footer_ratio}")
|
|
|
|
|
|
# 裁剪后的图片路径
|
|
|
cropped_image_path = os.path.join(temp_dir, f"ocr_image_cropped{ext}")
|
|
|
@@ -1129,8 +1039,7 @@ async def ocr_image(request: OCRRequest):
|
|
|
image_path,
|
|
|
output_path=cropped_image_path,
|
|
|
header_ratio=request.header_ratio or 0.05,
|
|
|
- footer_ratio=request.footer_ratio or 0.05,
|
|
|
- auto_detect=request.auto_detect_header_footer or False
|
|
|
+ footer_ratio=request.footer_ratio or 0.05
|
|
|
)
|
|
|
logger.info(f"[OCR] 裁剪页眉页脚完成: {image_path}")
|
|
|
else:
|