Browse Source

pdf_converter_v2: 新增 POST /pdf_to_markdown 同步接口,默认返回 .md 文件下载

何文松 3 weeks ago
parent
commit
313ff337c2

+ 91 - 182
pdf_converter_v2/api/main.py

@@ -13,14 +13,15 @@ import base64
 import json
 from pathlib import Path
 from typing import Optional, List
+from urllib.parse import quote
 
 from fastapi import FastAPI, File, UploadFile, Form, HTTPException
-from fastapi.responses import FileResponse, JSONResponse
+from fastapi.responses import FileResponse, JSONResponse, Response
 from fastapi.middleware.cors import CORSMiddleware
 from pydantic import BaseModel
 from typing_extensions import Annotated, Literal
 
-from ..processor.converter import convert_to_markdown
+from ..processor.converter import convert_to_markdown, convert_pdf_to_markdown_only
 from ..utils.logging_config import get_logger
 
 # 尝试导入配置,如果不存在则使用默认值
@@ -150,16 +151,6 @@ class ConversionRequest(BaseModel):
     """转换请求模型(v2 精简版)"""
     # 新增:强制文档类型(正式全称)
     doc_type: Optional[str] = None
-    # 新增:去水印参数
-    remove_watermark: Optional[bool] = False
-    watermark_light_threshold: Optional[int] = 200
-    watermark_saturation_threshold: Optional[int] = 30
-    crop_header_footer: Optional[bool] = False
-    header_ratio: Optional[float] = 0.05
-    footer_ratio: Optional[float] = 0.05
-    auto_detect_header_footer: Optional[bool] = False
-    # 新增:附件页切割参数
-    table_only: Optional[bool] = False  # 是否只保留包含表格的附件页(默认False)
 
 
 class ConversionResponse(BaseModel):
@@ -216,7 +207,6 @@ class OCRRequest(BaseModel):
     crop_header_footer: Optional[bool] = False  # 是否裁剪页眉页脚
     header_ratio: Optional[float] = 0.05  # 页眉裁剪比例(0-1),默认5%
     footer_ratio: Optional[float] = 0.05  # 页脚裁剪比例(0-1),默认5%
-    auto_detect_header_footer: Optional[bool] = False  # 是否自动检测页眉页脚边界
 
 
 class OCRResponse(BaseModel):
@@ -227,6 +217,12 @@ class OCRResponse(BaseModel):
     gpu_info: Optional[GpuInfo] = None  # GPU监控信息
 
 
+class PdfToMarkdownResponse(BaseModel):
+    """JSON response body of the synchronous PDF-to-Markdown endpoint."""
+    markdown: str  # full text of the generated Markdown
+    filename: str  # suggested file name for the Markdown (e.g. xxx.md)
+
+
 @app.get("/")
 async def root():
     """API根路径"""
@@ -243,6 +239,7 @@ async def root():
         },
         "endpoints": {
             "POST /convert": "转换PDF/图片文件(异步,立即返回task_id)",
+            "POST /pdf_to_markdown": "PDF/图片转 Markdown(同步,默认返回 .md 文件下载,format=json 可返回 JSON)",
             "GET /task/{task_id}": "查询任务状态(轮询接口)",
             "GET /task/{task_id}/json": "直接获取JSON数据(返回JSON对象,不下载文件)",
             "GET /download/{task_id}/markdown": "下载Markdown文件",
@@ -325,108 +322,11 @@ async def process_conversion_task(
         
         logger.info(f"[任务 {task_id}] 开始处理: {file_path}")
         
-        # 文件预处理(支持图片和PDF)
-        from pathlib import Path as PathLib
-        file_suffix = PathLib(file_path).suffix.lower()
-        is_image = file_suffix in ['.png', '.jpg', '.jpeg', '.bmp', '.tiff', '.tif']
-        is_pdf = file_suffix == '.pdf'
-        
-        # 图片预处理:去水印或裁剪页眉页脚
-        if is_image and (request.remove_watermark or request.crop_header_footer):
-            logger.info(f"[任务 {task_id}] 检测到图片文件,开始预处理...")
-            preprocessed_path = file_path
-            
-            # 裁剪页眉页脚
-            if request.crop_header_footer:
-                try:
-                    from ..utils.image_preprocessor import crop_header_footer, check_opencv_available
-                    
-                    if check_opencv_available():
-                        if request.auto_detect_header_footer:
-                            logger.info(f"[任务 {task_id}] 开始自动检测并裁剪页眉页脚")
-                        else:
-                            logger.info(f"[任务 {task_id}] 开始裁剪页眉页脚,顶部比例: {request.header_ratio}, 底部比例: {request.footer_ratio}")
-                        
-                        # 裁剪后的图片路径
-                        cropped_path = str(PathLib(output_dir) / f"preprocessed_cropped{file_suffix}")
-                        
-                        preprocessed_path = await asyncio.to_thread(
-                            crop_header_footer,
-                            preprocessed_path,
-                            output_path=cropped_path,
-                            header_ratio=request.header_ratio or 0.05,
-                            footer_ratio=request.footer_ratio or 0.05,
-                            auto_detect=request.auto_detect_header_footer or False
-                        )
-                        logger.info(f"[任务 {task_id}] 裁剪页眉页脚完成: {preprocessed_path}")
-                    else:
-                        logger.warning(f"[任务 {task_id}] OpenCV 未安装,跳过裁剪页眉页脚")
-                except Exception as e:
-                    logger.warning(f"[任务 {task_id}] 裁剪页眉页脚失败,使用原图继续: {e}")
-            
-            # 去水印
-            if request.remove_watermark:
-                try:
-                    from ..utils.image_preprocessor import remove_watermark, check_opencv_available
-                    
-                    if check_opencv_available():
-                        logger.info(f"[任务 {task_id}] 开始去水印处理,亮度阈值: {request.watermark_light_threshold}, 饱和度阈值: {request.watermark_saturation_threshold}")
-                        
-                        # 去水印后的图片路径
-                        nowm_path = str(PathLib(output_dir) / f"preprocessed_nowm{file_suffix}")
-                        
-                        preprocessed_path = await asyncio.to_thread(
-                            remove_watermark,
-                            preprocessed_path,
-                            output_path=nowm_path,
-                            light_threshold=request.watermark_light_threshold or 200,
-                            saturation_threshold=request.watermark_saturation_threshold or 30,
-                            method="hsv"
-                        )
-                        logger.info(f"[任务 {task_id}] 去水印完成: {preprocessed_path}")
-                    else:
-                        logger.warning(f"[任务 {task_id}] OpenCV 未安装,跳过去水印处理")
-                except Exception as e:
-                    logger.warning(f"[任务 {task_id}] 去水印处理失败,使用原图继续: {e}")
-            
-            # 更新文件路径为预处理后的路径
-            if preprocessed_path != file_path:
-                file_path = preprocessed_path
-                logger.info(f"[任务 {task_id}] 图片预处理完成,使用预处理后的文件: {file_path}")
-        
-        # PDF预处理:去水印
-        elif is_pdf and request.remove_watermark:
-            logger.info(f"[任务 {task_id}] 检测到PDF文件,开始去水印预处理...")
-            try:
-                from ..utils.pdf_watermark_remover import remove_watermark_from_pdf
-                
-                # 去水印后的PDF路径
-                nowm_pdf_path = str(PathLib(output_dir) / f"preprocessed_nowm.pdf")
-                
-                # 执行去水印
-                logger.info(f"[任务 {task_id}] 开始PDF去水印处理,亮度阈值: {request.watermark_light_threshold}, 饱和度阈值: {request.watermark_saturation_threshold}")
-                success = await asyncio.to_thread(
-                    remove_watermark_from_pdf,
-                    input_pdf=file_path,
-                    output_pdf=nowm_pdf_path,
-                    light_threshold=request.watermark_light_threshold or 200,
-                    saturation_threshold=request.watermark_saturation_threshold or 30,
-                    dpi=200  # PDF转图片的DPI
-                )
-                
-                if success and PathLib(nowm_pdf_path).exists():
-                    file_path = nowm_pdf_path
-                    logger.info(f"[任务 {task_id}] PDF去水印完成: {file_path}")
-                else:
-                    logger.warning(f"[任务 {task_id}] PDF去水印失败,使用原PDF继续")
-            except Exception as e:
-                logger.warning(f"[任务 {task_id}] PDF去水印处理失败,使用原PDF继续: {e}")
-        
         result = None
         tables_info = None
         
         # 针对投资估算类型,需要先切割附件页
-        if request.doc_type in ("fsApproval", "fsReview", "pdApproval", "safetyFsApproval"):
+        if request.doc_type in ("fsApproval", "fsReview", "pdApproval"):
             logger.info(f"[任务 {task_id}] 文档类型 {request.doc_type},需要先切割附件页")
             
             # 导入附件页切割函数
@@ -442,21 +342,18 @@ async def process_conversion_task(
                 attachment_dir = PathLib(output_dir) / "attachments"
                 attachment_dir.mkdir(parents=True, exist_ok=True)
                 
-                # 切割附件页(根据 table_only 参数决定是否过滤非表格内容)
-                logger.info(f"[任务 {task_id}] 开始切割附件页(table_only={request.table_only}),输出目录: {attachment_dir}")
+                # 切割附件页
+                logger.info(f"[任务 {task_id}] 开始切割附件页,输出目录: {attachment_dir}")
                 await asyncio.to_thread(
                     split_attachment_pages,
                     file_path,
                     attachment_dir,
                     use_ocr=True,
-                    debug=False,
-                    table_only=request.table_only  # 是否只保留包含表格的附件页
+                    debug=False
                 )
                 
-                # 查找切割后的附件页PDF(优先使用表格附件页,其次使用普通附件页)
-                attachment_pdfs = list(attachment_dir.glob("*_表格附件页_*.pdf"))
-                if not attachment_pdfs:
-                    attachment_pdfs = list(attachment_dir.glob("*_附件页_*.pdf"))
+                # 查找切割后的附件页PDF
+                attachment_pdfs = list(attachment_dir.glob("*_附件页_*.pdf"))
                 logger.info(f"[任务 {task_id}] 附件页目录内容: {list(attachment_dir.iterdir()) if attachment_dir.exists() else '(目录不存在)'}")
                 
                 if attachment_pdfs:
@@ -649,44 +546,11 @@ async def process_conversion_task(
 @app.post("/convert", response_model=ConversionResponse)
 async def convert_file(
     file: Annotated[UploadFile, File(description="上传的PDF或图片文件")],
-    # 新增:类型参数(英文传参) noiseRec | emRec | opStatus | settlementReport | designReview | fsApproval | fsReview | pdApproval | finalAccount | safetyFsApproval
+    # 新增:类型参数(英文传参) noiseRec | emRec | opStatus | settlementReport | designReview | fsApproval | fsReview | pdApproval | finalAccount
     type: Annotated[
-        Optional[Literal["noiseRec", "emRec", "opStatus", "settlementReport", "designReview", "fsApproval", "fsReview", "pdApproval", "finalAccount", "safetyFsApproval"]],
-        Form(description="文档类型:noiseRec | emRec | opStatus | settlementReport | designReview | fsApproval | fsReview | pdApproval | finalAccount | safetyFsApproval")
+        Optional[Literal["noiseRec", "emRec", "opStatus", "settlementReport", "designReview", "fsApproval", "fsReview", "pdApproval", "finalAccount"]],
+        Form(description="文档类型:noiseRec | emRec | opStatus | settlementReport | designReview | fsApproval | fsReview | pdApproval | finalAccount")
     ] = None,
-    # 新增:去水印参数
-    remove_watermark: Annotated[
-        Optional[bool],
-        Form(description="是否去除水印,默认为false")
-    ] = False,
-    watermark_light_threshold: Annotated[
-        Optional[int],
-        Form(description="水印亮度阈值(0-255),默认200,高于此值的浅色像素可能是水印")
-    ] = 200,
-    watermark_saturation_threshold: Annotated[
-        Optional[int],
-        Form(description="水印饱和度阈值(0-255),默认30,低于此值的低饱和度像素可能是水印")
-    ] = 30,
-    crop_header_footer: Annotated[
-        Optional[bool],
-        Form(description="是否裁剪页眉页脚,默认为false")
-    ] = False,
-    header_ratio: Annotated[
-        Optional[float],
-        Form(description="页眉裁剪比例(0-1),默认0.05表示裁剪顶部5%")
-    ] = 0.05,
-    footer_ratio: Annotated[
-        Optional[float],
-        Form(description="页脚裁剪比例(0-1),默认0.05表示裁剪底部5%")
-    ] = 0.05,
-    auto_detect_header_footer: Annotated[
-        Optional[bool],
-        Form(description="是否自动检测页眉页脚边界,默认为false(启用后忽略header_ratio和footer_ratio)")
-    ] = False,
-    table_only: Annotated[
-        Optional[bool],
-        Form(description="是否只保留包含表格的附件页,默认为false")
-    ] = False,
 ):
     """
     转换PDF/图片文件(异步处理)
@@ -707,16 +571,6 @@ async def convert_file(
       * fsApproval - 可研批复投资估算
       * fsReview - 可研评审投资估算
       * pdApproval - 初设批复概算投资
-      * finalAccount - 决算报告
-      * safetyFsApproval - 安评可研批复投资估算
-    - **remove_watermark**: 是否去除水印(仅对图片有效),默认为false
-    - **watermark_light_threshold**: 水印亮度阈值(0-255),默认200
-    - **watermark_saturation_threshold**: 水印饱和度阈值(0-255),默认30
-    - **crop_header_footer**: 是否裁剪页眉页脚(仅对图片有效),默认为false
-    - **header_ratio**: 页眉裁剪比例(0-1),默认0.05
-    - **footer_ratio**: 页脚裁剪比例(0-1),默认0.05
-    - **auto_detect_header_footer**: 是否自动检测页眉页脚边界,默认为false
-    - **table_only**: 是否只保留包含表格的附件页,默认为false
     
     注意:v2 版本内部使用外部API进行转换,v2特有的配置参数(如API URL、backend等)
     通过环境变量或配置文件设置,不通过API参数传入。
@@ -837,8 +691,6 @@ async def convert_file(
         "pdApproval": "pdApproval",
         # 决算报告
         "finalAccount": "finalAccount",
-        # 安评类
-        "safetyFsApproval": "safetyFsApproval",
     }
     doc_type = None
     if type:
@@ -854,14 +706,6 @@ async def convert_file(
     # 创建请求对象(v2 精简)
     request = ConversionRequest(
         doc_type=doc_type,
-        remove_watermark=remove_watermark,
-        watermark_light_threshold=watermark_light_threshold,
-        watermark_saturation_threshold=watermark_saturation_threshold,
-        crop_header_footer=crop_header_footer,
-        header_ratio=header_ratio,
-        footer_ratio=footer_ratio,
-        auto_detect_header_footer=auto_detect_header_footer,
-        table_only=table_only,
     )
     
     # 使用 asyncio.create_task 创建后台任务,确保立即返回
@@ -886,6 +730,76 @@ async def convert_file(
     )
 
 
+@app.post("/pdf_to_markdown")
+async def pdf_to_markdown(
+    file: Annotated[UploadFile, File(description="上传的 PDF 或图片文件")],
+    backend: Annotated[
+        Optional[Literal["mineru", "paddle"]],
+        Form(description="识别后端:mineru 调用 MinerU file_parse,paddle 调用 PaddleOCR doc_parser")
+    ] = "mineru",
+    format: Annotated[
+        Literal["file", "json"],
+        Form(description="返回格式:file 直接返回 .md 文件下载(适合多页),json 返回 JSON 内嵌 markdown 字段(适合少页)")
+    ] = "file",
+):
+    """
+    Convert an uploaded PDF/image to Markdown synchronously and return it in this response.
+    - **file**: uploaded PDF or image; the whole upload is read into memory and written to a temp dir
+    - **backend**: mineru (default) / paddle, forwarded to convert_pdf_to_markdown_only
+    - **format**: file (default) returns a .md attachment; json returns { "markdown", "filename" }
+    Raises HTTPException 400 when more than 20 pages are detected in a PDF, and 500 when the
+    conversion yields no result or any other error occurs. The temp dir is removed in all cases.
+    """
+    import re  # function-scope import: the module's import block is not fully visible from this hunk
+    temp_dir = None
+    try:
+        content_type = file.content_type or ""
+        ext_map = {"application/pdf": ".pdf", "image/png": ".png", "image/jpeg": ".jpg", "image/jpg": ".jpg"}
+        ext = ext_map.get(content_type, "") or (Path(file.filename or "").suffix if file.filename else "") or ".pdf"
+        temp_dir = tempfile.mkdtemp(prefix="pdf_converter_v2_pdf_to_md_")
+        file_path = os.path.join(temp_dir, f"file{ext}")
+        content = await file.read()
+        with open(file_path, "wb") as f:
+            f.write(content)
+        # Page limit (heuristic on raw bytes): count /Type /Page objects; the lookahead skips /Pages tree nodes, \s* accepts "/Type/Page". NOTE(review): /convert's check is not visible here - align if needed.
+        pages = 1
+        if ext.lower() == ".pdf" and content:
+            pages = max(1, len(re.findall(rb"/Type\s*/Page(?![A-Za-z])", content)))
+        if pages > 20:
+            raise HTTPException(status_code=400, detail="文件页数超过 20 页,拒绝处理")
+        output_dir = os.path.join(temp_dir, "output")
+        os.makedirs(output_dir, exist_ok=True)
+        api_url = os.getenv("API_URL", "http://127.0.0.1:5282")
+        result = await convert_pdf_to_markdown_only(
+            input_file=file_path,
+            output_dir=output_dir,
+            backend=backend or "mineru",
+            url=api_url,
+        )
+        if not result:
+            raise HTTPException(status_code=500, detail="PDF 转 Markdown 失败,请查看服务端日志")
+        if format == "file":
+            # Return the .md as a download so large text is not embedded in JSON. The percent-encoded name is used for both parameters so the header stays ASCII and cannot contain a raw double quote.
+            safe_filename = quote(result["filename"])
+            return Response(
+                content=result["markdown"],
+                media_type="text/markdown; charset=utf-8",
+                headers={"Content-Disposition": f'attachment; filename="{safe_filename}"; filename*=UTF-8\'\'{safe_filename}'},
+            )
+        return PdfToMarkdownResponse(markdown=result["markdown"], filename=result["filename"])
+    except HTTPException:
+        raise
+    except Exception as e:
+        logger.exception(f"[pdf_to_markdown] 转换失败: {e}")
+        raise HTTPException(status_code=500, detail=str(e)) from e
+    finally:
+        if temp_dir and os.path.isdir(temp_dir):
+            try:
+                shutil.rmtree(temp_dir)
+            except Exception as exc:
+                logger.debug(f"[pdf_to_markdown] 清理临时目录失败: {exc}")
+
+
 @app.get("/task/{task_id}", response_model=TaskStatus)
 async def get_task_status(task_id: str):
     """
@@ -1051,7 +965,6 @@ async def ocr_image(request: OCRRequest):
     - **crop_header_footer**: 是否裁剪页眉页脚,默认为false
     - **header_ratio**: 页眉裁剪比例(0-1),默认0.05表示裁剪顶部5%
     - **footer_ratio**: 页脚裁剪比例(0-1),默认0.05表示裁剪底部5%
-    - **auto_detect_header_footer**: 是否自动检测页眉页脚边界,默认为false(启用后忽略header_ratio和footer_ratio)
     
     返回识别出的文本列表和GPU监控信息
     """
@@ -1117,10 +1030,7 @@ async def ocr_image(request: OCRRequest):
                 from ..utils.image_preprocessor import crop_header_footer, check_opencv_available
                 
                 if check_opencv_available():
-                    if request.auto_detect_header_footer:
-                        logger.info("[OCR] 开始自动检测并裁剪页眉页脚")
-                    else:
-                        logger.info(f"[OCR] 开始裁剪页眉页脚,顶部比例: {request.header_ratio}, 底部比例: {request.footer_ratio}")
+                    logger.info(f"[OCR] 开始裁剪页眉页脚,顶部比例: {request.header_ratio}, 底部比例: {request.footer_ratio}")
                     
                     # 裁剪后的图片路径
                     cropped_image_path = os.path.join(temp_dir, f"ocr_image_cropped{ext}")
@@ -1129,8 +1039,7 @@ async def ocr_image(request: OCRRequest):
                         image_path,
                         output_path=cropped_image_path,
                         header_ratio=request.header_ratio or 0.05,
-                        footer_ratio=request.footer_ratio or 0.05,
-                        auto_detect=request.auto_detect_header_footer or False
+                        footer_ratio=request.footer_ratio or 0.05
                     )
                     logger.info(f"[OCR] 裁剪页眉页脚完成: {image_path}")
                 else:

+ 2 - 2
pdf_converter_v2/processor/__init__.py

@@ -2,7 +2,7 @@
 
 """PDF转换处理器模块"""
 
-from .converter import convert_to_markdown
+from .converter import convert_to_markdown, convert_pdf_to_markdown_only
 
-__all__ = ['convert_to_markdown']
+__all__ = ['convert_to_markdown', 'convert_pdf_to_markdown_only']
 

+ 51 - 0
pdf_converter_v2/processor/converter.py

@@ -426,3 +426,54 @@ async def convert_to_markdown(
         logger.exception(f"转换过程出错: {e}")
         return None
 
+
+async def convert_pdf_to_markdown_only(
+    input_file: str,
+    output_dir: str,
+    backend: str = "mineru",
+    url: Optional[str] = None,
+    max_pages: int = 99999,
+    formula_enable: bool = True,
+    table_enable: bool = True,
+    language: str = "ch",
+) -> Optional[dict]:
+    """
+    Convert a PDF/image to Markdown text only (JSON output disabled); used by the synchronous API.
+
+    :param input_file: path of the input file
+    :param output_dir: output directory (used as scratch space)
+    :param backend: "paddle" routes to _convert_with_paddle; any other value routes to convert_to_markdown
+    :param url: MinerU API address for the non-paddle path; falls back to env API_URL, then http://127.0.0.1:5282
+    :param max_pages: forwarded, like formula_enable / table_enable / language, to convert_to_markdown only
+    :return: {"markdown": str, "filename": str}, or None if the input is missing or no content was produced
+    """
+    if not os.path.exists(input_file):
+        logger.error(f"输入文件不存在: {input_file}")
+        return None
+    if backend == "paddle":
+        conversion = await _convert_with_paddle(
+            input_file=input_file,
+            output_dir=output_dir,
+            embed_images=False,
+            output_json=False,
+            forced_document_type=None,
+        )
+    else:
+        conversion = await convert_to_markdown(
+            input_file=input_file,
+            output_dir=output_dir,
+            max_pages=max_pages,
+            output_json=False,
+            formula_enable=formula_enable,
+            table_enable=table_enable,
+            language=language,
+            url=url or os.getenv("API_URL", "http://127.0.0.1:5282"),
+            embed_images=False,
+        )
+    if not (conversion and conversion.get("content")):
+        return None
+    # Prefer the name of the Markdown file the backend reported; otherwise derive one from the input's stem.
+    md_path = conversion.get("markdown_file")
+    filename = Path(md_path).name if md_path else Path(input_file).stem + ".md"
+    return {"markdown": conversion["content"], "filename": filename}
+