3 weeks ago · 313ff337c2
--- a/pdf_converter_v2/api/main.py
+++ b/pdf_converter_v2/api/main.py
@@ -13,14 +13,15 @@ import base64
 
				 import json
			
 
				 from pathlib import Path
			
 
				 from typing import Optional, List
			
 
				+from urllib.parse import quote
			
 
				 
			
 
				 from fastapi import FastAPI, File, UploadFile, Form, HTTPException
			
 
				-from fastapi.responses import FileResponse, JSONResponse
			
 
				+from fastapi.responses import FileResponse, JSONResponse, Response
			
 
				 from fastapi.middleware.cors import CORSMiddleware
			
 
				 from pydantic import BaseModel
			
 
				 from typing_extensions import Annotated, Literal
			
 
				 
			
 
				-from ..processor.converter import convert_to_markdown
			
 
				+from ..processor.converter import convert_to_markdown, convert_pdf_to_markdown_only
			
 
				 from ..utils.logging_config import get_logger
			
 
				 
			
 
				 # 尝试导入配置，如果不存在则使用默认值
			
@@ -150,16 +151,6 @@ class ConversionRequest(BaseModel):
 
				     """转换请求模型（v2 精简版）"""
			
 
				     # 新增：强制文档类型（正式全称）
			
 
				     doc_type: Optional[str] = None
			
 
				-    # 新增：去水印参数
			
 
				-    remove_watermark: Optional[bool] = False
			
 
				-    watermark_light_threshold: Optional[int] = 200
			
 
				-    watermark_saturation_threshold: Optional[int] = 30
			
 
				-    crop_header_footer: Optional[bool] = False
			
 
				-    header_ratio: Optional[float] = 0.05
			
 
				-    footer_ratio: Optional[float] = 0.05
			
 
				-    auto_detect_header_footer: Optional[bool] = False
			
 
				-    # 新增：附件页切割参数
			
 
				-    table_only: Optional[bool] = False  # 是否只保留包含表格的附件页（默认False）
			
 
				 
			
 
				 
			
 
				 class ConversionResponse(BaseModel):
			
@@ -216,7 +207,6 @@ class OCRRequest(BaseModel):
 
				     crop_header_footer: Optional[bool] = False  # 是否裁剪页眉页脚
			
 
				     header_ratio: Optional[float] = 0.05  # 页眉裁剪比例（0-1），默认5%
			
 
				     footer_ratio: Optional[float] = 0.05  # 页脚裁剪比例（0-1），默认5%
			
 
				-    auto_detect_header_footer: Optional[bool] = False  # 是否自动检测页眉页脚边界
			
 
				 
			
 
				 
			
 
				 class OCRResponse(BaseModel):
			
@@ -227,6 +217,12 @@ class OCRResponse(BaseModel):
 
				     gpu_info: Optional[GpuInfo] = None  # GPU监控信息
			
 
				 
			
 
				 
			
 
				+class PdfToMarkdownResponse(BaseModel):
			
 
				+    """PDF 转 Markdown 同步接口响应"""
			
 
				+    markdown: str  # 生成的 Markdown 全文
			
 
				+    filename: str  # 建议的文件名（如 xxx.md）
			
 
				+
			
 
				+
			
 
				 @app.get("/")
			
 
				 async def root():
			
 
				     """API根路径"""
			
@@ -243,6 +239,7 @@ async def root():
 
				         },
			
 
				         "endpoints": {
			
 
				             "POST /convert": "转换PDF/图片文件（异步，立即返回task_id）",
			
 
				+            "POST /pdf_to_markdown": "PDF/图片转 Markdown（同步，默认返回 .md 文件下载，format=json 可返回 JSON）",
			
 
				             "GET /task/{task_id}": "查询任务状态（轮询接口）",
			
 
				             "GET /task/{task_id}/json": "直接获取JSON数据（返回JSON对象，不下载文件）",
			
 
				             "GET /download/{task_id}/markdown": "下载Markdown文件",
			
@@ -325,108 +322,11 @@ async def process_conversion_task(
 
				         
			
 
				         logger.info(f"[任务 {task_id}] 开始处理: {file_path}")
			
 
				         
			
 
				-        # 文件预处理（支持图片和PDF）
			
 
				-        from pathlib import Path as PathLib
			
 
				-        file_suffix = PathLib(file_path).suffix.lower()
			
 
				-        is_image = file_suffix in ['.png', '.jpg', '.jpeg', '.bmp', '.tiff', '.tif']
			
 
				-        is_pdf = file_suffix == '.pdf'
			
 
				-        
			
 
				-        # 图片预处理：去水印或裁剪页眉页脚
			
 
				-        if is_image and (request.remove_watermark or request.crop_header_footer):
			
 
				-            logger.info(f"[任务 {task_id}] 检测到图片文件，开始预处理...")
			
 
				-            preprocessed_path = file_path
			
 
				-            
			
 
				-            # 裁剪页眉页脚
			
 
				-            if request.crop_header_footer:
			
 
				-                try:
			
 
				-                    from ..utils.image_preprocessor import crop_header_footer, check_opencv_available
			
 
				-                    
			
 
				-                    if check_opencv_available():
			
 
				-                        if request.auto_detect_header_footer:
			
 
				-                            logger.info(f"[任务 {task_id}] 开始自动检测并裁剪页眉页脚")
			
 
				-                        else:
			
 
				-                            logger.info(f"[任务 {task_id}] 开始裁剪页眉页脚，顶部比例: {request.header_ratio}, 底部比例: {request.footer_ratio}")
			
 
				-                        
			
 
				-                        # 裁剪后的图片路径
			
 
				-                        cropped_path = str(PathLib(output_dir) / f"preprocessed_cropped{file_suffix}")
			
 
				-                        
			
 
				-                        preprocessed_path = await asyncio.to_thread(
			
 
				-                            crop_header_footer,
			
 
				-                            preprocessed_path,
			
 
				-                            output_path=cropped_path,
			
 
				-                            header_ratio=request.header_ratio or 0.05,
			
 
				-                            footer_ratio=request.footer_ratio or 0.05,
			
 
				-                            auto_detect=request.auto_detect_header_footer or False
			
 
				-                        )
			
 
				-                        logger.info(f"[任务 {task_id}] 裁剪页眉页脚完成: {preprocessed_path}")
			
 
				-                    else:
			
 
				-                        logger.warning(f"[任务 {task_id}] OpenCV 未安装，跳过裁剪页眉页脚")
			
 
				-                except Exception as e:
			
 
				-                    logger.warning(f"[任务 {task_id}] 裁剪页眉页脚失败，使用原图继续: {e}")
			
 
				-            
			
 
				-            # 去水印
			
 
				-            if request.remove_watermark:
			
 
				-                try:
			
 
				-                    from ..utils.image_preprocessor import remove_watermark, check_opencv_available
			
 
				-                    
			
 
				-                    if check_opencv_available():
			
 
				-                        logger.info(f"[任务 {task_id}] 开始去水印处理，亮度阈值: {request.watermark_light_threshold}, 饱和度阈值: {request.watermark_saturation_threshold}")
			
 
				-                        
			
 
				-                        # 去水印后的图片路径
			
 
				-                        nowm_path = str(PathLib(output_dir) / f"preprocessed_nowm{file_suffix}")
			
 
				-                        
			
 
				-                        preprocessed_path = await asyncio.to_thread(
			
 
				-                            remove_watermark,
			
 
				-                            preprocessed_path,
			
 
				-                            output_path=nowm_path,
			
 
				-                            light_threshold=request.watermark_light_threshold or 200,
			
 
				-                            saturation_threshold=request.watermark_saturation_threshold or 30,
			
 
				-                            method="hsv"
			
 
				-                        )
			
 
				-                        logger.info(f"[任务 {task_id}] 去水印完成: {preprocessed_path}")
			
 
				-                    else:
			
 
				-                        logger.warning(f"[任务 {task_id}] OpenCV 未安装，跳过去水印处理")
			
 
				-                except Exception as e:
			
 
				-                    logger.warning(f"[任务 {task_id}] 去水印处理失败，使用原图继续: {e}")
			
 
				-            
			
 
				-            # 更新文件路径为预处理后的路径
			
 
				-            if preprocessed_path != file_path:
			
 
				-                file_path = preprocessed_path
			
 
				-                logger.info(f"[任务 {task_id}] 图片预处理完成，使用预处理后的文件: {file_path}")
			
 
				-        
			
 
				-        # PDF预处理：去水印
			
 
				-        elif is_pdf and request.remove_watermark:
			
 
				-            logger.info(f"[任务 {task_id}] 检测到PDF文件，开始去水印预处理...")
			
 
				-            try:
			
 
				-                from ..utils.pdf_watermark_remover import remove_watermark_from_pdf
			
 
				-                
			
 
				-                # 去水印后的PDF路径
			
 
				-                nowm_pdf_path = str(PathLib(output_dir) / f"preprocessed_nowm.pdf")
			
 
				-                
			
 
				-                # 执行去水印
			
 
				-                logger.info(f"[任务 {task_id}] 开始PDF去水印处理，亮度阈值: {request.watermark_light_threshold}, 饱和度阈值: {request.watermark_saturation_threshold}")
			
 
				-                success = await asyncio.to_thread(
			
 
				-                    remove_watermark_from_pdf,
			
 
				-                    input_pdf=file_path,
			
 
				-                    output_pdf=nowm_pdf_path,
			
 
				-                    light_threshold=request.watermark_light_threshold or 200,
			
 
				-                    saturation_threshold=request.watermark_saturation_threshold or 30,
			
 
				-                    dpi=200  # PDF转图片的DPI
			
 
				-                )
			
 
				-                
			
 
				-                if success and PathLib(nowm_pdf_path).exists():
			
 
				-                    file_path = nowm_pdf_path
			
 
				-                    logger.info(f"[任务 {task_id}] PDF去水印完成: {file_path}")
			
 
				-                else:
			
 
				-                    logger.warning(f"[任务 {task_id}] PDF去水印失败，使用原PDF继续")
			
 
				-            except Exception as e:
			
 
				-                logger.warning(f"[任务 {task_id}] PDF去水印处理失败，使用原PDF继续: {e}")
			
 
				-        
			
 
				         result = None
			
 
				         tables_info = None
			
 
				         
			
 
				         # 针对投资估算类型，需要先切割附件页
			
 
				-        if request.doc_type in ("fsApproval", "fsReview", "pdApproval", "safetyFsApproval"):
			
 
				+        if request.doc_type in ("fsApproval", "fsReview", "pdApproval"):
			
 
				             logger.info(f"[任务 {task_id}] 文档类型 {request.doc_type}，需要先切割附件页")
			
 
				             
			
 
				             # 导入附件页切割函数
			
@@ -442,21 +342,18 @@ async def process_conversion_task(
 
				                 attachment_dir = PathLib(output_dir) / "attachments"
			
 
				                 attachment_dir.mkdir(parents=True, exist_ok=True)
			
 
				                 
			
 
				-                # 切割附件页（根据 table_only 参数决定是否过滤非表格内容）
			
 
				-                logger.info(f"[任务 {task_id}] 开始切割附件页（table_only={request.table_only}），输出目录: {attachment_dir}")
			
 
				+                # 切割附件页
			
 
				+                logger.info(f"[任务 {task_id}] 开始切割附件页，输出目录: {attachment_dir}")
			
 
				                 await asyncio.to_thread(
			
 
				                     split_attachment_pages,
			
 
				                     file_path,
			
 
				                     attachment_dir,
			
 
				                     use_ocr=True,
			
 
				-                    debug=False,
			
 
				-                    table_only=request.table_only  # 是否只保留包含表格的附件页
			
 
				+                    debug=False
			
 
				                 )
			
 
				                 
			
 
				-                # 查找切割后的附件页PDF（优先使用表格附件页，其次使用普通附件页）
			
 
				-                attachment_pdfs = list(attachment_dir.glob("*_表格附件页_*.pdf"))
			
 
				-                if not attachment_pdfs:
			
 
				-                    attachment_pdfs = list(attachment_dir.glob("*_附件页_*.pdf"))
			
 
				+                # 查找切割后的附件页PDF
			
 
				+                attachment_pdfs = list(attachment_dir.glob("*_附件页_*.pdf"))
			
 
				                 logger.info(f"[任务 {task_id}] 附件页目录内容: {list(attachment_dir.iterdir()) if attachment_dir.exists() else '(目录不存在)'}")
			
 
				                 
			
 
				                 if attachment_pdfs:
			
@@ -649,44 +546,11 @@ async def process_conversion_task(
 
				 @app.post("/convert", response_model=ConversionResponse)
			
 
				 async def convert_file(
			
 
				     file: Annotated[UploadFile, File(description="上传的PDF或图片文件")],
			
 
				-    # 新增：类型参数（英文传参） noiseRec | emRec | opStatus | settlementReport | designReview | fsApproval | fsReview | pdApproval | finalAccount | safetyFsApproval
			
 
				+    # 新增：类型参数（英文传参） noiseRec | emRec | opStatus | settlementReport | designReview | fsApproval | fsReview | pdApproval | finalAccount
			
 
				     type: Annotated[
			
 
				-        Optional[Literal["noiseRec", "emRec", "opStatus", "settlementReport", "designReview", "fsApproval", "fsReview", "pdApproval", "finalAccount", "safetyFsApproval"]],
			
 
				-        Form(description="文档类型：noiseRec | emRec | opStatus | settlementReport | designReview | fsApproval | fsReview | pdApproval | finalAccount | safetyFsApproval")
			
 
				+        Optional[Literal["noiseRec", "emRec", "opStatus", "settlementReport", "designReview", "fsApproval", "fsReview", "pdApproval", "finalAccount"]],
			
 
				+        Form(description="文档类型：noiseRec | emRec | opStatus | settlementReport | designReview | fsApproval | fsReview | pdApproval | finalAccount")
			
 
				     ] = None,
			
 
				-    # 新增：去水印参数
			
 
				-    remove_watermark: Annotated[
			
 
				-        Optional[bool],
			
 
				-        Form(description="是否去除水印，默认为false")
			
 
				-    ] = False,
			
 
				-    watermark_light_threshold: Annotated[
			
 
				-        Optional[int],
			
 
				-        Form(description="水印亮度阈值（0-255），默认200，高于此值的浅色像素可能是水印")
			
 
				-    ] = 200,
			
 
				-    watermark_saturation_threshold: Annotated[
			
 
				-        Optional[int],
			
 
				-        Form(description="水印饱和度阈值（0-255），默认30，低于此值的低饱和度像素可能是水印")
			
 
				-    ] = 30,
			
 
				-    crop_header_footer: Annotated[
			
 
				-        Optional[bool],
			
 
				-        Form(description="是否裁剪页眉页脚，默认为false")
			
 
				-    ] = False,
			
 
				-    header_ratio: Annotated[
			
 
				-        Optional[float],
			
 
				-        Form(description="页眉裁剪比例（0-1），默认0.05表示裁剪顶部5%")
			
 
				-    ] = 0.05,
			
 
				-    footer_ratio: Annotated[
			
 
				-        Optional[float],
			
 
				-        Form(description="页脚裁剪比例（0-1），默认0.05表示裁剪底部5%")
			
 
				-    ] = 0.05,
			
 
				-    auto_detect_header_footer: Annotated[
			
 
				-        Optional[bool],
			
 
				-        Form(description="是否自动检测页眉页脚边界，默认为false（启用后忽略header_ratio和footer_ratio）")
			
 
				-    ] = False,
			
 
				-    table_only: Annotated[
			
 
				-        Optional[bool],
			
 
				-        Form(description="是否只保留包含表格的附件页，默认为false")
			
 
				-    ] = False,
			
 
				 ):
			
 
				     """
			
 
				     转换PDF/图片文件（异步处理）
			
@@ -707,16 +571,6 @@ async def convert_file(
 
				       * fsApproval - 可研批复投资估算
			
 
				       * fsReview - 可研评审投资估算
			
 
				       * pdApproval - 初设批复概算投资
			
 
				-      * finalAccount - 决算报告
			
 
				-      * safetyFsApproval - 安评可研批复投资估算
			
 
				-    - **remove_watermark**: 是否去除水印（仅对图片有效），默认为false
			
 
				-    - **watermark_light_threshold**: 水印亮度阈值（0-255），默认200
			
 
				-    - **watermark_saturation_threshold**: 水印饱和度阈值（0-255），默认30
			
 
				-    - **crop_header_footer**: 是否裁剪页眉页脚（仅对图片有效），默认为false
			
 
				-    - **header_ratio**: 页眉裁剪比例（0-1），默认0.05
			
 
				-    - **footer_ratio**: 页脚裁剪比例（0-1），默认0.05
			
 
				-    - **auto_detect_header_footer**: 是否自动检测页眉页脚边界，默认为false
			
 
				-    - **table_only**: 是否只保留包含表格的附件页，默认为false
			
 
				     
			
 
				     注意：v2 版本内部使用外部API进行转换，v2特有的配置参数（如API URL、backend等）
			
 
				     通过环境变量或配置文件设置，不通过API参数传入。
			
@@ -837,8 +691,6 @@ async def convert_file(
 
				         "pdApproval": "pdApproval",
			
 
				         # 决算报告
			
 
				         "finalAccount": "finalAccount",
			
 
				-        # 安评类
			
 
				-        "safetyFsApproval": "safetyFsApproval",
			
 
				     }
			
 
				     doc_type = None
			
 
				     if type:
			
@@ -854,14 +706,6 @@ async def convert_file(
 
				     # 创建请求对象（v2 精简）
			
 
				     request = ConversionRequest(
			
 
				         doc_type=doc_type,
			
 
				-        remove_watermark=remove_watermark,
			
 
				-        watermark_light_threshold=watermark_light_threshold,
			
 
				-        watermark_saturation_threshold=watermark_saturation_threshold,
			
 
				-        crop_header_footer=crop_header_footer,
			
 
				-        header_ratio=header_ratio,
			
 
				-        footer_ratio=footer_ratio,
			
 
				-        auto_detect_header_footer=auto_detect_header_footer,
			
 
				-        table_only=table_only,
			
 
				     )
			
 
				     
			
 
				     # 使用 asyncio.create_task 创建后台任务，确保立即返回
			
@@ -886,6 +730,76 @@ async def convert_file(
 
				     )
			
 
				 
			
 
				 
			
 
				+@app.post("/pdf_to_markdown")
			
 
				+async def pdf_to_markdown(
			
 
				+    file: Annotated[UploadFile, File(description="上传的 PDF 或图片文件")],
			
 
				+    backend: Annotated[
			
 
				+        Optional[Literal["mineru", "paddle"]],
			
 
				+        Form(description="识别后端：mineru 调用 MinerU file_parse，paddle 调用 PaddleOCR doc_parser")
			
 
				+    ] = "mineru",
			
 
				+    format: Annotated[
			
 
				+        Literal["file", "json"],
			
 
				+        Form(description="返回格式：file 直接返回 .md 文件下载（适合多页），json 返回 JSON 内嵌 markdown 字段（适合少页）")
			
 
				+    ] = "file",
			
 
				+):
			
 
				+    """
			
 
				+    PDF/图片转 Markdown（同步接口）
			
 
				+    直接调用 MinerU 或 PaddleOCR 进行识别，生成完整 MD 后返回。
			
 
				+    - **file**: 上传的 PDF 或图片
			
 
				+    - **backend**: mineru（默认）/ paddle
			
 
				+    - **format**: file（默认）— 直接返回 .md 文件下载，适合多页、大文本；json — 返回 JSON { "markdown", "filename" }，适合少页
			
 
				+    注意：大文件或页数多时可能较慢，建议页数不超过 20。
			
 
				+    """
			
 
				+    temp_dir = None
			
 
				+    file_path = None
			
 
				+    try:
			
 
				+        content_type = file.content_type or ""
			
 
				+        ext_map = {"application/pdf": ".pdf", "image/png": ".png", "image/jpeg": ".jpg", "image/jpg": ".jpg"}
			
 
				+        ext = ext_map.get(content_type, "") or (Path(file.filename or "").suffix if file.filename else "") or ".pdf"
			
 
				+        temp_dir = tempfile.mkdtemp(prefix="pdf_converter_v2_pdf_to_md_")
			
 
				+        file_path = os.path.join(temp_dir, f"file{ext}")
			
 
				+        content = await file.read()
			
 
				+        with open(file_path, "wb") as f:
			
 
				+            f.write(content)
			
 
				+        # 页数限制（与 /convert 一致）
			
 
				+        pages = 1
			
 
				+        if ext.lower() == ".pdf" and content:
			
 
				+            pages = max(1, content.count(b"/Type /Page"))
			
 
				+        if pages > 20:
			
 
				+            raise HTTPException(status_code=400, detail="文件页数超过 20 页，拒绝处理")
			
 
				+        output_dir = os.path.join(temp_dir, "output")
			
 
				+        os.makedirs(output_dir, exist_ok=True)
			
 
				+        api_url = os.getenv("API_URL", "http://127.0.0.1:5282")
			
 
				+        result = await convert_pdf_to_markdown_only(
			
 
				+            input_file=file_path,
			
 
				+            output_dir=output_dir,
			
 
				+            backend=backend or "mineru",
			
 
				+            url=api_url,
			
 
				+        )
			
 
				+        if not result:
			
 
				+            raise HTTPException(status_code=500, detail="PDF 转 Markdown 失败，请查看服务端日志")
			
 
				+        if format == "file":
			
 
				+            # 直接返回 .md 文件下载，避免大文本放在 JSON 里
			
 
				+            safe_filename = quote(result["filename"])
			
 
				+            return Response(
			
 
				+                content=result["markdown"],
			
 
				+                media_type="text/markdown; charset=utf-8",
			
 
				+                headers={"Content-Disposition": f'attachment; filename="{result["filename"]}"; filename*=UTF-8\'\'{safe_filename}'},
			
 
				+            )
			
 
				+        return PdfToMarkdownResponse(markdown=result["markdown"], filename=result["filename"])
			
 
				+    except HTTPException:
			
 
				+        raise
			
 
				+    except Exception as e:
			
 
				+        logger.exception(f"[pdf_to_markdown] 转换失败: {e}")
			
 
				+        raise HTTPException(status_code=500, detail=str(e))
			
 
				+    finally:
			
 
				+        if temp_dir and os.path.isdir(temp_dir):
			
 
				+            try:
			
 
				+                shutil.rmtree(temp_dir)
			
 
				+            except Exception as exc:
			
 
				+                logger.debug(f"[pdf_to_markdown] 清理临时目录失败: {exc}")
			
 
				+
			
 
				+
			
 
				 @app.get("/task/{task_id}", response_model=TaskStatus)
			
 
				 async def get_task_status(task_id: str):
			
 
				     """
			
@@ -1051,7 +965,6 @@ async def ocr_image(request: OCRRequest):
 
				     - **crop_header_footer**: 是否裁剪页眉页脚，默认为false
			
 
				     - **header_ratio**: 页眉裁剪比例（0-1），默认0.05表示裁剪顶部5%
			
 
				     - **footer_ratio**: 页脚裁剪比例（0-1），默认0.05表示裁剪底部5%
			
 
				-    - **auto_detect_header_footer**: 是否自动检测页眉页脚边界，默认为false（启用后忽略header_ratio和footer_ratio）
			
 
				     
			
 
				     返回识别出的文本列表和GPU监控信息
			
 
				     """
			
@@ -1117,10 +1030,7 @@ async def ocr_image(request: OCRRequest):
 
				                 from ..utils.image_preprocessor import crop_header_footer, check_opencv_available
			
 
				                 
			
 
				                 if check_opencv_available():
			
 
				-                    if request.auto_detect_header_footer:
			
 
				-                        logger.info("[OCR] 开始自动检测并裁剪页眉页脚")
			
 
				-                    else:
			
 
				-                        logger.info(f"[OCR] 开始裁剪页眉页脚，顶部比例: {request.header_ratio}, 底部比例: {request.footer_ratio}")
			
 
				+                    logger.info(f"[OCR] 开始裁剪页眉页脚，顶部比例: {request.header_ratio}, 底部比例: {request.footer_ratio}")
			
 
				                     
			
 
				                     # 裁剪后的图片路径
			
 
				                     cropped_image_path = os.path.join(temp_dir, f"ocr_image_cropped{ext}")
			
@@ -1129,8 +1039,7 @@ async def ocr_image(request: OCRRequest):
 
				                         image_path,
			
 
				                         output_path=cropped_image_path,
			
 
				                         header_ratio=request.header_ratio or 0.05,
			
 
				-                        footer_ratio=request.footer_ratio or 0.05,
			
 
				-                        auto_detect=request.auto_detect_header_footer or False
			
 
				+                        footer_ratio=request.footer_ratio or 0.05
			
 
				                     )
			
 
				                     logger.info(f"[OCR] 裁剪页眉页脚完成: {image_path}")
			
 
				                 else:
			
--- a/pdf_converter_v2/processor/__init__.py
+++ b/pdf_converter_v2/processor/__init__.py
@@ -2,7 +2,7 @@
 
				 
			
 
				 """PDF转换处理器模块"""
			
 
				 
			
 
				-from .converter import convert_to_markdown
			
 
				+from .converter import convert_to_markdown, convert_pdf_to_markdown_only
			
 
				 
			
 
				-__all__ = ['convert_to_markdown']
			
 
				+__all__ = ['convert_to_markdown', 'convert_pdf_to_markdown_only']
			
 
				 
			
--- a/pdf_converter_v2/processor/converter.py
+++ b/pdf_converter_v2/processor/converter.py
@@ -426,3 +426,54 @@ async def convert_to_markdown(
 
				         logger.exception(f"转换过程出错: {e}")
			
 
				         return None
			
 
				 
			
 
				+
			
 
				+async def convert_pdf_to_markdown_only(
			
 
				+    input_file: str,
			
 
				+    output_dir: str,
			
 
				+    backend: str = "mineru",
			
 
				+    url: Optional[str] = None,
			
 
				+    max_pages: int = 99999,
			
 
				+    formula_enable: bool = True,
			
 
				+    table_enable: bool = True,
			
 
				+    language: str = "ch",
			
 
				+) -> Optional[dict]:
			
 
				+    """
			
 
				+    仅将 PDF/图片 转为 Markdown 文本，不解析 JSON。
			
 
				+    用于 API 同步返回 MD 内容。
			
 
				+    :param input_file: 输入文件路径
			
 
				+    :param output_dir: 输出目录（临时使用）
			
 
				+    :param backend: "mineru" 调用 MinerU file_parse，"paddle" 调用 PaddleOCR doc_parser
			
 
				+    :param url: MinerU API 地址（backend=mineru 时使用）
			
 
				+    :return: {"markdown": str, "filename": str} 或 None
			
 
				+    """
			
 
				+    if not os.path.exists(input_file):
			
 
				+        logger.error(f"输入文件不存在: {input_file}")
			
 
				+        return None
			
 
				+    url = url or os.getenv("API_URL", "http://127.0.0.1:5282")
			
 
				+    result = None
			
 
				+    if backend == "paddle":
			
 
				+        result = await _convert_with_paddle(
			
 
				+            input_file=input_file,
			
 
				+            output_dir=output_dir,
			
 
				+            embed_images=False,
			
 
				+            output_json=False,
			
 
				+            forced_document_type=None,
			
 
				+        )
			
 
				+    else:
			
 
				+        result = await convert_to_markdown(
			
 
				+            input_file=input_file,
			
 
				+            output_dir=output_dir,
			
 
				+            max_pages=max_pages,
			
 
				+            output_json=False,
			
 
				+            formula_enable=formula_enable,
			
 
				+            table_enable=table_enable,
			
 
				+            language=language,
			
 
				+            url=url,
			
 
				+            embed_images=False,
			
 
				+        )
			
 
				+    if not result or not result.get("content"):
			
 
				+        return None
			
 
				+    md_path = result.get("markdown_file") or ""
			
 
				+    filename = Path(md_path).name if md_path else Path(input_file).stem + ".md"
			
 
				+    return {"markdown": result["content"], "filename": filename}
			
 
				+