|
|
@@ -11,6 +11,7 @@ import tempfile
|
|
|
import uuid
|
|
|
import base64
|
|
|
import json
|
|
|
+import zipfile
|
|
|
from pathlib import Path
|
|
|
from typing import Optional, List
|
|
|
from urllib.parse import quote
|
|
|
@@ -23,6 +24,7 @@ from typing_extensions import Annotated, Literal
|
|
|
|
|
|
from ..processor.converter import convert_to_markdown, convert_pdf_to_markdown_only
|
|
|
from ..utils.logging_config import get_logger
|
|
|
+from ..utils.pdf_watermark_remover import remove_watermark_from_pdf, crop_header_footer_from_pdf
|
|
|
|
|
|
# 尝试导入配置,如果不存在则使用默认值
|
|
|
try:
|
|
|
@@ -239,10 +241,11 @@ async def root():
|
|
|
},
|
|
|
"endpoints": {
|
|
|
"POST /convert": "转换PDF/图片文件(异步,立即返回task_id)",
|
|
|
- "POST /pdf_to_markdown": "PDF/图片转 Markdown(同步,默认返回 .md 文件下载,format=json 可返回 JSON)",
|
|
|
+ "POST /pdf_to_markdown": "PDF/图片转 Markdown(异步,立即返回task_id,通过 task_id 查询状态并下载 .md)",
|
|
|
"GET /task/{task_id}": "查询任务状态(轮询接口)",
|
|
|
"GET /task/{task_id}/json": "直接获取JSON数据(返回JSON对象,不下载文件)",
|
|
|
"GET /download/{task_id}/markdown": "下载Markdown文件",
|
|
|
+ "GET /download/{task_id}/zip": "下载 md+图片 压缩包(需 POST /pdf_to_markdown 时 return_images=true)",
|
|
|
"GET /download/{task_id}/json": "下载JSON文件",
|
|
|
"DELETE /task/{task_id}": "删除任务及其临时文件",
|
|
|
"GET /health": "健康检查"
|
|
|
@@ -543,6 +546,109 @@ async def process_conversion_task(
|
|
|
# 这样可以方便用户查看上传的文件内容
|
|
|
|
|
|
|
|
|
+async def process_pdf_to_markdown_task(
|
|
|
+ task_id: str,
|
|
|
+ file_path: str,
|
|
|
+ output_dir: str,
|
|
|
+ backend: str,
|
|
|
+ remove_watermark: bool,
|
|
|
+ watermark_light_threshold: int,
|
|
|
+ watermark_saturation_threshold: int,
|
|
|
+ crop_header_footer: bool,
|
|
|
+ header_ratio: float,
|
|
|
+ footer_ratio: float,
|
|
|
+ return_images: bool = False,
|
|
|
+):
|
|
|
+ """后台执行 PDF/图片转 Markdown(仅转 MD,无 doc_type 等)。"""
|
|
|
+ try:
|
|
|
+ logger.info(f"[任务 {task_id}] PDF转Markdown 后台任务开始...")
|
|
|
+ task_status[task_id]["status"] = "processing"
|
|
|
+ task_status[task_id]["message"] = "正在转换 PDF/图片为 Markdown..."
|
|
|
+
|
|
|
+ ext = (Path(file_path).suffix or "").lower()
|
|
|
+ is_pdf = ext == ".pdf"
|
|
|
+ current_path = file_path
|
|
|
+
|
|
|
+ if is_pdf and remove_watermark:
|
|
|
+ next_path = os.path.join(os.path.dirname(output_dir), "no_watermark.pdf")
|
|
|
+ ok = await asyncio.to_thread(
|
|
|
+ remove_watermark_from_pdf,
|
|
|
+ current_path,
|
|
|
+ next_path,
|
|
|
+ light_threshold=watermark_light_threshold,
|
|
|
+ saturation_threshold=watermark_saturation_threshold,
|
|
|
+ )
|
|
|
+ if ok:
|
|
|
+ current_path = next_path
|
|
|
+ else:
|
|
|
+ logger.warning(f"[任务 {task_id}] 去水印失败,使用原文件继续")
|
|
|
+ if is_pdf and crop_header_footer:
|
|
|
+ next_path = os.path.join(os.path.dirname(output_dir), "cropped.pdf")
|
|
|
+ ok = await asyncio.to_thread(
|
|
|
+ crop_header_footer_from_pdf,
|
|
|
+ current_path,
|
|
|
+ next_path,
|
|
|
+ header_ratio=header_ratio,
|
|
|
+ footer_ratio=footer_ratio,
|
|
|
+ )
|
|
|
+ if ok:
|
|
|
+ current_path = next_path
|
|
|
+ else:
|
|
|
+ logger.warning(f"[任务 {task_id}] 页眉页脚裁剪失败,使用原文件继续")
|
|
|
+
|
|
|
+ api_url = os.getenv("API_URL", "http://127.0.0.1:5282")
|
|
|
+ result = await convert_pdf_to_markdown_only(
|
|
|
+ input_file=current_path,
|
|
|
+ output_dir=output_dir,
|
|
|
+ backend=backend,
|
|
|
+ url=api_url,
|
|
|
+ return_images=return_images,
|
|
|
+ )
|
|
|
+ if not result:
|
|
|
+ task_status[task_id]["status"] = "failed"
|
|
|
+ task_status[task_id]["message"] = "转换失败"
|
|
|
+ task_status[task_id]["error"] = "PDF 转 Markdown 返回空"
|
|
|
+ logger.error(f"[任务 {task_id}] PDF 转 Markdown 返回空")
|
|
|
+ return
|
|
|
+
|
|
|
+ md_content = result.get("markdown", "")
|
|
|
+ filename = result.get("filename", "output.md")
|
|
|
+ if not filename.endswith(".md"):
|
|
|
+ filename = filename + ".md"
|
|
|
+ markdown_file_path = os.path.join(output_dir, filename)
|
|
|
+ with open(markdown_file_path, "w", encoding="utf-8") as f:
|
|
|
+ f.write(md_content)
|
|
|
+
|
|
|
+ task_status[task_id]["status"] = "completed"
|
|
|
+ task_status[task_id]["message"] = "转换成功"
|
|
|
+ task_status[task_id]["markdown_file"] = markdown_file_path
|
|
|
+ task_status[task_id]["json_data"] = {"markdown": md_content, "filename": filename}
|
|
|
+ task_status[task_id]["document_type"] = None
|
|
|
+
|
|
|
+ if return_images:
|
|
|
+ zip_path = os.path.join(output_dir, "markdown_with_images.zip")
|
|
|
+ try:
|
|
|
+ with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zf:
|
|
|
+ for root, _, files in os.walk(output_dir):
|
|
|
+ for f in files:
|
|
|
+ if f == "markdown_with_images.zip":
|
|
|
+ continue
|
|
|
+ abs_path = os.path.join(root, f)
|
|
|
+ arcname = os.path.relpath(abs_path, output_dir)
|
|
|
+ zf.write(abs_path, arcname)
|
|
|
+ task_status[task_id]["zip_file"] = zip_path
|
|
|
+ logger.info(f"[任务 {task_id}] 已打包 md+图片: {zip_path}")
|
|
|
+ except Exception as e:
|
|
|
+ logger.warning(f"[任务 {task_id}] 打包 zip 失败: {e}")
|
|
|
+
|
|
|
+ logger.info(f"[任务 {task_id}] PDF 转 Markdown 完成: {markdown_file_path}")
|
|
|
+ except Exception as e:
|
|
|
+ task_status[task_id]["status"] = "failed"
|
|
|
+ task_status[task_id]["message"] = f"处理出错: {str(e)}"
|
|
|
+ task_status[task_id]["error"] = str(e)
|
|
|
+ logger.exception(f"[任务 {task_id}] PDF 转 Markdown 失败: {e}")
|
|
|
+
|
|
|
+
|
|
|
@app.post("/convert", response_model=ConversionResponse)
|
|
|
async def convert_file(
|
|
|
file: Annotated[UploadFile, File(description="上传的PDF或图片文件")],
|
|
|
@@ -631,7 +737,7 @@ async def convert_file(
|
|
|
except Exception as e:
|
|
|
raise HTTPException(status_code=500, detail=f"保存文件失败: {str(e)}")
|
|
|
|
|
|
- # 计算页数并限制:>20页直接报错;图片按1页处理
|
|
|
+ # 计算页数并限制:>300页直接报错;图片按1页处理
|
|
|
try:
|
|
|
suffix = (Path(file_path).suffix or "").lower()
|
|
|
pages = 1
|
|
|
@@ -648,13 +754,13 @@ async def convert_file(
|
|
|
else:
|
|
|
# 常见图片格式视为单页
|
|
|
pages = 1
|
|
|
- if pages > 20:
|
|
|
+ if pages > 300:
|
|
|
# 清理临时目录后报错
|
|
|
try:
|
|
|
shutil.rmtree(temp_dir)
|
|
|
except Exception:
|
|
|
pass
|
|
|
- raise HTTPException(status_code=400, detail="文件页数超过20页,拒绝处理")
|
|
|
+ raise HTTPException(status_code=400, detail="文件页数超过300页,拒绝处理")
|
|
|
logger.info(f"[任务 {task_id}] 页数评估: {pages}")
|
|
|
except HTTPException:
|
|
|
raise
|
|
|
@@ -730,83 +836,181 @@ async def convert_file(
|
|
|
)
|
|
|
|
|
|
|
|
|
+PDF_TO_MARKDOWN_DESCRIPTION = """
|
|
|
+将 **PDF 或图片** 转为纯 Markdown 文本的异步接口。提交后立即返回 `task_id`,不等待转换完成。
|
|
|
+
|
|
|
+## 调用流程
|
|
|
+
|
|
|
+1. **POST 本接口**:上传文件(`multipart/form-data`),请求体需包含 `file` 及可选表单项;响应返回 `task_id`、`status: "pending"`。
|
|
|
+2. **轮询状态**:**GET /task/{task_id}**,直到 `status` 为 `completed` 或 `failed`。
|
|
|
+3. **获取结果**(仅当 `status == "completed"` 时):
|
|
|
+ - **GET /download/{task_id}/markdown**:下载生成的 `.md` 文件;
|
|
|
+ - **GET /task/{task_id}/json**:获取 JSON `{ "markdown": "全文", "filename": "xxx.md" }`;
|
|
|
+ - 若提交时传了 **return_images=true**:**GET /download/{task_id}/zip** 下载 Markdown + 图片压缩包。
|
|
|
+
|
|
|
+## 参数说明
|
|
|
+
|
|
|
+| 参数 | 类型 | 必填 | 说明 |
|
|
|
+|------|------|------|------|
|
|
|
+| file | file | 是 | PDF 或图片文件(如 PNG、JPG)。 |
|
|
|
+| backend | string | 否 | 识别引擎:`mineru`(默认,MinerU file_parse)或 `paddle`(PaddleOCR doc_parser)。 |
|
|
|
+| remove_watermark | boolean | 否 | 是否先对 PDF 去水印,默认 `false`,仅对 PDF 生效。 |
|
|
|
+| watermark_light_threshold | integer | 否 | 去水印亮度阈值 0–255,默认 200。 |
|
|
|
+| watermark_saturation_threshold | integer | 否 | 去水印饱和度阈值 0–255,默认 30。 |
|
|
|
+| crop_header_footer | boolean | 否 | 是否裁剪页眉页脚,默认 `false`,仅对 PDF 生效。 |
|
|
|
+| header_ratio | number | 否 | 页眉裁剪比例 0–1,如 0.05 表示裁掉顶部 5%,默认 0.05。 |
|
|
|
+| footer_ratio | number | 否 | 页脚裁剪比例 0–1,默认 0.05。 |
|
|
|
+| return_images | boolean | 否 | 是否同时拉取并保存图片;为 `true` 时完成后可下载 zip(md+图片),默认 `false`。 |
|
|
|
+
|
|
|
+## 限制与说明
|
|
|
+
|
|
|
+- **页数**:单文件不超过 300 页,超过将返回 400。
|
|
|
+- **大 PDF**:超过 50 页会按 50 页一段切割后分别转换再合并 MD,以降低 MinerU 端内存占用。
|
|
|
+- 去水印、裁剪页眉页脚仅对 **PDF** 生效,图片类型会忽略这些参数。
|
|
|
+"""
|
|
|
+
|
|
|
+
|
|
|
@app.post(
|
|
|
"/pdf_to_markdown",
|
|
|
tags=["PDF转Markdown"],
|
|
|
- summary="PDF/图片转 Markdown(同步)",
|
|
|
+ summary="PDF/图片转 Markdown(异步)",
|
|
|
+ description=PDF_TO_MARKDOWN_DESCRIPTION,
|
|
|
+ response_model=ConversionResponse,
|
|
|
responses={
|
|
|
- 200: {"description": "format=file 时返回 .md 文件;format=json 时返回 JSON { markdown, filename }"},
|
|
|
- 400: {"description": "文件页数超过 20 页"},
|
|
|
- 500: {"description": "转换失败"},
|
|
|
+ 200: {
|
|
|
+ "description": "成功创建任务,返回 task_id。需用 GET /task/{task_id} 轮询,完成后通过 GET /download/{task_id}/markdown 或 GET /task/{task_id}/json 获取结果;若传了 return_images=true 还可通过 GET /download/{task_id}/zip 下载 md+图片包。",
|
|
|
+ "content": {
|
|
|
+ "application/json": {
|
|
|
+ "example": {
|
|
|
+ "task_id": "550e8400-e29b-41d4-a716-446655440000",
|
|
|
+ "status": "pending",
|
|
|
+ "message": "任务已创建,请使用 GET /task/{task_id} 查询状态,完成后通过 GET /download/{task_id}/markdown 或 GET /task/{task_id}/json 获取结果",
|
|
|
+ "markdown_file": None,
|
|
|
+ "json_file": None,
|
|
|
+ "document_type": None,
|
|
|
+ }
|
|
|
+ }
|
|
|
+ },
|
|
|
+ },
|
|
|
+ 400: {
|
|
|
+ "description": "请求非法:例如文件页数超过 300 页。",
|
|
|
+ "content": {
|
|
|
+ "application/json": {"example": {"detail": "文件页数超过 300 页,拒绝处理"}}
|
|
|
+ },
|
|
|
+ },
|
|
|
+ 500: {
|
|
|
+ "description": "服务端错误:如保存上传文件失败、转换过程异常等。",
|
|
|
+ "content": {
|
|
|
+ "application/json": {"example": {"detail": "保存文件失败: ..."}}
|
|
|
+ },
|
|
|
+ },
|
|
|
},
|
|
|
)
|
|
|
async def pdf_to_markdown(
|
|
|
- file: Annotated[UploadFile, File(description="上传的 PDF 或图片文件")],
|
|
|
+ file: Annotated[UploadFile, File(description="上传的 PDF 或图片文件(必填)")],
|
|
|
backend: Annotated[
|
|
|
Optional[Literal["mineru", "paddle"]],
|
|
|
- Form(description="识别后端:mineru 调用 MinerU file_parse,paddle 调用 PaddleOCR doc_parser")
|
|
|
+ Form(description="识别后端:mineru = MinerU file_parse(默认);paddle = PaddleOCR doc_parser"),
|
|
|
] = "mineru",
|
|
|
- format: Annotated[
|
|
|
- Literal["file", "json"],
|
|
|
- Form(description="返回格式:file 直接返回 .md 文件下载(适合多页),json 返回 JSON 内嵌 markdown 字段(适合少页)")
|
|
|
- ] = "file",
|
|
|
+ remove_watermark: Annotated[
|
|
|
+ bool,
|
|
|
+ Form(description="是否先对 PDF 去水印,仅对 PDF 生效,默认 false"),
|
|
|
+ ] = False,
|
|
|
+ watermark_light_threshold: Annotated[
|
|
|
+ int,
|
|
|
+ Form(description="去水印亮度阈值 0–255,高于此值的浅色像素视为水印,默认 200"),
|
|
|
+ ] = 200,
|
|
|
+ watermark_saturation_threshold: Annotated[
|
|
|
+ int,
|
|
|
+ Form(description="去水印饱和度阈值 0–255,低于此值的低饱和度像素视为水印,默认 30"),
|
|
|
+ ] = 30,
|
|
|
+ crop_header_footer: Annotated[
|
|
|
+ bool,
|
|
|
+ Form(description="是否裁剪 PDF 页眉页脚,仅对 PDF 生效,默认 false"),
|
|
|
+ ] = False,
|
|
|
+ header_ratio: Annotated[
|
|
|
+ float,
|
|
|
+ Form(description="页眉裁剪比例 0–1,如 0.05 表示裁掉顶部 5%,默认 0.05"),
|
|
|
+ ] = 0.05,
|
|
|
+ footer_ratio: Annotated[
|
|
|
+ float,
|
|
|
+ Form(description="页脚裁剪比例 0–1,如 0.05 表示裁掉底部 5%,默认 0.05"),
|
|
|
+ ] = 0.05,
|
|
|
+ return_images: Annotated[
|
|
|
+ bool,
|
|
|
+ Form(description="是否同时拉取并保存图片;为 true 时完成后可通过 GET /download/{task_id}/zip 下载 md+图片 压缩包,默认 false"),
|
|
|
+ ] = False,
|
|
|
):
|
|
|
- """
|
|
|
- PDF/图片转 Markdown(同步接口)
|
|
|
- 直接调用 MinerU 或 PaddleOCR 进行识别,生成完整 MD 后返回。
|
|
|
- - **file**: 上传的 PDF 或图片
|
|
|
- - **backend**: mineru(默认)/ paddle
|
|
|
- - **format**: file(默认)— 直接返回 .md 文件下载,适合多页、大文本;json — 返回 JSON { "markdown", "filename" },适合少页
|
|
|
- 注意:大文件或页数多时可能较慢,建议页数不超过 20。
|
|
|
- """
|
|
|
- temp_dir = None
|
|
|
- file_path = None
|
|
|
+ """PDF/图片转 Markdown(异步):提交后立即返回 task_id,轮询 GET /task/{task_id} 后通过 GET /download/{task_id}/markdown 或 /zip 获取结果。"""
|
|
|
+ task_id = str(uuid.uuid4())
|
|
|
+ content_type = file.content_type or ""
|
|
|
+ ext_map = {"application/pdf": ".pdf", "image/png": ".png", "image/jpeg": ".jpg", "image/jpg": ".jpg"}
|
|
|
+ ext = ext_map.get(content_type, "") or (Path(file.filename or "").suffix if file.filename else "") or ".pdf"
|
|
|
+ temp_dir = tempfile.mkdtemp(prefix=f"pdf_converter_v2_{task_id}_")
|
|
|
+ file_path = os.path.join(temp_dir, f"file{ext}")
|
|
|
try:
|
|
|
- content_type = file.content_type or ""
|
|
|
- ext_map = {"application/pdf": ".pdf", "image/png": ".png", "image/jpeg": ".jpg", "image/jpg": ".jpg"}
|
|
|
- ext = ext_map.get(content_type, "") or (Path(file.filename or "").suffix if file.filename else "") or ".pdf"
|
|
|
- temp_dir = tempfile.mkdtemp(prefix="pdf_converter_v2_pdf_to_md_")
|
|
|
- file_path = os.path.join(temp_dir, f"file{ext}")
|
|
|
content = await file.read()
|
|
|
with open(file_path, "wb") as f:
|
|
|
f.write(content)
|
|
|
- # 页数限制(与 /convert 一致)
|
|
|
- pages = 1
|
|
|
- if ext.lower() == ".pdf" and content:
|
|
|
- pages = max(1, content.count(b"/Type /Page"))
|
|
|
- if pages > 20:
|
|
|
- raise HTTPException(status_code=400, detail="文件页数超过 20 页,拒绝处理")
|
|
|
- output_dir = os.path.join(temp_dir, "output")
|
|
|
- os.makedirs(output_dir, exist_ok=True)
|
|
|
- api_url = os.getenv("API_URL", "http://127.0.0.1:5282")
|
|
|
- result = await convert_pdf_to_markdown_only(
|
|
|
- input_file=file_path,
|
|
|
+ except Exception as e:
|
|
|
+ try:
|
|
|
+ shutil.rmtree(temp_dir)
|
|
|
+ except Exception:
|
|
|
+ pass
|
|
|
+ raise HTTPException(status_code=500, detail=f"保存文件失败: {str(e)}")
|
|
|
+
|
|
|
+ pages = 1
|
|
|
+ if ext.lower() == ".pdf" and content:
|
|
|
+ pages = max(1, content.count(b"/Type /Page"))
|
|
|
+ if pages > 300:
|
|
|
+ try:
|
|
|
+ shutil.rmtree(temp_dir)
|
|
|
+ except Exception:
|
|
|
+ pass
|
|
|
+ raise HTTPException(status_code=400, detail="文件页数超过 300 页,拒绝处理")
|
|
|
+
|
|
|
+ output_dir = os.path.join(temp_dir, "output")
|
|
|
+ os.makedirs(output_dir, exist_ok=True)
|
|
|
+
|
|
|
+ task_status[task_id] = {
|
|
|
+ "status": "pending",
|
|
|
+ "message": "任务已创建",
|
|
|
+ "progress": 0.0,
|
|
|
+ "markdown_file": None,
|
|
|
+ "json_file": None,
|
|
|
+ "json_data": None,
|
|
|
+ "document_type": None,
|
|
|
+ "error": None,
|
|
|
+ "temp_dir": temp_dir,
|
|
|
+ "output_dir": output_dir,
|
|
|
+ "file_path": file_path,
|
|
|
+ "zip_file": None,
|
|
|
+ }
|
|
|
+
|
|
|
+ asyncio.create_task(
|
|
|
+ process_pdf_to_markdown_task(
|
|
|
+ task_id=task_id,
|
|
|
+ file_path=file_path,
|
|
|
output_dir=output_dir,
|
|
|
backend=backend or "mineru",
|
|
|
- url=api_url,
|
|
|
+ remove_watermark=remove_watermark,
|
|
|
+ watermark_light_threshold=watermark_light_threshold,
|
|
|
+ watermark_saturation_threshold=watermark_saturation_threshold,
|
|
|
+ crop_header_footer=crop_header_footer,
|
|
|
+ header_ratio=header_ratio,
|
|
|
+ footer_ratio=footer_ratio,
|
|
|
+ return_images=return_images,
|
|
|
)
|
|
|
- if not result:
|
|
|
- raise HTTPException(status_code=500, detail="PDF 转 Markdown 失败,请查看服务端日志")
|
|
|
- if format == "file":
|
|
|
- # 直接返回 .md 文件下载,避免大文本放在 JSON 里
|
|
|
- safe_filename = quote(result["filename"])
|
|
|
- return Response(
|
|
|
- content=result["markdown"],
|
|
|
- media_type="text/markdown; charset=utf-8",
|
|
|
- headers={"Content-Disposition": f'attachment; filename="{result["filename"]}"; filename*=UTF-8\'\'{safe_filename}'},
|
|
|
- )
|
|
|
- return PdfToMarkdownResponse(markdown=result["markdown"], filename=result["filename"])
|
|
|
- except HTTPException:
|
|
|
- raise
|
|
|
- except Exception as e:
|
|
|
- logger.exception(f"[pdf_to_markdown] 转换失败: {e}")
|
|
|
- raise HTTPException(status_code=500, detail=str(e))
|
|
|
- finally:
|
|
|
- if temp_dir and os.path.isdir(temp_dir):
|
|
|
- try:
|
|
|
- shutil.rmtree(temp_dir)
|
|
|
- except Exception as exc:
|
|
|
- logger.debug(f"[pdf_to_markdown] 清理临时目录失败: {exc}")
|
|
|
+ )
|
|
|
+ logger.info(f"[任务 {task_id}] PDF 转 Markdown 任务已创建,立即返回 task_id")
|
|
|
+ return ConversionResponse(
|
|
|
+ task_id=task_id,
|
|
|
+ status="pending",
|
|
|
+ message="任务已创建,请使用 GET /task/{task_id} 查询状态,完成后通过 GET /download/{task_id}/markdown 或 GET /task/{task_id}/json 获取结果",
|
|
|
+ markdown_file=None,
|
|
|
+ json_file=None,
|
|
|
+ document_type=None,
|
|
|
+ )
|
|
|
|
|
|
|
|
|
@app.get("/task/{task_id}", response_model=TaskStatus)
|
|
|
@@ -872,6 +1076,31 @@ async def download_markdown(task_id: str):
|
|
|
)
|
|
|
|
|
|
|
|
|
+@app.get("/download/{task_id}/zip")
|
|
|
+async def download_zip(task_id: str):
|
|
|
+ """
|
|
|
+ 下载 Markdown + 图片 压缩包(仅当提交任务时传了 return_images=true 时存在)
|
|
|
+
|
|
|
+ - **task_id**: 任务ID
|
|
|
+ """
|
|
|
+ if task_id not in task_status:
|
|
|
+ raise HTTPException(status_code=404, detail="任务不存在")
|
|
|
+ status_info = task_status[task_id]
|
|
|
+ if status_info["status"] != "completed":
|
|
|
+ raise HTTPException(status_code=400, detail="任务尚未完成")
|
|
|
+ zip_file = status_info.get("zip_file")
|
|
|
+ if not zip_file or not os.path.exists(zip_file):
|
|
|
+ raise HTTPException(
|
|
|
+ status_code=404,
|
|
|
+ detail="未生成 zip(请在 POST /pdf_to_markdown 时传 return_images=true)",
|
|
|
+ )
|
|
|
+ return FileResponse(
|
|
|
+ zip_file,
|
|
|
+ media_type="application/zip",
|
|
|
+ filename="markdown_with_images.zip",
|
|
|
+ )
|
|
|
+
|
|
|
+
|
|
|
@app.get("/task/{task_id}/json")
|
|
|
async def get_json(task_id: str):
|
|
|
"""
|