|
|
@@ -19,6 +19,12 @@ from PIL import Image
|
|
|
from ..utils.logging_config import get_logger
|
|
|
from ..utils.file_utils import safe_stem
|
|
|
from ..utils.pdf_splitter import get_pdf_page_count, split_pdf_by_pages
|
|
|
+from ..utils.mineru_url_selector import get_next_mineru_api_url, get_mineru_api_url_list
|
|
|
+from ..utils.paddleocr_fallback import (
|
|
|
+ get_paddle_ocr_devices,
|
|
|
+ get_paddle_ocr_device_args_for_index,
|
|
|
+ _paddle_ocr_device_args,
|
|
|
+)
|
|
|
|
|
|
logger = get_logger("pdf_converter_v2.processor")
|
|
|
PADDLE_CMD = os.getenv("PADDLE_DOC_PARSER_CMD", "paddleocr")
|
|
|
@@ -42,6 +48,26 @@ async def _run_paddle_doc_parser(cmd: Sequence[str]) -> tuple[int, str, str]:
|
|
|
return process.returncode, stdout, stderr
|
|
|
|
|
|
|
|
|
def _paddle_base_cmd(input_path: str, save_path_base: str, device_args: list[str]) -> list[str]:
    """Build the PaddleOCR ``doc_parser`` command line, including device args.

    Args:
        input_path: Path of the file to parse (PDF or image).
        save_path_base: Directory where ``doc_parser`` writes its outputs
            (markdown and image assets).
        device_args: Extra CLI arguments selecting the compute device for
            this run; appended verbatim at the end of the command.

    Returns:
        The full argv list, starting with the ``PADDLE_CMD`` executable,
        suitable for passing to ``_run_paddle_doc_parser``.
    """
    return [
        PADDLE_CMD,
        "doc_parser",
        "-i",
        input_path,
        # Fixed pipeline flags: fp32 precision, document unwarping off,
        # orientation classification and chart recognition on.
        "--precision",
        "fp32",
        "--use_doc_unwarping",
        "False",
        "--use_doc_orientation_classify",
        "True",
        "--use_chart_recognition",
        "True",
        "--save_path",
        save_path_base,
        *device_args,
    ]
|
|
|
async def _convert_with_paddle(
|
|
|
input_file: str,
|
|
|
output_dir: str,
|
|
|
@@ -49,73 +75,109 @@ async def _convert_with_paddle(
|
|
|
output_json: bool,
|
|
|
forced_document_type: Optional[str],
|
|
|
):
|
|
|
- """针对工况附件使用 PaddleOCR doc_parser 直接转换"""
|
|
|
+ """针对工况附件使用 PaddleOCR doc_parser 直接转换;多卡时按页拆分并行跑满所有卡。"""
|
|
|
if not os.path.exists(input_file):
|
|
|
logger.error(f"[Paddle] 输入文件不存在: {input_file}")
|
|
|
return None
|
|
|
-
|
|
|
+
|
|
|
file_name = f'{safe_stem(Path(input_file).stem)}_{time.strftime("%y%m%d_%H%M%S")}'
|
|
|
os.makedirs(output_dir, exist_ok=True)
|
|
|
-
|
|
|
temp_dir = tempfile.mkdtemp(prefix=f"pdf_converter_paddle_{file_name}_")
|
|
|
logger.info(f"[Paddle] 创建临时目录: {temp_dir}")
|
|
|
- save_path_base = os.path.join(temp_dir, Path(input_file).stem)
|
|
|
- os.makedirs(save_path_base, exist_ok=True)
|
|
|
-
|
|
|
- cmd = [
|
|
|
- PADDLE_CMD,
|
|
|
- "doc_parser",
|
|
|
- "-i",
|
|
|
- input_file,
|
|
|
- "--precision",
|
|
|
- "fp32",
|
|
|
- "--use_doc_unwarping",
|
|
|
- "False",
|
|
|
- "--use_doc_orientation_classify",
|
|
|
- "True",
|
|
|
- "--use_chart_recognition",
|
|
|
- "True",
|
|
|
- "--save_path",
|
|
|
- save_path_base,
|
|
|
- ]
|
|
|
-
|
|
|
+
|
|
|
+ devices = get_paddle_ocr_devices()
|
|
|
+ ext = (Path(input_file).suffix or "").lower()
|
|
|
+ page_count = get_pdf_page_count(input_file) if ext == ".pdf" else 0
|
|
|
+ use_multi_card = len(devices) > 1 and ext == ".pdf" and page_count > 1
|
|
|
+
|
|
|
try:
|
|
|
- return_code, _, stderr = await _run_paddle_doc_parser(cmd)
|
|
|
- if return_code != 0:
|
|
|
- logger.error(f"[Paddle] doc_parser 执行失败 code={return_code}")
|
|
|
- if stderr:
|
|
|
- logger.error(stderr)
|
|
|
- return None
|
|
|
-
|
|
|
- md_files = sorted(Path(save_path_base).rglob("*.md"))
|
|
|
- if not md_files:
|
|
|
- logger.error("[Paddle] 未找到Markdown文件")
|
|
|
+ if use_multi_card:
|
|
|
+ # 多卡:按页拆成 N 段,每段用一张卡并行 doc_parser,再合并
|
|
|
+ chunk_size = (page_count + len(devices) - 1) // len(devices)
|
|
|
+ chunks_dir = os.path.join(temp_dir, "chunks")
|
|
|
+ os.makedirs(chunks_dir, exist_ok=True)
|
|
|
+ chunk_paths = split_pdf_by_pages(input_file, chunks_dir, chunk_size=chunk_size)
|
|
|
+ if not chunk_paths:
|
|
|
+ logger.error("[Paddle] 多卡拆分 PDF 失败")
|
|
|
+ return None
|
|
|
+ logger.info(f"[Paddle] PDF 共 {page_count} 页,拆成 {len(chunk_paths)} 段并行使用 {len(devices)} 张卡")
|
|
|
+ tasks = []
|
|
|
+ for i, chunk_path in enumerate(chunk_paths):
|
|
|
+ save_path_base_i = os.path.join(temp_dir, f"out_{i}", Path(chunk_path).stem)
|
|
|
+ os.makedirs(save_path_base_i, exist_ok=True)
|
|
|
+ cmd = _paddle_base_cmd(
|
|
|
+ chunk_path,
|
|
|
+ save_path_base_i,
|
|
|
+ get_paddle_ocr_device_args_for_index(i),
|
|
|
+ )
|
|
|
+ tasks.append(_run_paddle_doc_parser(cmd))
|
|
|
+ results = await asyncio.gather(*tasks, return_exceptions=True)
|
|
|
+ markdown_parts = []
|
|
|
+ all_save_bases = [os.path.join(temp_dir, f"out_{i}", Path(chunk_paths[i]).stem) for i in range(len(chunk_paths))]
|
|
|
+ for i, res in enumerate(results):
|
|
|
+ if isinstance(res, Exception):
|
|
|
+ logger.warning(f"[Paddle] 第 {i + 1} 段 doc_parser 异常: {res}")
|
|
|
+ continue
|
|
|
+ ret_code, _, stderr = res
|
|
|
+ if ret_code != 0:
|
|
|
+ logger.warning(f"[Paddle] 第 {i + 1} 段 doc_parser 失败: {stderr}")
|
|
|
+ continue
|
|
|
+ base = all_save_bases[i]
|
|
|
+ md_files = sorted(Path(base).rglob("*.md"))
|
|
|
+ for md_file in md_files:
|
|
|
+ async with aiofiles.open(md_file, "r", encoding="utf-8") as f:
|
|
|
+ markdown_parts.append(await f.read())
|
|
|
+ final_content = "\n\n".join(markdown_parts) if markdown_parts else ""
|
|
|
+ else:
|
|
|
+ # 单卡或非 PDF:一次 doc_parser
|
|
|
+ save_path_base = os.path.join(temp_dir, Path(input_file).stem)
|
|
|
+ os.makedirs(save_path_base, exist_ok=True)
|
|
|
+ cmd = _paddle_base_cmd(input_file, save_path_base, _paddle_ocr_device_args())
|
|
|
+ return_code, _, stderr = await _run_paddle_doc_parser(cmd)
|
|
|
+ if return_code != 0:
|
|
|
+ logger.error(f"[Paddle] doc_parser 执行失败 code={return_code}")
|
|
|
+ if stderr:
|
|
|
+ logger.error(stderr)
|
|
|
+ return None
|
|
|
+ md_files = sorted(Path(save_path_base).rglob("*.md"))
|
|
|
+ if not md_files:
|
|
|
+ logger.error("[Paddle] 未找到Markdown文件")
|
|
|
+ return None
|
|
|
+ markdown_parts = []
|
|
|
+ for md_file in md_files:
|
|
|
+ async with aiofiles.open(md_file, "r", encoding="utf-8") as f:
|
|
|
+ markdown_parts.append(await f.read())
|
|
|
+ final_content = "\n\n".join(markdown_parts)
|
|
|
+
|
|
|
+ if not final_content:
|
|
|
+ logger.error("[Paddle] 合并后无内容")
|
|
|
return None
|
|
|
-
|
|
|
- markdown_parts = []
|
|
|
- for md_file in md_files:
|
|
|
- async with aiofiles.open(md_file, "r", encoding="utf-8") as f:
|
|
|
- markdown_parts.append(await f.read())
|
|
|
- final_content = "\n\n".join(markdown_parts)
|
|
|
+
|
|
|
logger.info(f"[Paddle] 合并后的markdown长度: {len(final_content)}")
|
|
|
-
|
|
|
local_md_dir = os.path.join(output_dir, file_name, "markdown")
|
|
|
os.makedirs(local_md_dir, exist_ok=True)
|
|
|
md_path = os.path.join(local_md_dir, f"{file_name}.md")
|
|
|
async with aiofiles.open(md_path, "w", encoding="utf-8") as f:
|
|
|
await f.write(final_content)
|
|
|
-
|
|
|
output_md_path = os.path.join(output_dir, f"{file_name}.md")
|
|
|
async with aiofiles.open(output_md_path, "w", encoding="utf-8") as f:
|
|
|
await f.write(final_content)
|
|
|
-
|
|
|
+
|
|
|
if embed_images:
|
|
|
local_image_dir = os.path.join(output_dir, file_name, "images")
|
|
|
os.makedirs(local_image_dir, exist_ok=True)
|
|
|
- for asset in Path(save_path_base).rglob("*"):
|
|
|
- if asset.is_file() and asset.suffix.lower() in {".png", ".jpg", ".jpeg", ".bmp", ".tif", ".tiff"}:
|
|
|
- shutil.copy2(asset, os.path.join(local_image_dir, asset.name))
|
|
|
-
|
|
|
+ if use_multi_card:
|
|
|
+ for i in range(len(chunk_paths)):
|
|
|
+ base = os.path.join(temp_dir, f"out_{i}", Path(chunk_paths[i]).stem)
|
|
|
+ for asset in Path(base).rglob("*"):
|
|
|
+ if asset.is_file() and asset.suffix.lower() in {".png", ".jpg", ".jpeg", ".bmp", ".tif", ".tiff"}:
|
|
|
+ shutil.copy2(asset, os.path.join(local_image_dir, asset.name))
|
|
|
+ else:
|
|
|
+ base = os.path.join(temp_dir, Path(input_file).stem)
|
|
|
+ for asset in Path(base).rglob("*"):
|
|
|
+ if asset.is_file() and asset.suffix.lower() in {".png", ".jpg", ".jpeg", ".bmp", ".tif", ".tiff"}:
|
|
|
+ shutil.copy2(asset, os.path.join(local_image_dir, asset.name))
|
|
|
+
|
|
|
json_data = None
|
|
|
json_path = None
|
|
|
if output_json:
|
|
|
@@ -135,7 +197,7 @@ async def _convert_with_paddle(
|
|
|
await f.write(json.dumps(json_data, ensure_ascii=False, indent=2))
|
|
|
except Exception as exc:
|
|
|
logger.exception(f"[Paddle] JSON转换失败: {exc}")
|
|
|
-
|
|
|
+
|
|
|
return {
|
|
|
"markdown_file": output_md_path,
|
|
|
"json_file": json_path,
|
|
|
@@ -178,6 +240,8 @@ async def convert_to_markdown(
|
|
|
logger.error(f"输入文件不存在: {input_file}")
|
|
|
return None
|
|
|
|
|
|
+ url = url or get_next_mineru_api_url()
|
|
|
+
|
|
|
# 生成文件名
|
|
|
file_name = f'{safe_stem(Path(input_file).stem)}_{time.strftime("%y%m%d_%H%M%S")}'
|
|
|
|
|
|
@@ -458,7 +522,7 @@ async def convert_pdf_to_markdown_only(
|
|
|
return None
|
|
|
|
|
|
ext = (Path(input_file).suffix or "").lower()
|
|
|
- url = url or os.getenv("API_URL", "http://127.0.0.1:5282")
|
|
|
+ url = url or get_next_mineru_api_url()
|
|
|
|
|
|
# 仅对 PDF 做按页切割;图片或页数不足则单次转换
|
|
|
if ext == ".pdf":
|
|
|
@@ -466,6 +530,53 @@ async def convert_pdf_to_markdown_only(
|
|
|
if page_count <= 0:
|
|
|
logger.error(f"无法获取 PDF 页数: {input_file}")
|
|
|
return None
|
|
|
+
|
|
|
+ # MinerU 多卡:按页拆成 N 段,每段发到不同 API 实例并行转换后合并(单任务用满所有卡)
|
|
|
+ url_list = get_mineru_api_url_list()
|
|
|
+ if backend != "paddle" and len(url_list) > 1 and page_count > 1:
|
|
|
+ chunk_size = (page_count + len(url_list) - 1) // len(url_list)
|
|
|
+ chunks_dir = tempfile.mkdtemp(prefix="pdf_multi_card_", dir=output_dir)
|
|
|
+ try:
|
|
|
+ chunk_paths = split_pdf_by_pages(input_file, chunks_dir, chunk_size=chunk_size)
|
|
|
+ if not chunk_paths:
|
|
|
+ return None
|
|
|
+ logger.info(f"PDF 共 {page_count} 页,拆成 {len(chunk_paths)} 段并行发往 {len(url_list)} 个 MinerU 实例")
|
|
|
+ tasks = []
|
|
|
+ for i, chunk_path in enumerate(chunk_paths):
|
|
|
+ chunk_out = os.path.join(chunks_dir, f"out_{i}")
|
|
|
+ os.makedirs(chunk_out, exist_ok=True)
|
|
|
+ tasks.append(
|
|
|
+ convert_to_markdown(
|
|
|
+ input_file=chunk_path,
|
|
|
+ output_dir=chunk_out,
|
|
|
+ max_pages=max_pages,
|
|
|
+ output_json=False,
|
|
|
+ formula_enable=formula_enable,
|
|
|
+ table_enable=table_enable,
|
|
|
+ language=language,
|
|
|
+ url=url_list[i % len(url_list)],
|
|
|
+ embed_images=return_images,
|
|
|
+ )
|
|
|
+ )
|
|
|
+ results = await asyncio.gather(*tasks, return_exceptions=True)
|
|
|
+ parts = []
|
|
|
+ for i, r in enumerate(results):
|
|
|
+ if isinstance(r, Exception):
|
|
|
+ logger.warning(f"第 {i + 1} 段 MinerU 转换异常: {r}")
|
|
|
+ continue
|
|
|
+ if r and r.get("content"):
|
|
|
+ parts.append(r["content"])
|
|
|
+ if not parts:
|
|
|
+ return None
|
|
|
+ merged = "\n\n".join(parts)
|
|
|
+ filename = Path(input_file).stem + ".md"
|
|
|
+ return {"markdown": merged, "filename": filename}
|
|
|
+ finally:
|
|
|
+ try:
|
|
|
+ shutil.rmtree(chunks_dir, ignore_errors=True)
|
|
|
+ except Exception as e:
|
|
|
+ logger.debug(f"清理多卡临时目录失败: {e}")
|
|
|
+
|
|
|
if page_count > PDF_CHUNK_PAGES:
|
|
|
chunks_dir = tempfile.mkdtemp(prefix="pdf_chunks_", dir=output_dir)
|
|
|
try:
|