|
|
@@ -55,6 +55,12 @@ except ImportError:
|
|
|
PADDLEOCR_API_AVAILABLE = False
|
|
|
logger.warning("[PaddleOCR备用] paddleocr Python API 未安装")
|
|
|
|
|
|
# Invocation method selection (PADDLEOCR_CALL_METHOD environment variable):
# - "cli": run the command line tool (paddleocr doc_parser)
# - "wrapper": run the standalone script (paddleocr_wrapper.py)
# Defaults to "cli". The value is stripped and lower-cased so that inputs such
# as "Wrapper" or " wrapper " compare equal to the documented names; an empty
# value falls back to the default.
PADDLEOCR_CALL_METHOD = (
    os.environ.get("PADDLEOCR_CALL_METHOD", "cli").strip().lower() or "cli"
)

# PaddleOCR VL pipeline singletons (avoids repeated initialization)
_PADDLEOCR_PIPELINE_CACHE: Dict[tuple, Any] = {}
|
|
|
|
|
|
@@ -285,7 +291,11 @@ def _call_paddleocr_api(
|
|
|
use_chart_recognition: bool = False,
|
|
|
use_layout_detection: bool = False
|
|
|
) -> tuple[bool, Optional[str]]:
|
|
|
- """通过独立脚本调用 PaddleOCR VL(避免显存共享问题)
|
|
|
+ """调用 PaddleOCR VL 进行识别
|
|
|
+
|
|
|
+ 支持两种调用方式(通过 PADDLEOCR_CALL_METHOD 环境变量控制):
|
|
|
+ - "cli": 使用命令行方式(paddleocr doc_parser)- 默认
|
|
|
+ - "wrapper": 使用独立脚本方式(paddleocr_wrapper.py)- 避免显存共享
|
|
|
|
|
|
如果 GPU 显存为 16GB,会自动停止 MinerU 服务以释放显存,
|
|
|
PaddleOCR 处理完成后再重启 MinerU 服务。
|
|
|
@@ -303,7 +313,7 @@ def _call_paddleocr_api(
|
|
|
|
|
|
try:
|
|
|
if not os.path.exists(image_path):
|
|
|
- logger.error(f"[PaddleOCR Wrapper] 图片文件不存在: {image_path}")
|
|
|
+ logger.error(f"[PaddleOCR API] 图片文件不存在: {image_path}")
|
|
|
return False, None
|
|
|
|
|
|
os.makedirs(save_path, exist_ok=True)
|
|
|
@@ -311,13 +321,100 @@ def _call_paddleocr_api(
|
|
|
# 检测 GPU 显存,如果是 16GB 则停止 MinerU 服务
|
|
|
gpu_memory = _get_gpu_memory_total()
|
|
|
if gpu_memory and gpu_memory <= 16:
|
|
|
- logger.info(f"[PaddleOCR Wrapper] 检测到 GPU 显存为 {gpu_memory}GB,停止 MinerU 服务以释放显存")
|
|
|
+ logger.info(f"[PaddleOCR API] 检测到 GPU 显存为 {gpu_memory}GB,停止 MinerU 服务以释放显存")
|
|
|
if _stop_mineru_service():
|
|
|
mineru_stopped = True
|
|
|
# 等待服务完全停止
|
|
|
import time
|
|
|
time.sleep(3)
|
|
|
|
|
|
+ # 根据配置选择调用方式
|
|
|
+ if PADDLEOCR_CALL_METHOD == "wrapper":
|
|
|
+ # 使用独立脚本方式
|
|
|
+ return _call_paddleocr_wrapper(image_path, save_path, use_chart_recognition, use_layout_detection)
|
|
|
+ else:
|
|
|
+ # 使用命令行方式(默认)
|
|
|
+ return _call_paddleocr_cli(image_path, save_path, use_chart_recognition, use_layout_detection)
|
|
|
+
|
|
|
+ except Exception as e:
|
|
|
+ logger.error(f"[PaddleOCR API] 处理失败: {e}")
|
|
|
+ import traceback
|
|
|
+ logger.error(traceback.format_exc())
|
|
|
+ return False, None
|
|
|
+ finally:
|
|
|
+ # 如果停止了 MinerU 服务,需要重新启动
|
|
|
+ if mineru_stopped:
|
|
|
+ logger.info("[PaddleOCR API] PaddleOCR 处理完成,重启 MinerU 服务")
|
|
|
+ _start_mineru_service()
|
|
|
+
|
|
|
+
|
|
|
def _call_paddleocr_cli(
    image_path: str,
    save_path: str,
    use_chart_recognition: bool = False,
    use_layout_detection: bool = False,
    timeout: int = 120,
) -> tuple[bool, Optional[str]]:
    """Run PaddleOCR through its command line interface (``paddleocr doc_parser``).

    Args:
        image_path: Path of the image to recognize.
        save_path: Output directory; results are written under
            ``<save_path>/<image basename without extension>``.
        use_chart_recognition: Forwarded as ``--use_chart_recognition``.
        use_layout_detection: Forwarded as ``--use_layout_detection``.
        timeout: Maximum number of seconds to wait for the subprocess.

    Returns:
        ``(True, markdown_path)`` when the command exits with code 0 and a
        markdown output file is found, otherwise ``(False, None)``. All
        exceptions (including timeouts) are logged and reported as
        ``(False, None)``; nothing is raised to the caller.
    """
    try:
        image_basename = os.path.splitext(os.path.basename(image_path))[0]
        save_path_base = os.path.join(save_path, image_basename)

        # Build the command as an argument list (no shell involved).
        cmd = [
            _get_paddleocr_executable(), "doc_parser", "-i", image_path,
            "--precision", "fp32",
            "--use_doc_unwarping", "False",
            "--use_doc_orientation_classify", "False",
            "--use_chart_recognition", "True" if use_chart_recognition else "False",
            "--use_layout_detection", "True" if use_layout_detection else "False",
            "--save_path", save_path_base,
        ]
        cmd.extend(_paddle_ocr_device_args())

        logger.info(f"[PaddleOCR CLI] 执行命令: {' '.join(cmd)}")

        env = _get_paddleocr_subprocess_env()
        result = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            timeout=timeout,
            check=False,
            env=env,
        )

        if result.returncode != 0:
            logger.error(f"[PaddleOCR CLI] 执行失败,返回码: {result.returncode}")
            if result.stderr:
                logger.error(f"[PaddleOCR CLI] 错误输出: {result.stderr}")
            return False, None

        # Locate the generated markdown file. The original location is tried
        # first; the second candidate covers the case where the tool treats
        # --save_path as a directory and writes "<stem>.md" inside it.
        # NOTE(review): confirm the layout against the installed PaddleOCR version.
        candidates = (
            save_path_base + ".md",
            os.path.join(save_path_base, image_basename + ".md"),
        )
        for markdown_file in candidates:
            if os.path.isfile(markdown_file):
                logger.info(f"[PaddleOCR CLI] 成功,markdown 文件: {markdown_file}")
                return True, markdown_file

        logger.error(f"[PaddleOCR CLI] 未找到 markdown 输出文件: {', '.join(candidates)}")
        return False, None

    except subprocess.TimeoutExpired:
        # Report the timeout that was actually used rather than a fixed number.
        logger.error(f"[PaddleOCR CLI] 执行超时({timeout}秒)")
        return False, None
    except Exception as e:
        logger.error(f"[PaddleOCR CLI] 处理失败: {e}")
        import traceback
        logger.error(traceback.format_exc())
        return False, None
|
|
|
+
|
|
|
+
|
|
|
+def _call_paddleocr_wrapper(
|
|
|
+ image_path: str,
|
|
|
+ save_path: str,
|
|
|
+ use_chart_recognition: bool = False,
|
|
|
+ use_layout_detection: bool = False
|
|
|
+) -> tuple[bool, Optional[str]]:
|
|
|
+ """使用独立脚本方式调用 PaddleOCR"""
|
|
|
+ try:
|
|
|
# 获取 wrapper 脚本路径
|
|
|
current_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
|
|
wrapper_script = os.path.join(current_dir, "paddleocr_wrapper.py")
|
|
|
@@ -380,11 +477,6 @@ def _call_paddleocr_api(
|
|
|
import traceback
|
|
|
logger.error(traceback.format_exc())
|
|
|
return False, None
|
|
|
- finally:
|
|
|
- # 如果停止了 MinerU 服务,需要重新启动
|
|
|
- if mineru_stopped:
|
|
|
- logger.info("[PaddleOCR Wrapper] PaddleOCR 处理完成,重启 MinerU 服务")
|
|
|
- _start_mineru_service()
|
|
|
|
|
|
|
|
|
def has_recognition_garbage(text: str, min_repeat: int = 10) -> bool:
|