|
|
@@ -49,7 +49,10 @@ except ImportError:
|
|
|
|
|
|
try:
|
|
|
from paddleocr import PaddleOCRVL
|
|
|
- PADDLEOCR_API_AVAILABLE = True
|
|
|
+ # Temporarily disable the Python API: while the VLLM engine is running there is not enough GPU memory (about 11 GB is needed)
|
|
|
+ # Re-enable once VLLM's GPU memory utilization is lowered or a dedicated GPU is used
|
|
|
+ PADDLEOCR_API_AVAILABLE = False  # NOTE(review): the flag is now False on both the import-success and ImportError paths
|
|
|
+ logger.info("[PaddleOCR备用] Python API 已禁用(显存不足),使用命令行方式")
|
|
|
except ImportError:
|
|
|
PADDLEOCR_API_AVAILABLE = False
|
|
|
logger.warning("[PaddleOCR备用] paddleocr Python API 未安装,将使用命令行方式")
|
|
|
@@ -225,6 +228,14 @@ def _call_paddleocr_api(
|
|
|
Returns:
|
|
|
(是否成功, markdown 文件路径)
|
|
|
"""
|
|
|
+ import signal
|
|
|
+
|
|
|
+ class TimeoutError(Exception):  # NOTE(review): this local class shadows the built-in TimeoutError inside the function
|
|
|
+ pass
|
|
|
+
|
|
|
+ def timeout_handler(signum, frame):  # registered further down as the SIGALRM handler; turns the alarm into an exception
|
|
|
+ raise TimeoutError("PaddleOCR predict 超时")
|
|
|
+
|
|
|
try:
|
|
|
if not os.path.exists(image_path):
|
|
|
logger.error(f"[PaddleOCR API] 图片文件不存在: {image_path}")
|
|
|
@@ -236,13 +247,26 @@ def _call_paddleocr_api(
|
|
|
pipeline = _get_paddleocr_pipeline(use_chart_recognition, use_layout_detection)
|
|
|
|
|
|
logger.info(f"[PaddleOCR API] 开始处理: {image_path}")
|
|
|
+ logger.info(f"[PaddleOCR API] Pipeline 类型: {type(pipeline)}")
|
|
|
|
|
|
- # 执行识别
|
|
|
- result = pipeline.predict(input=image_path)
|
|
|
+ # Run recognition with a 60-second timeout (signal.alarm / SIGALRM: Unix only, and only usable from the main thread)
|
|
|
+ logger.info(f"[PaddleOCR API] 调用 predict 方法...")
|
|
|
+ signal.signal(signal.SIGALRM, timeout_handler)
|
|
|
+ signal.alarm(60) # 60 秒超时
|
|
|
+
|
|
|
+ try:
|
|
|
+ result = pipeline.predict(input=image_path)
|
|
|
+ signal.alarm(0) # 取消超时
|
|
|
+ logger.info(f"[PaddleOCR API] predict 返回,结果类型: {type(result)}")
|
|
|
+ except TimeoutError:
|
|
|
+ signal.alarm(0)
|
|
|
+ logger.error(f"[PaddleOCR API] predict 超时(60秒),可能显存不足或模型加载失败")
|
|
|
+ return False, None
|
|
|
|
|
|
# 保存结果
|
|
|
image_basename = os.path.splitext(os.path.basename(image_path))[0]
|
|
|
|
|
|
+ logger.info(f"[PaddleOCR API] 开始处理结果...")
|
|
|
for item in result:
|
|
|
if hasattr(item, 'save'):
|
|
|
item_save_path = os.path.join(save_path, image_basename)
|