|
|
@@ -15,6 +15,13 @@ import ast
|
|
|
import re
|
|
|
|
|
|
from ..utils.logging_config import get_logger
|
|
|
+from ..config import (
|
|
|
+ PADDLEOCR_CMD as _PADDLEOCR_CMD,
|
|
|
+ PADDLE_OCR_DEVICE as _PADDLE_OCR_DEVICE,
|
|
|
+ PADDLE_OCR_DEVICES as _PADDLE_OCR_DEVICES_CONFIG,
|
|
|
+ VL_REC_BACKEND,
|
|
|
+ VL_REC_SERVER_URL,
|
|
|
+)
|
|
|
|
|
|
logger = get_logger("pdf_converter_v2.utils.paddleocr")
|
|
|
|
|
|
@@ -41,10 +48,10 @@ except ImportError:
|
|
|
|
|
|
|
|
|
def _get_paddleocr_executable() -> str:
|
|
|
- """返回 paddleocr 可执行文件路径或命令名,供 subprocess 使用。
|
|
|
+ """返回 paddleocr 可执行文件路径或命令名,供 subprocess 使用。
|
|
|
当以 systemd 等方式运行时 PATH 可能不包含 venv/bin,故优先使用当前 Python 同目录下的 paddleocr。
|
|
|
- 可通过环境变量 PADDLEOCR_CMD 显式指定(完整路径或命令名)。"""
|
|
|
- cmd = os.getenv("PADDLEOCR_CMD", "").strip()
|
|
|
+ 可通过配置 PADDLEOCR_CMD 显式指定(完整路径或命令名)。"""
|
|
|
+ cmd = _PADDLEOCR_CMD.strip()
|
|
|
if cmd:
|
|
|
return cmd
|
|
|
# 与当前 Python 同目录(venv/bin)下的 paddleocr
|
|
|
@@ -70,11 +77,11 @@ def _get_paddle_ocr_devices() -> List[str]:
|
|
|
with _PADDLE_OCR_DEVICE_LOCK:
|
|
|
if _PADDLE_OCR_DEVICES:
|
|
|
return _PADDLE_OCR_DEVICES
|
|
|
- multi = os.getenv("PADDLE_OCR_DEVICES", "").strip()
|
|
|
+ multi = _PADDLE_OCR_DEVICES_CONFIG.strip()
|
|
|
if multi:
|
|
|
_PADDLE_OCR_DEVICES[:] = [d.strip() for d in multi.split(",") if d.strip()]
|
|
|
if not _PADDLE_OCR_DEVICES:
|
|
|
- single = os.getenv("PADDLE_OCR_DEVICE", "").strip()
|
|
|
+ single = _PADDLE_OCR_DEVICE.strip()
|
|
|
if not single:
|
|
|
from .device_env import is_npu
|
|
|
if is_npu():
|
|
|
@@ -415,6 +422,12 @@ def call_paddleocr(image_path: str) -> Optional[Dict[str, Any]]:
|
|
|
"--save_path", save_path_base
|
|
|
] + _paddle_ocr_device_args()
|
|
|
|
|
|
+ # 添加 VL 识别后端配置(如果已配置)
|
|
|
+ if VL_REC_BACKEND:
|
|
|
+ cmd.extend(["--vl_rec_backend", VL_REC_BACKEND])
|
|
|
+ if VL_REC_SERVER_URL:
|
|
|
+ cmd.extend(["--vl_rec_server_url", VL_REC_SERVER_URL])
|
|
|
+
|
|
|
# 设置环境变量,限制GPU内存使用
|
|
|
# env = os.environ.copy()
|
|
|
# 设置PaddlePaddle的GPU内存分配策略,使用更保守的内存分配
|
|
|
@@ -853,6 +866,12 @@ def call_paddleocr_doc_parser_for_text(image_path: str, save_path: str) -> tuple
|
|
|
"--save_path", save_path_base
|
|
|
] + _paddle_ocr_device_args()
|
|
|
|
|
|
+ # 添加 VL 识别后端配置(如果已配置)
|
|
|
+ if VL_REC_BACKEND:
|
|
|
+ cmd.extend(["--vl_rec_backend", VL_REC_BACKEND])
|
|
|
+ if VL_REC_SERVER_URL:
|
|
|
+ cmd.extend(["--vl_rec_server_url", VL_REC_SERVER_URL])
|
|
|
+
|
|
|
logger.info(f"[PaddleOCR DocParser] 执行命令: {' '.join(cmd)}")
|
|
|
|
|
|
# 执行命令(env 含 LD_PRELOAD 与 PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK)
|