Bladeren bron

refactor: 优化 PaddleOCR 相关代码

- 统一 use_doc_orientation_classify 为 False(提升处理速度)
- 移除未使用的导入(ast)
- 移除重复的 Path 导入
- 添加 paddleocr_python_path 配置项,支持自定义 Python 解释器路径
- 移除硬编码的 Python 解释器路径,改为通过配置项指定
- 提升代码可维护性和灵活性
何文松 23 uur geleden
bovenliggende
commit
a436cebb8f

+ 1 - 0
pdf_converter_v2/config.py

@@ -63,6 +63,7 @@ DEFAULT_FORMULA_ENABLE = _config.get_bool("formula_enable", True)
 
 # PaddleOCR 配置
 PADDLEOCR_CMD = _config.get_str("paddleocr_cmd", "paddleocr")
+PADDLEOCR_PYTHON_PATH = _config.get_str("paddleocr_python_path", "")
 PADDLE_DOC_PARSER_CMD = _config.get_str("paddle_doc_parser_cmd", "paddleocr")
 
 # PaddleOCR 设备参数(留空则根据 DEVICE_KIND 自动选择)

+ 4 - 0
pdf_converter_v2/config.yaml

@@ -68,6 +68,10 @@ api_port: 4214
 # PaddleOCR 可执行命令或路径
 paddleocr_cmd: "paddleocr"
 
+# PaddleOCR Python 解释器路径(用于 wrapper 脚本模式)
+# 留空或配置的路径不存在时,回退到当前 Python 解释器(sys.executable)
+paddleocr_python_path: ""
+
 # PaddleOCR 推理设备 (例如 "npu:0", "cuda:0", "cpu")
 # 留空则根据环境自动选择
 paddle_ocr_device: ""

+ 1 - 1
pdf_converter_v2/processor/converter.py

@@ -70,7 +70,7 @@ def _paddle_base_cmd(input_path: str, save_path_base: str, device_args: list) ->
         "--use_doc_unwarping",
         "False",
         "--use_doc_orientation_classify",
-        "True",
+        "False",
         "--use_chart_recognition",
         "True",
         "--save_path",

+ 8 - 5
pdf_converter_v2/utils/paddleocr_fallback.py

@@ -12,12 +12,12 @@ import time
 import random
 from pathlib import Path
 from typing import Dict, Any, Optional, List, Tuple
-import ast
 import re
 
 from ..utils.logging_config import get_logger
 from ..config import (
     PADDLEOCR_CMD as _PADDLEOCR_CMD,
+    PADDLEOCR_PYTHON_PATH as _PADDLEOCR_PYTHON_PATH,
     PADDLE_OCR_DEVICE as _PADDLE_OCR_DEVICE,
     PADDLE_OCR_DEVICES as _PADDLE_OCR_DEVICES_CONFIG,
     VL_REC_BACKEND,
@@ -390,7 +390,6 @@ def _call_paddleocr_cli(
         
         # PaddleOCR CLI 会在 save_path 下创建目录,然后在目录里生成 markdown 文件
         # 递归查找所有 .md 文件
-        from pathlib import Path
         save_path_obj = Path(save_path)
         if save_path_obj.exists():
             md_files = list(save_path_obj.rglob("*.md"))
@@ -428,9 +427,13 @@ def _call_paddleocr_wrapper(
             logger.error(f"[PaddleOCR Wrapper] 脚本不存在: {wrapper_script}")
             return False, None
         
-        # 获取 Python 解释器路径(使用 PaddleOCR 虚拟环境)
-        python_executable = "/mnt/win_d/paddle/.venv_paddleocr/bin/python"
-        if not os.path.exists(python_executable):
+        # 获取 Python 解释器路径
+        # 优先使用配置的路径,否则使用当前 Python 解释器
+        python_executable = _PADDLEOCR_PYTHON_PATH.strip() if _PADDLEOCR_PYTHON_PATH else ""
+        if python_executable and not os.path.exists(python_executable):
+            logger.warning(f"[PaddleOCR Wrapper] 配置的 Python 路径不存在: {python_executable},使用当前解释器")
+            python_executable = ""
+        if not python_executable:
             python_executable = sys.executable
         
         # 构建命令