Переглянути джерело

chore: 移除 PaddleOCR 子进程 LD_PRELOAD/static TLS 逻辑

Co-authored-by: Cursor <cursoragent@cursor.com>
何文松 2 тижнів тому
батько
коміт
2dd570737c
1 змінених файлів з 4 додано та 37 видалено
  1. 4 37
      pdf_converter_v2/utils/paddleocr_fallback.py

+ 4 - 37
pdf_converter_v2/utils/paddleocr_fallback.py

@@ -105,12 +105,12 @@ def get_paddle_ocr_device_args_for_index(device_index: int) -> list:
     return ["--device", device]
 
 
-# 供 PaddleOCR 子进程使用的环境变量(LD_PRELOAD 避免 sklearn libgomp static TLS 报错;PADDLE_PDX 跳过模型源检查)
+# 供 PaddleOCR 子进程使用的环境变量(PADDLE_PDX 跳过模型源检查;FLAGS_use_stride_kernel 避免 VL kernel 未注册)
 _PADDLEOCR_ENV: Optional[Dict[str, str]] = None
 
 
 def _get_paddleocr_subprocess_env() -> Dict[str, str]:
-    """返回调用 paddleocr 子进程时应使用的环境变量(含 LD_PRELOAD 与 PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK)。"""
+    """返回调用 paddleocr 子进程时应使用的环境变量(PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK、FLAGS_use_stride_kernel)。"""
     global _PADDLEOCR_ENV
     if _PADDLEOCR_ENV is not None:
         return _PADDLEOCR_ENV
@@ -120,39 +120,6 @@ def _get_paddleocr_subprocess_env() -> Dict[str, str]:
     # doc_parser 加载 PaddleOCR-VL 时 safetensors 会触发 view_dtype(CPU, Undefined(AnyLayout), uint8),
     # 该 kernel 未注册;强制使用 STRIDED 布局可避免:RuntimeError: kernel view_dtype (CPU, Undefined(AnyLayout), uint8) is not registered
     env.setdefault("FLAGS_use_stride_kernel", "1")
-    # 子进程若无 LD_PRELOAD,会触发 sklearn/paddlex 的「cannot allocate memory in static TLS block」
-    if not env.get("LD_PRELOAD"):
-        preload_paths: List[str] = []
-        # 系统 libgomp 优先
-        for p in (
-            "/usr/lib/x86_64-linux-gnu/libgomp.so.1",
-            "/usr/lib/aarch64-linux-gnu/libgomp.so.1",
-            "/usr/lib/libgomp.so.1",
-        ):
-            if os.path.isfile(p):
-                preload_paths.append(p)
-                break
-        # scikit_learn.libs 中的 libgomp(不 import sklearn,仅按路径查找)
-        for sp in getattr(sys, "path", []):
-            if not sp or not os.path.isdir(sp):
-                continue
-            for sub in ("scikit_learn.libs", "simsimd.libs"):
-                d = os.path.join(sp, sub)
-                if not os.path.isdir(d):
-                    continue
-                for name in os.listdir(d):
-                    if name.startswith("libgomp") and (name.endswith(".so") or ".so." in name):
-                        preload_paths.append(os.path.join(d, name))
-        # 固定路径(常见容器)
-        for p in (
-            "/usr/local/lib/python3.10/dist-packages/scikit_learn.libs/libgomp-d22c30c5.so.1.0.0",
-            "/usr/local/lib/python3.10/site-packages/scikit_learn.libs/libgomp-d22c30c5.so.1.0.0",
-        ):
-            if os.path.isfile(p) and p not in preload_paths:
-                preload_paths.append(p)
-        if preload_paths:
-            env["LD_PRELOAD"] = ":".join(preload_paths)
-            logger.debug("[PaddleOCR] 子进程 LD_PRELOAD 已设置,避免 static TLS 报错")
     _PADDLEOCR_ENV = env
     return env
 
@@ -435,7 +402,7 @@ def call_paddleocr(image_path: str) -> Optional[Dict[str, Any]]:
         
         logger.info(f"[PaddleOCR 图表识别] 执行命令: {' '.join(cmd)}")
         
-        # 执行命令(env 含 LD_PRELOAD 与 PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK,避免 static TLS / 模型源检查
+        # 执行命令(env 含 PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK、FLAGS_use_stride_kernel)
         result = subprocess.run(
             cmd,
             capture_output=True,
@@ -875,7 +842,7 @@ def call_paddleocr_doc_parser_for_text(image_path: str, save_path: str) -> tuple
         
         logger.info(f"[PaddleOCR 图表识别] 执行命令: {' '.join(cmd)}")
         
-        # 执行命令(env 含 LD_PRELOAD 与 PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK)
+        # 执行命令(env 含 PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK、FLAGS_use_stride_kernel)
         result = subprocess.run(
             cmd,
             capture_output=True,