|
@@ -105,12 +105,12 @@ def get_paddle_ocr_device_args_for_index(device_index: int) -> list:
|
|
|
return ["--device", device]
|
|
return ["--device", device]
|
|
|
|
|
|
|
|
|
|
|
|
|
-# 供 PaddleOCR 子进程使用的环境变量(LD_PRELOAD 避免 sklearn libgomp static TLS 报错;PADDLE_PDX 跳过模型源检查)
|
|
|
|
|
|
|
+# 供 PaddleOCR 子进程使用的环境变量(PADDLE_PDX 跳过模型源检查;FLAGS_use_stride_kernel 避免 VL kernel 未注册)
|
|
|
_PADDLEOCR_ENV: Optional[Dict[str, str]] = None
|
|
_PADDLEOCR_ENV: Optional[Dict[str, str]] = None
|
|
|
|
|
|
|
|
|
|
|
|
|
def _get_paddleocr_subprocess_env() -> Dict[str, str]:
|
|
def _get_paddleocr_subprocess_env() -> Dict[str, str]:
|
|
|
- """返回调用 paddleocr 子进程时应使用的环境变量(含 LD_PRELOAD 与 PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK)。"""
|
|
|
|
|
|
|
+ """返回调用 paddleocr 子进程时应使用的环境变量(PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK、FLAGS_use_stride_kernel)。"""
|
|
|
global _PADDLEOCR_ENV
|
|
global _PADDLEOCR_ENV
|
|
|
if _PADDLEOCR_ENV is not None:
|
|
if _PADDLEOCR_ENV is not None:
|
|
|
return _PADDLEOCR_ENV
|
|
return _PADDLEOCR_ENV
|
|
@@ -120,39 +120,6 @@ def _get_paddleocr_subprocess_env() -> Dict[str, str]:
|
|
|
# doc_parser 加载 PaddleOCR-VL 时 safetensors 会触发 view_dtype(CPU, Undefined(AnyLayout), uint8),
|
|
# doc_parser 加载 PaddleOCR-VL 时 safetensors 会触发 view_dtype(CPU, Undefined(AnyLayout), uint8),
|
|
|
# 该 kernel 未注册;强制使用 STRIDED 布局可避免:RuntimeError: kernel view_dtype (CPU, Undefined(AnyLayout), uint8) is not registered
|
|
# 该 kernel 未注册;强制使用 STRIDED 布局可避免:RuntimeError: kernel view_dtype (CPU, Undefined(AnyLayout), uint8) is not registered
|
|
|
env.setdefault("FLAGS_use_stride_kernel", "1")
|
|
env.setdefault("FLAGS_use_stride_kernel", "1")
|
|
|
- # 子进程若无 LD_PRELOAD,会触发 sklearn/paddlex 的「cannot allocate memory in static TLS block」
|
|
|
|
|
- if not env.get("LD_PRELOAD"):
|
|
|
|
|
- preload_paths: List[str] = []
|
|
|
|
|
- # 系统 libgomp 优先
|
|
|
|
|
- for p in (
|
|
|
|
|
- "/usr/lib/x86_64-linux-gnu/libgomp.so.1",
|
|
|
|
|
- "/usr/lib/aarch64-linux-gnu/libgomp.so.1",
|
|
|
|
|
- "/usr/lib/libgomp.so.1",
|
|
|
|
|
- ):
|
|
|
|
|
- if os.path.isfile(p):
|
|
|
|
|
- preload_paths.append(p)
|
|
|
|
|
- break
|
|
|
|
|
- # scikit_learn.libs 中的 libgomp(不 import sklearn,仅按路径查找)
|
|
|
|
|
- for sp in getattr(sys, "path", []):
|
|
|
|
|
- if not sp or not os.path.isdir(sp):
|
|
|
|
|
- continue
|
|
|
|
|
- for sub in ("scikit_learn.libs", "simsimd.libs"):
|
|
|
|
|
- d = os.path.join(sp, sub)
|
|
|
|
|
- if not os.path.isdir(d):
|
|
|
|
|
- continue
|
|
|
|
|
- for name in os.listdir(d):
|
|
|
|
|
- if name.startswith("libgomp") and (name.endswith(".so") or ".so." in name):
|
|
|
|
|
- preload_paths.append(os.path.join(d, name))
|
|
|
|
|
- # 固定路径(常见容器)
|
|
|
|
|
- for p in (
|
|
|
|
|
- "/usr/local/lib/python3.10/dist-packages/scikit_learn.libs/libgomp-d22c30c5.so.1.0.0",
|
|
|
|
|
- "/usr/local/lib/python3.10/site-packages/scikit_learn.libs/libgomp-d22c30c5.so.1.0.0",
|
|
|
|
|
- ):
|
|
|
|
|
- if os.path.isfile(p) and p not in preload_paths:
|
|
|
|
|
- preload_paths.append(p)
|
|
|
|
|
- if preload_paths:
|
|
|
|
|
- env["LD_PRELOAD"] = ":".join(preload_paths)
|
|
|
|
|
- logger.debug("[PaddleOCR] 子进程 LD_PRELOAD 已设置,避免 static TLS 报错")
|
|
|
|
|
_PADDLEOCR_ENV = env
|
|
_PADDLEOCR_ENV = env
|
|
|
return env
|
|
return env
|
|
|
|
|
|
|
@@ -435,7 +402,7 @@ def call_paddleocr(image_path: str) -> Optional[Dict[str, Any]]:
|
|
|
|
|
|
|
|
logger.info(f"[PaddleOCR 图表识别] 执行命令: {' '.join(cmd)}")
|
|
logger.info(f"[PaddleOCR 图表识别] 执行命令: {' '.join(cmd)}")
|
|
|
|
|
|
|
|
- # 执行命令(env 含 LD_PRELOAD 与 PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK,避免 static TLS / 模型源检查)
|
|
|
|
|
|
|
+ # 执行命令(env 含 PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK、FLAGS_use_stride_kernel)
|
|
|
result = subprocess.run(
|
|
result = subprocess.run(
|
|
|
cmd,
|
|
cmd,
|
|
|
capture_output=True,
|
|
capture_output=True,
|
|
@@ -875,7 +842,7 @@ def call_paddleocr_doc_parser_for_text(image_path: str, save_path: str) -> tuple
|
|
|
|
|
|
|
|
logger.info(f"[PaddleOCR 图表识别] 执行命令: {' '.join(cmd)}")
|
|
logger.info(f"[PaddleOCR 图表识别] 执行命令: {' '.join(cmd)}")
|
|
|
|
|
|
|
|
- # 执行命令(env 含 LD_PRELOAD 与 PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK)
|
|
|
|
|
|
|
+ # 执行命令(env 含 PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK、FLAGS_use_stride_kernel)
|
|
|
result = subprocess.run(
|
|
result = subprocess.run(
|
|
|
cmd,
|
|
cmd,
|
|
|
capture_output=True,
|
|
capture_output=True,
|