|
|
@@ -47,6 +47,16 @@ except ImportError:
|
|
|
PIL_AVAILABLE = False
|
|
|
logger.warning("[PaddleOCR备用] PIL未安装,无法处理图片")
|
|
|
|
|
|
+try:
|
|
|
+ from paddleocr import PaddleOCRVL
|
|
|
+ PADDLEOCR_API_AVAILABLE = True
|
|
|
+except ImportError:
|
|
|
+ PADDLEOCR_API_AVAILABLE = False
|
|
|
+ logger.warning("[PaddleOCR备用] paddleocr Python API 未安装,将使用命令行方式")
|
|
|
+
|
|
|
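+# Cache of PaddleOCR VL pipelines, one per (use_chart_recognition, use_layout_detection) combination, so each configuration is initialized only once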
|
|
|
+_PADDLEOCR_PIPELINE_CACHE: Dict[tuple, Any] = {}
|
|
|
+
|
|
|
|
|
|
def _get_paddleocr_executable() -> str:
|
|
|
"""返回 paddleocr 可执行文件路径 or 命令名,供 subprocess 使用。
|
|
|
@@ -149,6 +159,111 @@ def _paddle_ocr_device_args() -> list:
|
|
|
return ["--device", device]
|
|
|
|
|
|
|
|
|
def _get_paddle_ocr_device() -> str:
    """Return the device string to hand to the PaddleOCR Python API.

    The configured device list comes from ``_get_paddle_ocr_devices()``.
    When that list is empty the function returns ``"cpu"``. Otherwise it
    returns the entry selected by the module-level round-robin counter
    ``_PADDLE_OCR_DEVICE_INDEX`` and advances that counter by one while
    holding ``_PADDLE_OCR_DEVICE_LOCK``.

    Returns:
        One entry of the configured device list (e.g. ``'gpu:0'``), or
        ``"cpu"`` when no device is configured.
    """
    global _PADDLE_OCR_DEVICE_INDEX

    candidates = _get_paddle_ocr_devices()
    if candidates:
        # Read and bump the shared counter atomically so that concurrent
        # callers are handed consecutive slots.
        with _PADDLE_OCR_DEVICE_LOCK:
            slot = _PADDLE_OCR_DEVICE_INDEX % len(candidates)
            _PADDLE_OCR_DEVICE_INDEX += 1
        return candidates[slot]

    return "cpu"
|
|
|
+
|
|
|
+
|
|
|
def _get_paddleocr_pipeline(use_chart_recognition: bool = False, use_layout_detection: bool = False):
    """Return the cached PaddleOCR VL pipeline for an option pair, creating it on first use.

    Pipelines are stored in ``_PADDLEOCR_PIPELINE_CACHE`` under the key
    ``(use_chart_recognition, use_layout_detection)``. A device is requested
    from ``_get_paddle_ocr_device()`` only on a cache miss, so each cached
    pipeline keeps the device it was created with.

    Args:
        use_chart_recognition: Passed through to ``PaddleOCRVL``.
        use_layout_detection: Passed through to ``PaddleOCRVL``.

    Returns:
        The ``PaddleOCRVL`` instance for the requested option pair.

    Raises:
        ImportError: If the paddleocr Python API could not be imported.
    """
    if not PADDLEOCR_API_AVAILABLE:
        raise ImportError("paddleocr Python API 未安装")

    options = (use_chart_recognition, use_layout_detection)

    # Fast path: this option pair has already been initialized.
    try:
        return _PADDLEOCR_PIPELINE_CACHE[options]
    except KeyError:
        pass

    # NOTE(review): nothing guards this check-then-create sequence, so two
    # concurrent first calls could each build a pipeline - confirm whether
    # callers can reach this from multiple threads.
    target_device = _get_paddle_ocr_device()
    logger.info(
        f"[PaddleOCR API] 初始化 pipeline: device={target_device}, "
        f"chart={use_chart_recognition}, layout={use_layout_detection}"
    )

    settings = {
        "device": target_device,
        "use_doc_unwarping": False,
        "use_doc_orientation_classify": True,
        "use_chart_recognition": use_chart_recognition,
        "use_layout_detection": use_layout_detection,
    }
    created = PaddleOCRVL(**settings)
    _PADDLEOCR_PIPELINE_CACHE[options] = created

    logger.info("[PaddleOCR API] Pipeline 初始化完成")
    return created
|
|
|
+
|
|
|
+
|
|
|
def _call_paddleocr_api(
    image_path: str,
    save_path: str,
    use_chart_recognition: bool = False,
    use_layout_detection: bool = False
) -> tuple[bool, Optional[str]]:
    """Run PaddleOCR VL on one image through the Python API and save the result.

    Output is written under ``<save_path>/<image stem>/``; afterwards the
    function looks for ``<image stem>.md`` inside that directory.

    Args:
        image_path: Path of the input image.
        save_path: Output root directory; created if it does not exist.
        use_chart_recognition: Forwarded to ``_get_paddleocr_pipeline``.
        use_layout_detection: Forwarded to ``_get_paddleocr_pipeline``.

    Returns:
        ``(True, markdown_path)`` when the markdown file exists after saving,
        otherwise ``(False, None)``. Failures are logged and never raised, so
        callers can fall back to another recognition path.
    """
    try:
        if not os.path.exists(image_path):
            logger.error(f"[PaddleOCR API] 图片文件不存在: {image_path}")
            return False, None

        image_basename = os.path.splitext(os.path.basename(image_path))[0]
        # Loop-invariant: computed once up front so the markdown lookup below
        # never depends on the result loop having executed a save.
        item_save_path = os.path.join(save_path, image_basename)
        os.makedirs(item_save_path, exist_ok=True)

        pipeline = _get_paddleocr_pipeline(use_chart_recognition, use_layout_detection)

        logger.info(f"[PaddleOCR API] 开始处理: {image_path}")
        result = pipeline.predict(input=image_path)

        for item in result:
            # PaddleOCR 3.x result objects document save_to_markdown(save_path=...)
            # rather than a generic save(); prefer it and keep save() only as a
            # fallback for result types that provide it.
            save_to_markdown = getattr(item, "save_to_markdown", None)
            legacy_save = getattr(item, "save", None)
            if callable(save_to_markdown):
                save_to_markdown(save_path=item_save_path)
            elif callable(legacy_save):
                legacy_save(item_save_path)
            else:
                # Nothing we know how to persist for this item.
                continue
            logger.info(f"[PaddleOCR API] 结果已保存到: {item_save_path}")

        # Locate the markdown file produced by the save step.
        markdown_file = os.path.join(item_save_path, f"{image_basename}.md")
        if os.path.exists(markdown_file):
            return True, markdown_file

        logger.warning("[PaddleOCR API] 未找到 markdown 输出文件")
        return False, None

    except Exception as e:
        # Deliberate catch-all boundary: any failure here must degrade to
        # (False, None). logger.exception records the traceback, matching the
        # other PaddleOCR call sites in this file.
        logger.exception(f"[PaddleOCR API] 处理失败: {e}")
        return False, None
|
|
|
+
|
|
|
+
|
|
|
def has_recognition_garbage(text: str, min_repeat: int = 10) -> bool:
|
|
|
"""检测文本中是否存在同一字符连续重复的识别异常(如 MinerU 误识别的 草草草...)。
|
|
|
用于在解析前判断是否应用 Paddle doc_parser 做补充替换。
|
|
|
@@ -817,7 +932,34 @@ def call_paddleocr_ocr(image_path: str, save_path: str) -> tuple[Optional[List[s
|
|
|
save_path_base = os.path.join(save_path, image_basename)
|
|
|
os.makedirs(save_path_base, exist_ok=True)
|
|
|
|
|
|
- # 使用不识别图表的 doc_parser 参数(文本识别,无 --use_table_recognition)
|
|
|
+ # 优先使用 Python API 方式(use_chart_recognition=False, use_layout_detection=False)
|
|
|
+ if PADDLEOCR_API_AVAILABLE:
|
|
|
+ logger.info(f"[PaddleOCR 文本识别] 使用 Python API 方式")
|
|
|
+ success, md_file = _call_paddleocr_api(
|
|
|
+ image_path,
|
|
|
+ save_path,
|
|
|
+ use_chart_recognition=False,
|
|
|
+ use_layout_detection=False
|
|
|
+ )
|
|
|
+
|
|
|
+ if success and md_file and os.path.exists(md_file):
|
|
|
+ try:
|
|
|
+ with open(md_file, "r", encoding="utf-8") as f:
|
|
|
+ md_content = f.read()
|
|
|
+ if md_content.strip():
|
|
|
+ texts = markdown_to_plain_text(md_content)
|
|
|
+ if texts:
|
|
|
+ json_file = os.path.join(save_path, f"{image_basename}_res.json")
|
|
|
+ with open(json_file, "w", encoding="utf-8") as f:
|
|
|
+ json.dump({"rec_texts": texts}, f, ensure_ascii=False, indent=0)
|
|
|
+ logger.info(f"[PaddleOCR 文本识别] Python API 成功,得到 {len(texts)} 行文本")
|
|
|
+ return texts, json_file
|
|
|
+ except Exception as e:
|
|
|
+ logger.exception(f"[PaddleOCR 文本识别] Python API 结果处理失败: {e}")
|
|
|
+
|
|
|
+ logger.warning("[PaddleOCR 文本识别] Python API 失败,回退到命令行方式")
|
|
|
+
|
|
|
+ # 回退到命令行方式
|
|
|
cmd = [
|
|
|
_get_paddleocr_executable(), "doc_parser", "-i", image_path,
|
|
|
"--precision", "fp32",
|
|
|
@@ -908,12 +1050,35 @@ def call_paddleocr_doc_parser_for_text(image_path: str, save_path: str) -> tuple
|
|
|
logger.error(f"[PaddleOCR 图表识别] 图片文件不存在: {image_path}")
|
|
|
return None, None
|
|
|
|
|
|
- # 生成输出目录和基础文件名(图表识别:开启 use_chart_recognition)
|
|
|
- image_dir = os.path.dirname(image_path)
|
|
|
image_basename = os.path.splitext(os.path.basename(image_path))[0]
|
|
|
save_path_base = os.path.join(save_path, image_basename)
|
|
|
os.makedirs(save_path_base, exist_ok=True)
|
|
|
|
|
|
+ # 优先使用 Python API 方式(use_chart_recognition=True, use_layout_detection=True)
|
|
|
+ if PADDLEOCR_API_AVAILABLE:
|
|
|
+ logger.info(f"[PaddleOCR 图表识别] 使用 Python API 方式")
|
|
|
+ success, md_file = _call_paddleocr_api(
|
|
|
+ image_path,
|
|
|
+ save_path,
|
|
|
+ use_chart_recognition=True,
|
|
|
+ use_layout_detection=True
|
|
|
+ )
|
|
|
+
|
|
|
+ if success and md_file and os.path.exists(md_file):
|
|
|
+ try:
|
|
|
+ with open(md_file, 'r', encoding='utf-8') as f:
|
|
|
+ markdown_content = f.read()
|
|
|
+
|
|
|
+ if markdown_content.strip():
|
|
|
+ plain_text_lines = markdown_to_plain_text(markdown_content)
|
|
|
+ logger.info(f"[PaddleOCR 图表识别] Python API 成功,提取 {len(plain_text_lines)} 行纯文本")
|
|
|
+ return plain_text_lines, md_file
|
|
|
+ except Exception as e:
|
|
|
+ logger.exception(f"[PaddleOCR 图表识别] Python API 结果处理失败: {e}")
|
|
|
+
|
|
|
+ logger.warning("[PaddleOCR 图表识别] Python API 失败,回退到命令行方式")
|
|
|
+
|
|
|
+ # 回退到命令行方式
|
|
|
cmd = [
|
|
|
_get_paddleocr_executable(), "doc_parser", "-i", image_path,
|
|
|
"--precision", "fp32",
|