Explorar el Código

feat: 使用 PaddleOCR Python API 替代命令行方式,支持图表识别和纯文本识别

何文松 hace 1 día
padre
commit
de7b25c053
Se ha modificado 1 fichero con 168 adiciones y 3 borrados
  1. 168 3
      pdf_converter_v2/utils/paddleocr_fallback.py

+ 168 - 3
pdf_converter_v2/utils/paddleocr_fallback.py

@@ -47,6 +47,16 @@ except ImportError:
     PIL_AVAILABLE = False
     logger.warning("[PaddleOCR备用] PIL未安装,无法处理图片")
 
+try:
+    from paddleocr import PaddleOCRVL
+    PADDLEOCR_API_AVAILABLE = True
+except ImportError:
+    PADDLEOCR_API_AVAILABLE = False
+    logger.warning("[PaddleOCR备用] paddleocr Python API 未安装,将使用命令行方式")
+
+# Cache of PaddleOCR VL pipeline instances, keyed by option tuple (avoids repeated initialization)
+_PADDLEOCR_PIPELINE_CACHE: Dict[tuple, Any] = {}
+
 
 def _get_paddleocr_executable() -> str:
     """返回 paddleocr 可执行文件路径 or 命令名,供 subprocess 使用。
@@ -149,6 +159,111 @@ def _paddle_ocr_device_args() -> list:
     return ["--device", device]
 
 
+def _get_paddle_ocr_device() -> str:
+    """Return the PaddleOCR Python API device string (e.g. 'gpu:0'), or 'cpu' when no devices are listed."""
+    global _PADDLE_OCR_DEVICE_INDEX
+    available = _get_paddle_ocr_devices()
+    if not available:
+        return "cpu"
+    # Read and advance the shared round-robin index while holding the lock.
+    with _PADDLE_OCR_DEVICE_LOCK:
+        slot = _PADDLE_OCR_DEVICE_INDEX % len(available)
+        _PADDLE_OCR_DEVICE_INDEX += 1
+        return available[slot]
+
+
+def _get_paddleocr_pipeline(use_chart_recognition: bool = False, use_layout_detection: bool = False):
+    """Return the PaddleOCR VL pipeline for the given options, building and caching it on first use.
+
+    Args:
+        use_chart_recognition: Whether to enable chart recognition.
+        use_layout_detection: Whether to enable layout detection.
+
+    Returns:
+        The PaddleOCRVL pipeline instance cached for this option combination.
+
+    Raises:
+        ImportError: If the paddleocr Python API could not be imported.
+    """
+    if not PADDLEOCR_API_AVAILABLE:
+        raise ImportError("paddleocr Python API 未安装")
+
+    options = (use_chart_recognition, use_layout_detection)  # cache key: one pipeline per option pair
+    try:
+        return _PADDLEOCR_PIPELINE_CACHE[options]
+    except KeyError:
+        pass
+    # The device is chosen only when the pipeline is first built; later calls reuse that instance.
+    device = _get_paddle_ocr_device()
+    logger.info(f"[PaddleOCR API] 初始化 pipeline: device={device}, chart={use_chart_recognition}, layout={use_layout_detection}")
+    built = PaddleOCRVL(
+        device=device,
+        use_doc_unwarping=False,
+        use_doc_orientation_classify=True,
+        use_chart_recognition=use_chart_recognition,
+        use_layout_detection=use_layout_detection,
+    )
+    _PADDLEOCR_PIPELINE_CACHE[options] = built
+    logger.info(f"[PaddleOCR API] Pipeline 初始化完成")
+    return built
+
+
+def _call_paddleocr_api(
+    image_path: str,
+    save_path: str,
+    use_chart_recognition: bool = False,
+    use_layout_detection: bool = False
+) -> tuple[bool, Optional[str]]:
+    """Run PaddleOCR VL on one image through the Python API and save its markdown.
+
+    Results are written under ``<save_path>/<image basename>/``; the call succeeds
+    only if ``<image basename>.md`` exists there afterwards.
+
+    Args:
+        image_path: Path of the input image.
+        save_path: Output directory (created if missing).
+        use_chart_recognition: Whether to enable chart recognition.
+        use_layout_detection: Whether to enable layout detection.
+
+    Returns:
+        ``(True, markdown_path)`` on success, otherwise ``(False, None)``.
+        Any ``Exception`` is logged and reported as ``(False, None)``.
+    """
+    try:
+        if not os.path.exists(image_path):
+            logger.error(f"[PaddleOCR API] 图片文件不存在: {image_path}")
+            return False, None
+
+        os.makedirs(save_path, exist_ok=True)
+        pipeline = _get_paddleocr_pipeline(use_chart_recognition, use_layout_detection)
+
+        logger.info(f"[PaddleOCR API] 开始处理: {image_path}")
+        result = pipeline.predict(input=image_path)
+
+        # Both paths depend only on the input image, so compute them once.
+        image_basename = os.path.splitext(os.path.basename(image_path))[0]
+        item_save_path = os.path.join(save_path, image_basename)
+        markdown_file = os.path.join(item_save_path, f"{image_basename}.md")
+
+        for item in result:
+            # PaddleOCR result objects document save_to_markdown(); the generic
+            # save() is kept only as a fallback for objects that provide it.
+            saver = getattr(item, "save_to_markdown", None) or getattr(item, "save", None)
+            if not callable(saver):
+                continue
+            saver(item_save_path)
+            logger.info(f"[PaddleOCR API] 结果已保存到: {item_save_path}")
+            if os.path.exists(markdown_file):
+                return True, markdown_file
+
+        logger.warning(f"[PaddleOCR API] 未找到 markdown 输出文件")
+        return False, None
+    except Exception as e:
+        # Best-effort boundary: callers fall back to the command-line path on failure.
+        logger.exception(f"[PaddleOCR API] 处理失败: {e}")
+        return False, None
+
+
 def has_recognition_garbage(text: str, min_repeat: int = 10) -> bool:
     """检测文本中是否存在同一字符连续重复的识别异常(如 MinerU 误识别的 草草草...)。
     用于在解析前判断是否应用 Paddle doc_parser 做补充替换。
@@ -817,7 +932,34 @@ def call_paddleocr_ocr(image_path: str, save_path: str) -> tuple[Optional[List[s
         save_path_base = os.path.join(save_path, image_basename)
         os.makedirs(save_path_base, exist_ok=True)
 
-        # 使用不识别图表的 doc_parser 参数(文本识别,无 --use_table_recognition)
+        # 优先使用 Python API 方式(use_chart_recognition=False, use_layout_detection=False)
+        if PADDLEOCR_API_AVAILABLE:
+            logger.info(f"[PaddleOCR 文本识别] 使用 Python API 方式")
+            success, md_file = _call_paddleocr_api(
+                image_path, 
+                save_path, 
+                use_chart_recognition=False, 
+                use_layout_detection=False
+            )
+            
+            if success and md_file and os.path.exists(md_file):
+                try:
+                    with open(md_file, "r", encoding="utf-8") as f:
+                        md_content = f.read()
+                    if md_content.strip():
+                        texts = markdown_to_plain_text(md_content)
+                        if texts:
+                            json_file = os.path.join(save_path, f"{image_basename}_res.json")
+                            with open(json_file, "w", encoding="utf-8") as f:
+                                json.dump({"rec_texts": texts}, f, ensure_ascii=False, indent=0)
+                            logger.info(f"[PaddleOCR 文本识别] Python API 成功,得到 {len(texts)} 行文本")
+                            return texts, json_file
+                except Exception as e:
+                    logger.exception(f"[PaddleOCR 文本识别] Python API 结果处理失败: {e}")
+            
+            logger.warning("[PaddleOCR 文本识别] Python API 失败,回退到命令行方式")
+        
+        # 回退到命令行方式
         cmd = [
             _get_paddleocr_executable(), "doc_parser", "-i", image_path,
             "--precision", "fp32",
@@ -908,12 +1050,35 @@ def call_paddleocr_doc_parser_for_text(image_path: str, save_path: str) -> tuple
             logger.error(f"[PaddleOCR 图表识别] 图片文件不存在: {image_path}")
             return None, None
         
-        # 生成输出目录和基础文件名(图表识别:开启 use_chart_recognition)
-        image_dir = os.path.dirname(image_path)
         image_basename = os.path.splitext(os.path.basename(image_path))[0]
         save_path_base = os.path.join(save_path, image_basename)
         os.makedirs(save_path_base, exist_ok=True)
         
+        # 优先使用 Python API 方式(use_chart_recognition=True, use_layout_detection=True)
+        if PADDLEOCR_API_AVAILABLE:
+            logger.info(f"[PaddleOCR 图表识别] 使用 Python API 方式")
+            success, md_file = _call_paddleocr_api(
+                image_path, 
+                save_path, 
+                use_chart_recognition=True, 
+                use_layout_detection=True
+            )
+            
+            if success and md_file and os.path.exists(md_file):
+                try:
+                    with open(md_file, 'r', encoding='utf-8') as f:
+                        markdown_content = f.read()
+                    
+                    if markdown_content.strip():
+                        plain_text_lines = markdown_to_plain_text(markdown_content)
+                        logger.info(f"[PaddleOCR 图表识别] Python API 成功,提取 {len(plain_text_lines)} 行纯文本")
+                        return plain_text_lines, md_file
+                except Exception as e:
+                    logger.exception(f"[PaddleOCR 图表识别] Python API 结果处理失败: {e}")
+            
+            logger.warning("[PaddleOCR 图表识别] Python API 失败,回退到命令行方式")
+        
+        # 回退到命令行方式
         cmd = [
             _get_paddleocr_executable(), "doc_parser", "-i", image_path,
             "--precision", "fp32",