hace 1 día · de7b25c053
--- a/pdf_converter_v2/utils/paddleocr_fallback.py
+++ b/pdf_converter_v2/utils/paddleocr_fallback.py
@@ -47,6 +47,16 @@ except ImportError:
 
				     PIL_AVAILABLE = False
			
 
				     logger.warning("[PaddleOCR备用] PIL未安装，无法处理图片")
			
 
				 
			
 
				# Probe for the paddleocr Python API. PADDLEOCR_API_AVAILABLE records whether
				# the import worked; when it did not, a warning is logged.
				try:
				    from paddleocr import PaddleOCRVL
				except ImportError:
				    PADDLEOCR_API_AVAILABLE = False
				    logger.warning("[PaddleOCR备用] paddleocr Python API 未安装，将使用命令行方式")
				else:
				    PADDLEOCR_API_AVAILABLE = True

				# Cache of PaddleOCR VL pipeline instances, keyed by a tuple, so that a
				# pipeline is not initialized repeatedly.
				_PADDLEOCR_PIPELINE_CACHE: Dict[tuple, Any] = {}
			
 
				+
			
 
				 
			
 
				 def _get_paddleocr_executable() -> str:
			
 
				     """返回 paddleocr 可执行文件路径 or 命令名，供 subprocess 使用。
			
@@ -149,6 +159,111 @@ def _paddle_ocr_device_args() -> list:
 
				     return ["--device", device]
			
 
				 
			
 
				 
			
 
				def _get_paddle_ocr_device() -> str:
				    """Return the device string for the PaddleOCR Python API (e.g. 'gpu:0' or 'cpu').

				    Returns "cpu" when ``_get_paddle_ocr_devices()`` yields nothing. Otherwise
				    the devices are handed out in round-robin order: the module-level counter
				    ``_PADDLE_OCR_DEVICE_INDEX`` is read and advanced while holding
				    ``_PADDLE_OCR_DEVICE_LOCK``.
				    """
				    global _PADDLE_OCR_DEVICE_INDEX

				    available = _get_paddle_ocr_devices()
				    if available:
				        with _PADDLE_OCR_DEVICE_LOCK:
				            # Wrap the ever-growing counter onto the device list.
				            slot = _PADDLE_OCR_DEVICE_INDEX % len(available)
				            _PADDLE_OCR_DEVICE_INDEX = _PADDLE_OCR_DEVICE_INDEX + 1
				            return available[slot]
				    return "cpu"
			
 
				+
			
 
				+
			
 
				import threading

				# Serializes pipeline construction so that concurrent callers never build the
				# same pipeline twice.
				_PADDLEOCR_PIPELINE_INIT_LOCK = threading.Lock()


				def _get_paddleocr_pipeline(use_chart_recognition: bool = False, use_layout_detection: bool = False):
				    """Return the cached PaddleOCR VL pipeline for the given options, creating it once.

				    Pipelines are cached in ``_PADDLEOCR_PIPELINE_CACHE`` under the key
				    ``(use_chart_recognition, use_layout_detection)``. The device is chosen
				    via ``_get_paddle_ocr_device()`` only when a pipeline is first built, so
				    a cached pipeline keeps the device it was constructed with.

				    Args:
				        use_chart_recognition: Whether to enable chart recognition.
				        use_layout_detection: Whether to enable layout detection.

				    Returns:
				        The ``PaddleOCRVL`` pipeline instance for this option combination.

				    Raises:
				        ImportError: If the paddleocr Python API could not be imported.
				    """
				    if not PADDLEOCR_API_AVAILABLE:
				        raise ImportError("paddleocr Python API 未安装")

				    # The option combination is the cache key.
				    cache_key = (use_chart_recognition, use_layout_detection)

				    # Fast path: an already-built pipeline needs no locking.
				    pipeline = _PADDLEOCR_PIPELINE_CACHE.get(cache_key)
				    if pipeline is not None:
				        return pipeline

				    with _PADDLEOCR_PIPELINE_INIT_LOCK:
				        # Re-check under the lock: another thread may have finished building
				        # this pipeline while we were waiting.
				        pipeline = _PADDLEOCR_PIPELINE_CACHE.get(cache_key)
				        if pipeline is None:
				            device = _get_paddle_ocr_device()
				            logger.info(f"[PaddleOCR API] 初始化 pipeline: device={device}, chart={use_chart_recognition}, layout={use_layout_detection}")

				            pipeline = PaddleOCRVL(
				                device=device,
				                use_doc_unwarping=False,
				                use_doc_orientation_classify=True,
				                use_chart_recognition=use_chart_recognition,
				                use_layout_detection=use_layout_detection,
				            )

				            _PADDLEOCR_PIPELINE_CACHE[cache_key] = pipeline
				            logger.info("[PaddleOCR API] Pipeline 初始化完成")

				    return pipeline
			
 
				+
			
 
				+
			
 
				def _call_paddleocr_api(
				    image_path: str,
				    save_path: str,
				    use_chart_recognition: bool = False,
				    use_layout_detection: bool = False
				) -> tuple[bool, Optional[str]]:
				    """Run PaddleOCR VL through the Python API and locate its markdown output.

				    Each result item is saved into ``<save_path>/<image stem>/``; the call
				    succeeds as soon as ``<save_path>/<image stem>/<image stem>.md`` exists.

				    Args:
				        image_path: Path of the input image.
				        save_path: Output directory (created if missing).
				        use_chart_recognition: Whether to enable chart recognition.
				        use_layout_detection: Whether to enable layout detection.

				    Returns:
				        ``(True, markdown_path)`` on success, otherwise ``(False, None)``.
				        Exceptions raised while processing are logged and reported as
				        ``(False, None)`` rather than propagated.
				    """
				    try:
				        if not os.path.exists(image_path):
				            logger.error(f"[PaddleOCR API] 图片文件不存在: {image_path}")
				            return False, None

				        os.makedirs(save_path, exist_ok=True)

				        pipeline = _get_paddleocr_pipeline(use_chart_recognition, use_layout_detection)

				        logger.info(f"[PaddleOCR API] 开始处理: {image_path}")

				        result = pipeline.predict(input=image_path)

				        # Both paths depend only on the input image, so compute them once.
				        image_basename = os.path.splitext(os.path.basename(image_path))[0]
				        item_save_path = os.path.join(save_path, image_basename)
				        markdown_file = os.path.join(item_save_path, f"{image_basename}.md")

				        for item in result:
				            if hasattr(item, 'save'):
				                item.save(item_save_path)
				            elif hasattr(item, 'save_to_markdown'):
				                # NOTE(review): PaddleOCR's documentation shows result objects
				                # exposing save_to_markdown(save_path=...) rather than save();
				                # confirm against the installed paddleocr version.
				                os.makedirs(item_save_path, exist_ok=True)
				                item.save_to_markdown(save_path=item_save_path)
				            else:
				                # Previously skipped silently, which hid why no markdown appeared.
				                logger.warning(f"[PaddleOCR API] 结果对象不支持保存，已跳过: {type(item).__name__}")
				                continue

				            logger.info(f"[PaddleOCR API] 结果已保存到: {item_save_path}")

				            # Stop at the first item whose save produced the markdown file.
				            if os.path.exists(markdown_file):
				                return True, markdown_file

				        logger.warning("[PaddleOCR API] 未找到 markdown 输出文件")
				        return False, None

				    except Exception as e:
				        # logger.exception records the traceback together with the message.
				        logger.exception(f"[PaddleOCR API] 处理失败: {e}")
				        return False, None
			
 
				+
			
 
				+
			
 
				 def has_recognition_garbage(text: str, min_repeat: int = 10) -> bool:
			
 
				     """检测文本中是否存在同一字符连续重复的识别异常（如 MinerU 误识别的 草草草...）。
			
 
				     用于在解析前判断是否应用 Paddle doc_parser 做补充替换。
			
@@ -817,7 +932,34 @@ def call_paddleocr_ocr(image_path: str, save_path: str) -> tuple[Optional[List[s
 
				         save_path_base = os.path.join(save_path, image_basename)
			
 
				         os.makedirs(save_path_base, exist_ok=True)
			
 
				 
			
 
				-        # 使用不识别图表的 doc_parser 参数（文本识别，无 --use_table_recognition）
			
 
				+        # 优先使用 Python API 方式（use_chart_recognition=False, use_layout_detection=False）
			
 
				+        if PADDLEOCR_API_AVAILABLE:
			
 
				+            logger.info(f"[PaddleOCR 文本识别] 使用 Python API 方式")
			
 
				+            success, md_file = _call_paddleocr_api(
			
 
				+                image_path, 
			
 
				+                save_path, 
			
 
				+                use_chart_recognition=False, 
			
 
				+                use_layout_detection=False
			
 
				+            )
			
 
				+            
			
 
				+            if success and md_file and os.path.exists(md_file):
			
 
				+                try:
			
 
				+                    with open(md_file, "r", encoding="utf-8") as f:
			
 
				+                        md_content = f.read()
			
 
				+                    if md_content.strip():
			
 
				+                        texts = markdown_to_plain_text(md_content)
			
 
				+                        if texts:
			
 
				+                            json_file = os.path.join(save_path, f"{image_basename}_res.json")
			
 
				+                            with open(json_file, "w", encoding="utf-8") as f:
			
 
				+                                json.dump({"rec_texts": texts}, f, ensure_ascii=False, indent=0)
			
 
				+                            logger.info(f"[PaddleOCR 文本识别] Python API 成功，得到 {len(texts)} 行文本")
			
 
				+                            return texts, json_file
			
 
				+                except Exception as e:
			
 
				+                    logger.exception(f"[PaddleOCR 文本识别] Python API 结果处理失败: {e}")
			
 
				+            
			
 
				+            logger.warning("[PaddleOCR 文本识别] Python API 失败，回退到命令行方式")
			
 
				+        
			
 
				+        # 回退到命令行方式
			
 
				         cmd = [
			
 
				             _get_paddleocr_executable(), "doc_parser", "-i", image_path,
			
 
				             "--precision", "fp32",
			
@@ -908,12 +1050,35 @@ def call_paddleocr_doc_parser_for_text(image_path: str, save_path: str) -> tuple
 
				             logger.error(f"[PaddleOCR 图表识别] 图片文件不存在: {image_path}")
			
 
				             return None, None
			
 
				         
			
 
				-        # 生成输出目录和基础文件名（图表识别：开启 use_chart_recognition）
			
 
				-        image_dir = os.path.dirname(image_path)
			
 
				         image_basename = os.path.splitext(os.path.basename(image_path))[0]
			
 
				         save_path_base = os.path.join(save_path, image_basename)
			
 
				         os.makedirs(save_path_base, exist_ok=True)
			
 
				         
			
 
				+        # 优先使用 Python API 方式（use_chart_recognition=True, use_layout_detection=True）
			
 
				+        if PADDLEOCR_API_AVAILABLE:
			
 
				+            logger.info(f"[PaddleOCR 图表识别] 使用 Python API 方式")
			
 
				+            success, md_file = _call_paddleocr_api(
			
 
				+                image_path, 
			
 
				+                save_path, 
			
 
				+                use_chart_recognition=True, 
			
 
				+                use_layout_detection=True
			
 
				+            )
			
 
				+            
			
 
				+            if success and md_file and os.path.exists(md_file):
			
 
				+                try:
			
 
				+                    with open(md_file, 'r', encoding='utf-8') as f:
			
 
				+                        markdown_content = f.read()
			
 
				+                    
			
 
				+                    if markdown_content.strip():
			
 
				+                        plain_text_lines = markdown_to_plain_text(markdown_content)
			
 
				+                        logger.info(f"[PaddleOCR 图表识别] Python API 成功，提取 {len(plain_text_lines)} 行纯文本")
			
 
				+                        return plain_text_lines, md_file
			
 
				+                except Exception as e:
			
 
				+                    logger.exception(f"[PaddleOCR 图表识别] Python API 结果处理失败: {e}")
			
 
				+            
			
 
				+            logger.warning("[PaddleOCR 图表识别] Python API 失败，回退到命令行方式")
			
 
				+        
			
 
				+        # 回退到命令行方式
			
 
				         cmd = [
			
 
				             _get_paddleocr_executable(), "doc_parser", "-i", image_path,
			
 
				             "--precision", "fp32",