1 день назад · 4c4a7c4acb
--- a/pdf_converter_v2/utils/paddleocr_fallback.py
+++ b/pdf_converter_v2/utils/paddleocr_fallback.py
@@ -49,13 +49,11 @@ except ImportError:
 
				 
			
 
				 try:
			
 
				     from paddleocr import PaddleOCRVL
			
 
				-    # 暂时禁用 Python API，因为在 VLLM 引擎运行时显存不足（需要约 11GB）
			
 
				-    # 等 VLLM GPU 内存利用率降低或使用独立 GPU 后再启用
			
 
				-    PADDLEOCR_API_AVAILABLE = False
			
 
				-    logger.info("[PaddleOCR备用] Python API 已禁用（显存不足），使用命令行方式")
			
 
				+    PADDLEOCR_API_AVAILABLE = True
			
 
				+    logger.info("[PaddleOCR备用] Python API 已加载")
			
 
				 except ImportError:
			
 
				     PADDLEOCR_API_AVAILABLE = False
			
 
				-    logger.warning("[PaddleOCR备用] paddleocr Python API 未安装，将使用命令行方式")
			
 
				+    logger.warning("[PaddleOCR备用] paddleocr Python API 未安装")
			
 
				 
			
 
				 # PaddleOCR VL pipeline 单例（避免重复初始化）
			
 
				 _PADDLEOCR_PIPELINE_CACHE: Dict[tuple, Any] = {}
			
@@ -956,104 +954,37 @@ def call_paddleocr_ocr(image_path: str, save_path: str) -> tuple[Optional[List[s
 
				         save_path_base = os.path.join(save_path, image_basename)
			
 
				         os.makedirs(save_path_base, exist_ok=True)
			
 
				 
			
 
				-        # 优先使用 Python API 方式（use_chart_recognition=False, use_layout_detection=False）
			
 
				-        if PADDLEOCR_API_AVAILABLE:
			
 
				-            logger.info(f"[PaddleOCR 文本识别] 使用 Python API 方式")
			
 
				-            success, md_file = _call_paddleocr_api(
			
 
				-                image_path, 
			
 
				-                save_path, 
			
 
				-                use_chart_recognition=False, 
			
 
				-                use_layout_detection=False
			
 
				-            )
			
 
				-            
			
 
				-            if success and md_file and os.path.exists(md_file):
			
 
				-                try:
			
 
				-                    with open(md_file, "r", encoding="utf-8") as f:
			
 
				-                        md_content = f.read()
			
 
				-                    if md_content.strip():
			
 
				-                        texts = markdown_to_plain_text(md_content)
			
 
				-                        if texts:
			
 
				-                            json_file = os.path.join(save_path, f"{image_basename}_res.json")
			
 
				-                            with open(json_file, "w", encoding="utf-8") as f:
			
 
				-                                json.dump({"rec_texts": texts}, f, ensure_ascii=False, indent=0)
			
 
				-                            logger.info(f"[PaddleOCR 文本识别] Python API 成功，得到 {len(texts)} 行文本")
			
 
				-                            return texts, json_file
			
 
				-                except Exception as e:
			
 
				-                    logger.exception(f"[PaddleOCR 文本识别] Python API 结果处理失败: {e}")
			
 
				+        # 使用 Python API 方式（use_chart_recognition=False, use_layout_detection=False）
			
 
				+        if not PADDLEOCR_API_AVAILABLE:
			
 
				+            logger.error(f"[PaddleOCR 文本识别] Python API 不可用")
			
 
				+            return None, None
			
 
				             
			
 
				-            logger.warning("[PaddleOCR 文本识别] Python API 失败，回退到命令行方式")
			
 
				-        
			
 
				-        # 回退到命令行方式
			
 
				-        cmd = [
			
 
				-            _get_paddleocr_executable(), "doc_parser", "-i", image_path,
			
 
				-            "--precision", "fp32",
			
 
				-            "--use_doc_unwarping", "False",
			
 
				-            "--use_doc_orientation_classify", "True",
			
 
				-            "--use_chart_recognition", "False",
			
 
				-            "--use_layout_detection", "False",
			
 
				-            "--save_path", save_path_base
			
 
				-        ] + _paddle_ocr_device_args()
			
 
				-        if VL_REC_BACKEND:
			
 
				-            cmd.extend(["--vl_rec_backend", VL_REC_BACKEND])
			
 
				-        if VL_REC_SERVER_URL:
			
 
				-            cmd.extend(["--vl_rec_server_url", VL_REC_SERVER_URL])
			
 
				-
			
 
				-        logger.info(f"[PaddleOCR 文本识别] 执行命令(doc_parser): {' '.join(cmd)}")
			
 
				-
			
 
				-        result = subprocess.run(
			
 
				-            cmd,
			
 
				-            capture_output=True,
			
 
				-            text=True,
			
 
				-            timeout=300,
			
 
				-            check=False,
			
 
				-            env=_get_paddleocr_subprocess_env(),
			
 
				+        logger.info(f"[PaddleOCR 文本识别] 使用 Python API 方式")
			
 
				+        success, md_file = _call_paddleocr_api(
			
 
				+            image_path, 
			
 
				+            save_path, 
			
 
				+            use_chart_recognition=False, 
			
 
				+            use_layout_detection=False
			
 
				         )
			
 
				-
			
 
				-        if result.returncode != 0:
			
 
				-            logger.error(f"[PaddleOCR 文本识别] doc_parser 执行失败，返回码: {result.returncode}")
			
 
				-            if result.stderr and ("too many values to unpack" in result.stderr or "Exception from the 'cv' worker" in result.stderr):
			
 
				-                logger.warning("[PaddleOCR 文本识别] doc_parser 报 cv worker 解包错误，详见 README_STARTUP.md。")
			
 
				-            if result.stderr:
			
 
				-                logger.error(f"[PaddleOCR 文本识别] 错误输出: {result.stderr}")
			
 
				-            return None, None
			
 
				-
			
 
				-        texts = []
			
 
				-        md_file = os.path.join(save_path_base, f"{image_basename}.md")
			
 
				-        if os.path.exists(md_file):
			
 
				+        
			
 
				+        if success and md_file and os.path.exists(md_file):
			
 
				             try:
			
 
				                 with open(md_file, "r", encoding="utf-8") as f:
			
 
				                     md_content = f.read()
			
 
				                 if md_content.strip():
			
 
				                     texts = markdown_to_plain_text(md_content)
			
 
				+                    if texts:
			
 
				+                        json_file = os.path.join(save_path, f"{image_basename}_res.json")
			
 
				+                        with open(json_file, "w", encoding="utf-8") as f:
			
 
				+                            json.dump({"rec_texts": texts}, f, ensure_ascii=False, indent=0)
			
 
				+                        logger.info(f"[PaddleOCR 文本识别] Python API 成功，得到 {len(texts)} 行文本")
			
 
				+                        return texts, json_file
			
 
				             except Exception as e:
			
 
				-                logger.exception(f"[PaddleOCR 文本识别] 读取 Markdown 失败: {e}")
			
 
				-        if not texts and result.stdout.strip():
			
 
				-            parsed = parse_paddleocr_output(result.stdout.strip())
			
 
				-            for item in parsed.get("parsing_res_list", []):
			
 
				-                if isinstance(item, dict) and item.get("block_content"):
			
 
				-                    block = item["block_content"].strip()
			
 
				-                    if "\n" in block:
			
 
				-                        texts.extend([line.strip() for line in block.split("\n") if line.strip()])
			
 
				-                    else:
			
 
				-                        texts.append(block)
			
 
				-        if not texts:
			
 
				-            logger.warning("[PaddleOCR 文本识别] doc_parser 未得到文本")
			
 
				-            return None, None
			
 
				-
			
 
				-        json_file = os.path.join(save_path, f"{image_basename}_res.json")
			
 
				-        try:
			
 
				-            with open(json_file, "w", encoding="utf-8") as f:
			
 
				-                json.dump({"rec_texts": texts}, f, ensure_ascii=False, indent=0)
			
 
				-        except Exception as e:
			
 
				-            logger.exception(f"[PaddleOCR 文本识别] 写入 rec_texts JSON 失败: {e}")
			
 
				-            return texts, None
			
 
				-
			
 
				-        logger.info(f"[PaddleOCR 文本识别] doc_parser 成功提取 {len(texts)} 个文本片段，JSON: {json_file}")
			
 
				-        return texts, json_file
			
 
				-
			
 
				-    except subprocess.TimeoutExpired:
			
 
				-        logger.error("[PaddleOCR 文本识别] 命令执行超时")
			
 
				+                logger.exception(f"[PaddleOCR 文本识别] Python API 结果处理失败: {e}")
			
 
				+        
			
 
				+        logger.error("[PaddleOCR 文本识别] Python API 失败")
			
 
				         return None, None
			
 
				+
			
 
				     except Exception as e:
			
 
				         logger.exception(f"[PaddleOCR 文本识别] 调用失败: {e}")
			
 
				         return None, None
			
@@ -1078,96 +1009,34 @@ def call_paddleocr_doc_parser_for_text(image_path: str, save_path: str) -> tuple
 
				         save_path_base = os.path.join(save_path, image_basename)
			
 
				         os.makedirs(save_path_base, exist_ok=True)
			
 
				         
			
 
				-        # 优先使用 Python API 方式（use_chart_recognition=True, use_layout_detection=True）
			
 
				-        if PADDLEOCR_API_AVAILABLE:
			
 
				-            logger.info(f"[PaddleOCR 图表识别] 使用 Python API 方式")
			
 
				-            success, md_file = _call_paddleocr_api(
			
 
				-                image_path, 
			
 
				-                save_path, 
			
 
				-                use_chart_recognition=True, 
			
 
				-                use_layout_detection=True
			
 
				-            )
			
 
				-            
			
 
				-            if success and md_file and os.path.exists(md_file):
			
 
				-                try:
			
 
				-                    with open(md_file, 'r', encoding='utf-8') as f:
			
 
				-                        markdown_content = f.read()
			
 
				-                    
			
 
				-                    if markdown_content.strip():
			
 
				-                        plain_text_lines = markdown_to_plain_text(markdown_content)
			
 
				-                        logger.info(f"[PaddleOCR 图表识别] Python API 成功，提取 {len(plain_text_lines)} 行纯文本")
			
 
				-                        return plain_text_lines, md_file
			
 
				-                except Exception as e:
			
 
				-                    logger.exception(f"[PaddleOCR 图表识别] Python API 结果处理失败: {e}")
			
 
				+        # 使用 Python API 方式（use_chart_recognition=True, use_layout_detection=True）
			
 
				+        if not PADDLEOCR_API_AVAILABLE:
			
 
				+            logger.error(f"[PaddleOCR 图表识别] Python API 不可用")
			
 
				+            return None, None
			
 
				             
			
 
				-            logger.warning("[PaddleOCR 图表识别] Python API 失败，回退到命令行方式")
			
 
				-        
			
 
				-        # 回退到命令行方式
			
 
				-        cmd = [
			
 
				-            _get_paddleocr_executable(), "doc_parser", "-i", image_path,
			
 
				-            "--precision", "fp32",
			
 
				-            "--use_doc_unwarping", "False",
			
 
				-            "--use_doc_orientation_classify", "True",
			
 
				-            "--use_chart_recognition", "True",
			
 
				-            "--save_path", save_path_base
			
 
				-        ] + _paddle_ocr_device_args()
			
 
				-        if VL_REC_BACKEND:
			
 
				-            cmd.extend(["--vl_rec_backend", VL_REC_BACKEND])
			
 
				-        if VL_REC_SERVER_URL:
			
 
				-            cmd.extend(["--vl_rec_server_url", VL_REC_SERVER_URL])
			
 
				-        
			
 
				-        logger.info(f"[PaddleOCR 图表识别] 执行命令: {' '.join(cmd)}")
			
 
				-        
			
 
				-        # 执行命令（env 含 PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK、FLAGS_use_stride_kernel）
			
 
				-        result = subprocess.run(
			
 
				-            cmd,
			
 
				-            capture_output=True,
			
 
				-            text=True,
			
 
				-            timeout=300,  # 5分钟超时
			
 
				-            check=False,
			
 
				-            env=_get_paddleocr_subprocess_env(),
			
 
				+        logger.info(f"[PaddleOCR 图表识别] 使用 Python API 方式")
			
 
				+        success, md_file = _call_paddleocr_api(
			
 
				+            image_path, 
			
 
				+            save_path, 
			
 
				+            use_chart_recognition=True, 
			
 
				+            use_layout_detection=True
			
 
				         )
			
 
				         
			
 
				-        if result.returncode != 0:
			
 
				-            logger.error(f"[PaddleOCR 图表识别] 命令执行失败，返回码: {result.returncode}")
			
 
				-            if result.stderr and ("too many values to unpack" in result.stderr or "Exception from the 'cv' worker" in result.stderr):
			
 
				-                logger.warning(
			
 
				-                    "[PaddleOCR 图表识别] 报 cv worker 解包错误，多为 PaddleX 与 PP-DocLayoutV3 不兼容。"
			
 
				-                    " 可尝试: pip install -U paddlex；或改用 文本识别 提取文字。详见 README_STARTUP.md。"
			
 
				-                )
			
 
				-            logger.error(f"[PaddleOCR 图表识别] 错误输出: {result.stderr}")
			
 
				-            return None, None
			
 
				-
			
 
				-        md_file = os.path.join(save_path_base, f"{image_basename}.md")
			
 
				-        if not os.path.exists(md_file):
			
 
				-            md_files = sorted(Path(save_path_base).rglob("*.md"))
			
 
				-            if md_files:
			
 
				-                md_file = str(md_files[0])
			
 
				-                logger.info(f"[PaddleOCR 图表识别] 在子目录中找到Markdown文件: {md_file}")
			
 
				-        
			
 
				-        if not os.path.exists(md_file):
			
 
				-            logger.warning(f"[PaddleOCR 图表识别] Markdown文件不存在: {md_file}")
			
 
				-            return None, None
			
 
				-        
			
 
				-        try:
			
 
				-            with open(md_file, 'r', encoding='utf-8') as f:
			
 
				-                markdown_content = f.read()
			
 
				-            
			
 
				-            if not markdown_content.strip():
			
 
				-                logger.warning("[PaddleOCR 图表识别] Markdown文件内容为空")
			
 
				-                return [], md_file
			
 
				-            
			
 
				-            plain_text_lines = markdown_to_plain_text(markdown_content)
			
 
				-            logger.info(f"[PaddleOCR 图表识别] 成功提取 {len(plain_text_lines)} 行纯文本，Markdown文件: {md_file}")
			
 
				-            return plain_text_lines, md_file
			
 
				+        if success and md_file and os.path.exists(md_file):
			
 
				+            try:
			
 
				+                with open(md_file, 'r', encoding='utf-8') as f:
			
 
				+                    markdown_content = f.read()
			
 
				                 
			
 
				-        except Exception as e:
			
 
				-            logger.exception(f"[PaddleOCR 图表识别] 读取Markdown文件失败: {e}")
			
 
				-            return None, md_file
			
 
				-            
			
 
				-    except subprocess.TimeoutExpired:
			
 
				-        logger.error("[PaddleOCR 图表识别] 命令执行超时")
			
 
				+                if markdown_content.strip():
			
 
				+                    plain_text_lines = markdown_to_plain_text(markdown_content)
			
 
				+                    logger.info(f"[PaddleOCR 图表识别] Python API 成功，提取 {len(plain_text_lines)} 行纯文本")
			
 
				+                    return plain_text_lines, md_file
			
 
				+            except Exception as e:
			
 
				+                logger.exception(f"[PaddleOCR 图表识别] Python API 结果处理失败: {e}")
			
 
				+        
			
 
				+        logger.error("[PaddleOCR 图表识别] Python API 失败")
			
 
				         return None, None
			
 
				+            
			
 
				     except Exception as e:
			
 
				         logger.exception(f"[PaddleOCR 图表识别] 调用失败: {e}")
			
 
				         return None, None