Browse Source

refactor: 将 call_paddleocr_ocr 改为使用不识别图表的 doc_parser 替代 ocr 子命令

Co-authored-by: Cursor <cursoragent@cursor.com>
何文松 2 weeks ago
parent
commit
f6c245facc
2 changed files with 118 additions and 98 deletions
  1. +59 -50
      pdf_converter_v2/paddleocr_fallback.py
  2. +59 -48
      pdf_converter_v2/utils/paddleocr_fallback.py

+ 59 - 50
pdf_converter_v2/paddleocr_fallback.py

@@ -15,6 +15,7 @@ import ast
 import re
 
 from ..utils.logging_config import get_logger
+from ..config import VL_REC_BACKEND, VL_REC_SERVER_URL
 
 logger = get_logger("pdf_converter_v2.utils.paddleocr")
 
@@ -839,84 +840,93 @@ def extract_table_text(table_html: str) -> List[str]:
 
 
 def call_paddleocr_ocr(image_path: str, save_path: str) -> tuple[Optional[List[str]], Optional[str]]:
-    """调用paddleocr ocr命令提取文本(用于API接口)
+    """使用不识别图表的 doc_parser 提取文本(替代原 ocr 子命令,用于 API 与 JSON 补充)。
+    
+    内部调用 paddleocr doc_parser(--use_chart_recognition False --use_layout_detection False),
+    从结果得到文本列表并写入与 OCR JSON 兼容的 rec_texts 文件,供 supplement_missing_fields_from_ocr_json 使用。
     
     Args:
         image_path: 图片路径
         save_path: 保存路径(目录)
         
     Returns:
-        (OCR识别的文本列表, JSON文件路径),如果失败返回(None, None)
+        (文本列表, 兼容 rec_texts 的 JSON 文件路径),失败返回 (None, None)
     """
-    # 在调用PaddleOCR前停止mineru服务以释放GPU内存
     mineru_stopped = stop_mineru_service()
-    
     try:
         if not os.path.exists(image_path):
             logger.error(f"[PaddleOCR OCR] 图片文件不存在: {image_path}")
             return None, None
 
-        # 构建paddleocr ocr命令(NPU 下需加 --device npu:0,否则走 CPU 易段错误)
-        cmd = ["paddleocr", "ocr", "-i", image_path, "--save_path", save_path] + _paddle_ocr_device_args()
+        image_basename = os.path.splitext(os.path.basename(image_path))[0]
+        save_path_base = os.path.join(save_path, image_basename)
+        os.makedirs(save_path_base, exist_ok=True)
 
-        logger.info(f"[PaddleOCR OCR] 执行命令: {' '.join(cmd)}")
+        cmd = [
+            "paddleocr", "doc_parser", "-i", image_path,
+            "--precision", "fp32",
+            "--use_doc_unwarping", "False",
+            "--use_doc_orientation_classify", "True",
+            "--use_chart_recognition", "False",
+            "--use_layout_detection", "False",
+            "--save_path", save_path_base
+        ] + _paddle_ocr_device_args()
+        if VL_REC_BACKEND:
+            cmd.extend(["--vl_rec_backend", VL_REC_BACKEND])
+        if VL_REC_SERVER_URL:
+            cmd.extend(["--vl_rec_server_url", VL_REC_SERVER_URL])
+
+        logger.info(f"[PaddleOCR OCR] 执行命令(doc_parser): {' '.join(cmd)}")
 
-        # 执行命令
         result = subprocess.run(
             cmd,
             capture_output=True,
             text=True,
-            timeout=300,  # 5分钟超时
+            timeout=300,
             check=False,
         )
 
         if result.returncode != 0:
-            logger.error(f"[PaddleOCR OCR] 命令执行失败,返回码: {result.returncode}")
-            logger.error(f"[PaddleOCR OCR] 错误输出: {result.stderr}")
+            logger.error(f"[PaddleOCR OCR] doc_parser 执行失败,返回码: {result.returncode}")
+            if result.stderr and ("too many values to unpack" in result.stderr or "Exception from the 'cv' worker" in result.stderr):
+                logger.warning("[PaddleOCR OCR] doc_parser 报 cv worker 解包错误,详见 README_STARTUP.md。")
+            if result.stderr:
+                logger.error(f"[PaddleOCR OCR] 错误输出: {result.stderr}")
             return None, None
 
-        # 查找保存的JSON文件
-        # OCR命令会在save_path下生成 {basename}_res.json
-        image_basename = os.path.splitext(os.path.basename(image_path))[0]
-        json_file = os.path.join(save_path, f"{image_basename}_res.json")
-
-        if not os.path.exists(json_file):
-            logger.warning(f"[PaddleOCR OCR] JSON文件不存在: {json_file}")
+        texts = []
+        md_file = os.path.join(save_path_base, f"{image_basename}.md")
+        if os.path.exists(md_file):
+            try:
+                with open(md_file, "r", encoding="utf-8") as f:
+                    md_content = f.read()
+                if md_content.strip():
+                    texts = markdown_to_plain_text(md_content)
+            except Exception as e:
+                logger.exception(f"[PaddleOCR OCR] 读取 Markdown 失败: {e}")
+        if not texts and result.stdout.strip():
+            parsed = parse_paddleocr_output(result.stdout.strip())
+            for item in parsed.get("parsing_res_list", []):
+                if isinstance(item, dict) and item.get("block_content"):
+                    block = item["block_content"].strip()
+                    if "\n" in block:
+                        texts.extend([line.strip() for line in block.split("\n") if line.strip()])
+                    else:
+                        texts.append(block)
+        if not texts:
+            logger.warning("[PaddleOCR OCR] doc_parser 未得到文本")
             return None, None
 
-        # 读取JSON文件
+        json_file = os.path.join(save_path, f"{image_basename}_res.json")
         try:
-            with open(json_file, 'r', encoding='utf-8') as f:
-                ocr_data = json.load(f)
-
-            # 优先提取rec_texts字段(如果存在)
-            if "rec_texts" in ocr_data and isinstance(ocr_data["rec_texts"], list):
-                texts = ocr_data["rec_texts"]
-                logger.info(f"[PaddleOCR OCR] 成功提取 {len(texts)} 个文本片段(从rec_texts)")
-                return texts, json_file
-            
-            # 如果没有rec_texts,尝试从parsing_res_list中提取block_content
-            if "parsing_res_list" in ocr_data and isinstance(ocr_data["parsing_res_list"], list):
-                texts = []
-                for item in ocr_data["parsing_res_list"]:
-                    if isinstance(item, dict) and "block_content" in item:
-                        block_content = item["block_content"]
-                        if block_content and block_content.strip():
-                            # 如果block_content包含换行符,按行分割
-                            if "\n" in block_content:
-                                texts.extend([line.strip() for line in block_content.split("\n") if line.strip()])
-                            else:
-                                texts.append(block_content.strip())
-                if texts:
-                    logger.info(f"[PaddleOCR OCR] 成功提取 {len(texts)} 个文本片段(从parsing_res_list)")
-                    return texts, json_file
-            
-            logger.warning("[PaddleOCR OCR] JSON文件中未找到rec_texts或parsing_res_list字段")
-            return None, json_file
-
+            with open(json_file, "w", encoding="utf-8") as f:
+                json.dump({"rec_texts": texts}, f, ensure_ascii=False, indent=0)
         except Exception as e:
-            logger.exception(f"[PaddleOCR OCR] 读取JSON文件失败: {e}")
-            return None, json_file
+            logger.exception(f"[PaddleOCR OCR] 写入 rec_texts JSON 失败: {e}")
+            return texts, None
+
+        logger.info(f"[PaddleOCR OCR] doc_parser 成功提取 {len(texts)} 个文本片段,JSON: {json_file}")
+        return texts, json_file
 
     except subprocess.TimeoutExpired:
         logger.error("[PaddleOCR OCR] 命令执行超时")
@@ -925,7 +935,6 @@ def call_paddleocr_ocr(image_path: str, save_path: str) -> tuple[Optional[List[s
         logger.exception(f"[PaddleOCR OCR] 调用失败: {e}")
         return None, None
     finally:
-        # 无论成功或失败,都尝试重启mineru服务
         if mineru_stopped:
             start_mineru_service()
 

+ 59 - 48
pdf_converter_v2/utils/paddleocr_fallback.py

@@ -750,83 +750,94 @@ def extract_table_text(table_html: str) -> List[str]:
 
 
 def call_paddleocr_ocr(image_path: str, save_path: str) -> tuple[Optional[List[str]], Optional[str]]:
-    """调用paddleocr ocr命令提取文本(用于API接口)
+    """使用不识别图表的 doc_parser 提取文本(替代原 ocr 子命令,用于 API 与 JSON 补充)。
+    
+    内部调用 paddleocr doc_parser(--use_chart_recognition False --use_layout_detection False),
+    从结果得到文本列表并写入与 OCR JSON 兼容的 rec_texts 文件,供 supplement_missing_fields_from_ocr_json 使用。
     
     Args:
         image_path: 图片路径
         save_path: 保存路径(目录)
         
     Returns:
-        (OCR识别的文本列表, JSON文件路径),如果失败返回(None, None)
+        (文本列表, 兼容 rec_texts 的 JSON 文件路径),失败返回 (None, None)
     """
     try:
         if not os.path.exists(image_path):
             logger.error(f"[PaddleOCR OCR] 图片文件不存在: {image_path}")
             return None, None
 
-        # 构建paddleocr ocr命令(NPU 下需加 --device npu:0,否则走 CPU 易段错误)
-        # 注意:ocr 子命令不支持 --vl_rec_backend 等 VL 相关参数
-        cmd = [_get_paddleocr_executable(), "ocr", "-i", image_path, "--save_path", save_path] + _paddle_ocr_device_args()
+        image_basename = os.path.splitext(os.path.basename(image_path))[0]
+        save_path_base = os.path.join(save_path, image_basename)
+        os.makedirs(save_path_base, exist_ok=True)
 
-        logger.info(f"[PaddleOCR OCR] 执行命令: {' '.join(cmd)}")
+        # 使用与 call_paddleocr 一致的不识别图表的 doc_parser 参数(无 --use_table_recognition)
+        cmd = [
+            _get_paddleocr_executable(), "doc_parser", "-i", image_path,
+            "--precision", "fp32",
+            "--use_doc_unwarping", "False",
+            "--use_doc_orientation_classify", "True",
+            "--use_chart_recognition", "False",
+            "--use_layout_detection", "False",
+            "--save_path", save_path_base
+        ] + _paddle_ocr_device_args()
+        if VL_REC_BACKEND:
+            cmd.extend(["--vl_rec_backend", VL_REC_BACKEND])
+        if VL_REC_SERVER_URL:
+            cmd.extend(["--vl_rec_server_url", VL_REC_SERVER_URL])
+
+        logger.info(f"[PaddleOCR OCR] 执行命令(doc_parser): {' '.join(cmd)}")
 
-        # 执行命令(env 含 LD_PRELOAD 与 PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK)
         result = subprocess.run(
             cmd,
             capture_output=True,
             text=True,
-            timeout=300,  # 5分钟超时
+            timeout=300,
             check=False,
             env=_get_paddleocr_subprocess_env(),
         )
 
         if result.returncode != 0:
-            logger.error(f"[PaddleOCR OCR] 命令执行失败,返回码: {result.returncode}")
-            logger.error(f"[PaddleOCR OCR] 错误输出: {result.stderr}")
+            logger.error(f"[PaddleOCR OCR] doc_parser 执行失败,返回码: {result.returncode}")
+            if result.stderr and ("too many values to unpack" in result.stderr or "Exception from the 'cv' worker" in result.stderr):
+                logger.warning("[PaddleOCR OCR] doc_parser 报 cv worker 解包错误,详见 README_STARTUP.md。")
+            if result.stderr:
+                logger.error(f"[PaddleOCR OCR] 错误输出: {result.stderr}")
             return None, None
 
-        # 查找保存的JSON文件
-        # OCR命令会在save_path下生成 {basename}_res.json
-        image_basename = os.path.splitext(os.path.basename(image_path))[0]
-        json_file = os.path.join(save_path, f"{image_basename}_res.json")
-
-        if not os.path.exists(json_file):
-            logger.warning(f"[PaddleOCR OCR] JSON文件不存在: {json_file}")
+        texts = []
+        md_file = os.path.join(save_path_base, f"{image_basename}.md")
+        if os.path.exists(md_file):
+            try:
+                with open(md_file, "r", encoding="utf-8") as f:
+                    md_content = f.read()
+                if md_content.strip():
+                    texts = markdown_to_plain_text(md_content)
+            except Exception as e:
+                logger.exception(f"[PaddleOCR OCR] 读取 Markdown 失败: {e}")
+        if not texts and result.stdout.strip():
+            parsed = parse_paddleocr_output(result.stdout.strip())
+            for item in parsed.get("parsing_res_list", []):
+                if isinstance(item, dict) and item.get("block_content"):
+                    block = item["block_content"].strip()
+                    if "\n" in block:
+                        texts.extend([line.strip() for line in block.split("\n") if line.strip()])
+                    else:
+                        texts.append(block)
+        if not texts:
+            logger.warning("[PaddleOCR OCR] doc_parser 未得到文本")
             return None, None
 
-        # 读取JSON文件
+        json_file = os.path.join(save_path, f"{image_basename}_res.json")
         try:
-            with open(json_file, 'r', encoding='utf-8') as f:
-                ocr_data = json.load(f)
-
-            # 优先提取rec_texts字段(如果存在)
-            if "rec_texts" in ocr_data and isinstance(ocr_data["rec_texts"], list):
-                texts = ocr_data["rec_texts"]
-                logger.info(f"[PaddleOCR OCR] 成功提取 {len(texts)} 个文本片段(从rec_texts)")
-                return texts, json_file
-            
-            # 如果没有rec_texts,尝试从parsing_res_list中提取block_content
-            if "parsing_res_list" in ocr_data and isinstance(ocr_data["parsing_res_list"], list):
-                texts = []
-                for item in ocr_data["parsing_res_list"]:
-                    if isinstance(item, dict) and "block_content" in item:
-                        block_content = item["block_content"]
-                        if block_content and block_content.strip():
-                            # 如果block_content包含换行符,按行分割
-                            if "\n" in block_content:
-                                texts.extend([line.strip() for line in block_content.split("\n") if line.strip()])
-                            else:
-                                texts.append(block_content.strip())
-                if texts:
-                    logger.info(f"[PaddleOCR OCR] 成功提取 {len(texts)} 个文本片段(从parsing_res_list)")
-                    return texts, json_file
-            
-            logger.warning("[PaddleOCR OCR] JSON文件中未找到rec_texts或parsing_res_list字段")
-            return None, json_file
-
+            with open(json_file, "w", encoding="utf-8") as f:
+                json.dump({"rec_texts": texts}, f, ensure_ascii=False, indent=0)
         except Exception as e:
-            logger.exception(f"[PaddleOCR OCR] 读取JSON文件失败: {e}")
-            return None, json_file
+            logger.exception(f"[PaddleOCR OCR] 写入 rec_texts JSON 失败: {e}")
+            return texts, None
+
+        logger.info(f"[PaddleOCR OCR] doc_parser 成功提取 {len(texts)} 个文本片段,JSON: {json_file}")
+        return texts, json_file
 
     except subprocess.TimeoutExpired:
         logger.error("[PaddleOCR OCR] 命令执行超时")