2 weeks ago · f6c245facc
--- a/pdf_converter_v2/paddleocr_fallback.py
+++ b/pdf_converter_v2/paddleocr_fallback.py
@@ -15,6 +15,7 @@ import ast
 
				 import re
			
 
				 
			
 
				 from ..utils.logging_config import get_logger
			
 
				+from ..config import VL_REC_BACKEND, VL_REC_SERVER_URL
			
 
				 
			
 
				 logger = get_logger("pdf_converter_v2.utils.paddleocr")
			
 
				 
			
@@ -839,84 +840,93 @@ def extract_table_text(table_html: str) -> List[str]:
 
				 
			
 
				 
			
 
				 def call_paddleocr_ocr(image_path: str, save_path: str) -> tuple[Optional[List[str]], Optional[str]]:
			
 
				-    """调用paddleocr ocr命令提取文本（用于API接口）
			
 
				+    """使用不识别图表的 doc_parser 提取文本（替代原 ocr 子命令，用于 API 与 JSON 补充）。
			
 
				+    
			
 
				+    内部调用 paddleocr doc_parser（--use_chart_recognition False --use_layout_detection False），
			
 
				+    从结果得到文本列表并写入与 OCR JSON 兼容的 rec_texts 文件，供 supplement_missing_fields_from_ocr_json 使用。
			
 
				     
			
 
				     Args:
			
 
				         image_path: 图片路径
			
 
				         save_path: 保存路径（目录）
			
 
				         
			
 
				     Returns:
			
 
				-        (OCR识别的文本列表, JSON文件路径)，如果失败返回(None, None)
			
 
				+        (文本列表, 兼容 rec_texts 的 JSON 文件路径)，失败返回 (None, None)
			
 
				     """
			
 
				-    # 在调用PaddleOCR前停止mineru服务以释放GPU内存
			
 
				     mineru_stopped = stop_mineru_service()
			
 
				-    
			
 
				     try:
			
 
				         if not os.path.exists(image_path):
			
 
				             logger.error(f"[PaddleOCR OCR] 图片文件不存在: {image_path}")
			
 
				             return None, None
			
 
				 
			
 
				-        # 构建paddleocr ocr命令（NPU 下需加 --device npu:0，否则走 CPU 易段错误）
			
 
				-        cmd = ["paddleocr", "ocr", "-i", image_path, "--save_path", save_path] + _paddle_ocr_device_args()
			
 
				+        image_basename = os.path.splitext(os.path.basename(image_path))[0]
			
 
				+        save_path_base = os.path.join(save_path, image_basename)
			
 
				+        os.makedirs(save_path_base, exist_ok=True)
			
 
				 
			
 
				-        logger.info(f"[PaddleOCR OCR] 执行命令: {' '.join(cmd)}")
			
 
				+        cmd = [
			
 
				+            "paddleocr", "doc_parser", "-i", image_path,
			
 
				+            "--precision", "fp32",
			
 
				+            "--use_doc_unwarping", "False",
			
 
				+            "--use_doc_orientation_classify", "True",
			
 
				+            "--use_chart_recognition", "False",
			
 
				+            "--use_layout_detection", "False",
			
 
				+            "--save_path", save_path_base
			
 
				+        ] + _paddle_ocr_device_args()
			
 
				+        if VL_REC_BACKEND:
			
 
				+            cmd.extend(["--vl_rec_backend", VL_REC_BACKEND])
			
 
				+        if VL_REC_SERVER_URL:
			
 
				+            cmd.extend(["--vl_rec_server_url", VL_REC_SERVER_URL])
			
 
				+
			
 
				+        logger.info(f"[PaddleOCR OCR] 执行命令(doc_parser): {' '.join(cmd)}")
			
 
				 
			
 
				-        # 执行命令
			
 
				         result = subprocess.run(
			
 
				             cmd,
			
 
				             capture_output=True,
			
 
				             text=True,
			
 
				-            timeout=300,  # 5分钟超时
			
 
				+            timeout=300,
			
 
				             check=False,
			
 
				         )
			
 
				 
			
 
				         if result.returncode != 0:
			
 
				-            logger.error(f"[PaddleOCR OCR] 命令执行失败，返回码: {result.returncode}")
			
 
				-            logger.error(f"[PaddleOCR OCR] 错误输出: {result.stderr}")
			
 
				+            logger.error(f"[PaddleOCR OCR] doc_parser 执行失败，返回码: {result.returncode}")
			
 
				+            if result.stderr and ("too many values to unpack" in result.stderr or "Exception from the 'cv' worker" in result.stderr):
			
 
				+                logger.warning("[PaddleOCR OCR] doc_parser 报 cv worker 解包错误，详见 README_STARTUP.md。")
			
 
				+            if result.stderr:
			
 
				+                logger.error(f"[PaddleOCR OCR] 错误输出: {result.stderr}")
			
 
				             return None, None
			
 
				 
			
 
				-        # 查找保存的JSON文件
			
 
				-        # OCR命令会在save_path下生成 {basename}_res.json
			
 
				-        image_basename = os.path.splitext(os.path.basename(image_path))[0]
			
 
				-        json_file = os.path.join(save_path, f"{image_basename}_res.json")
			
 
				-
			
 
				-        if not os.path.exists(json_file):
			
 
				-            logger.warning(f"[PaddleOCR OCR] JSON文件不存在: {json_file}")
			
 
				+        texts = []
			
 
				+        md_file = os.path.join(save_path_base, f"{image_basename}.md")
			
 
				+        if os.path.exists(md_file):
			
 
				+            try:
			
 
				+                with open(md_file, "r", encoding="utf-8") as f:
			
 
				+                    md_content = f.read()
			
 
				+                if md_content.strip():
			
 
				+                    texts = markdown_to_plain_text(md_content)
			
 
				+            except Exception as e:
			
 
				+                logger.exception(f"[PaddleOCR OCR] 读取 Markdown 失败: {e}")
			
 
				+        if not texts and result.stdout.strip():
			
 
				+            parsed = parse_paddleocr_output(result.stdout.strip())
			
 
				+            for item in parsed.get("parsing_res_list", []):
			
 
				+                if isinstance(item, dict) and item.get("block_content"):
			
 
				+                    block = item["block_content"].strip()
			
 
				+                    if "\n" in block:
			
 
				+                        texts.extend([line.strip() for line in block.split("\n") if line.strip()])
			
 
				+                    else:
			
 
				+                        texts.append(block)
			
 
				+        if not texts:
			
 
				+            logger.warning("[PaddleOCR OCR] doc_parser 未得到文本")
			
 
				             return None, None
			
 
				 
			
 
				-        # 读取JSON文件
			
 
				+        json_file = os.path.join(save_path, f"{image_basename}_res.json")
			
 
				         try:
			
 
				-            with open(json_file, 'r', encoding='utf-8') as f:
			
 
				-                ocr_data = json.load(f)
			
 
				-
			
 
				-            # 优先提取rec_texts字段（如果存在）
			
 
				-            if "rec_texts" in ocr_data and isinstance(ocr_data["rec_texts"], list):
			
 
				-                texts = ocr_data["rec_texts"]
			
 
				-                logger.info(f"[PaddleOCR OCR] 成功提取 {len(texts)} 个文本片段（从rec_texts）")
			
 
				-                return texts, json_file
			
 
				-            
			
 
				-            # 如果没有rec_texts，尝试从parsing_res_list中提取block_content
			
 
				-            if "parsing_res_list" in ocr_data and isinstance(ocr_data["parsing_res_list"], list):
			
 
				-                texts = []
			
 
				-                for item in ocr_data["parsing_res_list"]:
			
 
				-                    if isinstance(item, dict) and "block_content" in item:
			
 
				-                        block_content = item["block_content"]
			
 
				-                        if block_content and block_content.strip():
			
 
				-                            # 如果block_content包含换行符，按行分割
			
 
				-                            if "\n" in block_content:
			
 
				-                                texts.extend([line.strip() for line in block_content.split("\n") if line.strip()])
			
 
				-                            else:
			
 
				-                                texts.append(block_content.strip())
			
 
				-                if texts:
			
 
				-                    logger.info(f"[PaddleOCR OCR] 成功提取 {len(texts)} 个文本片段（从parsing_res_list）")
			
 
				-                    return texts, json_file
			
 
				-            
			
 
				-            logger.warning("[PaddleOCR OCR] JSON文件中未找到rec_texts或parsing_res_list字段")
			
 
				-            return None, json_file
			
 
				-
			
 
				+            with open(json_file, "w", encoding="utf-8") as f:
			
 
				+                json.dump({"rec_texts": texts}, f, ensure_ascii=False, indent=0)
			
 
				         except Exception as e:
			
 
				-            logger.exception(f"[PaddleOCR OCR] 读取JSON文件失败: {e}")
			
 
				-            return None, json_file
			
 
				+            logger.exception(f"[PaddleOCR OCR] 写入 rec_texts JSON 失败: {e}")
			
 
				+            return texts, None
			
 
				+
			
 
				+        logger.info(f"[PaddleOCR OCR] doc_parser 成功提取 {len(texts)} 个文本片段，JSON: {json_file}")
			
 
				+        return texts, json_file
			
 
				 
			
 
				     except subprocess.TimeoutExpired:
			
 
				         logger.error("[PaddleOCR OCR] 命令执行超时")
			
@@ -925,7 +935,6 @@ def call_paddleocr_ocr(image_path: str, save_path: str) -> tuple[Optional[List[s
 
				         logger.exception(f"[PaddleOCR OCR] 调用失败: {e}")
			
 
				         return None, None
			
 
				     finally:
			
 
				-        # 无论成功或失败，都尝试重启mineru服务
			
 
				         if mineru_stopped:
			
 
				             start_mineru_service()
			
 
				 
			
--- a/pdf_converter_v2/utils/paddleocr_fallback.py
+++ b/pdf_converter_v2/utils/paddleocr_fallback.py
@@ -750,83 +750,94 @@ def extract_table_text(table_html: str) -> List[str]:
 
				 
			
 
				 
			
 
				 def call_paddleocr_ocr(image_path: str, save_path: str) -> tuple[Optional[List[str]], Optional[str]]:
			
 
				-    """调用paddleocr ocr命令提取文本（用于API接口）
			
 
				+    """使用不识别图表的 doc_parser 提取文本（替代原 ocr 子命令，用于 API 与 JSON 补充）。
			
 
				+    
			
 
				+    内部调用 paddleocr doc_parser（--use_chart_recognition False --use_layout_detection False），
			
 
				+    从结果得到文本列表并写入与 OCR JSON 兼容的 rec_texts 文件，供 supplement_missing_fields_from_ocr_json 使用。
			
 
				     
			
 
				     Args:
			
 
				         image_path: 图片路径
			
 
				         save_path: 保存路径（目录）
			
 
				         
			
 
				     Returns:
			
 
				-        (OCR识别的文本列表, JSON文件路径)，如果失败返回(None, None)
			
 
				+        (文本列表, 兼容 rec_texts 的 JSON 文件路径)，失败返回 (None, None)
			
 
				     """
			
 
				     try:
			
 
				         if not os.path.exists(image_path):
			
 
				             logger.error(f"[PaddleOCR OCR] 图片文件不存在: {image_path}")
			
 
				             return None, None
			
 
				 
			
 
				-        # 构建paddleocr ocr命令（NPU 下需加 --device npu:0，否则走 CPU 易段错误）
			
 
				-        # 注意：ocr 子命令不支持 --vl_rec_backend 等 VL 相关参数
			
 
				-        cmd = [_get_paddleocr_executable(), "ocr", "-i", image_path, "--save_path", save_path] + _paddle_ocr_device_args()
			
 
				+        image_basename = os.path.splitext(os.path.basename(image_path))[0]
			
 
				+        save_path_base = os.path.join(save_path, image_basename)
			
 
				+        os.makedirs(save_path_base, exist_ok=True)
			
 
				 
			
 
				-        logger.info(f"[PaddleOCR OCR] 执行命令: {' '.join(cmd)}")
			
 
				+        # 使用与 call_paddleocr 一致的不识别图表的 doc_parser 参数（无 --use_table_recognition）
			
 
				+        cmd = [
			
 
				+            _get_paddleocr_executable(), "doc_parser", "-i", image_path,
			
 
				+            "--precision", "fp32",
			
 
				+            "--use_doc_unwarping", "False",
			
 
				+            "--use_doc_orientation_classify", "True",
			
 
				+            "--use_chart_recognition", "False",
			
 
				+            "--use_layout_detection", "False",
			
 
				+            "--save_path", save_path_base
			
 
				+        ] + _paddle_ocr_device_args()
			
 
				+        if VL_REC_BACKEND:
			
 
				+            cmd.extend(["--vl_rec_backend", VL_REC_BACKEND])
			
 
				+        if VL_REC_SERVER_URL:
			
 
				+            cmd.extend(["--vl_rec_server_url", VL_REC_SERVER_URL])
			
 
				+
			
 
				+        logger.info(f"[PaddleOCR OCR] 执行命令(doc_parser): {' '.join(cmd)}")
			
 
				 
			
 
				-        # 执行命令（env 含 LD_PRELOAD 与 PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK）
			
 
				         result = subprocess.run(
			
 
				             cmd,
			
 
				             capture_output=True,
			
 
				             text=True,
			
 
				-            timeout=300,  # 5分钟超时
			
 
				+            timeout=300,
			
 
				             check=False,
			
 
				             env=_get_paddleocr_subprocess_env(),
			
 
				         )
			
 
				 
			
 
				         if result.returncode != 0:
			
 
				-            logger.error(f"[PaddleOCR OCR] 命令执行失败，返回码: {result.returncode}")
			
 
				-            logger.error(f"[PaddleOCR OCR] 错误输出: {result.stderr}")
			
 
				+            logger.error(f"[PaddleOCR OCR] doc_parser 执行失败，返回码: {result.returncode}")
			
 
				+            if result.stderr and ("too many values to unpack" in result.stderr or "Exception from the 'cv' worker" in result.stderr):
			
 
				+                logger.warning("[PaddleOCR OCR] doc_parser 报 cv worker 解包错误，详见 README_STARTUP.md。")
			
 
				+            if result.stderr:
			
 
				+                logger.error(f"[PaddleOCR OCR] 错误输出: {result.stderr}")
			
 
				             return None, None
			
 
				 
			
 
				-        # 查找保存的JSON文件
			
 
				-        # OCR命令会在save_path下生成 {basename}_res.json
			
 
				-        image_basename = os.path.splitext(os.path.basename(image_path))[0]
			
 
				-        json_file = os.path.join(save_path, f"{image_basename}_res.json")
			
 
				-
			
 
				-        if not os.path.exists(json_file):
			
 
				-            logger.warning(f"[PaddleOCR OCR] JSON文件不存在: {json_file}")
			
 
				+        texts = []
			
 
				+        md_file = os.path.join(save_path_base, f"{image_basename}.md")
			
 
				+        if os.path.exists(md_file):
			
 
				+            try:
			
 
				+                with open(md_file, "r", encoding="utf-8") as f:
			
 
				+                    md_content = f.read()
			
 
				+                if md_content.strip():
			
 
				+                    texts = markdown_to_plain_text(md_content)
			
 
				+            except Exception as e:
			
 
				+                logger.exception(f"[PaddleOCR OCR] 读取 Markdown 失败: {e}")
			
 
				+        if not texts and result.stdout.strip():
			
 
				+            parsed = parse_paddleocr_output(result.stdout.strip())
			
 
				+            for item in parsed.get("parsing_res_list", []):
			
 
				+                if isinstance(item, dict) and item.get("block_content"):
			
 
				+                    block = item["block_content"].strip()
			
 
				+                    if "\n" in block:
			
 
				+                        texts.extend([line.strip() for line in block.split("\n") if line.strip()])
			
 
				+                    else:
			
 
				+                        texts.append(block)
			
 
				+        if not texts:
			
 
				+            logger.warning("[PaddleOCR OCR] doc_parser 未得到文本")
			
 
				             return None, None
			
 
				 
			
 
				-        # 读取JSON文件
			
 
				+        json_file = os.path.join(save_path, f"{image_basename}_res.json")
			
 
				         try:
			
 
				-            with open(json_file, 'r', encoding='utf-8') as f:
			
 
				-                ocr_data = json.load(f)
			
 
				-
			
 
				-            # 优先提取rec_texts字段（如果存在）
			
 
				-            if "rec_texts" in ocr_data and isinstance(ocr_data["rec_texts"], list):
			
 
				-                texts = ocr_data["rec_texts"]
			
 
				-                logger.info(f"[PaddleOCR OCR] 成功提取 {len(texts)} 个文本片段（从rec_texts）")
			
 
				-                return texts, json_file
			
 
				-            
			
 
				-            # 如果没有rec_texts，尝试从parsing_res_list中提取block_content
			
 
				-            if "parsing_res_list" in ocr_data and isinstance(ocr_data["parsing_res_list"], list):
			
 
				-                texts = []
			
 
				-                for item in ocr_data["parsing_res_list"]:
			
 
				-                    if isinstance(item, dict) and "block_content" in item:
			
 
				-                        block_content = item["block_content"]
			
 
				-                        if block_content and block_content.strip():
			
 
				-                            # 如果block_content包含换行符，按行分割
			
 
				-                            if "\n" in block_content:
			
 
				-                                texts.extend([line.strip() for line in block_content.split("\n") if line.strip()])
			
 
				-                            else:
			
 
				-                                texts.append(block_content.strip())
			
 
				-                if texts:
			
 
				-                    logger.info(f"[PaddleOCR OCR] 成功提取 {len(texts)} 个文本片段（从parsing_res_list）")
			
 
				-                    return texts, json_file
			
 
				-            
			
 
				-            logger.warning("[PaddleOCR OCR] JSON文件中未找到rec_texts或parsing_res_list字段")
			
 
				-            return None, json_file
			
 
				-
			
 
				+            with open(json_file, "w", encoding="utf-8") as f:
			
 
				+                json.dump({"rec_texts": texts}, f, ensure_ascii=False, indent=0)
			
 
				         except Exception as e:
			
 
				-            logger.exception(f"[PaddleOCR OCR] 读取JSON文件失败: {e}")
			
 
				-            return None, json_file
			
 
				+            logger.exception(f"[PaddleOCR OCR] 写入 rec_texts JSON 失败: {e}")
			
 
				+            return texts, None
			
 
				+
			
 
				+        logger.info(f"[PaddleOCR OCR] doc_parser 成功提取 {len(texts)} 个文本片段，JSON: {json_file}")
			
 
				+        return texts, json_file
			
 
				 
			
 
				     except subprocess.TimeoutExpired:
			
 
				         logger.error("[PaddleOCR OCR] 命令执行超时")