|
|
@@ -15,6 +15,7 @@ import ast
|
|
|
import re
|
|
|
|
|
|
from ..utils.logging_config import get_logger
|
|
|
+from ..config import VL_REC_BACKEND, VL_REC_SERVER_URL
|
|
|
|
|
|
logger = get_logger("pdf_converter_v2.utils.paddleocr")
|
|
|
|
|
|
@@ -839,84 +840,93 @@ def extract_table_text(table_html: str) -> List[str]:
|
|
|
|
|
|
|
|
|
def call_paddleocr_ocr(image_path: str, save_path: str) -> tuple[Optional[List[str]], Optional[str]]:
|
|
|
- """调用paddleocr ocr命令提取文本(用于API接口)
|
|
|
+ """使用不识别图表的 doc_parser 提取文本(替代原 ocr 子命令,用于 API 与 JSON 补充)。
|
|
|
+
|
|
|
+ 内部调用 paddleocr doc_parser(--use_chart_recognition False --use_layout_detection False),
|
|
|
+ 从结果得到文本列表并写入与 OCR JSON 兼容的 rec_texts 文件,供 supplement_missing_fields_from_ocr_json 使用。
|
|
|
|
|
|
Args:
|
|
|
image_path: 图片路径
|
|
|
save_path: 保存路径(目录)
|
|
|
|
|
|
Returns:
|
|
|
- (OCR识别的文本列表, JSON文件路径),如果失败返回(None, None)
|
|
|
+ (文本列表, 兼容 rec_texts 的 JSON 文件路径);提取失败返回 (None, None),仅 JSON 写入失败时返回 (文本列表, None)
|
|
|
"""
|
|
|
- # 在调用PaddleOCR前停止mineru服务以释放GPU内存
|
|
|
mineru_stopped = stop_mineru_service()
|
|
|
-
|
|
|
try:
|
|
|
if not os.path.exists(image_path):
|
|
|
logger.error(f"[PaddleOCR OCR] 图片文件不存在: {image_path}")
|
|
|
return None, None
|
|
|
|
|
|
- # 构建paddleocr ocr命令(NPU 下需加 --device npu:0,否则走 CPU 易段错误)
|
|
|
- cmd = ["paddleocr", "ocr", "-i", image_path, "--save_path", save_path] + _paddle_ocr_device_args()
|
|
|
+ image_basename = os.path.splitext(os.path.basename(image_path))[0]
|
|
|
+ save_path_base = os.path.join(save_path, image_basename)
|
|
|
+ os.makedirs(save_path_base, exist_ok=True)
|
|
|
|
|
|
- logger.info(f"[PaddleOCR OCR] 执行命令: {' '.join(cmd)}")
|
|
|
+ cmd = [
|
|
|
+ "paddleocr", "doc_parser", "-i", image_path,
|
|
|
+ "--precision", "fp32",
|
|
|
+ "--use_doc_unwarping", "False",
|
|
|
+ "--use_doc_orientation_classify", "True",
|
|
|
+ "--use_chart_recognition", "False",
|
|
|
+ "--use_layout_detection", "False",
|
|
|
+ "--save_path", save_path_base
|
|
|
+ ] + _paddle_ocr_device_args()
|
|
|
+ if VL_REC_BACKEND:
|
|
|
+ cmd.extend(["--vl_rec_backend", VL_REC_BACKEND])
|
|
|
+ if VL_REC_SERVER_URL:
|
|
|
+ cmd.extend(["--vl_rec_server_url", VL_REC_SERVER_URL])
|
|
|
+
|
|
|
+ logger.info(f"[PaddleOCR OCR] 执行命令(doc_parser): {' '.join(cmd)}")
|
|
|
|
|
|
- # 执行命令
|
|
|
result = subprocess.run(
|
|
|
cmd,
|
|
|
capture_output=True,
|
|
|
text=True,
|
|
|
- timeout=300, # 5分钟超时
|
|
|
+ timeout=300,
|
|
|
check=False,
|
|
|
)
|
|
|
|
|
|
if result.returncode != 0:
|
|
|
- logger.error(f"[PaddleOCR OCR] 命令执行失败,返回码: {result.returncode}")
|
|
|
- logger.error(f"[PaddleOCR OCR] 错误输出: {result.stderr}")
|
|
|
+ logger.error(f"[PaddleOCR OCR] doc_parser 执行失败,返回码: {result.returncode}")
|
|
|
+ if result.stderr and ("too many values to unpack" in result.stderr or "Exception from the 'cv' worker" in result.stderr):
|
|
|
+ logger.warning("[PaddleOCR OCR] doc_parser 报 cv worker 解包错误,详见 README_STARTUP.md。")
|
|
|
+ if result.stderr:
|
|
|
+ logger.error(f"[PaddleOCR OCR] 错误输出: {result.stderr}")
|
|
|
return None, None
|
|
|
|
|
|
- # 查找保存的JSON文件
|
|
|
- # OCR命令会在save_path下生成 {basename}_res.json
|
|
|
- image_basename = os.path.splitext(os.path.basename(image_path))[0]
|
|
|
- json_file = os.path.join(save_path, f"{image_basename}_res.json")
|
|
|
-
|
|
|
- if not os.path.exists(json_file):
|
|
|
- logger.warning(f"[PaddleOCR OCR] JSON文件不存在: {json_file}")
|
|
|
+ texts = []
|
|
|
+ md_file = os.path.join(save_path_base, f"{image_basename}.md")
|
|
|
+ if os.path.exists(md_file):
|
|
|
+ try:
|
|
|
+ with open(md_file, "r", encoding="utf-8") as f:
|
|
|
+ md_content = f.read()
|
|
|
+ if md_content.strip():
|
|
|
+ texts = markdown_to_plain_text(md_content)
|
|
|
+ except Exception as e:
|
|
|
+ logger.exception(f"[PaddleOCR OCR] 读取 Markdown 失败: {e}")
|
|
|
+ if not texts and result.stdout.strip():
|
|
|
+ parsed = parse_paddleocr_output(result.stdout.strip())
|
|
|
+ for item in parsed.get("parsing_res_list", []):
|
|
|
+ if isinstance(item, dict) and item.get("block_content"):
|
|
|
+ block = item["block_content"].strip()
|
|
|
+ if "\n" in block:
|
|
|
+ texts.extend([line.strip() for line in block.split("\n") if line.strip()])
|
|
|
+ else:
|
|
|
+ texts.append(block)
|
|
|
+ if not texts:
|
|
|
+ logger.warning("[PaddleOCR OCR] doc_parser 未得到文本")
|
|
|
return None, None
|
|
|
|
|
|
- # 读取JSON文件
|
|
|
+ json_file = os.path.join(save_path, f"{image_basename}_res.json")
|
|
|
try:
|
|
|
- with open(json_file, 'r', encoding='utf-8') as f:
|
|
|
- ocr_data = json.load(f)
|
|
|
-
|
|
|
- # 优先提取rec_texts字段(如果存在)
|
|
|
- if "rec_texts" in ocr_data and isinstance(ocr_data["rec_texts"], list):
|
|
|
- texts = ocr_data["rec_texts"]
|
|
|
- logger.info(f"[PaddleOCR OCR] 成功提取 {len(texts)} 个文本片段(从rec_texts)")
|
|
|
- return texts, json_file
|
|
|
-
|
|
|
- # 如果没有rec_texts,尝试从parsing_res_list中提取block_content
|
|
|
- if "parsing_res_list" in ocr_data and isinstance(ocr_data["parsing_res_list"], list):
|
|
|
- texts = []
|
|
|
- for item in ocr_data["parsing_res_list"]:
|
|
|
- if isinstance(item, dict) and "block_content" in item:
|
|
|
- block_content = item["block_content"]
|
|
|
- if block_content and block_content.strip():
|
|
|
- # 如果block_content包含换行符,按行分割
|
|
|
- if "\n" in block_content:
|
|
|
- texts.extend([line.strip() for line in block_content.split("\n") if line.strip()])
|
|
|
- else:
|
|
|
- texts.append(block_content.strip())
|
|
|
- if texts:
|
|
|
- logger.info(f"[PaddleOCR OCR] 成功提取 {len(texts)} 个文本片段(从parsing_res_list)")
|
|
|
- return texts, json_file
|
|
|
-
|
|
|
- logger.warning("[PaddleOCR OCR] JSON文件中未找到rec_texts或parsing_res_list字段")
|
|
|
- return None, json_file
|
|
|
-
|
|
|
+ with open(json_file, "w", encoding="utf-8") as f:
|
|
|
+ json.dump({"rec_texts": texts}, f, ensure_ascii=False, indent=0)
|
|
|
except Exception as e:
|
|
|
- logger.exception(f"[PaddleOCR OCR] 读取JSON文件失败: {e}")
|
|
|
- return None, json_file
|
|
|
+ logger.exception(f"[PaddleOCR OCR] 写入 rec_texts JSON 失败: {e}")
|
|
|
+ return texts, None
|
|
|
+
|
|
|
+ logger.info(f"[PaddleOCR OCR] doc_parser 成功提取 {len(texts)} 个文本片段,JSON: {json_file}")
|
|
|
+ return texts, json_file
|
|
|
|
|
|
except subprocess.TimeoutExpired:
|
|
|
logger.error("[PaddleOCR OCR] 命令执行超时")
|
|
|
@@ -925,7 +935,6 @@ def call_paddleocr_ocr(image_path: str, save_path: str) -> tuple[Optional[List[s
|
|
|
logger.exception(f"[PaddleOCR OCR] 调用失败: {e}")
|
|
|
return None, None
|
|
|
finally:
|
|
|
- # 无论成功或失败,都尝试重启mineru服务
|
|
|
if mineru_stopped:
|
|
|
start_mineru_service()
|
|
|
|