|
@@ -49,13 +49,11 @@ except ImportError:
|
|
|
|
|
|
|
|
try:
|
|
try:
|
|
|
from paddleocr import PaddleOCRVL
|
|
from paddleocr import PaddleOCRVL
|
|
|
- # 暂时禁用 Python API,因为在 VLLM 引擎运行时显存不足(需要约 11GB)
|
|
|
|
|
- # 等 VLLM GPU 内存利用率降低或使用独立 GPU 后再启用
|
|
|
|
|
- PADDLEOCR_API_AVAILABLE = False
|
|
|
|
|
- logger.info("[PaddleOCR备用] Python API 已禁用(显存不足),使用命令行方式")
|
|
|
|
|
|
|
+ PADDLEOCR_API_AVAILABLE = True
|
|
|
|
|
+ logger.info("[PaddleOCR备用] Python API 已加载")
|
|
|
except ImportError:
|
|
except ImportError:
|
|
|
PADDLEOCR_API_AVAILABLE = False
|
|
PADDLEOCR_API_AVAILABLE = False
|
|
|
- logger.warning("[PaddleOCR备用] paddleocr Python API 未安装,将使用命令行方式")
|
|
|
|
|
|
|
+ logger.warning("[PaddleOCR备用] paddleocr Python API 未安装")
|
|
|
|
|
|
|
|
# PaddleOCR VL pipeline 单例(避免重复初始化)
|
|
# PaddleOCR VL pipeline 单例(避免重复初始化)
|
|
|
_PADDLEOCR_PIPELINE_CACHE: Dict[tuple, Any] = {}
|
|
_PADDLEOCR_PIPELINE_CACHE: Dict[tuple, Any] = {}
|
|
@@ -956,104 +954,37 @@ def call_paddleocr_ocr(image_path: str, save_path: str) -> tuple[Optional[List[s
|
|
|
save_path_base = os.path.join(save_path, image_basename)
|
|
save_path_base = os.path.join(save_path, image_basename)
|
|
|
os.makedirs(save_path_base, exist_ok=True)
|
|
os.makedirs(save_path_base, exist_ok=True)
|
|
|
|
|
|
|
|
- # 优先使用 Python API 方式(use_chart_recognition=False, use_layout_detection=False)
|
|
|
|
|
- if PADDLEOCR_API_AVAILABLE:
|
|
|
|
|
- logger.info(f"[PaddleOCR 文本识别] 使用 Python API 方式")
|
|
|
|
|
- success, md_file = _call_paddleocr_api(
|
|
|
|
|
- image_path,
|
|
|
|
|
- save_path,
|
|
|
|
|
- use_chart_recognition=False,
|
|
|
|
|
- use_layout_detection=False
|
|
|
|
|
- )
|
|
|
|
|
-
|
|
|
|
|
- if success and md_file and os.path.exists(md_file):
|
|
|
|
|
- try:
|
|
|
|
|
- with open(md_file, "r", encoding="utf-8") as f:
|
|
|
|
|
- md_content = f.read()
|
|
|
|
|
- if md_content.strip():
|
|
|
|
|
- texts = markdown_to_plain_text(md_content)
|
|
|
|
|
- if texts:
|
|
|
|
|
- json_file = os.path.join(save_path, f"{image_basename}_res.json")
|
|
|
|
|
- with open(json_file, "w", encoding="utf-8") as f:
|
|
|
|
|
- json.dump({"rec_texts": texts}, f, ensure_ascii=False, indent=0)
|
|
|
|
|
- logger.info(f"[PaddleOCR 文本识别] Python API 成功,得到 {len(texts)} 行文本")
|
|
|
|
|
- return texts, json_file
|
|
|
|
|
- except Exception as e:
|
|
|
|
|
- logger.exception(f"[PaddleOCR 文本识别] Python API 结果处理失败: {e}")
|
|
|
|
|
|
|
+ # 使用 Python API 方式(use_chart_recognition=False, use_layout_detection=False)
|
|
|
|
|
+ if not PADDLEOCR_API_AVAILABLE:
|
|
|
|
|
+ logger.error(f"[PaddleOCR 文本识别] Python API 不可用")
|
|
|
|
|
+ return None, None
|
|
|
|
|
|
|
|
- logger.warning("[PaddleOCR 文本识别] Python API 失败,回退到命令行方式")
|
|
|
|
|
-
|
|
|
|
|
- # 回退到命令行方式
|
|
|
|
|
- cmd = [
|
|
|
|
|
- _get_paddleocr_executable(), "doc_parser", "-i", image_path,
|
|
|
|
|
- "--precision", "fp32",
|
|
|
|
|
- "--use_doc_unwarping", "False",
|
|
|
|
|
- "--use_doc_orientation_classify", "True",
|
|
|
|
|
- "--use_chart_recognition", "False",
|
|
|
|
|
- "--use_layout_detection", "False",
|
|
|
|
|
- "--save_path", save_path_base
|
|
|
|
|
- ] + _paddle_ocr_device_args()
|
|
|
|
|
- if VL_REC_BACKEND:
|
|
|
|
|
- cmd.extend(["--vl_rec_backend", VL_REC_BACKEND])
|
|
|
|
|
- if VL_REC_SERVER_URL:
|
|
|
|
|
- cmd.extend(["--vl_rec_server_url", VL_REC_SERVER_URL])
|
|
|
|
|
-
|
|
|
|
|
- logger.info(f"[PaddleOCR 文本识别] 执行命令(doc_parser): {' '.join(cmd)}")
|
|
|
|
|
-
|
|
|
|
|
- result = subprocess.run(
|
|
|
|
|
- cmd,
|
|
|
|
|
- capture_output=True,
|
|
|
|
|
- text=True,
|
|
|
|
|
- timeout=300,
|
|
|
|
|
- check=False,
|
|
|
|
|
- env=_get_paddleocr_subprocess_env(),
|
|
|
|
|
|
|
+ logger.info(f"[PaddleOCR 文本识别] 使用 Python API 方式")
|
|
|
|
|
+ success, md_file = _call_paddleocr_api(
|
|
|
|
|
+ image_path,
|
|
|
|
|
+ save_path,
|
|
|
|
|
+ use_chart_recognition=False,
|
|
|
|
|
+ use_layout_detection=False
|
|
|
)
|
|
)
|
|
|
-
|
|
|
|
|
- if result.returncode != 0:
|
|
|
|
|
- logger.error(f"[PaddleOCR 文本识别] doc_parser 执行失败,返回码: {result.returncode}")
|
|
|
|
|
- if result.stderr and ("too many values to unpack" in result.stderr or "Exception from the 'cv' worker" in result.stderr):
|
|
|
|
|
- logger.warning("[PaddleOCR 文本识别] doc_parser 报 cv worker 解包错误,详见 README_STARTUP.md。")
|
|
|
|
|
- if result.stderr:
|
|
|
|
|
- logger.error(f"[PaddleOCR 文本识别] 错误输出: {result.stderr}")
|
|
|
|
|
- return None, None
|
|
|
|
|
-
|
|
|
|
|
- texts = []
|
|
|
|
|
- md_file = os.path.join(save_path_base, f"{image_basename}.md")
|
|
|
|
|
- if os.path.exists(md_file):
|
|
|
|
|
|
|
+
|
|
|
|
|
+ if success and md_file and os.path.exists(md_file):
|
|
|
try:
|
|
try:
|
|
|
with open(md_file, "r", encoding="utf-8") as f:
|
|
with open(md_file, "r", encoding="utf-8") as f:
|
|
|
md_content = f.read()
|
|
md_content = f.read()
|
|
|
if md_content.strip():
|
|
if md_content.strip():
|
|
|
texts = markdown_to_plain_text(md_content)
|
|
texts = markdown_to_plain_text(md_content)
|
|
|
|
|
+ if texts:
|
|
|
|
|
+ json_file = os.path.join(save_path, f"{image_basename}_res.json")
|
|
|
|
|
+ with open(json_file, "w", encoding="utf-8") as f:
|
|
|
|
|
+ json.dump({"rec_texts": texts}, f, ensure_ascii=False, indent=0)
|
|
|
|
|
+ logger.info(f"[PaddleOCR 文本识别] Python API 成功,得到 {len(texts)} 行文本")
|
|
|
|
|
+ return texts, json_file
|
|
|
except Exception as e:
|
|
except Exception as e:
|
|
|
- logger.exception(f"[PaddleOCR 文本识别] 读取 Markdown 失败: {e}")
|
|
|
|
|
- if not texts and result.stdout.strip():
|
|
|
|
|
- parsed = parse_paddleocr_output(result.stdout.strip())
|
|
|
|
|
- for item in parsed.get("parsing_res_list", []):
|
|
|
|
|
- if isinstance(item, dict) and item.get("block_content"):
|
|
|
|
|
- block = item["block_content"].strip()
|
|
|
|
|
- if "\n" in block:
|
|
|
|
|
- texts.extend([line.strip() for line in block.split("\n") if line.strip()])
|
|
|
|
|
- else:
|
|
|
|
|
- texts.append(block)
|
|
|
|
|
- if not texts:
|
|
|
|
|
- logger.warning("[PaddleOCR 文本识别] doc_parser 未得到文本")
|
|
|
|
|
- return None, None
|
|
|
|
|
-
|
|
|
|
|
- json_file = os.path.join(save_path, f"{image_basename}_res.json")
|
|
|
|
|
- try:
|
|
|
|
|
- with open(json_file, "w", encoding="utf-8") as f:
|
|
|
|
|
- json.dump({"rec_texts": texts}, f, ensure_ascii=False, indent=0)
|
|
|
|
|
- except Exception as e:
|
|
|
|
|
- logger.exception(f"[PaddleOCR 文本识别] 写入 rec_texts JSON 失败: {e}")
|
|
|
|
|
- return texts, None
|
|
|
|
|
-
|
|
|
|
|
- logger.info(f"[PaddleOCR 文本识别] doc_parser 成功提取 {len(texts)} 个文本片段,JSON: {json_file}")
|
|
|
|
|
- return texts, json_file
|
|
|
|
|
-
|
|
|
|
|
- except subprocess.TimeoutExpired:
|
|
|
|
|
- logger.error("[PaddleOCR 文本识别] 命令执行超时")
|
|
|
|
|
|
|
+ logger.exception(f"[PaddleOCR 文本识别] Python API 结果处理失败: {e}")
|
|
|
|
|
+
|
|
|
|
|
+ logger.error("[PaddleOCR 文本识别] Python API 失败")
|
|
|
return None, None
|
|
return None, None
|
|
|
|
|
+
|
|
|
except Exception as e:
|
|
except Exception as e:
|
|
|
logger.exception(f"[PaddleOCR 文本识别] 调用失败: {e}")
|
|
logger.exception(f"[PaddleOCR 文本识别] 调用失败: {e}")
|
|
|
return None, None
|
|
return None, None
|
|
@@ -1078,96 +1009,34 @@ def call_paddleocr_doc_parser_for_text(image_path: str, save_path: str) -> tuple
|
|
|
save_path_base = os.path.join(save_path, image_basename)
|
|
save_path_base = os.path.join(save_path, image_basename)
|
|
|
os.makedirs(save_path_base, exist_ok=True)
|
|
os.makedirs(save_path_base, exist_ok=True)
|
|
|
|
|
|
|
|
- # 优先使用 Python API 方式(use_chart_recognition=True, use_layout_detection=True)
|
|
|
|
|
- if PADDLEOCR_API_AVAILABLE:
|
|
|
|
|
- logger.info(f"[PaddleOCR 图表识别] 使用 Python API 方式")
|
|
|
|
|
- success, md_file = _call_paddleocr_api(
|
|
|
|
|
- image_path,
|
|
|
|
|
- save_path,
|
|
|
|
|
- use_chart_recognition=True,
|
|
|
|
|
- use_layout_detection=True
|
|
|
|
|
- )
|
|
|
|
|
-
|
|
|
|
|
- if success and md_file and os.path.exists(md_file):
|
|
|
|
|
- try:
|
|
|
|
|
- with open(md_file, 'r', encoding='utf-8') as f:
|
|
|
|
|
- markdown_content = f.read()
|
|
|
|
|
-
|
|
|
|
|
- if markdown_content.strip():
|
|
|
|
|
- plain_text_lines = markdown_to_plain_text(markdown_content)
|
|
|
|
|
- logger.info(f"[PaddleOCR 图表识别] Python API 成功,提取 {len(plain_text_lines)} 行纯文本")
|
|
|
|
|
- return plain_text_lines, md_file
|
|
|
|
|
- except Exception as e:
|
|
|
|
|
- logger.exception(f"[PaddleOCR 图表识别] Python API 结果处理失败: {e}")
|
|
|
|
|
|
|
+ # 使用 Python API 方式(use_chart_recognition=True, use_layout_detection=True)
|
|
|
|
|
+ if not PADDLEOCR_API_AVAILABLE:
|
|
|
|
|
+ logger.error(f"[PaddleOCR 图表识别] Python API 不可用")
|
|
|
|
|
+ return None, None
|
|
|
|
|
|
|
|
- logger.warning("[PaddleOCR 图表识别] Python API 失败,回退到命令行方式")
|
|
|
|
|
-
|
|
|
|
|
- # 回退到命令行方式
|
|
|
|
|
- cmd = [
|
|
|
|
|
- _get_paddleocr_executable(), "doc_parser", "-i", image_path,
|
|
|
|
|
- "--precision", "fp32",
|
|
|
|
|
- "--use_doc_unwarping", "False",
|
|
|
|
|
- "--use_doc_orientation_classify", "True",
|
|
|
|
|
- "--use_chart_recognition", "True",
|
|
|
|
|
- "--save_path", save_path_base
|
|
|
|
|
- ] + _paddle_ocr_device_args()
|
|
|
|
|
- if VL_REC_BACKEND:
|
|
|
|
|
- cmd.extend(["--vl_rec_backend", VL_REC_BACKEND])
|
|
|
|
|
- if VL_REC_SERVER_URL:
|
|
|
|
|
- cmd.extend(["--vl_rec_server_url", VL_REC_SERVER_URL])
|
|
|
|
|
-
|
|
|
|
|
- logger.info(f"[PaddleOCR 图表识别] 执行命令: {' '.join(cmd)}")
|
|
|
|
|
-
|
|
|
|
|
- # 执行命令(env 含 PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK、FLAGS_use_stride_kernel)
|
|
|
|
|
- result = subprocess.run(
|
|
|
|
|
- cmd,
|
|
|
|
|
- capture_output=True,
|
|
|
|
|
- text=True,
|
|
|
|
|
- timeout=300, # 5分钟超时
|
|
|
|
|
- check=False,
|
|
|
|
|
- env=_get_paddleocr_subprocess_env(),
|
|
|
|
|
|
|
+ logger.info(f"[PaddleOCR 图表识别] 使用 Python API 方式")
|
|
|
|
|
+ success, md_file = _call_paddleocr_api(
|
|
|
|
|
+ image_path,
|
|
|
|
|
+ save_path,
|
|
|
|
|
+ use_chart_recognition=True,
|
|
|
|
|
+ use_layout_detection=True
|
|
|
)
|
|
)
|
|
|
|
|
|
|
|
- if result.returncode != 0:
|
|
|
|
|
- logger.error(f"[PaddleOCR 图表识别] 命令执行失败,返回码: {result.returncode}")
|
|
|
|
|
- if result.stderr and ("too many values to unpack" in result.stderr or "Exception from the 'cv' worker" in result.stderr):
|
|
|
|
|
- logger.warning(
|
|
|
|
|
- "[PaddleOCR 图表识别] 报 cv worker 解包错误,多为 PaddleX 与 PP-DocLayoutV3 不兼容。"
|
|
|
|
|
- " 可尝试: pip install -U paddlex;或改用 文本识别 提取文字。详见 README_STARTUP.md。"
|
|
|
|
|
- )
|
|
|
|
|
- logger.error(f"[PaddleOCR 图表识别] 错误输出: {result.stderr}")
|
|
|
|
|
- return None, None
|
|
|
|
|
-
|
|
|
|
|
- md_file = os.path.join(save_path_base, f"{image_basename}.md")
|
|
|
|
|
- if not os.path.exists(md_file):
|
|
|
|
|
- md_files = sorted(Path(save_path_base).rglob("*.md"))
|
|
|
|
|
- if md_files:
|
|
|
|
|
- md_file = str(md_files[0])
|
|
|
|
|
- logger.info(f"[PaddleOCR 图表识别] 在子目录中找到Markdown文件: {md_file}")
|
|
|
|
|
-
|
|
|
|
|
- if not os.path.exists(md_file):
|
|
|
|
|
- logger.warning(f"[PaddleOCR 图表识别] Markdown文件不存在: {md_file}")
|
|
|
|
|
- return None, None
|
|
|
|
|
-
|
|
|
|
|
- try:
|
|
|
|
|
- with open(md_file, 'r', encoding='utf-8') as f:
|
|
|
|
|
- markdown_content = f.read()
|
|
|
|
|
-
|
|
|
|
|
- if not markdown_content.strip():
|
|
|
|
|
- logger.warning("[PaddleOCR 图表识别] Markdown文件内容为空")
|
|
|
|
|
- return [], md_file
|
|
|
|
|
-
|
|
|
|
|
- plain_text_lines = markdown_to_plain_text(markdown_content)
|
|
|
|
|
- logger.info(f"[PaddleOCR 图表识别] 成功提取 {len(plain_text_lines)} 行纯文本,Markdown文件: {md_file}")
|
|
|
|
|
- return plain_text_lines, md_file
|
|
|
|
|
|
|
+ if success and md_file and os.path.exists(md_file):
|
|
|
|
|
+ try:
|
|
|
|
|
+ with open(md_file, 'r', encoding='utf-8') as f:
|
|
|
|
|
+ markdown_content = f.read()
|
|
|
|
|
|
|
|
- except Exception as e:
|
|
|
|
|
- logger.exception(f"[PaddleOCR 图表识别] 读取Markdown文件失败: {e}")
|
|
|
|
|
- return None, md_file
|
|
|
|
|
-
|
|
|
|
|
- except subprocess.TimeoutExpired:
|
|
|
|
|
- logger.error("[PaddleOCR 图表识别] 命令执行超时")
|
|
|
|
|
|
|
+ if markdown_content.strip():
|
|
|
|
|
+ plain_text_lines = markdown_to_plain_text(markdown_content)
|
|
|
|
|
+ logger.info(f"[PaddleOCR 图表识别] Python API 成功,提取 {len(plain_text_lines)} 行纯文本")
|
|
|
|
|
+ return plain_text_lines, md_file
|
|
|
|
|
+ except Exception as e:
|
|
|
|
|
+ logger.exception(f"[PaddleOCR 图表识别] Python API 结果处理失败: {e}")
|
|
|
|
|
+
|
|
|
|
|
+ logger.error("[PaddleOCR 图表识别] Python API 失败")
|
|
|
return None, None
|
|
return None, None
|
|
|
|
|
+
|
|
|
except Exception as e:
|
|
except Exception as e:
|
|
|
logger.exception(f"[PaddleOCR 图表识别] 调用失败: {e}")
|
|
logger.exception(f"[PaddleOCR 图表识别] 调用失败: {e}")
|
|
|
return None, None
|
|
return None, None
|