|
|
@@ -559,6 +559,65 @@ def extract_first_page_from_pdf(pdf_path: str, output_dir: str) -> Optional[str]
|
|
|
return None
|
|
|
|
|
|
|
|
|
def extract_all_pages_from_pdf(pdf_path: str, output_dir: str) -> List[str]:
    """Render every page of a PDF to PNG images for whole-document PaddleOCR parsing.

    Two backends are tried in order: pypdfium2 (when ``PDFIUM_AVAILABLE`` is
    set) and then pdf2image (when ``PDF2IMAGE_AVAILABLE`` is set). Each backend
    collects its pages in its own list, so a backend that fails part-way never
    leaks partial results into the next backend or into the return value.

    Args:
        pdf_path: Path of the PDF file.
        output_dir: Directory that receives one PNG per page; created if missing.

    Returns:
        Image paths in page order. An empty list is returned when PIL is
        unavailable, the PDF does not exist, the PDF has no pages, or every
        available backend failed.
    """
    if not PIL_AVAILABLE or not os.path.exists(pdf_path):
        return []
    os.makedirs(output_dir, exist_ok=True)

    render_dpi = 150  # both backends render at the same resolution

    def _new_image_path(page_index: int) -> str:
        # Timestamp + random suffix keeps names unique across repeated runs.
        image_filename = f"paddleocr_fallback_page{page_index}_{int(time.time() * 1000)}_{random.randint(1000, 9999)}.png"
        return os.path.join(output_dir, image_filename)

    def _discard(paths: List[str]) -> None:
        # Best-effort removal of images written by a backend that failed
        # part-way; they are never returned, so nothing can reference them.
        for stale_path in paths:
            try:
                os.remove(stale_path)
            except OSError:
                pass

    if PDFIUM_AVAILABLE:
        pdfium_paths: List[str] = []
        try:
            pdf = pdfium.PdfDocument(pdf_path)
            try:
                n_pages = len(pdf)
                if n_pages == 0:
                    return []
                for i in range(n_pages):
                    page = pdf[i]
                    try:
                        # PDF user space is 72 units per inch, so dpi / 72 is the scale.
                        bitmap = page.render(scale=render_dpi / 72)
                        try:
                            pil_image = bitmap.to_pil()
                            image_path = _new_image_path(i)
                            # Register before saving so a half-written file
                            # is cleaned up if save() raises.
                            pdfium_paths.append(image_path)
                            pil_image.save(image_path, "PNG", optimize=True, compress_level=6)
                        finally:
                            # Close only after saving: the PIL image may
                            # still reference the bitmap's buffer.
                            bitmap.close()
                    finally:
                        page.close()
                logger.info(f"[PaddleOCR备用] 使用pypdfium2从PDF提取全部 {n_pages} 页图片")
                return pdfium_paths
            finally:
                try:
                    pdf.close()
                except Exception:
                    # Closing is best-effort; a close error must not mask the result.
                    pass
        except Exception as e:
            logger.warning(f"[PaddleOCR备用] pypdfium2 提取全部页失败: {e}")
            _discard(pdfium_paths)

    if PDF2IMAGE_AVAILABLE:
        pdf2image_paths: List[str] = []
        try:
            images = convert_from_path(pdf_path, dpi=render_dpi)
            if not images:
                return []
            for i, pil_img in enumerate(images):
                image_path = _new_image_path(i)
                pdf2image_paths.append(image_path)
                pil_img.save(image_path, "PNG", optimize=True, compress_level=6)
            logger.info(f"[PaddleOCR备用] 使用pdf2image从PDF提取全部 {len(pdf2image_paths)} 页图片")
            return pdf2image_paths
        except Exception as e:
            logger.warning(f"[PaddleOCR备用] pdf2image 提取全部页失败: {e}")
            _discard(pdf2image_paths)

    # No backend produced a complete page set.
    return []
|
|
|
+
|
|
|
+
|
|
|
def find_pdf_file(output_dir: str) -> Optional[str]:
|
|
|
"""在输出目录中查找PDF文件
|
|
|
|
|
|
@@ -1590,29 +1649,30 @@ def fallback_parse_with_paddleocr(
|
|
|
markdown_content: str,
|
|
|
output_dir: Optional[str] = None,
|
|
|
document_type: Optional[str] = None,
|
|
|
- input_file: Optional[str] = None
|
|
|
+ input_file: Optional[str] = None,
|
|
|
+ full_document: bool = False,
|
|
|
) -> Optional[str]:
|
|
|
- """当JSON数据缺失时,使用paddleocr进行备用解析
|
|
|
+ """当JSON数据缺失或识别异常时,使用 paddleocr 进行备用解析。
|
|
|
|
|
|
Args:
|
|
|
json_data: 原始JSON数据
|
|
|
markdown_content: 原始markdown内容
|
|
|
output_dir: 输出目录(用于查找图片)
|
|
|
document_type: 文档类型
|
|
|
- input_file: 原始输入文件路径(PDF或图片),如果未找到图片则从PDF提取第一页
|
|
|
+ input_file: 原始输入文件路径(PDF或图片),如果未找到图片则从PDF提取
|
|
|
+ full_document: 若为 True(如识别异常需整份替换),对 PDF 解析全部页并合并;否则仅第一页
|
|
|
|
|
|
Returns:
|
|
|
补充后的markdown内容,如果失败返回None
|
|
|
"""
|
|
|
try:
|
|
|
- # 注意:调用方已经检查过数据完整性,这里不再重复检查
|
|
|
- # 直接进行备用解析,因为调用方已经确定需要备用解析
|
|
|
doc_type = document_type or json_data.get("document_type", "unknown")
|
|
|
|
|
|
logger.warning("[PaddleOCR备用] 启用PaddleOCR备用解析")
|
|
|
|
|
|
# 尝试从markdown中提取图片路径
|
|
|
image_path = None
|
|
|
+ all_page_images: List[str] = [] # full_document 时存放多页图片路径
|
|
|
if output_dir:
|
|
|
# 首先尝试从markdown中提取
|
|
|
image_path = extract_image_from_markdown(markdown_content, output_dir)
|
|
|
@@ -1645,14 +1705,24 @@ def fallback_parse_with_paddleocr(
|
|
|
file_type = detect_file_type(input_file)
|
|
|
|
|
|
if file_type == 'pdf':
|
|
|
- # 文件是PDF,尝试提取第一页
|
|
|
pdf_path = input_file
|
|
|
logger.info(f"[PaddleOCR备用] 检测到PDF文件(通过内容): {pdf_path}")
|
|
|
- image_path = extract_first_page_from_pdf(pdf_path, output_dir)
|
|
|
- if image_path:
|
|
|
- logger.info(f"[PaddleOCR备用] 成功从PDF提取第一页图片: {image_path}")
|
|
|
+ if full_document:
|
|
|
+ all_page_images = extract_all_pages_from_pdf(pdf_path, output_dir or os.path.dirname(pdf_path))
|
|
|
+ if all_page_images:
|
|
|
+ image_path = all_page_images[0] # 占位,下面按多页分支处理
|
|
|
+ logger.info(f"[PaddleOCR备用] 将解析全部 {len(all_page_images)} 页")
|
|
|
+ else:
|
|
|
+ logger.warning("[PaddleOCR备用] 从PDF提取全部页失败,尝试仅第一页")
|
|
|
+ image_path = extract_first_page_from_pdf(pdf_path, output_dir)
|
|
|
+ if image_path:
|
|
|
+ logger.info(f"[PaddleOCR备用] 成功从PDF提取第一页图片: {image_path}")
|
|
|
else:
|
|
|
- logger.warning("[PaddleOCR备用] 从PDF提取图片失败(可能是PDF文件损坏或缺少必要的库)")
|
|
|
+ image_path = extract_first_page_from_pdf(pdf_path, output_dir)
|
|
|
+ if image_path:
|
|
|
+ logger.info(f"[PaddleOCR备用] 成功从PDF提取第一页图片: {image_path}")
|
|
|
+ else:
|
|
|
+ logger.warning("[PaddleOCR备用] 从PDF提取图片失败(可能是PDF文件损坏或缺少必要的库)")
|
|
|
elif file_type in ['png', 'jpeg', 'jpg']:
|
|
|
# 文件内容是图片,但路径可能为 .pdf(上游保存时扩展名错误),
|
|
|
# 若直接传给 doc_parser 会按扩展名用 PDFium 打开导致 Data format error,
|
|
|
@@ -1709,18 +1779,36 @@ def fallback_parse_with_paddleocr(
|
|
|
logger.warning(f"[PaddleOCR备用] 未找到可用的图片或PDF文件(input_file={input_file}, output_dir={output_dir}),无法进行备用解析")
|
|
|
logger.info("[PaddleOCR备用] 备用解析需要图片文件或PDF文件,如果都没有,将返回原始markdown内容")
|
|
|
|
|
|
- if not image_path:
|
|
|
+ if not image_path and not all_page_images:
|
|
|
logger.warning("[PaddleOCR备用] 未找到可用的图片文件,备用解析无法进行,返回None(将使用原始解析结果)")
|
|
|
return None
|
|
|
|
|
|
- # 使用doc_parser模式解析文档结构
|
|
|
- logger.info("[PaddleOCR备用] 使用doc_parser模式解析文档结构(图表识别)")
|
|
|
- paddleocr_result = call_paddleocr(image_path)
|
|
|
- if not paddleocr_result:
|
|
|
- logger.error("[PaddleOCR备用] PaddleOCR解析失败")
|
|
|
- return None
|
|
|
-
|
|
|
- # 检查返回结果格式
|
|
|
+ # 使用doc_parser模式解析文档结构(多页时逐页解析再合并)
|
|
|
+ if all_page_images:
|
|
|
+ logger.info(f"[PaddleOCR备用] 使用doc_parser模式解析全部 {len(all_page_images)} 页(图表识别)")
|
|
|
+ markdown_parts: List[str] = []
|
|
|
+ for idx, page_image in enumerate(all_page_images):
|
|
|
+ result = call_paddleocr(page_image)
|
|
|
+ if result and "markdown_content" in result:
|
|
|
+ markdown_parts.append(result["markdown_content"].strip())
|
|
|
+ elif result and result.get("parsing_res_list"):
|
|
|
+ markdown_parts.append(paddleocr_to_markdown(result).strip())
|
|
|
+ else:
|
|
|
+ logger.warning(f"[PaddleOCR备用] 第 {idx + 1} 页解析无有效内容,跳过")
|
|
|
+ if not markdown_parts:
|
|
|
+ logger.error("[PaddleOCR备用] 多页解析均无有效结果")
|
|
|
+ return None
|
|
|
+ paddleocr_markdown = "\n\n".join(markdown_parts)
|
|
|
+ logger.info(f"[PaddleOCR备用] 多页合并完成,共 {len(paddleocr_markdown)} 字符")
|
|
|
+ return paddleocr_markdown
|
|
|
+ else:
|
|
|
+ logger.info("[PaddleOCR备用] 使用doc_parser模式解析文档结构(图表识别)")
|
|
|
+ paddleocr_result = call_paddleocr(image_path)
|
|
|
+ if not paddleocr_result:
|
|
|
+ logger.error("[PaddleOCR备用] PaddleOCR解析失败")
|
|
|
+ return None
|
|
|
+
|
|
|
+ # 检查返回结果格式(单页)
|
|
|
if "markdown_content" in paddleocr_result:
|
|
|
# 直接从MD文件读取的内容
|
|
|
paddleocr_markdown = paddleocr_result["markdown_content"]
|