|
|
@@ -15,25 +15,14 @@ from utils.logging_config import get_logger
|
|
|
# 初始化日志
|
|
|
logger = get_logger("pdf_converter_v2.attachment_splitter")
|
|
|
|
|
|
-# 尝试导入 OCR 相关库
|
|
|
+# 导入 Tesseract OCR
|
|
|
try:
|
|
|
import pytesseract
|
|
|
TESSERACT_AVAILABLE = True
|
|
|
logger.info("[附件切割] Tesseract OCR 可用")
|
|
|
except ImportError:
|
|
|
TESSERACT_AVAILABLE = False
|
|
|
- logger.warning("[附件切割] Tesseract OCR 不可用")
|
|
|
-
|
|
|
-# 尝试导入 PaddleOCR 作为备用
|
|
|
-try:
|
|
|
- from paddleocr import PaddleOCR
|
|
|
- PADDLEOCR_AVAILABLE = True
|
|
|
- logger.info("[附件切割] PaddleOCR 可用")
|
|
|
- # 初始化 PaddleOCR(延迟到实际使用时)
|
|
|
- _paddle_ocr = None
|
|
|
-except ImportError:
|
|
|
- PADDLEOCR_AVAILABLE = False
|
|
|
- logger.warning("[附件切割] PaddleOCR 不可用")
|
|
|
+ logger.error("[附件切割] Tesseract OCR 不可用,请安装: apt install tesseract-ocr tesseract-ocr-chi-sim && pip install pytesseract")
|
|
|
|
|
|
try:
|
|
|
import PyPDF2
|
|
|
@@ -104,7 +93,7 @@ NON_TABLE_ATTACHMENT_KEYWORDS = [
|
|
|
|
|
|
def ocr_page_image(image) -> str:
|
|
|
"""
|
|
|
- 对图片进行 OCR 识别(优先使用 Tesseract,备用 PaddleOCR)
|
|
|
+ 对图片进行 OCR 识别(使用 Tesseract)
|
|
|
|
|
|
Args:
|
|
|
image: PIL Image 对象
|
|
|
@@ -112,46 +101,17 @@ def ocr_page_image(image) -> str:
|
|
|
Returns:
|
|
|
str: 识别出的文本
|
|
|
"""
|
|
|
- # 优先使用 Tesseract
|
|
|
- if TESSERACT_AVAILABLE:
|
|
|
- try:
|
|
|
- text = pytesseract.image_to_string(image, lang=OCR_LANG)
|
|
|
- logger.debug(f"[附件切割] Tesseract OCR识别成功,文本长度: {len(text)}")
|
|
|
- return text
|
|
|
- except Exception as e:
|
|
|
- logger.error(f"[附件切割] Tesseract OCR识别失败: {e}")
|
|
|
- # 失败后尝试 PaddleOCR
|
|
|
+ if not TESSERACT_AVAILABLE:
|
|
|
+ logger.warning("[附件切割] Tesseract OCR 不可用,跳过识别")
|
|
|
+ return ""
|
|
|
|
|
|
- # 备用:使用 PaddleOCR
|
|
|
- if PADDLEOCR_AVAILABLE:
|
|
|
- try:
|
|
|
- global _paddle_ocr
|
|
|
- if _paddle_ocr is None:
|
|
|
- logger.info("[附件切割] 初始化 PaddleOCR...")
|
|
|
- _paddle_ocr = PaddleOCR(use_angle_cls=True, lang='ch', show_log=False)
|
|
|
-
|
|
|
- # 将PIL图片转换为numpy数组
|
|
|
- import numpy as np
|
|
|
- img_array = np.array(image)
|
|
|
-
|
|
|
- # 执行OCR
|
|
|
- result = _paddle_ocr.ocr(img_array, cls=True)
|
|
|
-
|
|
|
- # 提取文本
|
|
|
- if result and result[0]:
|
|
|
- texts = [line[1][0] for line in result[0] if line[1][0]]
|
|
|
- text = '\n'.join(texts)
|
|
|
- logger.debug(f"[附件切割] PaddleOCR识别成功,文本长度: {len(text)}")
|
|
|
- return text
|
|
|
- else:
|
|
|
- logger.warning("[附件切割] PaddleOCR未识别出文本")
|
|
|
- return ""
|
|
|
- except Exception as e:
|
|
|
- logger.error(f"[附件切割] PaddleOCR识别失败: {e}")
|
|
|
- return ""
|
|
|
-
|
|
|
- logger.warning("[附件切割] 没有可用的OCR工具")
|
|
|
- return ""
|
|
|
+ try:
|
|
|
+ text = pytesseract.image_to_string(image, lang=OCR_LANG)
|
|
|
+ logger.debug(f"[附件切割] Tesseract OCR识别成功,文本长度: {len(text)}")
|
|
|
+ return text
|
|
|
+ except Exception as e:
|
|
|
+ logger.error(f"[附件切割] Tesseract OCR识别失败: {e}")
|
|
|
+ return ""
|
|
|
|
|
|
def extract_page_text(page, use_ocr: bool = False) -> str:
|
|
|
"""
|