| 1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162 |
- # Copyright (c) Opendatalab. All rights reserved.
- """
- 文件处理工具函数
- """
- import os
- import re
- from pathlib import Path
- from typing import Tuple
- from .logging_config import get_logger
# Module-level logger used by the helper functions in this file.
logger = get_logger("pdf_converter_v2.utils.file")
def safe_stem(file_path) -> str:
    """Return the stem of ``file_path`` with unsafe characters replaced.

    The stem (final path component without its last suffix) is obtained via
    ``pathlib.Path``. Every character that is neither a regex word character
    (Unicode-aware, so CJK file names are kept) nor a dot is replaced by an
    underscore.

    A result that is empty or consists only of dots (for example the stem of
    ``".."``) is not usable as a file name component, so its dots are turned
    into underscores and an empty result becomes ``"_"``.

    Args:
        file_path: Path (string or path-like) whose stem should be sanitized.

    Returns:
        The sanitized, never-empty stem.
    """
    stem = Path(file_path).stem
    cleaned = re.sub(r'[^\w.]', '_', stem)
    # Dots are allowed by the filter above, so "." / ".." style stems would
    # pass through unchanged and could escape the target directory when the
    # result is joined into a path. Neutralize them here.
    if not cleaned.strip("."):
        return cleaned.replace(".", "_") or "_"
    return cleaned
def check_pdf_has_text_layer(
    pdf_path: str,
    min_text_length: int = 100,
    max_pages: int = 5,
) -> Tuple[bool, str]:
    """Check whether a PDF has a usable text layer.

    Text is extracted with ``pdfplumber`` from the leading pages of the
    document and accumulated page by page. As soon as the accumulated length
    reaches ``min_text_length`` the function returns early without reading
    the remaining pages.

    Args:
        pdf_path: Path of the PDF file to inspect.
        min_text_length: Minimum number of extracted characters required to
            consider the text layer valid.
        max_pages: Maximum number of leading pages to inspect. Values below
            zero are treated as zero.

    Returns:
        Tuple[bool, str]: ``(has_text_layer, extracted_text)``. The text is
        whatever was extracted up to the point the decision was made. On any
        failure (including ``pdfplumber`` not being importable) the error is
        logged and ``(False, "")`` is returned.
    """
    try:
        # Imported lazily so that a missing pdfplumber is handled by the
        # best-effort except clause below instead of breaking module import.
        import pdfplumber

        page_texts = []
        total_chars = 0
        with pdfplumber.open(pdf_path) as pdf:
            # Clamp to [0, page count]; a negative slice bound would otherwise
            # select pages from the end of the document.
            page_limit = min(max(max_pages, 0), len(pdf.pages))
            for page_number, page in enumerate(pdf.pages[:page_limit], start=1):
                # extract_text() may yield None for pages without text.
                page_text = page.extract_text() or ""
                page_texts.append(page_text)
                total_chars += len(page_text)

                # Enough text collected: stop early.
                if total_chars >= min_text_length:
                    logger.info(f"[PDF检测] 文件有文本层,前{page_number}页提取到 {total_chars} 字符")
                    return True, "".join(page_texts)

        text_content = "".join(page_texts)
        # Only reachable as True when no page was inspected and the threshold
        # is <= 0; kept so that edge case behaves as before.
        if len(text_content) >= min_text_length:
            logger.info(f"[PDF检测] 文件有文本层,共提取到 {len(text_content)} 字符")
            return True, text_content

        logger.warning(f"[PDF检测] 文件文本层不足,仅提取到 {len(text_content)} 字符(阈值: {min_text_length})")
        return False, text_content

    except Exception as e:
        # Deliberate best-effort: any failure means "no text layer detected".
        logger.error(f"[PDF检测] 检测文本层失败: {e}")
        return False, ""
|