hewensong
/
Clerk2.5


			
							1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162
							# Copyright (c) Opendatalab. All rights reserved.

"""
文件处理工具函数
"""

import os
import re
from pathlib import Path
from typing import Tuple

from .logging_config import get_logger

logger = get_logger("pdf_converter_v2.utils.file")


def safe_stem(file_path):
    """安全地提取文件名（去除不安全字符）"""
    stem = Path(file_path).stem
    return re.sub(r'[^\w.]', '_', stem)


def check_pdf_has_text_layer(pdf_path: str, min_text_length: int = 100) -> Tuple[bool, str]:
    """
    检查 PDF 是否有文本层
    
    Args:
        pdf_path: PDF 文件路径
        min_text_length: 最小文本长度阈值，低于此值认为没有有效文本层
        
    Returns:
        Tuple[bool, str]: (是否有文本层, 提取的文本内容)
    """
    try:
        import pdfplumber
        
        text_content = ""
        with pdfplumber.open(pdf_path) as pdf:
            # 检查前几页的文本
            pages_to_check = min(5, len(pdf.pages))
            for i in range(pages_to_check):
                page = pdf.pages[i]
                page_text = page.extract_text() or ""
                text_content += page_text
                
                # 如果已经有足够的文本，直接返回
                if len(text_content) >= min_text_length:
                    logger.info(f"[PDF检测] 文件有文本层，前{i+1}页提取到 {len(text_content)} 字符")
                    return True, text_content
        
        # 检查总文本长度
        if len(text_content) >= min_text_length:
            logger.info(f"[PDF检测] 文件有文本层，共提取到 {len(text_content)} 字符")
            return True, text_content
        else:
            logger.warning(f"[PDF检测] 文件文本层不足，仅提取到 {len(text_content)} 字符（阈值: {min_text_length}）")
            return False, text_content
            
    except Exception as e:
        logger.error(f"[PDF检测] 检测文本层失败: {e}")
        return False, ""