# file_utils.py (2.0 KB)
  1. # Copyright (c) Opendatalab. All rights reserved.
  2. """
  3. 文件处理工具函数
  4. """
  5. import os
  6. import re
  7. from pathlib import Path
  8. from typing import Tuple
  9. from .logging_config import get_logger
# Module-level logger for this file, obtained from the package's logging_config helper.
logger = get_logger("pdf_converter_v2.utils.file")
  11. def safe_stem(file_path):
  12. """安全地提取文件名(去除不安全字符)"""
  13. stem = Path(file_path).stem
  14. return re.sub(r'[^\w.]', '_', stem)
  15. def check_pdf_has_text_layer(pdf_path: str, min_text_length: int = 100) -> Tuple[bool, str]:
  16. """
  17. 检查 PDF 是否有文本层
  18. Args:
  19. pdf_path: PDF 文件路径
  20. min_text_length: 最小文本长度阈值,低于此值认为没有有效文本层
  21. Returns:
  22. Tuple[bool, str]: (是否有文本层, 提取的文本内容)
  23. """
  24. try:
  25. import pdfplumber
  26. text_content = ""
  27. with pdfplumber.open(pdf_path) as pdf:
  28. # 检查前几页的文本
  29. pages_to_check = min(5, len(pdf.pages))
  30. for i in range(pages_to_check):
  31. page = pdf.pages[i]
  32. page_text = page.extract_text() or ""
  33. text_content += page_text
  34. # 如果已经有足够的文本,直接返回
  35. if len(text_content) >= min_text_length:
  36. logger.info(f"[PDF检测] 文件有文本层,前{i+1}页提取到 {len(text_content)} 字符")
  37. return True, text_content
  38. # 检查总文本长度
  39. if len(text_content) >= min_text_length:
  40. logger.info(f"[PDF检测] 文件有文本层,共提取到 {len(text_content)} 字符")
  41. return True, text_content
  42. else:
  43. logger.warning(f"[PDF检测] 文件文本层不足,仅提取到 {len(text_content)} 字符(阈值: {min_text_length})")
  44. return False, text_content
  45. except Exception as e:
  46. logger.error(f"[PDF检测] 检测文本层失败: {e}")
  47. return False, ""