pdf_splitter.py 3.3 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798
  1. # -*- coding: utf-8 -*-
  2. """
  3. PDF 按页切割工具
  4. 将大 PDF 按固定页数切分为多个小 PDF,用于分段转换以降低单次请求内存。
  5. """
  6. import tempfile
  7. from pathlib import Path
  8. from typing import List
  9. from .logging_config import get_logger
# Module-level logger used by the functions below.
logger = get_logger("pdf_converter_v2.utils.pdf_splitter")

# PyPDF2 is treated as an optional dependency: remember whether the import
# succeeded so callers get a clear RuntimeError (see _require_pypdf2) rather
# than a NameError the first time PyPDF2 is referenced.
try:
    import PyPDF2
    PYPDF2_AVAILABLE = True
except ImportError:
    PYPDF2_AVAILABLE = False

# Message carried by the RuntimeError raised when PyPDF2 is not installed.
PDF2_REQUIRED_MSG = "PyPDF2 未安装,无法进行 PDF 切割与页数检测。请安装: pip install PyPDF2"
  17. def _require_pypdf2() -> None:
  18. if not PYPDF2_AVAILABLE:
  19. raise RuntimeError(PDF2_REQUIRED_MSG)
  20. def get_pdf_page_count(pdf_path: str) -> int:
  21. """获取 PDF 页数。若无法读取则返回 0。未安装 PyPDF2 时抛出 RuntimeError。"""
  22. _require_pypdf2()
  23. try:
  24. with open(pdf_path, "rb") as f:
  25. reader = PyPDF2.PdfReader(f)
  26. return len(reader.pages)
  27. except Exception as e:
  28. logger.warning(f"[PDF切割] 获取页数失败 {pdf_path}: {e}")
  29. return 0
  30. def split_pdf_by_pages(
  31. input_pdf: str,
  32. output_dir: str,
  33. chunk_size: int = 50,
  34. ) -> List[str]:
  35. """
  36. 将 PDF 按 chunk_size 页一段切分为多个临时 PDF 文件。
  37. :param input_pdf: 输入 PDF 路径
  38. :param output_dir: 存放切分后 PDF 的目录(一般为临时目录)
  39. :param chunk_size: 每段页数,默认 50
  40. :return: 切分后的 PDF 文件路径列表,按页码顺序;失败返回空列表。未安装 PyPDF2 时抛出 RuntimeError。
  41. """
  42. _require_pypdf2()
  43. input_path = Path(input_pdf)
  44. out_path = Path(output_dir)
  45. out_path.mkdir(parents=True, exist_ok=True)
  46. # 整个切割过程保持文件打开,否则 writer.add_page(reader.pages[i]) 时 PyPDF2 会从已关闭的 stream 读取,触发 seek of closed file
  47. try:
  48. with open(input_pdf, "rb") as f:
  49. reader = PyPDF2.PdfReader(f)
  50. total = len(reader.pages)
  51. if total <= 0:
  52. return []
  53. if total <= chunk_size:
  54. return [input_pdf]
  55. chunk_paths: List[str] = []
  56. start = 0
  57. idx = 0
  58. while start < total:
  59. end = min(start + chunk_size, total)
  60. chunk_pdf = out_path / f"chunk_{idx}_{input_path.stem}.pdf"
  61. try:
  62. writer = PyPDF2.PdfWriter()
  63. for i in range(start, end):
  64. writer.add_page(reader.pages[i])
  65. with open(chunk_pdf, "wb") as w:
  66. writer.write(w)
  67. chunk_paths.append(str(chunk_pdf))
  68. logger.info(f"[PDF切割] 段 {idx + 1}: 页 {start + 1}-{end}/{total} -> {chunk_pdf.name}")
  69. except Exception as e:
  70. logger.exception(f"[PDF切割] 写入段 {idx} 失败: {e}")
  71. for p in chunk_paths:
  72. try:
  73. Path(p).unlink(missing_ok=True)
  74. except Exception:
  75. pass
  76. return []
  77. start = end
  78. idx += 1
  79. return chunk_paths
  80. except Exception as e:
  81. logger.error(f"[PDF切割] 读取 PDF 失败 {input_pdf}: {e}")
  82. return []