| 1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798 |
- # -*- coding: utf-8 -*-
- """
- PDF 按页切割工具
- 将大 PDF 按固定页数切分为多个小 PDF,用于分段转换以降低单次请求内存。
- """
- import tempfile
- from pathlib import Path
- from typing import List
- from .logging_config import get_logger
- logger = get_logger("pdf_converter_v2.utils.pdf_splitter")
# PyPDF2 is an optional dependency: remember whether the import succeeded so
# the public helpers can fail with a clear message instead of a NameError.
try:
    import PyPDF2
except ImportError:
    PYPDF2_AVAILABLE = False
else:
    PYPDF2_AVAILABLE = True

# Error text raised by _require_pypdf2() when PyPDF2 is missing.
PDF2_REQUIRED_MSG = "PyPDF2 未安装,无法进行 PDF 切割与页数检测。请安装: pip install PyPDF2"
def _require_pypdf2() -> None:
    """Guard that aborts when the optional PyPDF2 import did not succeed.

    :raises RuntimeError: if the module-level ``PYPDF2_AVAILABLE`` flag is
        false; the message is ``PDF2_REQUIRED_MSG``.
    """
    if PYPDF2_AVAILABLE:
        return
    raise RuntimeError(PDF2_REQUIRED_MSG)
def get_pdf_page_count(pdf_path: str) -> int:
    """Return the number of pages of the PDF stored at ``pdf_path``.

    Any error raised while opening or parsing the file is logged as a
    warning and reported as a page count of 0.

    :param pdf_path: path of the PDF file to inspect
    :return: page count, or 0 when the file could not be read
    :raises RuntimeError: if PyPDF2 is not installed
    """
    _require_pypdf2()
    try:
        with open(pdf_path, "rb") as stream:
            page_total = len(PyPDF2.PdfReader(stream).pages)
    except Exception as e:
        logger.warning(f"[PDF切割] 获取页数失败 {pdf_path}: {e}")
        return 0
    return page_total
def split_pdf_by_pages(
    input_pdf: str,
    output_dir: str,
    chunk_size: int = 50,
) -> List[str]:
    """Split a PDF into consecutive chunk files of at most ``chunk_size`` pages.

    Chunk files are written to ``output_dir`` (created if missing) and named
    ``chunk_<index>_<input stem>.pdf``.

    :param input_pdf: path of the PDF to split
    :param output_dir: directory that receives the chunk files (typically a
        temporary directory)
    :param chunk_size: maximum number of pages per chunk, default 50; must be
        a positive integer
    :return: chunk file paths in page order. When the PDF has no more than
        ``chunk_size`` pages no file is written and the list contains
        ``input_pdf`` itself, so callers must not treat every returned path
        as a disposable temporary file. An empty list is returned when the
        PDF has no pages, cannot be read, a chunk cannot be written, or
        ``chunk_size`` is not positive.
    :raises RuntimeError: if PyPDF2 is not installed
    :raises OSError: if ``output_dir`` cannot be created (the directory is
        created outside the error-handling block)
    """
    _require_pypdf2()

    if chunk_size <= 0:
        # A non-positive chunk size would never advance through the pages and
        # would loop forever writing empty files; treat it as a failure.
        logger.error(f"[PDF切割] chunk_size 必须大于 0,当前为 {chunk_size}")
        return []

    input_path = Path(input_pdf)
    out_path = Path(output_dir)
    out_path.mkdir(parents=True, exist_ok=True)

    # Keep the input file open for the whole split: PyPDF2 reads page data
    # lazily, so writer.add_page(reader.pages[i]) after the stream is closed
    # fails with "seek of closed file".
    try:
        with open(input_pdf, "rb") as f:
            reader = PyPDF2.PdfReader(f)
            total = len(reader.pages)
            if total <= 0:
                return []
            if total <= chunk_size:
                # Nothing to split; hand back the original file untouched.
                return [input_pdf]

            chunk_paths: List[str] = []
            for idx, start in enumerate(range(0, total, chunk_size)):
                end = min(start + chunk_size, total)
                chunk_pdf = out_path / f"chunk_{idx}_{input_path.stem}.pdf"
                try:
                    writer = PyPDF2.PdfWriter()
                    for i in range(start, end):
                        writer.add_page(reader.pages[i])
                    with open(chunk_pdf, "wb") as w:
                        writer.write(w)
                    chunk_paths.append(str(chunk_pdf))
                    logger.info(f"[PDF切割] 段 {idx + 1}: 页 {start + 1}-{end}/{total} -> {chunk_pdf.name}")
                except Exception as e:
                    logger.exception(f"[PDF切割] 写入段 {idx} 失败: {e}")
                    # Remove the finished chunks and the possibly half-written
                    # current one so a failed split leaves nothing behind.
                    _discard_chunk_files([*chunk_paths, str(chunk_pdf)])
                    return []
            return chunk_paths
    except Exception as e:
        logger.error(f"[PDF切割] 读取 PDF 失败 {input_pdf}: {e}")
        return []


def _discard_chunk_files(paths: List[str]) -> None:
    """Best-effort removal of chunk files; failures are logged, not raised."""
    for p in paths:
        try:
            Path(p).unlink(missing_ok=True)
        except OSError as cleanup_err:
            logger.warning(f"[PDF切割] 清理临时文件失败 {p}: {cleanup_err}")
|