hewensong
/
Clerk2.5


			
							1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798
							# -*- coding: utf-8 -*-
"""
PDF 按页切割工具
将大 PDF 按固定页数切分为多个小 PDF，用于分段转换以降低单次请求内存。
"""

import tempfile
from pathlib import Path
from typing import List

from .logging_config import get_logger

logger = get_logger("pdf_converter_v2.utils.pdf_splitter")

try:
    import PyPDF2
    PYPDF2_AVAILABLE = True
except ImportError:
    PYPDF2_AVAILABLE = False

PDF2_REQUIRED_MSG = "PyPDF2 未安装，无法进行 PDF 切割与页数检测。请安装: pip install PyPDF2"


def _require_pypdf2() -> None:
    if not PYPDF2_AVAILABLE:
        raise RuntimeError(PDF2_REQUIRED_MSG)


def get_pdf_page_count(pdf_path: str) -> int:
    """获取 PDF 页数。若无法读取则返回 0。未安装 PyPDF2 时抛出 RuntimeError。"""
    _require_pypdf2()
    try:
        with open(pdf_path, "rb") as f:
            reader = PyPDF2.PdfReader(f)
            return len(reader.pages)
    except Exception as e:
        logger.warning(f"[PDF切割] 获取页数失败 {pdf_path}: {e}")
        return 0


def split_pdf_by_pages(
    input_pdf: str,
    output_dir: str,
    chunk_size: int = 50,
) -> List[str]:
    """
    将 PDF 按 chunk_size 页一段切分为多个临时 PDF 文件。

    :param input_pdf: 输入 PDF 路径
    :param output_dir: 存放切分后 PDF 的目录（一般为临时目录）
    :param chunk_size: 每段页数，默认 50
    :return: 切分后的 PDF 文件路径列表，按页码顺序；失败返回空列表。未安装 PyPDF2 时抛出 RuntimeError。
    """
    _require_pypdf2()

    input_path = Path(input_pdf)
    out_path = Path(output_dir)
    out_path.mkdir(parents=True, exist_ok=True)

    # 整个切割过程保持文件打开，否则 writer.add_page(reader.pages[i]) 时 PyPDF2 会从已关闭的 stream 读取，触发 seek of closed file
    try:
        with open(input_pdf, "rb") as f:
            reader = PyPDF2.PdfReader(f)
            total = len(reader.pages)
            if total <= 0:
                return []
            if total <= chunk_size:
                return [input_pdf]

            chunk_paths: List[str] = []
            start = 0
            idx = 0
            while start < total:
                end = min(start + chunk_size, total)
                chunk_pdf = out_path / f"chunk_{idx}_{input_path.stem}.pdf"
                try:
                    writer = PyPDF2.PdfWriter()
                    for i in range(start, end):
                        writer.add_page(reader.pages[i])
                    with open(chunk_pdf, "wb") as w:
                        writer.write(w)
                    chunk_paths.append(str(chunk_pdf))
                    logger.info(f"[PDF切割] 段 {idx + 1}: 页 {start + 1}-{end}/{total} -> {chunk_pdf.name}")
                except Exception as e:
                    logger.exception(f"[PDF切割] 写入段 {idx} 失败: {e}")
                    for p in chunk_paths:
                        try:
                            Path(p).unlink(missing_ok=True)
                        except Exception:
                            pass
                    return []
                start = end
                idx += 1

            return chunk_paths
    except Exception as e:
        logger.error(f"[PDF切割] 读取 PDF 失败 {input_pdf}: {e}")
        return []