| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269 |
- #!/usr/bin/env python3
- # -*- coding: utf-8 -*-
- """
- PDF去水印工具
- 将PDF转换为图片,去除水印后再转回PDF
- """
- from pathlib import Path
- from typing import List, Optional
- import tempfile
- import shutil
- try:
- from PIL import Image
- PIL_AVAILABLE = True
- except ImportError:
- PIL_AVAILABLE = False
def _pdf_to_pil_images(input_pdf: str, dpi: int = 200) -> Optional[List["Image.Image"]]:
    """Render every page of a PDF into a list of PIL images.

    Two back-ends are tried in order:

    1. ``pdf2image.convert_from_path`` (needs the poppler utilities).
    2. ``pypdfium2`` (no poppler needed), used when pdf2image is not
       installed or when its failure looks poppler-related.

    Args:
        input_pdf: Path of the PDF file to render.
        dpi: Target resolution. For pypdfium2 it is converted to a render
            scale of ``dpi / 72``.

    Returns:
        The rendered pages, one image per page, in page order.

    Raises:
        FileNotFoundError: Neither back-end is usable (pypdfium2 cannot be
            imported after pdf2image was skipped).
        Exception: Any pdf2image error that does not mention pdfinfo,
            poppler or a missing file is re-raised unchanged.
    """
    # 1) Preferred back-end: pdf2image. A missing package must NOT abort the
    #    function, otherwise the pypdfium2 fallback below is unreachable.
    try:
        from pdf2image import convert_from_path
    except ImportError:
        convert_from_path = None

    if convert_from_path is not None:
        try:
            return convert_from_path(input_pdf, dpi=dpi)
        except Exception as e:
            err_msg = str(e).lower()
            poppler_missing = any(
                marker in err_msg
                for marker in ("pdfinfo", "poppler", "no such file")
            )
            if not poppler_missing:
                raise
            # poppler is unavailable: fall through to pypdfium2.

    # 2) Fallback back-end: pypdfium2.
    try:
        import pypdfium2 as pdfium
    except ImportError as e:
        # FileNotFoundError is kept as the signal type because the callers
        # in this module handle it explicitly.
        raise FileNotFoundError(
            "PDF 转图片需要 pdf2image+poppler 或 pypdfium2。"
            " 安装其一:apt install poppler-utils 或 pip install pypdfium2"
        ) from e

    pdf = pdfium.PdfDocument(input_pdf)
    try:
        scale = dpi / 72.0
        images = []
        for i in range(len(pdf)):
            page = pdf[i]
            bitmap = page.render(scale=scale)
            try:
                images.append(bitmap.to_pil())
            finally:
                bitmap.close()
        return images
    finally:
        # Best-effort close: a failure here must not mask the real result
        # or the real error.
        try:
            pdf.close()
        except Exception:
            pass
def remove_watermark_from_pdf(
    input_pdf: str,
    output_pdf: str,
    light_threshold: int = 200,
    saturation_threshold: int = 30,
    dpi: int = 200
) -> bool:
    """Remove watermarks from a PDF by rasterising, cleaning and rebuilding it.

    Pipeline:
        1. Render every page of the PDF to an image.
        2. Save each page as PNG in a temporary directory and run
           ``remove_watermark`` (method ``"hsv"``) on it.
        3. Merge the processed images into a new multi-page PDF.

    Args:
        input_pdf: Path of the source PDF.
        output_pdf: Path the resulting PDF is written to.
        light_threshold: Watermark brightness threshold (0-255), forwarded
            to ``remove_watermark``; lighter pixels may be watermark.
        saturation_threshold: Watermark saturation threshold (0-255),
            forwarded to ``remove_watermark``; less saturated pixels may be
            watermark.
        dpi: Resolution used for rendering and stored in the output PDF.

    Returns:
        True when the output PDF was written. False on any failure (missing
        library, OpenCV unavailable, no pages, processing error); the
        reason is printed rather than raised.
    """
    try:
        from PIL import Image
        from utils.image_preprocessor import remove_watermark, check_opencv_available

        if not check_opencv_available():
            print("⚠ OpenCV 未安装,无法进行去水印处理")
            return False

        temp_dir = tempfile.mkdtemp(prefix="pdf_watermark_")
        temp_path = Path(temp_dir)
        # Declared before the try so the cleanup below can always close
        # whatever was opened, even after a mid-loop failure.
        processed_images = []

        try:
            print(f"正在将PDF转换为图片(DPI={dpi})...")
            images = _pdf_to_pil_images(input_pdf, dpi=dpi)
            if not images:
                print("⚠ 未得到任何页面图片")
                return False
            print(f"✓ 转换完成,共 {len(images)} 页")

            total = len(images)
            for i, image in enumerate(images, 1):
                print(f"处理第 {i}/{total} 页...", end='\r')

                # The watermark remover works on files, so persist the page.
                original_path = temp_path / f"page_{i}_original.png"
                image.save(str(original_path), "PNG")

                nowm_path = temp_path / f"page_{i}_nowm.png"
                processed_path = remove_watermark(
                    str(original_path),
                    output_path=str(nowm_path),
                    light_threshold=light_threshold,
                    saturation_threshold=saturation_threshold,
                    method="hsv"
                )

                # Image.open is lazy and keeps the file handle open; the
                # handle is released in the finally block below.
                processed_images.append(Image.open(processed_path))

            print("\n✓ 所有页面处理完成")

            print("正在生成PDF...")
            if not processed_images:
                print("⚠ 没有处理任何图片")
                return False

            # The first image drives the save; the rest become extra pages.
            first_image, *other_images = processed_images
            first_image.save(
                output_pdf,
                "PDF",
                resolution=dpi,
                save_all=True,
                append_images=other_images
            )
            print(f"✓ PDF生成完成: {output_pdf}")
            return True

        finally:
            # Close image handles first: open files leak descriptors and
            # prevent the directory from being removed on Windows.
            for opened in processed_images:
                try:
                    opened.close()
                except Exception as e:
                    print(f"⚠ 清理临时目录失败: {e}")
            try:
                shutil.rmtree(temp_dir)
            except Exception as e:
                print(f"⚠ 清理临时目录失败: {e}")

    except ImportError as e:
        print(f"⚠ 缺少必要的库: {e}")
        print("请安装: pip install pypdfium2 或 pdf2image pillow PyPDF2 opencv-python;pdf2image 需系统安装 poppler-utils")
        return False
    except FileNotFoundError as e:
        print(f"⚠ {e}")
        return False
    except Exception as e:
        print(f"⚠ 去水印处理失败: {e}")
        import traceback
        traceback.print_exc()
        return False
def crop_header_footer_from_pdf(
    input_pdf: str,
    output_pdf: str,
    header_ratio: float = 0.05,
    footer_ratio: float = 0.05,
    auto_detect: bool = False,
    dpi: int = 200
) -> bool:
    """Crop page headers and footers from a PDF via an image round-trip.

    Pipeline:
        1. Render every page of the PDF to an image.
        2. Save each page as PNG in a temporary directory and run
           ``crop_header_footer`` on it.
        3. Merge the cropped images into a new multi-page PDF.

    Args:
        input_pdf: Path of the source PDF.
        output_pdf: Path the resulting PDF is written to.
        header_ratio: Fraction (0-1) of the page cropped from the top;
            0.05 crops the top 5%. Forwarded to ``crop_header_footer``.
        footer_ratio: Fraction (0-1) of the page cropped from the bottom;
            0.05 crops the bottom 5%. Forwarded to ``crop_header_footer``.
        auto_detect: Whether header/footer boundaries are detected
            automatically. Forwarded to ``crop_header_footer``.
        dpi: Resolution used for rendering and stored in the output PDF.

    Returns:
        True when the output PDF was written. False on any failure (missing
        library, OpenCV unavailable, no pages, processing error); the
        reason is printed rather than raised.
    """
    try:
        from PIL import Image
        from utils.image_preprocessor import crop_header_footer, check_opencv_available

        if not check_opencv_available():
            print("⚠ OpenCV 未安装,无法进行页眉页脚裁剪")
            return False

        temp_dir = tempfile.mkdtemp(prefix="pdf_crop_hf_")
        temp_path = Path(temp_dir)
        # Declared before the try so the cleanup below can always close
        # whatever was opened, even after a mid-loop failure.
        processed_images = []

        try:
            print(f"正在将 PDF 转换为图片(DPI={dpi})...")
            images = _pdf_to_pil_images(input_pdf, dpi=dpi)
            if not images:
                print("⚠ 未得到任何页面图片")
                return False
            print(f"✓ 转换完成,共 {len(images)} 页")

            total = len(images)
            for i, image in enumerate(images, 1):
                print(f"处理第 {i}/{total} 页...", end="\r")

                # The cropper works on files, so persist the page first.
                original_path = temp_path / f"page_{i}_original.png"
                image.save(str(original_path), "PNG")

                cropped_path = temp_path / f"page_{i}_cropped.png"
                crop_header_footer(
                    str(original_path),
                    output_path=str(cropped_path),
                    header_ratio=header_ratio,
                    footer_ratio=footer_ratio,
                    auto_detect=auto_detect,
                )

                # Image.open is lazy and keeps the file handle open; the
                # handle is released in the finally block below.
                processed_images.append(Image.open(cropped_path))

            print("\n✓ 所有页面处理完成")

            print("正在生成 PDF...")
            if not processed_images:
                print("⚠ 没有处理任何图片")
                return False

            # The first image drives the save; the rest become extra pages.
            first_image, *other_images = processed_images
            first_image.save(
                output_pdf,
                "PDF",
                resolution=dpi,
                save_all=True,
                append_images=other_images,
            )
            print(f"✓ PDF 生成完成: {output_pdf}")
            return True

        finally:
            # Close image handles first: open files leak descriptors and
            # prevent the directory from being removed on Windows.
            for opened in processed_images:
                try:
                    opened.close()
                except Exception as e:
                    print(f"⚠ 清理临时目录失败: {e}")
            try:
                shutil.rmtree(temp_dir)
            except Exception as e:
                print(f"⚠ 清理临时目录失败: {e}")

    except ImportError as e:
        print(f"⚠ 缺少必要的库: {e}")
        print("请安装: pip install pypdfium2 或 pdf2image pillow opencv-python;pdf2image 需系统安装 poppler-utils")
        return False
    except FileNotFoundError as e:
        print(f"⚠ {e}")
        return False
    except Exception as e:
        print(f"⚠ 页眉页脚裁剪失败: {e}")
        import traceback
        traceback.print_exc()
        return False
|