| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122 |
- #!/usr/bin/env python3
- # -*- coding: utf-8 -*-
- """
- PDF去水印工具
- 将PDF转换为图片,去除水印后再转回PDF
- """
- from pathlib import Path
- from typing import Optional
- import tempfile
- import shutil
- def remove_watermark_from_pdf(
- input_pdf: str,
- output_pdf: str,
- light_threshold: int = 200,
- saturation_threshold: int = 30,
- dpi: int = 200
- ) -> bool:
- """
- 对PDF文件进行去水印处理
-
- 处理流程:
- 1. 将PDF的每一页转换为图片
- 2. 对每张图片进行去水印处理
- 3. 将处理后的图片合并为新的PDF
-
- Args:
- input_pdf: 输入PDF文件路径
- output_pdf: 输出PDF文件路径
- light_threshold: 水印亮度阈值(0-255),高于此值的浅色像素可能是水印
- saturation_threshold: 水印饱和度阈值(0-255),低于此值的低饱和度像素可能是水印
- dpi: PDF转图片的DPI,影响图片质量和处理速度
-
- Returns:
- bool: 是否成功
- """
- try:
- # 导入必要的库
- from pdf2image import convert_from_path
- from PIL import Image
- import PyPDF2
- from utils.image_preprocessor import remove_watermark, check_opencv_available
-
- # 检查OpenCV是否可用
- if not check_opencv_available():
- print("⚠ OpenCV 未安装,无法进行去水印处理")
- return False
-
- # 创建临时目录
- temp_dir = tempfile.mkdtemp(prefix="pdf_watermark_")
- temp_path = Path(temp_dir)
-
- try:
- print(f"正在将PDF转换为图片(DPI={dpi})...")
- # 将PDF转换为图片
- images = convert_from_path(input_pdf, dpi=dpi)
- print(f"✓ 转换完成,共 {len(images)} 页")
-
- # 处理每一页
- processed_images = []
- for i, image in enumerate(images, 1):
- print(f"处理第 {i}/{len(images)} 页...", end='\r')
-
- # 保存原始图片
- original_path = temp_path / f"page_{i}_original.png"
- image.save(str(original_path), "PNG")
-
- # 去水印
- nowm_path = temp_path / f"page_{i}_nowm.png"
- processed_path = remove_watermark(
- str(original_path),
- output_path=str(nowm_path),
- light_threshold=light_threshold,
- saturation_threshold=saturation_threshold,
- method="hsv"
- )
-
- # 加载处理后的图片
- processed_img = Image.open(processed_path)
- processed_images.append(processed_img)
-
- print(f"\n✓ 所有页面处理完成")
-
- # 将图片合并为PDF
- print("正在生成PDF...")
- if processed_images:
- # 第一张图片作为主图片
- first_image = processed_images[0]
- # 其余图片作为附加页
- other_images = processed_images[1:] if len(processed_images) > 1 else []
-
- # 保存为PDF
- first_image.save(
- output_pdf,
- "PDF",
- resolution=dpi,
- save_all=True,
- append_images=other_images
- )
- print(f"✓ PDF生成完成: {output_pdf}")
- return True
- else:
- print("⚠ 没有处理任何图片")
- return False
-
- finally:
- # 清理临时目录
- try:
- shutil.rmtree(temp_dir)
- except Exception as e:
- print(f"⚠ 清理临时目录失败: {e}")
-
- except ImportError as e:
- print(f"⚠ 缺少必要的库: {e}")
- print("请安装: pip install pdf2image pillow PyPDF2 opencv-python")
- return False
- except Exception as e:
- print(f"⚠ 去水印处理失败: {e}")
- import traceback
- traceback.print_exc()
- return False
|