pdf_watermark_remover.py 4.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122
  1. #!/usr/bin/env python3
  2. # -*- coding: utf-8 -*-
  3. """
  4. PDF去水印工具
  5. 将PDF转换为图片,去除水印后再转回PDF
  6. """
  7. from pathlib import Path
  8. from typing import Optional
  9. import tempfile
  10. import shutil
  11. def remove_watermark_from_pdf(
  12. input_pdf: str,
  13. output_pdf: str,
  14. light_threshold: int = 200,
  15. saturation_threshold: int = 30,
  16. dpi: int = 200
  17. ) -> bool:
  18. """
  19. 对PDF文件进行去水印处理
  20. 处理流程:
  21. 1. 将PDF的每一页转换为图片
  22. 2. 对每张图片进行去水印处理
  23. 3. 将处理后的图片合并为新的PDF
  24. Args:
  25. input_pdf: 输入PDF文件路径
  26. output_pdf: 输出PDF文件路径
  27. light_threshold: 水印亮度阈值(0-255),高于此值的浅色像素可能是水印
  28. saturation_threshold: 水印饱和度阈值(0-255),低于此值的低饱和度像素可能是水印
  29. dpi: PDF转图片的DPI,影响图片质量和处理速度
  30. Returns:
  31. bool: 是否成功
  32. """
  33. try:
  34. # 导入必要的库
  35. from pdf2image import convert_from_path
  36. from PIL import Image
  37. import PyPDF2
  38. from utils.image_preprocessor import remove_watermark, check_opencv_available
  39. # 检查OpenCV是否可用
  40. if not check_opencv_available():
  41. print("⚠ OpenCV 未安装,无法进行去水印处理")
  42. return False
  43. # 创建临时目录
  44. temp_dir = tempfile.mkdtemp(prefix="pdf_watermark_")
  45. temp_path = Path(temp_dir)
  46. try:
  47. print(f"正在将PDF转换为图片(DPI={dpi})...")
  48. # 将PDF转换为图片
  49. images = convert_from_path(input_pdf, dpi=dpi)
  50. print(f"✓ 转换完成,共 {len(images)} 页")
  51. # 处理每一页
  52. processed_images = []
  53. for i, image in enumerate(images, 1):
  54. print(f"处理第 {i}/{len(images)} 页...", end='\r')
  55. # 保存原始图片
  56. original_path = temp_path / f"page_{i}_original.png"
  57. image.save(str(original_path), "PNG")
  58. # 去水印
  59. nowm_path = temp_path / f"page_{i}_nowm.png"
  60. processed_path = remove_watermark(
  61. str(original_path),
  62. output_path=str(nowm_path),
  63. light_threshold=light_threshold,
  64. saturation_threshold=saturation_threshold,
  65. method="hsv"
  66. )
  67. # 加载处理后的图片
  68. processed_img = Image.open(processed_path)
  69. processed_images.append(processed_img)
  70. print(f"\n✓ 所有页面处理完成")
  71. # 将图片合并为PDF
  72. print("正在生成PDF...")
  73. if processed_images:
  74. # 第一张图片作为主图片
  75. first_image = processed_images[0]
  76. # 其余图片作为附加页
  77. other_images = processed_images[1:] if len(processed_images) > 1 else []
  78. # 保存为PDF
  79. first_image.save(
  80. output_pdf,
  81. "PDF",
  82. resolution=dpi,
  83. save_all=True,
  84. append_images=other_images
  85. )
  86. print(f"✓ PDF生成完成: {output_pdf}")
  87. return True
  88. else:
  89. print("⚠ 没有处理任何图片")
  90. return False
  91. finally:
  92. # 清理临时目录
  93. try:
  94. shutil.rmtree(temp_dir)
  95. except Exception as e:
  96. print(f"⚠ 清理临时目录失败: {e}")
  97. except ImportError as e:
  98. print(f"⚠ 缺少必要的库: {e}")
  99. print("请安装: pip install pdf2image pillow PyPDF2 opencv-python")
  100. return False
  101. except Exception as e:
  102. print(f"⚠ 去水印处理失败: {e}")
  103. import traceback
  104. traceback.print_exc()
  105. return False