pdf_watermark_remover.py 9.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269
  1. #!/usr/bin/env python3
  2. # -*- coding: utf-8 -*-
  3. """
  4. PDF去水印工具
  5. 将PDF转换为图片,去除水印后再转回PDF
  6. """
  7. from pathlib import Path
  8. from typing import List, Optional
  9. import tempfile
  10. import shutil
  11. try:
  12. from PIL import Image
  13. PIL_AVAILABLE = True
  14. except ImportError:
  15. PIL_AVAILABLE = False
  16. def _pdf_to_pil_images(input_pdf: str, dpi: int = 200) -> Optional[List["Image.Image"]]:
  17. """
  18. 将 PDF 转为 PIL 图片列表。优先 pdf2image(需 poppler),失败时用 pypdfium2(无需 poppler)。
  19. """
  20. # 1) 尝试 pdf2image(需系统安装 poppler-utils)
  21. try:
  22. from pdf2image import convert_from_path
  23. return convert_from_path(input_pdf, dpi=dpi)
  24. except Exception as e:
  25. err_msg = str(e).lower()
  26. if "pdfinfo" in err_msg or "poppler" in err_msg or "no such file" in err_msg:
  27. pass # 无 poppler,尝试 pypdfium2
  28. else:
  29. raise
  30. # 2) 备用:pypdfium2(无需 poppler)
  31. try:
  32. import pypdfium2 as pdfium
  33. pdf = pdfium.PdfDocument(input_pdf)
  34. try:
  35. scale = dpi / 72.0
  36. images = []
  37. for i in range(len(pdf)):
  38. page = pdf[i]
  39. bitmap = page.render(scale=scale)
  40. try:
  41. pil_image = bitmap.to_pil()
  42. images.append(pil_image)
  43. finally:
  44. bitmap.close()
  45. return images
  46. finally:
  47. try:
  48. pdf.close()
  49. except Exception:
  50. pass
  51. except ImportError:
  52. raise FileNotFoundError(
  53. "PDF 转图片需要 pdf2image+poppler 或 pypdfium2。"
  54. " 安装其一:apt install poppler-utils 或 pip install pypdfium2"
  55. )
  56. def remove_watermark_from_pdf(
  57. input_pdf: str,
  58. output_pdf: str,
  59. light_threshold: int = 200,
  60. saturation_threshold: int = 30,
  61. dpi: int = 200
  62. ) -> bool:
  63. """
  64. 对PDF文件进行去水印处理
  65. 处理流程:
  66. 1. 将PDF的每一页转换为图片
  67. 2. 对每张图片进行去水印处理
  68. 3. 将处理后的图片合并为新的PDF
  69. Args:
  70. input_pdf: 输入PDF文件路径
  71. output_pdf: 输出PDF文件路径
  72. light_threshold: 水印亮度阈值(0-255),高于此值的浅色像素可能是水印
  73. saturation_threshold: 水印饱和度阈值(0-255),低于此值的低饱和度像素可能是水印
  74. dpi: PDF转图片的DPI,影响图片质量和处理速度
  75. Returns:
  76. bool: 是否成功
  77. """
  78. try:
  79. from PIL import Image
  80. from utils.image_preprocessor import remove_watermark, check_opencv_available
  81. if not check_opencv_available():
  82. print("⚠ OpenCV 未安装,无法进行去水印处理")
  83. return False
  84. temp_dir = tempfile.mkdtemp(prefix="pdf_watermark_")
  85. temp_path = Path(temp_dir)
  86. try:
  87. print(f"正在将PDF转换为图片(DPI={dpi})...")
  88. images = _pdf_to_pil_images(input_pdf, dpi=dpi)
  89. if not images:
  90. print("⚠ 未得到任何页面图片")
  91. return False
  92. print(f"✓ 转换完成,共 {len(images)} 页")
  93. # 处理每一页
  94. processed_images = []
  95. for i, image in enumerate(images, 1):
  96. print(f"处理第 {i}/{len(images)} 页...", end='\r')
  97. # 保存原始图片
  98. original_path = temp_path / f"page_{i}_original.png"
  99. image.save(str(original_path), "PNG")
  100. # 去水印
  101. nowm_path = temp_path / f"page_{i}_nowm.png"
  102. processed_path = remove_watermark(
  103. str(original_path),
  104. output_path=str(nowm_path),
  105. light_threshold=light_threshold,
  106. saturation_threshold=saturation_threshold,
  107. method="hsv"
  108. )
  109. # 加载处理后的图片
  110. processed_img = Image.open(processed_path)
  111. processed_images.append(processed_img)
  112. print(f"\n✓ 所有页面处理完成")
  113. # 将图片合并为PDF
  114. print("正在生成PDF...")
  115. if processed_images:
  116. # 第一张图片作为主图片
  117. first_image = processed_images[0]
  118. # 其余图片作为附加页
  119. other_images = processed_images[1:] if len(processed_images) > 1 else []
  120. # 保存为PDF
  121. first_image.save(
  122. output_pdf,
  123. "PDF",
  124. resolution=dpi,
  125. save_all=True,
  126. append_images=other_images
  127. )
  128. print(f"✓ PDF生成完成: {output_pdf}")
  129. return True
  130. else:
  131. print("⚠ 没有处理任何图片")
  132. return False
  133. finally:
  134. # 清理临时目录
  135. try:
  136. shutil.rmtree(temp_dir)
  137. except Exception as e:
  138. print(f"⚠ 清理临时目录失败: {e}")
  139. except ImportError as e:
  140. print(f"⚠ 缺少必要的库: {e}")
  141. print("请安装: pip install pypdfium2 或 pdf2image pillow PyPDF2 opencv-python;pdf2image 需系统安装 poppler-utils")
  142. return False
  143. except FileNotFoundError as e:
  144. print(f"⚠ {e}")
  145. return False
  146. except Exception as e:
  147. print(f"⚠ 去水印处理失败: {e}")
  148. import traceback
  149. traceback.print_exc()
  150. return False
  151. def crop_header_footer_from_pdf(
  152. input_pdf: str,
  153. output_pdf: str,
  154. header_ratio: float = 0.05,
  155. footer_ratio: float = 0.05,
  156. auto_detect: bool = False,
  157. dpi: int = 200
  158. ) -> bool:
  159. """
  160. 对 PDF 文件进行页眉页脚裁剪处理。
  161. 处理流程:
  162. 1. 将 PDF 的每一页转换为图片
  163. 2. 对每张图片进行页眉页脚裁剪
  164. 3. 将处理后的图片合并为新的 PDF
  165. Args:
  166. input_pdf: 输入 PDF 文件路径
  167. output_pdf: 输出 PDF 文件路径
  168. header_ratio: 页眉裁剪比例(0-1),默认 0.05 表示裁剪顶部 5%
  169. footer_ratio: 页脚裁剪比例(0-1),默认 0.05 表示裁剪底部 5%
  170. auto_detect: 是否自动检测页眉页脚边界
  171. dpi: PDF 转图片的 DPI
  172. Returns:
  173. bool: 是否成功
  174. """
  175. try:
  176. from PIL import Image
  177. from utils.image_preprocessor import crop_header_footer, check_opencv_available
  178. if not check_opencv_available():
  179. print("⚠ OpenCV 未安装,无法进行页眉页脚裁剪")
  180. return False
  181. temp_dir = tempfile.mkdtemp(prefix="pdf_crop_hf_")
  182. temp_path = Path(temp_dir)
  183. try:
  184. print(f"正在将 PDF 转换为图片(DPI={dpi})...")
  185. images = _pdf_to_pil_images(input_pdf, dpi=dpi)
  186. if not images:
  187. print("⚠ 未得到任何页面图片")
  188. return False
  189. print(f"✓ 转换完成,共 {len(images)} 页")
  190. processed_images = []
  191. for i, image in enumerate(images, 1):
  192. print(f"处理第 {i}/{len(images)} 页...", end="\r")
  193. original_path = temp_path / f"page_{i}_original.png"
  194. image.save(str(original_path), "PNG")
  195. cropped_path = temp_path / f"page_{i}_cropped.png"
  196. crop_header_footer(
  197. str(original_path),
  198. output_path=str(cropped_path),
  199. header_ratio=header_ratio,
  200. footer_ratio=footer_ratio,
  201. auto_detect=auto_detect,
  202. )
  203. processed_img = Image.open(cropped_path)
  204. processed_images.append(processed_img)
  205. print("\n✓ 所有页面处理完成")
  206. print("正在生成 PDF...")
  207. if processed_images:
  208. first_image = processed_images[0]
  209. other_images = processed_images[1:] if len(processed_images) > 1 else []
  210. first_image.save(
  211. output_pdf,
  212. "PDF",
  213. resolution=dpi,
  214. save_all=True,
  215. append_images=other_images,
  216. )
  217. print(f"✓ PDF 生成完成: {output_pdf}")
  218. return True
  219. else:
  220. print("⚠ 没有处理任何图片")
  221. return False
  222. finally:
  223. try:
  224. shutil.rmtree(temp_dir)
  225. except Exception as e:
  226. print(f"⚠ 清理临时目录失败: {e}")
  227. except ImportError as e:
  228. print(f"⚠ 缺少必要的库: {e}")
  229. print("请安装: pip install pypdfium2 或 pdf2image pillow opencv-python;pdf2image 需系统安装 poppler-utils")
  230. return False
  231. except FileNotFoundError as e:
  232. print(f"⚠ {e}")
  233. return False
  234. except Exception as e:
  235. print(f"⚠ 页眉页脚裁剪失败: {e}")
  236. import traceback
  237. traceback.print_exc()
  238. return False