#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
PDF attachment-page detection and splitting tool.

Supports OCR for scanned PDFs.
"""
import io
import re
from pathlib import Path

import pdfplumber
from PIL import Image

from utils.logging_config import get_logger

# Module-level logger.
logger = get_logger("pdf_converter_v2.attachment_splitter")

# Optional dependency: Tesseract OCR through pytesseract. The flag is checked
# before any OCR call so the tool can still work on PDFs with a text layer.
try:
    import pytesseract
    TESSERACT_AVAILABLE = True
    logger.info("[附件切割] Tesseract OCR 可用")
except ImportError:
    TESSERACT_AVAILABLE = False
    logger.error("[附件切割] Tesseract OCR 不可用,请安装: apt install tesseract-ocr tesseract-ocr-chi-sim && pip install pytesseract")

# Optional dependency: PyPDF2, required only for writing the split PDF.
try:
    import PyPDF2
    PYPDF2_AVAILABLE = True
    logger.info("[附件切割] PyPDF2 可用")
except ImportError:
    PYPDF2_AVAILABLE = False
    logger.error("[附件切割] PyPDF2 未安装,无法切割 PDF")
    logger.info("[附件切割] 安装命令: pip install PyPDF2")
  31. # 配置
  32. PDF_PATH = '/home/hws/workspace/GitLab/Clerk2.5/pdf_converter_v2/2-数据源/1-(可研评审)晋电经研规划〔2017〕187号(盖章)国网山西经研院关于山西晋城周村220kV输变电工程可行性研究报告的评审意见.pdf'
  33. OUTPUT_DIR = Path('附件页')
  34. USE_OCR = True # 是否启用 OCR
  35. OCR_LANG = 'chi_sim+eng' # OCR 语言
  36. DEBUG_MODE = False # 是否启用调试模式(显示每页的文本内容)
  37. # 去水印配置
  38. REMOVE_WATERMARK = False # 是否对切割后的附件页PDF去水印
  39. WATERMARK_LIGHT_THRESHOLD = 200 # 水印亮度阈值(0-255),高于此值的浅色像素可能是水印
  40. WATERMARK_SATURATION_THRESHOLD = 30 # 水印饱和度阈值(0-255),低于此值的低饱和度像素可能是水印
  41. WATERMARK_DPI = 200 # PDF转图片的DPI(用于去水印)
  42. # 表格附件过滤配置
  43. TABLE_ONLY = True # 是否只保留包含表格的附件页(过滤掉示意图、评审意见等)
  44. # 附件页识别关键词
  45. ATTACHMENT_START_KEYWORDS = [
  46. '附件:',
  47. '附件:',
  48. '附 件:',
  49. '附 件:',
  50. ]
  51. # 表格附件识别关键词(用于过滤只保留包含表格的附件)
  52. TABLE_ATTACHMENT_KEYWORDS = [
  53. '项目表',
  54. '投资估算',
  55. '工程投资',
  56. '建设规模',
  57. '技术方案',
  58. '变电工程',
  59. '线路工程',
  60. '静态投资',
  61. '动态投资',
  62. '单位造价',
  63. '设备购置费',
  64. '安装工程费',
  65. '建筑工程费',
  66. '其他费用',
  67. '基本预备费',
  68. ]
  69. # 非表格附件识别关键词(用于识别需要跳过的附件)
  70. NON_TABLE_ATTACHMENT_KEYWORDS = [
  71. '示意图',
  72. '接入系统示意图',
  73. '母线间隔排列图',
  74. '评审意见',
  75. '技术监督意见',
  76. '参会单位',
  77. '人员一览表',
  78. '经济性评价',
  79. '财务合规',
  80. '审核结果',
  81. '预算编制衔接',
  82. ]
  83. def ocr_page_image(image) -> str:
  84. """
  85. 对图片进行 OCR 识别(使用 Tesseract)
  86. Args:
  87. image: PIL Image 对象
  88. Returns:
  89. str: 识别出的文本
  90. """
  91. if not TESSERACT_AVAILABLE:
  92. logger.warning("[附件切割] Tesseract OCR 不可用,跳过识别")
  93. return ""
  94. try:
  95. text = pytesseract.image_to_string(image, lang=OCR_LANG)
  96. logger.debug(f"[附件切割] Tesseract OCR识别成功,文本长度: {len(text)}")
  97. return text
  98. except Exception as e:
  99. logger.error(f"[附件切割] Tesseract OCR识别失败: {e}")
  100. return ""
  101. def extract_page_text(page, use_ocr: bool = False) -> str:
  102. """
  103. 提取页面文本(支持 OCR)
  104. Args:
  105. page: pdfplumber page 对象
  106. use_ocr: 是否使用 OCR
  107. Returns:
  108. str: 页面文本
  109. """
  110. # 先尝试提取文本层
  111. text = page.extract_text()
  112. if text and text.strip():
  113. logger.debug(f"[附件切割] 第{page.page_number}页: 成功提取文本层,长度: {len(text)}")
  114. return text
  115. # 如果没有文本层,使用 OCR
  116. if use_ocr and (TESSERACT_AVAILABLE or PADDLEOCR_AVAILABLE):
  117. logger.info(f"[附件切割] 第{page.page_number}页: 文本层为空,使用OCR识别")
  118. try:
  119. img = page.to_image(resolution=150) # 降低分辨率加快速度
  120. pil_img = img.original
  121. text = ocr_page_image(pil_img)
  122. return text
  123. except Exception as e:
  124. logger.error(f"[附件切割] 第{page.page_number}页: OCR识别失败: {e}")
  125. return ""
  126. logger.warning(f"[附件切割] 第{page.page_number}页: 无法提取文本(OCR未启用或不可用)")
  127. return ""
  128. def is_table_attachment_page(text: str, page) -> bool:
  129. """
  130. 判断是否是包含表格的附件页
  131. Args:
  132. text: 页面文本
  133. page: pdfplumber page 对象
  134. Returns:
  135. bool: 是否是表格附件页
  136. """
  137. if not text:
  138. return False
  139. text_no_space = text.replace(' ', '').replace('\u3000', '')
  140. # 检查是否包含非表格附件关键词(如示意图、评审意见等)
  141. for keyword in NON_TABLE_ATTACHMENT_KEYWORDS:
  142. keyword_no_space = keyword.replace(' ', '').replace('\u3000', '')
  143. if keyword_no_space in text_no_space:
  144. logger.debug(f"[附件切割] 检测到非表格附件关键词: {keyword}")
  145. return False
  146. # 检查是否包含表格附件关键词
  147. has_table_keyword = False
  148. for keyword in TABLE_ATTACHMENT_KEYWORDS:
  149. keyword_no_space = keyword.replace(' ', '').replace('\u3000', '')
  150. if keyword_no_space in text_no_space:
  151. logger.debug(f"[附件切割] 检测到表格关键词: {keyword}")
  152. has_table_keyword = True
  153. break
  154. # 如果有表格关键词,直接返回True
  155. if has_table_keyword:
  156. return True
  157. # 检查页面是否包含表格(使用pdfplumber的表格检测)
  158. if page is not None:
  159. try:
  160. tables = page.extract_tables()
  161. if tables and len(tables) > 0:
  162. # 检查表格是否足够大(至少有3行3列的数据表格)
  163. for table in tables:
  164. if table and len(table) >= 3:
  165. # 检查是否有多列
  166. non_empty_rows = [row for row in table if row and any(cell for cell in row if cell)]
  167. if len(non_empty_rows) >= 3:
  168. row_with_most_cols = max(non_empty_rows, key=lambda r: len([c for c in r if c]))
  169. if len([c for c in row_with_most_cols if c]) >= 3:
  170. logger.debug(f"[附件切割] 检测到表格: {len(non_empty_rows)}行")
  171. return True
  172. except Exception as e:
  173. logger.warning(f"[附件切割] 表格检测失败: {e}")
  174. return False
  175. def is_attachment_start_page(text: str) -> bool:
  176. """
  177. 判断是否是附件清单页(附件开始的前一页)
  178. Args:
  179. text: 页面文本
  180. Returns:
  181. bool: 是否是附件清单页
  182. """
  183. if not text:
  184. return False
  185. # 去除所有空格后再匹配(处理OCR识别出的空格问题)
  186. text_no_space = text.replace(' ', '').replace('\u3000', '') # 移除普通空格和全角空格
  187. # 检查是否包含"附件:"字样
  188. for keyword in ATTACHMENT_START_KEYWORDS:
  189. keyword_no_space = keyword.replace(' ', '').replace('\u3000', '')
  190. if keyword_no_space in text_no_space:
  191. logger.debug(f"[附件切割] 检测到附件关键词: {keyword}")
  192. return True
  193. # 额外检查:是否包含"附件"后面跟数字(如"附件1"、"附件 1"等)
  194. if re.search(r'附件\s*[0-91234567890一二三四五六七八九十]', text_no_space):
  195. logger.debug("[附件切割] 检测到附件+数字模式")
  196. return True
  197. return False
  198. def find_attachment_start_page(pdf_path: str, use_ocr: bool = False, debug: bool = False) -> int:
  199. """
  200. 查找附件开始的页码
  201. 策略:找到包含"附件:"的页面,附件从下一页开始
  202. Args:
  203. pdf_path: PDF 文件路径
  204. use_ocr: 是否使用 OCR
  205. debug: 是否输出调试信息
  206. Returns:
  207. int: 附件开始页码(从1开始),如果未找到返回 0
  208. """
  209. pdf_path = Path(pdf_path)
  210. if not pdf_path.exists():
  211. logger.error(f"[附件切割] PDF 文件不存在: {pdf_path}")
  212. print(f"⚠ PDF 文件不存在: {pdf_path}")
  213. return 0
  214. logger.info(f"[附件切割] 开始扫描PDF: {pdf_path.name}")
  215. print(f"正在扫描 PDF: {pdf_path.name}")
  216. print("=" * 60)
  217. with pdfplumber.open(pdf_path) as pdf:
  218. total_pages = len(pdf.pages)
  219. logger.info(f"[附件切割] PDF总页数: {total_pages}")
  220. print(f"总页数: {total_pages}")
  221. if use_ocr and not (TESSERACT_AVAILABLE or PADDLEOCR_AVAILABLE):
  222. logger.warning("[附件切割] OCR 不可用(Tesseract和PaddleOCR都不可用),仅检查文本层")
  223. print("⚠ OCR 不可用,仅检查文本层")
  224. use_ocr = False
  225. elif use_ocr:
  226. ocr_tool = "Tesseract" if TESSERACT_AVAILABLE else "PaddleOCR"
  227. logger.info(f"[附件切割] 使用 {ocr_tool} 进行OCR识别")
  228. logger.info(f"[附件切割] OCR: {'启用' if use_ocr else '禁用'}, 调试模式: {'启用' if debug else '禁用'}")
  229. print(f"OCR: {'启用' if use_ocr else '禁用'}")
  230. print(f"调试模式: {'启用' if debug else '禁用'}")
  231. print("=" * 60)
  232. for page_num, page in enumerate(pdf.pages, start=1):
  233. if not debug:
  234. print(f"\r扫描进度: {page_num}/{total_pages}", end='', flush=True)
  235. else:
  236. print(f"\n[DEBUG] 页面 {page_num}/{total_pages}")
  237. # 提取文本
  238. text = extract_page_text(page, use_ocr=use_ocr)
  239. if debug:
  240. # 显示文本长度和前200字符
  241. text_preview = text[:200].replace('\n', ' ') if text else "[无文本]"
  242. print(f" 文本长度: {len(text)} 字符")
  243. print(f" 文本预览: {text_preview}...")
  244. # 去除空格后的文本预览
  245. text_no_space = text.replace(' ', '').replace('\u3000', '')
  246. text_no_space_preview = text_no_space[:100] if text_no_space else "[无文本]"
  247. print(f" 去空格后: {text_no_space_preview}...")
  248. # 检查是否包含关键词
  249. matched_keywords = []
  250. for kw in ATTACHMENT_START_KEYWORDS:
  251. kw_no_space = kw.replace(' ', '').replace('\u3000', '')
  252. if kw_no_space in text_no_space:
  253. matched_keywords.append(kw)
  254. # 检查是否包含"附件"后跟数字
  255. if re.search(r'附件\s*[0-91234567890一二三四五六七八九十]', text_no_space):
  256. matched_keywords.append("附件+数字")
  257. if matched_keywords:
  258. print(f" ✓ 匹配关键词: {', '.join(matched_keywords)}")
  259. else:
  260. print(f" ✗ 未匹配任何关键词")
  261. # 判断是否是附件清单页
  262. if is_attachment_start_page(text):
  263. # 检查是否直接是"附件1"开头(说明当前页就是附件页)
  264. text_no_space = text.replace(' ', '').replace('\u3000', '')
  265. if re.search(r'附件\s*[11一]', text_no_space[:50]): # 检查前50个字符
  266. # 当前页就是附件开始页
  267. attachment_start = page_num
  268. logger.info(f"[附件切割] 发现附件页(直接开始): 第 {page_num} 页")
  269. print(f"\n\n✓ 发现附件页(直接开始): 第 {page_num} 页")
  270. print(f"✓ 附件开始页: 第 {attachment_start} 页")
  271. else:
  272. # 附件从下一页开始
  273. attachment_start = page_num + 1
  274. logger.info(f"[附件切割] 发现附件清单页: 第 {page_num} 页,附件开始页: 第 {attachment_start} 页")
  275. print(f"\n\n✓ 发现附件清单页: 第 {page_num} 页")
  276. print(f"✓ 附件开始页: 第 {attachment_start} 页")
  277. # 显示匹配的关键词
  278. matched_keywords = []
  279. for kw in ATTACHMENT_START_KEYWORDS:
  280. kw_no_space = kw.replace(' ', '').replace('\u3000', '')
  281. if kw_no_space in text_no_space:
  282. matched_keywords.append(kw)
  283. if re.search(r'附件\s*[0-91234567890一二三四五六七八九十]', text_no_space):
  284. matched_keywords.append("附件+数字")
  285. logger.info(f"[附件切割] 匹配关键词: {', '.join(matched_keywords)}")
  286. print(f" 匹配关键词: {', '.join(matched_keywords)}")
  287. # 显示部分文本
  288. preview = text[:300].replace('\n', ' ')
  289. logger.debug(f"[附件切割] 文本预览: {preview}...")
  290. print(f" 文本预览: {preview}...")
  291. return attachment_start
  292. logger.warning("[附件切割] 未找到附件清单页")
  293. print(f"\n\n未找到附件清单页")
  294. return 0
  295. def extract_pages(pdf_path: str, page_numbers: list, output_path: str):
  296. """
  297. 从 PDF 中提取指定页面并保存为新 PDF
  298. Args:
  299. pdf_path: 源 PDF 文件路径
  300. page_numbers: 要提取的页码列表(从1开始)
  301. output_path: 输出 PDF 文件路径
  302. """
  303. if not PYPDF2_AVAILABLE:
  304. logger.error("[附件切割] PyPDF2 未安装,无法切割 PDF")
  305. print("⚠ PyPDF2 未安装,无法切割 PDF")
  306. return
  307. pdf_path = Path(pdf_path)
  308. output_path = Path(output_path)
  309. logger.info(f"[附件切割] 开始提取页面: {page_numbers} 从 {pdf_path.name}")
  310. # 创建输出目录
  311. output_path.parent.mkdir(parents=True, exist_ok=True)
  312. # 读取源 PDF
  313. with open(pdf_path, 'rb') as file:
  314. reader = PyPDF2.PdfReader(file)
  315. writer = PyPDF2.PdfWriter()
  316. total_source_pages = len(reader.pages)
  317. logger.info(f"[附件切割] 源PDF总页数: {total_source_pages}")
  318. # 添加指定页面
  319. extracted_count = 0
  320. for page_num in page_numbers:
  321. if 1 <= page_num <= total_source_pages:
  322. writer.add_page(reader.pages[page_num - 1]) # PyPDF2 页码从0开始
  323. extracted_count += 1
  324. else:
  325. logger.warning(f"[附件切割] 页码 {page_num} 超出范围 (1-{total_source_pages}),跳过")
  326. logger.info(f"[附件切割] 成功提取 {extracted_count}/{len(page_numbers)} 页")
  327. # 保存新 PDF
  328. with open(output_path, 'wb') as output_file:
  329. writer.write(output_file)
  330. logger.info(f"[附件切割] 已保存到: {output_path}")
  331. print(f"✓ 已保存到: {output_path}")
  332. def split_attachment_pages(pdf_path: str, output_dir: Path, use_ocr: bool = False, debug: bool = False,
  333. remove_watermark: bool = False, watermark_light_threshold: int = 200,
  334. watermark_saturation_threshold: int = 30, watermark_dpi: int = 200,
  335. table_only: bool = False):
  336. """
  337. 查找并切割附件页
  338. Args:
  339. pdf_path: PDF 文件路径
  340. output_dir: 输出目录
  341. use_ocr: 是否使用 OCR
  342. debug: 是否输出调试信息
  343. remove_watermark: 是否对切割后的附件页PDF去水印
  344. watermark_light_threshold: 水印亮度阈值(0-255)
  345. watermark_saturation_threshold: 水印饱和度阈值(0-255)
  346. watermark_dpi: PDF转图片的DPI
  347. table_only: 是否只保留包含表格的附件页(过滤掉示意图、评审意见等)
  348. """
  349. logger.info(f"[附件切割] 开始处理PDF: {pdf_path}")
  350. logger.info(f"[附件切割] 只保留表格附件: {'是' if table_only else '否'}")
  351. # 查找附件开始页
  352. attachment_start = find_attachment_start_page(pdf_path, use_ocr=use_ocr, debug=debug)
  353. if attachment_start == 0:
  354. logger.warning(f"[附件切割] 未找到附件页: {pdf_path}")
  355. print("\n未找到附件页")
  356. return
  357. # 获取总页数和筛选表格附件页
  358. with pdfplumber.open(pdf_path) as pdf:
  359. total_pages = len(pdf.pages)
  360. if table_only:
  361. # 只保留包含表格的附件页
  362. logger.info(f"[附件切割] 启用表格附件过滤,开始筛选...")
  363. print(f"\n启用表格附件过滤,开始筛选...")
  364. attachment_pages = []
  365. current_table_section = [] # 当前表格区段的页面
  366. in_table_section = False # 是否在表格区段内
  367. for page_num in range(attachment_start, total_pages + 1):
  368. page = pdf.pages[page_num - 1]
  369. text = extract_page_text(page, use_ocr=use_ocr)
  370. is_table_page = is_table_attachment_page(text, page)
  371. if debug:
  372. print(f" 页面 {page_num}: {'表格页' if is_table_page else '非表格页'}")
  373. if is_table_page:
  374. if not in_table_section:
  375. # 开始新的表格区段
  376. in_table_section = True
  377. current_table_section = [page_num]
  378. logger.debug(f"[附件切割] 开始表格区段: 第 {page_num} 页")
  379. else:
  380. # 继续当前表格区段
  381. current_table_section.append(page_num)
  382. else:
  383. if in_table_section:
  384. # 结束当前表格区段,保存
  385. attachment_pages.extend(current_table_section)
  386. logger.info(f"[附件切割] 表格区段结束: {current_table_section[0]}-{current_table_section[-1]}")
  387. current_table_section = []
  388. in_table_section = False
  389. # 处理最后一个表格区段
  390. if in_table_section and current_table_section:
  391. attachment_pages.extend(current_table_section)
  392. logger.info(f"[附件切割] 最后表格区段: {current_table_section[0]}-{current_table_section[-1]}")
  393. if not attachment_pages:
  394. logger.warning(f"[附件切割] 未找到包含表格的附件页")
  395. print("\n未找到包含表格的附件页")
  396. return
  397. logger.info(f"[附件切割] 筛选后的表格附件页: {attachment_pages}")
  398. print(f"\n筛选后的表格附件页: {attachment_pages}")
  399. print(f"共 {len(attachment_pages)} 页")
  400. else:
  401. # 附件页范围:从附件开始页到最后一页
  402. attachment_pages = list(range(attachment_start, total_pages + 1))
  403. logger.info(f"[附件切割] 附件页范围: {attachment_start}-{total_pages}, 共 {len(attachment_pages)} 页")
  404. print(f"\n附件页范围: 第 {attachment_start} 页 到 第 {total_pages} 页")
  405. print(f"共 {len(attachment_pages)} 页")
  406. # 切割附件页
  407. print("\n" + "=" * 60)
  408. print("开始切割附件页")
  409. print("=" * 60)
  410. pdf_path = Path(pdf_path)
  411. output_dir.mkdir(parents=True, exist_ok=True)
  412. # 保存所有附件页为一个文件
  413. if table_only:
  414. # 表格附件模式:使用筛选后的页面范围
  415. page_range_str = f"{min(attachment_pages)}_{max(attachment_pages)}" if attachment_pages else "none"
  416. output_file = output_dir / f"{pdf_path.stem}_表格附件页_{page_range_str}.pdf"
  417. else:
  418. output_file = output_dir / f"{pdf_path.stem}_附件页_{attachment_start}-{total_pages}.pdf"
  419. logger.info(f"[附件切割] 输出文件: {output_file}")
  420. extract_pages(pdf_path, attachment_pages, output_file)
  421. logger.info(f"[附件切割] 切割完成: {len(attachment_pages)} 页附件已保存")
  422. print(f"\n✓ 切割完成!")
  423. print(f"附件页数: {len(attachment_pages)} 页")
  424. print(f"输出文件: {output_file}")
  425. # 如果启用去水印,对切割后的附件页PDF进行去水印处理
  426. if remove_watermark:
  427. logger.info(f"[附件切割] 开始对附件页PDF进行去水印处理...")
  428. print("\n" + "=" * 60)
  429. print("开始去水印处理")
  430. print("=" * 60)
  431. try:
  432. # 导入去水印模块
  433. import sys
  434. from pathlib import Path as PathLib
  435. sys.path.insert(0, str(PathLib(__file__).parent))
  436. from utils.pdf_watermark_remover import remove_watermark_from_pdf
  437. # 去水印后的PDF路径
  438. nowm_output_file = output_dir / f"{output_file.stem}_nowm.pdf"
  439. logger.info(f"[附件切割] 去水印参数: 亮度阈值={watermark_light_threshold}, 饱和度阈值={watermark_saturation_threshold}, DPI={watermark_dpi}")
  440. print(f"去水印参数:")
  441. print(f" - 亮度阈值: {watermark_light_threshold}")
  442. print(f" - 饱和度阈值: {watermark_saturation_threshold}")
  443. print(f" - DPI: {watermark_dpi}")
  444. # 执行去水印
  445. success = remove_watermark_from_pdf(
  446. input_pdf=str(output_file),
  447. output_pdf=str(nowm_output_file),
  448. light_threshold=watermark_light_threshold,
  449. saturation_threshold=watermark_saturation_threshold,
  450. dpi=watermark_dpi
  451. )
  452. if success and nowm_output_file.exists():
  453. logger.info(f"[附件切割] 去水印完成: {nowm_output_file}")
  454. print(f"\n✓ 去水印完成!")
  455. print(f"去水印后的文件: {nowm_output_file}")
  456. else:
  457. logger.warning(f"[附件切割] 去水印失败")
  458. print(f"\n⚠ 去水印失败,请检查日志")
  459. except ImportError as e:
  460. logger.error(f"[附件切割] 导入去水印模块失败: {e}")
  461. print(f"\n⚠ 去水印模块导入失败: {e}")
  462. print("请确保 utils/pdf_watermark_remover.py 文件存在")
  463. except Exception as e:
  464. logger.exception(f"[附件切割] 去水印处理失败: {e}")
  465. print(f"\n⚠ 去水印处理失败: {e}")
  466. print(f"\n输出目录: {output_dir.absolute()}")
  467. if __name__ == '__main__':
  468. logger.info("[附件切割] " + "=" * 50)
  469. logger.info("[附件切割] PDF 附件页识别和切割工具启动")
  470. logger.info("[附件切割] " + "=" * 50)
  471. print("=" * 60)
  472. print("PDF 附件页识别和切割工具")
  473. print("=" * 60)
  474. # 显示配置信息
  475. print("\n配置信息:")
  476. print(f" - PDF文件: {PDF_PATH}")
  477. print(f" - 输出目录: {OUTPUT_DIR}")
  478. print(f" - OCR: {'启用' if USE_OCR else '禁用'}")
  479. print(f" - 调试模式: {'启用' if DEBUG_MODE else '禁用'}")
  480. print(f" - 只保留表格附件: {'启用' if TABLE_ONLY else '禁用'}")
  481. print(f" - 去水印: {'启用' if REMOVE_WATERMARK else '禁用'}")
  482. if REMOVE_WATERMARK:
  483. print(f" * 亮度阈值: {WATERMARK_LIGHT_THRESHOLD}")
  484. print(f" * 饱和度阈值: {WATERMARK_SATURATION_THRESHOLD}")
  485. print(f" * DPI: {WATERMARK_DPI}")
  486. # 检查依赖
  487. if not TESSERACT_AVAILABLE and USE_OCR:
  488. logger.warning("[附件切割] OCR 功能不可用")
  489. print("\n⚠ OCR 功能不可用")
  490. print("安装方法:")
  491. print(" pip install pytesseract")
  492. print(" sudo apt-get install tesseract-ocr tesseract-ocr-chi-sim")
  493. print("\n将继续使用文本层检测(可能无法识别扫描版)\n")
  494. if not PYPDF2_AVAILABLE:
  495. logger.error("[附件切割] PDF 切割功能不可用")
  496. print("\n⚠ PDF 切割功能不可用")
  497. print("安装方法:")
  498. print(" pip install PyPDF2\n")
  499. if REMOVE_WATERMARK:
  500. print("\n⚠ 去水印功能需要以下依赖:")
  501. print(" - OpenCV (cv2)")
  502. print(" - Pillow (PIL)")
  503. print(" - pdf2image")
  504. print(" - PyPDF2")
  505. print("安装命令:")
  506. print(" pip install opencv-python pillow pdf2image PyPDF2\n")
  507. # 执行切割
  508. logger.info(f"[附件切割] 配置: PDF={PDF_PATH}, 输出={OUTPUT_DIR}, OCR={USE_OCR}, DEBUG={DEBUG_MODE}, 表格附件={TABLE_ONLY}, 去水印={REMOVE_WATERMARK}")
  509. split_attachment_pages(
  510. PDF_PATH,
  511. OUTPUT_DIR,
  512. use_ocr=USE_OCR,
  513. debug=DEBUG_MODE,
  514. remove_watermark=REMOVE_WATERMARK,
  515. watermark_light_threshold=WATERMARK_LIGHT_THRESHOLD,
  516. watermark_saturation_threshold=WATERMARK_SATURATION_THRESHOLD,
  517. watermark_dpi=WATERMARK_DPI,
  518. table_only=TABLE_ONLY
  519. )
  520. logger.info("[附件切割] 程序执行完成")