| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624 |
- #!/usr/bin/env python3
- # -*- coding: utf-8 -*-
- """
- PDF 附件页识别和切割工具
- 支持 OCR 识别扫描版 PDF
- """
- import pdfplumber
- from pathlib import Path
- from PIL import Image
- import io
- import re
- from utils.logging_config import get_logger
# Module-wide logger used by every function in this file.
logger = get_logger("pdf_converter_v2.attachment_splitter")

# Tesseract OCR is optional: record whether pytesseract can be imported so
# the OCR code paths can be skipped when it is missing.
try:
    import pytesseract
except ImportError:
    TESSERACT_AVAILABLE = False
    logger.error("[附件切割] Tesseract OCR 不可用,请安装: apt install tesseract-ocr tesseract-ocr-chi-sim && pip install pytesseract")
else:
    TESSERACT_AVAILABLE = True
    logger.info("[附件切割] Tesseract OCR 可用")

# PyPDF2 is optional as well: it is only needed to write the split PDF.
try:
    import PyPDF2
except ImportError:
    PYPDF2_AVAILABLE = False
    logger.error("[附件切割] PyPDF2 未安装,无法切割 PDF")
    logger.info("[附件切割] 安装命令: pip install PyPDF2")
else:
    PYPDF2_AVAILABLE = True
    logger.info("[附件切割] PyPDF2 可用")
# ---------------------------------------------------------------------------
# Script configuration (used by the ``__main__`` entry point)
# ---------------------------------------------------------------------------
# Source PDF to scan (split over several literals only for line length).
PDF_PATH = (
    '/home/hws/workspace/GitLab/Clerk2.5/pdf_converter_v2/2-数据源/'
    '1-(可研评审)晋电经研规划〔2017〕187号(盖章)'
    '国网山西经研院关于山西晋城周村220kV输变电工程可行性研究报告的评审意见.pdf'
)
# Directory that receives the extracted attachment PDF.
OUTPUT_DIR = Path('附件页')
# Whether to fall back to OCR for pages without a text layer.
USE_OCR = True
# Tesseract language specification passed to pytesseract.
OCR_LANG = 'chi_sim+eng'
# Debug mode prints the text found on every scanned page.
DEBUG_MODE = False

# Watermark-removal settings.
# Whether to run watermark removal on the split attachment PDF.
REMOVE_WATERMARK = False
# Brightness threshold (0-255): lighter pixels are watermark candidates.
WATERMARK_LIGHT_THRESHOLD = 200
# Saturation threshold (0-255): less saturated pixels are watermark candidates.
WATERMARK_SATURATION_THRESHOLD = 30
# DPI used when rasterising the PDF for watermark removal.
WATERMARK_DPI = 200

# Table-attachment filtering.
# Keep only attachment pages that contain tables (drops diagrams, review
# opinions and similar pages).
TABLE_ONLY = True

# Markers that identify the page announcing the attachments.
ATTACHMENT_START_KEYWORDS = [
    '附件:',
    '附件:',
    '附 件:',
    '附 件:',
]

# Keywords that mark an attachment page as a table attachment (kept when
# filtering for tables).
TABLE_ATTACHMENT_KEYWORDS = [
    '项目表',
    '投资估算',
    '工程投资',
    '建设规模',
    '技术方案',
    '变电工程',
    '线路工程',
    '静态投资',
    '动态投资',
    '单位造价',
    '设备购置费',
    '安装工程费',
    '建筑工程费',
    '其他费用',
    '基本预备费',
]

# Keywords that mark an attachment page as a non-table attachment (skipped
# when filtering for tables).
NON_TABLE_ATTACHMENT_KEYWORDS = [
    '示意图',
    '接入系统示意图',
    '母线间隔排列图',
    '评审意见',
    '技术监督意见',
    '参会单位',
    '人员一览表',
    '经济性评价',
    '财务合规',
    '审核结果',
    '预算编制衔接',
]
def ocr_page_image(image) -> str:
    """Recognise the text on a page image with Tesseract.

    Args:
        image: PIL Image object to run OCR on.

    Returns:
        str: The recognised text. An empty string is returned when
        Tesseract is unavailable or when recognition raises.
    """
    if TESSERACT_AVAILABLE:
        try:
            recognized = pytesseract.image_to_string(image, lang=OCR_LANG)
            logger.debug(f"[附件切割] Tesseract OCR识别成功,文本长度: {len(recognized)}")
            return recognized
        except Exception as e:
            # Best effort: an OCR failure is logged and treated as "no text".
            logger.error(f"[附件切割] Tesseract OCR识别失败: {e}")
            return ""

    logger.warning("[附件切割] Tesseract OCR 不可用,跳过识别")
    return ""
def extract_page_text(page, use_ocr: bool = False) -> str:
    """Extract the text of a page, falling back to OCR when requested.

    The embedded text layer is tried first. Only when it is missing or
    blank, and ``use_ocr`` is set, is the page rendered to an image and
    passed to ``ocr_page_image``.

    Args:
        page: pdfplumber page object.
        use_ocr: Whether OCR may be used when the page has no text layer.

    Returns:
        str: The page text, or an empty string when nothing could be
        extracted (no text layer and OCR disabled, unavailable or failed).
    """
    # Prefer the embedded text layer.
    text = page.extract_text()
    if text and text.strip():
        logger.debug(f"[附件切割] 第{page.page_number}页: 成功提取文本层,长度: {len(text)}")
        return text

    # OCR fallback. ocr_page_image is backed by Tesseract only, so that is
    # the only availability flag consulted here. The previous check also
    # referenced PADDLEOCR_AVAILABLE, which is never defined in this module
    # and raised NameError whenever Tesseract was missing.
    if use_ocr and TESSERACT_AVAILABLE:
        logger.info(f"[附件切割] 第{page.page_number}页: 文本层为空,使用OCR识别")
        try:
            # A modest resolution keeps rendering and OCR fast.
            rendered = page.to_image(resolution=150)
            return ocr_page_image(rendered.original)
        except Exception as e:
            # Best effort: a rendering/OCR failure yields "no text".
            logger.error(f"[附件切割] 第{page.page_number}页: OCR识别失败: {e}")
            return ""

    logger.warning(f"[附件切割] 第{page.page_number}页: 无法提取文本(OCR未启用或不可用)")
    return ""
def is_table_attachment_page(text: str, page) -> bool:
    """Decide whether an attachment page should count as a table page.

    Checks run in this order:
      1. Any non-table keyword in the text rejects the page.
      2. Any table keyword in the text accepts the page.
      3. Otherwise the page is accepted if pdfplumber finds a table with at
         least three non-empty rows, one of which has at least three
         non-empty cells.

    Keywords and text are compared with ASCII and full-width spaces removed.

    Args:
        text: Page text.
        page: pdfplumber page object, or None to skip table detection.

    Returns:
        bool: True when the page is considered a table attachment page.
    """
    if not text:
        return False

    def _squash(value: str) -> str:
        # Drop ASCII and full-width spaces before matching.
        return value.replace(' ', '').replace('\u3000', '')

    compact_text = _squash(text)

    # Non-table markers (diagrams, review opinions, ...) veto the page.
    rejected_by = next(
        (kw for kw in NON_TABLE_ATTACHMENT_KEYWORDS if _squash(kw) in compact_text),
        None,
    )
    if rejected_by is not None:
        logger.debug(f"[附件切割] 检测到非表格附件关键词: {rejected_by}")
        return False

    # A table keyword is sufficient on its own.
    accepted_by = next(
        (kw for kw in TABLE_ATTACHMENT_KEYWORDS if _squash(kw) in compact_text),
        None,
    )
    if accepted_by is not None:
        logger.debug(f"[附件切割] 检测到表格关键词: {accepted_by}")
        return True

    if page is None:
        return False

    # Fall back to pdfplumber's structural table detection.
    try:
        for table in page.extract_tables() or ():
            if not table or len(table) < 3:
                continue
            filled_rows = [row for row in table if row and any(row)]
            if len(filled_rows) < 3:
                continue
            widest = max(sum(1 for cell in row if cell) for row in filled_rows)
            if widest >= 3:
                logger.debug(f"[附件切割] 检测到表格: {len(filled_rows)}行")
                return True
    except Exception as e:
        # Table detection is best effort; a failure means "no table found".
        logger.warning(f"[附件切割] 表格检测失败: {e}")

    return False
def is_attachment_start_page(text: str) -> bool:
    """Tell whether a page carries an attachment marker.

    A page matches when, after removing ASCII and full-width spaces, it
    contains one of ``ATTACHMENT_START_KEYWORDS`` or the word "附件"
    followed by a digit or a Chinese numeral.

    Args:
        text: Page text.

    Returns:
        bool: True when an attachment marker is present.
    """
    if not text:
        return False

    # Spaces are stripped first because OCR output often inserts them
    # between characters.
    compact_text = text.replace(' ', '').replace('\u3000', '')

    keyword_hit = next(
        (
            kw for kw in ATTACHMENT_START_KEYWORDS
            if kw.replace(' ', '').replace('\u3000', '') in compact_text
        ),
        None,
    )
    if keyword_hit is not None:
        logger.debug(f"[附件切割] 检测到附件关键词: {keyword_hit}")
        return True

    # Also accept "附件" followed by a number, e.g. "附件1".
    numbered = re.search(r'附件\s*[0-91234567890一二三四五六七八九十]', compact_text)
    if numbered:
        logger.debug("[附件切割] 检测到附件+数字模式")
        return True

    return False
def _matched_attachment_keywords(text_no_space: str) -> list:
    """Return the labels of all attachment markers found in space-stripped text."""
    matched = [
        kw for kw in ATTACHMENT_START_KEYWORDS
        if kw.replace(' ', '').replace('\u3000', '') in text_no_space
    ]
    if re.search(r'附件\s*[0-91234567890一二三四五六七八九十]', text_no_space):
        matched.append("附件+数字")
    return matched


def find_attachment_start_page(pdf_path: str, use_ocr: bool = False, debug: bool = False) -> int:
    """Locate the first attachment page of a PDF.

    Pages are scanned in order. The first page recognised by
    ``is_attachment_start_page`` decides the result: if "附件1"/"附件一"
    appears within its first 50 space-stripped characters, that page itself
    is the start; otherwise the attachments are taken to start on the page
    after it.

    Args:
        pdf_path: Path of the PDF file.
        use_ocr: Whether to OCR pages that have no text layer.
        debug: Whether to print per-page diagnostics.

    Returns:
        int: 1-based page number where the attachments start, or 0 when the
        file does not exist, no marker page is found, or the marker page is
        the last page so that no attachment page follows it.
    """
    pdf_path = Path(pdf_path)

    if not pdf_path.exists():
        logger.error(f"[附件切割] PDF 文件不存在: {pdf_path}")
        print(f"⚠ PDF 文件不存在: {pdf_path}")
        return 0

    logger.info(f"[附件切割] 开始扫描PDF: {pdf_path.name}")
    print(f"正在扫描 PDF: {pdf_path.name}")
    print("=" * 60)

    with pdfplumber.open(pdf_path) as pdf:
        total_pages = len(pdf.pages)
        logger.info(f"[附件切割] PDF总页数: {total_pages}")
        print(f"总页数: {total_pages}")

        # Tesseract is the only OCR backend this module implements. The
        # previous check also read PADDLEOCR_AVAILABLE, which is never
        # defined and raised NameError when Tesseract was missing.
        if use_ocr and not TESSERACT_AVAILABLE:
            logger.warning("[附件切割] OCR 不可用(Tesseract和PaddleOCR都不可用),仅检查文本层")
            print("⚠ OCR 不可用,仅检查文本层")
            use_ocr = False
        elif use_ocr:
            logger.info("[附件切割] 使用 Tesseract 进行OCR识别")

        logger.info(f"[附件切割] OCR: {'启用' if use_ocr else '禁用'}, 调试模式: {'启用' if debug else '禁用'}")
        print(f"OCR: {'启用' if use_ocr else '禁用'}")
        print(f"调试模式: {'启用' if debug else '禁用'}")
        print("=" * 60)

        for page_num, page in enumerate(pdf.pages, start=1):
            if debug:
                print(f"\n[DEBUG] 页面 {page_num}/{total_pages}")
            else:
                print(f"\r扫描进度: {page_num}/{total_pages}", end='', flush=True)

            text = extract_page_text(page, use_ocr=use_ocr)
            # Matching is done on text with ASCII and full-width spaces removed.
            text_no_space = (text or "").replace(' ', '').replace('\u3000', '')

            if debug:
                # Show the text length, a preview and the matched markers.
                text_preview = text[:200].replace('\n', ' ') if text else "[无文本]"
                print(f" 文本长度: {len(text)} 字符")
                print(f" 文本预览: {text_preview}...")

                text_no_space_preview = text_no_space[:100] if text_no_space else "[无文本]"
                print(f" 去空格后: {text_no_space_preview}...")

                debug_matches = _matched_attachment_keywords(text_no_space)
                if debug_matches:
                    print(f" ✓ 匹配关键词: {', '.join(debug_matches)}")
                else:
                    print(f" ✗ 未匹配任何关键词")

            if not is_attachment_start_page(text):
                continue

            if re.search(r'附件\s*[11一]', text_no_space[:50]):
                # The page opens with "附件1": it is itself the first
                # attachment page.
                attachment_start = page_num
                logger.info(f"[附件切割] 发现附件页(直接开始): 第 {page_num} 页")
                print(f"\n\n✓ 发现附件页(直接开始): 第 {page_num} 页")
                print(f"✓ 附件开始页: 第 {attachment_start} 页")
            else:
                # The page only announces the attachments; they start on
                # the following page.
                attachment_start = page_num + 1
                if attachment_start > total_pages:
                    # The marker page is the last page, so there is nothing
                    # after it to split; report "not found" instead of a
                    # page number that does not exist.
                    logger.warning(f"[附件切割] 附件清单页是最后一页(第 {page_num} 页),其后没有附件页")
                    print(f"\n\n⚠ 附件清单页是最后一页(第 {page_num} 页),其后没有附件页")
                    return 0
                logger.info(f"[附件切割] 发现附件清单页: 第 {page_num} 页,附件开始页: 第 {attachment_start} 页")
                print(f"\n\n✓ 发现附件清单页: 第 {page_num} 页")
                print(f"✓ 附件开始页: 第 {attachment_start} 页")

            # Report which markers matched.
            matched_keywords = _matched_attachment_keywords(text_no_space)
            logger.info(f"[附件切割] 匹配关键词: {', '.join(matched_keywords)}")
            print(f" 匹配关键词: {', '.join(matched_keywords)}")

            # Show part of the page text.
            preview = text[:300].replace('\n', ' ')
            logger.debug(f"[附件切割] 文本预览: {preview}...")
            print(f" 文本预览: {preview}...")

            return attachment_start

    logger.warning("[附件切割] 未找到附件清单页")
    print(f"\n\n未找到附件清单页")
    return 0
def extract_pages(pdf_path: str, page_numbers: list, output_path: str):
    """Copy selected pages of a PDF into a new PDF file.

    Page numbers outside the source document are logged and skipped. The
    output directory is created when missing. Nothing is written when
    PyPDF2 is not installed.

    Args:
        pdf_path: Path of the source PDF.
        page_numbers: 1-based page numbers to extract, in output order.
        output_path: Path of the PDF file to write.
    """
    if not PYPDF2_AVAILABLE:
        logger.error("[附件切割] PyPDF2 未安装,无法切割 PDF")
        print("⚠ PyPDF2 未安装,无法切割 PDF")
        return

    source = Path(pdf_path)
    target = Path(output_path)

    logger.info(f"[附件切割] 开始提取页面: {page_numbers} 从 {source.name}")

    # Make sure the destination directory exists.
    target.parent.mkdir(parents=True, exist_ok=True)

    # The source stays open until the writer has been flushed to disk.
    with open(source, 'rb') as source_handle:
        reader = PyPDF2.PdfReader(source_handle)
        writer = PyPDF2.PdfWriter()

        page_count = len(reader.pages)
        logger.info(f"[附件切割] 源PDF总页数: {page_count}")

        kept = 0
        for number in page_numbers:
            if not (1 <= number <= page_count):
                logger.warning(f"[附件切割] 页码 {number} 超出范围 (1-{page_count}),跳过")
                continue
            # reader.pages is 0-based while page_numbers are 1-based.
            writer.add_page(reader.pages[number - 1])
            kept += 1

        logger.info(f"[附件切割] 成功提取 {kept}/{len(page_numbers)} 页")

        with open(target, 'wb') as target_handle:
            writer.write(target_handle)

        logger.info(f"[附件切割] 已保存到: {target}")
        print(f"✓ 已保存到: {target}")
def _collect_table_attachment_pages(pdf, first_page: int, last_page: int, use_ocr: bool, debug: bool) -> list:
    """Return the pages in [first_page, last_page] classified as table pages."""
    selected = []
    # Current streak of consecutive table pages; tracked only so that the
    # start and end of each table section can be logged.
    streak = []

    for page_num in range(first_page, last_page + 1):
        page = pdf.pages[page_num - 1]
        text = extract_page_text(page, use_ocr=use_ocr)
        is_table_page = is_table_attachment_page(text, page)

        if debug:
            print(f" 页面 {page_num}: {'表格页' if is_table_page else '非表格页'}")

        if is_table_page:
            if not streak:
                logger.debug(f"[附件切割] 开始表格区段: 第 {page_num} 页")
            streak.append(page_num)
        elif streak:
            selected.extend(streak)
            logger.info(f"[附件切割] 表格区段结束: {streak[0]}-{streak[-1]}")
            streak = []

    # Flush the section that runs up to the last page.
    if streak:
        selected.extend(streak)
        logger.info(f"[附件切割] 最后表格区段: {streak[0]}-{streak[-1]}")

    return selected


def _remove_watermark_from_split(output_file: Path, output_dir: Path, light_threshold: int,
                                 saturation_threshold: int, dpi: int) -> None:
    """Run the project's watermark remover on the split PDF and report the outcome."""
    logger.info(f"[附件切割] 开始对附件页PDF进行去水印处理...")
    print("\n" + "=" * 60)
    print("开始去水印处理")
    print("=" * 60)

    try:
        # Make the directory of this file importable before loading the
        # watermark module.
        import sys
        sys.path.insert(0, str(Path(__file__).parent))

        from utils.pdf_watermark_remover import remove_watermark_from_pdf

        # The cleaned PDF is written next to the split PDF.
        nowm_output_file = output_dir / f"{output_file.stem}_nowm.pdf"

        logger.info(f"[附件切割] 去水印参数: 亮度阈值={light_threshold}, 饱和度阈值={saturation_threshold}, DPI={dpi}")
        print(f"去水印参数:")
        print(f" - 亮度阈值: {light_threshold}")
        print(f" - 饱和度阈值: {saturation_threshold}")
        print(f" - DPI: {dpi}")

        success = remove_watermark_from_pdf(
            input_pdf=str(output_file),
            output_pdf=str(nowm_output_file),
            light_threshold=light_threshold,
            saturation_threshold=saturation_threshold,
            dpi=dpi
        )

        if success and nowm_output_file.exists():
            logger.info(f"[附件切割] 去水印完成: {nowm_output_file}")
            print(f"\n✓ 去水印完成!")
            print(f"去水印后的文件: {nowm_output_file}")
        else:
            logger.warning(f"[附件切割] 去水印失败")
            print(f"\n⚠ 去水印失败,请检查日志")
    except ImportError as e:
        logger.error(f"[附件切割] 导入去水印模块失败: {e}")
        print(f"\n⚠ 去水印模块导入失败: {e}")
        print("请确保 utils/pdf_watermark_remover.py 文件存在")
    except Exception as e:
        # Watermark removal is an optional post-processing step; a failure
        # is reported but does not undo the split.
        logger.exception(f"[附件切割] 去水印处理失败: {e}")
        print(f"\n⚠ 去水印处理失败: {e}")


def split_attachment_pages(pdf_path: str, output_dir: Path, use_ocr: bool = False, debug: bool = False,
                           remove_watermark: bool = False, watermark_light_threshold: int = 200,
                           watermark_saturation_threshold: int = 30, watermark_dpi: int = 200,
                           table_only: bool = False):
    """Find the attachment pages of a PDF and save them as a new PDF.

    The start page comes from ``find_attachment_start_page``. Every page
    from there to the end of the document is extracted, or only the pages
    classified as table pages when ``table_only`` is set. The result is
    written into ``output_dir`` and optionally passed through the
    watermark remover.

    Args:
        pdf_path: Path of the PDF file.
        output_dir: Output directory (a ``Path``; a ``str`` is accepted too).
        use_ocr: Whether to OCR pages that have no text layer.
        debug: Whether to print per-page diagnostics.
        remove_watermark: Whether to remove watermarks from the split PDF.
        watermark_light_threshold: Watermark brightness threshold (0-255).
        watermark_saturation_threshold: Watermark saturation threshold (0-255).
        watermark_dpi: DPI used when rasterising the PDF for watermark removal.
        table_only: Keep only attachment pages that contain tables
            (drops diagrams, review opinions and similar pages).
    """
    logger.info(f"[附件切割] 开始处理PDF: {pdf_path}")
    logger.info(f"[附件切割] 只保留表格附件: {'是' if table_only else '否'}")

    attachment_start = find_attachment_start_page(pdf_path, use_ocr=use_ocr, debug=debug)

    if attachment_start == 0:
        logger.warning(f"[附件切割] 未找到附件页: {pdf_path}")
        print("\n未找到附件页")
        return

    # Accept plain strings for both paths; output_dir previously had to be
    # a Path or the later .mkdir() call raised AttributeError.
    pdf_path = Path(pdf_path)
    output_dir = Path(output_dir)

    with pdfplumber.open(pdf_path) as pdf:
        total_pages = len(pdf.pages)

        if table_only:
            logger.info(f"[附件切割] 启用表格附件过滤,开始筛选...")
            print(f"\n启用表格附件过滤,开始筛选...")
            attachment_pages = _collect_table_attachment_pages(
                pdf, attachment_start, total_pages, use_ocr, debug
            )
        else:
            # Everything from the start page to the end of the document.
            attachment_pages = list(range(attachment_start, total_pages + 1))

    if not attachment_pages:
        if table_only:
            logger.warning(f"[附件切割] 未找到包含表格的附件页")
            print("\n未找到包含表格的附件页")
        else:
            # The start page lies beyond the last page; do not write an
            # empty PDF.
            logger.warning(f"[附件切割] 附件开始页 {attachment_start} 超出总页数 {total_pages},没有可切割的附件页")
            print("\n未找到附件页")
        return

    if table_only:
        logger.info(f"[附件切割] 筛选后的表格附件页: {attachment_pages}")
        print(f"\n筛选后的表格附件页: {attachment_pages}")
        print(f"共 {len(attachment_pages)} 页")
    else:
        logger.info(f"[附件切割] 附件页范围: {attachment_start}-{total_pages}, 共 {len(attachment_pages)} 页")
        print(f"\n附件页范围: 第 {attachment_start} 页 到 第 {total_pages} 页")
        print(f"共 {len(attachment_pages)} 页")

    print("\n" + "=" * 60)
    print("开始切割附件页")
    print("=" * 60)

    output_dir.mkdir(parents=True, exist_ok=True)

    # All selected pages go into a single output file whose name records
    # the page range.
    if table_only:
        page_range_str = f"{min(attachment_pages)}_{max(attachment_pages)}"
        output_file = output_dir / f"{pdf_path.stem}_表格附件页_{page_range_str}.pdf"
    else:
        output_file = output_dir / f"{pdf_path.stem}_附件页_{attachment_start}-{total_pages}.pdf"

    logger.info(f"[附件切割] 输出文件: {output_file}")
    extract_pages(pdf_path, attachment_pages, output_file)

    logger.info(f"[附件切割] 切割完成: {len(attachment_pages)} 页附件已保存")
    print(f"\n✓ 切割完成!")
    print(f"附件页数: {len(attachment_pages)} 页")
    print(f"输出文件: {output_file}")

    if remove_watermark:
        _remove_watermark_from_split(
            output_file,
            output_dir,
            watermark_light_threshold,
            watermark_saturation_threshold,
            watermark_dpi,
        )

    print(f"\n输出目录: {output_dir.absolute()}")
if __name__ == '__main__':
    # Script entry point: report the configuration and dependency status,
    # then run the split with the module-level settings.
    log_rule = "[附件切割] " + "=" * 50
    logger.info(log_rule)
    logger.info("[附件切割] PDF 附件页识别和切割工具启动")
    logger.info(log_rule)

    banner = "=" * 60
    for line in (banner, "PDF 附件页识别和切割工具", banner):
        print(line)

    # Echo the effective configuration.
    switch = {True: '启用', False: '禁用'}
    config_lines = [
        "\n配置信息:",
        f" - PDF文件: {PDF_PATH}",
        f" - 输出目录: {OUTPUT_DIR}",
        f" - OCR: {switch[bool(USE_OCR)]}",
        f" - 调试模式: {switch[bool(DEBUG_MODE)]}",
        f" - 只保留表格附件: {switch[bool(TABLE_ONLY)]}",
        f" - 去水印: {switch[bool(REMOVE_WATERMARK)]}",
    ]
    if REMOVE_WATERMARK:
        config_lines += [
            f" * 亮度阈值: {WATERMARK_LIGHT_THRESHOLD}",
            f" * 饱和度阈值: {WATERMARK_SATURATION_THRESHOLD}",
            f" * DPI: {WATERMARK_DPI}",
        ]
    for line in config_lines:
        print(line)

    # Dependency checks: warn and print install hints, but keep going.
    if USE_OCR and not TESSERACT_AVAILABLE:
        logger.warning("[附件切割] OCR 功能不可用")
        for line in (
            "\n⚠ OCR 功能不可用",
            "安装方法:",
            " pip install pytesseract",
            " sudo apt-get install tesseract-ocr tesseract-ocr-chi-sim",
            "\n将继续使用文本层检测(可能无法识别扫描版)\n",
        ):
            print(line)

    if not PYPDF2_AVAILABLE:
        logger.error("[附件切割] PDF 切割功能不可用")
        for line in (
            "\n⚠ PDF 切割功能不可用",
            "安装方法:",
            " pip install PyPDF2\n",
        ):
            print(line)

    if REMOVE_WATERMARK:
        for line in (
            "\n⚠ 去水印功能需要以下依赖:",
            " - OpenCV (cv2)",
            " - Pillow (PIL)",
            " - pdf2image",
            " - PyPDF2",
            "安装命令:",
            " pip install opencv-python pillow pdf2image PyPDF2\n",
        ):
            print(line)

    # Run the split.
    logger.info(f"[附件切割] 配置: PDF={PDF_PATH}, 输出={OUTPUT_DIR}, OCR={USE_OCR}, DEBUG={DEBUG_MODE}, 表格附件={TABLE_ONLY}, 去水印={REMOVE_WATERMARK}")
    split_attachment_pages(
        PDF_PATH,
        OUTPUT_DIR,
        use_ocr=USE_OCR,
        debug=DEBUG_MODE,
        remove_watermark=REMOVE_WATERMARK,
        watermark_light_threshold=WATERMARK_LIGHT_THRESHOLD,
        watermark_saturation_threshold=WATERMARK_SATURATION_THRESHOLD,
        watermark_dpi=WATERMARK_DPI,
        table_only=TABLE_ONLY,
    )
    logger.info("[附件切割] 程序执行完成")
|