Răsfoiți Sursa

优化逻辑并适配NPU

何文松 3 săptămâni în urmă
părinte
comite
d6d3fedfcc

+ 5 - 0
mineru/cli/fast_api.py

@@ -16,6 +16,11 @@ from typing import List, Optional
 from loguru import logger
 from base64 import b64encode
 
+# NumPy 1.24+ removed np.complex; librosa (via transformers) still uses it in constantq.py
+import numpy as _np
+if not hasattr(_np, "complex"):
+    _np.complex = _np.complex128  # type: ignore[attr-defined]
+
 from mineru.cli.common import aio_do_parse, read_fn, pdf_suffixes, image_suffixes
 from mineru.utils.cli_parser import arg_parse
 from mineru.utils.guess_suffix_or_lang import guess_suffix_by_path

+ 5 - 2
mineru/model/mfr/unimernet/unimernet_hf/modeling_unimernet.py

@@ -7,8 +7,11 @@ from ftfy import fix_text
 from loguru import logger
 
 from transformers import AutoConfig, AutoModel, AutoModelForCausalLM, AutoTokenizer, PretrainedConfig, PreTrainedModel
-from transformers import VisionEncoderDecoderConfig, VisionEncoderDecoderModel
-from transformers.models.vision_encoder_decoder.modeling_vision_encoder_decoder import logger as base_model_logger
+from transformers.models.vision_encoder_decoder.configuration_vision_encoder_decoder import VisionEncoderDecoderConfig
+from transformers.models.vision_encoder_decoder.modeling_vision_encoder_decoder import (
+    VisionEncoderDecoderModel,
+    logger as base_model_logger,
+)
 
 from .unimer_swin import UnimerSwinConfig, UnimerSwinModel, UnimerSwinImageProcessor
 from .unimer_mbart import UnimerMBartConfig, UnimerMBartForCausalLM

+ 19 - 0
mineru/requirements-paddle-npu.txt

@@ -0,0 +1,19 @@
+# MinerU API 在 Paddle NPU 容器内运行 /file_parse 所需的最小依赖
+# 容器内已安装 paddle/paddlex,此处不重复安装
+# pipeline 依赖链 (transformers→librosa) 中 librosa 使用 np.complex;该别名自 NumPy 1.24 起即已移除(并非 2.0 才移除),因此仍需依赖 mineru/cli/fast_api.py 中的 np.complex 兼容补丁;此处仍限制 numpy<2,但仅靠该限制不能恢复 np.complex
+numpy<2
+
+# Web 与 CLI
+fastapi>=0.100.0
+uvicorn[standard]>=0.23.0
+click>=8.0.0
+python-multipart>=0.0.6
+
+# 日志
+loguru>=0.7.0
+
+# PDF 处理
+pypdfium2>=4.0.0
+
+# transformers 依赖链 (LayoutLMv3 → processing_utils → audio_utils) 需要 soxr
+soxr>=0.3.0

+ 237 - 9
pdf_converter_v2/api/main.py

@@ -150,6 +150,16 @@ class ConversionRequest(BaseModel):
     """转换请求模型(v2 精简版)"""
     # 新增:强制文档类型(正式全称)
     doc_type: Optional[str] = None
+    # 新增:去水印参数
+    remove_watermark: Optional[bool] = False
+    watermark_light_threshold: Optional[int] = 200
+    watermark_saturation_threshold: Optional[int] = 30
+    crop_header_footer: Optional[bool] = False
+    header_ratio: Optional[float] = 0.05
+    footer_ratio: Optional[float] = 0.05
+    auto_detect_header_footer: Optional[bool] = False
+    # 新增:附件页切割参数
+    table_only: Optional[bool] = False  # 是否只保留包含表格的附件页(默认False)
 
 
 class ConversionResponse(BaseModel):
@@ -200,6 +210,13 @@ class OCRRequest(BaseModel):
     """OCR识别请求模型"""
     image_base64: str  # base64编码的图片数据
     image_format: Optional[str] = "png"  # 图片格式:png, jpg, jpeg
+    remove_watermark: Optional[bool] = False  # 是否去除水印
+    watermark_light_threshold: Optional[int] = 200  # 水印亮度阈值(0-255),高于此值的浅色像素可能是水印
+    watermark_saturation_threshold: Optional[int] = 30  # 水印饱和度阈值(0-255),低于此值的低饱和度像素可能是水印
+    crop_header_footer: Optional[bool] = False  # 是否裁剪页眉页脚
+    header_ratio: Optional[float] = 0.05  # 页眉裁剪比例(0-1),默认5%
+    footer_ratio: Optional[float] = 0.05  # 页脚裁剪比例(0-1),默认5%
+    auto_detect_header_footer: Optional[bool] = False  # 是否自动检测页眉页脚边界
 
 
 class OCRResponse(BaseModel):
@@ -308,11 +325,108 @@ async def process_conversion_task(
         
         logger.info(f"[任务 {task_id}] 开始处理: {file_path}")
         
+        # 文件预处理(支持图片和PDF)
+        from pathlib import Path as PathLib
+        file_suffix = PathLib(file_path).suffix.lower()
+        is_image = file_suffix in ['.png', '.jpg', '.jpeg', '.bmp', '.tiff', '.tif']
+        is_pdf = file_suffix == '.pdf'
+        
+        # 图片预处理:去水印或裁剪页眉页脚
+        if is_image and (request.remove_watermark or request.crop_header_footer):
+            logger.info(f"[任务 {task_id}] 检测到图片文件,开始预处理...")
+            preprocessed_path = file_path
+            
+            # 裁剪页眉页脚
+            if request.crop_header_footer:
+                try:
+                    from ..utils.image_preprocessor import crop_header_footer, check_opencv_available
+                    
+                    if check_opencv_available():
+                        if request.auto_detect_header_footer:
+                            logger.info(f"[任务 {task_id}] 开始自动检测并裁剪页眉页脚")
+                        else:
+                            logger.info(f"[任务 {task_id}] 开始裁剪页眉页脚,顶部比例: {request.header_ratio}, 底部比例: {request.footer_ratio}")
+                        
+                        # 裁剪后的图片路径
+                        cropped_path = str(PathLib(output_dir) / f"preprocessed_cropped{file_suffix}")
+                        
+                        preprocessed_path = await asyncio.to_thread(
+                            crop_header_footer,
+                            preprocessed_path,
+                            output_path=cropped_path,
+                            header_ratio=request.header_ratio or 0.05,
+                            footer_ratio=request.footer_ratio or 0.05,
+                            auto_detect=request.auto_detect_header_footer or False
+                        )
+                        logger.info(f"[任务 {task_id}] 裁剪页眉页脚完成: {preprocessed_path}")
+                    else:
+                        logger.warning(f"[任务 {task_id}] OpenCV 未安装,跳过裁剪页眉页脚")
+                except Exception as e:
+                    logger.warning(f"[任务 {task_id}] 裁剪页眉页脚失败,使用原图继续: {e}")
+            
+            # 去水印
+            if request.remove_watermark:
+                try:
+                    from ..utils.image_preprocessor import remove_watermark, check_opencv_available
+                    
+                    if check_opencv_available():
+                        logger.info(f"[任务 {task_id}] 开始去水印处理,亮度阈值: {request.watermark_light_threshold}, 饱和度阈值: {request.watermark_saturation_threshold}")
+                        
+                        # 去水印后的图片路径
+                        nowm_path = str(PathLib(output_dir) / f"preprocessed_nowm{file_suffix}")
+                        
+                        preprocessed_path = await asyncio.to_thread(
+                            remove_watermark,
+                            preprocessed_path,
+                            output_path=nowm_path,
+                            light_threshold=request.watermark_light_threshold or 200,
+                            saturation_threshold=request.watermark_saturation_threshold or 30,
+                            method="hsv"
+                        )
+                        logger.info(f"[任务 {task_id}] 去水印完成: {preprocessed_path}")
+                    else:
+                        logger.warning(f"[任务 {task_id}] OpenCV 未安装,跳过去水印处理")
+                except Exception as e:
+                    logger.warning(f"[任务 {task_id}] 去水印处理失败,使用原图继续: {e}")
+            
+            # 更新文件路径为预处理后的路径
+            if preprocessed_path != file_path:
+                file_path = preprocessed_path
+                logger.info(f"[任务 {task_id}] 图片预处理完成,使用预处理后的文件: {file_path}")
+        
+        # PDF预处理:去水印
+        elif is_pdf and request.remove_watermark:
+            logger.info(f"[任务 {task_id}] 检测到PDF文件,开始去水印预处理...")
+            try:
+                from ..utils.pdf_watermark_remover import remove_watermark_from_pdf
+                
+                # 去水印后的PDF路径
+                nowm_pdf_path = str(PathLib(output_dir) / f"preprocessed_nowm.pdf")
+                
+                # 执行去水印
+                logger.info(f"[任务 {task_id}] 开始PDF去水印处理,亮度阈值: {request.watermark_light_threshold}, 饱和度阈值: {request.watermark_saturation_threshold}")
+                success = await asyncio.to_thread(
+                    remove_watermark_from_pdf,
+                    input_pdf=file_path,
+                    output_pdf=nowm_pdf_path,
+                    light_threshold=request.watermark_light_threshold or 200,
+                    saturation_threshold=request.watermark_saturation_threshold or 30,
+                    dpi=200  # PDF转图片的DPI
+                )
+                
+                if success and PathLib(nowm_pdf_path).exists():
+                    file_path = nowm_pdf_path
+                    logger.info(f"[任务 {task_id}] PDF去水印完成: {file_path}")
+                else:
+                    logger.warning(f"[任务 {task_id}] PDF去水印失败,使用原PDF继续")
+            except Exception as e:
+                logger.warning(f"[任务 {task_id}] PDF去水印处理失败,使用原PDF继续: {e}")
+        
         result = None
         tables_info = None
         
         # 针对投资估算类型,需要先切割附件页
-        if request.doc_type in ("fsApproval", "fsReview", "pdApproval"):
+        if request.doc_type in ("fsApproval", "fsReview", "pdApproval", "safetyFsApproval"):
             logger.info(f"[任务 {task_id}] 文档类型 {request.doc_type},需要先切割附件页")
             
             # 导入附件页切割函数
@@ -328,18 +442,21 @@ async def process_conversion_task(
                 attachment_dir = PathLib(output_dir) / "attachments"
                 attachment_dir.mkdir(parents=True, exist_ok=True)
                 
-                # 切割附件页
-                logger.info(f"[任务 {task_id}] 开始切割附件页,输出目录: {attachment_dir}")
+                # 切割附件页(根据 table_only 参数决定是否过滤非表格内容)
+                logger.info(f"[任务 {task_id}] 开始切割附件页(table_only={request.table_only}),输出目录: {attachment_dir}")
                 await asyncio.to_thread(
                     split_attachment_pages,
                     file_path,
                     attachment_dir,
                     use_ocr=True,
-                    debug=False
+                    debug=False,
+                    table_only=request.table_only  # 是否只保留包含表格的附件页
                 )
                 
-                # 查找切割后的附件页PDF
-                attachment_pdfs = list(attachment_dir.glob("*_附件页_*.pdf"))
+                # 查找切割后的附件页PDF(优先使用表格附件页,其次使用普通附件页)
+                attachment_pdfs = list(attachment_dir.glob("*_表格附件页_*.pdf"))
+                if not attachment_pdfs:
+                    attachment_pdfs = list(attachment_dir.glob("*_附件页_*.pdf"))
                 logger.info(f"[任务 {task_id}] 附件页目录内容: {list(attachment_dir.iterdir()) if attachment_dir.exists() else '(目录不存在)'}")
                 
                 if attachment_pdfs:
@@ -532,11 +649,44 @@ async def process_conversion_task(
 @app.post("/convert", response_model=ConversionResponse)
 async def convert_file(
     file: Annotated[UploadFile, File(description="上传的PDF或图片文件")],
-    # 新增:类型参数(英文传参) noiseRec | emRec | opStatus | settlementReport | designReview | fsApproval | fsReview | pdApproval | finalAccount
+    # 新增:类型参数(英文传参) noiseRec | emRec | opStatus | settlementReport | designReview | fsApproval | fsReview | pdApproval | finalAccount | safetyFsApproval
     type: Annotated[
-        Optional[Literal["noiseRec", "emRec", "opStatus", "settlementReport", "designReview", "fsApproval", "fsReview", "pdApproval", "finalAccount"]],
-        Form(description="文档类型:noiseRec | emRec | opStatus | settlementReport | designReview | fsApproval | fsReview | pdApproval | finalAccount")
+        Optional[Literal["noiseRec", "emRec", "opStatus", "settlementReport", "designReview", "fsApproval", "fsReview", "pdApproval", "finalAccount", "safetyFsApproval"]],
+        Form(description="文档类型:noiseRec | emRec | opStatus | settlementReport | designReview | fsApproval | fsReview | pdApproval | finalAccount | safetyFsApproval")
     ] = None,
+    # 新增:去水印参数
+    remove_watermark: Annotated[
+        Optional[bool],
+        Form(description="是否去除水印,默认为false")
+    ] = False,
+    watermark_light_threshold: Annotated[
+        Optional[int],
+        Form(description="水印亮度阈值(0-255),默认200,高于此值的浅色像素可能是水印")
+    ] = 200,
+    watermark_saturation_threshold: Annotated[
+        Optional[int],
+        Form(description="水印饱和度阈值(0-255),默认30,低于此值的低饱和度像素可能是水印")
+    ] = 30,
+    crop_header_footer: Annotated[
+        Optional[bool],
+        Form(description="是否裁剪页眉页脚,默认为false")
+    ] = False,
+    header_ratio: Annotated[
+        Optional[float],
+        Form(description="页眉裁剪比例(0-1),默认0.05表示裁剪顶部5%")
+    ] = 0.05,
+    footer_ratio: Annotated[
+        Optional[float],
+        Form(description="页脚裁剪比例(0-1),默认0.05表示裁剪底部5%")
+    ] = 0.05,
+    auto_detect_header_footer: Annotated[
+        Optional[bool],
+        Form(description="是否自动检测页眉页脚边界,默认为false(启用后忽略header_ratio和footer_ratio)")
+    ] = False,
+    table_only: Annotated[
+        Optional[bool],
+        Form(description="是否只保留包含表格的附件页,默认为false")
+    ] = False,
 ):
     """
     转换PDF/图片文件(异步处理)
@@ -557,6 +707,16 @@ async def convert_file(
       * fsApproval - 可研批复投资估算
       * fsReview - 可研评审投资估算
       * pdApproval - 初设批复概算投资
+      * finalAccount - 决算报告
+      * safetyFsApproval - 安评可研批复投资估算
+    - **remove_watermark**: 是否去除水印(对图片和PDF均有效),默认为false
+    - **watermark_light_threshold**: 水印亮度阈值(0-255),默认200
+    - **watermark_saturation_threshold**: 水印饱和度阈值(0-255),默认30
+    - **crop_header_footer**: 是否裁剪页眉页脚(仅对图片有效),默认为false
+    - **header_ratio**: 页眉裁剪比例(0-1),默认0.05
+    - **footer_ratio**: 页脚裁剪比例(0-1),默认0.05
+    - **auto_detect_header_footer**: 是否自动检测页眉页脚边界,默认为false
+    - **table_only**: 是否只保留包含表格的附件页,默认为false
     
     注意:v2 版本内部使用外部API进行转换,v2特有的配置参数(如API URL、backend等)
     通过环境变量或配置文件设置,不通过API参数传入。
@@ -677,6 +837,8 @@ async def convert_file(
         "pdApproval": "pdApproval",
         # 决算报告
         "finalAccount": "finalAccount",
+        # 安评类
+        "safetyFsApproval": "safetyFsApproval",
     }
     doc_type = None
     if type:
@@ -692,6 +854,14 @@ async def convert_file(
     # 创建请求对象(v2 精简)
     request = ConversionRequest(
         doc_type=doc_type,
+        remove_watermark=remove_watermark,
+        watermark_light_threshold=watermark_light_threshold,
+        watermark_saturation_threshold=watermark_saturation_threshold,
+        crop_header_footer=crop_header_footer,
+        header_ratio=header_ratio,
+        footer_ratio=footer_ratio,
+        auto_detect_header_footer=auto_detect_header_footer,
+        table_only=table_only,
     )
     
     # 使用 asyncio.create_task 创建后台任务,确保立即返回
@@ -875,6 +1045,13 @@ async def ocr_image(request: OCRRequest):
     
     - **image_base64**: base64编码的图片数据(可以包含data:image/xxx;base64,前缀)
     - **image_format**: 图片格式(png, jpg, jpeg),默认为png
+    - **remove_watermark**: 是否去除水印,默认为false
+    - **watermark_light_threshold**: 水印亮度阈值(0-255),默认200,高于此值的浅色像素可能是水印
+    - **watermark_saturation_threshold**: 水印饱和度阈值(0-255),默认30,低于此值的低饱和度像素可能是水印
+    - **crop_header_footer**: 是否裁剪页眉页脚,默认为false
+    - **header_ratio**: 页眉裁剪比例(0-1),默认0.05表示裁剪顶部5%
+    - **footer_ratio**: 页脚裁剪比例(0-1),默认0.05表示裁剪底部5%
+    - **auto_detect_header_footer**: 是否自动检测页眉页脚边界,默认为false(启用后忽略header_ratio和footer_ratio)
     
     返回识别出的文本列表和GPU监控信息
     """
@@ -934,6 +1111,57 @@ async def ocr_image(request: OCRRequest):
             f.write(image_bytes)
         logger.info(f"[OCR] 图片已保存: {image_path}")
         
+        # 如果需要裁剪页眉页脚,先进行裁剪
+        if request.crop_header_footer:
+            try:
+                from ..utils.image_preprocessor import crop_header_footer, check_opencv_available
+                
+                if check_opencv_available():
+                    if request.auto_detect_header_footer:
+                        logger.info("[OCR] 开始自动检测并裁剪页眉页脚")
+                    else:
+                        logger.info(f"[OCR] 开始裁剪页眉页脚,顶部比例: {request.header_ratio}, 底部比例: {request.footer_ratio}")
+                    
+                    # 裁剪后的图片路径
+                    cropped_image_path = os.path.join(temp_dir, f"ocr_image_cropped{ext}")
+                    
+                    image_path = crop_header_footer(
+                        image_path,
+                        output_path=cropped_image_path,
+                        header_ratio=request.header_ratio or 0.05,
+                        footer_ratio=request.footer_ratio or 0.05,
+                        auto_detect=request.auto_detect_header_footer or False
+                    )
+                    logger.info(f"[OCR] 裁剪页眉页脚完成: {image_path}")
+                else:
+                    logger.warning("[OCR] OpenCV 未安装,跳过裁剪页眉页脚")
+            except Exception as e:
+                logger.warning(f"[OCR] 裁剪页眉页脚失败,使用原图继续: {e}")
+        
+        # 如果需要去水印,进行预处理
+        if request.remove_watermark:
+            try:
+                from ..utils.image_preprocessor import remove_watermark, check_opencv_available
+                
+                if check_opencv_available():
+                    logger.info(f"[OCR] 开始去水印处理,亮度阈值: {request.watermark_light_threshold}, 饱和度阈值: {request.watermark_saturation_threshold}")
+                    
+                    # 去水印后的图片路径
+                    nowm_image_path = os.path.join(temp_dir, f"ocr_image_nowm{ext}")
+                    
+                    image_path = remove_watermark(
+                        image_path,
+                        output_path=nowm_image_path,
+                        light_threshold=request.watermark_light_threshold or 200,
+                        saturation_threshold=request.watermark_saturation_threshold or 30,
+                        method="hsv"
+                    )
+                    logger.info(f"[OCR] 去水印完成: {image_path}")
+                else:
+                    logger.warning("[OCR] OpenCV 未安装,跳过去水印处理")
+            except Exception as e:
+                logger.warning(f"[OCR] 去水印处理失败,使用原图继续: {e}")
+        
         # 调用PaddleOCR进行识别(监控线程在此期间持续采集数据)
         from ..utils.paddleocr_fallback import call_paddleocr_ocr
         

+ 45 - 1
pdf_converter_v2/models/data_models.py

@@ -283,9 +283,18 @@ class FeasibilityApprovalInvestment:
     - Level 0: 顶层大类(如"山西晋城周村220千伏输变电工程")
     - Level 1: 二级分类(如"变电工程"、"线路工程"),有自己的 items
     - Level 2: 具体项目(如"周村220千伏变电站新建工程")
+    
+    项目信息(可选,用于 safetyFsApproval 类型):
+    - projectName: 工程(项目)名称
+    - projectUnit: 项目单位
+    - designUnit: 设计单位
     """
     def __init__(self):
         self.items: List[InvestmentItem] = []
+        # 项目基本信息(safetyFsApproval 专用)
+        self.projectName: Optional[str] = None
+        self.projectUnit: Optional[str] = None
+        self.designUnit: Optional[str] = None
     
     def to_dict(self):
         """转换为嵌套结构,与 designReview 保持一致
@@ -294,14 +303,38 @@ class FeasibilityApprovalInvestment:
         Level="2" 的项目作为二级分类(Level: 1),有自己的 items
         Level="3" 的项目作为具体项目(Level: 2),放入二级分类的 items
         Level="0" 的项目(合计)跳过
+        
+        特殊处理:如果表格没有 Level=1 的顶层大类(如湖北省格式),
+        自动创建一个虚拟顶层大类来包含所有 Level=2 的项目
         """
         if not self.items:
             return []
         
+        # 检查是否有 Level=1 的顶层大类
+        has_level_1 = any(item.level == "1" for item in self.items)
+        
         result = []
         current_top_category = None  # Level 0 顶层大类
         current_sub_category = None  # Level 1 二级分类
         
+        # 如果没有 Level=1 的顶层大类,创建一个虚拟的
+        if not has_level_1:
+            current_top_category = {
+                "name": "项目总表",
+                "Level": 0,
+                "constructionScaleSubstation": "",
+                "constructionScaleBay": "",
+                "constructionScaleOverheadLine": "",
+                "constructionScaleOpticalCable": "",
+                "staticInvestment": "",
+                "dynamicInvestment": "",
+                "constructionProjectCost": "",
+                "equipmentPurchaseCost": "",
+                "installationProjectCost": "",
+                "otherExpenses": "",
+                "items": []
+            }
+        
         for item in self.items:
             if item.level == "1":
                 # 顶层大类(如"山西晋城周村220千伏输变电工程")
@@ -381,7 +414,18 @@ class FeasibilityApprovalInvestment:
         if current_top_category is not None:
             result.append(current_top_category)
         
-        return result
+        # 如果有项目信息,返回包含项目信息的字典;否则直接返回数据列表
+        if self.projectName or self.projectUnit or self.designUnit:
+            return {
+                "projectInfo": {
+                    "projectName": self.projectName or "",
+                    "projectUnit": self.projectUnit or "",
+                    "designUnit": self.designUnit or ""
+                },
+                "data": result
+            }
+        else:
+            return result
     
     @staticmethod
     def _parse_number(value: str) -> str:

+ 260 - 7
pdf_converter_v2/parser/investment_parser.py

@@ -224,8 +224,11 @@ def parse_feasibility_approval_investment(markdown_content: str) -> FeasibilityA
             table_text += " ".join([str(cell) for cell in row])
         # 移除空格后再匹配
         table_text_no_space = table_text.replace(" ", "")
-        # 选择包含"工程或费用名称"和"静态投资"的表格
-        if "工程或费用名称" in table_text_no_space and "静态投资" in table_text_no_space:
+        # 选择包含"工程或费用名称"或"项目名称",且包含"静态投资"或"静态合计"的表格
+        has_name_col = ("工程或费用名称" in table_text_no_space or "项目名称" in table_text_no_space)
+        has_investment_col = ("静态投资" in table_text_no_space or "静态合计" in table_text_no_space)
+        
+        if has_name_col and has_investment_col:
             all_matching_tables.append((table_idx, table))
             logger.info(f"[可研批复投资] 找到投资估算表格 (表格{table_idx+1}), 行数: {len(table)}")
     
@@ -295,7 +298,7 @@ def parse_feasibility_approval_investment(markdown_content: str) -> FeasibilityA
             
             if "序号" in cell_text and no_idx == -1:
                 no_idx = col_idx
-            elif ("工程或费用名称" in cell_text_no_space) and name_idx == -1:
+            elif ("工程或费用名称" in cell_text_no_space or "项目名称" in cell_text_no_space) and name_idx == -1:
                 name_idx = col_idx
             elif "架空线" in cell_text_no_space and overhead_line_idx == -1:
                 overhead_line_idx = col_idx
@@ -305,9 +308,9 @@ def parse_feasibility_approval_investment(markdown_content: str) -> FeasibilityA
                 substation_idx = col_idx
             elif "光缆" in cell_text and optical_cable_idx == -1:
                 optical_cable_idx = col_idx
-            elif "静态投资" in cell_text_no_space and static_investment_idx == -1:
+            elif ("静态投资" in cell_text_no_space or "静态合计" in cell_text_no_space) and static_investment_idx == -1:
                 static_investment_idx = col_idx
-            elif "动态投资" in cell_text_no_space and dynamic_investment_idx == -1:
+            elif ("动态投资" in cell_text_no_space or "动态合计" in cell_text_no_space) and dynamic_investment_idx == -1:
                 dynamic_investment_idx = col_idx
             # 新增费用字段识别
             elif "建筑工程费" in cell_text_no_space and construction_project_cost_idx == -1:
@@ -322,8 +325,8 @@ def parse_feasibility_approval_investment(markdown_content: str) -> FeasibilityA
                 if "其他费用" in cell_text_no_space:
                     other_expenses_idx = col_idx
         
-        # 如果这一行包含"序号"或"工程或费用名称",记录为表头结束行
-        if ("序号" in row_text or "工程或费用名称" in row_text_no_space) and header_row_idx == -1:
+        # 如果这一行包含"序号"或"工程或费用名称"或"项目名称",记录为表头结束行
+        if ("序号" in row_text or "工程或费用名称" in row_text_no_space or "项目名称" in row_text_no_space) and header_row_idx == -1:
             header_row_idx = row_idx
     
     # 表头结束行应该是最后一个包含表头内容的行
@@ -417,6 +420,253 @@ def parse_feasibility_approval_investment(markdown_content: str) -> FeasibilityA
     return record
 
 
+def parse_safety_feasibility_approval_investment(markdown_content: str) -> FeasibilityApprovalInvestment:
+    """
+    解析安全可研批复投资估算(湖北省格式)
+    
+    特点:
+    - 没有顶层大类(Level=1),直接从二级分类开始
+    - 中文序号(一、二)表示二级分类(如"变电工程"、"线路工程")
+    - 阿拉伯数字(1、2、3)表示具体项目
+    - 列名使用"项目名称"和"静态合计/动态合计"
+    
+    返回结构:
+    - Level 1: 二级分类(如"变电工程"、"线路工程")
+    - Level 2: 具体项目(如"襄阳连云220千伏变电站新建工程")
+    """
+    record = FeasibilityApprovalInvestment()
+    
+    tables = extract_table_with_rowspan_colspan(markdown_content)
+    
+    if not tables:
+        logger.warning("[安全可研批复投资] 未能提取出任何表格内容")
+        return record
+    
+    # 首先尝试提取项目基本信息表格
+    for table_idx, table in enumerate(tables):
+        if len(table) < 2:
+            continue
+        
+        table_text = ""
+        for row in table:
+            table_text += " ".join([str(cell) for cell in row])
+        table_text_no_space = table_text.replace(" ", "").replace("(", "(").replace(")", ")")
+        
+        # 查找包含"工程(项目)名称"的表格
+        if "工程(项目)名称" in table_text_no_space or "工程项目名称" in table_text_no_space:
+            logger.info(f"[安全可研批复投资] 找到项目信息表格 (表格{table_idx+1})")
+            
+            # 提取项目信息
+            for row in table:
+                if len(row) >= 2:
+                    key = str(row[0]).strip()
+                    value = str(row[1]).strip() if len(row) > 1 else ""
+                    
+                    if "工程" in key and "名称" in key:
+                        record.projectName = value
+                        logger.info(f"[安全可研批复投资] 提取工程名称: {value}")
+                    elif "项目单位" in key:
+                        record.projectUnit = value
+                        logger.info(f"[安全可研批复投资] 提取项目单位: {value}")
+                    elif "设计单位" in key:
+                        record.designUnit = value
+                        logger.info(f"[安全可研批复投资] 提取设计单位: {value}")
+            break
+    
+    # 找到所有投资估算表格并合并
+    all_matching_tables = []
+    for table_idx, table in enumerate(tables):
+        table_text = ""
+        for row in table:
+            table_text += " ".join([str(cell) for cell in row])
+        table_text_no_space = table_text.replace(" ", "")
+        
+        # 选择包含"项目名称"且包含"静态合计"或"静态投资"的表格
+        has_name_col = "项目名称" in table_text_no_space
+        has_investment_col = ("静态合计" in table_text_no_space or "静态投资" in table_text_no_space)
+        
+        if has_name_col and has_investment_col:
+            all_matching_tables.append((table_idx, table))
+            logger.info(f"[安全可研批复投资] 找到投资估算表格 (表格{table_idx+1}), 行数: {len(table)}")
+    
+    if not all_matching_tables:
+        logger.warning("[安全可研批复投资] 未找到包含投资估算的表格")
+        return record
+    
+    # 如果只有一个表格,直接使用
+    if len(all_matching_tables) == 1:
+        target_table = all_matching_tables[0][1]
+    else:
+        # 多个表格:合并所有表格的数据行
+        logger.info(f"[安全可研批复投资] 发现 {len(all_matching_tables)} 个投资估算表格,将进行合并")
+        target_table = []
+        first_table = True
+        for table_idx, table in all_matching_tables:
+            if first_table:
+                target_table.extend(table)
+                first_table = False
+            else:
+                # 跳过表头行
+                header_end_idx = 0
+                for row_idx, row in enumerate(table):
+                    row_text = " ".join([str(cell) for cell in row]).replace(" ", "")
+                    if "序号" in row_text or "项目名称" in row_text or "建设规模" in row_text:
+                        header_end_idx = row_idx + 1
+                    elif len(row) > 0:
+                        first_cell = str(row[0]).strip()
+                        if first_cell in ["一", "二", "三", "四", "五", "六", "七", "八", "九", "十"]:
+                            break
+                target_table.extend(table[header_end_idx:])
+                logger.debug(f"[安全可研批复投资] 表格{table_idx+1}: 跳过前{header_end_idx}行表头,添加{len(table)-header_end_idx}行数据")
+        
+        logger.info(f"[安全可研批复投资] 合并后总行数: {len(target_table)}")
+    
+    # 识别表头行和列索引
+    header_row_idx = -1
+    no_idx = -1
+    name_idx = -1
+    overhead_line_idx = -1
+    bay_idx = -1
+    substation_idx = -1
+    optical_cable_idx = -1
+    static_investment_idx = -1
+    dynamic_investment_idx = -1
+    construction_project_cost_idx = -1
+    equipment_purchase_cost_idx = -1
+    installation_project_cost_idx = -1
+    other_expenses_idx = -1
+    
+    # 扫描前几行识别列索引
+    for row_idx in range(min(5, len(target_table))):
+        row = target_table[row_idx]
+        row_text = " ".join([str(cell) for cell in row])
+        row_text_no_space = row_text.replace(" ", "")
+        
+        for col_idx, cell in enumerate(row):
+            cell_text = str(cell).strip()
+            cell_text_no_space = cell_text.replace(" ", "")
+            
+            if "序号" in cell_text and no_idx == -1:
+                no_idx = col_idx
+            elif "项目名称" in cell_text_no_space and name_idx == -1:
+                name_idx = col_idx
+            elif "架空线" in cell_text_no_space and overhead_line_idx == -1:
+                overhead_line_idx = col_idx
+            elif "间隔" in cell_text and bay_idx == -1:
+                bay_idx = col_idx
+            elif "变电" in cell_text and substation_idx == -1:
+                substation_idx = col_idx
+            elif "光缆" in cell_text and optical_cable_idx == -1:
+                optical_cable_idx = col_idx
+            elif ("静态投资" in cell_text_no_space or "静态合计" in cell_text_no_space) and static_investment_idx == -1:
+                static_investment_idx = col_idx
+            elif ("动态投资" in cell_text_no_space or "动态合计" in cell_text_no_space) and dynamic_investment_idx == -1:
+                dynamic_investment_idx = col_idx
+            elif "建筑工程费" in cell_text_no_space and construction_project_cost_idx == -1:
+                construction_project_cost_idx = col_idx
+            elif "设备购置费" in cell_text_no_space and equipment_purchase_cost_idx == -1:
+                equipment_purchase_cost_idx = col_idx
+            elif "安装工程费" in cell_text_no_space and installation_project_cost_idx == -1:
+                installation_project_cost_idx = col_idx
+            elif ("其他费用" in cell_text_no_space or "合计" == cell_text_no_space) and other_expenses_idx == -1:
+                if "其他费用" in cell_text_no_space:
+                    other_expenses_idx = col_idx
+        
+        if ("序号" in row_text or "项目名称" in row_text_no_space) and header_row_idx == -1:
+            header_row_idx = row_idx
+    
+    # 找到第一个数据行
+    for row_idx in range(min(5, len(target_table))):
+        row = target_table[row_idx]
+        if len(row) > 0:
+            first_cell = str(row[0]).strip()
+            if first_cell and first_cell not in ["序号", ""] and (first_cell in ["一", "二", "三", "四", "五"] or first_cell.isdigit()):
+                header_row_idx = row_idx - 1
+                logger.debug(f"[安全可研批复投资] 根据数据行确定表头结束于第{header_row_idx}行")
+                break
+    
+    logger.info(f"[安全可研批复投资] 表头行: {header_row_idx}")
+    logger.info(f"[安全可研批复投资] 列索引: 序号={no_idx}, 名称={name_idx}, "
+               f"架空线={overhead_line_idx}, 间隔={bay_idx}, 变电={substation_idx}, "
+               f"光缆={optical_cable_idx}, 静态投资={static_investment_idx}, 动态投资={dynamic_investment_idx}")
+    
+    if header_row_idx == -1:
+        logger.warning("[安全可研批复投资] 未找到表头行")
+        return record
+    
+    # 解析数据行
+    for row_idx in range(header_row_idx + 1, len(target_table)):
+        row = target_table[row_idx]
+        
+        if len(row) < 3:
+            continue
+        
+        # 检查是否是有效数据行
+        if name_idx >= 0 and name_idx < len(row):
+            name = str(row[name_idx]).strip()
+            if not name or name in ["", "nan", "None"]:
+                continue
+            
+            # 提取序号
+            no = ""
+            if no_idx >= 0 and no_idx < len(row):
+                no = str(row[no_idx]).strip()
+            
+            # 判断等级 - 使用非严格模式,让中文数字直接返回 Level 1
+            level_input = (no + name) if no else name
+            level = determine_level(level_input, name, strict_mode=False)
+            
+            # 对于阿拉伯数字序号,如果当前是 Level 2,且不是具体项目名称,则判定为 Level 2
+            # 这样"1、襄阳连云..."会是 Level 2
+            if level == "2" and no.isdigit():
+                # 阿拉伯数字序号,是具体项目,保持 Level 2
+                pass
+            
+            item = InvestmentItem()
+            item.no = no
+            item.name = name
+            item.level = level
+            
+            # 提取建设规模
+            if overhead_line_idx >= 0 and overhead_line_idx < len(row):
+                item.constructionScaleOverheadLine = str(row[overhead_line_idx]).strip()
+            
+            if bay_idx >= 0 and bay_idx < len(row):
+                item.constructionScaleBay = str(row[bay_idx]).strip()
+            
+            if substation_idx >= 0 and substation_idx < len(row):
+                item.constructionScaleSubstation = str(row[substation_idx]).strip()
+            
+            if optical_cable_idx >= 0 and optical_cable_idx < len(row):
+                item.constructionScaleOpticalCable = str(row[optical_cable_idx]).strip()
+            
+            # 提取投资金额
+            if static_investment_idx >= 0 and static_investment_idx < len(row):
+                item.staticInvestment = str(row[static_investment_idx]).strip()
+            
+            if dynamic_investment_idx >= 0 and dynamic_investment_idx < len(row):
+                item.dynamicInvestment = str(row[dynamic_investment_idx]).strip()
+            
+            # 提取费用
+            if construction_project_cost_idx >= 0 and construction_project_cost_idx < len(row):
+                item.constructionProjectCost = str(row[construction_project_cost_idx]).strip()
+            
+            if equipment_purchase_cost_idx >= 0 and equipment_purchase_cost_idx < len(row):
+                item.equipmentPurchaseCost = str(row[equipment_purchase_cost_idx]).strip()
+            
+            if installation_project_cost_idx >= 0 and installation_project_cost_idx < len(row):
+                item.installationProjectCost = str(row[installation_project_cost_idx]).strip()
+            
+            if other_expenses_idx >= 0 and other_expenses_idx < len(row):
+                item.otherExpenses = str(row[other_expenses_idx]).strip()
+            
+            record.items.append(item)
+            logger.info(f"[安全可研批复投资] 解析到数据: No={item.no}, Name={item.name}, Level={item.level}")
+    
+    logger.info(f"[安全可研批复投资] 共解析到 {len(record.items)} 条数据")
+    return record
+
+
 def parse_feasibility_review_investment(markdown_content: str) -> FeasibilityReviewInvestment:
     """
     解析可研评审投资估算
@@ -821,6 +1071,9 @@ def parse_investment_record(markdown_content: str, investment_type: Optional[str
     result = None
     if investment_type == "fsApproval":
         result = parse_feasibility_approval_investment(markdown_content)
+    elif investment_type == "safetyFsApproval":
+        # safetyFsApproval 使用独立的解析逻辑(湖北省格式)
+        result = parse_safety_feasibility_approval_investment(markdown_content)
     elif investment_type == "fsReview":
         result = parse_feasibility_review_investment(markdown_content)
     elif investment_type == "pdApproval":

+ 23 - 9
pdf_converter_v2/parser/json_converter.py

@@ -329,8 +329,8 @@ def parse_markdown_to_json(markdown_content: str, first_page_image: Optional[Ima
                 op_list = parse_operational_conditions(markdown_content, require_title=False)
             serialized = [oc.to_dict() if hasattr(oc, "to_dict") else oc for oc in (op_list or [])]
             result = {"document_type": forced_document_type, "data": {"operationalConditions": serialized}}
-        elif forced_document_type in ["fsApproval", "fsReview", "pdApproval"]:
-            # 投资估算类型处理
+        elif forced_document_type in ["fsApproval", "fsReview", "pdApproval", "safetyFsApproval"]:
+            # 投资估算类型处理(包括安评类)
             logger.info(f"[JSON转换] 处理投资估算类型: {forced_document_type}")
             logger.debug(f"[JSON转换] Markdown内容长度: {len(markdown_content)} 字符")
             
@@ -338,12 +338,26 @@ def parse_markdown_to_json(markdown_content: str, first_page_image: Optional[Ima
             
             if investment_record:
                 data = investment_record.to_dict()
-                logger.info(f"[JSON转换] 投资估算解析成功,共 {len(data)} 条记录")
                 
-                # 输出前3条记录的摘要
-                if data:
-                    for idx, item in enumerate(data[:3]):
-                        logger.debug(f"[JSON转换] 记录 {idx+1}: No={item.get('No', '')}, Name={item.get('name', '')}, Level={item.get('Level', '')}")
+                # 检查返回的数据格式:可能是列表(旧格式)或字典(包含projectInfo的新格式)
+                if isinstance(data, dict) and "data" in data:
+                    # 新格式:包含 projectInfo 和 data
+                    logger.info(f"[JSON转换] 投资估算解析成功,共 {len(data['data'])} 条记录")
+                    if data.get("projectInfo"):
+                        logger.info(f"[JSON转换] 项目信息: {data['projectInfo'].get('projectName', '')}")
+                    
+                    # 输出前3条记录的摘要
+                    if data["data"]:
+                        for idx, item in enumerate(data["data"][:3]):
+                            logger.debug(f"[JSON转换] 记录 {idx+1}: No={item.get('No', '')}, Name={item.get('name', '')}, Level={item.get('Level', '')}")
+                else:
+                    # 旧格式:直接是数据列表
+                    logger.info(f"[JSON转换] 投资估算解析成功,共 {len(data)} 条记录")
+                    
+                    # 输出前3条记录的摘要
+                    if data:
+                        for idx, item in enumerate(data[:3]):
+                            logger.debug(f"[JSON转换] 记录 {idx+1}: No={item.get('No', '')}, Name={item.get('name', '')}, Level={item.get('Level', '')}")
                 
                 result = {"document_type": forced_document_type, "data": data}
             else:
@@ -432,8 +446,8 @@ def parse_markdown_to_json(markdown_content: str, first_page_image: Optional[Ima
     elif doc_type == "emRec":
         data = parse_electromagnetic_detection_record(markdown_content).to_dict()
         result = {"document_type": doc_type, "data": data}
-    elif doc_type in ["fsApproval", "fsReview", "pdApproval"]:
-        # 新增:投资估算类型
+    elif doc_type in ["fsApproval", "fsReview", "pdApproval", "safetyFsApproval"]:
+        # 新增:投资估算类型(包括安评类)
         logger.info(f"[JSON转换] 检测到投资估算类型: {doc_type}")
         logger.debug(f"[JSON转换] Markdown内容长度: {len(markdown_content)} 字符")
         

+ 20 - 0
pdf_converter_v2/requirements-paddle-npu.txt

@@ -0,0 +1,20 @@
+# PDF Converter v2 - 容器内依赖(Paddle NPU 环境已预装 paddle/paddlex,此处不重复安装)
+
+# 核心依赖
+aiohttp>=3.8.0
+aiofiles>=23.0.0
+Pillow>=9.0.0
+
+# PDF 处理(至少安装一个)
+pypdfium2>=4.0.0
+pdf2image>=1.16.0
+pdfplumber>=0.11.0
+
+# Web 框架
+fastapi>=0.100.0
+uvicorn[standard]>=0.23.0
+pydantic>=2.0.0
+typing-extensions>=4.0.0
+
+# 日志
+loguru>=0.7.0

+ 2 - 2
pdf_converter_v2/test.py

@@ -26,7 +26,7 @@ except ImportError:
     print("  安装命令: pip install PyMuPDF")
 
 # ==================== 配置区域 ====================
-pdf_path = '/home/hws/workspace/GitLab/Clerk2.5/pdf_converter_v2/2-数据源/5-(初设批复)晋电建设〔2019〕566号 国网山西省电力公司关于晋城周村220kV输变电工程初步设计的批复 .pdf'
+pdf_path = '/home/hws/workspace/GitLab/Clerk2.5/pdf_converter_v2/3-数据/鄂电司发展〔2024〕124号 国网湖北省电力有限公司关于襄阳连云220千伏输变电工程可行性研究报告的批复.pdf'
 output_dir = Path('extracted_tables')  # 原始表格输出目录(包含表格前文本)
 merged_output_dir = Path('merged_tables')  # 合并后的表格输出目录(已剔除表格前文本)
 filtered_output_dir = Path('filtered_tables')  # 筛选后的表格输出目录
@@ -89,7 +89,7 @@ TABLE_HEADER_RULES = {
 }
 
 # 是否启用表头过滤(如果为False,则提取所有表格)
-ENABLE_HEADER_FILTER = True
+ENABLE_HEADER_FILTER = False
 
 # 要排除的规则名称列表(如果某个规则匹配了不该匹配的表格,可以在这里排除)
 # 例如: EXCLUDE_RULES = ["物资采购合同2"] 将不会匹配该规则

+ 199 - 21
pdf_converter_v2/test_api.py

@@ -6,6 +6,7 @@ PDF Converter API 测试脚本
 - fsApproval: 可研批复
 - fsReview: 可研评审  
 - pdApproval: 初设批复
+- safetyFsApproval: 安评可研批复
 
 以及现有类型:
 - settlementReport: 结算报告
@@ -27,15 +28,20 @@ API_BASE_URL = "http://47.101.133.94:14213"
 # 测试文件配置
 TEST_DIR = Path(__file__).parent / "test"
 
-# 测试用例:文件名 -> 文档类型
+# 测试用例:文件名 -> (文档类型, 是否去水印, 是否只保留表格附件)
+# 格式: 
+#   "文件名": ("类型", 去水印, 只保留表格) - 完整格式
+#   "文件名": ("类型", 去水印) - 兼容格式,只保留表格默认True
+#   "文件名": "类型" - 旧格式,去水印False,只保留表格True
 TEST_CASES = {
     # 新增投资类型
+    "鄂电司发展〔2024〕124号 国网湖北省电力有限公司关于襄阳连云220千伏输变电工程可行性研究报告的批复.pdf": ("safetyFsApproval", True,False),  # 需要去水印 + 只保留表格附件
     # "2-(可研批复)晋电发展〔2017〕831号+国网山西省电力公司关于临汾古县、晋城周村220kV输变电等工程可行性研究报告的批复.pdf.pdf": "fsApproval",
     # "1-(可研评审)晋电经研规划〔2017〕187号(盖章)国网山西经研院关于山西晋城周村220kV输变电工程可行性研究报告的评审意见.pdf": "fsReview",
     # "5-(初设批复)晋电建设〔2019〕566号 国网山西省电力公司关于晋城周村220kV输变电工程初步设计的批复 .pdf": "pdApproval",
     # 现有类型
     # "9-(结算报告)山西晋城周村220kV输变电工程结算审计报告.pdf": "settlementReport",
-    "4-(初设评审)中电联电力建设技术经济咨询中心技经〔2019〕201号关于山西周村220kV输变电工程初步设计的评审意见.pdf": "designReview",
+    # "4-(初设评审)中电联电力建设技术经济咨询中心技经〔2019〕201号关于山西周村220kV输变电工程初步设计的评审意见.pdf": "designReview",
     # 决算报告
     # "10-(决算报告)盖章页-山西晋城周村220kV输变电工程竣工决算审核报告(中瑞诚鉴字(2021)第002040号).pdf": "finalAccount",
 }
@@ -70,10 +76,21 @@ def check_health() -> bool:
         return False
 
 
-def upload_file(file_path: Path, document_type: str) -> Optional[str]:
-    """上传文件并获取任务 ID"""
+def upload_file(file_path: Path, document_type: str, remove_watermark: bool = False, table_only: bool = True) -> Optional[str]:
+    """上传文件并获取任务 ID
+    
+    Args:
+        file_path: 文件路径
+        document_type: 文档类型
+        remove_watermark: 是否去水印
+        table_only: 是否只保留表格附件
+    """
     print(f"\n  📤 上传文件: {file_path.name}")
     print(f"     类型: {document_type}")
+    if remove_watermark:
+        print(f"     去水印: 是")
+    if table_only:
+        print(f"     只保留表格: 是")
     
     try:
         with open(file_path, "rb") as f:
@@ -81,6 +98,15 @@ def upload_file(file_path: Path, document_type: str) -> Optional[str]:
             # 使用 data 发送表单参数,参数名是 type(不是 document_type)
             data = {"type": document_type}
             
+            # 添加去水印参数
+            if remove_watermark:
+                data["remove_watermark"] = "true"
+                data["watermark_light_threshold"] = "200"
+                data["watermark_saturation_threshold"] = "30"
+            
+            # 添加只保留表格参数
+            data["table_only"] = "true" if table_only else "false"
+            
             response = requests.post(
                 f"{API_BASE_URL}/convert",
                 files=files,
@@ -173,7 +199,21 @@ def validate_result(result: Dict[str, Any], expected_type: str) -> bool:
         return False
     
     # 对于投资类型,检查嵌套结构
-    if expected_type in ["fsApproval", "fsReview", "pdApproval"]:
+    if expected_type in ["fsApproval", "fsReview", "pdApproval", "safetyFsApproval"]:
+        # 检查是否是新格式(包含 projectInfo)
+        project_info = None
+        if isinstance(data, dict) and "data" in data:
+            # 新格式:{"projectInfo": {...}, "data": [...]}
+            project_info = data.get("projectInfo")
+            data = data["data"]
+            
+            if project_info:
+                print(f"\n  📋 项目信息:")
+                print(f"     工程名称: {project_info.get('projectName', '')}")
+                print(f"     项目单位: {project_info.get('projectUnit', '')}")
+                print(f"     设计单位: {project_info.get('designUnit', '')}")
+        
+        # 验证数据格式
         if not isinstance(data, list):
             print_result(False, f"数据格式错误: 期望 list, 实际 {type(data).__name__}")
             return False
@@ -218,13 +258,24 @@ def validate_result(result: Dict[str, Any], expected_type: str) -> bool:
     return True
 
 
-def test_single_file(file_path: Path, document_type: str) -> bool:
-    """测试单个文件"""
+def test_single_file(file_path: Path, document_type: str, remove_watermark: bool = False, table_only: bool = True) -> bool:
+    """测试单个文件
+    
+    Args:
+        file_path: 文件路径
+        document_type: 文档类型
+        remove_watermark: 是否去水印
+        table_only: 是否只保留表格附件
+    """
     print_header(f"测试: {document_type}")
     print(f"  文件: {file_path.name}")
+    if remove_watermark:
+        print(f"  去水印: 是")
+    if table_only:
+        print(f"  只保留表格: 是")
     
     # 1. 上传文件
-    task_id = upload_file(file_path, document_type)
+    task_id = upload_file(file_path, document_type, remove_watermark, table_only)
     if not task_id:
         return False
     
@@ -276,7 +327,23 @@ def run_all_tests():
     skipped = 0
     
     # 运行每个测试用例
-    for filename, document_type in TEST_CASES.items():
+    for filename, config in TEST_CASES.items():
+        # 解析配置格式
+        if isinstance(config, tuple):
+            if len(config) >= 3:
+                document_type, remove_watermark, table_only = config[:3]
+            elif len(config) == 2:
+                document_type, remove_watermark = config
+                table_only = True  # 默认只保留表格
+            else:
+                document_type = config[0]
+                remove_watermark = False
+                table_only = True
+        else:
+            document_type = config
+            remove_watermark = False
+            table_only = True
+        
         file_path = TEST_DIR / filename
         
         if not file_path.exists():
@@ -288,7 +355,7 @@ def run_all_tests():
         total += 1
         
         try:
-            if test_single_file(file_path, document_type):
+            if test_single_file(file_path, document_type, remove_watermark, table_only):
                 passed += 1
             else:
                 failed += 1
@@ -319,11 +386,27 @@ def test_single(document_type: str):
         return
     
     # 查找对应的文件
-    for filename, dtype in TEST_CASES.items():
+    for filename, config in TEST_CASES.items():
+        # 解析配置格式
+        if isinstance(config, tuple):
+            if len(config) >= 3:
+                dtype, remove_watermark, table_only = config[:3]
+            elif len(config) == 2:
+                dtype, remove_watermark = config
+                table_only = True
+            else:
+                dtype = config[0]
+                remove_watermark = False
+                table_only = True
+        else:
+            dtype = config
+            remove_watermark = False
+            table_only = True
+        
         if dtype == document_type:
             file_path = TEST_DIR / filename
             if file_path.exists():
-                test_single_file(file_path, document_type)
+                test_single_file(file_path, document_type, remove_watermark, table_only)
                 return
             else:
                 print_result(False, f"文件不存在: {filename}")
@@ -332,7 +415,16 @@ def test_single(document_type: str):
     print_result(False, f"未找到类型 {document_type} 的测试文件")
 
 
-def test_ocr(image_path: Optional[str] = None) -> bool:
+def test_ocr(
+    image_path: Optional[str] = None,
+    remove_watermark: bool = False,
+    light_threshold: int = 200,
+    saturation_threshold: int = 30,
+    crop_header_footer: bool = False,
+    header_ratio: float = 0.05,
+    footer_ratio: float = 0.05,
+    auto_detect_header_footer: bool = False
+) -> bool:
     """
     测试 OCR 接口
     
@@ -341,6 +433,13 @@ def test_ocr(image_path: Optional[str] = None) -> bool:
                    支持格式:
                    - 图片文件:.png, .jpg, .jpeg
                    - txt文件:包含base64编码的图片数据(可带data:image/xxx;base64,前缀)
+        remove_watermark: 是否去除水印
+        light_threshold: 水印亮度阈值(0-255),默认200
+        saturation_threshold: 水印饱和度阈值(0-255),默认30
+        crop_header_footer: 是否裁剪页眉页脚
+        header_ratio: 页眉裁剪比例(0-1),默认0.05
+        footer_ratio: 页脚裁剪比例(0-1),默认0.05
+        auto_detect_header_footer: 是否自动检测页眉页脚边界
     
     Returns:
         是否测试成功
@@ -419,14 +518,33 @@ def test_ocr(image_path: Optional[str] = None) -> bool:
     
     # 调用 OCR 接口
     print(f"\n  📤 调用 OCR 接口...")
+    # 构建请求参数
+    request_data = {
+        "image_base64": image_base64,
+        "image_format": image_format
+    }
+    
+    if crop_header_footer:
+        request_data["crop_header_footer"] = True
+        if auto_detect_header_footer:
+            request_data["auto_detect_header_footer"] = True
+            print(f"  ✂️  裁剪页眉页脚: 自动检测模式")
+        else:
+            request_data["header_ratio"] = header_ratio
+            request_data["footer_ratio"] = footer_ratio
+            print(f"  ✂️  裁剪页眉页脚: 是 (顶部={header_ratio*100:.0f}%, 底部={footer_ratio*100:.0f}%)")
+    
+    if remove_watermark:
+        request_data["remove_watermark"] = True
+        request_data["watermark_light_threshold"] = light_threshold
+        request_data["watermark_saturation_threshold"] = saturation_threshold
+        print(f"  🔧 去水印: 是 (亮度阈值={light_threshold}, 饱和度阈值={saturation_threshold})")
+    
     try:
         start_time = time.time()
         response = requests.post(
             f"{API_BASE_URL}/ocr",
-            json={
-                "image_base64": image_base64,
-                "image_format": image_format
-            },
+            json=request_data,
             timeout=120
         )
         elapsed = time.time() - start_time
@@ -500,15 +618,75 @@ if __name__ == "__main__":
             print("  python test_api.py          # 运行所有测试")
             print("  python test_api.py <type>   # 测试指定类型")
             print("  python test_api.py ocr      # 测试 OCR 接口")
-            print("  python test_api.py ocr <image_path>  # 测试 OCR(指定图片)")
+            print("  python test_api.py ocr <image_path>  # 测试 OCR(指定图片或txt)")
+            print("  python test_api.py ocr <image_path> --nowm  # 测试 OCR 并去水印")
+            print("  python test_api.py ocr <image_path> --crop  # 测试 OCR 并裁剪页眉页脚")
+            print("  python test_api.py ocr <image_path> --nowm --crop  # 同时去水印和裁剪")
             print("\n可用类型:")
             for dtype in set(TEST_CASES.values()):
                 print(f"  - {dtype}")
             print("  - ocr  (OCR 图片识别)")
+            print("\nOCR 去水印参数:")
+            print("  --nowm         启用去水印")
+            print("  --light=N      亮度阈值(0-255,默认200)")
+            print("  --sat=N        饱和度阈值(0-255,默认30)")
+            print("\nOCR 裁剪页眉页脚参数:")
+            print("  --crop         启用裁剪页眉页脚(固定比例模式)")
+            print("  --crop-auto    启用裁剪页眉页脚(自动检测模式)")
+            print("  --header=N     页眉裁剪比例(0-1,默认0.05表示5%)")
+            print("  --footer=N     页脚裁剪比例(0-1,默认0.05表示5%)")
         elif doc_type == "ocr":
-            # 测试 OCR 接口
-            image_path = sys.argv[2] if len(sys.argv) > 2 else None
-            test_ocr(image_path)
+            # 解析 OCR 参数
+            image_path = None
+            remove_watermark = False
+            light_threshold = 200
+            saturation_threshold = 30
+            crop_header_footer = False
+            header_ratio = 0.05
+            footer_ratio = 0.05
+            auto_detect_header_footer = False
+            
+            for arg in sys.argv[2:]:
+                if arg == "--nowm":
+                    remove_watermark = True
+                elif arg == "--crop":
+                    crop_header_footer = True
+                elif arg == "--crop-auto":
+                    crop_header_footer = True
+                    auto_detect_header_footer = True
+                elif arg.startswith("--light="):
+                    try:
+                        light_threshold = int(arg.split("=")[1])
+                    except ValueError:
+                        print(f"警告: 无效的亮度阈值 {arg},使用默认值 200")
+                elif arg.startswith("--sat="):
+                    try:
+                        saturation_threshold = int(arg.split("=")[1])
+                    except ValueError:
+                        print(f"警告: 无效的饱和度阈值 {arg},使用默认值 30")
+                elif arg.startswith("--header="):
+                    try:
+                        header_ratio = float(arg.split("=")[1])
+                    except ValueError:
+                        print(f"警告: 无效的页眉比例 {arg},使用默认值 0.05")
+                elif arg.startswith("--footer="):
+                    try:
+                        footer_ratio = float(arg.split("=")[1])
+                    except ValueError:
+                        print(f"警告: 无效的页脚比例 {arg},使用默认值 0.05")
+                elif not arg.startswith("--"):
+                    image_path = arg
+            
+            test_ocr(
+                image_path, 
+                remove_watermark, 
+                light_threshold, 
+                saturation_threshold,
+                crop_header_footer,
+                header_ratio,
+                footer_ratio,
+                auto_detect_header_footer
+            )
         else:
             test_single(doc_type)
     else:

+ 261 - 15
pdf_converter_v2/test_no.py

@@ -45,11 +45,20 @@ except ImportError:
     logger.info("[附件切割] 安装命令: pip install PyPDF2")
 
 # 配置
-PDF_PATH = '/home/hws/workspace/GitLab/Clerk2.5/pdf_converter_v2/2-数据源/4-(初设评审)中电联电力建设技术经济咨询中心技经〔2019〕201号关于山西周村220kV输变电工程初步设计的评审意见.pdf'
+PDF_PATH = '/home/hws/workspace/GitLab/Clerk2.5/pdf_converter_v2/2-数据源/1-(可研评审)晋电经研规划〔2017〕187号(盖章)国网山西经研院关于山西晋城周村220kV输变电工程可行性研究报告的评审意见.pdf'
 OUTPUT_DIR = Path('附件页')
 USE_OCR = True  # 是否启用 OCR
 OCR_LANG = 'chi_sim+eng'  # OCR 语言
-DEBUG_MODE = True  # 是否启用调试模式(显示每页的文本内容)
+DEBUG_MODE = False  # 是否启用调试模式(显示每页的文本内容)
+
+# Watermark-removal settings
+REMOVE_WATERMARK = False  # Whether to remove watermarks from the split attachment-page PDF
+WATERMARK_LIGHT_THRESHOLD = 200  # Watermark brightness threshold (0-255); light pixels above this value may be watermark
+WATERMARK_SATURATION_THRESHOLD = 30  # Watermark saturation threshold (0-255); low-saturation pixels below this value may be watermark
+WATERMARK_DPI = 200  # DPI used when rendering the PDF to images (for watermark removal)
+
+# Table-attachment filter settings
+TABLE_ONLY = True  # Whether to keep only attachment pages that contain tables (drops diagrams, review opinions, etc.)
 
 # 附件页识别关键词
 ATTACHMENT_START_KEYWORDS = [
@@ -59,6 +68,40 @@ ATTACHMENT_START_KEYWORDS = [
     '附 件:',
 ]
 
+# Keywords that identify a table attachment (used to keep only attachments that contain tables)
+TABLE_ATTACHMENT_KEYWORDS = [
+    '项目表',
+    '投资估算',
+    '工程投资',
+    '建设规模',
+    '技术方案',
+    '变电工程',
+    '线路工程',
+    '静态投资',
+    '动态投资',
+    '单位造价',
+    '设备购置费',
+    '安装工程费',
+    '建筑工程费',
+    '其他费用',
+    '基本预备费',
+]
+
+# Keywords that identify a non-table attachment (used to recognise attachments that should be skipped)
+NON_TABLE_ATTACHMENT_KEYWORDS = [
+    '示意图',
+    '接入系统示意图',
+    '母线间隔排列图',
+    '评审意见',
+    '技术监督意见',
+    '参会单位',
+    '人员一览表',
+    '经济性评价',
+    '财务合规',
+    '审核结果',
+    '预算编制衔接',
+]
+
 def ocr_page_image(image) -> str:
     """
     对图片进行 OCR 识别(优先使用 Tesseract,备用 PaddleOCR)
@@ -143,6 +186,63 @@ def extract_page_text(page, use_ocr: bool = False) -> str:
     logger.warning(f"[附件切割] 第{page.page_number}页: 无法提取文本(OCR未启用或不可用)")
     return ""
 
+def is_table_attachment_page(text: str, page) -> bool:
+    """
+    Decide whether an attachment page should be kept as a table page.
+    
+    Args:
+        text: Text of the page; empty text is always rejected.
+        page: Page object offering extract_tables(); may be None to skip table detection.
+    
+    Returns:
+        bool: False on any non-table keyword; True on any table keyword or a large enough detected table.
+    """
+    if not text:
+        return False
+    
+    text_no_space = text.replace(' ', '').replace('\u3000', '')
+    
+    # Non-table keywords (diagrams, review opinions, ...) win over everything else: reject at once
+    for keyword in NON_TABLE_ATTACHMENT_KEYWORDS:
+        keyword_no_space = keyword.replace(' ', '').replace('\u3000', '')
+        if keyword_no_space in text_no_space:
+            logger.debug(f"[附件切割] 检测到非表格附件关键词: {keyword}")
+            return False
+    
+    # Look for any table-attachment keyword in the space-stripped text
+    has_table_keyword = False
+    for keyword in TABLE_ATTACHMENT_KEYWORDS:
+        keyword_no_space = keyword.replace(' ', '').replace('\u3000', '')
+        if keyword_no_space in text_no_space:
+            logger.debug(f"[附件切割] 检测到表格关键词: {keyword}")
+            has_table_keyword = True
+            break
+    
+    # A table keyword is sufficient on its own; no structural check is needed
+    if has_table_keyword:
+        return True
+    
+    # No keyword matched: fall back to the page's own table extraction
+    if page is not None:
+        try:
+            tables = page.extract_tables()
+            if tables and len(tables) > 0:
+                # Only count tables that are big enough: at least 3 non-empty rows and 3 non-empty cells in one row
+                for table in tables:
+                    if table and len(table) >= 3:
+                        # Drop rows whose cells are all empty, then check the widest remaining row
+                        non_empty_rows = [row for row in table if row and any(cell for cell in row if cell)]
+                        if len(non_empty_rows) >= 3:
+                            row_with_most_cols = max(non_empty_rows, key=lambda r: len([c for c in r if c]))
+                            if len([c for c in row_with_most_cols if c]) >= 3:
+                                logger.debug(f"[附件切割] 检测到表格: {len(non_empty_rows)}行")
+                                return True
+        except Exception as e:
+            logger.warning(f"[附件切割] 表格检测失败: {e}")
+    
+    return False
+
+
 def is_attachment_start_page(text: str) -> bool:
     """
     判断是否是附件清单页(附件开始的前一页)
@@ -339,7 +439,10 @@ def extract_pages(pdf_path: str, page_numbers: list, output_path: str):
     logger.info(f"[附件切割] 已保存到: {output_path}")
     print(f"✓ 已保存到: {output_path}")
 
-def split_attachment_pages(pdf_path: str, output_dir: Path, use_ocr: bool = False, debug: bool = False):
+def split_attachment_pages(pdf_path: str, output_dir: Path, use_ocr: bool = False, debug: bool = False, 
+                          remove_watermark: bool = False, watermark_light_threshold: int = 200,
+                          watermark_saturation_threshold: int = 30, watermark_dpi: int = 200,
+                          table_only: bool = False):
     """
     查找并切割附件页
     
@@ -348,8 +451,14 @@ def split_attachment_pages(pdf_path: str, output_dir: Path, use_ocr: bool = Fals
         output_dir: 输出目录
         use_ocr: 是否使用 OCR
         debug: 是否输出调试信息
+        remove_watermark: 是否对切割后的附件页PDF去水印
+        watermark_light_threshold: 水印亮度阈值(0-255)
+        watermark_saturation_threshold: 水印饱和度阈值(0-255)
+        watermark_dpi: PDF转图片的DPI
+        table_only: 是否只保留包含表格的附件页(过滤掉示意图、评审意见等)
     """
     logger.info(f"[附件切割] 开始处理PDF: {pdf_path}")
+    logger.info(f"[附件切割] 只保留表格附件: {'是' if table_only else '否'}")
     
     # 查找附件开始页
     attachment_start = find_attachment_start_page(pdf_path, use_ocr=use_ocr, debug=debug)
@@ -359,16 +468,65 @@ def split_attachment_pages(pdf_path: str, output_dir: Path, use_ocr: bool = Fals
         print("\n未找到附件页")
         return
     
-    # 获取总页数
+    # 获取总页数和筛选表格附件页
     with pdfplumber.open(pdf_path) as pdf:
         total_pages = len(pdf.pages)
-    
-    # 附件页范围:从附件开始页到最后一页
-    attachment_pages = list(range(attachment_start, total_pages + 1))
-    
-    logger.info(f"[附件切割] 附件页范围: {attachment_start}-{total_pages}, 共 {len(attachment_pages)} 页")
-    print(f"\n附件页范围: 第 {attachment_start} 页 到 第 {total_pages} 页")
-    print(f"共 {len(attachment_pages)} 页")
+        
+        if table_only:
+            # 只保留包含表格的附件页
+            logger.info(f"[附件切割] 启用表格附件过滤,开始筛选...")
+            print(f"\n启用表格附件过滤,开始筛选...")
+            
+            attachment_pages = []
+            current_table_section = []  # 当前表格区段的页面
+            in_table_section = False  # 是否在表格区段内
+            
+            for page_num in range(attachment_start, total_pages + 1):
+                page = pdf.pages[page_num - 1]
+                text = extract_page_text(page, use_ocr=use_ocr)
+                
+                is_table_page = is_table_attachment_page(text, page)
+                
+                if debug:
+                    print(f"  页面 {page_num}: {'表格页' if is_table_page else '非表格页'}")
+                
+                if is_table_page:
+                    if not in_table_section:
+                        # 开始新的表格区段
+                        in_table_section = True
+                        current_table_section = [page_num]
+                        logger.debug(f"[附件切割] 开始表格区段: 第 {page_num} 页")
+                    else:
+                        # 继续当前表格区段
+                        current_table_section.append(page_num)
+                else:
+                    if in_table_section:
+                        # 结束当前表格区段,保存
+                        attachment_pages.extend(current_table_section)
+                        logger.info(f"[附件切割] 表格区段结束: {current_table_section[0]}-{current_table_section[-1]}")
+                        current_table_section = []
+                        in_table_section = False
+            
+            # 处理最后一个表格区段
+            if in_table_section and current_table_section:
+                attachment_pages.extend(current_table_section)
+                logger.info(f"[附件切割] 最后表格区段: {current_table_section[0]}-{current_table_section[-1]}")
+            
+            if not attachment_pages:
+                logger.warning(f"[附件切割] 未找到包含表格的附件页")
+                print("\n未找到包含表格的附件页")
+                return
+            
+            logger.info(f"[附件切割] 筛选后的表格附件页: {attachment_pages}")
+            print(f"\n筛选后的表格附件页: {attachment_pages}")
+            print(f"共 {len(attachment_pages)} 页")
+        else:
+            # 附件页范围:从附件开始页到最后一页
+            attachment_pages = list(range(attachment_start, total_pages + 1))
+            
+            logger.info(f"[附件切割] 附件页范围: {attachment_start}-{total_pages}, 共 {len(attachment_pages)} 页")
+            print(f"\n附件页范围: 第 {attachment_start} 页 到 第 {total_pages} 页")
+            print(f"共 {len(attachment_pages)} 页")
     
     # 切割附件页
     print("\n" + "=" * 60)
@@ -379,14 +537,70 @@ def split_attachment_pages(pdf_path: str, output_dir: Path, use_ocr: bool = Fals
     output_dir.mkdir(parents=True, exist_ok=True)
     
     # 保存所有附件页为一个文件
-    output_file = output_dir / f"{pdf_path.stem}_附件页_{attachment_start}-{total_pages}.pdf"
+    if table_only:
+        # 表格附件模式:使用筛选后的页面范围
+        page_range_str = f"{min(attachment_pages)}_{max(attachment_pages)}" if attachment_pages else "none"
+        output_file = output_dir / f"{pdf_path.stem}_表格附件页_{page_range_str}.pdf"
+    else:
+        output_file = output_dir / f"{pdf_path.stem}_附件页_{attachment_start}-{total_pages}.pdf"
+    
     logger.info(f"[附件切割] 输出文件: {output_file}")
     extract_pages(pdf_path, attachment_pages, output_file)
     
     logger.info(f"[附件切割] 切割完成: {len(attachment_pages)} 页附件已保存")
     print(f"\n✓ 切割完成!")
     print(f"附件页数: {len(attachment_pages)} 页")
-    print(f"输出目录: {output_dir.absolute()}")
+    print(f"输出文件: {output_file}")
+    
+    # 如果启用去水印,对切割后的附件页PDF进行去水印处理
+    if remove_watermark:
+        logger.info(f"[附件切割] 开始对附件页PDF进行去水印处理...")
+        print("\n" + "=" * 60)
+        print("开始去水印处理")
+        print("=" * 60)
+        
+        try:
+            # 导入去水印模块
+            import sys
+            from pathlib import Path as PathLib
+            sys.path.insert(0, str(PathLib(__file__).parent))
+            
+            from utils.pdf_watermark_remover import remove_watermark_from_pdf
+            
+            # 去水印后的PDF路径
+            nowm_output_file = output_dir / f"{output_file.stem}_nowm.pdf"
+            
+            logger.info(f"[附件切割] 去水印参数: 亮度阈值={watermark_light_threshold}, 饱和度阈值={watermark_saturation_threshold}, DPI={watermark_dpi}")
+            print(f"去水印参数:")
+            print(f"  - 亮度阈值: {watermark_light_threshold}")
+            print(f"  - 饱和度阈值: {watermark_saturation_threshold}")
+            print(f"  - DPI: {watermark_dpi}")
+            
+            # 执行去水印
+            success = remove_watermark_from_pdf(
+                input_pdf=str(output_file),
+                output_pdf=str(nowm_output_file),
+                light_threshold=watermark_light_threshold,
+                saturation_threshold=watermark_saturation_threshold,
+                dpi=watermark_dpi
+            )
+            
+            if success and nowm_output_file.exists():
+                logger.info(f"[附件切割] 去水印完成: {nowm_output_file}")
+                print(f"\n✓ 去水印完成!")
+                print(f"去水印后的文件: {nowm_output_file}")
+            else:
+                logger.warning(f"[附件切割] 去水印失败")
+                print(f"\n⚠ 去水印失败,请检查日志")
+        except ImportError as e:
+            logger.error(f"[附件切割] 导入去水印模块失败: {e}")
+            print(f"\n⚠ 去水印模块导入失败: {e}")
+            print("请确保 utils/pdf_watermark_remover.py 文件存在")
+        except Exception as e:
+            logger.exception(f"[附件切割] 去水印处理失败: {e}")
+            print(f"\n⚠ 去水印处理失败: {e}")
+    
+    print(f"\n输出目录: {output_dir.absolute()}")
 
 if __name__ == '__main__':
     logger.info("[附件切割] " + "=" * 50)
@@ -397,6 +611,19 @@ if __name__ == '__main__':
     print("PDF 附件页识别和切割工具")
     print("=" * 60)
     
+    # 显示配置信息
+    print("\n配置信息:")
+    print(f"  - PDF文件: {PDF_PATH}")
+    print(f"  - 输出目录: {OUTPUT_DIR}")
+    print(f"  - OCR: {'启用' if USE_OCR else '禁用'}")
+    print(f"  - 调试模式: {'启用' if DEBUG_MODE else '禁用'}")
+    print(f"  - 只保留表格附件: {'启用' if TABLE_ONLY else '禁用'}")
+    print(f"  - 去水印: {'启用' if REMOVE_WATERMARK else '禁用'}")
+    if REMOVE_WATERMARK:
+        print(f"    * 亮度阈值: {WATERMARK_LIGHT_THRESHOLD}")
+        print(f"    * 饱和度阈值: {WATERMARK_SATURATION_THRESHOLD}")
+        print(f"    * DPI: {WATERMARK_DPI}")
+    
     # 检查依赖
     if not TESSERACT_AVAILABLE and USE_OCR:
         logger.warning("[附件切割] OCR 功能不可用")
@@ -412,7 +639,26 @@ if __name__ == '__main__':
         print("安装方法:")
         print("  pip install PyPDF2\n")
     
+    if REMOVE_WATERMARK:
+        print("\n⚠ 去水印功能需要以下依赖:")
+        print("  - OpenCV (cv2)")
+        print("  - Pillow (PIL)")
+        print("  - pdf2image")
+        print("  - PyPDF2")
+        print("安装命令:")
+        print("  pip install opencv-python pillow pdf2image PyPDF2\n")
+    
     # 执行切割
-    logger.info(f"[附件切割] 配置: PDF={PDF_PATH}, 输出={OUTPUT_DIR}, OCR={USE_OCR}, DEBUG={DEBUG_MODE}")
-    split_attachment_pages(PDF_PATH, OUTPUT_DIR, use_ocr=USE_OCR, debug=DEBUG_MODE)
+    logger.info(f"[附件切割] 配置: PDF={PDF_PATH}, 输出={OUTPUT_DIR}, OCR={USE_OCR}, DEBUG={DEBUG_MODE}, 表格附件={TABLE_ONLY}, 去水印={REMOVE_WATERMARK}")
+    split_attachment_pages(
+        PDF_PATH, 
+        OUTPUT_DIR, 
+        use_ocr=USE_OCR, 
+        debug=DEBUG_MODE,
+        remove_watermark=REMOVE_WATERMARK,
+        watermark_light_threshold=WATERMARK_LIGHT_THRESHOLD,
+        watermark_saturation_threshold=WATERMARK_SATURATION_THRESHOLD,
+        watermark_dpi=WATERMARK_DPI,
+        table_only=TABLE_ONLY
+    )
     logger.info("[附件切割] 程序执行完成")

+ 526 - 0
pdf_converter_v2/utils/image_preprocessor.py

@@ -0,0 +1,526 @@
+"""
+图像预处理工具 - 包含去水印等功能
+
+支持的预处理操作:
+- 去水印(颜色过滤法)
+- 灰度转换
+- 二值化
+- 去噪
+"""
+
+import numpy as np
+from pathlib import Path
+from typing import Optional, Tuple
+from loguru import logger
+
+try:
+    from PIL import Image
+    PIL_AVAILABLE = True
+except ImportError:
+    PIL_AVAILABLE = False
+    logger.warning("[图像预处理] PIL 未安装,部分功能不可用")
+
+try:
+    import cv2
+    CV2_AVAILABLE = True
+except ImportError:
+    CV2_AVAILABLE = False
+    logger.warning("[图像预处理] OpenCV 未安装,部分功能不可用")
+
+
+def remove_watermark(
+    image_path: str,
+    output_path: Optional[str] = None,
+    light_threshold: int = 200,
+    saturation_threshold: int = 30,
+    method: str = "auto"
+) -> str:
+    """
+    Remove a light-coloured watermark from an image file.
+
+    The image is read with OpenCV, handed to one of the private
+    ``_remove_watermark_*`` helpers, and the result is written to disk.
+
+    Args:
+        image_path: Path of the input image.
+        output_path: Path of the output image. When omitted, the result is
+            written next to the input as ``<stem>_nowm<suffix>``.
+        light_threshold: Brightness threshold forwarded to the "light" and
+            "hsv" methods; brighter pixels are watermark candidates.
+        saturation_threshold: Saturation threshold forwarded to the "hsv"
+            method; less saturated pixels are watermark candidates.
+        method: One of
+            - "auto": currently an alias for "hsv"
+            - "light": brightness-only masking
+            - "hsv": brightness + saturation masking
+            - "adaptive": adaptive thresholding (ignores both thresholds)
+            Any other value logs a warning and falls back to "hsv".
+
+    Returns:
+        The path of the written image. The unchanged ``image_path`` is
+        returned when OpenCV is unavailable, when the input cannot be read,
+        or when the result cannot be written.
+    """
+    if not CV2_AVAILABLE:
+        logger.warning("[去水印] OpenCV 未安装,跳过去水印处理")
+        return image_path
+
+    logger.info(f"[去水印] 开始处理: {image_path}")
+    logger.info(f"[去水印] 方法: {method}, 亮度阈值: {light_threshold}, 饱和度阈值: {saturation_threshold}")
+
+    # cv2.imread signals failure by returning None instead of raising.
+    img = cv2.imread(image_path)
+    if img is None:
+        logger.error(f"[去水印] 无法读取图片: {image_path}")
+        return image_path
+
+    original_shape = img.shape
+    logger.info(f"[去水印] 图片尺寸: {original_shape[1]}x{original_shape[0]}")
+
+    # "auto" does no detection of its own; it simply selects the HSV method.
+    if method == "auto":
+        method = "hsv"
+
+    if method == "light":
+        result = _remove_watermark_light(img, light_threshold)
+    elif method == "hsv":
+        result = _remove_watermark_hsv(img, light_threshold, saturation_threshold)
+    elif method == "adaptive":
+        result = _remove_watermark_adaptive(img)
+    else:
+        logger.warning(f"[去水印] 未知方法: {method},使用 hsv")
+        result = _remove_watermark_hsv(img, light_threshold, saturation_threshold)
+
+    # Default output: same directory and extension, "_nowm" appended to the stem.
+    if output_path is None:
+        path = Path(image_path)
+        output_path = str(path.parent / f"{path.stem}_nowm{path.suffix}")
+
+    # cv2.imwrite reports a failed write through its boolean return value.
+    # Fall back to the input path so callers never receive a path that was
+    # not actually written.
+    if not cv2.imwrite(output_path, result):
+        logger.error(f"[去水印] 无法保存图片: {output_path}")
+        return image_path
+
+    logger.info(f"[去水印] 处理完成,保存到: {output_path}")
+    return output_path
+
+
+def _remove_watermark_light(img: np.ndarray, threshold: int = 200) -> np.ndarray:
+    """
+    Brightness-only watermark removal.
+
+    Every pixel whose grayscale value is strictly greater than ``threshold``
+    is painted white in a copy of ``img``; the input array is not modified.
+    Intended for light or gray watermarks.
+    """
+    brightness = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
+
+    cleaned = img.copy()
+    # Boolean-mask assignment sets all three channels of each selected pixel.
+    cleaned[brightness > threshold] = [255, 255, 255]
+    return cleaned
+
+
+def _remove_watermark_hsv(
+    img: np.ndarray,
+    light_threshold: int = 200,
+    saturation_threshold: int = 30
+) -> np.ndarray:
+    """
+    HSV-based watermark removal.
+
+    A pixel is treated as watermark when its V (brightness) channel is
+    strictly above ``light_threshold`` AND its S (saturation) channel is
+    strictly below ``saturation_threshold``. Those pixels are painted white
+    in a copy of ``img``; the input array is not modified.
+    """
+    hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
+
+    # Only saturation and value are needed; hue plays no part in the mask.
+    saturation = hsv[:, :, 1]
+    value = hsv[:, :, 2]
+    is_watermark = (value > light_threshold) & (saturation < saturation_threshold)
+
+    cleaned = img.copy()
+    cleaned[is_watermark] = [255, 255, 255]
+    return cleaned
+
+
+def _remove_watermark_adaptive(img: np.ndarray) -> np.ndarray:
+    """
+    Adaptive-threshold watermark removal.
+
+    The image is converted to grayscale, binarised with a Gaussian-weighted
+    adaptive threshold (15x15 neighbourhood, constant 10), and converted back
+    to a 3-channel BGR image. The output is therefore black and white only.
+    Intended for complex backgrounds or uneven lighting.
+    """
+    grayscale = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
+
+    # The threshold is computed per neighbourhood rather than globally.
+    thresholded = cv2.adaptiveThreshold(
+        grayscale,
+        255,
+        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
+        cv2.THRESH_BINARY,
+        15,
+        10,
+    )
+
+    # Callers expect a 3-channel image, same as the other removal helpers.
+    return cv2.cvtColor(thresholded, cv2.COLOR_GRAY2BGR)
+
+
+def enhance_for_ocr(
+    image_path: str,
+    output_path: Optional[str] = None,
+    remove_wm: bool = True,
+    denoise: bool = True,
+    sharpen: bool = False
+) -> str:
+    """
+    Apply a chain of preprocessing steps to an image before OCR.
+
+    Steps run in a fixed order: watermark removal, denoising, sharpening.
+    Each step is optional.
+
+    Args:
+        image_path: Path of the input image.
+        output_path: Path of the output image. When omitted, the result is
+            written next to the input as ``<stem>_enhanced<suffix>``.
+        remove_wm: Remove watermarks using the HSV method with its default
+            thresholds.
+        denoise: Apply OpenCV non-local-means colour denoising.
+        sharpen: Apply a 3x3 sharpening convolution.
+
+    Returns:
+        The path of the written image. The unchanged ``image_path`` is
+        returned when OpenCV is unavailable, when the input cannot be read,
+        or when the result cannot be written.
+    """
+    if not CV2_AVAILABLE:
+        logger.warning("[OCR预处理] OpenCV 未安装,跳过预处理")
+        return image_path
+
+    logger.info(f"[OCR预处理] 开始处理: {image_path}")
+
+    # cv2.imread signals failure by returning None instead of raising.
+    img = cv2.imread(image_path)
+    if img is None:
+        logger.error(f"[OCR预处理] 无法读取图片: {image_path}")
+        return image_path
+
+    # No defensive copy is needed: every step below returns a new array and
+    # `img` is local to this function.
+    result = img
+
+    # 1. Watermark removal
+    if remove_wm:
+        result = _remove_watermark_hsv(result)
+        logger.info("[OCR预处理] 已去除水印")
+
+    # 2. Denoising (h=10, hColor=10, templateWindowSize=7, searchWindowSize=21)
+    if denoise:
+        result = cv2.fastNlMeansDenoisingColored(result, None, 10, 10, 7, 21)
+        logger.info("[OCR预处理] 已去噪")
+
+    # 3. Sharpening; the kernel weights sum to 1, so overall brightness is kept
+    if sharpen:
+        kernel = np.array([[-1, -1, -1],
+                           [-1,  9, -1],
+                           [-1, -1, -1]])
+        result = cv2.filter2D(result, -1, kernel)
+        logger.info("[OCR预处理] 已锐化")
+
+    # Default output: same directory and extension, "_enhanced" appended.
+    if output_path is None:
+        path = Path(image_path)
+        output_path = str(path.parent / f"{path.stem}_enhanced{path.suffix}")
+
+    # cv2.imwrite reports a failed write through its boolean return value.
+    # Fall back to the input path so callers never receive a path that was
+    # not actually written.
+    if not cv2.imwrite(output_path, result):
+        logger.error(f"[OCR预处理] 无法保存图片: {output_path}")
+        return image_path
+
+    logger.info(f"[OCR预处理] 处理完成,保存到: {output_path}")
+    return output_path
+
+
+def check_opencv_available() -> bool:
+    """Return True when the module-level ``import cv2`` succeeded."""
+    return CV2_AVAILABLE
+
+
+def crop_header_footer(
+    image_path: str,
+    output_path: Optional[str] = None,
+    header_ratio: float = 0.05,
+    footer_ratio: float = 0.05,
+    auto_detect: bool = False
+) -> str:
+    """
+    Crop the header and footer bands off an image.
+
+    Args:
+        image_path: Path of the input image.
+        output_path: Path of the output image. When omitted, the result is
+            written next to the input as ``<stem>_cropped<suffix>``.
+        header_ratio: Fraction of the image height to remove from the top
+            (0-1). The resulting pixel count is clamped to the image height.
+        footer_ratio: Fraction of the image height to remove from the bottom
+            (0-1). The resulting pixel count is clamped to the image height.
+        auto_detect: Detect the bands automatically instead of using the two
+            ratios, which are then ignored.
+
+    Returns:
+        The path of the written image. The unchanged ``image_path`` is
+        returned when OpenCV is unavailable, when the input cannot be read,
+        when the crop would leave no rows, or when the result cannot be
+        written.
+    """
+    if not CV2_AVAILABLE:
+        logger.warning("[裁剪页眉页脚] OpenCV 未安装,跳过处理")
+        return image_path
+
+    logger.info(f"[裁剪页眉页脚] 开始处理: {image_path}")
+
+    # cv2.imread signals failure by returning None instead of raising.
+    img = cv2.imread(image_path)
+    if img is None:
+        logger.error(f"[裁剪页眉页脚] 无法读取图片: {image_path}")
+        return image_path
+
+    height, width = img.shape[:2]
+    logger.info(f"[裁剪页眉页脚] 原始尺寸: {width}x{height}")
+
+    if auto_detect:
+        logger.info("[裁剪页眉页脚] 使用自动检测模式")
+        header_pixels, footer_pixels = _detect_header_footer_boundaries(img)
+        logger.info(f"[裁剪页眉页脚] 自动检测结果: 页眉={header_pixels}px, 页脚={footer_pixels}px")
+    else:
+        logger.info(f"[裁剪页眉页脚] 使用固定比例: 页眉={header_ratio}, 页脚={footer_ratio}")
+        header_pixels = int(height * header_ratio)
+        footer_pixels = int(height * footer_ratio)
+
+    # Clamp to [0, height]. A negative value would otherwise become a
+    # negative slice index (counted from the other end of the array) and
+    # silently produce a wrong or empty crop.
+    header_pixels = min(max(int(header_pixels), 0), height)
+    footer_pixels = min(max(int(footer_pixels), 0), height)
+
+    # Keep the rows between the two bands.
+    top = header_pixels
+    bottom = height - footer_pixels
+
+    if top >= bottom:
+        logger.warning("[裁剪页眉页脚] 裁剪区域无效,跳过处理")
+        return image_path
+
+    result = img[top:bottom, :]
+
+    new_height = result.shape[0]
+    logger.info(f"[裁剪页眉页脚] 裁剪后尺寸: {width}x{new_height}")
+    logger.info(f"[裁剪页眉页脚] 裁剪了顶部 {header_pixels}px,底部 {footer_pixels}px")
+
+    # Default output: same directory and extension, "_cropped" appended.
+    if output_path is None:
+        path = Path(image_path)
+        output_path = str(path.parent / f"{path.stem}_cropped{path.suffix}")
+
+    # cv2.imwrite reports a failed write through its boolean return value.
+    # Fall back to the input path so callers never receive a path that was
+    # not actually written.
+    if not cv2.imwrite(output_path, result):
+        logger.error(f"[裁剪页眉页脚] 无法保存图片: {output_path}")
+        return image_path
+
+    logger.info(f"[裁剪页眉页脚] 处理完成,保存到: {output_path}")
+    return output_path
+
+
+def _detect_header_footer_boundaries(img: np.ndarray) -> Tuple[int, int]:
+    """
+    Estimate how many pixel rows the header and the footer occupy.
+
+    Two detectors are run on the top and bottom 15% of the page: a
+    horizontal-line search and a content-density scan. For each band the
+    line result is preferred, then the density result; a candidate is only
+    accepted when it exceeds 2% of the page height, which is also the
+    fallback value.
+
+    Args:
+        img: Input image in BGR channel order.
+
+    Returns:
+        (header_pixels, footer_pixels): heights of the header and footer
+        bands in pixels.
+    """
+    page_height, _ = img.shape[:2]
+
+    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
+
+    band = int(page_height * 0.15)         # rows searched at each end
+    min_margin = int(page_height * 0.02)   # smallest band ever reported
+    footer_band_top = page_height - band
+
+    # Detector 1: separator lines
+    header_line = _find_horizontal_line(gray, 0, band, from_top=True)
+    footer_line = _find_horizontal_line(gray, footer_band_top, page_height, from_top=False)
+
+    # Detector 2: content-density transitions
+    header_density = _find_content_boundary(gray, 0, band, from_top=True)
+    footer_density = _find_content_boundary(gray, footer_band_top, page_height, from_top=False)
+
+    # Header: line result first, density second, minimum margin last.
+    if header_line > min_margin:
+        header_pixels = header_line
+        logger.debug(f"[自动检测] 页眉: 使用水平线检测结果 {header_pixels}px")
+    elif header_density > min_margin:
+        header_pixels = header_density
+        logger.debug(f"[自动检测] 页眉: 使用密度分析结果 {header_pixels}px")
+    else:
+        header_pixels = min_margin
+        logger.debug(f"[自动检测] 页眉: 使用最小边距 {header_pixels}px")
+
+    # Footer: same preference order.
+    if footer_line > min_margin:
+        footer_pixels = footer_line
+        logger.debug(f"[自动检测] 页脚: 使用水平线检测结果 {footer_pixels}px")
+    elif footer_density > min_margin:
+        footer_pixels = footer_density
+        logger.debug(f"[自动检测] 页脚: 使用密度分析结果 {footer_pixels}px")
+    else:
+        footer_pixels = min_margin
+        logger.debug(f"[自动检测] 页脚: 使用最小边距 {footer_pixels}px")
+
+    return header_pixels, footer_pixels
+
+
+def _find_horizontal_line(
+    gray: np.ndarray,
+    start_y: int,
+    end_y: int,
+    from_top: bool = True
+) -> int:
+    """
+    Look for a long horizontal separator line inside a band of rows.
+
+    Canny edges of rows ``start_y:end_y`` are passed to a probabilistic
+    Hough transform; segments tilted less than 5 degrees from horizontal are
+    kept.
+
+    Args:
+        gray: Grayscale image.
+        start_y: First row of the search band.
+        end_y: Row just past the end of the search band.
+        from_top: True when searching a header band, False for a footer band.
+
+    Returns:
+        For ``from_top=True`` the row of the lowest line found; otherwise the
+        distance from the bottom of the image to the highest line found.
+        0 when no suitable line exists.
+    """
+    height, width = gray.shape
+
+    band_edges = cv2.Canny(gray[start_y:end_y, :], 50, 150)
+
+    segments = cv2.HoughLinesP(
+        band_edges,
+        rho=1,
+        theta=np.pi/180,
+        threshold=int(width * 0.5),      # accumulator votes required
+        minLineLength=int(width * 0.4),  # shortest segment accepted
+        maxLineGap=20
+    )
+    if segments is None:
+        return 0
+
+    # Collect the mid-row (in full-image coordinates) of each flat segment.
+    line_rows = []
+    for segment in segments:
+        x1, y1, x2, y2 = segment[0]
+        tilt = abs(np.arctan2(y2 - y1, x2 - x1) * 180 / np.pi)
+        if not (tilt < 5 or tilt > 175):
+            continue
+        line_rows.append((y1 + y2) // 2 + start_y)
+
+    if not line_rows:
+        return 0
+
+    # Header: lowest line is the band's lower edge.
+    # Footer: highest line, reported as a distance from the bottom.
+    return max(line_rows) if from_top else height - min(line_rows)
+
+
+def _find_content_boundary(
+    gray: np.ndarray,
+    start_y: int,
+    end_y: int,
+    from_top: bool = True
+) -> int:
+    """
+    Locate where dense content begins inside a band of rows.
+
+    The image is binarised (pixels darker than or equal to 200 become
+    foreground), the foreground fraction of each row in ``start_y:end_y`` is
+    smoothed with a 10-row window, and the band is scanned for the first run
+    of 5 consecutive rows whose smoothed density exceeds 0.02.
+
+    Args:
+        gray: Grayscale image.
+        start_y: First row of the search band.
+        end_y: Row just past the end of the search band.
+        from_top: True scans downward from ``start_y``; False scans upward
+            from ``end_y``.
+
+    Returns:
+        For ``from_top=True`` the offset of the boundary from ``start_y``;
+        otherwise its distance from the bottom of the image. 0 when no such
+        run is found.
+    """
+    height, width = gray.shape
+
+    _, binary = cv2.threshold(gray, 200, 255, cv2.THRESH_BINARY_INV)
+
+    # Fraction of foreground pixels in each row of the band.
+    band_rows = range(start_y, end_y)
+    densities = [np.sum(binary[y, :] > 0) / width for y in band_rows]
+    if not densities:
+        return 0
+
+    # Moving average over the rows [i - 5, i + 5), truncated at the band ends.
+    half_window = 10 // 2
+    row_count = len(densities)
+    smoothed = []
+    for i in range(row_count):
+        lo = max(0, i - half_window)
+        hi = min(row_count, i + half_window)
+        smoothed.append(sum(densities[lo:hi]) / (hi - lo))
+
+    scan = list(zip(band_rows, smoothed))
+    if not from_top:
+        scan.reverse()
+
+    # Row at which the fifth consecutive dense row is reached, if any.
+    trigger_row = None
+    run_length = 0
+    for y, density in scan:
+        if density > 0.02:
+            run_length += 1
+            if run_length >= 5:
+                trigger_row = y
+                break
+        else:
+            run_length = 0
+
+    if from_top:
+        # Step back over the 5 rows that were needed to confirm the run.
+        content_start = 0 if trigger_row is None else trigger_row - 5
+        return max(0, content_start - start_y)
+
+    content_end = height if trigger_row is None else trigger_row + 5
+    return max(0, height - content_end)

+ 122 - 0
pdf_converter_v2/utils/pdf_watermark_remover.py

@@ -0,0 +1,122 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+PDF去水印工具
+将PDF转换为图片,去除水印后再转回PDF
+"""
+
+from pathlib import Path
+from typing import Optional
+import tempfile
+import shutil
+
+def remove_watermark_from_pdf(
+    input_pdf: str,
+    output_pdf: str,
+    light_threshold: int = 200,
+    saturation_threshold: int = 30,
+    dpi: int = 200
+) -> bool:
+    """
+    Produce a watermark-free copy of a PDF by rasterising it.
+
+    Workflow:
+    1. Render every page of the PDF to an image.
+    2. Run HSV watermark removal on each page image.
+    3. Assemble the processed images into a new, image-only PDF.
+
+    Args:
+        input_pdf: Path of the input PDF.
+        output_pdf: Path of the PDF to write.
+        light_threshold: Brightness threshold forwarded to the watermark
+            remover; brighter pixels are watermark candidates.
+        saturation_threshold: Saturation threshold forwarded to the watermark
+            remover; less saturated pixels are watermark candidates.
+        dpi: Rendering resolution, also stored as the resolution of the
+            output PDF.
+
+    Returns:
+        True when the output PDF was written. False when a dependency is
+        missing, the PDF has no pages, or any step raised; errors are
+        printed, never propagated.
+    """
+    try:
+        # Imported lazily so that a missing optional dependency is reported
+        # as a False result instead of breaking module import.
+        from pdf2image import convert_from_path
+        from PIL import Image
+        from utils.image_preprocessor import remove_watermark, check_opencv_available
+
+        if not check_opencv_available():
+            print("⚠ OpenCV 未安装,无法进行去水印处理")
+            return False
+
+        # Scratch directory for the per-page PNG files.
+        temp_dir = tempfile.mkdtemp(prefix="pdf_watermark_")
+        temp_path = Path(temp_dir)
+        # Declared before the inner try so the finally clause can always
+        # close whatever was opened.
+        processed_images = []
+
+        try:
+            print(f"正在将PDF转换为图片(DPI={dpi})...")
+            images = convert_from_path(input_pdf, dpi=dpi)
+            print(f"✓ 转换完成,共 {len(images)} 页")
+
+            for i, image in enumerate(images, 1):
+                print(f"处理第 {i}/{len(images)} 页...", end='\r')
+
+                # The watermark remover works on files, so write the page out.
+                original_path = temp_path / f"page_{i}_original.png"
+                image.save(str(original_path), "PNG")
+
+                nowm_path = temp_path / f"page_{i}_nowm.png"
+                processed_path = remove_watermark(
+                    str(original_path),
+                    output_path=str(nowm_path),
+                    light_threshold=light_threshold,
+                    saturation_threshold=saturation_threshold,
+                    method="hsv"
+                )
+
+                processed_images.append(Image.open(processed_path))
+
+            print("\n✓ 所有页面处理完成")
+
+            print("正在生成PDF...")
+            if not processed_images:
+                print("⚠ 没有处理任何图片")
+                return False
+
+            # Pillow writes a multi-page PDF from the first image plus the
+            # remaining ones passed as append_images.
+            first_image, *other_images = processed_images
+            first_image.save(
+                output_pdf,
+                "PDF",
+                resolution=dpi,
+                save_all=True,
+                append_images=other_images
+            )
+            print(f"✓ PDF生成完成: {output_pdf}")
+            return True
+
+        finally:
+            # Close the page images before deleting their backing files;
+            # open handles can make the directory removal fail.
+            for opened in processed_images:
+                opened.close()
+            try:
+                shutil.rmtree(temp_dir)
+            except Exception as e:
+                print(f"⚠ 清理临时目录失败: {e}")
+
+    except ImportError as e:
+        print(f"⚠ 缺少必要的库: {e}")
+        print("请安装: pip install pdf2image pillow PyPDF2 opencv-python")
+        return False
+    except Exception as e:
+        print(f"⚠ 去水印处理失败: {e}")
+        import traceback
+        traceback.print_exc()
+        return False

+ 15 - 0
start_mineru_in_container.sh

@@ -0,0 +1,15 @@
+#!/usr/bin/env bash
+# Start the MinerU file_parse API inside a Docker container. Containers have
+# no systemd, so this script takes the place of a systemd service.
+# Usage: bash start_mineru_in_container.sh
+#    or: nohup bash start_mineru_in_container.sh &
+# Working directory: /root/work/Clerk2.5 (override with CLERK_ROOT)
+# Listening port:    5282 (override with MINERU_PORT)
+
+set -euo pipefail
+
+CLERK_ROOT="${CLERK_ROOT:-/root/work/Clerk2.5}"
+PORT="${MINERU_PORT:-5282}"
+
+# On NPU / in the container libgomp must be preloaded to avoid a static TLS
+# error. A caller-supplied LD_PRELOAD is used as-is; otherwise the usual
+# dist-packages locations are used, keeping only the files that exist so the
+# loader does not complain about missing objects on every exec. Adjust the
+# paths for your machine if needed.
+if [ -z "${LD_PRELOAD:-}" ]; then
+    preload=""
+    for lib in \
+        /usr/local/lib/python3.10/dist-packages/simsimd.libs/libgomp-a49a47f9.so.1.0.0 \
+        /usr/local/lib/python3.10/dist-packages/scikit_learn.libs/libgomp-d22c30c5.so.1.0.0
+    do
+        if [ -f "$lib" ]; then
+            preload="${preload:+${preload}:}${lib}"
+        fi
+    done
+    if [ -n "$preload" ]; then
+        export LD_PRELOAD="$preload"
+    fi
+else
+    export LD_PRELOAD
+fi
+
+# Put the project root first, keeping any PYTHONPATH the caller already set.
+export PYTHONPATH="${CLERK_ROOT}${PYTHONPATH:+:${PYTHONPATH}}"
+
+cd "$CLERK_ROOT"
+# exec replaces the shell so signals reach uvicorn directly.
+exec python3 -m uvicorn mineru.cli.fast_api:app --host 0.0.0.0 --port "$PORT"