3 weeks ago · d6d3fedfcc
--- a/mineru/cli/fast_api.py
+++ b/mineru/cli/fast_api.py
@@ -16,6 +16,11 @@ from typing import List, Optional
 
															 from loguru import logger
														
 
															 from base64 import b64encode
														
 
															+# NumPy 1.24+ removed np.complex; librosa (via transformers) still uses it in constantq.py
														
 
															+import numpy as _np
														
 
															+if not hasattr(_np, "complex"):
														
 
															+    _np.complex = _np.complex128  # type: ignore[attr-defined]
														
 
															+
														
 
															 from mineru.cli.common import aio_do_parse, read_fn, pdf_suffixes, image_suffixes
														
 
															 from mineru.utils.cli_parser import arg_parse
														
 
															 from mineru.utils.guess_suffix_or_lang import guess_suffix_by_path
														
--- a/mineru/model/mfr/unimernet/unimernet_hf/modeling_unimernet.py
+++ b/mineru/model/mfr/unimernet/unimernet_hf/modeling_unimernet.py
@@ -7,8 +7,11 @@ from ftfy import fix_text
 
															 from loguru import logger
														
 
															 from transformers import AutoConfig, AutoModel, AutoModelForCausalLM, AutoTokenizer, PretrainedConfig, PreTrainedModel
														
 
															-from transformers import VisionEncoderDecoderConfig, VisionEncoderDecoderModel
														
 
															-from transformers.models.vision_encoder_decoder.modeling_vision_encoder_decoder import logger as base_model_logger
														
 
															+from transformers.models.vision_encoder_decoder.configuration_vision_encoder_decoder import VisionEncoderDecoderConfig
														
 
															+from transformers.models.vision_encoder_decoder.modeling_vision_encoder_decoder import (
														
 
															+    VisionEncoderDecoderModel,
														
 
															+    logger as base_model_logger,
														
 
															+)
														
 
															 from .unimer_swin import UnimerSwinConfig, UnimerSwinModel, UnimerSwinImageProcessor
														
 
															 from .unimer_mbart import UnimerMBartConfig, UnimerMBartForCausalLM
														
--- a/mineru/requirements-paddle-npu.txt
+++ b/mineru/requirements-paddle-npu.txt
@@ -0,0 +1,19 @@
 
															+# MinerU API 在 Paddle NPU 容器内运行 /file_parse 所需的最小依赖
														
 
															+# 容器内已安装 paddle/paddlex，此处不重复安装
														
 
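															+# pipeline 依赖链 (transformers→librosa) 中 librosa 使用 np.complex；该别名已在 NumPy 1.24 中移除（并非 2.0），仅靠 numpy<2 无法恢复它，还需配合 mineru/cli/fast_api.py 中补回 np.complex 的兼容代码；此处限制 numpy<2 以避开 NumPy 2.0 的其他不兼容变更（待确认）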
														
 
															+numpy<2
														
 
															+
														
 
															+# Web 与 CLI
														
 
															+fastapi>=0.100.0
														
 
															+uvicorn[standard]>=0.23.0
														
 
															+click>=8.0.0
														
 
															+python-multipart>=0.0.6
														
 
															+
														
 
															+# 日志
														
 
															+loguru>=0.7.0
														
 
															+
														
 
															+# PDF 处理
														
 
															+pypdfium2>=4.0.0
														
 
															+
														
 
															+# transformers 依赖链 (LayoutLMv3 → processing_utils → audio_utils) 需要 soxr
														
 
															+soxr>=0.3.0
														
--- a/pdf_converter_v2/api/main.py
+++ b/pdf_converter_v2/api/main.py
@@ -150,6 +150,16 @@ class ConversionRequest(BaseModel):
 
															     """转换请求模型（v2 精简版）"""
														
 
															     # 新增：强制文档类型（正式全称）
														
 
															     doc_type: Optional[str] = None
														
 
															+    # 新增：去水印参数
														
 
															+    remove_watermark: Optional[bool] = False
														
 
															+    watermark_light_threshold: Optional[int] = 200
														
 
															+    watermark_saturation_threshold: Optional[int] = 30
														
 
															+    crop_header_footer: Optional[bool] = False
														
 
															+    header_ratio: Optional[float] = 0.05
														
 
															+    footer_ratio: Optional[float] = 0.05
														
 
															+    auto_detect_header_footer: Optional[bool] = False
														
 
															+    # 新增：附件页切割参数
														
 
															+    table_only: Optional[bool] = False  # 是否只保留包含表格的附件页（默认False）
														
 
															 class ConversionResponse(BaseModel):
														
@@ -200,6 +210,13 @@ class OCRRequest(BaseModel):
 
															     """OCR识别请求模型"""
														
 
															     image_base64: str  # base64编码的图片数据
														
 
															     image_format: Optional[str] = "png"  # 图片格式：png, jpg, jpeg
														
 
															+    remove_watermark: Optional[bool] = False  # 是否去除水印
														
 
															+    watermark_light_threshold: Optional[int] = 200  # 水印亮度阈值（0-255），高于此值的浅色像素可能是水印
														
 
															+    watermark_saturation_threshold: Optional[int] = 30  # 水印饱和度阈值（0-255），低于此值的低饱和度像素可能是水印
														
 
															+    crop_header_footer: Optional[bool] = False  # 是否裁剪页眉页脚
														
 
															+    header_ratio: Optional[float] = 0.05  # 页眉裁剪比例（0-1），默认5%
														
 
															+    footer_ratio: Optional[float] = 0.05  # 页脚裁剪比例（0-1），默认5%
														
 
															+    auto_detect_header_footer: Optional[bool] = False  # 是否自动检测页眉页脚边界
														
 
															 class OCRResponse(BaseModel):
														
@@ -308,11 +325,108 @@ async def process_conversion_task(
 
															         logger.info(f"[任务 {task_id}] 开始处理: {file_path}")
														
 
															+        # 文件预处理（支持图片和PDF）
														
 
															+        from pathlib import Path as PathLib
														
 
															+        file_suffix = PathLib(file_path).suffix.lower()
														
 
															+        is_image = file_suffix in ['.png', '.jpg', '.jpeg', '.bmp', '.tiff', '.tif']
														
 
															+        is_pdf = file_suffix == '.pdf'
														
 
															+        
														
 
															+        # 图片预处理：去水印或裁剪页眉页脚
														
 
															+        if is_image and (request.remove_watermark or request.crop_header_footer):
														
 
															+            logger.info(f"[任务 {task_id}] 检测到图片文件，开始预处理...")
														
 
															+            preprocessed_path = file_path
														
 
															+            
														
 
															+            # 裁剪页眉页脚
														
 
															+            if request.crop_header_footer:
														
 
															+                try:
														
 
															+                    from ..utils.image_preprocessor import crop_header_footer, check_opencv_available
														
 
															+                    
														
 
															+                    if check_opencv_available():
														
 
															+                        if request.auto_detect_header_footer:
														
 
															+                            logger.info(f"[任务 {task_id}] 开始自动检测并裁剪页眉页脚")
														
 
															+                        else:
														
 
															+                            logger.info(f"[任务 {task_id}] 开始裁剪页眉页脚，顶部比例: {request.header_ratio}, 底部比例: {request.footer_ratio}")
														
 
															+                        
														
 
															+                        # 裁剪后的图片路径
														
 
															+                        cropped_path = str(PathLib(output_dir) / f"preprocessed_cropped{file_suffix}")
														
 
															+                        
														
 
															+                        preprocessed_path = await asyncio.to_thread(
														
 
															+                            crop_header_footer,
														
 
															+                            preprocessed_path,
														
 
															+                            output_path=cropped_path,
														
 
															+                            header_ratio=request.header_ratio or 0.05,
														
 
															+                            footer_ratio=request.footer_ratio or 0.05,
														
 
															+                            auto_detect=request.auto_detect_header_footer or False
														
 
															+                        )
														
 
															+                        logger.info(f"[任务 {task_id}] 裁剪页眉页脚完成: {preprocessed_path}")
														
 
															+                    else:
														
 
															+                        logger.warning(f"[任务 {task_id}] OpenCV 未安装，跳过裁剪页眉页脚")
														
 
															+                except Exception as e:
														
 
															+                    logger.warning(f"[任务 {task_id}] 裁剪页眉页脚失败，使用原图继续: {e}")
														
 
															+            
														
 
															+            # 去水印
														
 
															+            if request.remove_watermark:
														
 
															+                try:
														
 
															+                    from ..utils.image_preprocessor import remove_watermark, check_opencv_available
														
 
															+                    
														
 
															+                    if check_opencv_available():
														
 
															+                        logger.info(f"[任务 {task_id}] 开始去水印处理，亮度阈值: {request.watermark_light_threshold}, 饱和度阈值: {request.watermark_saturation_threshold}")
														
 
															+                        
														
 
															+                        # 去水印后的图片路径
														
 
															+                        nowm_path = str(PathLib(output_dir) / f"preprocessed_nowm{file_suffix}")
														
 
															+                        
														
 
															+                        preprocessed_path = await asyncio.to_thread(
														
 
															+                            remove_watermark,
														
 
															+                            preprocessed_path,
														
 
															+                            output_path=nowm_path,
														
 
															+                            light_threshold=request.watermark_light_threshold or 200,
														
 
															+                            saturation_threshold=request.watermark_saturation_threshold or 30,
														
 
															+                            method="hsv"
														
 
															+                        )
														
 
															+                        logger.info(f"[任务 {task_id}] 去水印完成: {preprocessed_path}")
														
 
															+                    else:
														
 
															+                        logger.warning(f"[任务 {task_id}] OpenCV 未安装，跳过去水印处理")
														
 
															+                except Exception as e:
														
 
															+                    logger.warning(f"[任务 {task_id}] 去水印处理失败，使用原图继续: {e}")
														
 
															+            
														
 
															+            # 更新文件路径为预处理后的路径
														
 
															+            if preprocessed_path != file_path:
														
 
															+                file_path = preprocessed_path
														
 
															+                logger.info(f"[任务 {task_id}] 图片预处理完成，使用预处理后的文件: {file_path}")
														
 
															+        
														
 
															+        # PDF预处理：去水印
														
 
															+        elif is_pdf and request.remove_watermark:
														
 
															+            logger.info(f"[任务 {task_id}] 检测到PDF文件，开始去水印预处理...")
														
 
															+            try:
														
 
															+                from ..utils.pdf_watermark_remover import remove_watermark_from_pdf
														
 
															+                
														
 
															+                # 去水印后的PDF路径
														
 
															+                nowm_pdf_path = str(PathLib(output_dir) / f"preprocessed_nowm.pdf")
														
 
															+                
														
 
															+                # 执行去水印
														
 
															+                logger.info(f"[任务 {task_id}] 开始PDF去水印处理，亮度阈值: {request.watermark_light_threshold}, 饱和度阈值: {request.watermark_saturation_threshold}")
														
 
															+                success = await asyncio.to_thread(
														
 
															+                    remove_watermark_from_pdf,
														
 
															+                    input_pdf=file_path,
														
 
															+                    output_pdf=nowm_pdf_path,
														
 
															+                    light_threshold=request.watermark_light_threshold or 200,
														
 
															+                    saturation_threshold=request.watermark_saturation_threshold or 30,
														
 
															+                    dpi=200  # PDF转图片的DPI
														
 
															+                )
														
 
															+                
														
 
															+                if success and PathLib(nowm_pdf_path).exists():
														
 
															+                    file_path = nowm_pdf_path
														
 
															+                    logger.info(f"[任务 {task_id}] PDF去水印完成: {file_path}")
														
 
															+                else:
														
 
															+                    logger.warning(f"[任务 {task_id}] PDF去水印失败，使用原PDF继续")
														
 
															+            except Exception as e:
														
 
															+                logger.warning(f"[任务 {task_id}] PDF去水印处理失败，使用原PDF继续: {e}")
														
 
															+        
														
 
															         result = None
														
 
															         tables_info = None
														
 
															         # 针对投资估算类型，需要先切割附件页
														
 
															-        if request.doc_type in ("fsApproval", "fsReview", "pdApproval"):
														
 
															+        if request.doc_type in ("fsApproval", "fsReview", "pdApproval", "safetyFsApproval"):
														
 
															             logger.info(f"[任务 {task_id}] 文档类型 {request.doc_type}，需要先切割附件页")
														
 
															             # 导入附件页切割函数
														
@@ -328,18 +442,21 @@ async def process_conversion_task(
 
															                 attachment_dir = PathLib(output_dir) / "attachments"
														
 
															                 attachment_dir.mkdir(parents=True, exist_ok=True)
														
 
															-                # 切割附件页
														
 
															-                logger.info(f"[任务 {task_id}] 开始切割附件页，输出目录: {attachment_dir}")
														
 
															+                # 切割附件页（根据 table_only 参数决定是否过滤非表格内容）
														
 
															+                logger.info(f"[任务 {task_id}] 开始切割附件页（table_only={request.table_only}），输出目录: {attachment_dir}")
														
 
															                 await asyncio.to_thread(
														
 
															                     split_attachment_pages,
														
 
															                     file_path,
														
 
															                     attachment_dir,
														
 
															                     use_ocr=True,
														
 
															-                    debug=False
														
 
															+                    debug=False,
														
 
															+                    table_only=request.table_only  # 是否只保留包含表格的附件页
														
 
															                 )
														
 
															-                # 查找切割后的附件页PDF
														
 
															-                attachment_pdfs = list(attachment_dir.glob("*_附件页_*.pdf"))
														
 
															+                # 查找切割后的附件页PDF（优先使用表格附件页，其次使用普通附件页）
														
 
															+                attachment_pdfs = list(attachment_dir.glob("*_表格附件页_*.pdf"))
														
 
															+                if not attachment_pdfs:
														
 
															+                    attachment_pdfs = list(attachment_dir.glob("*_附件页_*.pdf"))
														
 
															                 logger.info(f"[任务 {task_id}] 附件页目录内容: {list(attachment_dir.iterdir()) if attachment_dir.exists() else '(目录不存在)'}")
														
 
															                 if attachment_pdfs:
														
@@ -532,11 +649,44 @@ async def process_conversion_task(
 
															 @app.post("/convert", response_model=ConversionResponse)
														
 
															 async def convert_file(
														
 
															     file: Annotated[UploadFile, File(description="上传的PDF或图片文件")],
														
 
															-    # 新增：类型参数（英文传参） noiseRec | emRec | opStatus | settlementReport | designReview | fsApproval | fsReview | pdApproval | finalAccount
														
 
															+    # 新增：类型参数（英文传参） noiseRec | emRec | opStatus | settlementReport | designReview | fsApproval | fsReview | pdApproval | finalAccount | safetyFsApproval
														
 
															     type: Annotated[
														
 
															-        Optional[Literal["noiseRec", "emRec", "opStatus", "settlementReport", "designReview", "fsApproval", "fsReview", "pdApproval", "finalAccount"]],
														
 
															-        Form(description="文档类型：noiseRec | emRec | opStatus | settlementReport | designReview | fsApproval | fsReview | pdApproval | finalAccount")
														
 
															+        Optional[Literal["noiseRec", "emRec", "opStatus", "settlementReport", "designReview", "fsApproval", "fsReview", "pdApproval", "finalAccount", "safetyFsApproval"]],
														
 
															+        Form(description="文档类型：noiseRec | emRec | opStatus | settlementReport | designReview | fsApproval | fsReview | pdApproval | finalAccount | safetyFsApproval")
														
 
															     ] = None,
														
 
															+    # 新增：去水印参数
														
 
															+    remove_watermark: Annotated[
														
 
															+        Optional[bool],
														
 
															+        Form(description="是否去除水印，默认为false")
														
 
															+    ] = False,
														
 
															+    watermark_light_threshold: Annotated[
														
 
															+        Optional[int],
														
 
															+        Form(description="水印亮度阈值（0-255），默认200，高于此值的浅色像素可能是水印")
														
 
															+    ] = 200,
														
 
															+    watermark_saturation_threshold: Annotated[
														
 
															+        Optional[int],
														
 
															+        Form(description="水印饱和度阈值（0-255），默认30，低于此值的低饱和度像素可能是水印")
														
 
															+    ] = 30,
														
 
															+    crop_header_footer: Annotated[
														
 
															+        Optional[bool],
														
 
															+        Form(description="是否裁剪页眉页脚，默认为false")
														
 
															+    ] = False,
														
 
															+    header_ratio: Annotated[
														
 
															+        Optional[float],
														
 
															+        Form(description="页眉裁剪比例（0-1），默认0.05表示裁剪顶部5%")
														
 
															+    ] = 0.05,
														
 
															+    footer_ratio: Annotated[
														
 
															+        Optional[float],
														
 
															+        Form(description="页脚裁剪比例（0-1），默认0.05表示裁剪底部5%")
														
 
															+    ] = 0.05,
														
 
															+    auto_detect_header_footer: Annotated[
														
 
															+        Optional[bool],
														
 
															+        Form(description="是否自动检测页眉页脚边界，默认为false（启用后忽略header_ratio和footer_ratio）")
														
 
															+    ] = False,
														
 
															+    table_only: Annotated[
														
 
															+        Optional[bool],
														
 
															+        Form(description="是否只保留包含表格的附件页，默认为false")
														
 
															+    ] = False,
														
 
															 ):
														
 
															     """
														
 
															     转换PDF/图片文件（异步处理）
														
@@ -557,6 +707,16 @@ async def convert_file(
 
															       * fsApproval - 可研批复投资估算
														
 
															       * fsReview - 可研评审投资估算
														
 
															       * pdApproval - 初设批复概算投资
														
 
															+      * finalAccount - 决算报告
														
 
															+      * safetyFsApproval - 安评可研批复投资估算
														
 
															+    - **remove_watermark**: 是否去除水印（对图片和PDF均有效），默认为false
														
 
															+    - **watermark_light_threshold**: 水印亮度阈值（0-255），默认200
														
 
															+    - **watermark_saturation_threshold**: 水印饱和度阈值（0-255），默认30
														
 
															+    - **crop_header_footer**: 是否裁剪页眉页脚（仅对图片有效），默认为false
														
 
															+    - **header_ratio**: 页眉裁剪比例（0-1），默认0.05
														
 
															+    - **footer_ratio**: 页脚裁剪比例（0-1），默认0.05
														
 
															+    - **auto_detect_header_footer**: 是否自动检测页眉页脚边界，默认为false
														
 
															+    - **table_only**: 是否只保留包含表格的附件页，默认为false
														
 
															     注意：v2 版本内部使用外部API进行转换，v2特有的配置参数（如API URL、backend等）
														
 
															     通过环境变量或配置文件设置，不通过API参数传入。
														
@@ -677,6 +837,8 @@ async def convert_file(
 
															         "pdApproval": "pdApproval",
														
 
															         # 决算报告
														
 
															         "finalAccount": "finalAccount",
														
 
															+        # 安评类
														
 
															+        "safetyFsApproval": "safetyFsApproval",
														
 
															     }
														
 
															     doc_type = None
														
 
															     if type:
														
@@ -692,6 +854,14 @@ async def convert_file(
 
															     # 创建请求对象（v2 精简）
														
 
															     request = ConversionRequest(
														
 
															         doc_type=doc_type,
														
 
															+        remove_watermark=remove_watermark,
														
 
															+        watermark_light_threshold=watermark_light_threshold,
														
 
															+        watermark_saturation_threshold=watermark_saturation_threshold,
														
 
															+        crop_header_footer=crop_header_footer,
														
 
															+        header_ratio=header_ratio,
														
 
															+        footer_ratio=footer_ratio,
														
 
															+        auto_detect_header_footer=auto_detect_header_footer,
														
 
															+        table_only=table_only,
														
 
															     )
														
 
															     # 使用 asyncio.create_task 创建后台任务，确保立即返回
														
@@ -875,6 +1045,13 @@ async def ocr_image(request: OCRRequest):
 
															     - **image_base64**: base64编码的图片数据（可以包含data:image/xxx;base64,前缀）
														
 
															     - **image_format**: 图片格式（png, jpg, jpeg），默认为png
														
 
															+    - **remove_watermark**: 是否去除水印，默认为false
														
 
															+    - **watermark_light_threshold**: 水印亮度阈值（0-255），默认200，高于此值的浅色像素可能是水印
														
 
															+    - **watermark_saturation_threshold**: 水印饱和度阈值（0-255），默认30，低于此值的低饱和度像素可能是水印
														
 
															+    - **crop_header_footer**: 是否裁剪页眉页脚，默认为false
														
 
															+    - **header_ratio**: 页眉裁剪比例（0-1），默认0.05表示裁剪顶部5%
														
 
															+    - **footer_ratio**: 页脚裁剪比例（0-1），默认0.05表示裁剪底部5%
														
 
															+    - **auto_detect_header_footer**: 是否自动检测页眉页脚边界，默认为false（启用后忽略header_ratio和footer_ratio）
														
 
															     返回识别出的文本列表和GPU监控信息
														
 
															     """
														
@@ -934,6 +1111,57 @@ async def ocr_image(request: OCRRequest):
 
															             f.write(image_bytes)
														
 
															         logger.info(f"[OCR] 图片已保存: {image_path}")
														
 
															+        # 如果需要裁剪页眉页脚，先进行裁剪
														
 
															+        if request.crop_header_footer:
														
 
															+            try:
														
 
															+                from ..utils.image_preprocessor import crop_header_footer, check_opencv_available
														
 
															+                
														
 
															+                if check_opencv_available():
														
 
															+                    if request.auto_detect_header_footer:
														
 
															+                        logger.info("[OCR] 开始自动检测并裁剪页眉页脚")
														
 
															+                    else:
														
 
															+                        logger.info(f"[OCR] 开始裁剪页眉页脚，顶部比例: {request.header_ratio}, 底部比例: {request.footer_ratio}")
														
 
															+                    
														
 
															+                    # 裁剪后的图片路径
														
 
															+                    cropped_image_path = os.path.join(temp_dir, f"ocr_image_cropped{ext}")
														
 
															+                    
														
 
															+                    image_path = crop_header_footer(
														
 
															+                        image_path,
														
 
															+                        output_path=cropped_image_path,
														
 
															+                        header_ratio=request.header_ratio or 0.05,
														
 
															+                        footer_ratio=request.footer_ratio or 0.05,
														
 
															+                        auto_detect=request.auto_detect_header_footer or False
														
 
															+                    )
														
 
															+                    logger.info(f"[OCR] 裁剪页眉页脚完成: {image_path}")
														
 
															+                else:
														
 
															+                    logger.warning("[OCR] OpenCV 未安装，跳过裁剪页眉页脚")
														
 
															+            except Exception as e:
														
 
															+                logger.warning(f"[OCR] 裁剪页眉页脚失败，使用原图继续: {e}")
														
 
															+        
														
 
															+        # 如果需要去水印，进行预处理
														
 
															+        if request.remove_watermark:
														
 
															+            try:
														
 
															+                from ..utils.image_preprocessor import remove_watermark, check_opencv_available
														
 
															+                
														
 
															+                if check_opencv_available():
														
 
															+                    logger.info(f"[OCR] 开始去水印处理，亮度阈值: {request.watermark_light_threshold}, 饱和度阈值: {request.watermark_saturation_threshold}")
														
 
															+                    
														
 
															+                    # 去水印后的图片路径
														
 
															+                    nowm_image_path = os.path.join(temp_dir, f"ocr_image_nowm{ext}")
														
 
															+                    
														
 
															+                    image_path = remove_watermark(
														
 
															+                        image_path,
														
 
															+                        output_path=nowm_image_path,
														
 
															+                        light_threshold=request.watermark_light_threshold or 200,
														
 
															+                        saturation_threshold=request.watermark_saturation_threshold or 30,
														
 
															+                        method="hsv"
														
 
															+                    )
														
 
															+                    logger.info(f"[OCR] 去水印完成: {image_path}")
														
 
															+                else:
														
 
															+                    logger.warning("[OCR] OpenCV 未安装，跳过去水印处理")
														
 
															+            except Exception as e:
														
 
															+                logger.warning(f"[OCR] 去水印处理失败，使用原图继续: {e}")
														
 
															+        
														
 
															         # 调用PaddleOCR进行识别（监控线程在此期间持续采集数据）
														
 
															         from ..utils.paddleocr_fallback import call_paddleocr_ocr
														
--- a/pdf_converter_v2/models/data_models.py
+++ b/pdf_converter_v2/models/data_models.py
@@ -283,9 +283,18 @@ class FeasibilityApprovalInvestment:
 
															     - Level 0: 顶层大类（如"山西晋城周村220千伏输变电工程"）
														
 
															     - Level 1: 二级分类（如"变电工程"、"线路工程"），有自己的 items
														
 
															     - Level 2: 具体项目（如"周村220千伏变电站新建工程"）
														
 
															+    
														
 
															+    项目信息（可选，用于 safetyFsApproval 类型）：
														
 
															+    - projectName: 工程(项目)名称
														
 
															+    - projectUnit: 项目单位
														
 
															+    - designUnit: 设计单位
														
 
															     """
														
 
															     def __init__(self):
														
 
															         self.items: List[InvestmentItem] = []
														
 
															+        # 项目基本信息（safetyFsApproval 专用）
														
 
															+        self.projectName: Optional[str] = None
														
 
															+        self.projectUnit: Optional[str] = None
														
 
															+        self.designUnit: Optional[str] = None
														
 
															     def to_dict(self):
														
 
															         """转换为嵌套结构，与 designReview 保持一致
														
@@ -294,14 +303,38 @@ class FeasibilityApprovalInvestment:
 
															         Level="2" 的项目作为二级分类（Level: 1），有自己的 items
														
 
															         Level="3" 的项目作为具体项目（Level: 2），放入二级分类的 items
														
 
															         Level="0" 的项目（合计）跳过
														
 
															+        
														
 
															+        特殊处理：如果表格没有 Level=1 的顶层大类（如湖北省格式），
														
 
															+        自动创建一个虚拟顶层大类来包含所有 Level=2 的项目
														
 
															         """
														
 
															         if not self.items:
														
 
															             return []
														
 
															+        # 检查是否有 Level=1 的顶层大类
														
 
															+        has_level_1 = any(item.level == "1" for item in self.items)
														
 
															+        
														
 
															         result = []
														
 
															         current_top_category = None  # Level 0 顶层大类
														
 
															         current_sub_category = None  # Level 1 二级分类
														
 
															+        # 如果没有 Level=1 的顶层大类，创建一个虚拟的
														
 
															+        if not has_level_1:
														
 
															+            current_top_category = {
														
 
															+                "name": "项目总表",
														
 
															+                "Level": 0,
														
 
															+                "constructionScaleSubstation": "",
														
 
															+                "constructionScaleBay": "",
														
 
															+                "constructionScaleOverheadLine": "",
														
 
															+                "constructionScaleOpticalCable": "",
														
 
															+                "staticInvestment": "",
														
 
															+                "dynamicInvestment": "",
														
 
															+                "constructionProjectCost": "",
														
 
															+                "equipmentPurchaseCost": "",
														
 
															+                "installationProjectCost": "",
														
 
															+                "otherExpenses": "",
														
 
															+                "items": []
														
 
															+            }
														
 
															+        
														
 
															         for item in self.items:
														
 
															             if item.level == "1":
														
 
															                 # 顶层大类（如"山西晋城周村220千伏输变电工程"）
														
@@ -381,7 +414,18 @@ class FeasibilityApprovalInvestment:
 
															         if current_top_category is not None:
														
 
															             result.append(current_top_category)
														
 
															-        return result
														
 
															+        # 如果有项目信息，返回包含项目信息的字典；否则直接返回数据列表
														
 
															+        if self.projectName or self.projectUnit or self.designUnit:
														
 
															+            return {
														
 
															+                "projectInfo": {
														
 
															+                    "projectName": self.projectName or "",
														
 
															+                    "projectUnit": self.projectUnit or "",
														
 
															+                    "designUnit": self.designUnit or ""
														
 
															+                },
														
 
															+                "data": result
														
 
															+            }
														
 
															+        else:
														
 
															+            return result
														
 
															     @staticmethod
														
 
															     def _parse_number(value: str) -> str:
														
--- a/pdf_converter_v2/parser/investment_parser.py
+++ b/pdf_converter_v2/parser/investment_parser.py
@@ -224,8 +224,11 @@ def parse_feasibility_approval_investment(markdown_content: str) -> FeasibilityA
 
															             table_text += " ".join([str(cell) for cell in row])
														
 
															         # 移除空格后再匹配
														
 
															         table_text_no_space = table_text.replace(" ", "")
														
 
															-        # 选择包含"工程或费用名称"和"静态投资"的表格
														
 
															-        if "工程或费用名称" in table_text_no_space and "静态投资" in table_text_no_space:
														
 
															+        # 选择包含"工程或费用名称"或"项目名称"，且包含"静态投资"或"静态合计"的表格
														
 
															+        has_name_col = ("工程或费用名称" in table_text_no_space or "项目名称" in table_text_no_space)
														
 
															+        has_investment_col = ("静态投资" in table_text_no_space or "静态合计" in table_text_no_space)
														
 
															+        
														
 
															+        if has_name_col and has_investment_col:
														
 
															             all_matching_tables.append((table_idx, table))
														
 
															             logger.info(f"[可研批复投资] 找到投资估算表格 (表格{table_idx+1}), 行数: {len(table)}")
														
@@ -295,7 +298,7 @@ def parse_feasibility_approval_investment(markdown_content: str) -> FeasibilityA
 
															             if "序号" in cell_text and no_idx == -1:
														
 
															                 no_idx = col_idx
														
 
															-            elif ("工程或费用名称" in cell_text_no_space) and name_idx == -1:
														
 
															+            elif ("工程或费用名称" in cell_text_no_space or "项目名称" in cell_text_no_space) and name_idx == -1:
														
 
															                 name_idx = col_idx
														
 
															             elif "架空线" in cell_text_no_space and overhead_line_idx == -1:
														
 
															                 overhead_line_idx = col_idx
														
@@ -305,9 +308,9 @@ def parse_feasibility_approval_investment(markdown_content: str) -> FeasibilityA
 
															                 substation_idx = col_idx
														
 
															             elif "光缆" in cell_text and optical_cable_idx == -1:
														
 
															                 optical_cable_idx = col_idx
														
 
															-            elif "静态投资" in cell_text_no_space and static_investment_idx == -1:
														
 
															+            elif ("静态投资" in cell_text_no_space or "静态合计" in cell_text_no_space) and static_investment_idx == -1:
														
 
															                 static_investment_idx = col_idx
														
 
															-            elif "动态投资" in cell_text_no_space and dynamic_investment_idx == -1:
														
 
															+            elif ("动态投资" in cell_text_no_space or "动态合计" in cell_text_no_space) and dynamic_investment_idx == -1:
														
 
															                 dynamic_investment_idx = col_idx
														
 
															             # 新增费用字段识别
														
 
															             elif "建筑工程费" in cell_text_no_space and construction_project_cost_idx == -1:
														
@@ -322,8 +325,8 @@ def parse_feasibility_approval_investment(markdown_content: str) -> FeasibilityA
 
															                 if "其他费用" in cell_text_no_space:
														
 
															                     other_expenses_idx = col_idx
														
 
															-        # 如果这一行包含"序号"或"工程或费用名称"，记录为表头结束行
														
 
															-        if ("序号" in row_text or "工程或费用名称" in row_text_no_space) and header_row_idx == -1:
														
 
															+        # 如果这一行包含"序号"或"工程或费用名称"或"项目名称"，记录为表头结束行
														
 
															+        if ("序号" in row_text or "工程或费用名称" in row_text_no_space or "项目名称" in row_text_no_space) and header_row_idx == -1:
														
 
															             header_row_idx = row_idx
														
 
															     # 表头结束行应该是最后一个包含表头内容的行
														
@@ -417,6 +420,253 @@ def parse_feasibility_approval_investment(markdown_content: str) -> FeasibilityA
 
															     return record
														
 
															+def parse_safety_feasibility_approval_investment(markdown_content: str) -> FeasibilityApprovalInvestment:
														
 
															+    """
														
 
															+    解析安全可研批复投资估算（湖北省格式）
														
 
															+    
														
 
															+    特点：
														
 
															+    - 没有顶层大类（Level=1），直接从二级分类开始
														
 
															+    - 中文序号（一、二）表示二级分类（如"变电工程"、"线路工程"）
														
 
															+    - 阿拉伯数字（1、2、3）表示具体项目
														
 
															+    - 列名使用"项目名称"和"静态合计/动态合计"
														
 
															+    
														
 
															+    返回结构：
														
 
															+    - Level 1: 二级分类（如"变电工程"、"线路工程"）
														
 
															+    - Level 2: 具体项目（如"襄阳连云220千伏变电站新建工程"）
														
 
															+    """
														
 
															+    record = FeasibilityApprovalInvestment()
														
 
															+    
														
 
															+    tables = extract_table_with_rowspan_colspan(markdown_content)
														
 
															+    
														
 
															+    if not tables:
														
 
															+        logger.warning("[安全可研批复投资] 未能提取出任何表格内容")
														
 
															+        return record
														
 
															+    
														
 
															+    # 首先尝试提取项目基本信息表格
														
 
															+    for table_idx, table in enumerate(tables):
														
 
															+        if len(table) < 2:
														
 
															+            continue
														
 
															+        
														
 
															+        table_text = ""
														
 
															+        for row in table:
														
 
															+            table_text += " ".join([str(cell) for cell in row])
														
 
															+        table_text_no_space = table_text.replace(" ", "").replace("(", "（").replace(")", "）")
														
 
															+        
														
 
															+        # 查找包含"工程(项目)名称"的表格
														
 
															+        if "工程（项目）名称" in table_text_no_space or "工程项目名称" in table_text_no_space:
														
 
															+            logger.info(f"[安全可研批复投资] 找到项目信息表格 (表格{table_idx+1})")
														
 
															+            
														
 
															+            # 提取项目信息
														
 
															+            for row in table:
														
 
															+                if len(row) >= 2:
														
 
															+                    key = str(row[0]).strip()
														
 
															+                    value = str(row[1]).strip() if len(row) > 1 else ""
														
 
															+                    
														
 
															+                    if "工程" in key and "名称" in key:
														
 
															+                        record.projectName = value
														
 
															+                        logger.info(f"[安全可研批复投资] 提取工程名称: {value}")
														
 
															+                    elif "项目单位" in key:
														
 
															+                        record.projectUnit = value
														
 
															+                        logger.info(f"[安全可研批复投资] 提取项目单位: {value}")
														
 
															+                    elif "设计单位" in key:
														
 
															+                        record.designUnit = value
														
 
															+                        logger.info(f"[安全可研批复投资] 提取设计单位: {value}")
														
 
															+            break
														
 
															+    
														
 
															+    # 找到所有投资估算表格并合并
														
 
															+    all_matching_tables = []
														
 
															+    for table_idx, table in enumerate(tables):
														
 
															+        table_text = ""
														
 
															+        for row in table:
														
 
															+            table_text += " ".join([str(cell) for cell in row])
														
 
															+        table_text_no_space = table_text.replace(" ", "")
														
 
															+        
														
 
															+        # 选择包含"项目名称"且包含"静态合计"或"静态投资"的表格
														
 
															+        has_name_col = "项目名称" in table_text_no_space
														
 
															+        has_investment_col = ("静态合计" in table_text_no_space or "静态投资" in table_text_no_space)
														
 
															+        
														
 
															+        if has_name_col and has_investment_col:
														
 
															+            all_matching_tables.append((table_idx, table))
														
 
															+            logger.info(f"[安全可研批复投资] 找到投资估算表格 (表格{table_idx+1}), 行数: {len(table)}")
														
 
															+    
														
 
															+    if not all_matching_tables:
														
 
															+        logger.warning("[安全可研批复投资] 未找到包含投资估算的表格")
														
 
															+        return record
														
 
															+    
														
 
															+    # 如果只有一个表格，直接使用
														
 
															+    if len(all_matching_tables) == 1:
														
 
															+        target_table = all_matching_tables[0][1]
														
 
															+    else:
														
 
															+        # 多个表格：合并所有表格的数据行
														
 
															+        logger.info(f"[安全可研批复投资] 发现 {len(all_matching_tables)} 个投资估算表格，将进行合并")
														
 
															+        target_table = []
														
 
															+        first_table = True
														
 
															+        for table_idx, table in all_matching_tables:
														
 
															+            if first_table:
														
 
															+                target_table.extend(table)
														
 
															+                first_table = False
														
 
															+            else:
														
 
															+                # 跳过表头行
														
 
															+                header_end_idx = 0
														
 
															+                for row_idx, row in enumerate(table):
														
 
															+                    row_text = " ".join([str(cell) for cell in row]).replace(" ", "")
														
 
															+                    if "序号" in row_text or "项目名称" in row_text or "建设规模" in row_text:
														
 
															+                        header_end_idx = row_idx + 1
														
 
															+                    elif len(row) > 0:
														
 
															+                        first_cell = str(row[0]).strip()
														
 
															+                        if first_cell in ["一", "二", "三", "四", "五", "六", "七", "八", "九", "十"]:
														
 
															+                            break
														
 
															+                target_table.extend(table[header_end_idx:])
														
 
															+                logger.debug(f"[安全可研批复投资] 表格{table_idx+1}: 跳过前{header_end_idx}行表头，添加{len(table)-header_end_idx}行数据")
														
 
															+        
														
 
															+        logger.info(f"[安全可研批复投资] 合并后总行数: {len(target_table)}")
														
 
															+    
														
 
															+    # 识别表头行和列索引
														
 
															+    header_row_idx = -1
														
 
															+    no_idx = -1
														
 
															+    name_idx = -1
														
 
															+    overhead_line_idx = -1
														
 
															+    bay_idx = -1
														
 
															+    substation_idx = -1
														
 
															+    optical_cable_idx = -1
														
 
															+    static_investment_idx = -1
														
 
															+    dynamic_investment_idx = -1
														
 
															+    construction_project_cost_idx = -1
														
 
															+    equipment_purchase_cost_idx = -1
														
 
															+    installation_project_cost_idx = -1
														
 
															+    other_expenses_idx = -1
														
 
															+    
														
 
															+    # 扫描前几行识别列索引
														
 
															+    for row_idx in range(min(5, len(target_table))):
														
 
															+        row = target_table[row_idx]
														
 
															+        row_text = " ".join([str(cell) for cell in row])
														
 
															+        row_text_no_space = row_text.replace(" ", "")
														
 
															+        
														
 
															+        for col_idx, cell in enumerate(row):
														
 
															+            cell_text = str(cell).strip()
														
 
															+            cell_text_no_space = cell_text.replace(" ", "")
														
 
															+            
														
 
															+            if "序号" in cell_text and no_idx == -1:
														
 
															+                no_idx = col_idx
														
 
															+            elif "项目名称" in cell_text_no_space and name_idx == -1:
														
 
															+                name_idx = col_idx
														
 
															+            elif "架空线" in cell_text_no_space and overhead_line_idx == -1:
														
 
															+                overhead_line_idx = col_idx
														
 
															+            elif "间隔" in cell_text and bay_idx == -1:
														
 
															+                bay_idx = col_idx
														
 
															+            elif "变电" in cell_text and substation_idx == -1:
														
 
															+                substation_idx = col_idx
														
 
															+            elif "光缆" in cell_text and optical_cable_idx == -1:
														
 
															+                optical_cable_idx = col_idx
														
 
															+            elif ("静态投资" in cell_text_no_space or "静态合计" in cell_text_no_space) and static_investment_idx == -1:
														
 
															+                static_investment_idx = col_idx
														
 
															+            elif ("动态投资" in cell_text_no_space or "动态合计" in cell_text_no_space) and dynamic_investment_idx == -1:
														
 
															+                dynamic_investment_idx = col_idx
														
 
															+            elif "建筑工程费" in cell_text_no_space and construction_project_cost_idx == -1:
														
 
															+                construction_project_cost_idx = col_idx
														
 
															+            elif "设备购置费" in cell_text_no_space and equipment_purchase_cost_idx == -1:
														
 
															+                equipment_purchase_cost_idx = col_idx
														
 
															+            elif "安装工程费" in cell_text_no_space and installation_project_cost_idx == -1:
														
 
															+                installation_project_cost_idx = col_idx
														
 
															+            elif ("其他费用" in cell_text_no_space or "合计" == cell_text_no_space) and other_expenses_idx == -1:
														
 
															+                if "其他费用" in cell_text_no_space:
														
 
															+                    other_expenses_idx = col_idx
														
 
															+        
														
 
															+        if ("序号" in row_text or "项目名称" in row_text_no_space) and header_row_idx == -1:
														
 
															+            header_row_idx = row_idx
														
 
															+    
														
 
															+    # 找到第一个数据行
														
 
															+    for row_idx in range(min(5, len(target_table))):
														
 
															+        row = target_table[row_idx]
														
 
															+        if len(row) > 0:
														
 
															+            first_cell = str(row[0]).strip()
														
 
															+            if first_cell and first_cell not in ["序号", ""] and (first_cell in ["一", "二", "三", "四", "五"] or first_cell.isdigit()):
														
 
															+                header_row_idx = row_idx - 1
														
 
															+                logger.debug(f"[安全可研批复投资] 根据数据行确定表头结束于第{header_row_idx}行")
														
 
															+                break
														
 
															+    
														
 
															+    logger.info(f"[安全可研批复投资] 表头行: {header_row_idx}")
														
 
															+    logger.info(f"[安全可研批复投资] 列索引: 序号={no_idx}, 名称={name_idx}, "
														
 
															+               f"架空线={overhead_line_idx}, 间隔={bay_idx}, 变电={substation_idx}, "
														
 
															+               f"光缆={optical_cable_idx}, 静态投资={static_investment_idx}, 动态投资={dynamic_investment_idx}")
														
 
															+    
														
 
															+    if header_row_idx == -1:
														
 
															+        logger.warning("[安全可研批复投资] 未找到表头行")
														
 
															+        return record
														
 
															+    
														
 
															+    # 解析数据行
														
 
															+    for row_idx in range(header_row_idx + 1, len(target_table)):
														
 
															+        row = target_table[row_idx]
														
 
															+        
														
 
															+        if len(row) < 3:
														
 
															+            continue
														
 
															+        
														
 
															+        # 检查是否是有效数据行
														
 
															+        if name_idx >= 0 and name_idx < len(row):
														
 
															+            name = str(row[name_idx]).strip()
														
 
															+            if not name or name in ["", "nan", "None"]:
														
 
															+                continue
														
 
															+            
														
 
															+            # 提取序号
														
 
															+            no = ""
														
 
															+            if no_idx >= 0 and no_idx < len(row):
														
 
															+                no = str(row[no_idx]).strip()
														
 
															+            
														
 
															+            # 判断等级 - 使用非严格模式，让中文数字直接返回 Level 1
														
 
															+            level_input = (no + name) if no else name
														
 
															+            level = determine_level(level_input, name, strict_mode=False)
														
 
															+            
														
 
															+            # 对于阿拉伯数字序号，如果当前是 Level 2，且不是具体项目名称，则判定为 Level 2
														
 
															+            # 这样"1、襄阳连云..."会是 Level 2
														
 
															+            if level == "2" and no.isdigit():
														
 
															+                # 阿拉伯数字序号，是具体项目，保持 Level 2
														
 
															+                pass
														
 
															+            
														
 
															+            item = InvestmentItem()
														
 
															+            item.no = no
														
 
															+            item.name = name
														
 
															+            item.level = level
														
 
															+            
														
 
															+            # 提取建设规模
														
 
															+            if overhead_line_idx >= 0 and overhead_line_idx < len(row):
														
 
															+                item.constructionScaleOverheadLine = str(row[overhead_line_idx]).strip()
														
 
															+            
														
 
															+            if bay_idx >= 0 and bay_idx < len(row):
														
 
															+                item.constructionScaleBay = str(row[bay_idx]).strip()
														
 
															+            
														
 
															+            if substation_idx >= 0 and substation_idx < len(row):
														
 
															+                item.constructionScaleSubstation = str(row[substation_idx]).strip()
														
 
															+            
														
 
															+            if optical_cable_idx >= 0 and optical_cable_idx < len(row):
														
 
															+                item.constructionScaleOpticalCable = str(row[optical_cable_idx]).strip()
														
 
															+            
														
 
															+            # 提取投资金额
														
 
															+            if static_investment_idx >= 0 and static_investment_idx < len(row):
														
 
															+                item.staticInvestment = str(row[static_investment_idx]).strip()
														
 
															+            
														
 
															+            if dynamic_investment_idx >= 0 and dynamic_investment_idx < len(row):
														
 
															+                item.dynamicInvestment = str(row[dynamic_investment_idx]).strip()
														
 
															+            
														
 
															+            # 提取费用
														
 
															+            if construction_project_cost_idx >= 0 and construction_project_cost_idx < len(row):
														
 
															+                item.constructionProjectCost = str(row[construction_project_cost_idx]).strip()
														
 
															+            
														
 
															+            if equipment_purchase_cost_idx >= 0 and equipment_purchase_cost_idx < len(row):
														
 
															+                item.equipmentPurchaseCost = str(row[equipment_purchase_cost_idx]).strip()
														
 
															+            
														
 
															+            if installation_project_cost_idx >= 0 and installation_project_cost_idx < len(row):
														
 
															+                item.installationProjectCost = str(row[installation_project_cost_idx]).strip()
														
 
															+            
														
 
															+            if other_expenses_idx >= 0 and other_expenses_idx < len(row):
														
 
															+                item.otherExpenses = str(row[other_expenses_idx]).strip()
														
 
															+            
														
 
															+            record.items.append(item)
														
 
															+            logger.info(f"[安全可研批复投资] 解析到数据: No={item.no}, Name={item.name}, Level={item.level}")
														
 
															+    
														
 
															+    logger.info(f"[安全可研批复投资] 共解析到 {len(record.items)} 条数据")
														
 
															+    return record
														
 
															+
														
 
															+
														
 
															 def parse_feasibility_review_investment(markdown_content: str) -> FeasibilityReviewInvestment:
														
 
															     """
														
 
															     解析可研评审投资估算
														
@@ -821,6 +1071,9 @@ def parse_investment_record(markdown_content: str, investment_type: Optional[str
 
															     result = None
														
 
															     if investment_type == "fsApproval":
														
 
															         result = parse_feasibility_approval_investment(markdown_content)
														
 
															+    elif investment_type == "safetyFsApproval":
														
 
															+        # safetyFsApproval 使用独立的解析逻辑（湖北省格式）
														
 
															+        result = parse_safety_feasibility_approval_investment(markdown_content)
														
 
															     elif investment_type == "fsReview":
														
 
															         result = parse_feasibility_review_investment(markdown_content)
														
 
															     elif investment_type == "pdApproval":
														
--- a/pdf_converter_v2/parser/json_converter.py
+++ b/pdf_converter_v2/parser/json_converter.py
@@ -329,8 +329,8 @@ def parse_markdown_to_json(markdown_content: str, first_page_image: Optional[Ima
 
															                 op_list = parse_operational_conditions(markdown_content, require_title=False)
														
 
															             serialized = [oc.to_dict() if hasattr(oc, "to_dict") else oc for oc in (op_list or [])]
														
 
															             result = {"document_type": forced_document_type, "data": {"operationalConditions": serialized}}
														
 
															-        elif forced_document_type in ["fsApproval", "fsReview", "pdApproval"]:
														
 
															-            # 投资估算类型处理
														
 
															+        elif forced_document_type in ["fsApproval", "fsReview", "pdApproval", "safetyFsApproval"]:
														
 
															+            # 投资估算类型处理（包括安评类）
														
 
															             logger.info(f"[JSON转换] 处理投资估算类型: {forced_document_type}")
														
 
															             logger.debug(f"[JSON转换] Markdown内容长度: {len(markdown_content)} 字符")
														
@@ -338,12 +338,26 @@ def parse_markdown_to_json(markdown_content: str, first_page_image: Optional[Ima
 
															             if investment_record:
														
 
															                 data = investment_record.to_dict()
														
 
															-                logger.info(f"[JSON转换] 投资估算解析成功，共 {len(data)} 条记录")
														
 
															-                # 输出前3条记录的摘要
														
 
															-                if data:
														
 
															-                    for idx, item in enumerate(data[:3]):
														
 
															-                        logger.debug(f"[JSON转换] 记录 {idx+1}: No={item.get('No', '')}, Name={item.get('name', '')}, Level={item.get('Level', '')}")
														
 
															+                # 检查返回的数据格式：可能是列表（旧格式）或字典（包含projectInfo的新格式）
														
 
															+                if isinstance(data, dict) and "data" in data:
														
 
															+                    # 新格式：包含 projectInfo 和 data
														
 
															+                    logger.info(f"[JSON转换] 投资估算解析成功，共 {len(data['data'])} 条记录")
														
 
															+                    if data.get("projectInfo"):
														
 
															+                        logger.info(f"[JSON转换] 项目信息: {data['projectInfo'].get('projectName', '')}")
														
 
															+                    
														
 
															+                    # 输出前3条记录的摘要
														
 
															+                    if data["data"]:
														
 
															+                        for idx, item in enumerate(data["data"][:3]):
														
 
															+                            logger.debug(f"[JSON转换] 记录 {idx+1}: No={item.get('No', '')}, Name={item.get('name', '')}, Level={item.get('Level', '')}")
														
 
															+                else:
														
 
															+                    # 旧格式：直接是数据列表
														
 
															+                    logger.info(f"[JSON转换] 投资估算解析成功，共 {len(data)} 条记录")
														
 
															+                    
														
 
															+                    # 输出前3条记录的摘要
														
 
															+                    if data:
														
 
															+                        for idx, item in enumerate(data[:3]):
														
 
															+                            logger.debug(f"[JSON转换] 记录 {idx+1}: No={item.get('No', '')}, Name={item.get('name', '')}, Level={item.get('Level', '')}")
														
 
															                 result = {"document_type": forced_document_type, "data": data}
														
 
															             else:
														
@@ -432,8 +446,8 @@ def parse_markdown_to_json(markdown_content: str, first_page_image: Optional[Ima
 
															     elif doc_type == "emRec":
														
 
															         data = parse_electromagnetic_detection_record(markdown_content).to_dict()
														
 
															         result = {"document_type": doc_type, "data": data}
														
 
															-    elif doc_type in ["fsApproval", "fsReview", "pdApproval"]:
														
 
															-        # 新增：投资估算类型
														
 
															+    elif doc_type in ["fsApproval", "fsReview", "pdApproval", "safetyFsApproval"]:
														
 
															+        # 新增：投资估算类型（包括安评类）
														
 
															         logger.info(f"[JSON转换] 检测到投资估算类型: {doc_type}")
														
 
															         logger.debug(f"[JSON转换] Markdown内容长度: {len(markdown_content)} 字符")
														
--- a/pdf_converter_v2/requirements-paddle-npu.txt
+++ b/pdf_converter_v2/requirements-paddle-npu.txt
@@ -0,0 +1,20 @@
 
															+# PDF Converter v2 - 容器内依赖（Paddle NPU 环境已预装 paddle/paddlex，此处不重复安装）
														
 
															+
														
 
															+# 核心依赖
														
 
															+aiohttp>=3.8.0
														
 
															+aiofiles>=23.0.0
														
 
															+Pillow>=9.0.0
														
 
															+
														
 
															+# PDF 处理（至少安装一个）
														
 
															+pypdfium2>=4.0.0
														
 
															+pdf2image>=1.16.0
														
 
															+pdfplumber>=0.11.0
														
 
															+
														
 
															+# Web 框架
														
 
															+fastapi>=0.100.0
														
 
															+uvicorn[standard]>=0.23.0
														
 
															+pydantic>=2.0.0
														
 
															+typing-extensions>=4.0.0
														
 
															+
														
 
															+# 日志
														
 
															+loguru>=0.7.0
														
--- a/pdf_converter_v2/test.py
+++ b/pdf_converter_v2/test.py
@@ -26,7 +26,7 @@ except ImportError:
 
															     print("  安装命令: pip install PyMuPDF")
														
 
															 # ==================== 配置区域 ====================
														
 
															-pdf_path = '/home/hws/workspace/GitLab/Clerk2.5/pdf_converter_v2/2-数据源/5-（初设批复）晋电建设〔2019〕566号　国网山西省电力公司关于晋城周村220kV输变电工程初步设计的批复 .pdf'
														
 
															+pdf_path = '/home/hws/workspace/GitLab/Clerk2.5/pdf_converter_v2/3-数据/鄂电司发展〔2024〕124号　国网湖北省电力有限公司关于襄阳连云220千伏输变电工程可行性研究报告的批复.pdf'
														
 
															 output_dir = Path('extracted_tables')  # 原始表格输出目录（包含表格前文本）
														
 
															 merged_output_dir = Path('merged_tables')  # 合并后的表格输出目录（已剔除表格前文本）
														
 
															 filtered_output_dir = Path('filtered_tables')  # 筛选后的表格输出目录
														
@@ -89,7 +89,7 @@ TABLE_HEADER_RULES = {
 
															 }
														
 
															 # 是否启用表头过滤（如果为False，则提取所有表格）
														
 
															-ENABLE_HEADER_FILTER = True
														
 
															+ENABLE_HEADER_FILTER = False
														
 
															 # 要排除的规则名称列表（如果某个规则匹配了不该匹配的表格，可以在这里排除）
														
 
															 # 例如: EXCLUDE_RULES = ["物资采购合同2"] 将不会匹配该规则
														
--- a/pdf_converter_v2/test_api.py
+++ b/pdf_converter_v2/test_api.py
@@ -6,6 +6,7 @@ PDF Converter API 测试脚本
 
															 - fsApproval: 可研批复
														
 
															 - fsReview: 可研评审  
														
 
															 - pdApproval: 初设批复
														
 
															+- safetyFsApproval: 安评可研批复
														
 
															 以及现有类型：
														
 
															 - settlementReport: 结算报告
														
@@ -27,15 +28,20 @@ API_BASE_URL = "http://47.101.133.94:14213"
 
															 # 测试文件配置
														
 
															 TEST_DIR = Path(__file__).parent / "test"
														
 
															-# 测试用例：文件名 -> 文档类型
														
 
															+# 测试用例：文件名 -> (文档类型, 是否去水印, 是否只保留表格附件)
														
 
															+# 格式: 
														
 
															+#   "文件名": ("类型", 去水印, 只保留表格) - 完整格式
														
 
															+#   "文件名": ("类型", 去水印) - 兼容格式，只保留表格默认True
														
 
															+#   "文件名": "类型" - 旧格式，去水印False，只保留表格True
														
 
															 TEST_CASES = {
														
 
															     # 新增投资类型
														
 
															+    "鄂电司发展〔2024〕124号　国网湖北省电力有限公司关于襄阳连云220千伏输变电工程可行性研究报告的批复.pdf": ("safetyFsApproval", True,False),  # remove watermark; table_only=False, so attachments are NOT restricted to table pages
														
 
															     # "2-（可研批复）晋电发展〔2017〕831号+国网山西省电力公司关于临汾古县、晋城周村220kV输变电等工程可行性研究报告的批复.pdf.pdf": "fsApproval",
														
 
															     # "1-（可研评审）晋电经研规划〔2017〕187号(盖章)国网山西经研院关于山西晋城周村220kV输变电工程可行性研究报告的评审意见.pdf": "fsReview",
														
 
															     # "5-（初设批复）晋电建设〔2019〕566号　国网山西省电力公司关于晋城周村220kV输变电工程初步设计的批复 .pdf": "pdApproval",
														
 
															     # 现有类型
														
 
															     # "9-（结算报告）山西晋城周村220kV输变电工程结算审计报告.pdf": "settlementReport",
														
 
															-    "4-（初设评审）中电联电力建设技术经济咨询中心技经〔2019〕201号关于山西周村220kV输变电工程初步设计的评审意见.pdf": "designReview",
														
 
															+    # "4-（初设评审）中电联电力建设技术经济咨询中心技经〔2019〕201号关于山西周村220kV输变电工程初步设计的评审意见.pdf": "designReview",
														
 
															     # 决算报告
														
 
															     # "10-（决算报告）盖章页-山西晋城周村220kV输变电工程竣工决算审核报告（中瑞诚鉴字（2021）第002040号）.pdf": "finalAccount",
														
 
															 }
														
@@ -70,10 +76,21 @@ def check_health() -> bool:
 
															         return False
														
 
															-def upload_file(file_path: Path, document_type: str) -> Optional[str]:
														
 
															-    """上传文件并获取任务 ID"""
														
 
															+def upload_file(file_path: Path, document_type: str, remove_watermark: bool = False, table_only: bool = True) -> Optional[str]:
														
 
															+    """上传文件并获取任务 ID
														
 
															+    
														
 
															+    Args:
														
 
															+        file_path: 文件路径
														
 
															+        document_type: 文档类型
														
 
															+        remove_watermark: 是否去水印
														
 
															+        table_only: 是否只保留表格附件
														
 
															+    """
														
 
															     print(f"\n  📤 上传文件: {file_path.name}")
														
 
															     print(f"     类型: {document_type}")
														
 
															+    if remove_watermark:
														
 
															+        print(f"     去水印: 是")
														
 
															+    if table_only:
														
 
															+        print(f"     只保留表格: 是")
														
 
															     try:
														
 
															         with open(file_path, "rb") as f:
														
@@ -81,6 +98,15 @@ def upload_file(file_path: Path, document_type: str) -> Optional[str]:
 
															             # 使用 data 发送表单参数，参数名是 type（不是 document_type）
														
 
															             data = {"type": document_type}
														
 
															+            # 添加去水印参数
														
 
															+            if remove_watermark:
														
 
															+                data["remove_watermark"] = "true"
														
 
															+                data["watermark_light_threshold"] = "200"
														
 
															+                data["watermark_saturation_threshold"] = "30"
														
 
															+            
														
 
															+            # 添加只保留表格参数
														
 
															+            data["table_only"] = "true" if table_only else "false"
														
 
															+            
														
 
															             response = requests.post(
														
 
															                 f"{API_BASE_URL}/convert",
														
 
															                 files=files,
														
@@ -173,7 +199,21 @@ def validate_result(result: Dict[str, Any], expected_type: str) -> bool:
 
															         return False
														
 
															     # 对于投资类型，检查嵌套结构
														
 
															-    if expected_type in ["fsApproval", "fsReview", "pdApproval"]:
														
 
															+    if expected_type in ["fsApproval", "fsReview", "pdApproval", "safetyFsApproval"]:
														
 
															+        # 检查是否是新格式（包含 projectInfo）
														
 
															+        project_info = None
														
 
															+        if isinstance(data, dict) and "data" in data:
														
 
															+            # 新格式：{"projectInfo": {...}, "data": [...]}
														
 
															+            project_info = data.get("projectInfo")
														
 
															+            data = data["data"]
														
 
															+            
														
 
															+            if project_info:
														
 
															+                print(f"\n  📋 项目信息:")
														
 
															+                print(f"     工程名称: {project_info.get('projectName', '')}")
														
 
															+                print(f"     项目单位: {project_info.get('projectUnit', '')}")
														
 
															+                print(f"     设计单位: {project_info.get('designUnit', '')}")
														
 
															+        
														
 
															+        # 验证数据格式
														
 
															         if not isinstance(data, list):
														
 
															             print_result(False, f"数据格式错误: 期望 list, 实际 {type(data).__name__}")
														
 
															             return False
														
@@ -218,13 +258,24 @@ def validate_result(result: Dict[str, Any], expected_type: str) -> bool:
 
															     return True
														
 
															-def test_single_file(file_path: Path, document_type: str) -> bool:
														
 
															-    """测试单个文件"""
														
 
															+def test_single_file(file_path: Path, document_type: str, remove_watermark: bool = False, table_only: bool = True) -> bool:
														
 
															+    """测试单个文件
														
 
															+    
														
 
															+    Args:
														
 
															+        file_path: 文件路径
														
 
															+        document_type: 文档类型
														
 
															+        remove_watermark: 是否去水印
														
 
															+        table_only: 是否只保留表格附件
														
 
															+    """
														
 
															     print_header(f"测试: {document_type}")
														
 
															     print(f"  文件: {file_path.name}")
														
 
															+    if remove_watermark:
														
 
															+        print(f"  去水印: 是")
														
 
															+    if table_only:
														
 
															+        print(f"  只保留表格: 是")
														
 
															     # 1. 上传文件
														
 
															-    task_id = upload_file(file_path, document_type)
														
 
															+    task_id = upload_file(file_path, document_type, remove_watermark, table_only)
														
 
															     if not task_id:
														
 
															         return False
														
@@ -276,7 +327,23 @@ def run_all_tests():
 
															     skipped = 0
														
 
															     # 运行每个测试用例
														
 
															-    for filename, document_type in TEST_CASES.items():
														
 
															+    for filename, config in TEST_CASES.items():
														
 
															+        # 解析配置格式
														
 
															+        if isinstance(config, tuple):
														
 
															+            if len(config) >= 3:
														
 
															+                document_type, remove_watermark, table_only = config[:3]
														
 
															+            elif len(config) == 2:
														
 
															+                document_type, remove_watermark = config
														
 
															+                table_only = True  # 默认只保留表格
														
 
															+            else:
														
 
															+                document_type = config[0]
														
 
															+                remove_watermark = False
														
 
															+                table_only = True
														
 
															+        else:
														
 
															+            document_type = config
														
 
															+            remove_watermark = False
														
 
															+            table_only = True
														
 
															+        
														
 
															         file_path = TEST_DIR / filename
														
 
															         if not file_path.exists():
														
@@ -288,7 +355,7 @@ def run_all_tests():
 
															         total += 1
														
 
															         try:
														
 
															-            if test_single_file(file_path, document_type):
														
 
															+            if test_single_file(file_path, document_type, remove_watermark, table_only):
														
 
															                 passed += 1
														
 
															             else:
														
 
															                 failed += 1
														
@@ -319,11 +386,27 @@ def test_single(document_type: str):
 
															         return
														
 
															     # 查找对应的文件
														
 
															-    for filename, dtype in TEST_CASES.items():
														
 
															+    for filename, config in TEST_CASES.items():
														
 
															+        # 解析配置格式
														
 
															+        if isinstance(config, tuple):
														
 
															+            if len(config) >= 3:
														
 
															+                dtype, remove_watermark, table_only = config[:3]
														
 
															+            elif len(config) == 2:
														
 
															+                dtype, remove_watermark = config
														
 
															+                table_only = True
														
 
															+            else:
														
 
															+                dtype = config[0]
														
 
															+                remove_watermark = False
														
 
															+                table_only = True
														
 
															+        else:
														
 
															+            dtype = config
														
 
															+            remove_watermark = False
														
 
															+            table_only = True
														
 
															+        
														
 
															         if dtype == document_type:
														
 
															             file_path = TEST_DIR / filename
														
 
															             if file_path.exists():
														
 
															-                test_single_file(file_path, document_type)
														
 
															+                test_single_file(file_path, document_type, remove_watermark, table_only)
														
 
															                 return
														
 
															             else:
														
 
															                 print_result(False, f"文件不存在: {filename}")
														
@@ -332,7 +415,16 @@ def test_single(document_type: str):
 
															     print_result(False, f"未找到类型 {document_type} 的测试文件")
														
 
															-def test_ocr(image_path: Optional[str] = None) -> bool:
														
 
															+def test_ocr(
														
 
															+    image_path: Optional[str] = None,
														
 
															+    remove_watermark: bool = False,
														
 
															+    light_threshold: int = 200,
														
 
															+    saturation_threshold: int = 30,
														
 
															+    crop_header_footer: bool = False,
														
 
															+    header_ratio: float = 0.05,
														
 
															+    footer_ratio: float = 0.05,
														
 
															+    auto_detect_header_footer: bool = False
														
 
															+) -> bool:
														
 
															     """
														
 
															     测试 OCR 接口
														
@@ -341,6 +433,13 @@ def test_ocr(image_path: Optional[str] = None) -> bool:
 
															                    支持格式：
														
 
															                    - 图片文件：.png, .jpg, .jpeg
														
 
															                    - txt文件：包含base64编码的图片数据（可带data:image/xxx;base64,前缀）
														
 
															+        remove_watermark: 是否去除水印
														
 
															+        light_threshold: 水印亮度阈值（0-255），默认200
														
 
															+        saturation_threshold: 水印饱和度阈值（0-255），默认30
														
 
															+        crop_header_footer: 是否裁剪页眉页脚
														
 
															+        header_ratio: 页眉裁剪比例（0-1），默认0.05
														
 
															+        footer_ratio: 页脚裁剪比例（0-1），默认0.05
														
 
															+        auto_detect_header_footer: 是否自动检测页眉页脚边界
														
 
															     Returns:
														
 
															         是否测试成功
														
@@ -419,14 +518,33 @@ def test_ocr(image_path: Optional[str] = None) -> bool:
 
															     # 调用 OCR 接口
														
 
															     print(f"\n  📤 调用 OCR 接口...")
														
 
															+    # 构建请求参数
														
 
															+    request_data = {
														
 
															+        "image_base64": image_base64,
														
 
															+        "image_format": image_format
														
 
															+    }
														
 
															+    
														
 
															+    if crop_header_footer:
														
 
															+        request_data["crop_header_footer"] = True
														
 
															+        if auto_detect_header_footer:
														
 
															+            request_data["auto_detect_header_footer"] = True
														
 
															+            print(f"  ✂️  裁剪页眉页脚: 自动检测模式")
														
 
															+        else:
														
 
															+            request_data["header_ratio"] = header_ratio
														
 
															+            request_data["footer_ratio"] = footer_ratio
														
 
															+            print(f"  ✂️  裁剪页眉页脚: 是 (顶部={header_ratio*100:.0f}%, 底部={footer_ratio*100:.0f}%)")
														
 
															+    
														
 
															+    if remove_watermark:
														
 
															+        request_data["remove_watermark"] = True
														
 
															+        request_data["watermark_light_threshold"] = light_threshold
														
 
															+        request_data["watermark_saturation_threshold"] = saturation_threshold
														
 
															+        print(f"  🔧 去水印: 是 (亮度阈值={light_threshold}, 饱和度阈值={saturation_threshold})")
														
 
															+    
														
 
															     try:
														
 
															         start_time = time.time()
														
 
															         response = requests.post(
														
 
															             f"{API_BASE_URL}/ocr",
														
 
															-            json={
														
 
															-                "image_base64": image_base64,
														
 
															-                "image_format": image_format
														
 
															-            },
														
 
															+            json=request_data,
														
 
															             timeout=120
														
 
															         )
														
 
															         elapsed = time.time() - start_time
														
@@ -500,15 +618,75 @@ if __name__ == "__main__":
 
															             print("  python test_api.py          # 运行所有测试")
														
 
															             print("  python test_api.py <type>   # 测试指定类型")
														
 
															             print("  python test_api.py ocr      # 测试 OCR 接口")
														
 
															-            print("  python test_api.py ocr <image_path>  # 测试 OCR（指定图片）")
														
 
															+            print("  python test_api.py ocr <image_path>  # 测试 OCR（指定图片或txt）")
														
 
															+            print("  python test_api.py ocr <image_path> --nowm  # 测试 OCR 并去水印")
														
 
															+            print("  python test_api.py ocr <image_path> --crop  # 测试 OCR 并裁剪页眉页脚")
														
 
															+            print("  python test_api.py ocr <image_path> --nowm --crop  # 同时去水印和裁剪")
														
 
															             print("\n可用类型:")
														
 
															             for dtype in set(TEST_CASES.values()):
														
 
															                 print(f"  - {dtype}")
														
 
															             print("  - ocr  (OCR 图片识别)")
														
 
															+            print("\nOCR 去水印参数:")
														
 
															+            print("  --nowm         启用去水印")
														
 
															+            print("  --light=N      亮度阈值（0-255，默认200）")
														
 
															+            print("  --sat=N        饱和度阈值（0-255，默认30）")
														
 
															+            print("\nOCR 裁剪页眉页脚参数:")
														
 
															+            print("  --crop         启用裁剪页眉页脚（固定比例模式）")
														
 
															+            print("  --crop-auto    启用裁剪页眉页脚（自动检测模式）")
														
 
															+            print("  --header=N     页眉裁剪比例（0-1，默认0.05表示5%）")
														
 
															+            print("  --footer=N     页脚裁剪比例（0-1，默认0.05表示5%）")
														
 
															         elif doc_type == "ocr":
														
 
															-            # 测试 OCR 接口
														
 
															-            image_path = sys.argv[2] if len(sys.argv) > 2 else None
														
 
															-            test_ocr(image_path)
														
 
															+            # 解析 OCR 参数
														
 
															+            image_path = None
														
 
															+            remove_watermark = False
														
 
															+            light_threshold = 200
														
 
															+            saturation_threshold = 30
														
 
															+            crop_header_footer = False
														
 
															+            header_ratio = 0.05
														
 
															+            footer_ratio = 0.05
														
 
															+            auto_detect_header_footer = False
														
 
															+            
														
 
															+            for arg in sys.argv[2:]:
														
 
															+                if arg == "--nowm":
														
 
															+                    remove_watermark = True
														
 
															+                elif arg == "--crop":
														
 
															+                    crop_header_footer = True
														
 
															+                elif arg == "--crop-auto":
														
 
															+                    crop_header_footer = True
														
 
															+                    auto_detect_header_footer = True
														
 
															+                elif arg.startswith("--light="):
														
 
															+                    try:
														
 
															+                        light_threshold = int(arg.split("=")[1])
														
 
															+                    except ValueError:
														
 
															+                        print(f"警告: 无效的亮度阈值 {arg}，使用默认值 200")
														
 
															+                elif arg.startswith("--sat="):
														
 
															+                    try:
														
 
															+                        saturation_threshold = int(arg.split("=")[1])
														
 
															+                    except ValueError:
														
 
															+                        print(f"警告: 无效的饱和度阈值 {arg}，使用默认值 30")
														
 
															+                elif arg.startswith("--header="):
														
 
															+                    try:
														
 
															+                        header_ratio = float(arg.split("=")[1])
														
 
															+                    except ValueError:
														
 
															+                        print(f"警告: 无效的页眉比例 {arg}，使用默认值 0.05")
														
 
															+                elif arg.startswith("--footer="):
														
 
															+                    try:
														
 
															+                        footer_ratio = float(arg.split("=")[1])
														
 
															+                    except ValueError:
														
 
															+                        print(f"警告: 无效的页脚比例 {arg}，使用默认值 0.05")
														
 
															+                elif not arg.startswith("--"):
														
 
															+                    image_path = arg
														
 
															+            
														
 
															+            test_ocr(
														
 
															+                image_path, 
														
 
															+                remove_watermark, 
														
 
															+                light_threshold, 
														
 
															+                saturation_threshold,
														
 
															+                crop_header_footer,
														
 
															+                header_ratio,
														
 
															+                footer_ratio,
														
 
															+                auto_detect_header_footer
														
 
															+            )
														
 
															         else:
														
 
															             test_single(doc_type)
														
 
															     else:
														
--- a/pdf_converter_v2/test_no.py
+++ b/pdf_converter_v2/test_no.py
@@ -45,11 +45,20 @@ except ImportError:
 
															     logger.info("[附件切割] 安装命令: pip install PyPDF2")
														
 
															 # 配置
														
 
															-PDF_PATH = '/home/hws/workspace/GitLab/Clerk2.5/pdf_converter_v2/2-数据源/4-（初设评审）中电联电力建设技术经济咨询中心技经〔2019〕201号关于山西周村220kV输变电工程初步设计的评审意见.pdf'
														
 
															+PDF_PATH = '/home/hws/workspace/GitLab/Clerk2.5/pdf_converter_v2/2-数据源/1-（可研评审）晋电经研规划〔2017〕187号(盖章)国网山西经研院关于山西晋城周村220kV输变电工程可行性研究报告的评审意见.pdf'
														
 
															 OUTPUT_DIR = Path('附件页')
														
 
															 USE_OCR = True  # 是否启用 OCR
														
 
															 OCR_LANG = 'chi_sim+eng'  # OCR 语言
														
 
															-DEBUG_MODE = True  # 是否启用调试模式（显示每页的文本内容）
														
 
															+DEBUG_MODE = False  # 是否启用调试模式（显示每页的文本内容）
														
 
															+
														
 
															+# 去水印配置
														
 
															+REMOVE_WATERMARK = False  # 是否对切割后的附件页PDF去水印
														
 
															+WATERMARK_LIGHT_THRESHOLD = 200  # 水印亮度阈值（0-255），高于此值的浅色像素可能是水印
														
 
															+WATERMARK_SATURATION_THRESHOLD = 30  # 水印饱和度阈值（0-255），低于此值的低饱和度像素可能是水印
														
 
															+WATERMARK_DPI = 200  # PDF转图片的DPI（用于去水印）
														
 
															+
														
 
															+# 表格附件过滤配置
														
 
															+TABLE_ONLY = True  # 是否只保留包含表格的附件页（过滤掉示意图、评审意见等）
														
 
															 # 附件页识别关键词
														
 
															 ATTACHMENT_START_KEYWORDS = [
														
@@ -59,6 +68,40 @@ ATTACHMENT_START_KEYWORDS = [
 
															     '附 件：',
														
 
															 ]
														
 
															+# 表格附件识别关键词（用于过滤只保留包含表格的附件）
														
 
															+TABLE_ATTACHMENT_KEYWORDS = [
														
 
															+    '项目表',
														
 
															+    '投资估算',
														
 
															+    '工程投资',
														
 
															+    '建设规模',
														
 
															+    '技术方案',
														
 
															+    '变电工程',
														
 
															+    '线路工程',
														
 
															+    '静态投资',
														
 
															+    '动态投资',
														
 
															+    '单位造价',
														
 
															+    '设备购置费',
														
 
															+    '安装工程费',
														
 
															+    '建筑工程费',
														
 
															+    '其他费用',
														
 
															+    '基本预备费',
														
 
															+]
														
 
															+
														
 
															+# 非表格附件识别关键词（用于识别需要跳过的附件）
														
 
															+NON_TABLE_ATTACHMENT_KEYWORDS = [
														
 
															+    '示意图',
														
 
															+    '接入系统示意图',
														
 
															+    '母线间隔排列图',
														
 
															+    '评审意见',
														
 
															+    '技术监督意见',
														
 
															+    '参会单位',
														
 
															+    '人员一览表',
														
 
															+    '经济性评价',
														
 
															+    '财务合规',
														
 
															+    '审核结果',
														
 
															+    '预算编制衔接',
														
 
															+]
														
 
															+
														
 
															 def ocr_page_image(image) -> str:
														
 
															     """
														
 
															     对图片进行 OCR 识别（优先使用 Tesseract，备用 PaddleOCR）
														
@@ -143,6 +186,63 @@ def extract_page_text(page, use_ocr: bool = False) -> str:
 
															     logger.warning(f"[附件切割] 第{page.page_number}页: 无法提取文本（OCR未启用或不可用）")
														
 
															     return ""
														
 
															+def is_table_attachment_page(text: str, page) -> bool:
														
 
															+    """
														
 
															+    判断是否是包含表格的附件页
														
 
															+    
														
 
															+    Args:
														
 
															+        text: 页面文本
														
 
															+        page: pdfplumber page 对象
														
 
															+    
														
 
															+    Returns:
														
 
															+        bool: 是否是表格附件页
														
 
															+    """
														
 
															+    if not text:
														
 
															+        return False
														
 
															+    
														
 
															+    text_no_space = text.replace(' ', '').replace('\u3000', '')
														
 
															+    
														
 
															+    # 检查是否包含非表格附件关键词（如示意图、评审意见等）
														
 
															+    for keyword in NON_TABLE_ATTACHMENT_KEYWORDS:
														
 
															+        keyword_no_space = keyword.replace(' ', '').replace('\u3000', '')
														
 
															+        if keyword_no_space in text_no_space:
														
 
															+            logger.debug(f"[附件切割] 检测到非表格附件关键词: {keyword}")
														
 
															+            return False
														
 
															+    
														
 
															+    # 检查是否包含表格附件关键词
														
 
															+    has_table_keyword = False
														
 
															+    for keyword in TABLE_ATTACHMENT_KEYWORDS:
														
 
															+        keyword_no_space = keyword.replace(' ', '').replace('\u3000', '')
														
 
															+        if keyword_no_space in text_no_space:
														
 
															+            logger.debug(f"[附件切割] 检测到表格关键词: {keyword}")
														
 
															+            has_table_keyword = True
														
 
															+            break
														
 
															+    
														
 
															+    # 如果有表格关键词，直接返回True
														
 
															+    if has_table_keyword:
														
 
															+        return True
														
 
															+    
														
 
															+    # 检查页面是否包含表格（使用pdfplumber的表格检测）
														
 
															+    if page is not None:
														
 
															+        try:
														
 
															+            tables = page.extract_tables()
														
 
															+            if tables and len(tables) > 0:
														
 
															+                # 检查表格是否足够大（至少有3行3列的数据表格）
														
 
															+                for table in tables:
														
 
															+                    if table and len(table) >= 3:
														
 
															+                        # 检查是否有多列
														
 
															+                        non_empty_rows = [row for row in table if row and any(cell for cell in row if cell)]
														
 
															+                        if len(non_empty_rows) >= 3:
														
 
															+                            row_with_most_cols = max(non_empty_rows, key=lambda r: len([c for c in r if c]))
														
 
															+                            if len([c for c in row_with_most_cols if c]) >= 3:
														
 
															+                                logger.debug(f"[附件切割] 检测到表格: {len(non_empty_rows)}行")
														
 
															+                                return True
														
 
															+        except Exception as e:
														
 
															+            logger.warning(f"[附件切割] 表格检测失败: {e}")
														
 
															+    
														
 
															+    return False
														
 
															+
														
 
															+
														
 
															 def is_attachment_start_page(text: str) -> bool:
														
 
															     """
														
 
															     判断是否是附件清单页（附件开始的前一页）
														
@@ -339,7 +439,10 @@ def extract_pages(pdf_path: str, page_numbers: list, output_path: str):
 
															     logger.info(f"[附件切割] 已保存到: {output_path}")
														
 
															     print(f"✓ 已保存到: {output_path}")
														
 
															-def split_attachment_pages(pdf_path: str, output_dir: Path, use_ocr: bool = False, debug: bool = False):
														
 
															+def split_attachment_pages(pdf_path: str, output_dir: Path, use_ocr: bool = False, debug: bool = False, 
														
 
															+                          remove_watermark: bool = False, watermark_light_threshold: int = 200,
														
 
															+                          watermark_saturation_threshold: int = 30, watermark_dpi: int = 200,
														
 
															+                          table_only: bool = False):
														
 
															     """
														
 
															     查找并切割附件页
														
@@ -348,8 +451,14 @@ def split_attachment_pages(pdf_path: str, output_dir: Path, use_ocr: bool = Fals
 
															         output_dir: 输出目录
														
 
															         use_ocr: 是否使用 OCR
														
 
															         debug: 是否输出调试信息
														
 
															+        remove_watermark: 是否对切割后的附件页PDF去水印
														
 
															+        watermark_light_threshold: 水印亮度阈值（0-255）
														
 
															+        watermark_saturation_threshold: 水印饱和度阈值（0-255）
														
 
															+        watermark_dpi: PDF转图片的DPI
														
 
															+        table_only: 是否只保留包含表格的附件页（过滤掉示意图、评审意见等）
														
 
															     """
														
 
															     logger.info(f"[附件切割] 开始处理PDF: {pdf_path}")
														
 
															+    logger.info(f"[附件切割] 只保留表格附件: {'是' if table_only else '否'}")
														
 
															     # 查找附件开始页
														
 
															     attachment_start = find_attachment_start_page(pdf_path, use_ocr=use_ocr, debug=debug)
														
@@ -359,16 +468,65 @@ def split_attachment_pages(pdf_path: str, output_dir: Path, use_ocr: bool = Fals
 
															         print("\n未找到附件页")
														
 
															         return
														
 
															-    # 获取总页数
														
 
															+    # 获取总页数和筛选表格附件页
														
 
															     with pdfplumber.open(pdf_path) as pdf:
														
 
															         total_pages = len(pdf.pages)
														
 
															-    
														
 
															-    # 附件页范围：从附件开始页到最后一页
														
 
															-    attachment_pages = list(range(attachment_start, total_pages + 1))
														
 
															-    
														
 
															-    logger.info(f"[附件切割] 附件页范围: {attachment_start}-{total_pages}, 共 {len(attachment_pages)} 页")
														
 
															-    print(f"\n附件页范围: 第 {attachment_start} 页 到 第 {total_pages} 页")
														
 
															-    print(f"共 {len(attachment_pages)} 页")
														
 
															+        
														
 
															+        if table_only:
														
 
															+            # 只保留包含表格的附件页
														
 
															+            logger.info(f"[附件切割] 启用表格附件过滤，开始筛选...")
														
 
															+            print(f"\n启用表格附件过滤，开始筛选...")
														
 
															+            
														
 
															+            attachment_pages = []
														
 
															+            current_table_section = []  # 当前表格区段的页面
														
 
															+            in_table_section = False  # 是否在表格区段内
														
 
															+            
														
 
															+            for page_num in range(attachment_start, total_pages + 1):
														
 
															+                page = pdf.pages[page_num - 1]
														
 
															+                text = extract_page_text(page, use_ocr=use_ocr)
														
 
															+                
														
 
															+                is_table_page = is_table_attachment_page(text, page)
														
 
															+                
														
 
															+                if debug:
														
 
															+                    print(f"  页面 {page_num}: {'表格页' if is_table_page else '非表格页'}")
														
 
															+                
														
 
															+                if is_table_page:
														
 
															+                    if not in_table_section:
														
 
															+                        # 开始新的表格区段
														
 
															+                        in_table_section = True
														
 
															+                        current_table_section = [page_num]
														
 
															+                        logger.debug(f"[附件切割] 开始表格区段: 第 {page_num} 页")
														
 
															+                    else:
														
 
															+                        # 继续当前表格区段
														
 
															+                        current_table_section.append(page_num)
														
 
															+                else:
														
 
															+                    if in_table_section:
														
 
															+                        # 结束当前表格区段，保存
														
 
															+                        attachment_pages.extend(current_table_section)
														
 
															+                        logger.info(f"[附件切割] 表格区段结束: {current_table_section[0]}-{current_table_section[-1]}")
														
 
															+                        current_table_section = []
														
 
															+                        in_table_section = False
														
 
															+            
														
 
															+            # 处理最后一个表格区段
														
 
															+            if in_table_section and current_table_section:
														
 
															+                attachment_pages.extend(current_table_section)
														
 
															+                logger.info(f"[附件切割] 最后表格区段: {current_table_section[0]}-{current_table_section[-1]}")
														
 
															+            
														
 
															+            if not attachment_pages:
														
 
															+                logger.warning(f"[附件切割] 未找到包含表格的附件页")
														
 
															+                print("\n未找到包含表格的附件页")
														
 
															+                return
														
 
															+            
														
 
															+            logger.info(f"[附件切割] 筛选后的表格附件页: {attachment_pages}")
														
 
															+            print(f"\n筛选后的表格附件页: {attachment_pages}")
														
 
															+            print(f"共 {len(attachment_pages)} 页")
														
 
															+        else:
														
 
															+            # 附件页范围：从附件开始页到最后一页
														
 
															+            attachment_pages = list(range(attachment_start, total_pages + 1))
														
 
															+            
														
 
															+            logger.info(f"[附件切割] 附件页范围: {attachment_start}-{total_pages}, 共 {len(attachment_pages)} 页")
														
 
															+            print(f"\n附件页范围: 第 {attachment_start} 页 到 第 {total_pages} 页")
														
 
															+            print(f"共 {len(attachment_pages)} 页")
														
 
															     # 切割附件页
														
 
															     print("\n" + "=" * 60)
														
@@ -379,14 +537,70 @@ def split_attachment_pages(pdf_path: str, output_dir: Path, use_ocr: bool = Fals
 
															     output_dir.mkdir(parents=True, exist_ok=True)
														
 
															     # 保存所有附件页为一个文件
														
 
															-    output_file = output_dir / f"{pdf_path.stem}_附件页_{attachment_start}-{total_pages}.pdf"
														
 
															+    if table_only:
														
 
															+        # 表格附件模式：使用筛选后的页面范围
														
 
															+        page_range_str = f"{min(attachment_pages)}_{max(attachment_pages)}" if attachment_pages else "none"
														
 
															+        output_file = output_dir / f"{pdf_path.stem}_表格附件页_{page_range_str}.pdf"
														
 
															+    else:
														
 
															+        output_file = output_dir / f"{pdf_path.stem}_附件页_{attachment_start}-{total_pages}.pdf"
														
 
															+    
														
 
															     logger.info(f"[附件切割] 输出文件: {output_file}")
														
 
															     extract_pages(pdf_path, attachment_pages, output_file)
														
 
															     logger.info(f"[附件切割] 切割完成: {len(attachment_pages)} 页附件已保存")
														
 
															     print(f"\n✓ 切割完成！")
														
 
															     print(f"附件页数: {len(attachment_pages)} 页")
														
 
															-    print(f"输出目录: {output_dir.absolute()}")
														
 
															+    print(f"输出文件: {output_file}")
														
 
															+    
														
 
															+    # 如果启用去水印，对切割后的附件页PDF进行去水印处理
														
 
															+    if remove_watermark:
														
 
															+        logger.info(f"[附件切割] 开始对附件页PDF进行去水印处理...")
														
 
															+        print("\n" + "=" * 60)
														
 
															+        print("开始去水印处理")
														
 
															+        print("=" * 60)
														
 
															+        
														
 
															+        try:
														
 
															+            # 导入去水印模块
														
 
															+            import sys
														
 
															+            from pathlib import Path as PathLib
														
 
															+            sys.path.insert(0, str(PathLib(__file__).parent))
														
 
															+            
														
 
															+            from utils.pdf_watermark_remover import remove_watermark_from_pdf
														
 
															+            
														
 
															+            # 去水印后的PDF路径
														
 
															+            nowm_output_file = output_dir / f"{output_file.stem}_nowm.pdf"
														
 
															+            
														
 
															+            logger.info(f"[附件切割] 去水印参数: 亮度阈值={watermark_light_threshold}, 饱和度阈值={watermark_saturation_threshold}, DPI={watermark_dpi}")
														
 
															+            print(f"去水印参数:")
														
 
															+            print(f"  - 亮度阈值: {watermark_light_threshold}")
														
 
															+            print(f"  - 饱和度阈值: {watermark_saturation_threshold}")
														
 
															+            print(f"  - DPI: {watermark_dpi}")
														
 
															+            
														
 
															+            # 执行去水印
														
 
															+            success = remove_watermark_from_pdf(
														
 
															+                input_pdf=str(output_file),
														
 
															+                output_pdf=str(nowm_output_file),
														
 
															+                light_threshold=watermark_light_threshold,
														
 
															+                saturation_threshold=watermark_saturation_threshold,
														
 
															+                dpi=watermark_dpi
														
 
															+            )
														
 
															+            
														
 
															+            if success and nowm_output_file.exists():
														
 
															+                logger.info(f"[附件切割] 去水印完成: {nowm_output_file}")
														
 
															+                print(f"\n✓ 去水印完成！")
														
 
															+                print(f"去水印后的文件: {nowm_output_file}")
														
 
															+            else:
														
 
															+                logger.warning(f"[附件切割] 去水印失败")
														
 
															+                print(f"\n⚠ 去水印失败，请检查日志")
														
 
															+        except ImportError as e:
														
 
															+            logger.error(f"[附件切割] 导入去水印模块失败: {e}")
														
 
															+            print(f"\n⚠ 去水印模块导入失败: {e}")
														
 
															+            print("请确保 utils/pdf_watermark_remover.py 文件存在")
														
 
															+        except Exception as e:
														
 
															+            logger.exception(f"[附件切割] 去水印处理失败: {e}")
														
 
															+            print(f"\n⚠ 去水印处理失败: {e}")
														
 
															+    
														
 
															+    print(f"\n输出目录: {output_dir.absolute()}")
														
 
															 if __name__ == '__main__':
														
 
															     logger.info("[附件切割] " + "=" * 50)
														
@@ -397,6 +611,19 @@ if __name__ == '__main__':
 
															     print("PDF 附件页识别和切割工具")
														
 
															     print("=" * 60)
														
 
															+    # 显示配置信息
														
 
															+    print("\n配置信息:")
														
 
															+    print(f"  - PDF文件: {PDF_PATH}")
														
 
															+    print(f"  - 输出目录: {OUTPUT_DIR}")
														
 
															+    print(f"  - OCR: {'启用' if USE_OCR else '禁用'}")
														
 
															+    print(f"  - 调试模式: {'启用' if DEBUG_MODE else '禁用'}")
														
 
															+    print(f"  - 只保留表格附件: {'启用' if TABLE_ONLY else '禁用'}")
														
 
															+    print(f"  - 去水印: {'启用' if REMOVE_WATERMARK else '禁用'}")
														
 
															+    if REMOVE_WATERMARK:
														
 
															+        print(f"    * 亮度阈值: {WATERMARK_LIGHT_THRESHOLD}")
														
 
															+        print(f"    * 饱和度阈值: {WATERMARK_SATURATION_THRESHOLD}")
														
 
															+        print(f"    * DPI: {WATERMARK_DPI}")
														
 
															+    
														
 
															     # 检查依赖
														
 
															     if not TESSERACT_AVAILABLE and USE_OCR:
														
 
															         logger.warning("[附件切割] OCR 功能不可用")
														
@@ -412,7 +639,26 @@ if __name__ == '__main__':
 
															         print("安装方法:")
														
 
															         print("  pip install PyPDF2\n")
														
 
															+    if REMOVE_WATERMARK:
														
 
															+        print("\n⚠ 去水印功能需要以下依赖:")
														
 
															+        print("  - OpenCV (cv2)")
														
 
															+        print("  - Pillow (PIL)")
														
 
															+        print("  - pdf2image")
														
 
															+        print("  - PyPDF2")
														
 
															+        print("安装命令:")
														
 
															+        print("  pip install opencv-python pillow pdf2image PyPDF2\n")
														
 
															+    
														
 
															     # 执行切割
														
 
															-    logger.info(f"[附件切割] 配置: PDF={PDF_PATH}, 输出={OUTPUT_DIR}, OCR={USE_OCR}, DEBUG={DEBUG_MODE}")
														
 
															-    split_attachment_pages(PDF_PATH, OUTPUT_DIR, use_ocr=USE_OCR, debug=DEBUG_MODE)
														
 
															+    logger.info(f"[附件切割] 配置: PDF={PDF_PATH}, 输出={OUTPUT_DIR}, OCR={USE_OCR}, DEBUG={DEBUG_MODE}, 表格附件={TABLE_ONLY}, 去水印={REMOVE_WATERMARK}")
														
 
															+    split_attachment_pages(
														
 
															+        PDF_PATH, 
														
 
															+        OUTPUT_DIR, 
														
 
															+        use_ocr=USE_OCR, 
														
 
															+        debug=DEBUG_MODE,
														
 
															+        remove_watermark=REMOVE_WATERMARK,
														
 
															+        watermark_light_threshold=WATERMARK_LIGHT_THRESHOLD,
														
 
															+        watermark_saturation_threshold=WATERMARK_SATURATION_THRESHOLD,
														
 
															+        watermark_dpi=WATERMARK_DPI,
														
 
															+        table_only=TABLE_ONLY
														
 
															+    )
														
 
															     logger.info("[附件切割] 程序执行完成")
														
--- a/pdf_converter_v2/utils/image_preprocessor.py
+++ b/pdf_converter_v2/utils/image_preprocessor.py
@@ -0,0 +1,526 @@
 
															+"""
														
 
															+图像预处理工具 - 包含去水印等功能
														
 
															+
														
 
															+支持的预处理操作：
														
 
															+- 去水印（颜色过滤法）
														
 
															+- 灰度转换
														
 
															+- 二值化
														
 
															+- 去噪
														
 
															+"""
														
 
															+
														
 
															+import numpy as np
														
 
															+from pathlib import Path
														
 
															+from typing import Optional, Tuple
														
 
															+from loguru import logger
														
 
															+
														
 
															+try:
														
 
															+    from PIL import Image
														
 
															+    PIL_AVAILABLE = True
														
 
															+except ImportError:
														
 
															+    PIL_AVAILABLE = False
														
 
															+    logger.warning("[图像预处理] PIL 未安装，部分功能不可用")
														
 
															+
														
 
															+try:
														
 
															+    import cv2
														
 
															+    CV2_AVAILABLE = True
														
 
															+except ImportError:
														
 
															+    CV2_AVAILABLE = False
														
 
															+    logger.warning("[图像预处理] OpenCV 未安装，部分功能不可用")
														
 
															+
														
 
															+
														
 
															+def remove_watermark(
														
 
															+    image_path: str,
														
 
															+    output_path: Optional[str] = None,
														
 
															+    light_threshold: int = 200,
														
 
															+    saturation_threshold: int = 30,
														
 
															+    method: str = "auto"
														
 
															+) -> str:
														
 
															+    """
														
 
															+    去除图片水印
														
 
															+    
														
 
															+    原理：大多数水印是浅色或半透明的，通过以下方式去除：
														
 
															+    1. 将浅色像素（亮度高、饱和度低）替换为白色
														
 
															+    2. 保留深色文字内容
														
 
															+    
														
 
															+    Args:
														
 
															+        image_path: 输入图片路径
														
 
															+        output_path: 输出图片路径，默认在原文件名后加 _nowm
														
 
															+        light_threshold: 亮度阈值（0-255），高于此值的浅色像素可能是水印
														
 
															+        saturation_threshold: 饱和度阈值（0-255），低于此值的低饱和度像素可能是水印
														
 
															+        method: 去水印方法
														
 
															+            - "auto": 自动选择最佳方法
														
 
															+            - "light": 基于亮度的简单方法（快速）
														
 
															+            - "hsv": 基于HSV颜色空间的方法（更精确）
														
 
															+            - "adaptive": 自适应阈值方法
														
 
															+    
														
 
															+    Returns:
														
 
															+        处理后的图片路径
														
 
															+    """
														
 
															+    if not CV2_AVAILABLE:
														
 
															+        logger.warning("[去水印] OpenCV 未安装，跳过去水印处理")
														
 
															+        return image_path
														
 
															+    
														
 
															+    logger.info(f"[去水印] 开始处理: {image_path}")
														
 
															+    logger.info(f"[去水印] 方法: {method}, 亮度阈值: {light_threshold}, 饱和度阈值: {saturation_threshold}")
														
 
															+    
														
 
															+    # 读取图片
														
 
															+    img = cv2.imread(image_path)
														
 
															+    if img is None:
														
 
															+        logger.error(f"[去水印] 无法读取图片: {image_path}")
														
 
															+        return image_path
														
 
															+    
														
 
															+    original_shape = img.shape
														
 
															+    logger.info(f"[去水印] 图片尺寸: {original_shape[1]}x{original_shape[0]}")
														
 
															+    
														
 
															+    # 根据方法选择处理逻辑
														
 
															+    if method == "auto":
														
 
															+        # 自动检测：先尝试 HSV 方法，如果效果不好则用 adaptive
														
 
															+        method = "hsv"
														
 
															+    
														
 
															+    if method == "light":
														
 
															+        # 简单亮度方法：将浅色像素替换为白色
														
 
															+        result = _remove_watermark_light(img, light_threshold)
														
 
															+    elif method == "hsv":
														
 
															+        # HSV 方法：基于亮度和饱和度
														
 
															+        result = _remove_watermark_hsv(img, light_threshold, saturation_threshold)
														
 
															+    elif method == "adaptive":
														
 
															+        # 自适应方法：使用自适应阈值
														
 
															+        result = _remove_watermark_adaptive(img)
														
 
															+    else:
														
 
															+        logger.warning(f"[去水印] 未知方法: {method}，使用 hsv")
														
 
															+        result = _remove_watermark_hsv(img, light_threshold, saturation_threshold)
														
 
															+    
														
 
															+    # 确定输出路径
														
 
															+    if output_path is None:
														
 
															+        path = Path(image_path)
														
 
															+        output_path = str(path.parent / f"{path.stem}_nowm{path.suffix}")
														
 
															+    
														
 
															+    # 保存结果
														
 
															+    cv2.imwrite(output_path, result)
														
 
															+    logger.info(f"[去水印] 处理完成，保存到: {output_path}")
														
 
															+    
														
 
															+    return output_path
														
 
															+
														
 
															+
														
 
															+def _remove_watermark_light(img: np.ndarray, threshold: int = 200) -> np.ndarray:
														
 
															+    """
														
 
															+    简单亮度方法：将浅色像素替换为白色
														
 
															+    
														
 
															+    适用于：浅色/灰色水印
														
 
															+    """
														
 
															+    # 转为灰度图
														
 
															+    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
														
 
															+    
														
 
															+    # 创建掩码：亮度高于阈值的区域
														
 
															+    mask = gray > threshold
														
 
															+    
														
 
															+    # 将掩码区域设为白色
														
 
															+    result = img.copy()
														
 
															+    result[mask] = [255, 255, 255]
														
 
															+    
														
 
															+    return result
														
 
															+
														
 
															+
														
 
															+def _remove_watermark_hsv(
														
 
															+    img: np.ndarray,
														
 
															+    light_threshold: int = 200,
														
 
															+    saturation_threshold: int = 30
														
 
															+) -> np.ndarray:
														
 
															+    """
														
 
															+    HSV 方法：基于亮度和饱和度去除水印
														
 
															+    
														
 
															+    原理：水印通常是高亮度、低饱和度的
														
 
															+    适用于：彩色水印、半透明水印
														
 
															+    """
														
 
															+    # 转换到 HSV 颜色空间
														
 
															+    hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
														
 
															+    
														
 
															+    # 分离通道
														
 
															+    h, s, v = cv2.split(hsv)
														
 
															+    
														
 
															+    # 创建水印掩码：高亮度 AND 低饱和度
														
 
															+    watermark_mask = (v > light_threshold) & (s < saturation_threshold)
														
 
															+    
														
 
															+    # 将水印区域设为白色
														
 
															+    result = img.copy()
														
 
															+    result[watermark_mask] = [255, 255, 255]
														
 
															+    
														
 
															+    # 可选：对边缘进行平滑处理
														
 
															+    # kernel = np.ones((3, 3), np.uint8)
														
 
															+    # watermark_mask_dilated = cv2.dilate(watermark_mask.astype(np.uint8), kernel, iterations=1)
														
 
															+    # result[watermark_mask_dilated == 1] = [255, 255, 255]
														
 
															+    
														
 
															+    return result
														
 
															+
														
 
															+
														
 
															+def _remove_watermark_adaptive(img: np.ndarray) -> np.ndarray:
														
 
															+    """
														
 
															+    自适应阈值方法
														
 
															+    
														
 
															+    适用于：复杂背景、不均匀光照
														
 
															+    """
														
 
															+    # 转为灰度图
														
 
															+    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
														
 
															+    
														
 
															+    # 使用自适应阈值
														
 
															+    # 这会根据局部区域计算阈值，保留文字，去除背景和水印
														
 
															+    binary = cv2.adaptiveThreshold(
														
 
															+        gray, 255,
														
 
															+        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
														
 
															+        cv2.THRESH_BINARY,
														
 
															+        blockSize=15,
														
 
															+        C=10
														
 
															+    )
														
 
															+    
														
 
															+    # 转回 BGR（3通道）
														
 
															+    result = cv2.cvtColor(binary, cv2.COLOR_GRAY2BGR)
														
 
															+    
														
 
															+    return result
														
 
															+
														
 
															+
														
 
															+def enhance_for_ocr(
														
 
															+    image_path: str,
														
 
															+    output_path: Optional[str] = None,
														
 
															+    remove_wm: bool = True,
														
 
															+    denoise: bool = True,
														
 
															+    sharpen: bool = False
														
 
															+) -> str:
														
 
															+    """
														
 
															+    OCR 预处理增强
														
 
															+    
														
 
															+    组合多种预处理操作，优化 OCR 识别效果
														
 
															+    
														
 
															+    Args:
														
 
															+        image_path: 输入图片路径
														
 
															+        output_path: 输出图片路径
														
 
															+        remove_wm: 是否去除水印
														
 
															+        denoise: 是否去噪
														
 
															+        sharpen: 是否锐化
														
 
															+    
														
 
															+    Returns:
														
 
															+        处理后的图片路径
														
 
															+    """
														
 
															+    if not CV2_AVAILABLE:
														
 
															+        logger.warning("[OCR预处理] OpenCV 未安装，跳过预处理")
														
 
															+        return image_path
														
 
															+    
														
 
															+    logger.info(f"[OCR预处理] 开始处理: {image_path}")
														
 
															+    
														
 
															+    # 读取图片
														
 
															+    img = cv2.imread(image_path)
														
 
															+    if img is None:
														
 
															+        logger.error(f"[OCR预处理] 无法读取图片: {image_path}")
														
 
															+        return image_path
														
 
															+    
														
 
															+    result = img.copy()
														
 
															+    
														
 
															+    # 1. 去水印
														
 
															+    if remove_wm:
														
 
															+        result = _remove_watermark_hsv(result)
														
 
															+        logger.info("[OCR预处理] 已去除水印")
														
 
															+    
														
 
															+    # 2. 去噪
														
 
															+    if denoise:
														
 
															+        result = cv2.fastNlMeansDenoisingColored(result, None, 10, 10, 7, 21)
														
 
															+        logger.info("[OCR预处理] 已去噪")
														
 
															+    
														
 
															+    # 3. 锐化
														
 
															+    if sharpen:
														
 
															+        kernel = np.array([[-1, -1, -1],
														
 
															+                          [-1,  9, -1],
														
 
															+                          [-1, -1, -1]])
														
 
															+        result = cv2.filter2D(result, -1, kernel)
														
 
															+        logger.info("[OCR预处理] 已锐化")
														
 
															+    
														
 
															+    # 确定输出路径
														
 
															+    if output_path is None:
														
 
															+        path = Path(image_path)
														
 
															+        output_path = str(path.parent / f"{path.stem}_enhanced{path.suffix}")
														
 
															+    
														
 
															+    # 保存结果
														
 
															+    cv2.imwrite(output_path, result)
														
 
															+    logger.info(f"[OCR预处理] 处理完成，保存到: {output_path}")
														
 
															+    
														
 
															+    return output_path
														
 
															+
														
 
															+
														
 
															+def check_opencv_available() -> bool:
														
 
															+    """检查 OpenCV 是否可用"""
														
 
															+    return CV2_AVAILABLE
														
 
															+
														
 
															+
														
 
															+def crop_header_footer(
														
 
															+    image_path: str,
														
 
															+    output_path: Optional[str] = None,
														
 
															+    header_ratio: float = 0.05,
														
 
															+    footer_ratio: float = 0.05,
														
 
															+    auto_detect: bool = False
														
 
															+) -> str:
														
 
															+    """
														
 
															+    裁剪图片的页眉和页脚区域
														
 
															+    
														
 
															+    通过按比例裁剪图片顶部和底部来去除页眉页脚
														
 
															+    
														
 
															+    Args:
														
 
															+        image_path: 输入图片路径
														
 
															+        output_path: 输出图片路径，默认在原文件名后加 _cropped
														
 
															+        header_ratio: 页眉裁剪比例（0-1），默认0.05表示裁剪顶部5%
														
 
															+        footer_ratio: 页脚裁剪比例（0-1），默认0.05表示裁剪底部5%
														
 
															+        auto_detect: 是否自动检测页眉页脚边界（忽略 header_ratio 和 footer_ratio）
														
 
															+    
														
 
															+    Returns:
														
 
															+        处理后的图片路径
														
 
															+    """
														
 
															+    if not CV2_AVAILABLE:
														
 
															+        logger.warning("[裁剪页眉页脚] OpenCV 未安装，跳过处理")
														
 
															+        return image_path
														
 
															+    
														
 
															+    logger.info(f"[裁剪页眉页脚] 开始处理: {image_path}")
														
 
															+    
														
 
															+    # 读取图片
														
 
															+    img = cv2.imread(image_path)
														
 
															+    if img is None:
														
 
															+        logger.error(f"[裁剪页眉页脚] 无法读取图片: {image_path}")
														
 
															+        return image_path
														
 
															+    
														
 
															+    height, width = img.shape[:2]
														
 
															+    logger.info(f"[裁剪页眉页脚] 原始尺寸: {width}x{height}")
														
 
															+    
														
 
															+    if auto_detect:
														
 
															+        # 自动检测页眉页脚边界
														
 
															+        logger.info("[裁剪页眉页脚] 使用自动检测模式")
														
 
															+        header_pixels, footer_pixels = _detect_header_footer_boundaries(img)
														
 
															+        logger.info(f"[裁剪页眉页脚] 自动检测结果: 页眉={header_pixels}px, 页脚={footer_pixels}px")
														
 
															+    else:
														
 
															+        # 使用固定比例
														
 
															+        logger.info(f"[裁剪页眉页脚] 使用固定比例: 页眉={header_ratio}, 页脚={footer_ratio}")
														
 
															+        header_pixels = int(height * header_ratio)
														
 
															+        footer_pixels = int(height * footer_ratio)
														
 
															+    
														
 
															+    # 裁剪图片（保留中间部分）
														
 
															+    top = header_pixels
														
 
															+    bottom = height - footer_pixels
														
 
															+    
														
 
															+    if top >= bottom:
														
 
															+        logger.warning("[裁剪页眉页脚] 裁剪区域无效，跳过处理")
														
 
															+        return image_path
														
 
															+    
														
 
															+    result = img[top:bottom, :]
														
 
															+    
														
 
															+    new_height = result.shape[0]
														
 
															+    logger.info(f"[裁剪页眉页脚] 裁剪后尺寸: {width}x{new_height}")
														
 
															+    logger.info(f"[裁剪页眉页脚] 裁剪了顶部 {header_pixels}px，底部 {footer_pixels}px")
														
 
															+    
														
 
															+    # 确定输出路径
														
 
															+    if output_path is None:
														
 
															+        path = Path(image_path)
														
 
															+        output_path = str(path.parent / f"{path.stem}_cropped{path.suffix}")
														
 
															+    
														
 
															+    # 保存结果
														
 
															+    cv2.imwrite(output_path, result)
														
 
															+    logger.info(f"[裁剪页眉页脚] 处理完成，保存到: {output_path}")
														
 
															+    
														
 
															+    return output_path
														
 
															+
														
 
															+
														
 
															+def _detect_header_footer_boundaries(img: np.ndarray) -> Tuple[int, int]:
														
 
															+    """
														
 
															+    自动检测页眉页脚边界
														
 
															+    
														
 
															+    使用多种方法综合判断：
														
 
															+    1. 水平线检测 - 检测分隔线
														
 
															+    2. 文本密度分析 - 页眉页脚通常文字较少
														
 
															+    3. 空白区域检测 - 检测大面积空白
														
 
															+    
														
 
															+    Args:
														
 
															+        img: 输入图片（BGR格式）
														
 
															+    
														
 
															+    Returns:
														
 
															+        (header_pixels, footer_pixels): 页眉和页脚的像素高度
														
 
															+    """
														
 
															+    height, width = img.shape[:2]
														
 
															+    
														
 
															+    # 转为灰度图
														
 
															+    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
														
 
															+    
														
 
															+    # 定义搜索范围（页眉页脚通常在顶部/底部 15% 以内）
														
 
															+    search_range = int(height * 0.15)
														
 
															+    min_margin = int(height * 0.02)  # 最小边距 2%
														
 
															+    
														
 
															+    # 方法1: 检测水平线
														
 
															+    header_line = _find_horizontal_line(gray, 0, search_range, from_top=True)
														
 
															+    footer_line = _find_horizontal_line(gray, height - search_range, height, from_top=False)
														
 
															+    
														
 
															+    # 方法2: 分析文本密度变化
														
 
															+    header_density = _find_content_boundary(gray, 0, search_range, from_top=True)
														
 
															+    footer_density = _find_content_boundary(gray, height - search_range, height, from_top=False)
														
 
															+    
														
 
															+    # 综合判断：取最可靠的结果
														
 
															+    # 优先使用水平线检测结果，其次使用密度分析结果
														
 
															+    if header_line > min_margin:
														
 
															+        header_pixels = header_line
														
 
															+        logger.debug(f"[自动检测] 页眉: 使用水平线检测结果 {header_pixels}px")
														
 
															+    elif header_density > min_margin:
														
 
															+        header_pixels = header_density
														
 
															+        logger.debug(f"[自动检测] 页眉: 使用密度分析结果 {header_pixels}px")
														
 
															+    else:
														
 
															+        header_pixels = min_margin
														
 
															+        logger.debug(f"[自动检测] 页眉: 使用最小边距 {header_pixels}px")
														
 
															+    
														
 
															+    if footer_line > min_margin:
														
 
															+        footer_pixels = footer_line
														
 
															+        logger.debug(f"[自动检测] 页脚: 使用水平线检测结果 {footer_pixels}px")
														
 
															+    elif footer_density > min_margin:
														
 
															+        footer_pixels = footer_density
														
 
															+        logger.debug(f"[自动检测] 页脚: 使用密度分析结果 {footer_pixels}px")
														
 
															+    else:
														
 
															+        footer_pixels = min_margin
														
 
															+        logger.debug(f"[自动检测] 页脚: 使用最小边距 {footer_pixels}px")
														
 
															+    
														
 
															+    return header_pixels, footer_pixels
														
 
															+
														
 
															+
														
 
															+def _find_horizontal_line(
														
 
															+    gray: np.ndarray,
														
 
															+    start_y: int,
														
 
															+    end_y: int,
														
 
															+    from_top: bool = True
														
 
															+) -> int:
														
 
															+    """
														
 
															+    在指定区域内查找水平分隔线
														
 
															+    
														
 
															+    Args:
														
 
															+        gray: 灰度图
														
 
															+        start_y: 搜索起始y坐标
														
 
															+        end_y: 搜索结束y坐标
														
 
															+        from_top: True表示从上往下找，False表示从下往上找
														
 
															+    
														
 
															+    Returns:
														
 
															+        分隔线位置（像素），如果没找到返回0
														
 
															+    """
														
 
															+    height, width = gray.shape
														
 
															+    
														
 
															+    # 使用 Canny 边缘检测
														
 
															+    edges = cv2.Canny(gray[start_y:end_y, :], 50, 150)
														
 
															+    
														
 
															+    # 使用霍夫变换检测直线
														
 
															+    lines = cv2.HoughLinesP(
														
 
															+        edges,
														
 
															+        rho=1,
														
 
															+        theta=np.pi/180,
														
 
															+        threshold=int(width * 0.5),  # 线长度至少为图片宽度的50%
														
 
															+        minLineLength=int(width * 0.4),
														
 
															+        maxLineGap=20
														
 
															+    )
														
 
															+    
														
 
															+    if lines is None:
														
 
															+        return 0
														
 
															+    
														
 
															+    # 筛选水平线（角度接近0或180度）
														
 
															+    horizontal_lines = []
														
 
															+    for line in lines:
														
 
															+        x1, y1, x2, y2 = line[0]
														
 
															+        # 计算角度
														
 
															+        angle = abs(np.arctan2(y2 - y1, x2 - x1) * 180 / np.pi)
														
 
															+        # 水平线角度应该接近 0 或 180
														
 
															+        if angle < 5 or angle > 175:
														
 
															+            avg_y = (y1 + y2) // 2 + start_y
														
 
															+            horizontal_lines.append(avg_y)
														
 
															+    
														
 
															+    if not horizontal_lines:
														
 
															+        return 0
														
 
															+    
														
 
															+    # 根据方向返回最合适的线
														
 
															+    if from_top:
														
 
															+        # 从上往下，返回最下面的水平线（作为页眉下边界）
														
 
															+        return max(horizontal_lines)
														
 
															+    else:
														
 
															+        # 从下往上，返回距离底部的距离
														
 
															+        return height - min(horizontal_lines)
														
 
															+
														
 
															+
														
 
															+def _find_content_boundary(
														
 
															+    gray: np.ndarray,
														
 
															+    start_y: int,
														
 
															+    end_y: int,
														
 
															+    from_top: bool = True
														
 
															+) -> int:
														
 
															+    """
														
 
															+    通过分析文本/内容密度找到内容边界
														
 
															+    
														
 
															+    原理：页眉页脚区域通常是空白或只有少量文字，
														
 
															+    正文区域文字密度较高。通过检测密度突变点来确定边界。
														
 
															+    
														
 
															+    Args:
														
 
															+        gray: 灰度图
														
 
															+        start_y: 搜索起始y坐标
														
 
															+        end_y: 搜索结束y坐标
														
 
															+        from_top: True表示从上往下找，False表示从下往上找
														
 
															+    
														
 
															+    Returns:
														
 
															+        内容边界位置（像素），如果没找到返回0
														
 
															+    """
														
 
															+    height, width = gray.shape
														
 
															+    
														
 
															+    # 二值化
														
 
															+    _, binary = cv2.threshold(gray, 200, 255, cv2.THRESH_BINARY_INV)
														
 
															+    
														
 
															+    # 计算每一行的像素密度（黑色像素占比）
														
 
															+    row_densities = []
														
 
															+    for y in range(start_y, end_y):
														
 
															+        row = binary[y, :]
														
 
															+        density = np.sum(row > 0) / width
														
 
															+        row_densities.append((y, density))
														
 
															+    
														
 
															+    if not row_densities:
														
 
															+        return 0
														
 
															+    
														
 
															+    # 使用滑动窗口平滑密度曲线
														
 
															+    window_size = 10
														
 
															+    smoothed = []
														
 
															+    for i in range(len(row_densities)):
														
 
															+        start = max(0, i - window_size // 2)
														
 
															+        end = min(len(row_densities), i + window_size // 2)
														
 
															+        avg_density = sum(d[1] for d in row_densities[start:end]) / (end - start)
														
 
															+        smoothed.append((row_densities[i][0], avg_density))
														
 
															+    
														
 
															+    # 找到密度突变点
														
 
															+    # 定义阈值：当密度从低于 0.01 变化到高于 0.02 时，认为进入正文区域
														
 
															+    low_threshold = 0.005
														
 
															+    high_threshold = 0.02
														
 
															+    
														
 
															+    if from_top:
														
 
															+        # 从上往下，找到第一个连续高密度区域的起始位置
														
 
															+        in_content = False
														
 
															+        content_start = 0
														
 
															+        consecutive_high = 0
														
 
															+        
														
 
															+        for y, density in smoothed:
														
 
															+            if density > high_threshold:
														
 
															+                consecutive_high += 1
														
 
															+                if consecutive_high >= 5 and not in_content:
														
 
															+                    # 连续5行高密度，认为进入正文
														
 
															+                    in_content = True
														
 
															+                    content_start = y - 5  # 往上回退一点
														
 
															+                    break
														
 
															+            else:
														
 
															+                consecutive_high = 0
														
 
															+        
														
 
															+        return max(0, content_start - start_y)
														
 
															+    else:
														
 
															+        # 从下往上，找到最后一个连续高密度区域的结束位置
														
 
															+        in_content = False
														
 
															+        content_end = height
														
 
															+        consecutive_high = 0
														
 
															+        
														
 
															+        for y, density in reversed(smoothed):
														
 
															+            if density > high_threshold:
														
 
															+                consecutive_high += 1
														
 
															+                if consecutive_high >= 5 and not in_content:
														
 
															+                    in_content = True
														
 
															+                    content_end = y + 5
														
 
															+                    break
														
 
															+            else:
														
 
															+                consecutive_high = 0
														
 
															+        
														
 
															+        return max(0, height - content_end)
														
--- a/pdf_converter_v2/utils/pdf_watermark_remover.py
+++ b/pdf_converter_v2/utils/pdf_watermark_remover.py
@@ -0,0 +1,122 @@
 
															+#!/usr/bin/env python3
														
 
															+# -*- coding: utf-8 -*-
														
 
															+"""
														
 
															+PDF去水印工具
														
 
															+将PDF转换为图片，去除水印后再转回PDF
														
 
															+"""
														
 
															+
														
 
															+from pathlib import Path
														
 
															+from typing import Optional
														
 
															+import tempfile
														
 
															+import shutil
														
 
															+
														
 
															+def remove_watermark_from_pdf(
														
 
															+    input_pdf: str,
														
 
															+    output_pdf: str,
														
 
															+    light_threshold: int = 200,
														
 
															+    saturation_threshold: int = 30,
														
 
															+    dpi: int = 200
														
 
															+) -> bool:
														
 
															+    """
														
 
															+    对PDF文件进行去水印处理
														
 
															+    
														
 
															+    处理流程：
														
 
															+    1. 将PDF的每一页转换为图片
														
 
															+    2. 对每张图片进行去水印处理
														
 
															+    3. 将处理后的图片合并为新的PDF
														
 
															+    
														
 
															+    Args:
														
 
															+        input_pdf: 输入PDF文件路径
														
 
															+        output_pdf: 输出PDF文件路径
														
 
															+        light_threshold: 水印亮度阈值（0-255），高于此值的浅色像素可能是水印
														
 
															+        saturation_threshold: 水印饱和度阈值（0-255），低于此值的低饱和度像素可能是水印
														
 
															+        dpi: PDF转图片的DPI，影响图片质量和处理速度
														
 
															+    
														
 
															+    Returns:
														
 
															+        bool: 是否成功
														
 
															+    """
														
 
															+    try:
														
 
															+        # 导入必要的库
														
 
															+        from pdf2image import convert_from_path
														
 
															+        from PIL import Image
														
 
															+        import PyPDF2
														
 
															+        from utils.image_preprocessor import remove_watermark, check_opencv_available
														
 
															+        
														
 
															+        # 检查OpenCV是否可用
														
 
															+        if not check_opencv_available():
														
 
															+            print("⚠ OpenCV 未安装，无法进行去水印处理")
														
 
															+            return False
														
 
															+        
														
 
															+        # 创建临时目录
														
 
															+        temp_dir = tempfile.mkdtemp(prefix="pdf_watermark_")
														
 
															+        temp_path = Path(temp_dir)
														
 
															+        
														
 
															+        try:
														
 
															+            print(f"正在将PDF转换为图片（DPI={dpi}）...")
														
 
															+            # 将PDF转换为图片
														
 
															+            images = convert_from_path(input_pdf, dpi=dpi)
														
 
															+            print(f"✓ 转换完成，共 {len(images)} 页")
														
 
															+            
														
 
															+            # 处理每一页
														
 
															+            processed_images = []
														
 
															+            for i, image in enumerate(images, 1):
														
 
															+                print(f"处理第 {i}/{len(images)} 页...", end='\r')
														
 
															+                
														
 
															+                # 保存原始图片
														
 
															+                original_path = temp_path / f"page_{i}_original.png"
														
 
															+                image.save(str(original_path), "PNG")
														
 
															+                
														
 
															+                # 去水印
														
 
															+                nowm_path = temp_path / f"page_{i}_nowm.png"
														
 
															+                processed_path = remove_watermark(
														
 
															+                    str(original_path),
														
 
															+                    output_path=str(nowm_path),
														
 
															+                    light_threshold=light_threshold,
														
 
															+                    saturation_threshold=saturation_threshold,
														
 
															+                    method="hsv"
														
 
															+                )
														
 
															+                
														
 
															+                # 加载处理后的图片
														
 
															+                processed_img = Image.open(processed_path)
														
 
															+                processed_images.append(processed_img)
														
 
															+            
														
 
															+            print(f"\n✓ 所有页面处理完成")
														
 
															+            
														
 
															+            # 将图片合并为PDF
														
 
															+            print("正在生成PDF...")
														
 
															+            if processed_images:
														
 
															+                # 第一张图片作为主图片
														
 
															+                first_image = processed_images[0]
														
 
															+                # 其余图片作为附加页
														
 
															+                other_images = processed_images[1:] if len(processed_images) > 1 else []
														
 
															+                
														
 
															+                # 保存为PDF
														
 
															+                first_image.save(
														
 
															+                    output_pdf,
														
 
															+                    "PDF",
														
 
															+                    resolution=dpi,
														
 
															+                    save_all=True,
														
 
															+                    append_images=other_images
														
 
															+                )
														
 
															+                print(f"✓ PDF生成完成: {output_pdf}")
														
 
															+                return True
														
 
															+            else:
														
 
															+                print("⚠ 没有处理任何图片")
														
 
															+                return False
														
 
															+                
														
 
															+        finally:
														
 
															+            # 清理临时目录
														
 
															+            try:
														
 
															+                shutil.rmtree(temp_dir)
														
 
															+            except Exception as e:
														
 
															+                print(f"⚠ 清理临时目录失败: {e}")
														
 
															+    
														
 
															+    except ImportError as e:
														
 
															+        print(f"⚠ 缺少必要的库: {e}")
														
 
															+        print("请安装: pip install pdf2image pillow PyPDF2 opencv-python")
														
 
															+        return False
														
 
															+    except Exception as e:
														
 
															+        print(f"⚠ 去水印处理失败: {e}")
														
 
															+        import traceback
														
 
															+        traceback.print_exc()
														
 
															+        return False
														
--- a/start_mineru_in_container.sh
+++ b/start_mineru_in_container.sh
@@ -0,0 +1,15 @@
 
															+#!/usr/bin/env bash
														
 
															+# 在 Docker 容器内启动 MinerU file_parse API（容器内无 systemd，用本脚本代替 systemd 服务）
														
 
															+# 使用：bash start_mineru_in_container.sh  或  nohup bash start_mineru_in_container.sh &
														
 
															+# 工作目录：/root/work/Clerk2.5（可通过 CLERK_ROOT 覆盖）
														
 
															+
														
 
															+set -e
														
 
															+CLERK_ROOT="${CLERK_ROOT:-/root/work/Clerk2.5}"
														
 
															+PORT="${MINERU_PORT:-5282}"
														
 
															+
														
 
															+# NPU/容器内需预加载 libgomp，避免 static TLS 报错（路径以常见 dist-packages 为准，可按本机修改）
														
 
															+export LD_PRELOAD="${LD_PRELOAD:-/usr/local/lib/python3.10/dist-packages/simsimd.libs/libgomp-a49a47f9.so.1.0.0:/usr/local/lib/python3.10/dist-packages/scikit_learn.libs/libgomp-d22c30c5.so.1.0.0}"
														
 
															+export PYTHONPATH="${CLERK_ROOT}"
														
 
															+
														
 
															+cd "$CLERK_ROOT"
														
 
															+exec python3 -m uvicorn mineru.cli.fast_api:app --host 0.0.0.0 --port "$PORT"