3 săptămâni în urmă · d6d3fedfcc
--- a/mineru/cli/fast_api.py
+++ b/mineru/cli/fast_api.py
@@ -16,6 +16,11 @@ from typing import List, Optional
 
				 from loguru import logger
			
 
				 from base64 import b64encode
			
 
				 
			
 
				+# NumPy 1.24+ removed np.complex; librosa (via transformers) still uses it in constantq.py
			
 
				+import numpy as _np
			
 
				+if not hasattr(_np, "complex"):
			
 
				+    _np.complex = _np.complex128  # type: ignore[attr-defined]
			
 
				+
			
 
				 from mineru.cli.common import aio_do_parse, read_fn, pdf_suffixes, image_suffixes
			
 
				 from mineru.utils.cli_parser import arg_parse
			
 
				 from mineru.utils.guess_suffix_or_lang import guess_suffix_by_path
			
--- a/mineru/model/mfr/unimernet/unimernet_hf/modeling_unimernet.py
+++ b/mineru/model/mfr/unimernet/unimernet_hf/modeling_unimernet.py
@@ -7,8 +7,11 @@ from ftfy import fix_text
 
				 from loguru import logger
			
 
				 
			
 
				 from transformers import AutoConfig, AutoModel, AutoModelForCausalLM, AutoTokenizer, PretrainedConfig, PreTrainedModel
			
 
				-from transformers import VisionEncoderDecoderConfig, VisionEncoderDecoderModel
			
 
				-from transformers.models.vision_encoder_decoder.modeling_vision_encoder_decoder import logger as base_model_logger
			
 
				+from transformers.models.vision_encoder_decoder.configuration_vision_encoder_decoder import VisionEncoderDecoderConfig
			
 
				+from transformers.models.vision_encoder_decoder.modeling_vision_encoder_decoder import (
			
 
				+    VisionEncoderDecoderModel,
			
 
				+    logger as base_model_logger,
			
 
				+)
			
 
				 
			
 
				 from .unimer_swin import UnimerSwinConfig, UnimerSwinModel, UnimerSwinImageProcessor
			
 
				 from .unimer_mbart import UnimerMBartConfig, UnimerMBartForCausalLM
			
--- a/mineru/requirements-paddle-npu.txt
+++ b/mineru/requirements-paddle-npu.txt
@@ -0,0 +1,19 @@
 
				+# MinerU API 在 Paddle NPU 容器内运行 /file_parse 所需的最小依赖
			
 
				+# 容器内已安装 paddle/paddlex，此处不重复安装
			
 
				+# pipeline 依赖链 (transformers→librosa) 中 librosa 使用 np.complex；该别名自 NumPy 1.24 起已移除（并非 2.0），仅限制 numpy<2 无法恢复该别名，需配合 mineru/cli/fast_api.py 中的 np.complex 兼容补丁
			
 
				+numpy<2
			
 
				+
			
 
				+# Web 与 CLI
			
 
				+fastapi>=0.100.0
			
 
				+uvicorn[standard]>=0.23.0
			
 
				+click>=8.0.0
			
 
				+python-multipart>=0.0.6
			
 
				+
			
 
				+# 日志
			
 
				+loguru>=0.7.0
			
 
				+
			
 
				+# PDF 处理
			
 
				+pypdfium2>=4.0.0
			
 
				+
			
 
				+# transformers 依赖链 (LayoutLMv3 → processing_utils → audio_utils) 需要 soxr
			
 
				+soxr>=0.3.0
			
--- a/pdf_converter_v2/api/main.py
+++ b/pdf_converter_v2/api/main.py
@@ -150,6 +150,16 @@ class ConversionRequest(BaseModel):
 
				     """转换请求模型（v2 精简版）"""
			
 
				     # 新增：强制文档类型（正式全称）
			
 
				     doc_type: Optional[str] = None
			
 
				+    # 新增：去水印参数
			
 
				+    remove_watermark: Optional[bool] = False
			
 
				+    watermark_light_threshold: Optional[int] = 200
			
 
				+    watermark_saturation_threshold: Optional[int] = 30
			
 
				+    crop_header_footer: Optional[bool] = False
			
 
				+    header_ratio: Optional[float] = 0.05
			
 
				+    footer_ratio: Optional[float] = 0.05
			
 
				+    auto_detect_header_footer: Optional[bool] = False
			
 
				+    # 新增：附件页切割参数
			
 
				+    table_only: Optional[bool] = False  # 是否只保留包含表格的附件页（默认False）
			
 
				 
			
 
				 
			
 
				 class ConversionResponse(BaseModel):
			
@@ -200,6 +210,13 @@ class OCRRequest(BaseModel):
 
				     """OCR识别请求模型"""
			
 
				     image_base64: str  # base64编码的图片数据
			
 
				     image_format: Optional[str] = "png"  # 图片格式：png, jpg, jpeg
			
 
				+    remove_watermark: Optional[bool] = False  # 是否去除水印
			
 
				+    watermark_light_threshold: Optional[int] = 200  # 水印亮度阈值（0-255），高于此值的浅色像素可能是水印
			
 
				+    watermark_saturation_threshold: Optional[int] = 30  # 水印饱和度阈值（0-255），低于此值的低饱和度像素可能是水印
			
 
				+    crop_header_footer: Optional[bool] = False  # 是否裁剪页眉页脚
			
 
				+    header_ratio: Optional[float] = 0.05  # 页眉裁剪比例（0-1），默认5%
			
 
				+    footer_ratio: Optional[float] = 0.05  # 页脚裁剪比例（0-1），默认5%
			
 
				+    auto_detect_header_footer: Optional[bool] = False  # 是否自动检测页眉页脚边界
			
 
				 
			
 
				 
			
 
				 class OCRResponse(BaseModel):
			
@@ -308,11 +325,108 @@ async def process_conversion_task(
 
				         
			
 
				         logger.info(f"[任务 {task_id}] 开始处理: {file_path}")
			
 
				         
			
 
				+        # 文件预处理（支持图片和PDF）
			
 
				+        from pathlib import Path as PathLib
			
 
				+        file_suffix = PathLib(file_path).suffix.lower()
			
 
				+        is_image = file_suffix in ['.png', '.jpg', '.jpeg', '.bmp', '.tiff', '.tif']
			
 
				+        is_pdf = file_suffix == '.pdf'
			
 
				+        
			
 
				+        # 图片预处理：去水印或裁剪页眉页脚
			
 
				+        if is_image and (request.remove_watermark or request.crop_header_footer):
			
 
				+            logger.info(f"[任务 {task_id}] 检测到图片文件，开始预处理...")
			
 
				+            preprocessed_path = file_path
			
 
				+            
			
 
				+            # 裁剪页眉页脚
			
 
				+            if request.crop_header_footer:
			
 
				+                try:
			
 
				+                    from ..utils.image_preprocessor import crop_header_footer, check_opencv_available
			
 
				+                    
			
 
				+                    if check_opencv_available():
			
 
				+                        if request.auto_detect_header_footer:
			
 
				+                            logger.info(f"[任务 {task_id}] 开始自动检测并裁剪页眉页脚")
			
 
				+                        else:
			
 
				+                            logger.info(f"[任务 {task_id}] 开始裁剪页眉页脚，顶部比例: {request.header_ratio}, 底部比例: {request.footer_ratio}")
			
 
				+                        
			
 
				+                        # 裁剪后的图片路径
			
 
				+                        cropped_path = str(PathLib(output_dir) / f"preprocessed_cropped{file_suffix}")
			
 
				+                        
			
 
				+                        preprocessed_path = await asyncio.to_thread(
			
 
				+                            crop_header_footer,
			
 
				+                            preprocessed_path,
			
 
				+                            output_path=cropped_path,
			
 
				+                            header_ratio=request.header_ratio or 0.05,
			
 
				+                            footer_ratio=request.footer_ratio or 0.05,
			
 
				+                            auto_detect=request.auto_detect_header_footer or False
			
 
				+                        )
			
 
				+                        logger.info(f"[任务 {task_id}] 裁剪页眉页脚完成: {preprocessed_path}")
			
 
				+                    else:
			
 
				+                        logger.warning(f"[任务 {task_id}] OpenCV 未安装，跳过裁剪页眉页脚")
			
 
				+                except Exception as e:
			
 
				+                    logger.warning(f"[任务 {task_id}] 裁剪页眉页脚失败，使用原图继续: {e}")
			
 
				+            
			
 
				+            # 去水印
			
 
				+            if request.remove_watermark:
			
 
				+                try:
			
 
				+                    from ..utils.image_preprocessor import remove_watermark, check_opencv_available
			
 
				+                    
			
 
				+                    if check_opencv_available():
			
 
				+                        logger.info(f"[任务 {task_id}] 开始去水印处理，亮度阈值: {request.watermark_light_threshold}, 饱和度阈值: {request.watermark_saturation_threshold}")
			
 
				+                        
			
 
				+                        # 去水印后的图片路径
			
 
				+                        nowm_path = str(PathLib(output_dir) / f"preprocessed_nowm{file_suffix}")
			
 
				+                        
			
 
				+                        preprocessed_path = await asyncio.to_thread(
			
 
				+                            remove_watermark,
			
 
				+                            preprocessed_path,
			
 
				+                            output_path=nowm_path,
			
 
				+                            light_threshold=request.watermark_light_threshold or 200,
			
 
				+                            saturation_threshold=request.watermark_saturation_threshold or 30,
			
 
				+                            method="hsv"
			
 
				+                        )
			
 
				+                        logger.info(f"[任务 {task_id}] 去水印完成: {preprocessed_path}")
			
 
				+                    else:
			
 
				+                        logger.warning(f"[任务 {task_id}] OpenCV 未安装，跳过去水印处理")
			
 
				+                except Exception as e:
			
 
				+                    logger.warning(f"[任务 {task_id}] 去水印处理失败，使用原图继续: {e}")
			
 
				+            
			
 
				+            # 更新文件路径为预处理后的路径
			
 
				+            if preprocessed_path != file_path:
			
 
				+                file_path = preprocessed_path
			
 
				+                logger.info(f"[任务 {task_id}] 图片预处理完成，使用预处理后的文件: {file_path}")
			
 
				+        
			
 
				+        # PDF预处理：去水印
			
 
				+        elif is_pdf and request.remove_watermark:
			
 
				+            logger.info(f"[任务 {task_id}] 检测到PDF文件，开始去水印预处理...")
			
 
				+            try:
			
 
				+                from ..utils.pdf_watermark_remover import remove_watermark_from_pdf
			
 
				+                
			
 
				+                # 去水印后的PDF路径
			
 
				+                nowm_pdf_path = str(PathLib(output_dir) / f"preprocessed_nowm.pdf")
			
 
				+                
			
 
				+                # 执行去水印
			
 
				+                logger.info(f"[任务 {task_id}] 开始PDF去水印处理，亮度阈值: {request.watermark_light_threshold}, 饱和度阈值: {request.watermark_saturation_threshold}")
			
 
				+                success = await asyncio.to_thread(
			
 
				+                    remove_watermark_from_pdf,
			
 
				+                    input_pdf=file_path,
			
 
				+                    output_pdf=nowm_pdf_path,
			
 
				+                    light_threshold=request.watermark_light_threshold or 200,
			
 
				+                    saturation_threshold=request.watermark_saturation_threshold or 30,
			
 
				+                    dpi=200  # PDF转图片的DPI
			
 
				+                )
			
 
				+                
			
 
				+                if success and PathLib(nowm_pdf_path).exists():
			
 
				+                    file_path = nowm_pdf_path
			
 
				+                    logger.info(f"[任务 {task_id}] PDF去水印完成: {file_path}")
			
 
				+                else:
			
 
				+                    logger.warning(f"[任务 {task_id}] PDF去水印失败，使用原PDF继续")
			
 
				+            except Exception as e:
			
 
				+                logger.warning(f"[任务 {task_id}] PDF去水印处理失败，使用原PDF继续: {e}")
			
 
				+        
			
 
				         result = None
			
 
				         tables_info = None
			
 
				         
			
 
				         # 针对投资估算类型，需要先切割附件页
			
 
				-        if request.doc_type in ("fsApproval", "fsReview", "pdApproval"):
			
 
				+        if request.doc_type in ("fsApproval", "fsReview", "pdApproval", "safetyFsApproval"):
			
 
				             logger.info(f"[任务 {task_id}] 文档类型 {request.doc_type}，需要先切割附件页")
			
 
				             
			
 
				             # 导入附件页切割函数
			
@@ -328,18 +442,21 @@ async def process_conversion_task(
 
				                 attachment_dir = PathLib(output_dir) / "attachments"
			
 
				                 attachment_dir.mkdir(parents=True, exist_ok=True)
			
 
				                 
			
 
				-                # 切割附件页
			
 
				-                logger.info(f"[任务 {task_id}] 开始切割附件页，输出目录: {attachment_dir}")
			
 
				+                # 切割附件页（根据 table_only 参数决定是否过滤非表格内容）
			
 
				+                logger.info(f"[任务 {task_id}] 开始切割附件页（table_only={request.table_only}），输出目录: {attachment_dir}")
			
 
				                 await asyncio.to_thread(
			
 
				                     split_attachment_pages,
			
 
				                     file_path,
			
 
				                     attachment_dir,
			
 
				                     use_ocr=True,
			
 
				-                    debug=False
			
 
				+                    debug=False,
			
 
				+                    table_only=request.table_only  # 是否只保留包含表格的附件页
			
 
				                 )
			
 
				                 
			
 
				-                # 查找切割后的附件页PDF
			
 
				-                attachment_pdfs = list(attachment_dir.glob("*_附件页_*.pdf"))
			
 
				+                # 查找切割后的附件页PDF（优先使用表格附件页，其次使用普通附件页）
			
 
				+                attachment_pdfs = list(attachment_dir.glob("*_表格附件页_*.pdf"))
			
 
				+                if not attachment_pdfs:
			
 
				+                    attachment_pdfs = list(attachment_dir.glob("*_附件页_*.pdf"))
			
 
				                 logger.info(f"[任务 {task_id}] 附件页目录内容: {list(attachment_dir.iterdir()) if attachment_dir.exists() else '(目录不存在)'}")
			
 
				                 
			
 
				                 if attachment_pdfs:
			
@@ -532,11 +649,44 @@ async def process_conversion_task(
 
				 @app.post("/convert", response_model=ConversionResponse)
			
 
				 async def convert_file(
			
 
				     file: Annotated[UploadFile, File(description="上传的PDF或图片文件")],
			
 
				-    # 新增：类型参数（英文传参） noiseRec | emRec | opStatus | settlementReport | designReview | fsApproval | fsReview | pdApproval | finalAccount
			
 
				+    # 新增：类型参数（英文传参） noiseRec | emRec | opStatus | settlementReport | designReview | fsApproval | fsReview | pdApproval | finalAccount | safetyFsApproval
			
 
				     type: Annotated[
			
 
				-        Optional[Literal["noiseRec", "emRec", "opStatus", "settlementReport", "designReview", "fsApproval", "fsReview", "pdApproval", "finalAccount"]],
			
 
				-        Form(description="文档类型：noiseRec | emRec | opStatus | settlementReport | designReview | fsApproval | fsReview | pdApproval | finalAccount")
			
 
				+        Optional[Literal["noiseRec", "emRec", "opStatus", "settlementReport", "designReview", "fsApproval", "fsReview", "pdApproval", "finalAccount", "safetyFsApproval"]],
			
 
				+        Form(description="文档类型：noiseRec | emRec | opStatus | settlementReport | designReview | fsApproval | fsReview | pdApproval | finalAccount | safetyFsApproval")
			
 
				     ] = None,
			
 
				+    # 新增：去水印参数
			
 
				+    remove_watermark: Annotated[
			
 
				+        Optional[bool],
			
 
				+        Form(description="是否去除水印，默认为false")
			
 
				+    ] = False,
			
 
				+    watermark_light_threshold: Annotated[
			
 
				+        Optional[int],
			
 
				+        Form(description="水印亮度阈值（0-255），默认200，高于此值的浅色像素可能是水印")
			
 
				+    ] = 200,
			
 
				+    watermark_saturation_threshold: Annotated[
			
 
				+        Optional[int],
			
 
				+        Form(description="水印饱和度阈值（0-255），默认30，低于此值的低饱和度像素可能是水印")
			
 
				+    ] = 30,
			
 
				+    crop_header_footer: Annotated[
			
 
				+        Optional[bool],
			
 
				+        Form(description="是否裁剪页眉页脚，默认为false")
			
 
				+    ] = False,
			
 
				+    header_ratio: Annotated[
			
 
				+        Optional[float],
			
 
				+        Form(description="页眉裁剪比例（0-1），默认0.05表示裁剪顶部5%")
			
 
				+    ] = 0.05,
			
 
				+    footer_ratio: Annotated[
			
 
				+        Optional[float],
			
 
				+        Form(description="页脚裁剪比例（0-1），默认0.05表示裁剪底部5%")
			
 
				+    ] = 0.05,
			
 
				+    auto_detect_header_footer: Annotated[
			
 
				+        Optional[bool],
			
 
				+        Form(description="是否自动检测页眉页脚边界，默认为false（启用后忽略header_ratio和footer_ratio）")
			
 
				+    ] = False,
			
 
				+    table_only: Annotated[
			
 
				+        Optional[bool],
			
 
				+        Form(description="是否只保留包含表格的附件页，默认为false")
			
 
				+    ] = False,
			
 
				 ):
			
 
				     """
			
 
				     转换PDF/图片文件（异步处理）
			
@@ -557,6 +707,16 @@ async def convert_file(
 
				       * fsApproval - 可研批复投资估算
			
 
				       * fsReview - 可研评审投资估算
			
 
				       * pdApproval - 初设批复概算投资
			
 
				+      * finalAccount - 决算报告
			
 
				+      * safetyFsApproval - 安评可研批复投资估算
			
 
				+    - **remove_watermark**: 是否去除水印（对图片和PDF均有效），默认为false
			
 
				+    - **watermark_light_threshold**: 水印亮度阈值（0-255），默认200
			
 
				+    - **watermark_saturation_threshold**: 水印饱和度阈值（0-255），默认30
			
 
				+    - **crop_header_footer**: 是否裁剪页眉页脚（仅对图片有效），默认为false
			
 
				+    - **header_ratio**: 页眉裁剪比例（0-1），默认0.05
			
 
				+    - **footer_ratio**: 页脚裁剪比例（0-1），默认0.05
			
 
				+    - **auto_detect_header_footer**: 是否自动检测页眉页脚边界，默认为false
			
 
				+    - **table_only**: 是否只保留包含表格的附件页，默认为false
			
 
				     
			
 
				     注意：v2 版本内部使用外部API进行转换，v2特有的配置参数（如API URL、backend等）
			
 
				     通过环境变量或配置文件设置，不通过API参数传入。
			
@@ -677,6 +837,8 @@ async def convert_file(
 
				         "pdApproval": "pdApproval",
			
 
				         # 决算报告
			
 
				         "finalAccount": "finalAccount",
			
 
				+        # 安评类
			
 
				+        "safetyFsApproval": "safetyFsApproval",
			
 
				     }
			
 
				     doc_type = None
			
 
				     if type:
			
@@ -692,6 +854,14 @@ async def convert_file(
 
				     # 创建请求对象（v2 精简）
			
 
				     request = ConversionRequest(
			
 
				         doc_type=doc_type,
			
 
				+        remove_watermark=remove_watermark,
			
 
				+        watermark_light_threshold=watermark_light_threshold,
			
 
				+        watermark_saturation_threshold=watermark_saturation_threshold,
			
 
				+        crop_header_footer=crop_header_footer,
			
 
				+        header_ratio=header_ratio,
			
 
				+        footer_ratio=footer_ratio,
			
 
				+        auto_detect_header_footer=auto_detect_header_footer,
			
 
				+        table_only=table_only,
			
 
				     )
			
 
				     
			
 
				     # 使用 asyncio.create_task 创建后台任务，确保立即返回
			
@@ -875,6 +1045,13 @@ async def ocr_image(request: OCRRequest):
 
				     
			
 
				     - **image_base64**: base64编码的图片数据（可以包含data:image/xxx;base64,前缀）
			
 
				     - **image_format**: 图片格式（png, jpg, jpeg），默认为png
			
 
				+    - **remove_watermark**: 是否去除水印，默认为false
			
 
				+    - **watermark_light_threshold**: 水印亮度阈值（0-255），默认200，高于此值的浅色像素可能是水印
			
 
				+    - **watermark_saturation_threshold**: 水印饱和度阈值（0-255），默认30，低于此值的低饱和度像素可能是水印
			
 
				+    - **crop_header_footer**: 是否裁剪页眉页脚，默认为false
			
 
				+    - **header_ratio**: 页眉裁剪比例（0-1），默认0.05表示裁剪顶部5%
			
 
				+    - **footer_ratio**: 页脚裁剪比例（0-1），默认0.05表示裁剪底部5%
			
 
				+    - **auto_detect_header_footer**: 是否自动检测页眉页脚边界，默认为false（启用后忽略header_ratio和footer_ratio）
			
 
				     
			
 
				     返回识别出的文本列表和GPU监控信息
			
 
				     """
			
@@ -934,6 +1111,57 @@ async def ocr_image(request: OCRRequest):
 
				             f.write(image_bytes)
			
 
				         logger.info(f"[OCR] 图片已保存: {image_path}")
			
 
				         
			
 
				+        # 如果需要裁剪页眉页脚，先进行裁剪
			
 
				+        if request.crop_header_footer:
			
 
				+            try:
			
 
				+                from ..utils.image_preprocessor import crop_header_footer, check_opencv_available
			
 
				+                
			
 
				+                if check_opencv_available():
			
 
				+                    if request.auto_detect_header_footer:
			
 
				+                        logger.info("[OCR] 开始自动检测并裁剪页眉页脚")
			
 
				+                    else:
			
 
				+                        logger.info(f"[OCR] 开始裁剪页眉页脚，顶部比例: {request.header_ratio}, 底部比例: {request.footer_ratio}")
			
 
				+                    
			
 
				+                    # 裁剪后的图片路径
			
 
				+                    cropped_image_path = os.path.join(temp_dir, f"ocr_image_cropped{ext}")
			
 
				+                    
			
 
				+                    image_path = crop_header_footer(
			
 
				+                        image_path,
			
 
				+                        output_path=cropped_image_path,
			
 
				+                        header_ratio=request.header_ratio or 0.05,
			
 
				+                        footer_ratio=request.footer_ratio or 0.05,
			
 
				+                        auto_detect=request.auto_detect_header_footer or False
			
 
				+                    )
			
 
				+                    logger.info(f"[OCR] 裁剪页眉页脚完成: {image_path}")
			
 
				+                else:
			
 
				+                    logger.warning("[OCR] OpenCV 未安装，跳过裁剪页眉页脚")
			
 
				+            except Exception as e:
			
 
				+                logger.warning(f"[OCR] 裁剪页眉页脚失败，使用原图继续: {e}")
			
 
				+        
			
 
				+        # 如果需要去水印，进行预处理
			
 
				+        if request.remove_watermark:
			
 
				+            try:
			
 
				+                from ..utils.image_preprocessor import remove_watermark, check_opencv_available
			
 
				+                
			
 
				+                if check_opencv_available():
			
 
				+                    logger.info(f"[OCR] 开始去水印处理，亮度阈值: {request.watermark_light_threshold}, 饱和度阈值: {request.watermark_saturation_threshold}")
			
 
				+                    
			
 
				+                    # 去水印后的图片路径
			
 
				+                    nowm_image_path = os.path.join(temp_dir, f"ocr_image_nowm{ext}")
			
 
				+                    
			
 
				+                    image_path = remove_watermark(
			
 
				+                        image_path,
			
 
				+                        output_path=nowm_image_path,
			
 
				+                        light_threshold=request.watermark_light_threshold or 200,
			
 
				+                        saturation_threshold=request.watermark_saturation_threshold or 30,
			
 
				+                        method="hsv"
			
 
				+                    )
			
 
				+                    logger.info(f"[OCR] 去水印完成: {image_path}")
			
 
				+                else:
			
 
				+                    logger.warning("[OCR] OpenCV 未安装，跳过去水印处理")
			
 
				+            except Exception as e:
			
 
				+                logger.warning(f"[OCR] 去水印处理失败，使用原图继续: {e}")
			
 
				+        
			
 
				         # 调用PaddleOCR进行识别（监控线程在此期间持续采集数据）
			
 
				         from ..utils.paddleocr_fallback import call_paddleocr_ocr
			
 
				         
			
--- a/pdf_converter_v2/models/data_models.py
+++ b/pdf_converter_v2/models/data_models.py
@@ -283,9 +283,18 @@ class FeasibilityApprovalInvestment:
 
				     - Level 0: 顶层大类（如"山西晋城周村220千伏输变电工程"）
			
 
				     - Level 1: 二级分类（如"变电工程"、"线路工程"），有自己的 items
			
 
				     - Level 2: 具体项目（如"周村220千伏变电站新建工程"）
			
 
				+    
			
 
				+    项目信息（可选，用于 safetyFsApproval 类型）：
			
 
				+    - projectName: 工程(项目)名称
			
 
				+    - projectUnit: 项目单位
			
 
				+    - designUnit: 设计单位
			
 
				     """
			
 
				     def __init__(self):
			
 
				         self.items: List[InvestmentItem] = []
			
 
				+        # 项目基本信息（safetyFsApproval 专用）
			
 
				+        self.projectName: Optional[str] = None
			
 
				+        self.projectUnit: Optional[str] = None
			
 
				+        self.designUnit: Optional[str] = None
			
 
				     
			
 
				     def to_dict(self):
			
 
				         """转换为嵌套结构，与 designReview 保持一致
			
@@ -294,14 +303,38 @@ class FeasibilityApprovalInvestment:
 
				         Level="2" 的项目作为二级分类（Level: 1），有自己的 items
			
 
				         Level="3" 的项目作为具体项目（Level: 2），放入二级分类的 items
			
 
				         Level="0" 的项目（合计）跳过
			
 
				+        
			
 
				+        特殊处理：如果表格没有 Level=1 的顶层大类（如湖北省格式），
			
 
				+        自动创建一个虚拟顶层大类来包含所有 Level=2 的项目；若设置了 projectName/projectUnit/designUnit 中任意一项，则返回包含 projectInfo 和 data 的字典，否则返回列表
			
 
				         """
			
 
				         if not self.items:
			
 
				             return []
			
 
				         
			
 
				+        # 检查是否有 Level=1 的顶层大类
			
 
				+        has_level_1 = any(item.level == "1" for item in self.items)
			
 
				+        
			
 
				         result = []
			
 
				         current_top_category = None  # Level 0 顶层大类
			
 
				         current_sub_category = None  # Level 1 二级分类
			
 
				         
			
 
				+        # 如果没有 Level=1 的顶层大类，创建一个虚拟的
			
 
				+        if not has_level_1:
			
 
				+            current_top_category = {
			
 
				+                "name": "项目总表",
			
 
				+                "Level": 0,
			
 
				+                "constructionScaleSubstation": "",
			
 
				+                "constructionScaleBay": "",
			
 
				+                "constructionScaleOverheadLine": "",
			
 
				+                "constructionScaleOpticalCable": "",
			
 
				+                "staticInvestment": "",
			
 
				+                "dynamicInvestment": "",
			
 
				+                "constructionProjectCost": "",
			
 
				+                "equipmentPurchaseCost": "",
			
 
				+                "installationProjectCost": "",
			
 
				+                "otherExpenses": "",
			
 
				+                "items": []
			
 
				+            }
			
 
				+        
			
 
				         for item in self.items:
			
 
				             if item.level == "1":
			
 
				                 # 顶层大类（如"山西晋城周村220千伏输变电工程"）
			
@@ -381,7 +414,18 @@ class FeasibilityApprovalInvestment:
 
				         if current_top_category is not None:
			
 
				             result.append(current_top_category)
			
 
				         
			
 
				-        return result
			
 
				+        # 如果有项目信息，返回包含项目信息的字典；否则直接返回数据列表
			
 
				+        if self.projectName or self.projectUnit or self.designUnit:
			
 
				+            return {
			
 
				+                "projectInfo": {
			
 
				+                    "projectName": self.projectName or "",
			
 
				+                    "projectUnit": self.projectUnit or "",
			
 
				+                    "designUnit": self.designUnit or ""
			
 
				+                },
			
 
				+                "data": result
			
 
				+            }
			
 
				+        else:
			
 
				+            return result
			
 
				     
			
 
				     @staticmethod
			
 
				     def _parse_number(value: str) -> str:
			
--- a/pdf_converter_v2/parser/investment_parser.py
+++ b/pdf_converter_v2/parser/investment_parser.py
@@ -224,8 +224,11 @@ def parse_feasibility_approval_investment(markdown_content: str) -> FeasibilityA
 
				             table_text += " ".join([str(cell) for cell in row])
			
 
				         # 移除空格后再匹配
			
 
				         table_text_no_space = table_text.replace(" ", "")
			
 
				-        # 选择包含"工程或费用名称"和"静态投资"的表格
			
 
				-        if "工程或费用名称" in table_text_no_space and "静态投资" in table_text_no_space:
			
 
				+        # 选择包含"工程或费用名称"或"项目名称"，且包含"静态投资"或"静态合计"的表格
			
 
				+        has_name_col = ("工程或费用名称" in table_text_no_space or "项目名称" in table_text_no_space)
			
 
				+        has_investment_col = ("静态投资" in table_text_no_space or "静态合计" in table_text_no_space)
			
 
				+        
			
 
				+        if has_name_col and has_investment_col:
			
 
				             all_matching_tables.append((table_idx, table))
			
 
				             logger.info(f"[可研批复投资] 找到投资估算表格 (表格{table_idx+1}), 行数: {len(table)}")
			
 
				     
			
@@ -295,7 +298,7 @@ def parse_feasibility_approval_investment(markdown_content: str) -> FeasibilityA
 
				             
			
 
				             if "序号" in cell_text and no_idx == -1:
			
 
				                 no_idx = col_idx
			
 
				-            elif ("工程或费用名称" in cell_text_no_space) and name_idx == -1:
			
 
				+            elif ("工程或费用名称" in cell_text_no_space or "项目名称" in cell_text_no_space) and name_idx == -1:
			
 
				                 name_idx = col_idx
			
 
				             elif "架空线" in cell_text_no_space and overhead_line_idx == -1:
			
 
				                 overhead_line_idx = col_idx
			
@@ -305,9 +308,9 @@ def parse_feasibility_approval_investment(markdown_content: str) -> FeasibilityA
 
				                 substation_idx = col_idx
			
 
				             elif "光缆" in cell_text and optical_cable_idx == -1:
			
 
				                 optical_cable_idx = col_idx
			
 
				-            elif "静态投资" in cell_text_no_space and static_investment_idx == -1:
			
 
				+            elif ("静态投资" in cell_text_no_space or "静态合计" in cell_text_no_space) and static_investment_idx == -1:
			
 
				                 static_investment_idx = col_idx
			
 
				-            elif "动态投资" in cell_text_no_space and dynamic_investment_idx == -1:
			
 
				+            elif ("动态投资" in cell_text_no_space or "动态合计" in cell_text_no_space) and dynamic_investment_idx == -1:
			
 
				                 dynamic_investment_idx = col_idx
			
 
				             # 新增费用字段识别
			
 
				             elif "建筑工程费" in cell_text_no_space and construction_project_cost_idx == -1:
			
@@ -322,8 +325,8 @@ def parse_feasibility_approval_investment(markdown_content: str) -> FeasibilityA
 
				                 if "其他费用" in cell_text_no_space:
			
 
				                     other_expenses_idx = col_idx
			
 
				         
			
 
				-        # 如果这一行包含"序号"或"工程或费用名称"，记录为表头结束行
			
 
				-        if ("序号" in row_text or "工程或费用名称" in row_text_no_space) and header_row_idx == -1:
			
 
				+        # 如果这一行包含"序号"或"工程或费用名称"或"项目名称"，记录为表头结束行
			
 
				+        if ("序号" in row_text or "工程或费用名称" in row_text_no_space or "项目名称" in row_text_no_space) and header_row_idx == -1:
			
 
				             header_row_idx = row_idx
			
 
				     
			
 
				     # 表头结束行应该是最后一个包含表头内容的行
			
@@ -417,6 +420,253 @@ def parse_feasibility_approval_investment(markdown_content: str) -> FeasibilityA
 
				     return record
			
 
				 
			
 
				 
			
 
				+def parse_safety_feasibility_approval_investment(markdown_content: str) -> FeasibilityApprovalInvestment:
			
 
				+    """
			
 
				+    解析安全可研批复投资估算（湖北省格式）
			
 
				+    
			
 
				+    特点：
			
 
				+    - 没有顶层大类（Level=1），直接从二级分类开始
			
 
				+    - 中文序号（一、二）表示二级分类（如"变电工程"、"线路工程"）
			
 
				+    - 阿拉伯数字（1、2、3）表示具体项目
			
 
				+    - 列名使用"项目名称"和"静态合计/动态合计"
			
 
				+    
			
 
				+    返回结构：
			
 
				+    - Level 1: 二级分类（如"变电工程"、"线路工程"）
			
 
				+    - Level 2: 具体项目（如"襄阳连云220千伏变电站新建工程"）
			
 
				+    """
			
 
				+    record = FeasibilityApprovalInvestment()
			
 
				+    
			
 
				+    tables = extract_table_with_rowspan_colspan(markdown_content)
			
 
				+    
			
 
				+    if not tables:
			
 
				+        logger.warning("[安全可研批复投资] 未能提取出任何表格内容")
			
 
				+        return record
			
 
				+    
			
 
				+    # 首先尝试提取项目基本信息表格
			
 
				+    for table_idx, table in enumerate(tables):
			
 
				+        if len(table) < 2:
			
 
				+            continue
			
 
				+        
			
 
				+        table_text = ""
			
 
				+        for row in table:
			
 
				+            table_text += " ".join([str(cell) for cell in row])
			
 
				+        table_text_no_space = table_text.replace(" ", "").replace("(", "（").replace(")", "）")
			
 
				+        
			
 
				+        # 查找包含"工程(项目)名称"的表格
			
 
				+        if "工程（项目）名称" in table_text_no_space or "工程项目名称" in table_text_no_space:
			
 
				+            logger.info(f"[安全可研批复投资] 找到项目信息表格 (表格{table_idx+1})")
			
 
				+            
			
 
				+            # 提取项目信息
			
 
				+            for row in table:
			
 
				+                if len(row) >= 2:
			
 
				+                    key = str(row[0]).strip()
			
 
				+                    value = str(row[1]).strip() if len(row) > 1 else ""
			
 
				+                    
			
 
				+                    if "工程" in key and "名称" in key:
			
 
				+                        record.projectName = value
			
 
				+                        logger.info(f"[安全可研批复投资] 提取工程名称: {value}")
			
 
				+                    elif "项目单位" in key:
			
 
				+                        record.projectUnit = value
			
 
				+                        logger.info(f"[安全可研批复投资] 提取项目单位: {value}")
			
 
				+                    elif "设计单位" in key:
			
 
				+                        record.designUnit = value
			
 
				+                        logger.info(f"[安全可研批复投资] 提取设计单位: {value}")
			
 
				+            break
			
 
				+    
			
 
				+    # 找到所有投资估算表格并合并
			
 
				+    all_matching_tables = []
			
 
				+    for table_idx, table in enumerate(tables):
			
 
				+        table_text = ""
			
 
				+        for row in table:
			
 
				+            table_text += " ".join([str(cell) for cell in row])
			
 
				+        table_text_no_space = table_text.replace(" ", "")
			
 
				+        
			
 
				+        # 选择包含"项目名称"且包含"静态合计"或"静态投资"的表格
			
 
				+        has_name_col = "项目名称" in table_text_no_space
			
 
				+        has_investment_col = ("静态合计" in table_text_no_space or "静态投资" in table_text_no_space)
			
 
				+        
			
 
				+        if has_name_col and has_investment_col:
			
 
				+            all_matching_tables.append((table_idx, table))
			
 
				+            logger.info(f"[安全可研批复投资] 找到投资估算表格 (表格{table_idx+1}), 行数: {len(table)}")
			
 
				+    
			
 
				+    if not all_matching_tables:
			
 
				+        logger.warning("[安全可研批复投资] 未找到包含投资估算的表格")
			
 
				+        return record
			
 
				+    
			
 
				+    # 如果只有一个表格，直接使用
			
 
				+    if len(all_matching_tables) == 1:
			
 
				+        target_table = all_matching_tables[0][1]
			
 
				+    else:
			
 
				+        # 多个表格：合并所有表格的数据行
			
 
				+        logger.info(f"[安全可研批复投资] 发现 {len(all_matching_tables)} 个投资估算表格，将进行合并")
			
 
				+        target_table = []
			
 
				+        first_table = True
			
 
				+        for table_idx, table in all_matching_tables:
			
 
				+            if first_table:
			
 
				+                target_table.extend(table)
			
 
				+                first_table = False
			
 
				+            else:
			
 
				+                # 跳过表头行
			
 
				+                header_end_idx = 0
			
 
				+                for row_idx, row in enumerate(table):
			
 
				+                    row_text = " ".join([str(cell) for cell in row]).replace(" ", "")
			
 
				+                    if "序号" in row_text or "项目名称" in row_text or "建设规模" in row_text:
			
 
				+                        header_end_idx = row_idx + 1
			
 
				+                    elif len(row) > 0:
			
 
				+                        first_cell = str(row[0]).strip()
			
 
				+                        if first_cell in ["一", "二", "三", "四", "五", "六", "七", "八", "九", "十"]:
			
 
				+                            break
			
 
				+                target_table.extend(table[header_end_idx:])
			
 
				+                logger.debug(f"[安全可研批复投资] 表格{table_idx+1}: 跳过前{header_end_idx}行表头，添加{len(table)-header_end_idx}行数据")
			
 
				+        
			
 
				+        logger.info(f"[安全可研批复投资] 合并后总行数: {len(target_table)}")
			
 
				+    
			
 
				+    # 识别表头行和列索引
			
 
				+    header_row_idx = -1
			
 
				+    no_idx = -1
			
 
				+    name_idx = -1
			
 
				+    overhead_line_idx = -1
			
 
				+    bay_idx = -1
			
 
				+    substation_idx = -1
			
 
				+    optical_cable_idx = -1
			
 
				+    static_investment_idx = -1
			
 
				+    dynamic_investment_idx = -1
			
 
				+    construction_project_cost_idx = -1
			
 
				+    equipment_purchase_cost_idx = -1
			
 
				+    installation_project_cost_idx = -1
			
 
				+    other_expenses_idx = -1
			
 
				+    
			
 
				+    # 扫描前几行识别列索引
			
 
				+    for row_idx in range(min(5, len(target_table))):
			
 
				+        row = target_table[row_idx]
			
 
				+        row_text = " ".join([str(cell) for cell in row])
			
 
				+        row_text_no_space = row_text.replace(" ", "")
			
 
				+        
			
 
				+        for col_idx, cell in enumerate(row):
			
 
				+            cell_text = str(cell).strip()
			
 
				+            cell_text_no_space = cell_text.replace(" ", "")
			
 
				+            
			
 
				+            if "序号" in cell_text and no_idx == -1:
			
 
				+                no_idx = col_idx
			
 
				+            elif "项目名称" in cell_text_no_space and name_idx == -1:
			
 
				+                name_idx = col_idx
			
 
				+            elif "架空线" in cell_text_no_space and overhead_line_idx == -1:
			
 
				+                overhead_line_idx = col_idx
			
 
				+            elif "间隔" in cell_text and bay_idx == -1:
			
 
				+                bay_idx = col_idx
			
 
				+            elif "变电" in cell_text and substation_idx == -1:
			
 
				+                substation_idx = col_idx
			
 
				+            elif "光缆" in cell_text and optical_cable_idx == -1:
			
 
				+                optical_cable_idx = col_idx
			
 
				+            elif ("静态投资" in cell_text_no_space or "静态合计" in cell_text_no_space) and static_investment_idx == -1:
			
 
				+                static_investment_idx = col_idx
			
 
				+            elif ("动态投资" in cell_text_no_space or "动态合计" in cell_text_no_space) and dynamic_investment_idx == -1:
			
 
				+                dynamic_investment_idx = col_idx
			
 
				+            elif "建筑工程费" in cell_text_no_space and construction_project_cost_idx == -1:
			
 
				+                construction_project_cost_idx = col_idx
			
 
				+            elif "设备购置费" in cell_text_no_space and equipment_purchase_cost_idx == -1:
			
 
				+                equipment_purchase_cost_idx = col_idx
			
 
				+            elif "安装工程费" in cell_text_no_space and installation_project_cost_idx == -1:
			
 
				+                installation_project_cost_idx = col_idx
			
 
				+            elif ("其他费用" in cell_text_no_space or "合计" == cell_text_no_space) and other_expenses_idx == -1:
			
 
				+                if "其他费用" in cell_text_no_space:
			
 
				+                    other_expenses_idx = col_idx
			
 
				+        
			
 
				+        if ("序号" in row_text or "项目名称" in row_text_no_space) and header_row_idx == -1:
			
 
				+            header_row_idx = row_idx
			
 
				+    
			
 
				+    # 找到第一个数据行
			
 
				+    for row_idx in range(min(5, len(target_table))):
			
 
				+        row = target_table[row_idx]
			
 
				+        if len(row) > 0:
			
 
				+            first_cell = str(row[0]).strip()
			
 
				+            if first_cell and first_cell not in ["序号", ""] and (first_cell in ["一", "二", "三", "四", "五"] or first_cell.isdigit()):
			
 
				+                header_row_idx = row_idx - 1
			
 
				+                logger.debug(f"[安全可研批复投资] 根据数据行确定表头结束于第{header_row_idx}行")
			
 
				+                break
			
 
				+    
			
 
				+    logger.info(f"[安全可研批复投资] 表头行: {header_row_idx}")
			
 
				+    logger.info(f"[安全可研批复投资] 列索引: 序号={no_idx}, 名称={name_idx}, "
			
 
				+               f"架空线={overhead_line_idx}, 间隔={bay_idx}, 变电={substation_idx}, "
			
 
				+               f"光缆={optical_cable_idx}, 静态投资={static_investment_idx}, 动态投资={dynamic_investment_idx}")
			
 
				+    
			
 
				+    if header_row_idx == -1:
			
 
				+        logger.warning("[安全可研批复投资] 未找到表头行")
			
 
				+        return record
			
 
				+    
			
 
				+    # 解析数据行
			
 
				+    for row_idx in range(header_row_idx + 1, len(target_table)):
			
 
				+        row = target_table[row_idx]
			
 
				+        
			
 
				+        if len(row) < 3:
			
 
				+            continue
			
 
				+        
			
 
				+        # 检查是否是有效数据行
			
 
				+        if name_idx >= 0 and name_idx < len(row):
			
 
				+            name = str(row[name_idx]).strip()
			
 
				+            if not name or name in ["", "nan", "None"]:
			
 
				+                continue
			
 
				+            
			
 
				+            # 提取序号
			
 
				+            no = ""
			
 
				+            if no_idx >= 0 and no_idx < len(row):
			
 
				+                no = str(row[no_idx]).strip()
			
 
				+            
			
 
				+            # 判断等级 - 使用非严格模式，让中文数字直接返回 Level 1
			
 
				+            level_input = (no + name) if no else name
			
 
				+            level = determine_level(level_input, name, strict_mode=False)
			
 
				+            
			
 
				+            # 对于阿拉伯数字序号，如果当前是 Level 2，且不是具体项目名称，则判定为 Level 2
			
 
				+            # 这样"1、襄阳连云..."会是 Level 2
			
 
				+            if level == "2" and no.isdigit():
			
 
				+                # 阿拉伯数字序号，是具体项目，保持 Level 2
			
 
				+                pass
			
 
				+            
			
 
				+            item = InvestmentItem()
			
 
				+            item.no = no
			
 
				+            item.name = name
			
 
				+            item.level = level
			
 
				+            
			
 
				+            # 提取建设规模
			
 
				+            if overhead_line_idx >= 0 and overhead_line_idx < len(row):
			
 
				+                item.constructionScaleOverheadLine = str(row[overhead_line_idx]).strip()
			
 
				+            
			
 
				+            if bay_idx >= 0 and bay_idx < len(row):
			
 
				+                item.constructionScaleBay = str(row[bay_idx]).strip()
			
 
				+            
			
 
				+            if substation_idx >= 0 and substation_idx < len(row):
			
 
				+                item.constructionScaleSubstation = str(row[substation_idx]).strip()
			
 
				+            
			
 
				+            if optical_cable_idx >= 0 and optical_cable_idx < len(row):
			
 
				+                item.constructionScaleOpticalCable = str(row[optical_cable_idx]).strip()
			
 
				+            
			
 
				+            # 提取投资金额
			
 
				+            if static_investment_idx >= 0 and static_investment_idx < len(row):
			
 
				+                item.staticInvestment = str(row[static_investment_idx]).strip()
			
 
				+            
			
 
				+            if dynamic_investment_idx >= 0 and dynamic_investment_idx < len(row):
			
 
				+                item.dynamicInvestment = str(row[dynamic_investment_idx]).strip()
			
 
				+            
			
 
				+            # 提取费用
			
 
				+            if construction_project_cost_idx >= 0 and construction_project_cost_idx < len(row):
			
 
				+                item.constructionProjectCost = str(row[construction_project_cost_idx]).strip()
			
 
				+            
			
 
				+            if equipment_purchase_cost_idx >= 0 and equipment_purchase_cost_idx < len(row):
			
 
				+                item.equipmentPurchaseCost = str(row[equipment_purchase_cost_idx]).strip()
			
 
				+            
			
 
				+            if installation_project_cost_idx >= 0 and installation_project_cost_idx < len(row):
			
 
				+                item.installationProjectCost = str(row[installation_project_cost_idx]).strip()
			
 
				+            
			
 
				+            if other_expenses_idx >= 0 and other_expenses_idx < len(row):
			
 
				+                item.otherExpenses = str(row[other_expenses_idx]).strip()
			
 
				+            
			
 
				+            record.items.append(item)
			
 
				+            logger.info(f"[安全可研批复投资] 解析到数据: No={item.no}, Name={item.name}, Level={item.level}")
			
 
				+    
			
 
				+    logger.info(f"[安全可研批复投资] 共解析到 {len(record.items)} 条数据")
			
 
				+    return record
			
 
				+
			
 
				+
			
 
				 def parse_feasibility_review_investment(markdown_content: str) -> FeasibilityReviewInvestment:
			
 
				     """
			
 
				     解析可研评审投资估算
			
@@ -821,6 +1071,9 @@ def parse_investment_record(markdown_content: str, investment_type: Optional[str
 
				     result = None
			
 
				     if investment_type == "fsApproval":
			
 
				         result = parse_feasibility_approval_investment(markdown_content)
			
 
				+    elif investment_type == "safetyFsApproval":
			
 
				+        # safetyFsApproval 使用独立的解析逻辑（湖北省格式）
			
 
				+        result = parse_safety_feasibility_approval_investment(markdown_content)
			
 
				     elif investment_type == "fsReview":
			
 
				         result = parse_feasibility_review_investment(markdown_content)
			
 
				     elif investment_type == "pdApproval":
			
--- a/pdf_converter_v2/parser/json_converter.py
+++ b/pdf_converter_v2/parser/json_converter.py
@@ -329,8 +329,8 @@ def parse_markdown_to_json(markdown_content: str, first_page_image: Optional[Ima
 
				                 op_list = parse_operational_conditions(markdown_content, require_title=False)
			
 
				             serialized = [oc.to_dict() if hasattr(oc, "to_dict") else oc for oc in (op_list or [])]
			
 
				             result = {"document_type": forced_document_type, "data": {"operationalConditions": serialized}}
			
 
				-        elif forced_document_type in ["fsApproval", "fsReview", "pdApproval"]:
			
 
				-            # 投资估算类型处理
			
 
				+        elif forced_document_type in ["fsApproval", "fsReview", "pdApproval", "safetyFsApproval"]:
			
 
				+            # 投资估算类型处理（包括安评类）
			
 
				             logger.info(f"[JSON转换] 处理投资估算类型: {forced_document_type}")
			
 
				             logger.debug(f"[JSON转换] Markdown内容长度: {len(markdown_content)} 字符")
			
 
				             
			
@@ -338,12 +338,26 @@ def parse_markdown_to_json(markdown_content: str, first_page_image: Optional[Ima
 
				             
			
 
				             if investment_record:
			
 
				                 data = investment_record.to_dict()
			
 
				-                logger.info(f"[JSON转换] 投资估算解析成功，共 {len(data)} 条记录")
			
 
				                 
			
 
				-                # 输出前3条记录的摘要
			
 
				-                if data:
			
 
				-                    for idx, item in enumerate(data[:3]):
			
 
				-                        logger.debug(f"[JSON转换] 记录 {idx+1}: No={item.get('No', '')}, Name={item.get('name', '')}, Level={item.get('Level', '')}")
			
 
				+                # 检查返回的数据格式：可能是列表（旧格式）或字典（包含projectInfo的新格式）
			
 
				+                if isinstance(data, dict) and "data" in data:
			
 
				+                    # 新格式：包含 projectInfo 和 data
			
 
				+                    logger.info(f"[JSON转换] 投资估算解析成功，共 {len(data['data'])} 条记录")
			
 
				+                    if data.get("projectInfo"):
			
 
				+                        logger.info(f"[JSON转换] 项目信息: {data['projectInfo'].get('projectName', '')}")
			
 
				+                    
			
 
				+                    # 输出前3条记录的摘要
			
 
				+                    if data["data"]:
			
 
				+                        for idx, item in enumerate(data["data"][:3]):
			
 
				+                            logger.debug(f"[JSON转换] 记录 {idx+1}: No={item.get('No', '')}, Name={item.get('name', '')}, Level={item.get('Level', '')}")
			
 
				+                else:
			
 
				+                    # 旧格式：直接是数据列表
			
 
				+                    logger.info(f"[JSON转换] 投资估算解析成功，共 {len(data)} 条记录")
			
 
				+                    
			
 
				+                    # 输出前3条记录的摘要
			
 
				+                    if data:
			
 
				+                        for idx, item in enumerate(data[:3]):
			
 
				+                            logger.debug(f"[JSON转换] 记录 {idx+1}: No={item.get('No', '')}, Name={item.get('name', '')}, Level={item.get('Level', '')}")
			
 
				                 
			
 
				                 result = {"document_type": forced_document_type, "data": data}
			
 
				             else:
			
@@ -432,8 +446,8 @@ def parse_markdown_to_json(markdown_content: str, first_page_image: Optional[Ima
 
				     elif doc_type == "emRec":
			
 
				         data = parse_electromagnetic_detection_record(markdown_content).to_dict()
			
 
				         result = {"document_type": doc_type, "data": data}
			
 
				-    elif doc_type in ["fsApproval", "fsReview", "pdApproval"]:
			
 
				-        # 新增：投资估算类型
			
 
				+    elif doc_type in ["fsApproval", "fsReview", "pdApproval", "safetyFsApproval"]:
			
 
				+        # 新增：投资估算类型（包括安评类）
			
 
				         logger.info(f"[JSON转换] 检测到投资估算类型: {doc_type}")
			
 
				         logger.debug(f"[JSON转换] Markdown内容长度: {len(markdown_content)} 字符")
			
 
				         
			
--- a/pdf_converter_v2/requirements-paddle-npu.txt
+++ b/pdf_converter_v2/requirements-paddle-npu.txt
@@ -0,0 +1,20 @@
 
				+# PDF Converter v2 - 容器内依赖（Paddle NPU 环境已预装 paddle/paddlex，此处不重复安装）
			
 
				+
			
 
				+# 核心依赖
			
 
				+aiohttp>=3.8.0
			
 
				+aiofiles>=23.0.0
			
 
				+Pillow>=9.0.0
			
 
				+
			
 
				+# PDF 处理（至少安装一个）
			
 
				+pypdfium2>=4.0.0
			
 
				+pdf2image>=1.16.0
			
 
				+pdfplumber>=0.11.0
			
 
				+
			
 
				+# Web 框架
			
 
				+fastapi>=0.100.0
			
 
				+uvicorn[standard]>=0.23.0
			
 
				+pydantic>=2.0.0
			
 
				+typing-extensions>=4.0.0
			
 
				+
			
 
				+# 日志
			
 
				+loguru>=0.7.0
			
--- a/pdf_converter_v2/test.py
+++ b/pdf_converter_v2/test.py
@@ -26,7 +26,7 @@ except ImportError:
 
				     print("  安装命令: pip install PyMuPDF")
			
 
				 
			
 
				 # ==================== 配置区域 ====================
			
 
				-pdf_path = '/home/hws/workspace/GitLab/Clerk2.5/pdf_converter_v2/2-数据源/5-（初设批复）晋电建设〔2019〕566号　国网山西省电力公司关于晋城周村220kV输变电工程初步设计的批复 .pdf'
			
 
				+pdf_path = '/home/hws/workspace/GitLab/Clerk2.5/pdf_converter_v2/3-数据/鄂电司发展〔2024〕124号　国网湖北省电力有限公司关于襄阳连云220千伏输变电工程可行性研究报告的批复.pdf'
			
 
				 output_dir = Path('extracted_tables')  # 原始表格输出目录（包含表格前文本）
			
 
				 merged_output_dir = Path('merged_tables')  # 合并后的表格输出目录（已剔除表格前文本）
			
 
				 filtered_output_dir = Path('filtered_tables')  # 筛选后的表格输出目录
			
@@ -89,7 +89,7 @@ TABLE_HEADER_RULES = {
 
				 }
			
 
				 
			
 
				 # 是否启用表头过滤（如果为False，则提取所有表格）
			
 
				-ENABLE_HEADER_FILTER = True
			
 
				+ENABLE_HEADER_FILTER = False
			
 
				 
			
 
				 # 要排除的规则名称列表（如果某个规则匹配了不该匹配的表格，可以在这里排除）
			
 
				 # 例如: EXCLUDE_RULES = ["物资采购合同2"] 将不会匹配该规则
			
--- a/pdf_converter_v2/test_api.py
+++ b/pdf_converter_v2/test_api.py
@@ -6,6 +6,7 @@ PDF Converter API 测试脚本
 
				 - fsApproval: 可研批复
			
 
				 - fsReview: 可研评审  
			
 
				 - pdApproval: 初设批复
			
 
				+- safetyFsApproval: 安评可研批复
			
 
				 
			
 
				 以及现有类型：
			
 
				 - settlementReport: 结算报告
			
@@ -27,15 +28,20 @@ API_BASE_URL = "http://47.101.133.94:14213"
 
				 # 测试文件配置
			
 
				 TEST_DIR = Path(__file__).parent / "test"
			
 
				 
			
 
				-# 测试用例：文件名 -> 文档类型
			
 
				+# 测试用例：文件名 -> (文档类型, 是否去水印, 是否只保留表格附件)
			
 
				+# 格式: 
			
 
				+#   "文件名": ("类型", 去水印, 只保留表格) - 完整格式
			
 
				+#   "文件名": ("类型", 去水印) - 兼容格式，只保留表格默认True
			
 
				+#   "文件名": "类型" - 旧格式，去水印False，只保留表格True
			
 
				 TEST_CASES = {
			
 
				     # 新增投资类型
			
 
				+    "鄂电司发展〔2024〕124号　国网湖北省电力有限公司关于襄阳连云220千伏输变电工程可行性研究报告的批复.pdf": ("safetyFsApproval", True,False),  # 需要去水印 + 只保留表格附件
			
 
				     # "2-（可研批复）晋电发展〔2017〕831号+国网山西省电力公司关于临汾古县、晋城周村220kV输变电等工程可行性研究报告的批复.pdf.pdf": "fsApproval",
			
 
				     # "1-（可研评审）晋电经研规划〔2017〕187号(盖章)国网山西经研院关于山西晋城周村220kV输变电工程可行性研究报告的评审意见.pdf": "fsReview",
			
 
				     # "5-（初设批复）晋电建设〔2019〕566号　国网山西省电力公司关于晋城周村220kV输变电工程初步设计的批复 .pdf": "pdApproval",
			
 
				     # 现有类型
			
 
				     # "9-（结算报告）山西晋城周村220kV输变电工程结算审计报告.pdf": "settlementReport",
			
 
				-    "4-（初设评审）中电联电力建设技术经济咨询中心技经〔2019〕201号关于山西周村220kV输变电工程初步设计的评审意见.pdf": "designReview",
			
 
				+    # "4-（初设评审）中电联电力建设技术经济咨询中心技经〔2019〕201号关于山西周村220kV输变电工程初步设计的评审意见.pdf": "designReview",
			
 
				     # 决算报告
			
 
				     # "10-（决算报告）盖章页-山西晋城周村220kV输变电工程竣工决算审核报告（中瑞诚鉴字（2021）第002040号）.pdf": "finalAccount",
			
 
				 }
			
@@ -70,10 +76,21 @@ def check_health() -> bool:
 
				         return False
			
 
				 
			
 
				 
			
 
				-def upload_file(file_path: Path, document_type: str) -> Optional[str]:
			
 
				-    """上传文件并获取任务 ID"""
			
 
				+def upload_file(file_path: Path, document_type: str, remove_watermark: bool = False, table_only: bool = True) -> Optional[str]:
			
 
				+    """上传文件并获取任务 ID
			
 
				+    
			
 
				+    Args:
			
 
				+        file_path: 文件路径
			
 
				+        document_type: 文档类型
			
 
				+        remove_watermark: 是否去水印
			
 
				+        table_only: 是否只保留表格附件
			
 
				+    """
			
 
				     print(f"\n  📤 上传文件: {file_path.name}")
			
 
				     print(f"     类型: {document_type}")
			
 
				+    if remove_watermark:
			
 
				+        print(f"     去水印: 是")
			
 
				+    if table_only:
			
 
				+        print(f"     只保留表格: 是")
			
 
				     
			
 
				     try:
			
 
				         with open(file_path, "rb") as f:
			
@@ -81,6 +98,15 @@ def upload_file(file_path: Path, document_type: str) -> Optional[str]:
 
				             # 使用 data 发送表单参数，参数名是 type（不是 document_type）
			
 
				             data = {"type": document_type}
			
 
				             
			
 
				+            # 添加去水印参数
			
 
				+            if remove_watermark:
			
 
				+                data["remove_watermark"] = "true"
			
 
				+                data["watermark_light_threshold"] = "200"
			
 
				+                data["watermark_saturation_threshold"] = "30"
			
 
				+            
			
 
				+            # 添加只保留表格参数
			
 
				+            data["table_only"] = "true" if table_only else "false"
			
 
				+            
			
 
				             response = requests.post(
			
 
				                 f"{API_BASE_URL}/convert",
			
 
				                 files=files,
			
@@ -173,7 +199,21 @@ def validate_result(result: Dict[str, Any], expected_type: str) -> bool:
 
				         return False
			
 
				     
			
 
				     # 对于投资类型，检查嵌套结构
			
 
				-    if expected_type in ["fsApproval", "fsReview", "pdApproval"]:
			
 
				+    if expected_type in ["fsApproval", "fsReview", "pdApproval", "safetyFsApproval"]:
			
 
				+        # 检查是否是新格式（包含 projectInfo）
			
 
				+        project_info = None
			
 
				+        if isinstance(data, dict) and "data" in data:
			
 
				+            # 新格式：{"projectInfo": {...}, "data": [...]}
			
 
				+            project_info = data.get("projectInfo")
			
 
				+            data = data["data"]
			
 
				+            
			
 
				+            if project_info:
			
 
				+                print(f"\n  📋 项目信息:")
			
 
				+                print(f"     工程名称: {project_info.get('projectName', '')}")
			
 
				+                print(f"     项目单位: {project_info.get('projectUnit', '')}")
			
 
				+                print(f"     设计单位: {project_info.get('designUnit', '')}")
			
 
				+        
			
 
				+        # 验证数据格式
			
 
				         if not isinstance(data, list):
			
 
				             print_result(False, f"数据格式错误: 期望 list, 实际 {type(data).__name__}")
			
 
				             return False
			
@@ -218,13 +258,24 @@ def validate_result(result: Dict[str, Any], expected_type: str) -> bool:
 
				     return True
			
 
				 
			
 
				 
			
 
				-def test_single_file(file_path: Path, document_type: str) -> bool:
			
 
				-    """测试单个文件"""
			
 
				+def test_single_file(file_path: Path, document_type: str, remove_watermark: bool = False, table_only: bool = True) -> bool:
			
 
				+    """测试单个文件
			
 
				+    
			
 
				+    Args:
			
 
				+        file_path: 文件路径
			
 
				+        document_type: 文档类型
			
 
				+        remove_watermark: 是否去水印
			
 
				+        table_only: 是否只保留表格附件
			
 
				+    """
			
 
				     print_header(f"测试: {document_type}")
			
 
				     print(f"  文件: {file_path.name}")
			
 
				+    if remove_watermark:
			
 
				+        print(f"  去水印: 是")
			
 
				+    if table_only:
			
 
				+        print(f"  只保留表格: 是")
			
 
				     
			
 
				     # 1. 上传文件
			
 
				-    task_id = upload_file(file_path, document_type)
			
 
				+    task_id = upload_file(file_path, document_type, remove_watermark, table_only)
			
 
				     if not task_id:
			
 
				         return False
			
 
				     
			
@@ -276,7 +327,23 @@ def run_all_tests():
 
				     skipped = 0
			
 
				     
			
 
				     # 运行每个测试用例
			
 
				-    for filename, document_type in TEST_CASES.items():
			
 
				+    for filename, config in TEST_CASES.items():
			
 
				+        # 解析配置格式
			
 
				+        if isinstance(config, tuple):
			
 
				+            if len(config) >= 3:
			
 
				+                document_type, remove_watermark, table_only = config[:3]
			
 
				+            elif len(config) == 2:
			
 
				+                document_type, remove_watermark = config
			
 
				+                table_only = True  # 默认只保留表格
			
 
				+            else:
			
 
				+                document_type = config[0]
			
 
				+                remove_watermark = False
			
 
				+                table_only = True
			
 
				+        else:
			
 
				+            document_type = config
			
 
				+            remove_watermark = False
			
 
				+            table_only = True
			
 
				+        
			
 
				         file_path = TEST_DIR / filename
			
 
				         
			
 
				         if not file_path.exists():
			
@@ -288,7 +355,7 @@ def run_all_tests():
 
				         total += 1
			
 
				         
			
 
				         try:
			
 
				-            if test_single_file(file_path, document_type):
			
 
				+            if test_single_file(file_path, document_type, remove_watermark, table_only):
			
 
				                 passed += 1
			
 
				             else:
			
 
				                 failed += 1
			
@@ -319,11 +386,27 @@ def test_single(document_type: str):
 
				         return
			
 
				     
			
 
				     # 查找对应的文件
			
 
				-    for filename, dtype in TEST_CASES.items():
			
 
				+    for filename, config in TEST_CASES.items():
			
 
				+        # 解析配置格式
			
 
				+        if isinstance(config, tuple):
			
 
				+            if len(config) >= 3:
			
 
				+                dtype, remove_watermark, table_only = config[:3]
			
 
				+            elif len(config) == 2:
			
 
				+                dtype, remove_watermark = config
			
 
				+                table_only = True
			
 
				+            else:
			
 
				+                dtype = config[0]
			
 
				+                remove_watermark = False
			
 
				+                table_only = True
			
 
				+        else:
			
 
				+            dtype = config
			
 
				+            remove_watermark = False
			
 
				+            table_only = True
			
 
				+        
			
 
				         if dtype == document_type:
			
 
				             file_path = TEST_DIR / filename
			
 
				             if file_path.exists():
			
 
				-                test_single_file(file_path, document_type)
			
 
				+                test_single_file(file_path, document_type, remove_watermark, table_only)
			
 
				                 return
			
 
				             else:
			
 
				                 print_result(False, f"文件不存在: {filename}")
			
@@ -332,7 +415,16 @@ def test_single(document_type: str):
 
				     print_result(False, f"未找到类型 {document_type} 的测试文件")
			
 
				 
			
 
				 
			
 
				-def test_ocr(image_path: Optional[str] = None) -> bool:
			
 
				+def test_ocr(
			
 
				+    image_path: Optional[str] = None,
			
 
				+    remove_watermark: bool = False,
			
 
				+    light_threshold: int = 200,
			
 
				+    saturation_threshold: int = 30,
			
 
				+    crop_header_footer: bool = False,
			
 
				+    header_ratio: float = 0.05,
			
 
				+    footer_ratio: float = 0.05,
			
 
				+    auto_detect_header_footer: bool = False
			
 
				+) -> bool:
			
 
				     """
			
 
				     测试 OCR 接口
			
 
				     
			
@@ -341,6 +433,13 @@ def test_ocr(image_path: Optional[str] = None) -> bool:
 
				                    支持格式：
			
 
				                    - 图片文件：.png, .jpg, .jpeg
			
 
				                    - txt文件：包含base64编码的图片数据（可带data:image/xxx;base64,前缀）
			
 
				+        remove_watermark: 是否去除水印
			
 
				+        light_threshold: 水印亮度阈值（0-255），默认200
			
 
				+        saturation_threshold: 水印饱和度阈值（0-255），默认30
			
 
				+        crop_header_footer: 是否裁剪页眉页脚
			
 
				+        header_ratio: 页眉裁剪比例（0-1），默认0.05
			
 
				+        footer_ratio: 页脚裁剪比例（0-1），默认0.05
			
 
				+        auto_detect_header_footer: 是否自动检测页眉页脚边界
			
 
				     
			
 
				     Returns:
			
 
				         是否测试成功
			
@@ -419,14 +518,33 @@ def test_ocr(image_path: Optional[str] = None) -> bool:
 
				     
			
 
				     # 调用 OCR 接口
			
 
				     print(f"\n  📤 调用 OCR 接口...")
			
 
				+    # 构建请求参数
			
 
				+    request_data = {
			
 
				+        "image_base64": image_base64,
			
 
				+        "image_format": image_format
			
 
				+    }
			
 
				+    
			
 
				+    if crop_header_footer:
			
 
				+        request_data["crop_header_footer"] = True
			
 
				+        if auto_detect_header_footer:
			
 
				+            request_data["auto_detect_header_footer"] = True
			
 
				+            print(f"  ✂️  裁剪页眉页脚: 自动检测模式")
			
 
				+        else:
			
 
				+            request_data["header_ratio"] = header_ratio
			
 
				+            request_data["footer_ratio"] = footer_ratio
			
 
				+            print(f"  ✂️  裁剪页眉页脚: 是 (顶部={header_ratio*100:.0f}%, 底部={footer_ratio*100:.0f}%)")
			
 
				+    
			
 
				+    if remove_watermark:
			
 
				+        request_data["remove_watermark"] = True
			
 
				+        request_data["watermark_light_threshold"] = light_threshold
			
 
				+        request_data["watermark_saturation_threshold"] = saturation_threshold
			
 
				+        print(f"  🔧 去水印: 是 (亮度阈值={light_threshold}, 饱和度阈值={saturation_threshold})")
			
 
				+    
			
 
				     try:
			
 
				         start_time = time.time()
			
 
				         response = requests.post(
			
 
				             f"{API_BASE_URL}/ocr",
			
 
				-            json={
			
 
				-                "image_base64": image_base64,
			
 
				-                "image_format": image_format
			
 
				-            },
			
 
				+            json=request_data,
			
 
				             timeout=120
			
 
				         )
			
 
				         elapsed = time.time() - start_time
			
@@ -500,15 +618,75 @@ if __name__ == "__main__":
 
				             print("  python test_api.py          # 运行所有测试")
			
 
				             print("  python test_api.py <type>   # 测试指定类型")
			
 
				             print("  python test_api.py ocr      # 测试 OCR 接口")
			
 
				-            print("  python test_api.py ocr <image_path>  # 测试 OCR（指定图片）")
			
 
				+            print("  python test_api.py ocr <image_path>  # 测试 OCR（指定图片或txt）")
			
 
				+            print("  python test_api.py ocr <image_path> --nowm  # 测试 OCR 并去水印")
			
 
				+            print("  python test_api.py ocr <image_path> --crop  # 测试 OCR 并裁剪页眉页脚")
			
 
				+            print("  python test_api.py ocr <image_path> --nowm --crop  # 同时去水印和裁剪")
			
 
				             print("\n可用类型:")
			
 
				             for dtype in set(TEST_CASES.values()):
			
 
				                 print(f"  - {dtype}")
			
 
				             print("  - ocr  (OCR 图片识别)")
			
 
				+            print("\nOCR 去水印参数:")
			
 
				+            print("  --nowm         启用去水印")
			
 
				+            print("  --light=N      亮度阈值（0-255，默认200）")
			
 
				+            print("  --sat=N        饱和度阈值（0-255，默认30）")
			
 
				+            print("\nOCR 裁剪页眉页脚参数:")
			
 
				+            print("  --crop         启用裁剪页眉页脚（固定比例模式）")
			
 
				+            print("  --crop-auto    启用裁剪页眉页脚（自动检测模式）")
			
 
				+            print("  --header=N     页眉裁剪比例（0-1，默认0.05表示5%）")
			
 
				+            print("  --footer=N     页脚裁剪比例（0-1，默认0.05表示5%）")
			
 
				         elif doc_type == "ocr":
			
 
				-            # 测试 OCR 接口
			
 
				-            image_path = sys.argv[2] if len(sys.argv) > 2 else None
			
 
				-            test_ocr(image_path)
			
 
				+            # 解析 OCR 参数
			
 
				+            image_path = None
			
 
				+            remove_watermark = False
			
 
				+            light_threshold = 200
			
 
				+            saturation_threshold = 30
			
 
				+            crop_header_footer = False
			
 
				+            header_ratio = 0.05
			
 
				+            footer_ratio = 0.05
			
 
				+            auto_detect_header_footer = False
			
 
				+            
			
 
				+            for arg in sys.argv[2:]:
			
 
				+                if arg == "--nowm":
			
 
				+                    remove_watermark = True
			
 
				+                elif arg == "--crop":
			
 
				+                    crop_header_footer = True
			
 
				+                elif arg == "--crop-auto":
			
 
				+                    crop_header_footer = True
			
 
				+                    auto_detect_header_footer = True
			
 
				+                elif arg.startswith("--light="):
			
 
				+                    try:
			
 
				+                        light_threshold = int(arg.split("=")[1])
			
 
				+                    except ValueError:
			
 
				+                        print(f"警告: 无效的亮度阈值 {arg}，使用默认值 200")
			
 
				+                elif arg.startswith("--sat="):
			
 
				+                    try:
			
 
				+                        saturation_threshold = int(arg.split("=")[1])
			
 
				+                    except ValueError:
			
 
				+                        print(f"警告: 无效的饱和度阈值 {arg}，使用默认值 30")
			
 
				+                elif arg.startswith("--header="):
			
 
				+                    try:
			
 
				+                        header_ratio = float(arg.split("=")[1])
			
 
				+                    except ValueError:
			
 
				+                        print(f"警告: 无效的页眉比例 {arg}，使用默认值 0.05")
			
 
				+                elif arg.startswith("--footer="):
			
 
				+                    try:
			
 
				+                        footer_ratio = float(arg.split("=")[1])
			
 
				+                    except ValueError:
			
 
				+                        print(f"警告: 无效的页脚比例 {arg}，使用默认值 0.05")
			
 
				+                elif not arg.startswith("--"):
			
 
				+                    image_path = arg
			
 
				+            
			
 
				+            test_ocr(
			
 
				+                image_path, 
			
 
				+                remove_watermark, 
			
 
				+                light_threshold, 
			
 
				+                saturation_threshold,
			
 
				+                crop_header_footer,
			
 
				+                header_ratio,
			
 
				+                footer_ratio,
			
 
				+                auto_detect_header_footer
			
 
				+            )
			
 
				         else:
			
 
				             test_single(doc_type)
			
 
				     else:
			
--- a/pdf_converter_v2/test_no.py
+++ b/pdf_converter_v2/test_no.py
@@ -45,11 +45,20 @@ except ImportError:
 
				     logger.info("[附件切割] 安装命令: pip install PyPDF2")
			
 
				 
			
 
				 # 配置
			
 
				-PDF_PATH = '/home/hws/workspace/GitLab/Clerk2.5/pdf_converter_v2/2-数据源/4-（初设评审）中电联电力建设技术经济咨询中心技经〔2019〕201号关于山西周村220kV输变电工程初步设计的评审意见.pdf'
			
 
				+PDF_PATH = '/home/hws/workspace/GitLab/Clerk2.5/pdf_converter_v2/2-数据源/1-（可研评审）晋电经研规划〔2017〕187号(盖章)国网山西经研院关于山西晋城周村220kV输变电工程可行性研究报告的评审意见.pdf'
			
 
				 OUTPUT_DIR = Path('附件页')
			
 
				 USE_OCR = True  # 是否启用 OCR
			
 
				 OCR_LANG = 'chi_sim+eng'  # OCR 语言
			
 
				-DEBUG_MODE = True  # 是否启用调试模式（显示每页的文本内容）
			
 
				+DEBUG_MODE = False  # 是否启用调试模式（显示每页的文本内容）
			
 
				+
			
 
				+# 去水印配置
			
 
				+REMOVE_WATERMARK = False  # 是否对切割后的附件页PDF去水印
			
 
				+WATERMARK_LIGHT_THRESHOLD = 200  # 水印亮度阈值（0-255），高于此值的浅色像素可能是水印
			
 
				+WATERMARK_SATURATION_THRESHOLD = 30  # 水印饱和度阈值（0-255），低于此值的低饱和度像素可能是水印
			
 
				+WATERMARK_DPI = 200  # PDF转图片的DPI（用于去水印）
			
 
				+
			
 
				+# 表格附件过滤配置
			
 
				+TABLE_ONLY = True  # 是否只保留包含表格的附件页（过滤掉示意图、评审意见等）
			
 
				 
			
 
				 # 附件页识别关键词
			
 
				 ATTACHMENT_START_KEYWORDS = [
			
@@ -59,6 +68,40 @@ ATTACHMENT_START_KEYWORDS = [
 
				     '附 件：',
			
 
				 ]
			
 
				 
			
 
				+# 表格附件识别关键词（用于过滤只保留包含表格的附件）
			
 
				+TABLE_ATTACHMENT_KEYWORDS = [
			
 
				+    '项目表',
			
 
				+    '投资估算',
			
 
				+    '工程投资',
			
 
				+    '建设规模',
			
 
				+    '技术方案',
			
 
				+    '变电工程',
			
 
				+    '线路工程',
			
 
				+    '静态投资',
			
 
				+    '动态投资',
			
 
				+    '单位造价',
			
 
				+    '设备购置费',
			
 
				+    '安装工程费',
			
 
				+    '建筑工程费',
			
 
				+    '其他费用',
			
 
				+    '基本预备费',
			
 
				+]
			
 
				+
			
 
				+# 非表格附件识别关键词（用于识别需要跳过的附件）
			
 
				+NON_TABLE_ATTACHMENT_KEYWORDS = [
			
 
				+    '示意图',
			
 
				+    '接入系统示意图',
			
 
				+    '母线间隔排列图',
			
 
				+    '评审意见',
			
 
				+    '技术监督意见',
			
 
				+    '参会单位',
			
 
				+    '人员一览表',
			
 
				+    '经济性评价',
			
 
				+    '财务合规',
			
 
				+    '审核结果',
			
 
				+    '预算编制衔接',
			
 
				+]
			
 
				+
			
 
				 def ocr_page_image(image) -> str:
			
 
				     """
			
 
				     对图片进行 OCR 识别（优先使用 Tesseract，备用 PaddleOCR）
			
@@ -143,6 +186,63 @@ def extract_page_text(page, use_ocr: bool = False) -> str:
 
				     logger.warning(f"[附件切割] 第{page.page_number}页: 无法提取文本（OCR未启用或不可用）")
			
 
				     return ""
			
 
				 
			
 
				+def is_table_attachment_page(text: str, page) -> bool:
			
 
				+    """
			
 
				+    判断是否是包含表格的附件页
			
 
				+    
			
 
				+    Args:
			
 
				+        text: 页面文本
			
 
				+        page: pdfplumber page 对象
			
 
				+    
			
 
				+    Returns:
			
 
				+        bool: 是否是表格附件页
			
 
				+    """
			
 
				+    if not text:
			
 
				+        return False
			
 
				+    
			
 
				+    text_no_space = text.replace(' ', '').replace('\u3000', '')
			
 
				+    
			
 
				+    # 检查是否包含非表格附件关键词（如示意图、评审意见等）
			
 
				+    for keyword in NON_TABLE_ATTACHMENT_KEYWORDS:
			
 
				+        keyword_no_space = keyword.replace(' ', '').replace('\u3000', '')
			
 
				+        if keyword_no_space in text_no_space:
			
 
				+            logger.debug(f"[附件切割] 检测到非表格附件关键词: {keyword}")
			
 
				+            return False
			
 
				+    
			
 
				+    # 检查是否包含表格附件关键词
			
 
				+    has_table_keyword = False
			
 
				+    for keyword in TABLE_ATTACHMENT_KEYWORDS:
			
 
				+        keyword_no_space = keyword.replace(' ', '').replace('\u3000', '')
			
 
				+        if keyword_no_space in text_no_space:
			
 
				+            logger.debug(f"[附件切割] 检测到表格关键词: {keyword}")
			
 
				+            has_table_keyword = True
			
 
				+            break
			
 
				+    
			
 
				+    # 如果有表格关键词，直接返回True
			
 
				+    if has_table_keyword:
			
 
				+        return True
			
 
				+    
			
 
				+    # 检查页面是否包含表格（使用pdfplumber的表格检测）
			
 
				+    if page is not None:
			
 
				+        try:
			
 
				+            tables = page.extract_tables()
			
 
				+            if tables and len(tables) > 0:
			
 
				+                # 检查表格是否足够大（至少有3行3列的数据表格）
			
 
				+                for table in tables:
			
 
				+                    if table and len(table) >= 3:
			
 
				+                        # 检查是否有多列
			
 
				+                        non_empty_rows = [row for row in table if row and any(cell for cell in row if cell)]
			
 
				+                        if len(non_empty_rows) >= 3:
			
 
				+                            row_with_most_cols = max(non_empty_rows, key=lambda r: len([c for c in r if c]))
			
 
				+                            if len([c for c in row_with_most_cols if c]) >= 3:
			
 
				+                                logger.debug(f"[附件切割] 检测到表格: {len(non_empty_rows)}行")
			
 
				+                                return True
			
 
				+        except Exception as e:
			
 
				+            logger.warning(f"[附件切割] 表格检测失败: {e}")
			
 
				+    
			
 
				+    return False
			
 
				+
			
 
				+
			
 
				 def is_attachment_start_page(text: str) -> bool:
			
 
				     """
			
 
				     判断是否是附件清单页（附件开始的前一页）
			
@@ -339,7 +439,10 @@ def extract_pages(pdf_path: str, page_numbers: list, output_path: str):
 
				     logger.info(f"[附件切割] 已保存到: {output_path}")
			
 
				     print(f"✓ 已保存到: {output_path}")
			
 
				 
			
 
				-def split_attachment_pages(pdf_path: str, output_dir: Path, use_ocr: bool = False, debug: bool = False):
			
 
				+def split_attachment_pages(pdf_path: str, output_dir: Path, use_ocr: bool = False, debug: bool = False, 
			
 
				+                          remove_watermark: bool = False, watermark_light_threshold: int = 200,
			
 
				+                          watermark_saturation_threshold: int = 30, watermark_dpi: int = 200,
			
 
				+                          table_only: bool = False):
			
 
				     """
			
 
				     查找并切割附件页
			
 
				     
			
@@ -348,8 +451,14 @@ def split_attachment_pages(pdf_path: str, output_dir: Path, use_ocr: bool = Fals
 
				         output_dir: 输出目录
			
 
				         use_ocr: 是否使用 OCR
			
 
				         debug: 是否输出调试信息
			
 
				+        remove_watermark: 是否对切割后的附件页PDF去水印
			
 
				+        watermark_light_threshold: 水印亮度阈值（0-255）
			
 
				+        watermark_saturation_threshold: 水印饱和度阈值（0-255）
			
 
				+        watermark_dpi: PDF转图片的DPI
			
 
				+        table_only: 是否只保留包含表格的附件页（过滤掉示意图、评审意见等）
			
 
				     """
			
 
				     logger.info(f"[附件切割] 开始处理PDF: {pdf_path}")
			
 
				+    logger.info(f"[附件切割] 只保留表格附件: {'是' if table_only else '否'}")
			
 
				     
			
 
				     # 查找附件开始页
			
 
				     attachment_start = find_attachment_start_page(pdf_path, use_ocr=use_ocr, debug=debug)
			
@@ -359,16 +468,65 @@ def split_attachment_pages(pdf_path: str, output_dir: Path, use_ocr: bool = Fals
 
				         print("\n未找到附件页")
			
 
				         return
			
 
				     
			
 
				-    # 获取总页数
			
 
				+    # 获取总页数和筛选表格附件页
			
 
				     with pdfplumber.open(pdf_path) as pdf:
			
 
				         total_pages = len(pdf.pages)
			
 
				-    
			
 
				-    # 附件页范围：从附件开始页到最后一页
			
 
				-    attachment_pages = list(range(attachment_start, total_pages + 1))
			
 
				-    
			
 
				-    logger.info(f"[附件切割] 附件页范围: {attachment_start}-{total_pages}, 共 {len(attachment_pages)} 页")
			
 
				-    print(f"\n附件页范围: 第 {attachment_start} 页 到 第 {total_pages} 页")
			
 
				-    print(f"共 {len(attachment_pages)} 页")
			
 
				+        
			
 
				+        if table_only:
			
 
				+            # 只保留包含表格的附件页
			
 
				+            logger.info(f"[附件切割] 启用表格附件过滤，开始筛选...")
			
 
				+            print(f"\n启用表格附件过滤，开始筛选...")
			
 
				+            
			
 
				+            attachment_pages = []
			
 
				+            current_table_section = []  # 当前表格区段的页面
			
 
				+            in_table_section = False  # 是否在表格区段内
			
 
				+            
			
 
				+            for page_num in range(attachment_start, total_pages + 1):
			
 
				+                page = pdf.pages[page_num - 1]
			
 
				+                text = extract_page_text(page, use_ocr=use_ocr)
			
 
				+                
			
 
				+                is_table_page = is_table_attachment_page(text, page)
			
 
				+                
			
 
				+                if debug:
			
 
				+                    print(f"  页面 {page_num}: {'表格页' if is_table_page else '非表格页'}")
			
 
				+                
			
 
				+                if is_table_page:
			
 
				+                    if not in_table_section:
			
 
				+                        # 开始新的表格区段
			
 
				+                        in_table_section = True
			
 
				+                        current_table_section = [page_num]
			
 
				+                        logger.debug(f"[附件切割] 开始表格区段: 第 {page_num} 页")
			
 
				+                    else:
			
 
				+                        # 继续当前表格区段
			
 
				+                        current_table_section.append(page_num)
			
 
				+                else:
			
 
				+                    if in_table_section:
			
 
				+                        # 结束当前表格区段，保存
			
 
				+                        attachment_pages.extend(current_table_section)
			
 
				+                        logger.info(f"[附件切割] 表格区段结束: {current_table_section[0]}-{current_table_section[-1]}")
			
 
				+                        current_table_section = []
			
 
				+                        in_table_section = False
			
 
				+            
			
 
				+            # 处理最后一个表格区段
			
 
				+            if in_table_section and current_table_section:
			
 
				+                attachment_pages.extend(current_table_section)
			
 
				+                logger.info(f"[附件切割] 最后表格区段: {current_table_section[0]}-{current_table_section[-1]}")
			
 
				+            
			
 
				+            if not attachment_pages:
			
 
				+                logger.warning(f"[附件切割] 未找到包含表格的附件页")
			
 
				+                print("\n未找到包含表格的附件页")
			
 
				+                return
			
 
				+            
			
 
				+            logger.info(f"[附件切割] 筛选后的表格附件页: {attachment_pages}")
			
 
				+            print(f"\n筛选后的表格附件页: {attachment_pages}")
			
 
				+            print(f"共 {len(attachment_pages)} 页")
			
 
				+        else:
			
 
				+            # 附件页范围：从附件开始页到最后一页
			
 
				+            attachment_pages = list(range(attachment_start, total_pages + 1))
			
 
				+            
			
 
				+            logger.info(f"[附件切割] 附件页范围: {attachment_start}-{total_pages}, 共 {len(attachment_pages)} 页")
			
 
				+            print(f"\n附件页范围: 第 {attachment_start} 页 到 第 {total_pages} 页")
			
 
				+            print(f"共 {len(attachment_pages)} 页")
			
 
				     
			
 
				     # 切割附件页
			
 
				     print("\n" + "=" * 60)
			
@@ -379,14 +537,70 @@ def split_attachment_pages(pdf_path: str, output_dir: Path, use_ocr: bool = Fals
 
				     output_dir.mkdir(parents=True, exist_ok=True)
			
 
				     
			
 
				     # 保存所有附件页为一个文件
			
 
				-    output_file = output_dir / f"{pdf_path.stem}_附件页_{attachment_start}-{total_pages}.pdf"
			
 
				+    if table_only:
			
 
				+        # 表格附件模式：使用筛选后的页面范围
			
 
				+        page_range_str = f"{min(attachment_pages)}_{max(attachment_pages)}" if attachment_pages else "none"
			
 
				+        output_file = output_dir / f"{pdf_path.stem}_表格附件页_{page_range_str}.pdf"
			
 
				+    else:
			
 
				+        output_file = output_dir / f"{pdf_path.stem}_附件页_{attachment_start}-{total_pages}.pdf"
			
 
				+    
			
 
				     logger.info(f"[附件切割] 输出文件: {output_file}")
			
 
				     extract_pages(pdf_path, attachment_pages, output_file)
			
 
				     
			
 
				     logger.info(f"[附件切割] 切割完成: {len(attachment_pages)} 页附件已保存")
			
 
				     print(f"\n✓ 切割完成！")
			
 
				     print(f"附件页数: {len(attachment_pages)} 页")
			
 
				-    print(f"输出目录: {output_dir.absolute()}")
			
 
				+    print(f"输出文件: {output_file}")
			
 
				+    
			
 
				+    # 如果启用去水印，对切割后的附件页PDF进行去水印处理
			
 
				+    if remove_watermark:
			
 
				+        logger.info(f"[附件切割] 开始对附件页PDF进行去水印处理...")
			
 
				+        print("\n" + "=" * 60)
			
 
				+        print("开始去水印处理")
			
 
				+        print("=" * 60)
			
 
				+        
			
 
				+        try:
			
 
				+            # 导入去水印模块
			
 
				+            import sys
			
 
				+            from pathlib import Path as PathLib
			
 
				+            sys.path.insert(0, str(PathLib(__file__).parent))
			
 
				+            
			
 
				+            from utils.pdf_watermark_remover import remove_watermark_from_pdf
			
 
				+            
			
 
				+            # 去水印后的PDF路径
			
 
				+            nowm_output_file = output_dir / f"{output_file.stem}_nowm.pdf"
			
 
				+            
			
 
				+            logger.info(f"[附件切割] 去水印参数: 亮度阈值={watermark_light_threshold}, 饱和度阈值={watermark_saturation_threshold}, DPI={watermark_dpi}")
			
 
				+            print(f"去水印参数:")
			
 
				+            print(f"  - 亮度阈值: {watermark_light_threshold}")
			
 
				+            print(f"  - 饱和度阈值: {watermark_saturation_threshold}")
			
 
				+            print(f"  - DPI: {watermark_dpi}")
			
 
				+            
			
 
				+            # 执行去水印
			
 
				+            success = remove_watermark_from_pdf(
			
 
				+                input_pdf=str(output_file),
			
 
				+                output_pdf=str(nowm_output_file),
			
 
				+                light_threshold=watermark_light_threshold,
			
 
				+                saturation_threshold=watermark_saturation_threshold,
			
 
				+                dpi=watermark_dpi
			
 
				+            )
			
 
				+            
			
 
				+            if success and nowm_output_file.exists():
			
 
				+                logger.info(f"[附件切割] 去水印完成: {nowm_output_file}")
			
 
				+                print(f"\n✓ 去水印完成！")
			
 
				+                print(f"去水印后的文件: {nowm_output_file}")
			
 
				+            else:
			
 
				+                logger.warning(f"[附件切割] 去水印失败")
			
 
				+                print(f"\n⚠ 去水印失败，请检查日志")
			
 
				+        except ImportError as e:
			
 
				+            logger.error(f"[附件切割] 导入去水印模块失败: {e}")
			
 
				+            print(f"\n⚠ 去水印模块导入失败: {e}")
			
 
				+            print("请确保 utils/pdf_watermark_remover.py 文件存在")
			
 
				+        except Exception as e:
			
 
				+            logger.exception(f"[附件切割] 去水印处理失败: {e}")
			
 
				+            print(f"\n⚠ 去水印处理失败: {e}")
			
 
				+    
			
 
				+    print(f"\n输出目录: {output_dir.absolute()}")
			
 
				 
			
 
				 if __name__ == '__main__':
			
 
				     logger.info("[附件切割] " + "=" * 50)
			
@@ -397,6 +611,19 @@ if __name__ == '__main__':
 
				     print("PDF 附件页识别和切割工具")
			
 
				     print("=" * 60)
			
 
				     
			
 
				+    # 显示配置信息
			
 
				+    print("\n配置信息:")
			
 
				+    print(f"  - PDF文件: {PDF_PATH}")
			
 
				+    print(f"  - 输出目录: {OUTPUT_DIR}")
			
 
				+    print(f"  - OCR: {'启用' if USE_OCR else '禁用'}")
			
 
				+    print(f"  - 调试模式: {'启用' if DEBUG_MODE else '禁用'}")
			
 
				+    print(f"  - 只保留表格附件: {'启用' if TABLE_ONLY else '禁用'}")
			
 
				+    print(f"  - 去水印: {'启用' if REMOVE_WATERMARK else '禁用'}")
			
 
				+    if REMOVE_WATERMARK:
			
 
				+        print(f"    * 亮度阈值: {WATERMARK_LIGHT_THRESHOLD}")
			
 
				+        print(f"    * 饱和度阈值: {WATERMARK_SATURATION_THRESHOLD}")
			
 
				+        print(f"    * DPI: {WATERMARK_DPI}")
			
 
				+    
			
 
				     # 检查依赖
			
 
				     if not TESSERACT_AVAILABLE and USE_OCR:
			
 
				         logger.warning("[附件切割] OCR 功能不可用")
			
@@ -412,7 +639,26 @@ if __name__ == '__main__':
 
				         print("安装方法:")
			
 
				         print("  pip install PyPDF2\n")
			
 
				     
			
 
				+    if REMOVE_WATERMARK:
			
 
				+        print("\n⚠ 去水印功能需要以下依赖:")
			
 
				+        print("  - OpenCV (cv2)")
			
 
				+        print("  - Pillow (PIL)")
			
 
				+        print("  - pdf2image")
			
 
				+        print("  - PyPDF2")
			
 
				+        print("安装命令:")
			
 
				+        print("  pip install opencv-python pillow pdf2image PyPDF2\n")
			
 
				+    
			
 
				     # 执行切割
			
 
				-    logger.info(f"[附件切割] 配置: PDF={PDF_PATH}, 输出={OUTPUT_DIR}, OCR={USE_OCR}, DEBUG={DEBUG_MODE}")
			
 
				-    split_attachment_pages(PDF_PATH, OUTPUT_DIR, use_ocr=USE_OCR, debug=DEBUG_MODE)
			
 
				+    logger.info(f"[附件切割] 配置: PDF={PDF_PATH}, 输出={OUTPUT_DIR}, OCR={USE_OCR}, DEBUG={DEBUG_MODE}, 表格附件={TABLE_ONLY}, 去水印={REMOVE_WATERMARK}")
			
 
				+    split_attachment_pages(
			
 
				+        PDF_PATH, 
			
 
				+        OUTPUT_DIR, 
			
 
				+        use_ocr=USE_OCR, 
			
 
				+        debug=DEBUG_MODE,
			
 
				+        remove_watermark=REMOVE_WATERMARK,
			
 
				+        watermark_light_threshold=WATERMARK_LIGHT_THRESHOLD,
			
 
				+        watermark_saturation_threshold=WATERMARK_SATURATION_THRESHOLD,
			
 
				+        watermark_dpi=WATERMARK_DPI,
			
 
				+        table_only=TABLE_ONLY
			
 
				+    )
			
 
				     logger.info("[附件切割] 程序执行完成")
			
--- a/pdf_converter_v2/utils/image_preprocessor.py
+++ b/pdf_converter_v2/utils/image_preprocessor.py
@@ -0,0 +1,526 @@
 
				+"""
			
 
				+图像预处理工具 - 包含去水印等功能
			
 
				+
			
 
				+支持的预处理操作：
			
 
				+- 去水印（颜色过滤法）
			
 
				+- 灰度转换
			
 
				+- 二值化
			
 
				+- 去噪
			
 
				+"""
			
 
				+
			
 
				+import numpy as np
			
 
				+from pathlib import Path
			
 
				+from typing import Optional, Tuple
			
 
				+from loguru import logger
			
 
				+
			
 
				+try:
			
 
				+    from PIL import Image
			
 
				+    PIL_AVAILABLE = True
			
 
				+except ImportError:
			
 
				+    PIL_AVAILABLE = False
			
 
				+    logger.warning("[图像预处理] PIL 未安装，部分功能不可用")
			
 
				+
			
 
				+try:
			
 
				+    import cv2
			
 
				+    CV2_AVAILABLE = True
			
 
				+except ImportError:
			
 
				+    CV2_AVAILABLE = False
			
 
				+    logger.warning("[图像预处理] OpenCV 未安装，部分功能不可用")
			
 
				+
			
 
				+
			
 
				+def remove_watermark(
			
 
				+    image_path: str,
			
 
				+    output_path: Optional[str] = None,
			
 
				+    light_threshold: int = 200,
			
 
				+    saturation_threshold: int = 30,
			
 
				+    method: str = "auto"
			
 
				+) -> str:
			
 
				+    """
			
 
				+    去除图片水印
			
 
				+    
			
 
				+    原理：大多数水印是浅色或半透明的，通过以下方式去除：
			
 
				+    1. 将浅色像素（亮度高、饱和度低）替换为白色
			
 
				+    2. 保留深色文字内容
			
 
				+    
			
 
				+    Args:
			
 
				+        image_path: 输入图片路径
			
 
				+        output_path: 输出图片路径，默认在原文件名后加 _nowm
			
 
				+        light_threshold: 亮度阈值（0-255），高于此值的浅色像素可能是水印
			
 
				+        saturation_threshold: 饱和度阈值（0-255），低于此值的低饱和度像素可能是水印
			
 
				+        method: 去水印方法
			
 
				+            - "auto": 自动选择最佳方法
			
 
				+            - "light": 基于亮度的简单方法（快速）
			
 
				+            - "hsv": 基于HSV颜色空间的方法（更精确）
			
 
				+            - "adaptive": 自适应阈值方法
			
 
				+    
			
 
				+    Returns:
			
 
				+        处理后的图片路径
			
 
				+    """
			
 
				+    if not CV2_AVAILABLE:
			
 
				+        logger.warning("[去水印] OpenCV 未安装，跳过去水印处理")
			
 
				+        return image_path
			
 
				+    
			
 
				+    logger.info(f"[去水印] 开始处理: {image_path}")
			
 
				+    logger.info(f"[去水印] 方法: {method}, 亮度阈值: {light_threshold}, 饱和度阈值: {saturation_threshold}")
			
 
				+    
			
 
				+    # 读取图片
			
 
				+    img = cv2.imread(image_path)
			
 
				+    if img is None:
			
 
				+        logger.error(f"[去水印] 无法读取图片: {image_path}")
			
 
				+        return image_path
			
 
				+    
			
 
				+    original_shape = img.shape
			
 
				+    logger.info(f"[去水印] 图片尺寸: {original_shape[1]}x{original_shape[0]}")
			
 
				+    
			
 
				+    # 根据方法选择处理逻辑
			
 
				+    if method == "auto":
			
 
				+        # 自动检测：先尝试 HSV 方法，如果效果不好则用 adaptive
			
 
				+        method = "hsv"
			
 
				+    
			
 
				+    if method == "light":
			
 
				+        # 简单亮度方法：将浅色像素替换为白色
			
 
				+        result = _remove_watermark_light(img, light_threshold)
			
 
				+    elif method == "hsv":
			
 
				+        # HSV 方法：基于亮度和饱和度
			
 
				+        result = _remove_watermark_hsv(img, light_threshold, saturation_threshold)
			
 
				+    elif method == "adaptive":
			
 
				+        # 自适应方法：使用自适应阈值
			
 
				+        result = _remove_watermark_adaptive(img)
			
 
				+    else:
			
 
				+        logger.warning(f"[去水印] 未知方法: {method}，使用 hsv")
			
 
				+        result = _remove_watermark_hsv(img, light_threshold, saturation_threshold)
			
 
				+    
			
 
				+    # 确定输出路径
			
 
				+    if output_path is None:
			
 
				+        path = Path(image_path)
			
 
				+        output_path = str(path.parent / f"{path.stem}_nowm{path.suffix}")
			
 
				+    
			
 
				+    # 保存结果
			
 
				+    cv2.imwrite(output_path, result)
			
 
				+    logger.info(f"[去水印] 处理完成，保存到: {output_path}")
			
 
				+    
			
 
				+    return output_path
			
 
				+
			
 
				+
			
 
				+def _remove_watermark_light(img: np.ndarray, threshold: int = 200) -> np.ndarray:
			
 
				+    """
			
 
				+    简单亮度方法：将浅色像素替换为白色
			
 
				+    
			
 
				+    适用于：浅色/灰色水印
			
 
				+    """
			
 
				+    # 转为灰度图
			
 
				+    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
			
 
				+    
			
 
				+    # 创建掩码：亮度高于阈值的区域
			
 
				+    mask = gray > threshold
			
 
				+    
			
 
				+    # 将掩码区域设为白色
			
 
				+    result = img.copy()
			
 
				+    result[mask] = [255, 255, 255]
			
 
				+    
			
 
				+    return result
			
 
				+
			
 
				+
			
 
				+def _remove_watermark_hsv(
			
 
				+    img: np.ndarray,
			
 
				+    light_threshold: int = 200,
			
 
				+    saturation_threshold: int = 30
			
 
				+) -> np.ndarray:
			
 
				+    """
			
 
				+    HSV 方法：基于亮度和饱和度去除水印
			
 
				+    
			
 
				+    原理：水印通常是高亮度、低饱和度的
			
 
				+    适用于：彩色水印、半透明水印
			
 
				+    """
			
 
				+    # 转换到 HSV 颜色空间
			
 
				+    hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
			
 
				+    
			
 
				+    # 分离通道
			
 
				+    h, s, v = cv2.split(hsv)
			
 
				+    
			
 
				+    # 创建水印掩码：高亮度 AND 低饱和度
			
 
				+    watermark_mask = (v > light_threshold) & (s < saturation_threshold)
			
 
				+    
			
 
				+    # 将水印区域设为白色
			
 
				+    result = img.copy()
			
 
				+    result[watermark_mask] = [255, 255, 255]
			
 
				+    
			
 
				+    # 可选：对边缘进行平滑处理
			
 
				+    # kernel = np.ones((3, 3), np.uint8)
			
 
				+    # watermark_mask_dilated = cv2.dilate(watermark_mask.astype(np.uint8), kernel, iterations=1)
			
 
				+    # result[watermark_mask_dilated == 1] = [255, 255, 255]
			
 
				+    
			
 
				+    return result
			
 
				+
			
 
				+
			
 
				+def _remove_watermark_adaptive(img: np.ndarray) -> np.ndarray:
			
 
				+    """
			
 
				+    自适应阈值方法
			
 
				+    
			
 
				+    适用于：复杂背景、不均匀光照
			
 
				+    """
			
 
				+    # 转为灰度图
			
 
				+    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
			
 
				+    
			
 
				+    # 使用自适应阈值
			
 
				+    # 这会根据局部区域计算阈值，保留文字，去除背景和水印
			
 
				+    binary = cv2.adaptiveThreshold(
			
 
				+        gray, 255,
			
 
				+        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
			
 
				+        cv2.THRESH_BINARY,
			
 
				+        blockSize=15,
			
 
				+        C=10
			
 
				+    )
			
 
				+    
			
 
				+    # 转回 BGR（3通道）
			
 
				+    result = cv2.cvtColor(binary, cv2.COLOR_GRAY2BGR)
			
 
				+    
			
 
				+    return result
			
 
				+
			
 
				+
			
 
				+def enhance_for_ocr(
			
 
				+    image_path: str,
			
 
				+    output_path: Optional[str] = None,
			
 
				+    remove_wm: bool = True,
			
 
				+    denoise: bool = True,
			
 
				+    sharpen: bool = False
			
 
				+) -> str:
			
 
				+    """
			
 
				+    OCR 预处理增强
			
 
				+    
			
 
				+    组合多种预处理操作，优化 OCR 识别效果
			
 
				+    
			
 
				+    Args:
			
 
				+        image_path: 输入图片路径
			
 
				+        output_path: 输出图片路径
			
 
				+        remove_wm: 是否去除水印
			
 
				+        denoise: 是否去噪
			
 
				+        sharpen: 是否锐化
			
 
				+    
			
 
				+    Returns:
			
 
				+        处理后的图片路径
			
 
				+    """
			
 
				+    if not CV2_AVAILABLE:
			
 
				+        logger.warning("[OCR预处理] OpenCV 未安装，跳过预处理")
			
 
				+        return image_path
			
 
				+    
			
 
				+    logger.info(f"[OCR预处理] 开始处理: {image_path}")
			
 
				+    
			
 
				+    # 读取图片
			
 
				+    img = cv2.imread(image_path)
			
 
				+    if img is None:
			
 
				+        logger.error(f"[OCR预处理] 无法读取图片: {image_path}")
			
 
				+        return image_path
			
 
				+    
			
 
				+    result = img.copy()
			
 
				+    
			
 
				+    # 1. 去水印
			
 
				+    if remove_wm:
			
 
				+        result = _remove_watermark_hsv(result)
			
 
				+        logger.info("[OCR预处理] 已去除水印")
			
 
				+    
			
 
				+    # 2. 去噪
			
 
				+    if denoise:
			
 
				+        result = cv2.fastNlMeansDenoisingColored(result, None, 10, 10, 7, 21)
			
 
				+        logger.info("[OCR预处理] 已去噪")
			
 
				+    
			
 
				+    # 3. 锐化
			
 
				+    if sharpen:
			
 
				+        kernel = np.array([[-1, -1, -1],
			
 
				+                          [-1,  9, -1],
			
 
				+                          [-1, -1, -1]])
			
 
				+        result = cv2.filter2D(result, -1, kernel)
			
 
				+        logger.info("[OCR预处理] 已锐化")
			
 
				+    
			
 
				+    # 确定输出路径
			
 
				+    if output_path is None:
			
 
				+        path = Path(image_path)
			
 
				+        output_path = str(path.parent / f"{path.stem}_enhanced{path.suffix}")
			
 
				+    
			
 
				+    # 保存结果
			
 
				+    cv2.imwrite(output_path, result)
			
 
				+    logger.info(f"[OCR预处理] 处理完成，保存到: {output_path}")
			
 
				+    
			
 
				+    return output_path
			
 
				+
			
 
				+
			
 
				+def check_opencv_available() -> bool:
			
 
				+    """检查 OpenCV 是否可用"""
			
 
				+    return CV2_AVAILABLE
			
 
				+
			
 
				+
			
 
				+def crop_header_footer(
			
 
				+    image_path: str,
			
 
				+    output_path: Optional[str] = None,
			
 
				+    header_ratio: float = 0.05,
			
 
				+    footer_ratio: float = 0.05,
			
 
				+    auto_detect: bool = False
			
 
				+) -> str:
			
 
				+    """
			
 
				+    裁剪图片的页眉和页脚区域
			
 
				+    
			
 
				+    通过按比例裁剪图片顶部和底部来去除页眉页脚
			
 
				+    
			
 
				+    Args:
			
 
				+        image_path: 输入图片路径
			
 
				+        output_path: 输出图片路径，默认在原文件名后加 _cropped
			
 
				+        header_ratio: 页眉裁剪比例（0-1），默认0.05表示裁剪顶部5%
			
 
				+        footer_ratio: 页脚裁剪比例（0-1），默认0.05表示裁剪底部5%
			
 
				+        auto_detect: 是否自动检测页眉页脚边界（忽略 header_ratio 和 footer_ratio）
			
 
				+    
			
 
				+    Returns:
			
 
				+        处理后的图片路径
			
 
				+    """
			
 
				+    if not CV2_AVAILABLE:
			
 
				+        logger.warning("[裁剪页眉页脚] OpenCV 未安装，跳过处理")
			
 
				+        return image_path
			
 
				+    
			
 
				+    logger.info(f"[裁剪页眉页脚] 开始处理: {image_path}")
			
 
				+    
			
 
				+    # 读取图片
			
 
				+    img = cv2.imread(image_path)
			
 
				+    if img is None:
			
 
				+        logger.error(f"[裁剪页眉页脚] 无法读取图片: {image_path}")
			
 
				+        return image_path
			
 
				+    
			
 
				+    height, width = img.shape[:2]
			
 
				+    logger.info(f"[裁剪页眉页脚] 原始尺寸: {width}x{height}")
			
 
				+    
			
 
				+    if auto_detect:
			
 
				+        # 自动检测页眉页脚边界
			
 
				+        logger.info("[裁剪页眉页脚] 使用自动检测模式")
			
 
				+        header_pixels, footer_pixels = _detect_header_footer_boundaries(img)
			
 
				+        logger.info(f"[裁剪页眉页脚] 自动检测结果: 页眉={header_pixels}px, 页脚={footer_pixels}px")
			
 
				+    else:
			
 
				+        # 使用固定比例
			
 
				+        logger.info(f"[裁剪页眉页脚] 使用固定比例: 页眉={header_ratio}, 页脚={footer_ratio}")
			
 
				+        header_pixels = int(height * header_ratio)
			
 
				+        footer_pixels = int(height * footer_ratio)
			
 
				+    
			
 
				+    # 裁剪图片（保留中间部分）
			
 
				+    top = header_pixels
			
 
				+    bottom = height - footer_pixels
			
 
				+    
			
 
				+    if top >= bottom:
			
 
				+        logger.warning("[裁剪页眉页脚] 裁剪区域无效，跳过处理")
			
 
				+        return image_path
			
 
				+    
			
 
				+    result = img[top:bottom, :]
			
 
				+    
			
 
				+    new_height = result.shape[0]
			
 
				+    logger.info(f"[裁剪页眉页脚] 裁剪后尺寸: {width}x{new_height}")
			
 
				+    logger.info(f"[裁剪页眉页脚] 裁剪了顶部 {header_pixels}px，底部 {footer_pixels}px")
			
 
				+    
			
 
				+    # 确定输出路径
			
 
				+    if output_path is None:
			
 
				+        path = Path(image_path)
			
 
				+        output_path = str(path.parent / f"{path.stem}_cropped{path.suffix}")
			
 
				+    
			
 
				+    # 保存结果
			
 
				+    cv2.imwrite(output_path, result)
			
 
				+    logger.info(f"[裁剪页眉页脚] 处理完成，保存到: {output_path}")
			
 
				+    
			
 
				+    return output_path
			
 
				+
			
 
				+
			
 
				+def _detect_header_footer_boundaries(img: np.ndarray) -> Tuple[int, int]:
			
 
				+    """
			
 
				+    自动检测页眉页脚边界
			
 
				+    
			
 
				+    使用多种方法综合判断：
			
 
				+    1. 水平线检测 - 检测分隔线
			
 
				+    2. 文本密度分析 - 页眉页脚通常文字较少
			
 
				+    3. 空白区域检测 - 检测大面积空白
			
 
				+    
			
 
				+    Args:
			
 
				+        img: 输入图片（BGR格式）
			
 
				+    
			
 
				+    Returns:
			
 
				+        (header_pixels, footer_pixels): 页眉和页脚的像素高度
			
 
				+    """
			
 
				+    height, width = img.shape[:2]
			
 
				+    
			
 
				+    # 转为灰度图
			
 
				+    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
			
 
				+    
			
 
				+    # 定义搜索范围（页眉页脚通常在顶部/底部 15% 以内）
			
 
				+    search_range = int(height * 0.15)
			
 
				+    min_margin = int(height * 0.02)  # 最小边距 2%
			
 
				+    
			
 
				+    # 方法1: 检测水平线
			
 
				+    header_line = _find_horizontal_line(gray, 0, search_range, from_top=True)
			
 
				+    footer_line = _find_horizontal_line(gray, height - search_range, height, from_top=False)
			
 
				+    
			
 
				+    # 方法2: 分析文本密度变化
			
 
				+    header_density = _find_content_boundary(gray, 0, search_range, from_top=True)
			
 
				+    footer_density = _find_content_boundary(gray, height - search_range, height, from_top=False)
			
 
				+    
			
 
				+    # 综合判断：取最可靠的结果
			
 
				+    # 优先使用水平线检测结果，其次使用密度分析结果
			
 
				+    if header_line > min_margin:
			
 
				+        header_pixels = header_line
			
 
				+        logger.debug(f"[自动检测] 页眉: 使用水平线检测结果 {header_pixels}px")
			
 
				+    elif header_density > min_margin:
			
 
				+        header_pixels = header_density
			
 
				+        logger.debug(f"[自动检测] 页眉: 使用密度分析结果 {header_pixels}px")
			
 
				+    else:
			
 
				+        header_pixels = min_margin
			
 
				+        logger.debug(f"[自动检测] 页眉: 使用最小边距 {header_pixels}px")
			
 
				+    
			
 
				+    if footer_line > min_margin:
			
 
				+        footer_pixels = footer_line
			
 
				+        logger.debug(f"[自动检测] 页脚: 使用水平线检测结果 {footer_pixels}px")
			
 
				+    elif footer_density > min_margin:
			
 
				+        footer_pixels = footer_density
			
 
				+        logger.debug(f"[自动检测] 页脚: 使用密度分析结果 {footer_pixels}px")
			
 
				+    else:
			
 
				+        footer_pixels = min_margin
			
 
				+        logger.debug(f"[自动检测] 页脚: 使用最小边距 {footer_pixels}px")
			
 
				+    
			
 
				+    return header_pixels, footer_pixels
			
 
				+
			
 
				+
			
 
				+def _find_horizontal_line(
			
 
				+    gray: np.ndarray,
			
 
				+    start_y: int,
			
 
				+    end_y: int,
			
 
				+    from_top: bool = True
			
 
				+) -> int:
			
 
				+    """
			
 
				+    在指定区域内查找水平分隔线
			
 
				+    
			
 
				+    Args:
			
 
				+        gray: 灰度图
			
 
				+        start_y: 搜索起始y坐标
			
 
				+        end_y: 搜索结束y坐标
			
 
				+        from_top: True表示从上往下找，False表示从下往上找
			
 
				+    
			
 
				+    Returns:
			
 
				+        分隔线位置（像素），如果没找到返回0
			
 
				+    """
			
 
				+    height, width = gray.shape
			
 
				+    
			
 
				+    # 使用 Canny 边缘检测
			
 
				+    edges = cv2.Canny(gray[start_y:end_y, :], 50, 150)
			
 
				+    
			
 
				+    # 使用霍夫变换检测直线
			
 
				+    lines = cv2.HoughLinesP(
			
 
				+        edges,
			
 
				+        rho=1,
			
 
				+        theta=np.pi/180,
			
 
				+        threshold=int(width * 0.5),  # 线长度至少为图片宽度的50%
			
 
				+        minLineLength=int(width * 0.4),
			
 
				+        maxLineGap=20
			
 
				+    )
			
 
				+    
			
 
				+    if lines is None:
			
 
				+        return 0
			
 
				+    
			
 
				+    # 筛选水平线（角度接近0或180度）
			
 
				+    horizontal_lines = []
			
 
				+    for line in lines:
			
 
				+        x1, y1, x2, y2 = line[0]
			
 
				+        # 计算角度
			
 
				+        angle = abs(np.arctan2(y2 - y1, x2 - x1) * 180 / np.pi)
			
 
				+        # 水平线角度应该接近 0 或 180
			
 
				+        if angle < 5 or angle > 175:
			
 
				+            avg_y = (y1 + y2) // 2 + start_y
			
 
				+            horizontal_lines.append(avg_y)
			
 
				+    
			
 
				+    if not horizontal_lines:
			
 
				+        return 0
			
 
				+    
			
 
				+    # 根据方向返回最合适的线
			
 
				+    if from_top:
			
 
				+        # 从上往下，返回最下面的水平线（作为页眉下边界）
			
 
				+        return max(horizontal_lines)
			
 
				+    else:
			
 
				+        # 从下往上，返回距离底部的距离
			
 
				+        return height - min(horizontal_lines)
			
 
				+
			
 
				+
			
 
				+def _find_content_boundary(
			
 
				+    gray: np.ndarray,
			
 
				+    start_y: int,
			
 
				+    end_y: int,
			
 
				+    from_top: bool = True
			
 
				+) -> int:
			
 
				+    """
			
 
				+    通过分析文本/内容密度找到内容边界
			
 
				+    
			
 
				+    原理：页眉页脚区域通常是空白或只有少量文字，
			
 
				+    正文区域文字密度较高。通过检测密度突变点来确定边界。
			
 
				+    
			
 
				+    Args:
			
 
				+        gray: 灰度图
			
 
				+        start_y: 搜索起始y坐标
			
 
				+        end_y: 搜索结束y坐标
			
 
				+        from_top: True表示从上往下找，False表示从下往上找
			
 
				+    
			
 
				+    Returns:
			
 
				+        内容边界位置（像素），如果没找到返回0
			
 
				+    """
			
 
				+    height, width = gray.shape
			
 
				+    
			
 
				+    # 二值化
			
 
				+    _, binary = cv2.threshold(gray, 200, 255, cv2.THRESH_BINARY_INV)
			
 
				+    
			
 
				+    # 计算每一行的像素密度（黑色像素占比）
			
 
				+    row_densities = []
			
 
				+    for y in range(start_y, end_y):
			
 
				+        row = binary[y, :]
			
 
				+        density = np.sum(row > 0) / width
			
 
				+        row_densities.append((y, density))
			
 
				+    
			
 
				+    if not row_densities:
			
 
				+        return 0
			
 
				+    
			
 
				+    # 使用滑动窗口平滑密度曲线
			
 
				+    window_size = 10
			
 
				+    smoothed = []
			
 
				+    for i in range(len(row_densities)):
			
 
				+        start = max(0, i - window_size // 2)
			
 
				+        end = min(len(row_densities), i + window_size // 2)
			
 
				+        avg_density = sum(d[1] for d in row_densities[start:end]) / (end - start)
			
 
				+        smoothed.append((row_densities[i][0], avg_density))
			
 
				+    
			
 
				+    # 找到密度突变点
			
 
				+    # 定义阈值：当密度从低于 0.01 变化到高于 0.02 时，认为进入正文区域
			
 
				+    low_threshold = 0.005
			
 
				+    high_threshold = 0.02
			
 
				+    
			
 
				+    if from_top:
			
 
				+        # 从上往下，找到第一个连续高密度区域的起始位置
			
 
				+        in_content = False
			
 
				+        content_start = 0
			
 
				+        consecutive_high = 0
			
 
				+        
			
 
				+        for y, density in smoothed:
			
 
				+            if density > high_threshold:
			
 
				+                consecutive_high += 1
			
 
				+                if consecutive_high >= 5 and not in_content:
			
 
				+                    # 连续5行高密度，认为进入正文
			
 
				+                    in_content = True
			
 
				+                    content_start = y - 5  # 往上回退一点
			
 
				+                    break
			
 
				+            else:
			
 
				+                consecutive_high = 0
			
 
				+        
			
 
				+        return max(0, content_start - start_y)
			
 
				+    else:
			
 
				+        # 从下往上，找到最后一个连续高密度区域的结束位置
			
 
				+        in_content = False
			
 
				+        content_end = height
			
 
				+        consecutive_high = 0
			
 
				+        
			
 
				+        for y, density in reversed(smoothed):
			
 
				+            if density > high_threshold:
			
 
				+                consecutive_high += 1
			
 
				+                if consecutive_high >= 5 and not in_content:
			
 
				+                    in_content = True
			
 
				+                    content_end = y + 5
			
 
				+                    break
			
 
				+            else:
			
 
				+                consecutive_high = 0
			
 
				+        
			
 
				+        return max(0, height - content_end)
			
--- a/pdf_converter_v2/utils/pdf_watermark_remover.py
+++ b/pdf_converter_v2/utils/pdf_watermark_remover.py
@@ -0,0 +1,122 @@
 
				+#!/usr/bin/env python3
			
 
				+# -*- coding: utf-8 -*-
			
 
				+"""
			
 
				+PDF去水印工具
			
 
				+将PDF转换为图片，去除水印后再转回PDF
			
 
				+"""
			
 
				+
			
 
				+from pathlib import Path
			
 
				+from typing import Optional
			
 
				+import tempfile
			
 
				+import shutil
			
 
				+
			
 
				+def remove_watermark_from_pdf(
			
 
				+    input_pdf: str,
			
 
				+    output_pdf: str,
			
 
				+    light_threshold: int = 200,
			
 
				+    saturation_threshold: int = 30,
			
 
				+    dpi: int = 200
			
 
				+) -> bool:
			
 
				+    """
			
 
				+    对PDF文件进行去水印处理
			
 
				+    
			
 
				+    处理流程：
			
 
				+    1. 将PDF的每一页转换为图片
			
 
				+    2. 对每张图片进行去水印处理
			
 
				+    3. 将处理后的图片合并为新的PDF
			
 
				+    
			
 
				+    Args:
			
 
				+        input_pdf: 输入PDF文件路径
			
 
				+        output_pdf: 输出PDF文件路径
			
 
				+        light_threshold: 水印亮度阈值（0-255），高于此值的浅色像素可能是水印
			
 
				+        saturation_threshold: 水印饱和度阈值（0-255），低于此值的低饱和度像素可能是水印
			
 
				+        dpi: PDF转图片的DPI，影响图片质量和处理速度
			
 
				+    
			
 
				+    Returns:
			
 
				+        bool: 是否成功
			
 
				+    """
			
 
				+    try:
			
 
				+        # 导入必要的库
			
 
				+        from pdf2image import convert_from_path
			
 
				+        from PIL import Image
			
 
				+        import PyPDF2
			
 
				+        from utils.image_preprocessor import remove_watermark, check_opencv_available
			
 
				+        
			
 
				+        # 检查OpenCV是否可用
			
 
				+        if not check_opencv_available():
			
 
				+            print("⚠ OpenCV 未安装，无法进行去水印处理")
			
 
				+            return False
			
 
				+        
			
 
				+        # 创建临时目录
			
 
				+        temp_dir = tempfile.mkdtemp(prefix="pdf_watermark_")
			
 
				+        temp_path = Path(temp_dir)
			
 
				+        
			
 
				+        try:
			
 
				+            print(f"正在将PDF转换为图片（DPI={dpi}）...")
			
 
				+            # 将PDF转换为图片
			
 
				+            images = convert_from_path(input_pdf, dpi=dpi)
			
 
				+            print(f"✓ 转换完成，共 {len(images)} 页")
			
 
				+            
			
 
				+            # 处理每一页
			
 
				+            processed_images = []
			
 
				+            for i, image in enumerate(images, 1):
			
 
				+                print(f"处理第 {i}/{len(images)} 页...", end='\r')
			
 
				+                
			
 
				+                # 保存原始图片
			
 
				+                original_path = temp_path / f"page_{i}_original.png"
			
 
				+                image.save(str(original_path), "PNG")
			
 
				+                
			
 
				+                # 去水印
			
 
				+                nowm_path = temp_path / f"page_{i}_nowm.png"
			
 
				+                processed_path = remove_watermark(
			
 
				+                    str(original_path),
			
 
				+                    output_path=str(nowm_path),
			
 
				+                    light_threshold=light_threshold,
			
 
				+                    saturation_threshold=saturation_threshold,
			
 
				+                    method="hsv"
			
 
				+                )
			
 
				+                
			
 
				+                # 加载处理后的图片
			
 
				+                processed_img = Image.open(processed_path)
			
 
				+                processed_images.append(processed_img)
			
 
				+            
			
 
				+            print(f"\n✓ 所有页面处理完成")
			
 
				+            
			
 
				+            # 将图片合并为PDF
			
 
				+            print("正在生成PDF...")
			
 
				+            if processed_images:
			
 
				+                # 第一张图片作为主图片
			
 
				+                first_image = processed_images[0]
			
 
				+                # 其余图片作为附加页
			
 
				+                other_images = processed_images[1:] if len(processed_images) > 1 else []
			
 
				+                
			
 
				+                # 保存为PDF
			
 
				+                first_image.save(
			
 
				+                    output_pdf,
			
 
				+                    "PDF",
			
 
				+                    resolution=dpi,
			
 
				+                    save_all=True,
			
 
				+                    append_images=other_images
			
 
				+                )
			
 
				+                print(f"✓ PDF生成完成: {output_pdf}")
			
 
				+                return True
			
 
				+            else:
			
 
				+                print("⚠ 没有处理任何图片")
			
 
				+                return False
			
 
				+                
			
 
				+        finally:
			
 
				+            # 清理临时目录
			
 
				+            try:
			
 
				+                shutil.rmtree(temp_dir)
			
 
				+            except Exception as e:
			
 
				+                print(f"⚠ 清理临时目录失败: {e}")
			
 
				+    
			
 
				+    except ImportError as e:
			
 
				+        print(f"⚠ 缺少必要的库: {e}")
			
 
				+        print("请安装: pip install pdf2image pillow PyPDF2 opencv-python")
			
 
				+        return False
			
 
				+    except Exception as e:
			
 
				+        print(f"⚠ 去水印处理失败: {e}")
			
 
				+        import traceback
			
 
				+        traceback.print_exc()
			
 
				+        return False
			
--- a/start_mineru_in_container.sh
+++ b/start_mineru_in_container.sh
@@ -0,0 +1,15 @@
 
				+#!/usr/bin/env bash
			
 
				+# 在 Docker 容器内启动 MinerU file_parse API（容器内无 systemd，用本脚本代替 systemd 服务）
			
 
				+# 使用：bash start_mineru_in_container.sh  或  nohup bash start_mineru_in_container.sh &
			
 
				+# 工作目录：/root/work/Clerk2.5（可通过 CLERK_ROOT 覆盖）
			
 
				+
			
 
				+set -e
			
 
				+CLERK_ROOT="${CLERK_ROOT:-/root/work/Clerk2.5}"
			
 
				+PORT="${MINERU_PORT:-5282}"
			
 
				+
			
 
				+# NPU/容器内需预加载 libgomp，避免 static TLS 报错（路径以常见 dist-packages 为准，可按本机修改）
			
 
				+export LD_PRELOAD="${LD_PRELOAD:-/usr/local/lib/python3.10/dist-packages/simsimd.libs/libgomp-a49a47f9.so.1.0.0:/usr/local/lib/python3.10/dist-packages/scikit_learn.libs/libgomp-d22c30c5.so.1.0.0}"
			
 
				+export PYTHONPATH="${CLERK_ROOT}"
			
 
				+
			
 
				+cd "$CLERK_ROOT"
			
 
				+exec python3 -m uvicorn mineru.cli.fast_api:app --host 0.0.0.0 --port "$PORT"