3 هفته پیش · 30c6facd53
--- a/pdf_converter_v2/converter.py
+++ b/pdf_converter_v2/converter.py
@@ -0,0 +1,431 @@
 
				+# Copyright (c) Opendatalab. All rights reserved.
			
 
				+
			
 
				+"""PDF转换主函数模块 v2 - 使用新的API接口"""
			
 
				+
			
 
				+import asyncio
			
 
				+import json
			
 
				+import os
			
 
				+import time
			
 
				+import zipfile
			
 
				+import tempfile
			
 
				+import shutil
			
 
				+from pathlib import Path
			
 
				+from typing import Optional, Sequence
			
 
				+
			
 
				+import aiohttp
			
 
				+import aiofiles
			
 
				+from PIL import Image
			
 
				+
			
 
				+from ..utils.logging_config import get_logger
			
 
				+from ..utils.file_utils import safe_stem
			
 
				+
			
 
# Module-level logger obtained from the project's logging helper.
logger = get_logger("pdf_converter_v2.processor")
# Executable used to launch the Paddle doc parser; read from the
# PADDLE_DOC_PARSER_CMD environment variable, defaulting to "paddleocr".
PADDLE_CMD = os.getenv("PADDLE_DOC_PARSER_CMD", "paddleocr")
			
 
				+
			
 
				+
			
 
				+async def _run_paddle_doc_parser(cmd: Sequence[str]) -> tuple[int, str, str]:
			
 
				+    """异步执行 paddleocr doc_parser 命令"""
			
 
				+    logger.info(f"[Paddle] 执行命令: {' '.join(cmd)}")
			
 
				+    process = await asyncio.create_subprocess_exec(
			
 
				+        *cmd,
			
 
				+        stdout=asyncio.subprocess.PIPE,
			
 
				+        stderr=asyncio.subprocess.PIPE,
			
 
				+    )
			
 
				+    stdout_bytes, stderr_bytes = await process.communicate()
			
 
				+    stdout = stdout_bytes.decode("utf-8", errors="ignore")
			
 
				+    stderr = stderr_bytes.decode("utf-8", errors="ignore")
			
 
				+    if stdout:
			
 
				+        logger.debug(f"[Paddle] stdout: {stdout[:2000]}")
			
 
				+    if stderr:
			
 
				+        logger.debug(f"[Paddle] stderr: {stderr[:2000]}")
			
 
				+    return process.returncode, stdout, stderr
			
 
				+
			
 
				+
			
 
				+async def _convert_with_paddle(
			
 
				+    input_file: str,
			
 
				+    output_dir: str,
			
 
				+    embed_images: bool,
			
 
				+    output_json: bool,
			
 
				+    forced_document_type: Optional[str],
			
 
				+):
			
 
				+    """针对工况附件使用 PaddleOCR doc_parser 直接转换"""
			
 
				+    if not os.path.exists(input_file):
			
 
				+        logger.error(f"[Paddle] 输入文件不存在: {input_file}")
			
 
				+        return None
			
 
				+    
			
 
				+    file_name = f'{safe_stem(Path(input_file).stem)}_{time.strftime("%y%m%d_%H%M%S")}'
			
 
				+    os.makedirs(output_dir, exist_ok=True)
			
 
				+    
			
 
				+    temp_dir = tempfile.mkdtemp(prefix=f"pdf_converter_paddle_{file_name}_")
			
 
				+    logger.info(f"[Paddle] 创建临时目录: {temp_dir}")
			
 
				+    save_path_base = os.path.join(temp_dir, Path(input_file).stem)
			
 
				+    os.makedirs(save_path_base, exist_ok=True)
			
 
				+    
			
 
				+    cmd = [
			
 
				+        PADDLE_CMD,
			
 
				+        "doc_parser",
			
 
				+        "-i",
			
 
				+        input_file,
			
 
				+        "--precision",
			
 
				+        "fp32",
			
 
				+        "--use_doc_unwarping",
			
 
				+        "False",
			
 
				+        "--use_doc_orientation_classify",
			
 
				+        "True",
			
 
				+        "--use_chart_recognition",
			
 
				+        "True",
			
 
				+        "--save_path",
			
 
				+        save_path_base,
			
 
				+    ]
			
 
				+    # NPU 下需加 --device npu:0，否则 PaddleOCR 走 CPU 易段错误
			
 
				+    paddle_device = os.getenv("PADDLE_OCR_DEVICE", "").strip()
			
 
				+    if paddle_device:
			
 
				+        cmd.extend(["--device", paddle_device])
			
 
				+    try:
			
 
				+        return_code, _, stderr = await _run_paddle_doc_parser(cmd)
			
 
				+        if return_code != 0:
			
 
				+            logger.error(f"[Paddle] doc_parser 执行失败 code={return_code}")
			
 
				+            if stderr:
			
 
				+                logger.error(stderr)
			
 
				+            return None
			
 
				+        
			
 
				+        md_files = sorted(Path(save_path_base).rglob("*.md"))
			
 
				+        if not md_files:
			
 
				+            logger.error("[Paddle] 未找到Markdown文件")
			
 
				+            return None
			
 
				+        
			
 
				+        markdown_parts = []
			
 
				+        for md_file in md_files:
			
 
				+            async with aiofiles.open(md_file, "r", encoding="utf-8") as f:
			
 
				+                markdown_parts.append(await f.read())
			
 
				+        final_content = "\n\n".join(markdown_parts)
			
 
				+        logger.info(f"[Paddle] 合并后的markdown长度: {len(final_content)}")
			
 
				+        
			
 
				+        local_md_dir = os.path.join(output_dir, file_name, "markdown")
			
 
				+        os.makedirs(local_md_dir, exist_ok=True)
			
 
				+        md_path = os.path.join(local_md_dir, f"{file_name}.md")
			
 
				+        async with aiofiles.open(md_path, "w", encoding="utf-8") as f:
			
 
				+            await f.write(final_content)
			
 
				+        
			
 
				+        output_md_path = os.path.join(output_dir, f"{file_name}.md")
			
 
				+        async with aiofiles.open(output_md_path, "w", encoding="utf-8") as f:
			
 
				+            await f.write(final_content)
			
 
				+        
			
 
				+        if embed_images:
			
 
				+            local_image_dir = os.path.join(output_dir, file_name, "images")
			
 
				+            os.makedirs(local_image_dir, exist_ok=True)
			
 
				+            for asset in Path(save_path_base).rglob("*"):
			
 
				+                if asset.is_file() and asset.suffix.lower() in {".png", ".jpg", ".jpeg", ".bmp", ".tif", ".tiff"}:
			
 
				+                    shutil.copy2(asset, os.path.join(local_image_dir, asset.name))
			
 
				+        
			
 
				+        json_data = None
			
 
				+        json_path = None
			
 
				+        if output_json:
			
 
				+            try:
			
 
				+                from ..parser.json_converter import parse_markdown_to_json
			
 
				+                json_output_dir = os.path.join(output_dir, file_name)
			
 
				+                json_data = parse_markdown_to_json(
			
 
				+                    final_content,
			
 
				+                    first_page_image=None,
			
 
				+                    output_dir=json_output_dir,
			
 
				+                    forced_document_type=forced_document_type,
			
 
				+                    enable_paddleocr_fallback=True,
			
 
				+                    input_file=input_file,
			
 
				+                )
			
 
				+                json_path = os.path.join(output_dir, f"{file_name}.json")
			
 
				+                async with aiofiles.open(json_path, "w", encoding="utf-8") as f:
			
 
				+                    await f.write(json.dumps(json_data, ensure_ascii=False, indent=2))
			
 
				+            except Exception as exc:
			
 
				+                logger.exception(f"[Paddle] JSON转换失败: {exc}")
			
 
				+        
			
 
				+        return {
			
 
				+            "markdown_file": output_md_path,
			
 
				+            "json_file": json_path,
			
 
				+            "json_data": json_data,
			
 
				+            "content": final_content,
			
 
				+        }
			
 
				+    finally:
			
 
				+        try:
			
 
				+            shutil.rmtree(temp_dir)
			
 
				+        except Exception as exc:
			
 
				+            logger.warning(f"[Paddle] 清理临时目录失败: {exc}")
			
 
				+
			
 
				+async def convert_to_markdown(
			
 
				+    input_file: str,
			
 
				+    output_dir: str = "./output",
			
 
				+    max_pages: int = 10,
			
 
				+    is_ocr: bool = False,
			
 
				+    formula_enable: bool = True,
			
 
				+    table_enable: bool = True,
			
 
				+    language: str = "ch",
			
 
				+    backend: str = "vlm-vllm-async-engine",
			
 
				+    url: str = "http://192.168.2.3:8000",
			
 
				+    embed_images: bool = True,
			
 
				+    output_json: bool = False,
			
 
				+    start_page_id: int = 0,
			
 
				+    end_page_id: int = 99999,
			
 
				+    parse_method: str = "auto",
			
 
				+    server_url: str = "string",
			
 
				+    response_format_zip: bool = True,
			
 
				+    return_middle_json: bool = False,
			
 
				+    return_model_output: bool = True,
			
 
				+    return_md: bool = True,
			
 
				+    return_images: bool = True,  # 默认启用，以便PaddleOCR备用解析可以使用
			
 
				+    return_content_list: bool = False,
			
 
				+    forced_document_type: Optional[str] = None
			
 
				+):
			
 
				+    """将PDF/图片转换为Markdown的主要函数（使用新的API接口）"""
			
 
				+    
			
 
				+    if not os.path.exists(input_file):
			
 
				+        logger.error(f"输入文件不存在: {input_file}")
			
 
				+        return None
			
 
				+
			
 
				+    # 生成文件名
			
 
				+    file_name = f'{safe_stem(Path(input_file).stem)}_{time.strftime("%y%m%d_%H%M%S")}'
			
 
				+    
			
 
				+    try:
			
 
				+        os.makedirs(output_dir, exist_ok=True)
			
 
				+        
			
 
				+        # 构建API请求URL
			
 
				+        api_url = f"{url}/file_parse"
			
 
				+        logger.info(f"调用API接口: {api_url}")
			
 
				+        
			
 
				+        # 创建临时目录用于解压zip文件
			
 
				+        temp_dir = tempfile.mkdtemp(prefix=f"pdf_converter_v2_{file_name}_")
			
 
				+        logger.info(f"创建临时目录: {temp_dir}")
			
 
				+        
			
 
				+        try:
			
 
				+            # 准备表单数据
			
 
				+            form_data = aiohttp.FormData()
			
 
				+            form_data.add_field('return_middle_json', str(return_middle_json).lower())
			
 
				+            form_data.add_field('return_model_output', str(return_model_output).lower())
			
 
				+            form_data.add_field('return_md', str(return_md).lower())
			
 
				+            form_data.add_field('return_images', str(return_images).lower())
			
 
				+            form_data.add_field('end_page_id', str(end_page_id))
			
 
				+            form_data.add_field('parse_method', parse_method)
			
 
				+            form_data.add_field('start_page_id', str(start_page_id))
			
 
				+            form_data.add_field('lang_list', language)
			
 
				+            form_data.add_field('output_dir', './output')
			
 
				+            form_data.add_field('server_url', server_url)
			
 
				+            form_data.add_field('return_content_list', str(return_content_list).lower())
			
 
				+            form_data.add_field('backend', backend)
			
 
				+            form_data.add_field('table_enable', str(table_enable).lower())
			
 
				+            form_data.add_field('response_format_zip', str(response_format_zip).lower())
			
 
				+            form_data.add_field('formula_enable', str(formula_enable).lower())
			
 
				+            
			
 
				+            # 打开文件并添加到表单数据（文件会在请求发送时读取）
			
 
				+            file_obj = open(input_file, 'rb')
			
 
				+            try:
			
 
				+                # 根据扩展名设置内容类型，默认使用application/octet-stream
			
 
				+                ext = (Path(input_file).suffix or "").lower()
			
 
				+                content_type = 'application/octet-stream'
			
 
				+                if ext == '.pdf':
			
 
				+                    content_type = 'application/pdf'
			
 
				+                elif ext in {'.png'}:
			
 
				+                    content_type = 'image/png'
			
 
				+                elif ext in {'.jpg', '.jpeg'}:
			
 
				+                    content_type = 'image/jpeg'
			
 
				+                elif ext in {'.bmp'}:
			
 
				+                    content_type = 'image/bmp'
			
 
				+                elif ext in {'.tif', '.tiff'}:
			
 
				+                    content_type = 'image/tiff'
			
 
				+                elif ext in {'.webp'}:
			
 
				+                    content_type = 'image/webp'
			
 
				+
			
 
				+                # 不使用原始文件名，直接使用简单的固定命名，避免对端服务在构造输出路径时触发 “File name too long”
			
 
				+                # 从文件路径获取扩展名
			
 
				+                ext = Path(input_file).suffix or ".pdf"
			
 
				+                upload_name = f"file{ext}"
			
 
				+
			
 
				+                form_data.add_field(
			
 
				+                    'files',
			
 
				+                    file_obj,
			
 
				+                    filename=upload_name,
			
 
				+                    content_type=content_type
			
 
				+                )
			
 
				+                
			
 
				+                # 发送API请求（设置超时时间：总超时600秒，连接超时30秒，socket读取超时300秒）
			
 
				+                timeout = aiohttp.ClientTimeout(total=600, connect=30, sock_read=300)
			
 
				+                
			
 
				+                # 添加重试机制
			
 
				+                max_retries = 3
			
 
				+                retry_count = 0
			
 
				+                last_error = None
			
 
				+                
			
 
				+                while retry_count < max_retries:
			
 
				+                    try:
			
 
				+                        async with aiohttp.ClientSession(timeout=timeout) as session:
			
 
				+                            if retry_count > 0:
			
 
				+                                logger.warning(f"重试第 {retry_count} 次上传文件: {input_file}")
			
 
				+                            else:
			
 
				+                                logger.info(f"开始上传文件: {input_file}")
			
 
				+                            
			
 
				+                            async with session.post(api_url, data=form_data) as response:
			
 
				+                                if response.status != 200:
			
 
				+                                    error_text = await response.text()
			
 
				+                                    logger.error(f"API请求失败，状态码: {response.status}, 错误: {error_text}")
			
 
				+                                    return None
			
 
				+                                
			
 
				+                                # 检查Content-Type是否为zip
			
 
				+                                content_type = response.headers.get('Content-Type', '')
			
 
				+                                if 'zip' not in content_type and 'application/zip' not in content_type:
			
 
				+                                    # 如果不是zip，尝试检查响应内容
			
 
				+                                    content_disposition = response.headers.get('Content-Disposition', '')
			
 
				+                                    if 'zip' not in content_disposition.lower():
			
 
				+                                        logger.warning(f"响应Content-Type可能不是zip: {content_type}")
			
 
				+                                
			
 
				+                                # 保存zip文件
			
 
				+                                zip_path = os.path.join(temp_dir, f"{file_name}.zip")
			
 
				+                                async with aiofiles.open(zip_path, 'wb') as f:
			
 
				+                                    async for chunk in response.content.iter_chunked(8192):
			
 
				+                                        await f.write(chunk)
			
 
				+                                
			
 
				+                                logger.info(f"Zip文件已保存: {zip_path}")
			
 
				+                                # 成功，跳出重试循环
			
 
				+                                break
			
 
				+                                
			
 
				+                    except (aiohttp.ClientError, asyncio.TimeoutError) as e:
			
 
				+                        last_error = e
			
 
				+                        retry_count += 1
			
 
				+                        error_type = type(e).__name__
			
 
				+                        logger.warning(f"API请求失败 ({error_type}): {e}, 重试 {retry_count}/{max_retries}")
			
 
				+                        
			
 
				+                        if retry_count < max_retries:
			
 
				+                            # 等待一段时间后重试（指数退避）
			
 
				+                            wait_time = 2 ** retry_count
			
 
				+                            logger.info(f"等待 {wait_time} 秒后重试...")
			
 
				+                            await asyncio.sleep(wait_time)
			
 
				+                            
			
 
				+                            # 重新创建 form_data（因为已经被消费了）
			
 
				+                            file_obj.seek(0)  # 重置文件指针
			
 
				+                            form_data = aiohttp.FormData()
			
 
				+                            form_data.add_field('return_middle_json', str(return_middle_json).lower())
			
 
				+                            form_data.add_field('return_model_output', str(return_model_output).lower())
			
 
				+                            form_data.add_field('return_md', str(return_md).lower())
			
 
				+                            form_data.add_field('return_images', str(return_images).lower())
			
 
				+                            form_data.add_field('end_page_id', str(end_page_id))
			
 
				+                            form_data.add_field('parse_method', parse_method)
			
 
				+                            form_data.add_field('start_page_id', str(start_page_id))
			
 
				+                            form_data.add_field('lang_list', language)
			
 
				+                            form_data.add_field('output_dir', './output')
			
 
				+                            form_data.add_field('server_url', server_url)
			
 
				+                            form_data.add_field('return_content_list', str(return_content_list).lower())
			
 
				+                            form_data.add_field('backend', backend)
			
 
				+                            form_data.add_field('table_enable', str(table_enable).lower())
			
 
				+                            form_data.add_field('response_format_zip', str(response_format_zip).lower())
			
 
				+                            form_data.add_field('formula_enable', str(formula_enable).lower())
			
 
				+                            form_data.add_field('files', file_obj, filename=upload_name, content_type=content_type)
			
 
				+                        else:
			
 
				+                            logger.error(f"API请求失败，已达到最大重试次数 ({max_retries})")
			
 
				+                            raise last_error
			
 
				+                            
			
 
				+            finally:
			
 
				+                # 关闭文件对象
			
 
				+                file_obj.close()
			
 
				+            
			
 
				+            # 解压zip文件
			
 
				+            logger.info("开始解压zip文件...")
			
 
				+            with zipfile.ZipFile(zip_path, 'r') as zip_ref:
			
 
				+                zip_ref.extractall(temp_dir)
			
 
				+            
			
 
				+            # 查找md文件
			
 
				+            md_files = list(Path(temp_dir).rglob("*.md"))
			
 
				+            if not md_files:
			
 
				+                logger.error("在zip文件中未找到md文件")
			
 
				+                return None
			
 
				+            
			
 
				+            logger.info(f"找到 {len(md_files)} 个md文件")
			
 
				+            
			
 
				+            # 读取所有md文件并合并
			
 
				+            markdown_parts = []
			
 
				+            for md_file in sorted(md_files):
			
 
				+                logger.info(f"读取md文件: {md_file}")
			
 
				+                async with aiofiles.open(md_file, 'r', encoding='utf-8') as f:
			
 
				+                    content = await f.read()
			
 
				+                    markdown_parts.append(content)
			
 
				+            
			
 
				+            # 合并所有页面内容
			
 
				+            original_content = "\n\n".join(markdown_parts)
			
 
				+            logger.info(f"合并后的markdown长度: {len(original_content)} 字符")
			
 
				+            
			
 
				+            # 准备输出目录
			
 
				+            local_md_dir = os.path.join(output_dir, file_name, "markdown")
			
 
				+            os.makedirs(local_md_dir, exist_ok=True)
			
 
				+            
			
 
				+            # 处理图片嵌入（如果需要）
			
 
				+            final_content = original_content
			
 
				+            if embed_images:
			
 
				+                # 查找图片文件
			
 
				+                image_files = list(Path(temp_dir).rglob("*.png")) + list(Path(temp_dir).rglob("*.jpg")) + list(Path(temp_dir).rglob("*.jpeg"))
			
 
				+                if image_files:
			
 
				+                    local_image_dir = os.path.join(output_dir, file_name, "images")
			
 
				+                    os.makedirs(local_image_dir, exist_ok=True)
			
 
				+                    
			
 
				+                    # 复制图片到输出目录
			
 
				+                    for img_file in image_files:
			
 
				+                        dst_path = os.path.join(local_image_dir, img_file.name)
			
 
				+                        shutil.copy2(img_file, dst_path)
			
 
				+                        logger.debug(f"复制图片: {img_file} -> {dst_path}")
			
 
				+            
			
 
				+            # 保存Markdown文件
			
 
				+            md_path = os.path.join(local_md_dir, f"{file_name}.md")
			
 
				+            async with aiofiles.open(md_path, 'w', encoding='utf-8') as f:
			
 
				+                await f.write(final_content)
			
 
				+            logger.info(f"Markdown文件已保存: {md_path}")
			
 
				+            
			
 
				+            # 生成输出文件路径（在output_dir根目录下也保存一份）
			
 
				+            output_md_path = os.path.join(output_dir, f"{file_name}.md")
			
 
				+            async with aiofiles.open(output_md_path, 'w', encoding='utf-8') as f:
			
 
				+                await f.write(final_content)
			
 
				+            
			
 
				+            logger.info(f"转换完成: {output_md_path}")
			
 
				+            
			
 
				+            # JSON转换（如果需要）
			
 
				+            json_data = None
			
 
				+            json_path = None
			
 
				+            if output_json:
			
 
				+                try:
			
 
				+                    logger.info("开始转换为JSON格式...")
			
 
				+                    # 复用v1的json解析逻辑
			
 
				+                    # 注意：v2版本不涉及MinerU和PaddleOCR的具体调用，只进行JSON解析
			
 
				+                    # first_page_image设为None，因为v2版本不处理PDF图片
			
 
				+                    from ..parser.json_converter import parse_markdown_to_json
			
 
				+                    # 构建完整的输出目录路径，包含文件名的子目录
			
 
				+                    json_output_dir = os.path.join(output_dir, file_name) if file_name else output_dir
			
 
				+                    json_data = parse_markdown_to_json(
			
 
				+                        original_content,
			
 
				+                        first_page_image=None,
			
 
				+                        output_dir=json_output_dir,
			
 
				+                        forced_document_type=forced_document_type,
			
 
				+                        enable_paddleocr_fallback=True,
			
 
				+                        input_file=input_file,
			
 
				+                    )
			
 
				+                    json_path = os.path.join(output_dir, f"{file_name}.json")
			
 
				+                    async with aiofiles.open(json_path, 'w', encoding='utf-8') as f:
			
 
				+                        await f.write(json.dumps(json_data, ensure_ascii=False, indent=2))
			
 
				+                    logger.info(f"JSON文件已保存: {json_path}")
			
 
				+                    logger.info(f"文档类型: {json_data.get('document_type', 'unknown')}")
			
 
				+                except Exception as e:
			
 
				+                    logger.exception(f"JSON转换失败: {e}")
			
 
				+                    json_data = None
			
 
				+            
			
 
				+            return {
			
 
				+                'markdown_file': output_md_path,
			
 
				+                'json_file': json_path,
			
 
				+                'json_data': json_data,
			
 
				+                'content': final_content,
			
 
				+                'original_content': original_content
			
 
				+            }
			
 
				+            
			
 
				+        finally:
			
 
				+            # 清理临时目录
			
 
				+            try:
			
 
				+                shutil.rmtree(temp_dir)
			
 
				+                logger.debug(f"已清理临时目录: {temp_dir}")
			
 
				+            except Exception as e:
			
 
				+                logger.warning(f"清理临时目录失败: {e}")
			
 
				+    
			
 
				+    except Exception as e:
			
 
				+        logger.exception(f"转换过程出错: {e}")
			
 
				+        return None
			
 
				+
			
--- a/pdf_converter_v2/paddleocr_fallback.py
+++ b/pdf_converter_v2/paddleocr_fallback.py
@@ -0,0 +1,2071 @@
 
				+# Copyright (c) Opendatalab. All rights reserved.
			
 
				+
			
 
				+"""PaddleOCR备用解析模块 - 当MinerU解析结果缺失时使用"""
			
 
				+
			
 
				+import json
			
 
				+import os
			
 
				+import subprocess
			
 
				+import tempfile
			
 
				+import time
			
 
				+import random
			
 
				+import fcntl
			
 
				+from pathlib import Path
			
 
				+from typing import Dict, Any, Optional, List, Tuple
			
 
				+import ast
			
 
				+import re
			
 
				+
			
 
				+from ..utils.logging_config import get_logger
			
 
				+
			
 
				+logger = get_logger("pdf_converter_v2.utils.paddleocr")
			
 
				+
			
 
				+try:
			
 
				+    import pypdfium2 as pdfium
			
 
				+    PDFIUM_AVAILABLE = True
			
 
				+except ImportError:
			
 
				+    PDFIUM_AVAILABLE = False
			
 
				+    logger.warning("[PaddleOCR备用] pypdfium2未安装，无法从PDF提取图片")
			
 
				+
			
 
				+try:
			
 
				+    from pdf2image import convert_from_path
			
 
				+    PDF2IMAGE_AVAILABLE = True
			
 
				+except ImportError:
			
 
				+    PDF2IMAGE_AVAILABLE = False
			
 
				+    logger.warning("[PaddleOCR备用] pdf2image未安装，无法使用备用方法从PDF提取图片")
			
 
				+
			
 
				+try:
			
 
				+    from PIL import Image
			
 
				+    PIL_AVAILABLE = True
			
 
				+except ImportError:
			
 
				+    PIL_AVAILABLE = False
			
 
				+    logger.warning("[PaddleOCR备用] PIL未安装，无法处理图片")
			
 
				+
			
 
				+
			
 
# Path of the lock file used to manage the state of the mineru service.
MINERU_LOCK_FILE = "/tmp/mineru_service_lock"
# Path of the file in which the service usage count is stored.
MINERU_COUNT_FILE = "/tmp/mineru_service_count"
			
 
				+
			
 
				+# PaddleOCR 推理设备：NPU 环境下需设为 npu 或 npu:0，否则会走 CPU 并可能段错误
			
 
				+# 通过环境变量 PADDLE_OCR_DEVICE 指定，例如：export PADDLE_OCR_DEVICE=npu:0
			
 
				+def _paddle_ocr_device_args() -> list:
			
 
				+    """返回 PaddleOCR 命令的 --device 参数列表（若未设置则返回空列表）"""
			
 
				+    device = os.getenv("PADDLE_OCR_DEVICE", "").strip()
			
 
				+    if device:
			
 
				+        return ["--device", device]
			
 
				+    return []
			
 
				+
			
 
				+
			
 
				+def _acquire_service_lock() -> Optional[object]:
			
 
				+    """获取服务操作锁（文件锁）
			
 
				+    
			
 
				+    Returns:
			
 
				+        文件对象（用于释放锁），如果失败返回None
			
 
				+    """
			
 
				+    try:
			
 
				+        lock_file = open(MINERU_LOCK_FILE, 'w')
			
 
				+        fcntl.flock(lock_file.fileno(), fcntl.LOCK_EX | fcntl.LOCK_NB)
			
 
				+        return lock_file
			
 
				+    except (IOError, OSError) as e:
			
 
				+        logger.debug(f"[PaddleOCR] 获取服务锁失败（可能其他进程正在操作）: {e}")
			
 
				+        return None
			
 
				+
			
 
				+
			
 
				+def _release_service_lock(lock_file: object) -> None:
			
 
				+    """释放服务操作锁
			
 
				+    
			
 
				+    Args:
			
 
				+        lock_file: 锁文件对象
			
 
				+    """
			
 
				+    try:
			
 
				+        if lock_file:
			
 
				+            fcntl.flock(lock_file.fileno(), fcntl.LOCK_UN)
			
 
				+            lock_file.close()
			
 
				+    except Exception as e:
			
 
				+        logger.warning(f"[PaddleOCR] 释放服务锁失败: {e}")
			
 
				+
			
 
				+
			
 
				+def _increment_service_count(lock_file: object) -> int:
			
 
				+    """增加服务使用计数（需要在锁保护下调用）
			
 
				+    
			
 
				+    Args:
			
 
				+        lock_file: 已获取的锁文件对象
			
 
				+    
			
 
				+    Returns:
			
 
				+        当前计数
			
 
				+    """
			
 
				+    try:
			
 
				+        count = 0
			
 
				+        if os.path.exists(MINERU_COUNT_FILE):
			
 
				+            with open(MINERU_COUNT_FILE, 'r') as f:
			
 
				+                count = int(f.read().strip() or '0')
			
 
				+        count += 1
			
 
				+        with open(MINERU_COUNT_FILE, 'w') as f:
			
 
				+            f.write(str(count))
			
 
				+        return count
			
 
				+    except Exception as e:
			
 
				+        logger.warning(f"[PaddleOCR] 增加服务计数失败: {e}")
			
 
				+        return 1
			
 
				+
			
 
				+
			
 
				+def _decrement_service_count(lock_file: object) -> int:
			
 
				+    """减少服务使用计数（需要在锁保护下调用）
			
 
				+    
			
 
				+    Args:
			
 
				+        lock_file: 已获取的锁文件对象
			
 
				+    
			
 
				+    Returns:
			
 
				+        当前计数
			
 
				+    """
			
 
				+    try:
			
 
				+        count = 0
			
 
				+        if os.path.exists(MINERU_COUNT_FILE):
			
 
				+            with open(MINERU_COUNT_FILE, 'r') as f:
			
 
				+                count = int(f.read().strip() or '0')
			
 
				+        count = max(0, count - 1)
			
 
				+        with open(MINERU_COUNT_FILE, 'w') as f:
			
 
				+            f.write(str(count))
			
 
				+        return count
			
 
				+    except Exception as e:
			
 
				+        logger.warning(f"[PaddleOCR] 减少服务计数失败: {e}")
			
 
				+        return 0
			
 
				+
			
 
				+
			
 
				+def stop_mineru_service() -> bool:
			
 
				+    """停止mineru-api.service以释放GPU内存（线程安全）
			
 
				+    
			
 
				+    Returns:
			
 
				+        True表示成功停止或已停止，False表示失败
			
 
				+    """
			
 
				+    lock_file = _acquire_service_lock()
			
 
				+    if not lock_file:
			
 
				+        # 如果无法获取锁，等待一小段时间后检查服务状态
			
 
				+        time.sleep(0.5)
			
 
				+        try:
			
 
				+            result = subprocess.run(
			
 
				+                ["systemctl", "is-active", "mineru-api.service"],
			
 
				+                capture_output=True,
			
 
				+                text=True,
			
 
				+                timeout=5,
			
 
				+                check=False
			
 
				+            )
			
 
				+            if result.returncode != 0 or result.stdout.strip() != "active":
			
 
				+                # 服务已经停止
			
 
				+                logger.debug("[PaddleOCR] 服务已停止（其他进程已处理）")
			
 
				+                return True
			
 
				+        except Exception:
			
 
				+            pass
			
 
				+        return False
			
 
				+    
			
 
				+    try:
			
 
				+        # 检查服务当前状态
			
 
				+        result = subprocess.run(
			
 
				+            ["systemctl", "is-active", "mineru-api.service"],
			
 
				+            capture_output=True,
			
 
				+            text=True,
			
 
				+            timeout=5,
			
 
				+            check=False
			
 
				+        )
			
 
				+        is_active = result.returncode == 0 and result.stdout.strip() == "active"
			
 
				+        
			
 
				+        if not is_active:
			
 
				+            logger.debug("[PaddleOCR] mineru-api.service已经停止")
			
 
				+            return True
			
 
				+        
			
 
				+        # 增加使用计数（在锁保护下）
			
 
				+        count = _increment_service_count(lock_file)
			
 
				+        logger.debug(f"[PaddleOCR] 服务使用计数: {count}")
			
 
				+        
			
 
				+        # 停止服务
			
 
				+        result = subprocess.run(
			
 
				+            ["systemctl", "stop", "mineru-api.service"],
			
 
				+            capture_output=True,
			
 
				+            text=True,
			
 
				+            timeout=30,
			
 
				+            check=False
			
 
				+        )
			
 
				+        if result.returncode == 0:
			
 
				+            logger.info("[PaddleOCR] 成功停止mineru-api.service以释放GPU内存")
			
 
				+            return True
			
 
				+        else:
			
 
				+            logger.warning(f"[PaddleOCR] 停止mineru-api.service失败: {result.stderr}")
			
 
				+            _decrement_service_count(lock_file)  # 回滚计数
			
 
				+            return False
			
 
				+    except Exception as e:
			
 
				+        logger.warning(f"[PaddleOCR] 停止mineru-api.service时出错: {e}")
			
 
				+        if lock_file:
			
 
				+            _decrement_service_count(lock_file)  # 回滚计数
			
 
				+        return False
			
 
				+    finally:
			
 
				+        _release_service_lock(lock_file)
			
 
				+
			
 
				+
			
 
				+def start_mineru_service() -> bool:
			
 
				+    """启动mineru-api.service（线程安全）
			
 
				+    
			
 
				+    Returns:
			
 
				+        True表示成功启动或已启动，False表示失败
			
 
				+    """
			
 
				+    lock_file = _acquire_service_lock()
			
 
				+    if not lock_file:
			
 
				+        # 如果无法获取锁，等待一小段时间后检查服务状态
			
 
				+        time.sleep(0.5)
			
 
				+        try:
			
 
				+            result = subprocess.run(
			
 
				+                ["systemctl", "is-active", "mineru-api.service"],
			
 
				+                capture_output=True,
			
 
				+                text=True,
			
 
				+                timeout=5,
			
 
				+                check=False
			
 
				+            )
			
 
				+            if result.returncode == 0 and result.stdout.strip() == "active":
			
 
				+                # 服务已经启动
			
 
				+                logger.debug("[PaddleOCR] 服务已启动（其他进程已处理）")
			
 
				+                return True
			
 
				+        except Exception:
			
 
				+            pass
			
 
				+        return False
			
 
				+    
			
 
				+    try:
			
 
				+        # 减少使用计数（在锁保护下）
			
 
				+        count = _decrement_service_count(lock_file)
			
 
				+        logger.debug(f"[PaddleOCR] 服务使用计数: {count}")
			
 
				+        
			
 
				+        # 如果还有其他进程在使用，不启动服务
			
 
				+        if count > 0:
			
 
				+            logger.info(f"[PaddleOCR] 还有其他进程在使用GPU（计数={count}），暂不启动mineru-api.service")
			
 
				+            return True
			
 
				+        
			
 
				+        # 检查服务当前状态
			
 
				+        result = subprocess.run(
			
 
				+            ["systemctl", "is-active", "mineru-api.service"],
			
 
				+            capture_output=True,
			
 
				+            text=True,
			
 
				+            timeout=5,
			
 
				+            check=False
			
 
				+        )
			
 
				+        is_active = result.returncode == 0 and result.stdout.strip() == "active"
			
 
				+        
			
 
				+        if is_active:
			
 
				+            logger.debug("[PaddleOCR] mineru-api.service已经启动")
			
 
				+            return True
			
 
				+        
			
 
				+        # 启动服务
			
 
				+        result = subprocess.run(
			
 
				+            ["systemctl", "start", "mineru-api.service"],
			
 
				+            capture_output=True,
			
 
				+            text=True,
			
 
				+            timeout=30,
			
 
				+            check=False
			
 
				+        )
			
 
				+        if result.returncode == 0:
			
 
				+            logger.info("[PaddleOCR] 成功启动mineru-api.service")
			
 
				+            return True
			
 
				+        else:
			
 
				+            logger.warning(f"[PaddleOCR] 启动mineru-api.service失败: {result.stderr}")
			
 
				+            return False
			
 
				+    except Exception as e:
			
 
				+        logger.warning(f"[PaddleOCR] 启动mineru-api.service时出错: {e}")
			
 
				+        return False
			
 
				+    finally:
			
 
				+        _release_service_lock(lock_file)
			
 
				+
			
 
				+
			
 
				+def detect_file_type(file_path: str) -> Optional[str]:
			
 
				+    """通过文件内容（魔数）检测文件类型，不依赖扩展名
			
 
				+    
			
 
				+    Args:
			
 
				+        file_path: 文件路径
			
 
				+        
			
 
				+    Returns:
			
 
				+        文件类型：'pdf', 'png', 'jpeg', 'jpg' 或 None
			
 
				+    """
			
 
				+    if not file_path or not os.path.exists(file_path):
			
 
				+        return None
			
 
				+    
			
 
				+    try:
			
 
				+        with open(file_path, 'rb') as f:
			
 
				+            # 读取文件头部（前16字节足够识别常见格式）
			
 
				+            header = f.read(16)
			
 
				+            
			
 
				+            if not header:
			
 
				+                return None
			
 
				+            
			
 
				+            # PDF文件：以 %PDF 开头
			
 
				+            if header.startswith(b'%PDF'):
			
 
				+                return 'pdf'
			
 
				+            
			
 
				+            # PNG图片：以 \x89PNG\r\n\x1a\n 开头
			
 
				+            if header.startswith(b'\x89PNG\r\n\x1a\n'):
			
 
				+                return 'png'
			
 
				+            
			
 
				+            # JPEG图片：以 \xff\xd8\xff 开头
			
 
				+            if header.startswith(b'\xff\xd8\xff'):
			
 
				+                return 'jpeg'
			
 
				+            
			
 
				+            # 其他格式可以继续扩展
			
 
				+            return None
			
 
				+            
			
 
				+    except Exception as e:
			
 
				+        logger.debug(f"[PaddleOCR备用] 检测文件类型失败: {e}")
			
 
				+        return None
			
 
				+
			
 
				+
			
 
				+def check_json_data_completeness(json_data: Dict[str, Any], document_type: str) -> bool:
			
 
				+    """检查JSON数据是否大面积缺失
			
 
				+    
			
 
				+    Args:
			
 
				+        json_data: 解析后的JSON数据
			
 
				+        document_type: 文档类型
			
 
				+        
			
 
				+    Returns:
			
 
				+        True表示数据完整，False表示数据缺失
			
 
				+    """
			
 
				+    if not json_data or "data" not in json_data:
			
 
				+        return False
			
 
				+    
			
 
				+    data = json_data["data"]
			
 
				+    
			
 
				+    # 根据文档类型检查关键字段
			
 
				+    if document_type == "noiseMonitoringRecord":
			
 
				+        # 检查噪声检测记录的关键字段（不包括noise数组，noise数组由表格解析生成，不依赖OCR）
			
 
				+        required_fields = ["project", "standardReferences", "soundLevelMeterMode", "soundCalibratorMode"]
			
 
				+        missing_count = sum(1 for field in required_fields if not data.get(field))
			
 
				+        
			
 
				+        # 如果超过一半的关键字段缺失，认为数据缺失
			
 
				+        if missing_count >= len(required_fields) / 2:
			
 
				+            logger.warning(f"[数据完整性检查] 关键字段缺失过多: {missing_count}/{len(required_fields)}")
			
 
				+            return False
			
 
				+
			
 
				+        # 检查天气字段是否异常（例如解析成“天气”标签或风向全部缺失）
			
 
				+        weather_list = data.get("weather") or []
			
 
				+        if weather_list:
			
 
				+            weather_label_tokens = {"天气", "天气状况", "天气情况"}
			
 
				+            has_label_as_value = any(
			
 
				+                (item.get("weather") or "").strip() in weather_label_tokens for item in weather_list
			
 
				+            )
			
 
				+            all_wind_direction_missing = all(
			
 
				+                not (item.get("windDirection") or "").strip() for item in weather_list
			
 
				+            )
			
 
				+            if has_label_as_value:
			
 
				+                logger.warning("[数据完整性检查] 天气字段疑似被解析为标签，触发备用解析")
			
 
				+                return False
			
 
				+            if all_wind_direction_missing:
			
 
				+                logger.warning("[数据完整性检查] 风向字段全部缺失，触发备用解析")
			
 
				+                return False
			
 
				+        
			
 
				+        return True
			
 
				+    
			
 
				+    elif document_type == "electromagneticTestRecord":
			
 
				+        # 检查电磁检测记录的关键字段
			
 
				+        # 区分必需字段和可选字段：
			
 
				+        # - deviceName 和 deviceMode 是必需字段（仪器信息）
			
 
				+        # - project 和 standardReferences 可能为空（某些文档可能没有填写）
			
 
				+        required_fields = ["deviceName", "deviceMode"]  # 必需字段
			
 
				+        optional_fields = ["project", "standardReferences"]  # 可选字段
			
 
				+        
			
 
				+        # 检查必需字段
			
 
				+        missing_required = sum(1 for field in required_fields if not data.get(field) or not str(data.get(field)).strip())
			
 
				+        # 检查可选字段（如果所有可选字段都为空，也算缺失）
			
 
				+        missing_optional = sum(1 for field in optional_fields if not data.get(field) or not str(data.get(field)).strip())
			
 
				+        
			
 
				+        # 检查电磁数据
			
 
				+        em_list = data.get("electricMagnetic", [])
			
 
				+        if len(em_list) == 0:
			
 
				+            logger.warning("[数据完整性检查] 电磁数据列表为空")
			
 
				+            return False
			
 
				+        
			
 
				+        # 如果必需字段缺失，认为数据不完整
			
 
				+        if missing_required > 0:
			
 
				+            logger.warning(f"[数据完整性检查] 必需字段缺失: {missing_required}/{len(required_fields)} (deviceName, deviceMode)")
			
 
				+            return False
			
 
				+        
			
 
				+        # 如果所有字段（必需+可选）都缺失，也认为数据不完整
			
 
				+        if missing_required + missing_optional >= len(required_fields) + len(optional_fields):
			
 
				+            logger.warning(f"[数据完整性检查] 所有关键字段都缺失: {missing_required + missing_optional}/{len(required_fields) + len(optional_fields)}")
			
 
				+            return False
			
 
				+        
			
 
				+        # 检查project和address字段：如果project为空且所有address都为空，说明minerU和Paddle doc_parser都丢失了，需要运行Paddle OCR
			
 
				+        project_empty = not data.get("project") or not str(data.get("project")).strip()
			
 
				+        if project_empty:
			
 
				+            # 检查所有电磁数据项的address字段是否都为空
			
 
				+            all_address_empty = True
			
 
				+            for em_item in em_list:
			
 
				+                address = em_item.get("address", "")
			
 
				+                if address and str(address).strip():
			
 
				+                    all_address_empty = False
			
 
				+                    break
			
 
				+            
			
 
				+            if all_address_empty:
			
 
				+                logger.warning("[数据完整性检查] project为空且所有address字段都为空，说明minerU和Paddle doc_parser都丢失了，需要运行Paddle OCR")
			
 
				+                return False
			
 
				+        
			
 
				+        return True
			
 
				+    
			
 
				+    elif document_type == "operatingConditionInfo":
			
 
				+        # 检查工况信息
			
 
				+        op_list = data.get("operationalConditions", [])
			
 
				+        if len(op_list) == 0:
			
 
				+            logger.warning("[数据完整性检查] 工况信息列表为空")
			
 
				+            return False
			
 
				+        
			
 
				+        return True
			
 
				+    
			
 
				+    # 未知类型，默认认为完整
			
 
				+    return True
			
 
				+
			
 
				+
			
 
				+def parse_paddleocr_output(output_text: str) -> Dict[str, Any]:
			
 
				+    """解析paddleocr的输出文本
			
 
				+    
			
 
				+    Args:
			
 
				+        output_text: paddleocr命令的输出文本
			
 
				+        
			
 
				+    Returns:
			
 
				+        解析后的字典，包含parsing_res_list
			
 
				+    """
			
 
				+    try:
			
 
				+        # 清理输出文本，移除可能的额外空白
			
 
				+        output_text = output_text.strip()
			
 
				+        
			
 
				+        # 尝试直接eval（因为输出是Python字典格式）
			
 
				+        # 先处理np.float32等numpy类型
			
 
				+        output_text = output_text.replace('np.float32', 'float')
			
 
				+        output_text = output_text.replace('np.int32', 'int')
			
 
				+        output_text = output_text.replace('np.int64', 'int')
			
 
				+        
			
 
				+        # 尝试使用ast.literal_eval安全解析
			
 
				+        try:
			
 
				+            result = ast.literal_eval(output_text)
			
 
				+        except (ValueError, SyntaxError):
			
 
				+            # 如果literal_eval失败，尝试使用eval（不推荐，但paddleocr输出可能需要）
			
 
				+            logger.warning("[PaddleOCR解析] literal_eval失败，尝试使用eval")
			
 
				+            # 创建一个安全的eval环境
			
 
				+            safe_dict = {"__builtins__": {}}
			
 
				+            result = eval(output_text, safe_dict)
			
 
				+        
			
 
				+        if isinstance(result, dict):
			
 
				+            # 检查是否有res键
			
 
				+            if "res" in result:
			
 
				+                parsing_res_list = result.get("res", {}).get("parsing_res_list", [])
			
 
				+                return {"parsing_res_list": parsing_res_list}
			
 
				+            # 也可能直接包含parsing_res_list
			
 
				+            elif "parsing_res_list" in result:
			
 
				+                return {"parsing_res_list": result.get("parsing_res_list", [])}
			
 
				+        
			
 
				+        return {"parsing_res_list": []}
			
 
				+    except Exception as e:
			
 
				+        logger.error(f"[PaddleOCR解析] 解析输出失败: {e}")
			
 
				+        logger.debug(f"[PaddleOCR解析] 输出内容: {output_text[:500]}")
			
 
				+        return {"parsing_res_list": []}
			
 
				+
			
 
				+
			
 
				+def paddleocr_to_markdown(paddleocr_result: Dict[str, Any]) -> str:
			
 
				+    """将paddleocr的解析结果转换为markdown格式
			
 
				+    
			
 
				+    Args:
			
 
				+        paddleocr_result: paddleocr解析结果
			
 
				+        
			
 
				+    Returns:
			
 
				+        markdown格式的文本
			
 
				+    """
			
 
				+    markdown_parts = []
			
 
				+    parsing_res_list = paddleocr_result.get("parsing_res_list", [])
			
 
				+    
			
 
				+    for item in parsing_res_list:
			
 
				+        block_label = item.get("block_label", "")
			
 
				+        block_content = item.get("block_content", "")
			
 
				+        
			
 
				+        if block_label == "table":
			
 
				+            # 表格直接使用HTML格式
			
 
				+            markdown_parts.append(block_content)
			
 
				+        elif block_label in ["header", "title", "figure_title"]:
			
 
				+            # 标题使用markdown标题格式
			
 
				+            markdown_parts.append(f"# {block_content}")
			
 
				+        elif block_label == "text":
			
 
				+            # 普通文本
			
 
				+            markdown_parts.append(block_content)
			
 
				+        else:
			
 
				+            # 其他类型直接添加内容
			
 
				+            markdown_parts.append(block_content)
			
 
				+    
			
 
				+    return "\n\n".join(markdown_parts)
			
 
				+
			
 
				+
			
 
				+def call_paddleocr(image_path: str) -> Optional[Dict[str, Any]]:
			
 
				+    """调用paddleocr命令解析图片
			
 
				+    
			
 
				+    Args:
			
 
				+        image_path: 图片路径
			
 
				+        
			
 
				+    Returns:
			
 
				+        paddleocr解析结果，如果失败返回None
			
 
				+    """
			
 
				+    # 在调用PaddleOCR前停止mineru服务以释放GPU内存
			
 
				+    mineru_stopped = stop_mineru_service()
			
 
				+    
			
 
				+    try:
			
 
				+        # 检查图片文件是否存在
			
 
				+        if not os.path.exists(image_path):
			
 
				+            logger.error(f"[PaddleOCR] 图片文件不存在: {image_path}")
			
 
				+            return None
			
 
				+        
			
 
				+        # 生成输出目录和基础文件名
			
 
				+        image_dir = os.path.dirname(image_path)
			
 
				+        image_basename = os.path.splitext(os.path.basename(image_path))[0]
			
 
				+        save_path_base = os.path.join(image_dir, image_basename)
			
 
				+        
			
 
				+        # 构建paddleocr命令，添加所有参数（NPU 下需加 --device npu:0，否则走 CPU 易段错误）
			
 
				+        # PaddleOCR会在save_path下创建目录，文件保存在该目录内
			
 
				+        cmd = [
			
 
				+            "paddleocr", "doc_parser", "-i", image_path,
			
 
				+            "--precision", "fp32",
			
 
				+            "--use_doc_unwarping", "False",
			
 
				+            "--use_doc_orientation_classify", "True",
			
 
				+            "--use_chart_recognition", "True",
			
 
				+            "--save_path", save_path_base
			
 
				+        ] + _paddle_ocr_device_args()
			
 
				+        
			
 
				+        # 设置环境变量，限制GPU内存使用
			
 
				+        # env = os.environ.copy()
			
 
				+        # 设置PaddlePaddle的GPU内存分配策略，使用更保守的内存分配
			
 
				+        # env["FLAGS_fraction_of_gpu_memory_to_use"] = "0.3"  # 只使用30%的GPU内存
			
 
				+        # env["FLAGS_allocator_strategy"] = "auto_growth"  # 使用自动增长策略，避免一次性分配过多内存
			
 
				+        
			
 
				+        logger.info(f"[PaddleOCR] 执行命令: {' '.join(cmd)}")
			
 
				+        
			
 
				+        # 执行命令
			
 
				+        result = subprocess.run(
			
 
				+            cmd,
			
 
				+            capture_output=True,
			
 
				+            text=True,
			
 
				+            timeout=300,  # 5分钟超时
			
 
				+            check=False,
			
 
				+        )
			
 
				+        
			
 
				+        if result.returncode != 0:
			
 
				+            logger.error(f"[PaddleOCR] 命令执行失败，返回码: {result.returncode}")
			
 
				+            logger.error(f"[PaddleOCR] 错误输出: {result.stderr}")
			
 
				+            return None
			
 
				+        
			
 
				+        # 从保存的Markdown文件中读取结果
			
 
				+        # PaddleOCR会在save_path下创建目录，文件路径为: {save_path}/{basename}.md
			
 
				+        md_file = os.path.join(save_path_base, f"{image_basename}.md")
			
 
				+        if os.path.exists(md_file):
			
 
				+            logger.info(f"[PaddleOCR] 从Markdown文件读取结果: {md_file}")
			
 
				+            try:
			
 
				+                with open(md_file, 'r', encoding='utf-8') as f:
			
 
				+                    markdown_content = f.read()
			
 
				+                    if markdown_content.strip():
			
 
				+                        # 将markdown内容转换为标准格式
			
 
				+                        # 为了兼容现有代码，我们需要将markdown转换回parsing_res_list格式
			
 
				+                        # 但实际上，我们可以直接返回markdown内容，让调用方处理
			
 
				+                        # 这里我们返回一个特殊标记，表示这是markdown格式
			
 
				+                        logger.info(f"[PaddleOCR] 成功读取Markdown文件，内容长度: {len(markdown_content)} 字符")
			
 
				+                        # 返回markdown内容，使用特殊键标记
			
 
				+                        return {"markdown_content": markdown_content}
			
 
				+                    else:
			
 
				+                        logger.warning("[PaddleOCR] Markdown文件内容为空")
			
 
				+            except Exception as e:
			
 
				+                logger.exception(f"[PaddleOCR] 读取Markdown文件失败: {e}")
			
 
				+        else:
			
 
				+            logger.warning(f"[PaddleOCR] Markdown文件不存在: {md_file}")
			
 
				+        
			
 
				+        # 如果Markdown文件不存在或读取失败，尝试从stdout解析
			
 
				+        output_text = result.stdout.strip()
			
 
				+        if output_text:
			
 
				+            logger.info("[PaddleOCR] 从stdout解析输出")
			
 
				+            parsed_result = parse_paddleocr_output(output_text)
			
 
				+            logger.info(f"[PaddleOCR] 解析成功，获得 {len(parsed_result.get('parsing_res_list', []))} 个区块")
			
 
				+            return parsed_result
			
 
				+        else:
			
 
				+            logger.warning("[PaddleOCR] stdout输出为空，且未找到Markdown文件")
			
 
				+            return None
			
 
				+        
			
 
				+    except subprocess.TimeoutExpired:
			
 
				+        logger.error("[PaddleOCR] 命令执行超时")
			
 
				+        return None
			
 
				+    except Exception as e:
			
 
				+        logger.exception(f"[PaddleOCR] 调用失败: {e}")
			
 
				+        return None
			
 
				+    finally:
			
 
				+        # 无论成功或失败，都尝试重启mineru服务
			
 
				+        if mineru_stopped:
			
 
				+            start_mineru_service()
			
 
				+
			
 
				+
			
 
				+def extract_first_page_from_pdf(pdf_path: str, output_dir: str) -> Optional[str]:
			
 
				+    """从PDF文件中提取第一页作为图片
			
 
				+    
			
 
				+    优先使用pypdfium2，如果不可用则使用pdf2image作为后备方案。
			
 
				+    
			
 
				+    Args:
			
 
				+        pdf_path: PDF文件路径
			
 
				+        output_dir: 输出目录，用于保存提取的图片
			
 
				+        
			
 
				+    Returns:
			
 
				+        提取的图片路径，如果失败返回None
			
 
				+    """
			
 
				+    if not PIL_AVAILABLE:
			
 
				+        logger.error("[PaddleOCR备用] 缺少必要的库（PIL/Pillow），无法处理图片")
			
 
				+        return None
			
 
				+
			
 
				+    if not os.path.exists(pdf_path):
			
 
				+        logger.error(f"[PaddleOCR备用] PDF文件不存在: {pdf_path}")
			
 
				+        return None
			
 
				+
			
 
				+    # 方法1: 尝试使用pypdfium2（优先方法）
			
 
				+    if PDFIUM_AVAILABLE:
			
 
				+        try:
			
 
				+            pdf = pdfium.PdfDocument(pdf_path)
			
 
				+            try:
			
 
				+                if len(pdf) == 0:
			
 
				+                    logger.error("[PaddleOCR备用] PDF文件为空")
			
 
				+                    return None
			
 
				+
			
 
				+                page = pdf[0]
			
 
				+                bitmap = page.render(scale=150 / 72)  # 150 DPI
			
 
				+                pil_image = bitmap.to_pil()
			
 
				+
			
 
				+                os.makedirs(output_dir, exist_ok=True)
			
 
				+                image_filename = f"paddleocr_fallback_page0_{int(time.time() * 1000)}_{random.randint(1000, 9999)}.png"
			
 
				+                image_path = os.path.join(output_dir, image_filename)
			
 
				+                pil_image.save(image_path, "PNG", optimize=True, compress_level=6)
			
 
				+
			
 
				+                logger.info(f"[PaddleOCR备用] 使用pypdfium2从PDF提取第一页图片: {image_path}")
			
 
				+                bitmap.close()
			
 
				+                return image_path
			
 
				+            finally:
			
 
				+                try:
			
 
				+                    pdf.close()
			
 
				+                except Exception:
			
 
				+                    pass
			
 
				+        except Exception as e:
			
 
				+            logger.warning(f"[PaddleOCR备用] 使用pypdfium2提取图片失败，尝试pdf2image: {e}")
			
 
				+
			
 
				+    # 方法2: 使用 pdf2image 作为后备方案
			
 
				+    if PDF2IMAGE_AVAILABLE:
			
 
				+        try:
			
 
				+            images = convert_from_path(pdf_path, dpi=150, first_page=1, last_page=1)
			
 
				+            if not images:
			
 
				+                logger.error("[PaddleOCR备用] pdf2image未能提取到图片")
			
 
				+                return None
			
 
				+
			
 
				+            os.makedirs(output_dir, exist_ok=True)
			
 
				+            image_filename = f"paddleocr_fallback_page0_{int(time.time() * 1000)}_{random.randint(1000, 9999)}.png"
			
 
				+            image_path = os.path.join(output_dir, image_filename)
			
 
				+            images[0].save(image_path, "PNG", optimize=True, compress_level=6)
			
 
				+
			
 
				+            logger.info(f"[PaddleOCR备用] 使用pdf2image从PDF提取第一页图片: {image_path}")
			
 
				+            return image_path
			
 
				+        except Exception as e:
			
 
				+            logger.exception(f"[PaddleOCR备用] 使用pdf2image提取图片失败: {e}")
			
 
				+
			
 
				+    # 如果两种方法都不可用
			
 
				+    missing_libs = []
			
 
				+    if not PDFIUM_AVAILABLE:
			
 
				+        missing_libs.append("pypdfium2")
			
 
				+    if not PDF2IMAGE_AVAILABLE:
			
 
				+        missing_libs.append("pdf2image")
			
 
				+
			
 
				+    logger.error(
			
 
				+        f"[PaddleOCR备用] 缺少必要的库（{'或'.join(missing_libs)}），无法从PDF提取图片。请安装: pip install {' '.join(missing_libs)}"
			
 
				+    )
			
 
				+    return None
			
 
				+
			
 
				+
			
 
				+def find_pdf_file(output_dir: str) -> Optional[str]:
			
 
				+    """在输出目录中查找PDF文件
			
 
				+    
			
 
				+    Args:
			
 
				+        output_dir: 输出目录
			
 
				+        
			
 
				+    Returns:
			
 
				+        PDF文件路径，如果未找到返回None
			
 
				+    """
			
 
				+    if not os.path.exists(output_dir):
			
 
				+        return None
			
 
				+    
			
 
				+    # 查找PDF文件
			
 
				+    pdf_files = list(Path(output_dir).rglob("*.pdf"))
			
 
				+    if pdf_files:
			
 
				+        # 返回第一个找到的PDF文件
			
 
				+        return str(pdf_files[0])
			
 
				+    
			
 
				+    return None
			
 
				+
			
 
				+
			
 
				+def markdown_to_plain_text(markdown_content: str) -> List[str]:
			
 
				+    """将Markdown内容转换为纯文本列表（按行分割）
			
 
				+    
			
 
				+    Args:
			
 
				+        markdown_content: Markdown格式的文本
			
 
				+        
			
 
				+    Returns:
			
 
				+        纯文本列表，每行一个元素
			
 
				+    """
			
 
				+    if not markdown_content:
			
 
				+        return []
			
 
				+    
			
 
				+    lines = []
			
 
				+    in_code_block = False
			
 
				+    
			
 
				+    # 先处理HTML表格：提取整个表格，转换为文本行
			
 
				+    # 查找所有<table>...</table>块
			
 
				+    table_pattern = r'<table[^>]*>.*?</table>'
			
 
				+    tables = re.findall(table_pattern, markdown_content, re.DOTALL)
			
 
				+    
			
 
				+    # 将表格内容替换为占位符，稍后处理
			
 
				+    table_placeholders = []
			
 
				+    for i, table in enumerate(tables):
			
 
				+        placeholder = f"__TABLE_PLACEHOLDER_{i}__"
			
 
				+        table_placeholders.append((placeholder, table))
			
 
				+        markdown_content = markdown_content.replace(table, placeholder, 1)
			
 
				+    
			
 
				+    # 处理每一行
			
 
				+    for line in markdown_content.split('\n'):
			
 
				+        line = line.rstrip()  # 只移除右侧空格
			
 
				+        
			
 
				+        # 检测代码块
			
 
				+        if line.strip().startswith('```'):
			
 
				+            in_code_block = not in_code_block
			
 
				+            continue
			
 
				+        
			
 
				+        if in_code_block:
			
 
				+            # 代码块内的内容保留原样
			
 
				+            if line.strip():
			
 
				+                lines.append(line)
			
 
				+            continue
			
 
				+        
			
 
				+        # 处理表格占位符
			
 
				+        if '__TABLE_PLACEHOLDER_' in line:
			
 
				+            # 找到对应的表格
			
 
				+            for placeholder, table_html in table_placeholders:
			
 
				+                if placeholder in line:
			
 
				+                    # 提取表格中的所有单元格文本
			
 
				+                    table_lines = extract_table_text(table_html)
			
 
				+                    lines.extend(table_lines)
			
 
				+                    break
			
 
				+            continue
			
 
				+        
			
 
				+        # 检测Markdown表格（以 | 开头）
			
 
				+        if '|' in line and line.strip().startswith('|'):
			
 
				+            # 处理表格行：移除首尾的 |，分割单元格
			
 
				+            cells = [cell.strip() for cell in line.split('|') if cell.strip()]
			
 
				+            # 移除表格分隔行（只包含 - 和 |）
			
 
				+            if all(c in ['-', ':', ' '] for c in ''.join(cells)):
			
 
				+                continue
			
 
				+            # 合并单元格内容，用空格分隔
			
 
				+            table_line = ' '.join(cells)
			
 
				+            if table_line.strip():
			
 
				+                lines.append(table_line)
			
 
				+            continue
			
 
				+        
			
 
				+        # 移除Markdown语法标记
			
 
				+        # 移除标题标记 (# ## ### 等)
			
 
				+        line = re.sub(r'^#+\s*', '', line)
			
 
				+        # 移除列表标记 (- * + 等)
			
 
				+        line = re.sub(r'^[-*+]\s+', '', line)
			
 
				+        # 移除数字列表标记
			
 
				+        line = re.sub(r'^\d+\.\s+', '', line)
			
 
				+        # 移除粗体和斜体标记
			
 
				+        line = re.sub(r'\*\*([^*]+)\*\*', r'\1', line)  # **bold**
			
 
				+        line = re.sub(r'\*([^*]+)\*', r'\1', line)  # *italic*
			
 
				+        line = re.sub(r'__([^_]+)__', r'\1', line)  # __bold__
			
 
				+        line = re.sub(r'_([^_]+)_', r'\1', line)  # _italic_
			
 
				+        # 移除链接格式 [text](url) -> text
			
 
				+        line = re.sub(r'\[([^\]]+)\]\([^\)]+\)', r'\1', line)
			
 
				+        # 移除图片格式 ![alt](url) -> alt
			
 
				+        line = re.sub(r'!\[([^\]]*)\]\([^\)]+\)', r'\1', line)
			
 
				+        # 移除行内代码标记
			
 
				+        line = re.sub(r'`([^`]+)`', r'\1', line)
			
 
				+        
			
 
				+        # 移除HTML标签（div、span等）
			
 
				+        line = re.sub(r'<div[^>]*>', '', line)
			
 
				+        line = re.sub(r'</div>', '', line)
			
 
				+        line = re.sub(r'<span[^>]*>', '', line)
			
 
				+        line = re.sub(r'</span>', '', line)
			
 
				+        line = re.sub(r'<[^>]+>', '', line)  # 移除其他HTML标签
			
 
				+        
			
 
				+        # 清理多余空格
			
 
				+        line = line.strip()
			
 
				+        
			
 
				+        if line:  # 只保留非空行
			
 
				+            lines.append(line)
			
 
				+    
			
 
				+    return lines
			
 
				+
			
 
				+
			
 
				+def extract_table_text(table_html: str) -> List[str]:
			
 
				+    """从HTML表格中提取文本，每行一个元素
			
 
				+    
			
 
				+    Args:
			
 
				+        table_html: HTML表格字符串
			
 
				+        
			
 
				+    Returns:
			
 
				+        文本行列表
			
 
				+    """
			
 
				+    table_lines = []
			
 
				+    
			
 
				+    try:
			
 
				+        # 提取所有<tr>标签
			
 
				+        tr_pattern = r'<tr[^>]*>(.*?)</tr>'
			
 
				+        tr_matches = re.findall(tr_pattern, table_html, re.DOTALL)
			
 
				+        
			
 
				+        for tr_content in tr_matches:
			
 
				+            # 提取所有<td>和<th>标签内的文本
			
 
				+            cell_pattern = r'<(?:td|th)[^>]*>(.*?)</(?:td|th)>'
			
 
				+            cells = re.findall(cell_pattern, tr_content, re.DOTALL)
			
 
				+            
			
 
				+            if cells:
			
 
				+                # 清理每个单元格的文本
			
 
				+                cleaned_cells = []
			
 
				+                for cell in cells:
			
 
				+                    # 移除嵌套的HTML标签
			
 
				+                    cleaned = re.sub(r'<[^>]+>', '', cell)
			
 
				+                    # 移除HTML实体
			
 
				+                    cleaned = cleaned.replace('&nbsp;', ' ')
			
 
				+                    cleaned = cleaned.strip()
			
 
				+                    if cleaned:
			
 
				+                        cleaned_cells.append(cleaned)
			
 
				+                
			
 
				+                if cleaned_cells:
			
 
				+                    # 合并单元格内容，用空格分隔
			
 
				+                    table_line = ' '.join(cleaned_cells)
			
 
				+                    if table_line.strip():
			
 
				+                        table_lines.append(table_line)
			
 
				+    except Exception as e:
			
 
				+        logger.warning(f"[Markdown转换] 提取表格文本失败: {e}")
			
 
				+    
			
 
				+    return table_lines
			
 
				+
			
 
				+
			
 
				+def call_paddleocr_ocr(image_path: str, save_path: str) -> tuple[Optional[List[str]], Optional[str]]:
			
 
				+    """调用paddleocr ocr命令提取文本（用于API接口）
			
 
				+    
			
 
				+    Args:
			
 
				+        image_path: 图片路径
			
 
				+        save_path: 保存路径（目录）
			
 
				+        
			
 
				+    Returns:
			
 
				+        (OCR识别的文本列表, JSON文件路径)，如果失败返回(None, None)
			
 
				+    """
			
 
				+    # 在调用PaddleOCR前停止mineru服务以释放GPU内存
			
 
				+    mineru_stopped = stop_mineru_service()
			
 
				+    
			
 
				+    try:
			
 
				+        if not os.path.exists(image_path):
			
 
				+            logger.error(f"[PaddleOCR OCR] 图片文件不存在: {image_path}")
			
 
				+            return None, None
			
 
				+
			
 
				+        # 构建paddleocr ocr命令（NPU 下需加 --device npu:0，否则走 CPU 易段错误）
			
 
				+        cmd = ["paddleocr", "ocr", "-i", image_path, "--save_path", save_path] + _paddle_ocr_device_args()
			
 
				+
			
 
				+        logger.info(f"[PaddleOCR OCR] 执行命令: {' '.join(cmd)}")
			
 
				+
			
 
				+        # 执行命令
			
 
				+        result = subprocess.run(
			
 
				+            cmd,
			
 
				+            capture_output=True,
			
 
				+            text=True,
			
 
				+            timeout=300,  # 5分钟超时
			
 
				+            check=False,
			
 
				+        )
			
 
				+
			
 
				+        if result.returncode != 0:
			
 
				+            logger.error(f"[PaddleOCR OCR] 命令执行失败，返回码: {result.returncode}")
			
 
				+            logger.error(f"[PaddleOCR OCR] 错误输出: {result.stderr}")
			
 
				+            return None, None
			
 
				+
			
 
				+        # 查找保存的JSON文件
			
 
				+        # OCR命令会在save_path下生成 {basename}_res.json
			
 
				+        image_basename = os.path.splitext(os.path.basename(image_path))[0]
			
 
				+        json_file = os.path.join(save_path, f"{image_basename}_res.json")
			
 
				+
			
 
				+        if not os.path.exists(json_file):
			
 
				+            logger.warning(f"[PaddleOCR OCR] JSON文件不存在: {json_file}")
			
 
				+            return None, None
			
 
				+
			
 
				+        # 读取JSON文件
			
 
				+        try:
			
 
				+            with open(json_file, 'r', encoding='utf-8') as f:
			
 
				+                ocr_data = json.load(f)
			
 
				+
			
 
				+            # 优先提取rec_texts字段（如果存在）
			
 
				+            if "rec_texts" in ocr_data and isinstance(ocr_data["rec_texts"], list):
			
 
				+                texts = ocr_data["rec_texts"]
			
 
				+                logger.info(f"[PaddleOCR OCR] 成功提取 {len(texts)} 个文本片段（从rec_texts）")
			
 
				+                return texts, json_file
			
 
				+            
			
 
				+            # 如果没有rec_texts，尝试从parsing_res_list中提取block_content
			
 
				+            if "parsing_res_list" in ocr_data and isinstance(ocr_data["parsing_res_list"], list):
			
 
				+                texts = []
			
 
				+                for item in ocr_data["parsing_res_list"]:
			
 
				+                    if isinstance(item, dict) and "block_content" in item:
			
 
				+                        block_content = item["block_content"]
			
 
				+                        if block_content and block_content.strip():
			
 
				+                            # 如果block_content包含换行符，按行分割
			
 
				+                            if "\n" in block_content:
			
 
				+                                texts.extend([line.strip() for line in block_content.split("\n") if line.strip()])
			
 
				+                            else:
			
 
				+                                texts.append(block_content.strip())
			
 
				+                if texts:
			
 
				+                    logger.info(f"[PaddleOCR OCR] 成功提取 {len(texts)} 个文本片段（从parsing_res_list）")
			
 
				+                    return texts, json_file
			
 
				+            
			
 
				+            logger.warning("[PaddleOCR OCR] JSON文件中未找到rec_texts或parsing_res_list字段")
			
 
				+            return None, json_file
			
 
				+
			
 
				+        except Exception as e:
			
 
				+            logger.exception(f"[PaddleOCR OCR] 读取JSON文件失败: {e}")
			
 
				+            return None, json_file
			
 
				+
			
 
				+    except subprocess.TimeoutExpired:
			
 
				+        logger.error("[PaddleOCR OCR] 命令执行超时")
			
 
				+        return None, None
			
 
				+    except Exception as e:
			
 
				+        logger.exception(f"[PaddleOCR OCR] 调用失败: {e}")
			
 
				+        return None, None
			
 
				+    finally:
			
 
				+        # 无论成功或失败，都尝试重启mineru服务
			
 
				+        if mineru_stopped:
			
 
				+            start_mineru_service()
			
 
				+
			
 
				+
			
 
				+def call_paddleocr_doc_parser_for_text(image_path: str, save_path: str) -> tuple[Optional[List[str]], Optional[str]]:
			
 
				+    """调用paddleocr doc_parser命令，将markdown转换为纯文本（用于内部调用提取关键词）
			
 
				+    
			
 
				+    Args:
			
 
				+        image_path: 图片路径
			
 
				+        save_path: 保存路径（目录）
			
 
				+        
			
 
				+    Returns:
			
 
				+        (纯文本列表（按行分割）, markdown文件路径)，如果失败返回(None, None)
			
 
				+    """
			
 
				+    try:
			
 
				+        if not os.path.exists(image_path):
			
 
				+            logger.error(f"[PaddleOCR DocParser] 图片文件不存在: {image_path}")
			
 
				+            return None, None
			
 
				+        
			
 
				+        # 生成输出目录和基础文件名
			
 
				+        image_dir = os.path.dirname(image_path)
			
 
				+        image_basename = os.path.splitext(os.path.basename(image_path))[0]
			
 
				+        save_path_base = os.path.join(save_path, image_basename)
			
 
				+        os.makedirs(save_path_base, exist_ok=True)
			
 
				+        
			
 
				+        # 构建paddleocr doc_parser命令（NPU 下需加 --device npu:0，否则走 CPU 易段错误）
			
 
				+        cmd = [
			
 
				+            "paddleocr", "doc_parser", "-i", image_path,
			
 
				+            "--precision", "fp32",
			
 
				+            "--use_doc_unwarping", "False",
			
 
				+            "--use_doc_orientation_classify", "True",
			
 
				+            "--use_chart_recognition", "True",
			
 
				+            "--save_path", save_path_base
			
 
				+        ] + _paddle_ocr_device_args()
			
 
				+        
			
 
				+        logger.info(f"[PaddleOCR DocParser] 执行命令: {' '.join(cmd)}")
			
 
				+        
			
 
				+        # 执行命令
			
 
				+        result = subprocess.run(
			
 
				+            cmd,
			
 
				+            capture_output=True,
			
 
				+            text=True,
			
 
				+            timeout=300,  # 5分钟超时
			
 
				+            check=False,
			
 
				+        )
			
 
				+        
			
 
				+        if result.returncode != 0:
			
 
				+            logger.error(f"[PaddleOCR DocParser] 命令执行失败，返回码: {result.returncode}")
			
 
				+            logger.error(f"[PaddleOCR DocParser] 错误输出: {result.stderr}")
			
 
				+            return None, None
			
 
				+        
			
 
				+        # 查找保存的Markdown文件
			
 
				+        # PaddleOCR会在save_path下创建目录，文件路径为: {save_path}/{basename}.md
			
 
				+        md_file = os.path.join(save_path_base, f"{image_basename}.md")
			
 
				+        
			
 
				+        # 也可能在子目录中
			
 
				+        if not os.path.exists(md_file):
			
 
				+            md_files = sorted(Path(save_path_base).rglob("*.md"))
			
 
				+            if md_files:
			
 
				+                md_file = str(md_files[0])
			
 
				+                logger.info(f"[PaddleOCR DocParser] 在子目录中找到Markdown文件: {md_file}")
			
 
				+        
			
 
				+        if not os.path.exists(md_file):
			
 
				+            logger.warning(f"[PaddleOCR DocParser] Markdown文件不存在: {md_file}")
			
 
				+            return None, None
			
 
				+        
			
 
				+        # 读取Markdown文件并转换为纯文本
			
 
				+        try:
			
 
				+            with open(md_file, 'r', encoding='utf-8') as f:
			
 
				+                markdown_content = f.read()
			
 
				+            
			
 
				+            if not markdown_content.strip():
			
 
				+                logger.warning("[PaddleOCR DocParser] Markdown文件内容为空")
			
 
				+                return [], md_file
			
 
				+            
			
 
				+            # 将Markdown转换为纯文本列表
			
 
				+            plain_text_lines = markdown_to_plain_text(markdown_content)
			
 
				+            logger.info(f"[PaddleOCR DocParser] 成功提取 {len(plain_text_lines)} 行纯文本，Markdown文件: {md_file}")
			
 
				+            return plain_text_lines, md_file
			
 
				+                
			
 
				+        except Exception as e:
			
 
				+            logger.exception(f"[PaddleOCR DocParser] 读取Markdown文件失败: {e}")
			
 
				+            return None, md_file
			
 
				+            
			
 
				+    except subprocess.TimeoutExpired:
			
 
				+        logger.error("[PaddleOCR DocParser] 命令执行超时")
			
 
				+        return None, None
			
 
				+    except Exception as e:
			
 
				+        logger.exception(f"[PaddleOCR DocParser] 调用失败: {e}")
			
 
				+        return None, None
			
 
				+
			
 
				+
			
 
				+def extract_keywords_from_ocr_texts(ocr_texts: List[str]) -> Dict[str, Any]:
			
 
				+    """从OCR文本列表中提取关键信息
			
 
				+    
			
 
				+    Args:
			
 
				+        ocr_texts: OCR识别的文本列表
			
 
				+        
			
 
				+    Returns:
			
 
				+        包含提取的关键信息的字典
			
 
				+    """
			
 
				+    keywords = {
			
 
				+        "project": "",
			
 
				+        "standardReferences": "",
			
 
				+        "soundLevelMeterMode": "",
			
 
				+        "soundCalibratorMode": "",
			
 
				+        "calibrationValueBefore": "",
			
 
				+        "calibrationValueAfter": "",
			
 
				+        "weather_info": [],  # 存储天气相关信息
			
 
				+        "address_mapping": {}  # 存储编号到地址的映射，用于电磁检测记录
			
 
				+    }
			
 
				+    
			
 
				+    if not ocr_texts:
			
 
				+        return keywords
			
 
				+    
			
 
				+    # 将所有文本合并，用于匹配
			
 
				+    full_text = " ".join(ocr_texts)
			
 
				+    
			
 
				+    # 提取项目名称
			
 
				+    # 先尝试匹配"项目名称："格式
			
 
				+    project_match = re.search(r'项目名称[:：]([^检测依据声级计声校准器检测前检测后气象条件日期]+)', full_text)
			
 
				+    if project_match:
			
 
				+        project = project_match.group(1).strip()
			
 
				+        # 清理可能的后续内容
			
 
				+        project = re.sub(r'检测依据.*$', '', project).strip()
			
 
				+        keywords["project"] = project
			
 
				+        logger.debug(f"[关键词提取] 提取到项目名称: {project}")
			
 
				+    else:
			
 
				+        # 如果没找到，尝试查找"项目名称"文本，然后检查后续文本片段
			
 
				+        for i, text in enumerate(ocr_texts):
			
 
				+            if "项目名称" in text:
			
 
				+                # 检查当前文本中是否有值（在冒号后面）
			
 
				+                if "：" in text or ":" in text:
			
 
				+                    project_match = re.search(r'项目名称[:：]([^检测依据声级计声校准器检测前检测后气象条件日期]+)', text)
			
 
				+                    if project_match:
			
 
				+                        project = project_match.group(1).strip()
			
 
				+                        project = re.sub(r'检测依据.*$', '', project).strip()
			
 
				+                        if project:
			
 
				+                            keywords["project"] = project
			
 
				+                            logger.debug(f"[关键词提取] 从当前文本提取到项目名称: {project}")
			
 
				+                            break
			
 
				+                # 如果当前文本只有"项目名称"，检查下一个文本片段
			
 
				+                elif text.strip() == "项目名称" or text.strip().startswith("项目名称"):
			
 
				+                    # 检查后续几个文本片段，找到项目名称值
			
 
				+                    for j in range(i + 1, min(i + 3, len(ocr_texts))):
			
 
				+                        next_text = ocr_texts[j].strip()
			
 
				+                        # 如果下一个文本不是"检测依据"、"监测依据"等标签，且包含中文字符，可能是项目名称
			
 
				+                        if next_text and not re.match(r'^(检测依据|监测依据|检查依据|声级计|声校准器|检测前|检测后|气象条件|日期)', next_text):
			
 
				+                            # 检查是否包含中文字符（项目名称通常是中文）
			
 
				+                            if re.search(r'[\u4e00-\u9fa5]', next_text):
			
 
				+                                # 提取项目名称（直到遇到"检测依据"等关键词）
			
 
				+                                project = re.sub(r'(检测依据|监测依据|检查依据).*$', '', next_text).strip()
			
 
				+                                if project:
			
 
				+                                    keywords["project"] = project
			
 
				+                                    logger.debug(f"[关键词提取] 从后续文本提取到项目名称: {project}")
			
 
				+                                    break
			
 
				+                    if keywords["project"]:
			
 
				+                        break
			
 
				+    
			
 
				+    # 提取检测依据
			
 
				+    standard_match = re.search(r'检测依据[:：]([^声级计声校准器检测前检测后气象条件日期]+)', full_text)
			
 
				+    if standard_match:
			
 
				+        standard = standard_match.group(1).strip()
			
 
				+        # 提取GB标准
			
 
				+        gb_standards = re.findall(r'GB\s*\d+[-\.]?\d*[-\.]?\d*', standard)
			
 
				+        if gb_standards:
			
 
				+            keywords["standardReferences"] = " ".join(gb_standards)
			
 
				+        else:
			
 
				+            keywords["standardReferences"] = standard.replace("□其他：", "").strip()
			
 
				+        logger.debug(f"[关键词提取] 提取到检测依据: {keywords['standardReferences']}")
			
 
				+    
			
 
				+    # 提取声级计型号/编号
			
 
				+    sound_meter_match = re.search(r'声级计型号[/：:]?(?:编号)?[:：]\s*([A-Z0-9+/]+)', full_text)
			
 
				+    if sound_meter_match:
			
 
				+        keywords["soundLevelMeterMode"] = sound_meter_match.group(1).strip()
			
 
				+        logger.debug(f"[关键词提取] 提取到声级计型号: {keywords['soundLevelMeterMode']}")
			
 
				+    
			
 
				+    # 提取声校准器型号/编号
			
 
				+    calibrator_match = re.search(r'声校准器型号[/：:]?(?:编号)?[:：]\s*([A-Z0-9+/]+)', full_text)
			
 
				+    if calibrator_match:
			
 
				+        keywords["soundCalibratorMode"] = calibrator_match.group(1).strip()
			
 
				+        logger.debug(f"[关键词提取] 提取到声校准器型号: {keywords['soundCalibratorMode']}")
			
 
				+    
			
 
				+    # 提取校准值 - 按照出现顺序：第一个dB(A)是检测前，第二个是检测后
			
 
				+    # 首先尝试通过字段名匹配
			
 
				+    before_cal_found = False
			
 
				+    after_cal_found = False
			
 
				+    
			
 
				+    # 先尝试通过字段名精确匹配
			
 
				+    for i, text in enumerate(ocr_texts):
			
 
				+        if "检测前校准值" in text and not before_cal_found:
			
 
				+            # 在当前文本中查找（可能格式：检测前校准值：93.8 dB（A））
			
 
				+            before_cal_match = re.search(r'检测前校准值[:：]\s*([0-9.]+)\s*dB[（(]?A[）)]?', text)
			
 
				+            if before_cal_match:
			
 
				+                cal_value = before_cal_match.group(1).strip()
			
 
				+                keywords["calibrationValueBefore"] = f"{cal_value} dB(A)"
			
 
				+                logger.debug(f"[关键词提取] 提取到检测前校准值: {keywords['calibrationValueBefore']}")
			
 
				+                before_cal_found = True
			
 
				+                continue
			
 
				+            # 如果当前文本只有字段名（如"检测前校准值："），检查相邻文本片段
			
 
				+            elif re.search(r'检测前校准值[:：]\s*$', text) or (text.strip() == "检测前校准值："):
			
 
				+                # 检查后续3个文本片段，查找包含dB（A）的文本
			
 
				+                for j in range(i + 1, min(i + 4, len(ocr_texts))):
			
 
				+                    next_text = ocr_texts[j]
			
 
				+                    # 查找包含dB（A）的文本（如"93.8dB（A）"）
			
 
				+                    db_match = re.search(r'([0-9.]+)\s*dB[（(]?A[）)]?', next_text)
			
 
				+                    if db_match:
			
 
				+                        cal_value = db_match.group(1).strip()
			
 
				+                        keywords["calibrationValueBefore"] = f"{cal_value} dB(A)"
			
 
				+                        logger.debug(f"[关键词提取] 从相邻文本提取到检测前校准值: {keywords['calibrationValueBefore']}")
			
 
				+                        before_cal_found = True
			
 
				+                        break
			
 
				+                if before_cal_found:
			
 
				+                    continue
			
 
				+        
			
 
				+        if "检测后校准值" in text and not after_cal_found:
			
 
				+            # 在当前文本中查找（可能格式：检测后校准值：93.8 dB（A）或 93.8dB（A）检测后校准值：_93.8dB（A)）
			
 
				+            after_cal_match = re.search(r'检测后校准值[:：]\s*([0-9.]+)\s*dB[（(]?A[）)]?', text)
			
 
				+            if after_cal_match:
			
 
				+                cal_value = after_cal_match.group(1).strip()
			
 
				+                keywords["calibrationValueAfter"] = f"{cal_value} dB(A)"
			
 
				+                logger.debug(f"[关键词提取] 提取到检测后校准值: {keywords['calibrationValueAfter']}")
			
 
				+                after_cal_found = True
			
 
				+                continue
			
 
				+            # 如果当前文本包含"检测后校准值"但值在文本前面（如"93.8dB（A）检测后校准值："）
			
 
				+            elif re.search(r'([0-9.]+)\s*dB[（(]?A[）)]?\s*检测后校准值', text):
			
 
				+                db_match = re.search(r'([0-9.]+)\s*dB[（(]?A[）)]?', text)
			
 
				+                if db_match:
			
 
				+                    cal_value = db_match.group(1).strip()
			
 
				+                    keywords["calibrationValueAfter"] = f"{cal_value} dB(A)"
			
 
				+                    logger.debug(f"[关键词提取] 从同一文本提取到检测后校准值: {keywords['calibrationValueAfter']}")
			
 
				+                    after_cal_found = True
			
 
				+                    continue
			
 
				+            # 如果当前文本只有字段名（如"检测后校准值："），检查相邻文本片段
			
 
				+            elif re.search(r'检测后校准值[:：]\s*$', text) or (text.strip() == "检测后校准值："):
			
 
				+                # 检查后续3个文本片段，查找包含dB（A）的文本
			
 
				+                for j in range(i + 1, min(i + 4, len(ocr_texts))):
			
 
				+                    next_text = ocr_texts[j]
			
 
				+                    # 查找包含dB（A）的文本（如"93.8dB（A）"）
			
 
				+                    db_match = re.search(r'([0-9.]+)\s*dB[（(]?A[）)]?', next_text)
			
 
				+                    if db_match:
			
 
				+                        cal_value = db_match.group(1).strip()
			
 
				+                        keywords["calibrationValueAfter"] = f"{cal_value} dB(A)"
			
 
				+                        logger.debug(f"[关键词提取] 从相邻文本提取到检测后校准值: {keywords['calibrationValueAfter']}")
			
 
				+                        after_cal_found = True
			
 
				+                        break
			
 
				+                if after_cal_found:
			
 
				+                    continue
			
 
				+    
			
 
				+    # 如果通过字段名没有找到，按照出现顺序：第一个dB(A)是检测前，第二个是检测后
			
 
				+    if not before_cal_found or not after_cal_found:
			
 
				+        db_a_matches = []  # 存储所有找到的dB(A)值及其位置
			
 
				+        for i, text in enumerate(ocr_texts):
			
 
				+            # 查找包含dB（A）的文本
			
 
				+            db_matches = re.finditer(r'([0-9.]+)\s*dB[（(]?A[）)]?', text)
			
 
				+            for match in db_matches:
			
 
				+                cal_value = match.group(1).strip()
			
 
				+                db_a_matches.append((i, cal_value, text))
			
 
				+        
			
 
				+        # 如果找到至少一个dB(A)，且还没有检测前校准值，第一个就是检测前
			
 
				+        if db_a_matches and not before_cal_found:
			
 
				+            first_cal_value = db_a_matches[0][1]
			
 
				+            keywords["calibrationValueBefore"] = f"{first_cal_value} dB(A)"
			
 
				+            logger.debug(f"[关键词提取] 按出现顺序提取到检测前校准值（第一个dB(A)）: {keywords['calibrationValueBefore']}")
			
 
				+            before_cal_found = True
			
 
				+        
			
 
				+        # 如果找到至少两个dB(A)，且还没有检测后校准值，第二个就是检测后
			
 
				+        if len(db_a_matches) >= 2 and not after_cal_found:
			
 
				+            second_cal_value = db_a_matches[1][1]
			
 
				+            keywords["calibrationValueAfter"] = f"{second_cal_value} dB(A)"
			
 
				+            logger.debug(f"[关键词提取] 按出现顺序提取到检测后校准值（第二个dB(A)）: {keywords['calibrationValueAfter']}")
			
 
				+            after_cal_found = True
			
 
				+        # 如果只找到一个dB(A)，且还没有检测后校准值，且检测前已经找到，那么这个就是检测后（可能是同一个值）
			
 
				+        elif len(db_a_matches) == 1 and not after_cal_found and before_cal_found:
			
 
				+            # 如果检测前和检测后是同一个值，也设置检测后
			
 
				+            if keywords["calibrationValueBefore"]:
			
 
				+                keywords["calibrationValueAfter"] = keywords["calibrationValueBefore"]
			
 
				+                logger.debug(f"[关键词提取] 检测前和检测后校准值相同: {keywords['calibrationValueAfter']}")
			
 
				+    
			
 
				+    # 提取天气信息（从文本片段中查找包含日期和天气信息的片段）
			
 
				+    # 需要处理文本可能分散在多个片段中的情况
			
 
				+    # 只有当"日期："存在且后续有天气相关信息时才提取
			
 
				+    current_weather_info = None
			
 
				+    weather_start_idx = -1  # 记录天气信息开始的索引
			
 
				+    
			
 
				+    for i, text in enumerate(ocr_texts):
			
 
				+        # 查找包含"日期："的文本，开始新的天气记录
			
 
				+        # 只有当后续文本中有天气相关信息时才创建记录
			
 
				+        date_match = re.search(r'日期[:：]\s*([\d.\-]+)', text)
			
 
				+        if date_match:
			
 
				+            # 检查后续10个文本片段中是否有天气相关信息（天气、温度、湿度、风速、风向等）
			
 
				+            has_weather_info = False
			
 
				+            for j in range(i, min(i + 10, len(ocr_texts))):
			
 
				+                check_text = ocr_texts[j]
			
 
				+                if any(keyword in check_text for keyword in ["天气", "温度", "湿度", "风速", "风向", "℃", "%RH", "m/s"]):
			
 
				+                    has_weather_info = True
			
 
				+                    break
			
 
				+            
			
 
				+            if has_weather_info:
			
 
				+                # 如果之前有未完成的天气记录，先保存
			
 
				+                if current_weather_info and any([current_weather_info["monitorAt"], current_weather_info["weather"], 
			
 
				+                                                 current_weather_info["temp"], current_weather_info["humidity"], 
			
 
				+                                                 current_weather_info["windSpeed"], current_weather_info["windDirection"]]):
			
 
				+                    keywords["weather_info"].append(current_weather_info)
			
 
				+                
			
 
				+                # 创建新的天气记录
			
 
				+                current_weather_info = {
			
 
				+                    "monitorAt": date_match.group(1).strip(),
			
 
				+                    "weather": "",
			
 
				+                    "temp": "",
			
 
				+                    "humidity": "",
			
 
				+                    "windSpeed": "",
			
 
				+                    "windDirection": ""
			
 
				+                }
			
 
				+                weather_start_idx = i
			
 
				+        
			
 
				+        # 如果当前有天气记录，继续提取信息（从当前文本和后续几个文本中）
			
 
				+        if current_weather_info:
			
 
				+            # 只在天气记录开始后的10个文本片段内查找（避免跨太远）
			
 
				+            if weather_start_idx >= 0 and i <= weather_start_idx + 10:
			
 
				+                # 查找天气（在同一文本或后续文本中）
			
 
				+                if not current_weather_info["weather"]:
			
 
				+                    weather_match = re.search(r'天气\s*([^\s温度湿度风速风向]+)', text)
			
 
				+                    if weather_match:
			
 
				+                        weather_value = weather_match.group(1).strip()
			
 
				+                        if weather_value and weather_value != "_" and not re.match(r'^[\d.\-]+$', weather_value):
			
 
				+                            current_weather_info["weather"] = weather_value
			
 
				+                
			
 
				+                # 查找温度（可能格式：温度29.5-35.0 或 温度 29.5-35.0）
			
 
				+                if not current_weather_info["temp"]:
			
 
				+                    temp_match = re.search(r'温度\s*([0-9.\-]+)', text)
			
 
				+                    if temp_match:
			
 
				+                        current_weather_info["temp"] = temp_match.group(1).strip()
			
 
				+                
			
 
				+                # 查找湿度（可能格式：湿度74.0-74.1 或 在"℃ 湿度"之后的文本中）
			
 
				+                if not current_weather_info["humidity"]:
			
 
				+                    # 先检查当前文本是否包含湿度值
			
 
				+                    humidity_match = re.search(r'湿度\s*([0-9.\-]+)', text)
			
 
				+                    if humidity_match:
			
 
				+                        current_weather_info["humidity"] = humidity_match.group(1).strip()
			
 
				+                    # 如果当前文本是"℃ 湿度"或类似格式，湿度值可能在下一行
			
 
				+                    elif "湿度" in text and i + 1 < len(ocr_texts):
			
 
				+                        next_text = ocr_texts[i + 1]
			
 
				+                        if re.match(r'^[0-9.\-]+', next_text):
			
 
				+                            current_weather_info["humidity"] = next_text.strip()
			
 
				+                
			
 
				+                # 查找风速（可能格式：风速0.4-0.5 或 在"%RH 风速"之后的文本中）
			
 
				+                if not current_weather_info["windSpeed"]:
			
 
				+                    # 先检查当前文本是否包含风速值
			
 
				+                    wind_speed_match = re.search(r'风速\s*([0-9.\-]+)', text)
			
 
				+                    if wind_speed_match:
			
 
				+                        current_weather_info["windSpeed"] = wind_speed_match.group(1).strip()
			
 
				+                    # 如果当前文本是"%RH 风速"或类似格式，风速值可能在下一行
			
 
				+                    elif "风速" in text and i + 1 < len(ocr_texts):
			
 
				+                        next_text = ocr_texts[i + 1]
			
 
				+                        if re.match(r'^[0-9.\-]+', next_text):
			
 
				+                            current_weather_info["windSpeed"] = next_text.strip()
			
 
				+                
			
 
				+                # 查找风向（可能格式：风向南风 或 在"m/s风向"之后的文本中，或 "_m/s风向南风" 或 "m/s风向南风"）
			
 
				+                if not current_weather_info["windDirection"]:
			
 
				+                    # 先检查当前文本是否包含风向值（格式：风向南风）
			
 
				+                    # 改进正则表达式，匹配更长的风向值（如"南风"、"东北"、"东偏北"等）
			
 
				+                    # 注意：不要排除"风"字，因为"风速"中包含"风"，会导致"南风"只匹配到"南"
			
 
				+                    wind_dir_match = re.search(r'风向\s*([^\s日期温度湿度]+?)(?=\s|日期|温度|湿度|风速|$)', text)
			
 
				+                    if wind_dir_match:
			
 
				+                        wind_value = wind_dir_match.group(1).strip()
			
 
				+                        # 确保不是"m/s"或数字
			
 
				+                        if wind_value and wind_value != "m/s" and not re.match(r'^[0-9.\-]+$', wind_value):
			
 
				+                            # 如果只匹配到单个方向字（如"南"），检查下一个文本片段是否是"风"
			
 
				+                            if len(wind_value) == 1 and i + 1 < len(ocr_texts):
			
 
				+                                next_text = ocr_texts[i + 1].strip()
			
 
				+                                # 如果下一个文本是"风"，合并为"南风"等
			
 
				+                                if next_text == "风" or next_text.startswith("风"):
			
 
				+                                    wind_value = wind_value + "风"
			
 
				+                                    logger.debug(f"[关键词提取] 合并风向值: {wind_value}")
			
 
				+                            current_weather_info["windDirection"] = wind_value
			
 
				+                    # 如果当前文本是"m/s风向"或"_m/s风向"格式，风向值在同一文本中（如 "_m/s风向南风" 或 "m/s风向南风"）
			
 
				+                    if not current_weather_info["windDirection"]:
			
 
				+                        # 注意：不要排除"风"字，因为"风速"中包含"风"，会导致"南风"只匹配到"南"
			
 
				+                        wind_dir_match = re.search(r'[_\s]*m/s\s*风向\s*([^\s日期温度湿度]+?)(?=\s|日期|温度|湿度|风速|$)', text)
			
 
				+                        if wind_dir_match:
			
 
				+                            wind_value = wind_dir_match.group(1).strip()
			
 
				+                            if wind_value and not re.match(r'^[0-9.\-]+$', wind_value):
			
 
				+                                # 如果只匹配到单个方向字，检查下一个文本片段
			
 
				+                                if len(wind_value) == 1 and i + 1 < len(ocr_texts):
			
 
				+                                    next_text = ocr_texts[i + 1].strip()
			
 
				+                                    if next_text == "风" or next_text.startswith("风"):
			
 
				+                                        wind_value = wind_value + "风"
			
 
				+                                        logger.debug(f"[关键词提取] 合并风向值: {wind_value}")
			
 
				+                                current_weather_info["windDirection"] = wind_value
			
 
				+                    # 如果当前文本是"m/s"或类似格式，风向值可能在下一行
			
 
				+                    if not current_weather_info["windDirection"]:
			
 
				+                        if ("m/s" in text or "风向" in text) and i + 1 < len(ocr_texts):
			
 
				+                            next_text = ocr_texts[i + 1].strip()
			
 
				+                            if next_text and not re.match(r'^[0-9.\-]+', next_text) and "风向" not in next_text:
			
 
				+                                wind_value = next_text
			
 
				+                                # 如果下一个文本是单个方向字，再检查下下个文本是否是"风"
			
 
				+                                if len(wind_value) == 1 and i + 2 < len(ocr_texts):
			
 
				+                                    next_next_text = ocr_texts[i + 2].strip()
			
 
				+                                    if next_next_text == "风" or next_next_text.startswith("风"):
			
 
				+                                        wind_value = wind_value + "风"
			
 
				+                                        logger.debug(f"[关键词提取] 合并风向值: {wind_value}")
			
 
				+                                current_weather_info["windDirection"] = wind_value
			
 
				+    
			
 
				+    # 保存最后一个天气记录
			
 
				+    if current_weather_info and any([current_weather_info["monitorAt"], current_weather_info["weather"], 
			
 
				+                                     current_weather_info["temp"], current_weather_info["humidity"], 
			
 
				+                                     current_weather_info["windSpeed"], current_weather_info["windDirection"]]):
			
 
				+        keywords["weather_info"].append(current_weather_info)
			
 
				+    
			
 
				+    # 提取监测地点（address）信息，用于电磁检测记录
			
 
				+    # 匹配模式：编号（如EB1, EB2, ZB1等）后面跟着地址信息
			
 
				+    # 地址通常在编号之后，可能在同一个文本片段或相邻的文本片段中
			
 
				+    for i, text in enumerate(ocr_texts):
			
 
				+        # 查找编号模式：EB1, EB2, ZB1, ZB2等
			
 
				+        code_match = re.search(r'(E[ZB]\d+|Z[ZB]\d+)', text, re.IGNORECASE)
			
 
				+        if code_match:
			
 
				+            code = code_match.group(1).upper()  # 统一转为大写
			
 
				+            # 在当前文本中查找地址（编号后面的非数字、非时间格式的文本）
			
 
				+            # 地址通常在编号之后，可能是中文地名
			
 
				+            address_candidates = []
			
 
				+            
			
 
				+            # 在当前文本中，编号之后查找地址
			
 
				+            code_pos = code_match.end()
			
 
				+            remaining_text = text[code_pos:].strip()
			
 
				+            # 跳过可能的空格、标点等
			
 
				+            remaining_text = re.sub(r'^[\s,，。、]+', '', remaining_text)
			
 
				+            
			
 
				+            # 如果剩余文本不为空且不是纯数字或时间格式，可能是地址
			
 
				+            if remaining_text and not re.match(r'^[\d.\-:\s]+$', remaining_text):
			
 
				+                # 提取地址（直到遇到数字、时间或特定关键词）
			
 
				+                address_match = re.search(r'^([^\d\n]+?)(?=\d|时间|线高|$)', remaining_text)
			
 
				+                if address_match:
			
 
				+                    address = address_match.group(1).strip()
			
 
				+                    # 清理地址，移除常见的非地址字符
			
 
				+                    address = re.sub(r'[，。、\s]+$', '', address)
			
 
				+                    if address and len(address) > 0:
			
 
				+                        address_candidates.append(address)
			
 
				+            
			
 
				+            # 如果当前文本中没有找到地址，检查相邻的文本片段
			
 
				+            if not address_candidates:
			
 
				+                # 检查编号之前的文本片段（地址可能在编号之前，需要跳过数字、时间、高度等）
			
 
				+                # 向前查找最多5个文本片段，跳过数字、时间、高度等，找到中文地名
			
 
				+                for j in range(i - 1, max(i - 6, -1), -1):
			
 
				+                    prev_text = ocr_texts[j].strip()
			
 
				+                    if not prev_text:
			
 
				+                        continue
			
 
				+                    # 跳过编号、数字、时间、高度等
			
 
				+                    if re.match(r'^(E[ZB]\d+|Z[ZB]\d+|\d+|时间|线高|编号|均值|24m|\d{4}[.\-]\d{1,2}[.\-]\d{1,2})', prev_text, re.IGNORECASE):
			
 
				+                        continue
			
 
				+                    # 检查是否是中文地名（包含至少2个中文字符）
			
 
				+                    if re.search(r'[\u4e00-\u9fa5]{2,}', prev_text):
			
 
				+                        # 进一步确认：不是纯数字、时间格式等
			
 
				+                        if not re.match(r'^[\d.\-:\s]+$', prev_text):
			
 
				+                            address_candidates.append(prev_text)
			
 
				+                            logger.debug(f"[关键词提取] 在编号{code}之前找到地址候选 (索引{j}): {prev_text}")
			
 
				+                            break  # 找到第一个地址就停止
			
 
				+                
			
 
				+                # 检查编号之后的文本片段
			
 
				+                if not address_candidates and i + 1 < len(ocr_texts):
			
 
				+                    next_text = ocr_texts[i + 1].strip()
			
 
				+                    # 如果下一个文本不是编号、数字、时间等，可能是地址
			
 
				+                    if next_text and not re.match(r'^(E[ZB]\d+|Z[ZB]\d+|\d+|时间|线高|编号|均值|24m|\d{4}[.\-]\d{1,2}[.\-]\d{1,2})', next_text, re.IGNORECASE):
			
 
				+                        # 检查是否是中文地名
			
 
				+                        if re.search(r'[\u4e00-\u9fa5]{2,}', next_text):
			
 
				+                            address_candidates.append(next_text)
			
 
				+            
			
 
				+            # 如果找到地址候选，选择最合适的（通常是第一个非空的）
			
 
				+            if address_candidates:
			
 
				+                address = address_candidates[0]
			
 
				+                # 进一步清理地址
			
 
				+                address = re.sub(r'^[，。、\s]+|[，。、\s]+$', '', address)
			
 
				+                if address:
			
 
				+                    keywords["address_mapping"][code] = address
			
 
				+                    logger.debug(f"[关键词提取] 提取到监测地点: {code} -> {address}")
			
 
				+    
			
 
				+    return keywords
			
 
				+
			
 
				+
			
 
				+def extract_keywords_from_markdown(markdown_content: str) -> Dict[str, Any]:
			
 
				+    """从markdown内容中直接提取关键信息
			
 
				+    
			
 
				+    Args:
			
 
				+        markdown_content: markdown内容字符串
			
 
				+        
			
 
				+    Returns:
			
 
				+        包含提取的关键信息的字典
			
 
				+    """
			
 
				+    keywords = {
			
 
				+        "project": "",
			
 
				+        "standardReferences": "",
			
 
				+        "soundLevelMeterMode": "",
			
 
				+        "soundCalibratorMode": "",
			
 
				+        "calibrationValueBefore": "",
			
 
				+        "calibrationValueAfter": "",
			
 
				+        "weather_info": []  # 存储天气相关信息
			
 
				+    }
			
 
				+    
			
 
				+    if not markdown_content:
			
 
				+        return keywords
			
 
				+    
			
 
				+    # 移除HTML标签，保留文本内容（但保留表格结构信息）
			
 
				+    # 先提取表格中的文本内容
			
 
				+    text_content = markdown_content
			
 
				+    
			
 
				+    # 提取项目名称
			
 
				+    project_match = re.search(r'项目名称[:：]([^检测依据声级计声校准器检测前检测后气象条件日期<>]+)', text_content)
			
 
				+    if project_match:
			
 
				+        project = project_match.group(1).strip()
			
 
				+        # 清理可能的后续内容和HTML标签
			
 
				+        project = re.sub(r'检测依据.*$', '', project).strip()
			
 
				+        project = re.sub(r'<[^>]+>', '', project).strip()
			
 
				+        if project:
			
 
				+            keywords["project"] = project
			
 
				+            logger.debug(f"[Markdown关键词提取] 提取到项目名称: {project}")
			
 
				+    
			
 
				+    # 提取检测依据
			
 
				+    standard_match = re.search(r'检测依据[:：]([^声级计声校准器检测前检测后气象条件日期<>]+)', text_content)
			
 
				+    if standard_match:
			
 
				+        standard = standard_match.group(1).strip()
			
 
				+        # 提取GB标准
			
 
				+        gb_standards = re.findall(r'GB\s*\d+[-\.]?\d*[-\.]?\d*', standard)
			
 
				+        if gb_standards:
			
 
				+            keywords["standardReferences"] = " ".join(gb_standards)
			
 
				+        else:
			
 
				+            keywords["standardReferences"] = re.sub(r'<[^>]+>', '', standard).replace("□其他：", "").strip()
			
 
				+        logger.debug(f"[Markdown关键词提取] 提取到检测依据: {keywords['standardReferences']}")
			
 
				+    
			
 
				+    # 提取声级计型号/编号
			
 
				+    sound_meter_match = re.search(r'声级计型号[/：:]?(?:编号)?[:：]\s*([A-Z0-9+/（）()]+)', text_content)
			
 
				+    if sound_meter_match:
			
 
				+        sound_meter = sound_meter_match.group(1).strip()
			
 
				+        sound_meter = re.sub(r'<[^>]+>', '', sound_meter).strip()
			
 
				+        if sound_meter:
			
 
				+            keywords["soundLevelMeterMode"] = sound_meter
			
 
				+            logger.debug(f"[Markdown关键词提取] 提取到声级计型号: {keywords['soundLevelMeterMode']}")
			
 
				+    
			
 
				+    # 提取声校准器型号/编号
			
 
				+    calibrator_match = re.search(r'声校准器型号[/：:]?(?:编号)?[:：]\s*([A-Z0-9+/（）()]+)', text_content)
			
 
				+    if calibrator_match:
			
 
				+        calibrator = calibrator_match.group(1).strip()
			
 
				+        calibrator = re.sub(r'<[^>]+>', '', calibrator).strip()
			
 
				+        if calibrator:
			
 
				+            keywords["soundCalibratorMode"] = calibrator
			
 
				+            logger.debug(f"[Markdown关键词提取] 提取到声校准器型号: {keywords['soundCalibratorMode']}")
			
 
				+    
			
 
				+    # 提取检测前校准值
			
 
				+    before_cal_match = re.search(r'检测前校准值[:：]\s*([0-9.]+)\s*dB[（(]?A[）)]?', text_content)
			
 
				+    if before_cal_match:
			
 
				+        cal_value = before_cal_match.group(1).strip()
			
 
				+        keywords["calibrationValueBefore"] = f"{cal_value} dB(A)"
			
 
				+        logger.debug(f"[Markdown关键词提取] 提取到检测前校准值: {keywords['calibrationValueBefore']}")
			
 
				+    
			
 
				+    # 提取检测后校准值
			
 
				+    after_cal_match = re.search(r'检测后校准值[:：]\s*([0-9.]+)\s*dB[（(]?A[）)]?', text_content)
			
 
				+    if after_cal_match:
			
 
				+        cal_value = after_cal_match.group(1).strip()
			
 
				+        keywords["calibrationValueAfter"] = f"{cal_value} dB(A)"
			
 
				+        logger.debug(f"[Markdown关键词提取] 提取到检测后校准值: {keywords['calibrationValueAfter']}")
			
 
				+    
			
 
				+    # 提取天气信息
			
 
				+    # 查找所有包含"日期："的行或片段
			
 
				+    date_pattern = r'日期[:：]\s*([\d.\-]+)'
			
 
				+    date_matches = list(re.finditer(date_pattern, text_content))
			
 
				+    
			
 
				+    for date_match in date_matches:
			
 
				+        date_value = date_match.group(1).strip()
			
 
				+        # 获取日期匹配位置后的文本（最多500字符）
			
 
				+        start_pos = date_match.end()
			
 
				+        weather_section = text_content[start_pos:start_pos + 500]
			
 
				+        
			
 
				+        weather_info = {
			
 
				+            "monitorAt": date_value,
			
 
				+            "weather": "",
			
 
				+            "temp": "",
			
 
				+            "humidity": "",
			
 
				+            "windSpeed": "",
			
 
				+            "windDirection": ""
			
 
				+        }
			
 
				+        
			
 
				+        # 提取天气
			
 
				+        weather_match = re.search(r'天气\s*([^\s温度湿度风速风向<>]+)', weather_section)
			
 
				+        if weather_match:
			
 
				+            weather_value = weather_match.group(1).strip()
			
 
				+            weather_value = re.sub(r'<[^>]+>', '', weather_value).strip()
			
 
				+            if weather_value and weather_value != "_" and not re.match(r'^[\d.\-]+$', weather_value):
			
 
				+                weather_info["weather"] = weather_value
			
 
				+        
			
 
				+        # 提取温度
			
 
				+        temp_match = re.search(r'温度[:：]?\s*([0-9.\-]+)', weather_section)
			
 
				+        if temp_match:
			
 
				+            weather_info["temp"] = temp_match.group(1).strip()
			
 
				+        
			
 
				+        # 提取湿度
			
 
				+        humidity_match = re.search(r'湿度[:：]?\s*([0-9.\-]+)', weather_section)
			
 
				+        if humidity_match:
			
 
				+            weather_info["humidity"] = humidity_match.group(1).strip()
			
 
				+        
			
 
				+        # 提取风速
			
 
				+        wind_speed_match = re.search(r'风速[:：]?\s*([0-9.\-]+)', weather_section)
			
 
				+        if wind_speed_match:
			
 
				+            weather_info["windSpeed"] = wind_speed_match.group(1).strip()
			
 
				+        
			
 
				+        # 提取风向
			
 
				+        wind_dir_match = re.search(r'风向[:：]?\s*([^\s日期温度湿度风速<>]+?)(?=\s|日期|温度|湿度|风速|$|<)', weather_section)
			
 
				+        if wind_dir_match:
			
 
				+            wind_value = wind_dir_match.group(1).strip()
			
 
				+            wind_value = re.sub(r'<[^>]+>', '', wind_value).strip()
			
 
				+            if wind_value and wind_value != "m/s" and not re.match(r'^[0-9.\-]+$', wind_value):
			
 
				+                weather_info["windDirection"] = wind_value
			
 
				+        
			
 
				+        # 如果至少有一个字段不为空，则添加这条记录
			
 
				+        if any([weather_info["monitorAt"], weather_info["weather"], weather_info["temp"], 
			
 
				+                weather_info["humidity"], weather_info["windSpeed"], weather_info["windDirection"]]):
			
 
				+            keywords["weather_info"].append(weather_info)
			
 
				+            logger.debug(f"[Markdown关键词提取] 提取到天气记录: {weather_info}")
			
 
				+    
			
 
				+    return keywords
			
 
				+
			
 
				+
			
 
				+def supplement_missing_fields_from_ocr_json(
			
 
				+    records: List[Dict[str, Any]], 
			
 
				+    ocr_json_path: str,
			
 
				+    field_mapping: Dict[str, str] = None
			
 
				+) -> List[Dict[str, Any]]:
			
 
				+    """从OCR的JSON输出中补充缺失字段
			
 
				+    
			
 
				+    根据文本位置关系来补充缺失字段。例如，如果找到了maxReactivePower的值（如"-2.48"），
			
 
				+    那么minReactivePower的值就在它后面的位置（"-4.75"）。
			
 
				+    
			
 
				+    Args:
			
 
				+        records: 原始解析记录列表（OperationalConditionV2格式）
			
 
				+        ocr_json_path: OCR输出的JSON文件路径
			
 
				+        field_mapping: 字段映射关系，如{"maxReactivePower": "minReactivePower"}，表示maxReactivePower后面是minReactivePower
			
 
				+        
			
 
				+    Returns:
			
 
				+        补充后的记录列表
			
 
				+    """
			
 
				+    if not records or not ocr_json_path or not os.path.exists(ocr_json_path):
			
 
				+        return records
			
 
				+    
			
 
				+    try:
			
 
				+        # 读取OCR JSON文件
			
 
				+        with open(ocr_json_path, 'r', encoding='utf-8') as f:
			
 
				+            ocr_data = json.load(f)
			
 
				+        
			
 
				+        # 提取rec_texts数组
			
 
				+        rec_texts = ocr_data.get("rec_texts", [])
			
 
				+        if not rec_texts:
			
 
				+            logger.warning("[OCR字段补充] JSON中未找到rec_texts字段")
			
 
				+            return records
			
 
				+        
			
 
				+        logger.info(f"[OCR字段补充] 从OCR JSON中提取到 {len(rec_texts)} 个文本片段")
			
 
				+        
			
 
				+        # 默认字段映射：max字段后面是min字段
			
 
				+        if field_mapping is None:
			
 
				+            field_mapping = {
			
 
				+                "maxVoltage": "minVoltage",
			
 
				+                "maxCurrent": "minCurrent",
			
 
				+                "maxActivePower": "minActivePower",
			
 
				+                "maxReactivePower": "minReactivePower"
			
 
				+            }
			
 
				+        
			
 
				+        # 为每条记录补充缺失字段
			
 
				+        for record in records:
			
 
				+            record_name = record.get("name", "")
			
 
				+            logger.debug(f"[OCR字段补充] 处理记录: {record_name}")
			
 
				+            
			
 
				+            # 对于每个max字段，如果对应的min字段为空，尝试从OCR中补充
			
 
				+            for max_field, min_field in field_mapping.items():
			
 
				+                max_value = record.get(max_field, "").strip()
			
 
				+                min_value = record.get(min_field, "").strip()
			
 
				+                
			
 
				+                # 如果max字段有值但min字段为空，尝试从OCR中补充
			
 
				+                if max_value and not min_value:
			
 
				+                    logger.debug(f"[OCR字段补充] 记录 {record_name}: {max_field}={max_value}, {min_field}为空，尝试从OCR补充")
			
 
				+                    
			
 
				+                    # 在rec_texts中查找max_value
			
 
				+                    try:
			
 
				+                        max_value_float = float(max_value)
			
 
				+                        # 查找匹配的文本（允许小的数值差异）
			
 
				+                        found_max = False
			
 
				+                        for i, text in enumerate(rec_texts):
			
 
				+                            # 尝试将文本转换为数值
			
 
				+                            try:
			
 
				+                                text_float = float(text.strip())
			
 
				+                                # 如果数值匹配（允许小的误差）
			
 
				+                                if abs(text_float - max_value_float) < 0.01:
			
 
				+                                    found_max = True
			
 
				+                                    # 检查后续几个文本，找到第一个数值作为min_value
			
 
				+                                    # 在表格中，max和min通常是相邻的，但中间可能有其他文本
			
 
				+                                    for j in range(i + 1, min(i + 5, len(rec_texts))):  # 检查后续最多4个文本
			
 
				+                                        next_text = rec_texts[j].strip()
			
 
				+                                        try:
			
 
				+                                            next_value_float = float(next_text)
			
 
				+                                            # 如果找到数值，且与max_value不同，则作为min_value
			
 
				+                                            if abs(next_value_float - max_value_float) > 0.01:
			
 
				+                                                record[min_field] = next_text
			
 
				+                                                logger.info(f"[OCR字段补充] 从OCR补充 {min_field}: {next_text} (在 {max_field}={max_value} 之后，位置 {j})")
			
 
				+                                                break
			
 
				+                                        except ValueError:
			
 
				+                                            # 不是数值，继续查找
			
 
				+                                            continue
			
 
				+                                    if record.get(min_field):
			
 
				+                                        break
			
 
				+                            except ValueError:
			
 
				+                                # 文本不是数值，继续
			
 
				+                                pass
			
 
				+                        
			
 
				+                        if not found_max:
			
 
				+                            logger.debug(f"[OCR字段补充] 未在OCR中找到 {max_field} 的值 '{max_value}'")
			
 
				+                    except ValueError:
			
 
				+                        # max_value不是数值，跳过
			
 
				+                        logger.debug(f"[OCR字段补充] {max_field}值 '{max_value}' 不是数值，跳过")
			
 
				+                        pass
			
 
				+        
			
 
				+        logger.info("[OCR字段补充] 字段补充完成")
			
 
				+        return records
			
 
				+        
			
 
				+    except Exception as e:
			
 
				+        logger.exception(f"[OCR字段补充] 补充过程出错: {e}")
			
 
				+        return records
			
 
				+
			
 
				+
			
 
				+def extract_image_from_markdown(markdown_content: str, output_dir: str) -> Optional[str]:
			
 
				+    """从markdown内容中提取第一张图片路径
			
 
				+    
			
 
				+    Args:
			
 
				+        markdown_content: markdown内容
			
 
				+        output_dir: 输出目录
			
 
				+        
			
 
				+    Returns:
			
 
				+        图片路径，如果未找到返回None
			
 
				+    """
			
 
				+    # 查找markdown中的图片引用
			
 
				+    # 格式: ![alt](path) 或 <img src="path">
			
 
				+    image_patterns = [
			
 
				+        r'!\[.*?\]\((.*?)\)',  # markdown图片格式
			
 
				+        r'<img[^>]+src=["\'](.*?)["\']',  # HTML img标签
			
 
				+        r'<img[^>]+src=(.*?)(?:\s|>)',  # HTML img标签（无引号）
			
 
				+    ]
			
 
				+    
			
 
				+    for pattern in image_patterns:
			
 
				+        matches = re.findall(pattern, markdown_content)
			
 
				+        if matches:
			
 
				+            image_path = matches[0]
			
 
				+            # 如果是相对路径，尝试在output_dir中查找
			
 
				+            if not os.path.isabs(image_path):
			
 
				+                # 尝试多个可能的路径
			
 
				+                possible_paths = [
			
 
				+                    os.path.join(output_dir, image_path),
			
 
				+                    os.path.join(output_dir, "images", os.path.basename(image_path)),
			
 
				+                    os.path.join(output_dir, os.path.basename(image_path)),
			
 
				+                ]
			
 
				+                for full_path in possible_paths:
			
 
				+                    if os.path.exists(full_path):
			
 
				+                        return full_path
			
 
				+            elif os.path.exists(image_path):
			
 
				+                return image_path
			
 
				+    
			
 
				+    return None
			
 
				+
			
 
				+
			
 
				+def fallback_parse_with_paddleocr(
			
 
				+    json_data: Dict[str, Any],
			
 
				+    markdown_content: str,
			
 
				+    output_dir: Optional[str] = None,
			
 
				+    document_type: Optional[str] = None,
			
 
				+    input_file: Optional[str] = None
			
 
				+) -> Optional[str]:
			
 
				+    """当JSON数据缺失时，使用paddleocr进行备用解析
			
 
				+    
			
 
				+    Args:
			
 
				+        json_data: 原始JSON数据
			
 
				+        markdown_content: 原始markdown内容
			
 
				+        output_dir: 输出目录（用于查找图片）
			
 
				+        document_type: 文档类型
			
 
				+        input_file: 原始输入文件路径（PDF或图片），如果未找到图片则从PDF提取第一页
			
 
				+        
			
 
				+    Returns:
			
 
				+        补充后的markdown内容，如果失败返回None
			
 
				+    """
			
 
				+    try:
			
 
				+        # 注意：调用方已经检查过数据完整性，这里不再重复检查
			
 
				+        # 直接进行备用解析，因为调用方已经确定需要备用解析
			
 
				+        doc_type = document_type or json_data.get("document_type", "unknown")
			
 
				+        
			
 
				+        logger.warning("[PaddleOCR备用] 启用PaddleOCR备用解析")
			
 
				+        
			
 
				+        # 尝试从markdown中提取图片路径
			
 
				+        image_path = None
			
 
				+        if output_dir:
			
 
				+            # 首先尝试从markdown中提取
			
 
				+            image_path = extract_image_from_markdown(markdown_content, output_dir)
			
 
				+            if image_path:
			
 
				+                logger.info(f"[PaddleOCR备用] 从markdown中找到图片: {image_path}")
			
 
				+            
			
 
				+            # 如果找不到，尝试在output_dir中查找png文件
			
 
				+            if not image_path and os.path.exists(output_dir):
			
 
				+                # 查找所有png文件
			
 
				+                png_files = list(Path(output_dir).rglob("*.png"))
			
 
				+                if png_files:
			
 
				+                    # 优先查找包含"粘贴"或"image"的文件名
			
 
				+                    for png_file in png_files:
			
 
				+                        if "粘贴" in png_file.name or "image" in png_file.name.lower():
			
 
				+                            image_path = str(png_file)
			
 
				+                            logger.info(f"[PaddleOCR备用] 使用找到的图片: {image_path}")
			
 
				+                            break
			
 
				+                    
			
 
				+                    # 如果没找到特殊名称的，使用第一个
			
 
				+                    if not image_path:
			
 
				+                        image_path = str(png_files[0])
			
 
				+                        logger.info(f"[PaddleOCR备用] 使用找到的图片: {image_path}")
			
 
				+        
			
 
				+        # 如果仍未找到图片，尝试从input_file处理
			
 
				+        if not image_path:
			
 
				+            logger.warning("[PaddleOCR备用] 未找到可用的图片文件，尝试从input_file处理")
			
 
				+            
			
 
				+            if input_file and os.path.exists(input_file):
			
 
				+                # 检测文件实际类型（不依赖扩展名）
			
 
				+                file_type = detect_file_type(input_file)
			
 
				+                
			
 
				+                if file_type == 'pdf':
			
 
				+                    # 文件是PDF，尝试提取第一页
			
 
				+                    pdf_path = input_file
			
 
				+                    logger.info(f"[PaddleOCR备用] 检测到PDF文件（通过内容）: {pdf_path}")
			
 
				+                    image_path = extract_first_page_from_pdf(pdf_path, output_dir)
			
 
				+                    if image_path:
			
 
				+                        logger.info(f"[PaddleOCR备用] 成功从PDF提取第一页图片: {image_path}")
			
 
				+                    else:
			
 
				+                        logger.warning("[PaddleOCR备用] 从PDF提取图片失败（可能是PDF文件损坏或缺少必要的库）")
			
 
				+                elif file_type in ['png', 'jpeg', 'jpg']:
			
 
				+                    # 文件是图片，直接使用
			
 
				+                    image_path = input_file
			
 
				+                    logger.info(f"[PaddleOCR备用] 检测到图片文件（{file_type}）: {image_path}")
			
 
				+                else:
			
 
				+                    # 文件类型未知，尝试按PDF处理（可能是PDF但没有正确识别）
			
 
				+                    logger.debug(f"[PaddleOCR备用] input_file类型未知（{file_type}），尝试按PDF处理: {input_file}")
			
 
				+                    if PDFIUM_AVAILABLE or PDF2IMAGE_AVAILABLE:
			
 
				+                        try:
			
 
				+                            # 尝试打开为PDF
			
 
				+                            pdf_path = input_file
			
 
				+                            image_path = extract_first_page_from_pdf(pdf_path, output_dir)
			
 
				+                            if image_path:
			
 
				+                                logger.info(f"[PaddleOCR备用] 成功将文件作为PDF处理并提取第一页: {image_path}")
			
 
				+                        except Exception as e:
			
 
				+                            logger.debug(f"[PaddleOCR备用] 无法将文件作为PDF处理: {e}")
			
 
				+            
			
 
				+            # 如果input_file处理失败，尝试在output_dir中查找PDF文件
			
 
				+            if not image_path and output_dir:
			
 
				+                pdf_path = find_pdf_file(output_dir)
			
 
				+                if pdf_path:
			
 
				+                    logger.info(f"[PaddleOCR备用] 在输出目录中找到PDF文件: {pdf_path}")
			
 
				+                    image_path = extract_first_page_from_pdf(pdf_path, output_dir)
			
 
				+                    if image_path:
			
 
				+                        logger.info(f"[PaddleOCR备用] 成功从PDF提取第一页图片: {image_path}")
			
 
				+            
			
 
				+            # 如果仍未找到，尝试在input_file的父目录中查找
			
 
				+            if not image_path and input_file:
			
 
				+                parent_dir = os.path.dirname(input_file)
			
 
				+                if parent_dir and os.path.exists(parent_dir):
			
 
				+                    pdf_path = find_pdf_file(parent_dir)
			
 
				+                    if pdf_path:
			
 
				+                        logger.info(f"[PaddleOCR备用] 在input_file父目录中找到PDF文件: {pdf_path}")
			
 
				+                        image_path = extract_first_page_from_pdf(pdf_path, output_dir)
			
 
				+                        if image_path:
			
 
				+                            logger.info(f"[PaddleOCR备用] 成功从PDF提取第一页图片: {image_path}")
			
 
				+            
			
 
				+            if not image_path:
			
 
				+                logger.warning(f"[PaddleOCR备用] 未找到可用的图片或PDF文件（input_file={input_file}, output_dir={output_dir}），无法进行备用解析")
			
 
				+                logger.info("[PaddleOCR备用] 备用解析需要图片文件或PDF文件，如果都没有，将返回原始markdown内容")
			
 
				+        
			
 
				+        if not image_path:
			
 
				+            logger.warning("[PaddleOCR备用] 未找到可用的图片文件，备用解析无法进行，返回None（将使用原始解析结果）")
			
 
				+            return None
			
 
				+        
			
 
				+        # 使用doc_parser模式解析文档结构
			
 
				+        logger.info("[PaddleOCR备用] 使用doc_parser模式解析文档结构")
			
 
				+        paddleocr_result = call_paddleocr(image_path)
			
 
				+        if not paddleocr_result:
			
 
				+            logger.error("[PaddleOCR备用] PaddleOCR解析失败")
			
 
				+            return None
			
 
				+        
			
 
				+        # 检查返回结果格式
			
 
				+        if "markdown_content" in paddleocr_result:
			
 
				+            # 直接从MD文件读取的内容
			
 
				+            paddleocr_markdown = paddleocr_result["markdown_content"]
			
 
				+            logger.info(f"[PaddleOCR备用] 成功从MD文件读取，生成 {len(paddleocr_markdown)} 字符的markdown")
			
 
				+            
			
 
				+            # 从markdown内容中提取关键词来补充数据
			
 
				+            logger.info("[PaddleOCR备用] 从MD文件内容中提取关键词补充数据")
			
 
				+            keywords = extract_keywords_from_markdown(paddleocr_markdown)
			
 
				+            
			
 
				+            # 将关键词信息添加到markdown中（作为注释，供后续解析使用）
			
 
				+            keywords_comment = "\n\n<!-- Markdown关键词补充:\n"
			
 
				+            if keywords["project"]:
			
 
				+                keywords_comment += f"项目名称：{keywords['project']}\n"
			
 
				+            if keywords["standardReferences"]:
			
 
				+                keywords_comment += f"检测依据：{keywords['standardReferences']}\n"
			
 
				+            if keywords["soundLevelMeterMode"]:
			
 
				+                keywords_comment += f"声级计型号/编号：{keywords['soundLevelMeterMode']}\n"
			
 
				+            if keywords["soundCalibratorMode"]:
			
 
				+                keywords_comment += f"声校准器型号/编号：{keywords['soundCalibratorMode']}\n"
			
 
				+            if keywords["calibrationValueBefore"]:
			
 
				+                keywords_comment += f"检测前校准值：{keywords['calibrationValueBefore']}\n"
			
 
				+            if keywords["calibrationValueAfter"]:
			
 
				+                keywords_comment += f"检测后校准值：{keywords['calibrationValueAfter']}\n"
			
 
				+            if keywords.get("address_mapping"):
			
 
				+                for code, address in keywords["address_mapping"].items():
			
 
				+                    keywords_comment += f"监测地点-{code}：{address}\n"
			
 
				+            if keywords["weather_info"]:
			
 
				+                for weather in keywords["weather_info"]:
			
 
				+                    keywords_comment += f"日期：{weather['monitorAt']} 天气：{weather['weather']} 温度：{weather['temp']} 湿度：{weather['humidity']} 风速：{weather['windSpeed']} 风向：{weather['windDirection']}\n"
			
 
				+            keywords_comment += "-->\n"
			
 
				+            
			
 
				+            # 将关键词信息合并到markdown中
			
 
				+            paddleocr_markdown = paddleocr_markdown + keywords_comment
			
 
				+            # 统计补充的字段数量（不包括weather_info列表）
			
 
				+            field_count = sum(1 for k, v in keywords.items() if k != "weather_info" and v) + len(keywords.get("weather_info", []))
			
 
				+            logger.info(f"[PaddleOCR备用] MD文件关键词提取完成，补充了 {field_count} 个字段")
			
 
				+        elif "parsing_res_list" in paddleocr_result:
			
 
				+            # 从JSON或stdout解析的结果，需要转换为markdown
			
 
				+            paddleocr_markdown = paddleocr_to_markdown(paddleocr_result)
			
 
				+            if not paddleocr_markdown:
			
 
				+                logger.warning("[PaddleOCR备用] PaddleOCR未解析出有效内容")
			
 
				+                return None
			
 
				+            logger.info(f"[PaddleOCR备用] 成功解析，生成 {len(paddleocr_markdown)} 字符的markdown")
			
 
				+        else:
			
 
				+            logger.error("[PaddleOCR备用] PaddleOCR返回格式不正确")
			
 
				+            return None
			
 
				+        
			
 
				+        # 调用paddleocr ocr提取关键词来补充数据（作为doc_parser的补充）
			
 
				+        logger.info("[PaddleOCR备用] 调用OCR提取关键词补充数据")
			
 
				+        ocr_save_path = os.path.dirname(image_path)  # 使用图片所在目录作为保存路径
			
 
				+        ocr_texts, _ = call_paddleocr_ocr(image_path, ocr_save_path)
			
 
				+        
			
 
				+        if ocr_texts:
			
 
				+            # 从OCR文本中提取关键词
			
 
				+            keywords = extract_keywords_from_ocr_texts(ocr_texts)
			
 
				+            
			
 
				+            # 将关键词信息添加到markdown中（作为注释，供后续解析使用）
			
 
				+            keywords_comment = "\n\n<!-- OCR关键词补充:\n"
			
 
				+            if keywords["project"]:
			
 
				+                keywords_comment += f"项目名称：{keywords['project']}\n"
			
 
				+            if keywords["standardReferences"]:
			
 
				+                keywords_comment += f"检测依据：{keywords['standardReferences']}\n"
			
 
				+            if keywords["soundLevelMeterMode"]:
			
 
				+                keywords_comment += f"声级计型号/编号：{keywords['soundLevelMeterMode']}\n"
			
 
				+            if keywords["soundCalibratorMode"]:
			
 
				+                keywords_comment += f"声校准器型号/编号：{keywords['soundCalibratorMode']}\n"
			
 
				+            if keywords["calibrationValueBefore"]:
			
 
				+                keywords_comment += f"检测前校准值：{keywords['calibrationValueBefore']}\n"
			
 
				+            if keywords.get("address_mapping"):
			
 
				+                for code, address in keywords["address_mapping"].items():
			
 
				+                    keywords_comment += f"监测地点-{code}：{address}\n"
			
 
				+            if keywords["calibrationValueAfter"]:
			
 
				+                keywords_comment += f"检测后校准值：{keywords['calibrationValueAfter']}\n"
			
 
				+            if keywords["weather_info"]:
			
 
				+                for weather in keywords["weather_info"]:
			
 
				+                    keywords_comment += f"日期：{weather['monitorAt']} 天气：{weather['weather']} 温度：{weather['temp']} 湿度：{weather['humidity']} 风速：{weather['windSpeed']} 风向：{weather['windDirection']}\n"
			
 
				+            keywords_comment += "-->\n"
			
 
				+            
			
 
				+            # 将关键词信息合并到markdown中
			
 
				+            paddleocr_markdown = paddleocr_markdown + keywords_comment
			
 
				+            logger.info(f"[PaddleOCR备用] OCR关键词提取完成，补充了 {len(keywords)} 个字段")
			
 
				+
			
 
				+        
			
 
				+        # 合并原始markdown和paddleocr结果
			
 
				+        # 优先使用paddleocr的结果，因为它更完整
			
 
				+        combined_markdown = f"{paddleocr_markdown}\n\n<!-- 原始内容（可能不完整） -->\n{markdown_content}"
			
 
				+        
			
 
				+        return combined_markdown
			
 
				+        
			
 
				+    except Exception as e:
			
 
				+        logger.exception(f"[PaddleOCR备用] 备用解析过程出错: {e}")
			
 
				+        return None
			
 
				+
			
 
				+
			
 
				+def extract_text_with_paragraphs_from_ocr_json(json_path: str, line_height_threshold: float = 1.5, paragraph_gap_threshold: float = 2.0) -> str:
			
 
				+    """
			
 
				+    从PaddleOCR的JSON输出中提取带段落分割的纯文本
			
 
				+    
			
 
				+    Args:
			
 
				+        json_path: OCR输出的JSON文件路径
			
 
				+        line_height_threshold: 行高倍数阈值，用于判断是否在同一行（默认1.5）
			
 
				+        paragraph_gap_threshold: 段落间距倍数阈值，用于判断是否需要分段（默认2.0）
			
 
				+    
			
 
				+    Returns:
			
 
				+        带段落分割的纯文本字符串
			
 
				+    """
			
 
				+    try:
			
 
				+        with open(json_path, 'r', encoding='utf-8') as f:
			
 
				+            ocr_data = json.load(f)
			
 
				+        
			
 
				+        # 提取文本和坐标信息
			
 
				+        rec_texts = ocr_data.get("rec_texts", [])
			
 
				+        dt_polys = ocr_data.get("dt_polys", [])
			
 
				+        
			
 
				+        if not rec_texts or not dt_polys:
			
 
				+            logger.warning("[OCR文本提取] JSON中缺少rec_texts或dt_polys字段")
			
 
				+            return ""
			
 
				+        
			
 
				+        if len(rec_texts) != len(dt_polys):
			
 
				+            logger.warning(f"[OCR文本提取] rec_texts长度({len(rec_texts)})与dt_polys长度({len(dt_polys)})不匹配")
			
 
				+            # 取较小的长度
			
 
				+            min_len = min(len(rec_texts), len(dt_polys))
			
 
				+            rec_texts = rec_texts[:min_len]
			
 
				+            dt_polys = dt_polys[:min_len]
			
 
				+        
			
 
				+        # 计算每个文本块的边界框和中心点
			
 
				+        text_blocks = []
			
 
				+        for i, (text, poly) in enumerate(zip(rec_texts, dt_polys)):
			
 
				+            if not text or not text.strip():
			
 
				+                continue
			
 
				+            
			
 
				+            # 从多边形坐标计算边界框
			
 
				+            # poly格式: [[x1, y1], [x2, y2], [x3, y3], [x4, y4]]
			
 
				+            if len(poly) >= 4:
			
 
				+                xs = [point[0] for point in poly]
			
 
				+                ys = [point[1] for point in poly]
			
 
				+                x_min, x_max = min(xs), max(xs)
			
 
				+                y_min, y_max = min(ys), max(ys)
			
 
				+                
			
 
				+                # 计算中心点和高度
			
 
				+                center_x = (x_min + x_max) / 2
			
 
				+                center_y = (y_min + y_max) / 2
			
 
				+                height = y_max - y_min
			
 
				+                width = x_max - x_min
			
 
				+                
			
 
				+                text_blocks.append({
			
 
				+                    'text': text.strip(),
			
 
				+                    'x_min': x_min,
			
 
				+                    'x_max': x_max,
			
 
				+                    'y_min': y_min,
			
 
				+                    'y_max': y_max,
			
 
				+                    'center_x': center_x,
			
 
				+                    'center_y': center_y,
			
 
				+                    'height': height,
			
 
				+                    'width': width,
			
 
				+                    'index': i
			
 
				+                })
			
 
				+        
			
 
				+        if not text_blocks:
			
 
				+            logger.warning("[OCR文本提取] 没有有效的文本块")
			
 
				+            return ""
			
 
				+        
			
 
				+        # 按Y坐标（从上到下）排序
			
 
				+        text_blocks.sort(key=lambda b: (b['y_min'], b['x_min']))
			
 
				+        
			
 
				+        # 计算平均行高（用于判断行间距）
			
 
				+        heights = [b['height'] for b in text_blocks]
			
 
				+        avg_height = sum(heights) / len(heights) if heights else 20
			
 
				+        
			
 
				+        # 将文本块按行分组
			
 
				+        lines = []
			
 
				+        current_line = [text_blocks[0]]
			
 
				+        
			
 
				+        for i in range(1, len(text_blocks)):
			
 
				+            prev_block = text_blocks[i - 1]
			
 
				+            curr_block = text_blocks[i]
			
 
				+            
			
 
				+            # 计算Y坐标重叠度
			
 
				+            y_overlap = min(prev_block['y_max'], curr_block['y_max']) - max(prev_block['y_min'], curr_block['y_min'])
			
 
				+            overlap_ratio = y_overlap / min(prev_block['height'], curr_block['height']) if min(prev_block['height'], curr_block['height']) > 0 else 0
			
 
				+            
			
 
				+            # 计算Y坐标间距
			
 
				+            y_gap = curr_block['y_min'] - prev_block['y_max']
			
 
				+            gap_ratio = y_gap / avg_height if avg_height > 0 else 0
			
 
				+            
			
 
				+            # 判断是否在同一行：有重叠或间距小于行高阈值
			
 
				+            if overlap_ratio > 0.3 or (y_gap >= 0 and gap_ratio < line_height_threshold):
			
 
				+                current_line.append(curr_block)
			
 
				+            else:
			
 
				+                # 新行开始，保存当前行
			
 
				+                lines.append(current_line)
			
 
				+                current_line = [curr_block]
			
 
				+        
			
 
				+        # 添加最后一行
			
 
				+        if current_line:
			
 
				+            lines.append(current_line)
			
 
				+        
			
 
				+        # 对每行内的文本块按X坐标排序（从左到右）
			
 
				+        for line in lines:
			
 
				+            line.sort(key=lambda b: b['x_min'])
			
 
				+        
			
 
				+        # 生成文本，根据行间距判断段落分割
			
 
				+        result_lines = []
			
 
				+        prev_line_y = None
			
 
				+        prev_line_height = None
			
 
				+        
			
 
				+        for line_idx, line in enumerate(lines):
			
 
				+            # 计算当前行的Y坐标和高度
			
 
				+            line_y_min = min(b['y_min'] for b in line)
			
 
				+            line_y_max = max(b['y_max'] for b in line)
			
 
				+            line_height = line_y_max - line_y_min
			
 
				+            line_center_y = (line_y_min + line_y_max) / 2
			
 
				+            
			
 
				+            # 拼接当前行的文本
			
 
				+            # 对于表格数据，使用制表符分隔；对于普通文本，使用空格
			
 
				+            line_text = ""
			
 
				+            prev_x_max = None
			
 
				+            
			
 
				+            # 判断是否是表格行（如果一行中有多个文本块且X坐标分布较均匀）
			
 
				+            is_table_row = len(line) > 2
			
 
				+            
			
 
				+            for block in line:
			
 
				+                if prev_x_max is not None:
			
 
				+                    x_gap = block['x_min'] - prev_x_max
			
 
				+                    # 如果间距较大，添加分隔符
			
 
				+                    if x_gap > avg_height * 0.3:
			
 
				+                        if is_table_row:
			
 
				+                            # 表格使用制表符
			
 
				+                            line_text += "\t"
			
 
				+                        else:
			
 
				+                            # 普通文本使用空格
			
 
				+                            line_text += " "
			
 
				+                line_text += block['text']
			
 
				+                prev_x_max = block['x_max']
			
 
				+            
			
 
				+            # 判断是否需要换段
			
 
				+            if prev_line_y is not None and prev_line_height is not None:
			
 
				+                # 计算行间距
			
 
				+                line_gap = line_y_min - prev_line_y
			
 
				+                gap_ratio = line_gap / prev_line_height if prev_line_height > 0 else 0
			
 
				+                
			
 
				+                # 如果行间距大于段落阈值，添加空行
			
 
				+                if gap_ratio > paragraph_gap_threshold:
			
 
				+                    result_lines.append("")  # 空行表示段落分隔
			
 
				+            
			
 
				+            result_lines.append(line_text)
			
 
				+            prev_line_y = line_y_max
			
 
				+            prev_line_height = line_height
			
 
				+        
			
 
				+        # 合并为最终文本
			
 
				+        result_text = "\n".join(result_lines)
			
 
				+        logger.info(f"[OCR文本提取] 成功提取文本，共 {len(lines)} 行，{len(result_lines)} 行（含段落分隔）")
			
 
				+        
			
 
				+        return result_text
			
 
				+        
			
 
				+    except Exception as e:
			
 
				+        logger.exception(f"[OCR文本提取] 处理失败: {e}")
			
 
				+        return ""
			
 
				+