paddleocr_fallback.py 94 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974
  1. # Copyright (c) Opendatalab. All rights reserved.
  2. """PaddleOCR备用解析模块 - 当MinerU解析结果缺失时使用"""
  3. import json
  4. import os
  5. import subprocess
  6. import sys
  7. import tempfile
  8. import time
  9. import random
  10. from pathlib import Path
  11. from typing import Dict, Any, Optional, List, Tuple
  12. import ast
  13. import re
  14. from ..utils.logging_config import get_logger
  15. from ..config import (
  16. PADDLEOCR_CMD as _PADDLEOCR_CMD,
  17. PADDLE_OCR_DEVICE as _PADDLE_OCR_DEVICE,
  18. PADDLE_OCR_DEVICES as _PADDLE_OCR_DEVICES_CONFIG,
  19. )
  20. logger = get_logger("pdf_converter_v2.utils.paddleocr")
# Optional third-party backends for PDF/image handling. Each availability
# flag is checked later so the code can pick a working extraction path
# at runtime instead of failing at import time.
try:
    import pypdfium2 as pdfium
    PDFIUM_AVAILABLE = True
except ImportError:
    PDFIUM_AVAILABLE = False
    logger.warning("[PaddleOCR备用] pypdfium2未安装,无法从PDF提取图片")
try:
    from pdf2image import convert_from_path
    PDF2IMAGE_AVAILABLE = True
except ImportError:
    PDF2IMAGE_AVAILABLE = False
    logger.warning("[PaddleOCR备用] pdf2image未安装,无法使用备用方法从PDF提取图片")
try:
    from PIL import Image
    PIL_AVAILABLE = True
except ImportError:
    PIL_AVAILABLE = False
    logger.warning("[PaddleOCR备用] PIL未安装,无法处理图片")
  39. def _get_paddleocr_executable() -> str:
  40. """返回 paddleocr 可执行文件路径或命令名,供 subprocess 使用。
  41. 当以 systemd 等方式运行时 PATH 可能不包含 venv/bin,故优先使用当前 Python 同目录下的 paddleocr。
  42. 可通过配置 PADDLEOCR_CMD 显式指定(完整路径或命令名)。"""
  43. cmd = _PADDLEOCR_CMD.strip()
  44. if cmd:
  45. return cmd
  46. # 与当前 Python 同目录(venv/bin)下的 paddleocr
  47. bin_dir = os.path.dirname(os.path.abspath(sys.executable))
  48. candidate = os.path.join(bin_dir, "paddleocr")
  49. if os.path.isfile(candidate) and os.access(candidate, os.X_OK):
  50. return candidate
  51. return "paddleocr"
# PaddleOCR inference devices: single card or round-robin over several.
# Single card: PADDLE_OCR_DEVICE=npu:0 (defaults to npu:0 on NPU hosts when unset).
# Multi card: PADDLE_OCR_DEVICES=npu:0,npu:1 rotates across requests.
import threading as _threading
_PADDLE_OCR_DEVICES: List[str] = []  # lazily-resolved device list (acts as a cache)
_PADDLE_OCR_DEVICE_INDEX: int = 0  # round-robin cursor for _paddle_ocr_device_args
_PADDLE_OCR_DEVICE_LOCK = _threading.Lock()  # guards both globals above
  59. def _get_paddle_ocr_devices() -> List[str]:
  60. """解析 PADDLE_OCR_DEVICES 或 PADDLE_OCR_DEVICE,返回设备列表(惰性、线程安全)。"""
  61. global _PADDLE_OCR_DEVICES
  62. with _PADDLE_OCR_DEVICE_LOCK:
  63. if _PADDLE_OCR_DEVICES:
  64. return _PADDLE_OCR_DEVICES
  65. multi = _PADDLE_OCR_DEVICES_CONFIG.strip()
  66. if multi:
  67. _PADDLE_OCR_DEVICES[:] = [d.strip() for d in multi.split(",") if d.strip()]
  68. if not _PADDLE_OCR_DEVICES:
  69. single = _PADDLE_OCR_DEVICE.strip()
  70. if not single:
  71. from .device_env import is_npu
  72. if is_npu():
  73. single = "npu:0"
  74. if single:
  75. _PADDLE_OCR_DEVICES.append(single)
  76. return _PADDLE_OCR_DEVICES
  77. def get_paddle_ocr_devices() -> List[str]:
  78. """返回 PaddleOCR 设备列表(用于单任务多卡:按页拆分后并行使用各卡)。"""
  79. return list(_get_paddle_ocr_devices())
  80. def get_paddle_ocr_device_args_for_index(device_index: int) -> list:
  81. """返回指定设备索引的 --device 参数列表;用于多卡并行时显式指定每段用哪张卡。"""
  82. devices = _get_paddle_ocr_devices()
  83. if not devices:
  84. return []
  85. device = devices[device_index % len(devices)]
  86. return ["--device", device]
# Cached environment for PaddleOCR subprocesses (LD_PRELOAD works around the
# sklearn libgomp "static TLS" failure; the PADDLE_PDX variable skips the
# model-source connectivity check). Populated on first use.
_PADDLEOCR_ENV: Optional[Dict[str, str]] = None
def _get_paddleocr_subprocess_env() -> Dict[str, str]:
    """Build (and cache) the environment for paddleocr subprocesses.

    Returns a copy of ``os.environ`` augmented with
    ``PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK``, ``FLAGS_use_stride_kernel``
    and, when not already set, an ``LD_PRELOAD`` pointing at a libgomp
    shared object to avoid the "static TLS" allocation failure.
    """
    global _PADDLEOCR_ENV
    if _PADDLEOCR_ENV is not None:
        return _PADDLEOCR_ENV
    env = dict(os.environ)
    # Skip the "Checking connectivity to the model hosters" step.
    env.setdefault("PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK", "True")
    # When doc_parser loads PaddleOCR-VL, safetensors triggers
    # view_dtype(CPU, Undefined(AnyLayout), uint8) whose kernel is not
    # registered; forcing the STRIDED layout avoids that RuntimeError.
    env.setdefault("FLAGS_use_stride_kernel", "1")
    # Without LD_PRELOAD the child process hits sklearn/paddlex's
    # "cannot allocate memory in static TLS block".
    if not env.get("LD_PRELOAD"):
        preload_paths: List[str] = []
        # Prefer the system libgomp; stop at the first one found.
        for p in (
            "/usr/lib/x86_64-linux-gnu/libgomp.so.1",
            "/usr/lib/aarch64-linux-gnu/libgomp.so.1",
            "/usr/lib/libgomp.so.1",
        ):
            if os.path.isfile(p):
                preload_paths.append(p)
                break
        # libgomp bundled under scikit_learn.libs / simsimd.libs
        # (pure path scan — sklearn itself is never imported here).
        for sp in getattr(sys, "path", []):
            if not sp or not os.path.isdir(sp):
                continue
            for sub in ("scikit_learn.libs", "simsimd.libs"):
                d = os.path.join(sp, sub)
                if not os.path.isdir(d):
                    continue
                for name in os.listdir(d):
                    if name.startswith("libgomp") and (name.endswith(".so") or ".so." in name):
                        preload_paths.append(os.path.join(d, name))
        # Fixed fallback paths seen in common container images.
        for p in (
            "/usr/local/lib/python3.10/dist-packages/scikit_learn.libs/libgomp-d22c30c5.so.1.0.0",
            "/usr/local/lib/python3.10/site-packages/scikit_learn.libs/libgomp-d22c30c5.so.1.0.0",
        ):
            if os.path.isfile(p) and p not in preload_paths:
                preload_paths.append(p)
        if preload_paths:
            env["LD_PRELOAD"] = ":".join(preload_paths)
            logger.debug("[PaddleOCR] 子进程 LD_PRELOAD 已设置,避免 static TLS 报错")
    _PADDLEOCR_ENV = env
    return env
  135. def _paddle_ocr_device_args() -> list:
  136. """返回 PaddleOCR 命令的 --device 参数列表;多卡时按请求轮询。"""
  137. devices = _get_paddle_ocr_devices()
  138. if not devices:
  139. return []
  140. global _PADDLE_OCR_DEVICE_INDEX
  141. with _PADDLE_OCR_DEVICE_LOCK:
  142. idx = _PADDLE_OCR_DEVICE_INDEX % len(devices)
  143. _PADDLE_OCR_DEVICE_INDEX += 1
  144. device = devices[idx]
  145. return ["--device", device]
  146. def detect_file_type(file_path: str) -> Optional[str]:
  147. """通过文件内容(魔数)检测文件类型,不依赖扩展名
  148. Args:
  149. file_path: 文件路径
  150. Returns:
  151. 文件类型:'pdf', 'png', 'jpeg', 'jpg' 或 None
  152. """
  153. if not file_path or not os.path.exists(file_path):
  154. return None
  155. try:
  156. with open(file_path, 'rb') as f:
  157. # 读取文件头部(前16字节足够识别常见格式)
  158. header = f.read(16)
  159. if not header:
  160. return None
  161. # PDF文件:以 %PDF 开头
  162. if header.startswith(b'%PDF'):
  163. return 'pdf'
  164. # PNG图片:以 \x89PNG\r\n\x1a\n 开头
  165. if header.startswith(b'\x89PNG\r\n\x1a\n'):
  166. return 'png'
  167. # JPEG图片:以 \xff\xd8\xff 开头
  168. if header.startswith(b'\xff\xd8\xff'):
  169. return 'jpeg'
  170. # 其他格式可以继续扩展
  171. return None
  172. except Exception as e:
  173. logger.debug(f"[PaddleOCR备用] 检测文件类型失败: {e}")
  174. return None
def check_json_data_completeness(json_data: Dict[str, Any], document_type: str) -> bool:
    """Heuristically decide whether parsed JSON data is largely missing.

    Args:
        json_data: Parsed JSON payload; a top-level "data" key is expected.
        document_type: Document kind selecting which fields get checked.
    Returns:
        True when the data looks complete, False when enough is missing
        that the PaddleOCR fallback parse should run.
    """
    if not json_data or "data" not in json_data:
        return False
    data = json_data["data"]
    # Field checks depend on the document type.
    if document_type == "noiseMonitoringRecord":
        # Key fields of a noise-monitoring record (the "noise" array comes
        # from table parsing, not OCR, so it is deliberately not checked).
        required_fields = ["project", "standardReferences", "soundLevelMeterMode", "soundCalibratorMode"]
        missing_count = sum(1 for field in required_fields if not data.get(field))
        # Half or more of the key fields missing counts as incomplete.
        if missing_count >= len(required_fields) / 2:
            logger.warning(f"[数据完整性检查] 关键字段缺失过多: {missing_count}/{len(required_fields)}")
            return False
        # Detect anomalous weather data, e.g. the literal label "天气" parsed
        # as a value, or wind direction missing in every entry.
        weather_list = data.get("weather") or []
        if weather_list:
            weather_label_tokens = {"天气", "天气状况", "天气情况"}
            has_label_as_value = any(
                (item.get("weather") or "").strip() in weather_label_tokens for item in weather_list
            )
            all_wind_direction_missing = all(
                not (item.get("windDirection") or "").strip() for item in weather_list
            )
            if has_label_as_value:
                logger.warning("[数据完整性检查] 天气字段疑似被解析为标签,触发备用解析")
                return False
            if all_wind_direction_missing:
                logger.warning("[数据完整性检查] 风向字段全部缺失,触发备用解析")
                return False
        return True
    elif document_type == "electromagneticTestRecord":
        # Key fields of an electromagnetic test record.
        # Required vs optional:
        # - deviceName and deviceMode are required (instrument info)
        # - project and standardReferences may legitimately be blank
        required_fields = ["deviceName", "deviceMode"]  # required fields
        optional_fields = ["project", "standardReferences"]  # optional fields
        # Count missing required fields (blank strings count as missing).
        missing_required = sum(1 for field in required_fields if not data.get(field) or not str(data.get(field)).strip())
        # Count missing optional fields (all-blank optionals matter too).
        missing_optional = sum(1 for field in optional_fields if not data.get(field) or not str(data.get(field)).strip())
        # The electromagnetic measurement list must be non-empty.
        em_list = data.get("electricMagnetic", [])
        if len(em_list) == 0:
            logger.warning("[数据完整性检查] 电磁数据列表为空")
            return False
        # Any missing required field means the data is incomplete.
        if missing_required > 0:
            logger.warning(f"[数据完整性检查] 必需字段缺失: {missing_required}/{len(required_fields)} (deviceName, deviceMode)")
            return False
        # Every field (required + optional) missing is also incomplete.
        if missing_required + missing_optional >= len(required_fields) + len(optional_fields):
            logger.warning(f"[数据完整性检查] 所有关键字段都缺失: {missing_required + missing_optional}/{len(required_fields) + len(optional_fields)}")
            return False
        # If project is blank AND every address is blank, both MinerU and the
        # Paddle doc_parser lost the data, so the Paddle OCR pass must run.
        project_empty = not data.get("project") or not str(data.get("project")).strip()
        if project_empty:
            # Check whether every measurement item's address is blank.
            all_address_empty = True
            for em_item in em_list:
                address = em_item.get("address", "")
                if address and str(address).strip():
                    all_address_empty = False
                    break
            if all_address_empty:
                logger.warning("[数据完整性检查] project为空且所有address字段都为空,说明minerU和Paddle doc_parser都丢失了,需要运行Paddle OCR")
                return False
        return True
    elif document_type == "operatingConditionInfo":
        # Operating-condition records only need a non-empty conditions list.
        op_list = data.get("operationalConditions", [])
        if len(op_list) == 0:
            logger.warning("[数据完整性检查] 工况信息列表为空")
            return False
        return True
    # Unknown document types are assumed complete.
    return True
  259. def parse_paddleocr_output(output_text: str) -> Dict[str, Any]:
  260. """解析paddleocr的输出文本
  261. Args:
  262. output_text: paddleocr命令的输出文本
  263. Returns:
  264. 解析后的字典,包含parsing_res_list
  265. """
  266. try:
  267. # 清理输出文本,移除可能的额外空白
  268. output_text = output_text.strip()
  269. # 尝试直接eval(因为输出是Python字典格式)
  270. # 先处理np.float32等numpy类型
  271. output_text = output_text.replace('np.float32', 'float')
  272. output_text = output_text.replace('np.int32', 'int')
  273. output_text = output_text.replace('np.int64', 'int')
  274. # 尝试使用ast.literal_eval安全解析
  275. try:
  276. result = ast.literal_eval(output_text)
  277. except (ValueError, SyntaxError):
  278. # 如果literal_eval失败,尝试使用eval(不推荐,但paddleocr输出可能需要)
  279. logger.warning("[PaddleOCR解析] literal_eval失败,尝试使用eval")
  280. # 创建一个安全的eval环境
  281. safe_dict = {"__builtins__": {}}
  282. result = eval(output_text, safe_dict)
  283. if isinstance(result, dict):
  284. # 检查是否有res键
  285. if "res" in result:
  286. parsing_res_list = result.get("res", {}).get("parsing_res_list", [])
  287. return {"parsing_res_list": parsing_res_list}
  288. # 也可能直接包含parsing_res_list
  289. elif "parsing_res_list" in result:
  290. return {"parsing_res_list": result.get("parsing_res_list", [])}
  291. return {"parsing_res_list": []}
  292. except Exception as e:
  293. logger.error(f"[PaddleOCR解析] 解析输出失败: {e}")
  294. logger.debug(f"[PaddleOCR解析] 输出内容: {output_text[:500]}")
  295. return {"parsing_res_list": []}
  296. def paddleocr_to_markdown(paddleocr_result: Dict[str, Any]) -> str:
  297. """将paddleocr的解析结果转换为markdown格式
  298. Args:
  299. paddleocr_result: paddleocr解析结果
  300. Returns:
  301. markdown格式的文本
  302. """
  303. markdown_parts = []
  304. parsing_res_list = paddleocr_result.get("parsing_res_list", [])
  305. for item in parsing_res_list:
  306. block_label = item.get("block_label", "")
  307. block_content = item.get("block_content", "")
  308. if block_label == "table":
  309. # 表格直接使用HTML格式
  310. markdown_parts.append(block_content)
  311. elif block_label in ["header", "title", "figure_title"]:
  312. # 标题使用markdown标题格式
  313. markdown_parts.append(f"# {block_content}")
  314. elif block_label == "text":
  315. # 普通文本
  316. markdown_parts.append(block_content)
  317. else:
  318. # 其他类型直接添加内容
  319. markdown_parts.append(block_content)
  320. return "\n\n".join(markdown_parts)
def call_paddleocr(image_path: str) -> Optional[Dict[str, Any]]:
    """Run ``paddleocr doc_parser`` on an image and collect the result.

    Args:
        image_path: Path of the image to parse.
    Returns:
        ``{"markdown_content": str}`` when the generated markdown file could
        be read; otherwise a ``{"parsing_res_list": [...]}`` dict parsed
        from stdout; None on any failure (missing file, non-zero exit,
        timeout, empty output).
    """
    try:
        # Make sure the input image exists before spawning the subprocess.
        if not os.path.exists(image_path):
            logger.error(f"[PaddleOCR] 图片文件不存在: {image_path}")
            return None
        # Derive the output directory and base name from the image path.
        image_dir = os.path.dirname(image_path)
        image_basename = os.path.splitext(os.path.basename(image_path))[0]
        save_path_base = os.path.join(image_dir, image_basename)
        # Build the paddleocr command with all flags (on NPU the --device
        # argument is required; pure-CPU execution tends to segfault).
        # PaddleOCR creates a directory under save_path and writes there.
        cmd = [
            _get_paddleocr_executable(), "doc_parser", "-i", image_path,
            "--precision", "fp32",
            "--use_doc_unwarping", "False",
            "--use_doc_orientation_classify", "True",
            "--use_chart_recognition", "True",
            "--save_path", save_path_base
        ] + _paddle_ocr_device_args()
        # (kept for reference) limiting GPU memory via environment variables:
        # env = os.environ.copy()
        # conservative PaddlePaddle GPU memory allocation strategy:
        # env["FLAGS_fraction_of_gpu_memory_to_use"] = "0.3"  # use only 30% of GPU memory
        # env["FLAGS_allocator_strategy"] = "auto_growth"  # grow allocations on demand
        logger.info(f"[PaddleOCR] 执行命令: {' '.join(cmd)}")
        # Run the command (env carries LD_PRELOAD and
        # PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK, avoiding static-TLS errors
        # and the model-source connectivity check).
        result = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            timeout=300,  # 5-minute timeout
            check=False,
            env=_get_paddleocr_subprocess_env(),
        )
        if result.returncode != 0:
            logger.error(f"[PaddleOCR] 命令执行失败,返回码: {result.returncode}")
            # Known doc_parser issue: PP-DocLayoutV3 yields 3 values while the
            # pipeline unpacks 2 — "too many values to unpack (expected 2)".
            if result.stderr and ("too many values to unpack" in result.stderr or "Exception from the 'cv' worker" in result.stderr):
                logger.warning(
                    "[PaddleOCR] doc_parser 报 cv worker 解包错误,多为 PaddleX 与 PP-DocLayoutV3 不兼容。"
                    " 可尝试: pip install -U paddlex;或仅需文字时改用 ocr 模式。详见 README_STARTUP.md。"
                )
            # Log full stderr for diagnosis (NPU init logs are long; the real
            # error usually sits at the end).
            if result.stderr:
                logger.error(f"[PaddleOCR] stderr: {result.stderr}")
            if result.stdout:
                logger.error(f"[PaddleOCR] stdout(末 2000 字符): {result.stdout[-2000:] if len(result.stdout) > 2000 else result.stdout}")
            return None
        # Read the result from the saved markdown file. PaddleOCR creates a
        # directory under save_path, so the file is {save_path}/{basename}.md.
        md_file = os.path.join(save_path_base, f"{image_basename}.md")
        if os.path.exists(md_file):
            logger.info(f"[PaddleOCR] 从Markdown文件读取结果: {md_file}")
            try:
                with open(md_file, 'r', encoding='utf-8') as f:
                    markdown_content = f.read()
                if markdown_content.strip():
                    # Return the markdown under a dedicated key and let the
                    # caller decide how (or whether) to convert it back into
                    # the parsing_res_list shape used elsewhere.
                    logger.info(f"[PaddleOCR] 成功读取Markdown文件,内容长度: {len(markdown_content)} 字符")
                    return {"markdown_content": markdown_content}
                else:
                    logger.warning("[PaddleOCR] Markdown文件内容为空")
            except Exception as e:
                logger.exception(f"[PaddleOCR] 读取Markdown文件失败: {e}")
        else:
            logger.warning(f"[PaddleOCR] Markdown文件不存在: {md_file}")
        # Markdown missing or unreadable: fall back to parsing stdout.
        output_text = result.stdout.strip()
        if output_text:
            logger.info("[PaddleOCR] 从stdout解析输出")
            parsed_result = parse_paddleocr_output(output_text)
            logger.info(f"[PaddleOCR] 解析成功,获得 {len(parsed_result.get('parsing_res_list', []))} 个区块")
            return parsed_result
        else:
            logger.warning("[PaddleOCR] stdout输出为空,且未找到Markdown文件")
            return None
    except subprocess.TimeoutExpired:
        logger.error("[PaddleOCR] 命令执行超时")
        return None
    except Exception as e:
        logger.exception(f"[PaddleOCR] 调用失败: {e}")
        return None
def extract_first_page_from_pdf(pdf_path: str, output_dir: str) -> Optional[str]:
    """Render the first page of a PDF to a PNG image.

    Prefers pypdfium2 and falls back to pdf2image when unavailable or failing.

    Args:
        pdf_path: Path of the source PDF.
        output_dir: Directory where the rendered image is written (created
            on demand).
    Returns:
        Path of the generated PNG, or None on failure.
    """
    if not PIL_AVAILABLE:
        logger.error("[PaddleOCR备用] 缺少必要的库(PIL/Pillow),无法处理图片")
        return None
    if not os.path.exists(pdf_path):
        logger.error(f"[PaddleOCR备用] PDF文件不存在: {pdf_path}")
        return None
    # Method 1: pypdfium2 (preferred backend).
    if PDFIUM_AVAILABLE:
        try:
            pdf = pdfium.PdfDocument(pdf_path)
            try:
                if len(pdf) == 0:
                    logger.error("[PaddleOCR备用] PDF文件为空")
                    return None
                page = pdf[0]
                bitmap = page.render(scale=150 / 72)  # 150 DPI
                pil_image = bitmap.to_pil()
                os.makedirs(output_dir, exist_ok=True)
                # Timestamp + random suffix keeps concurrent extractions from colliding.
                image_filename = f"paddleocr_fallback_page0_{int(time.time() * 1000)}_{random.randint(1000, 9999)}.png"
                image_path = os.path.join(output_dir, image_filename)
                pil_image.save(image_path, "PNG", optimize=True, compress_level=6)
                logger.info(f"[PaddleOCR备用] 使用pypdfium2从PDF提取第一页图片: {image_path}")
                bitmap.close()
                return image_path
            finally:
                # Always release the document handle, even on early return.
                try:
                    pdf.close()
                except Exception:
                    pass
        except Exception as e:
            logger.warning(f"[PaddleOCR备用] 使用pypdfium2提取图片失败,尝试pdf2image: {e}")
    # Method 2: pdf2image as the fallback backend.
    if PDF2IMAGE_AVAILABLE:
        try:
            images = convert_from_path(pdf_path, dpi=150, first_page=1, last_page=1)
            if not images:
                logger.error("[PaddleOCR备用] pdf2image未能提取到图片")
                return None
            os.makedirs(output_dir, exist_ok=True)
            image_filename = f"paddleocr_fallback_page0_{int(time.time() * 1000)}_{random.randint(1000, 9999)}.png"
            image_path = os.path.join(output_dir, image_filename)
            images[0].save(image_path, "PNG", optimize=True, compress_level=6)
            logger.info(f"[PaddleOCR备用] 使用pdf2image从PDF提取第一页图片: {image_path}")
            return image_path
        except Exception as e:
            logger.exception(f"[PaddleOCR备用] 使用pdf2image提取图片失败: {e}")
    # Neither backend is available: report exactly what is missing.
    missing_libs = []
    if not PDFIUM_AVAILABLE:
        missing_libs.append("pypdfium2")
    if not PDF2IMAGE_AVAILABLE:
        missing_libs.append("pdf2image")
    logger.error(
        f"[PaddleOCR备用] 缺少必要的库({'或'.join(missing_libs)}),无法从PDF提取图片。请安装: pip install {' '.join(missing_libs)}"
    )
    return None
  479. def find_pdf_file(output_dir: str) -> Optional[str]:
  480. """在输出目录中查找PDF文件
  481. Args:
  482. output_dir: 输出目录
  483. Returns:
  484. PDF文件路径,如果未找到返回None
  485. """
  486. if not os.path.exists(output_dir):
  487. return None
  488. # 查找PDF文件
  489. pdf_files = list(Path(output_dir).rglob("*.pdf"))
  490. if pdf_files:
  491. # 返回第一个找到的PDF文件
  492. return str(pdf_files[0])
  493. return None
  494. def markdown_to_plain_text(markdown_content: str) -> List[str]:
  495. """将Markdown内容转换为纯文本列表(按行分割)
  496. Args:
  497. markdown_content: Markdown格式的文本
  498. Returns:
  499. 纯文本列表,每行一个元素
  500. """
  501. if not markdown_content:
  502. return []
  503. lines = []
  504. in_code_block = False
  505. # 先处理HTML表格:提取整个表格,转换为文本行
  506. # 查找所有<table>...</table>块
  507. table_pattern = r'<table[^>]*>.*?</table>'
  508. tables = re.findall(table_pattern, markdown_content, re.DOTALL)
  509. # 将表格内容替换为占位符,稍后处理
  510. table_placeholders = []
  511. for i, table in enumerate(tables):
  512. placeholder = f"__TABLE_PLACEHOLDER_{i}__"
  513. table_placeholders.append((placeholder, table))
  514. markdown_content = markdown_content.replace(table, placeholder, 1)
  515. # 处理每一行
  516. for line in markdown_content.split('\n'):
  517. line = line.rstrip() # 只移除右侧空格
  518. # 检测代码块
  519. if line.strip().startswith('```'):
  520. in_code_block = not in_code_block
  521. continue
  522. if in_code_block:
  523. # 代码块内的内容保留原样
  524. if line.strip():
  525. lines.append(line)
  526. continue
  527. # 处理表格占位符
  528. if '__TABLE_PLACEHOLDER_' in line:
  529. # 找到对应的表格
  530. for placeholder, table_html in table_placeholders:
  531. if placeholder in line:
  532. # 提取表格中的所有单元格文本
  533. table_lines = extract_table_text(table_html)
  534. lines.extend(table_lines)
  535. break
  536. continue
  537. # 检测Markdown表格(以 | 开头)
  538. if '|' in line and line.strip().startswith('|'):
  539. # 处理表格行:移除首尾的 |,分割单元格
  540. cells = [cell.strip() for cell in line.split('|') if cell.strip()]
  541. # 移除表格分隔行(只包含 - 和 |)
  542. if all(c in ['-', ':', ' '] for c in ''.join(cells)):
  543. continue
  544. # 合并单元格内容,用空格分隔
  545. table_line = ' '.join(cells)
  546. if table_line.strip():
  547. lines.append(table_line)
  548. continue
  549. # 移除Markdown语法标记
  550. # 移除标题标记 (# ## ### 等)
  551. line = re.sub(r'^#+\s*', '', line)
  552. # 移除列表标记 (- * + 等)
  553. line = re.sub(r'^[-*+]\s+', '', line)
  554. # 移除数字列表标记
  555. line = re.sub(r'^\d+\.\s+', '', line)
  556. # 移除粗体和斜体标记
  557. line = re.sub(r'\*\*([^*]+)\*\*', r'\1', line) # **bold**
  558. line = re.sub(r'\*([^*]+)\*', r'\1', line) # *italic*
  559. line = re.sub(r'__([^_]+)__', r'\1', line) # __bold__
  560. line = re.sub(r'_([^_]+)_', r'\1', line) # _italic_
  561. # 移除链接格式 [text](url) -> text
  562. line = re.sub(r'\[([^\]]+)\]\([^\)]+\)', r'\1', line)
  563. # 移除图片格式 ![alt](url) -> alt
  564. line = re.sub(r'!\[([^\]]*)\]\([^\)]+\)', r'\1', line)
  565. # 移除行内代码标记
  566. line = re.sub(r'`([^`]+)`', r'\1', line)
  567. # 移除HTML标签(div、span等)
  568. line = re.sub(r'<div[^>]*>', '', line)
  569. line = re.sub(r'</div>', '', line)
  570. line = re.sub(r'<span[^>]*>', '', line)
  571. line = re.sub(r'</span>', '', line)
  572. line = re.sub(r'<[^>]+>', '', line) # 移除其他HTML标签
  573. # 清理多余空格
  574. line = line.strip()
  575. if line: # 只保留非空行
  576. lines.append(line)
  577. return lines
  578. def extract_table_text(table_html: str) -> List[str]:
  579. """从HTML表格中提取文本,每行一个元素
  580. Args:
  581. table_html: HTML表格字符串
  582. Returns:
  583. 文本行列表
  584. """
  585. table_lines = []
  586. try:
  587. # 提取所有<tr>标签
  588. tr_pattern = r'<tr[^>]*>(.*?)</tr>'
  589. tr_matches = re.findall(tr_pattern, table_html, re.DOTALL)
  590. for tr_content in tr_matches:
  591. # 提取所有<td>和<th>标签内的文本
  592. cell_pattern = r'<(?:td|th)[^>]*>(.*?)</(?:td|th)>'
  593. cells = re.findall(cell_pattern, tr_content, re.DOTALL)
  594. if cells:
  595. # 清理每个单元格的文本
  596. cleaned_cells = []
  597. for cell in cells:
  598. # 移除嵌套的HTML标签
  599. cleaned = re.sub(r'<[^>]+>', '', cell)
  600. # 移除HTML实体
  601. cleaned = cleaned.replace('&nbsp;', ' ')
  602. cleaned = cleaned.strip()
  603. if cleaned:
  604. cleaned_cells.append(cleaned)
  605. if cleaned_cells:
  606. # 合并单元格内容,用空格分隔
  607. table_line = ' '.join(cleaned_cells)
  608. if table_line.strip():
  609. table_lines.append(table_line)
  610. except Exception as e:
  611. logger.warning(f"[Markdown转换] 提取表格文本失败: {e}")
  612. return table_lines
def call_paddleocr_ocr(image_path: str, save_path: str) -> tuple[Optional[List[str]], Optional[str]]:
    """Run ``paddleocr ocr`` on an image and extract recognized text (API use).

    Args:
        image_path: Path of the image to OCR.
        save_path: Directory where paddleocr writes its result JSON.
    Returns:
        ``(texts, json_path)``: the list of recognized text fragments (or
        None on failure) and the result-JSON path when it was produced
        (or None).
    """
    try:
        if not os.path.exists(image_path):
            logger.error(f"[PaddleOCR OCR] 图片文件不存在: {image_path}")
            return None, None
        # Build the paddleocr ocr command (on NPU the --device argument is
        # required; pure-CPU execution tends to segfault).
        cmd = [_get_paddleocr_executable(), "ocr", "-i", image_path, "--save_path", save_path] + _paddle_ocr_device_args()
        logger.info(f"[PaddleOCR OCR] 执行命令: {' '.join(cmd)}")
        # Run the command (env carries LD_PRELOAD and
        # PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK).
        result = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            timeout=300,  # 5-minute timeout
            check=False,
            env=_get_paddleocr_subprocess_env(),
        )
        if result.returncode != 0:
            logger.error(f"[PaddleOCR OCR] 命令执行失败,返回码: {result.returncode}")
            logger.error(f"[PaddleOCR OCR] 错误输出: {result.stderr}")
            return None, None
        # Locate the saved JSON: the ocr subcommand writes
        # {basename}_res.json under save_path.
        image_basename = os.path.splitext(os.path.basename(image_path))[0]
        json_file = os.path.join(save_path, f"{image_basename}_res.json")
        if not os.path.exists(json_file):
            logger.warning(f"[PaddleOCR OCR] JSON文件不存在: {json_file}")
            return None, None
        # Load and interpret the result JSON.
        try:
            with open(json_file, 'r', encoding='utf-8') as f:
                ocr_data = json.load(f)
            # Prefer the rec_texts field when present.
            if "rec_texts" in ocr_data and isinstance(ocr_data["rec_texts"], list):
                texts = ocr_data["rec_texts"]
                logger.info(f"[PaddleOCR OCR] 成功提取 {len(texts)} 个文本片段(从rec_texts)")
                return texts, json_file
            # Otherwise collect block_content entries from parsing_res_list.
            if "parsing_res_list" in ocr_data and isinstance(ocr_data["parsing_res_list"], list):
                texts = []
                for item in ocr_data["parsing_res_list"]:
                    if isinstance(item, dict) and "block_content" in item:
                        block_content = item["block_content"]
                        if block_content and block_content.strip():
                            # Split multi-line contents into individual lines.
                            if "\n" in block_content:
                                texts.extend([line.strip() for line in block_content.split("\n") if line.strip()])
                            else:
                                texts.append(block_content.strip())
                if texts:
                    logger.info(f"[PaddleOCR OCR] 成功提取 {len(texts)} 个文本片段(从parsing_res_list)")
                    return texts, json_file
            logger.warning("[PaddleOCR OCR] JSON文件中未找到rec_texts或parsing_res_list字段")
            return None, json_file
        except Exception as e:
            logger.exception(f"[PaddleOCR OCR] 读取JSON文件失败: {e}")
            return None, json_file
    except subprocess.TimeoutExpired:
        logger.error("[PaddleOCR OCR] 命令执行超时")
        return None, None
    except Exception as e:
        logger.exception(f"[PaddleOCR OCR] 调用失败: {e}")
        return None, None
  683. def call_paddleocr_doc_parser_for_text(image_path: str, save_path: str) -> tuple[Optional[List[str]], Optional[str]]:
  684. """调用paddleocr doc_parser命令,将markdown转换为纯文本(用于内部调用提取关键词)
  685. Args:
  686. image_path: 图片路径
  687. save_path: 保存路径(目录)
  688. Returns:
  689. (纯文本列表(按行分割), markdown文件路径),如果失败返回(None, None)
  690. """
  691. try:
  692. if not os.path.exists(image_path):
  693. logger.error(f"[PaddleOCR DocParser] 图片文件不存在: {image_path}")
  694. return None, None
  695. # 生成输出目录和基础文件名
  696. image_dir = os.path.dirname(image_path)
  697. image_basename = os.path.splitext(os.path.basename(image_path))[0]
  698. save_path_base = os.path.join(save_path, image_basename)
  699. os.makedirs(save_path_base, exist_ok=True)
  700. # 构建paddleocr doc_parser命令(NPU 下需加 --device npu:0,否则走 CPU 易段错误)
  701. cmd = [
  702. _get_paddleocr_executable(), "doc_parser", "-i", image_path,
  703. "--precision", "fp32",
  704. "--use_doc_unwarping", "False",
  705. "--use_doc_orientation_classify", "True",
  706. "--use_chart_recognition", "True",
  707. "--save_path", save_path_base
  708. ] + _paddle_ocr_device_args()
  709. logger.info(f"[PaddleOCR DocParser] 执行命令: {' '.join(cmd)}")
  710. # 执行命令(env 含 LD_PRELOAD 与 PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK)
  711. result = subprocess.run(
  712. cmd,
  713. capture_output=True,
  714. text=True,
  715. timeout=300, # 5分钟超时
  716. check=False,
  717. env=_get_paddleocr_subprocess_env(),
  718. )
  719. if result.returncode != 0:
  720. logger.error(f"[PaddleOCR DocParser] 命令执行失败,返回码: {result.returncode}")
  721. if result.stderr and ("too many values to unpack" in result.stderr or "Exception from the 'cv' worker" in result.stderr):
  722. logger.warning(
  723. "[PaddleOCR DocParser] 报 cv worker 解包错误,多为 PaddleX 与 PP-DocLayoutV3 不兼容。"
  724. " 可尝试: pip install -U paddlex;或改用 ocr 模式提取文字。详见 README_STARTUP.md。"
  725. )
  726. logger.error(f"[PaddleOCR DocParser] 错误输出: {result.stderr}")
  727. return None, None
  728. # 查找保存的Markdown文件
  729. # PaddleOCR会在save_path下创建目录,文件路径为: {save_path}/{basename}.md
  730. md_file = os.path.join(save_path_base, f"{image_basename}.md")
  731. # 也可能在子目录中
  732. if not os.path.exists(md_file):
  733. md_files = sorted(Path(save_path_base).rglob("*.md"))
  734. if md_files:
  735. md_file = str(md_files[0])
  736. logger.info(f"[PaddleOCR DocParser] 在子目录中找到Markdown文件: {md_file}")
  737. if not os.path.exists(md_file):
  738. logger.warning(f"[PaddleOCR DocParser] Markdown文件不存在: {md_file}")
  739. return None, None
  740. # 读取Markdown文件并转换为纯文本
  741. try:
  742. with open(md_file, 'r', encoding='utf-8') as f:
  743. markdown_content = f.read()
  744. if not markdown_content.strip():
  745. logger.warning("[PaddleOCR DocParser] Markdown文件内容为空")
  746. return [], md_file
  747. # 将Markdown转换为纯文本列表
  748. plain_text_lines = markdown_to_plain_text(markdown_content)
  749. logger.info(f"[PaddleOCR DocParser] 成功提取 {len(plain_text_lines)} 行纯文本,Markdown文件: {md_file}")
  750. return plain_text_lines, md_file
  751. except Exception as e:
  752. logger.exception(f"[PaddleOCR DocParser] 读取Markdown文件失败: {e}")
  753. return None, md_file
  754. except subprocess.TimeoutExpired:
  755. logger.error("[PaddleOCR DocParser] 命令执行超时")
  756. return None, None
  757. except Exception as e:
  758. logger.exception(f"[PaddleOCR DocParser] 调用失败: {e}")
  759. return None, None
def extract_keywords_from_ocr_texts(ocr_texts: List[str]) -> Dict[str, Any]:
    """Extract key report fields from a list of OCR text fragments.

    Heuristically scans the fragments (order matters — OCR emits them in
    reading order) for project name, testing standard, meter models,
    before/after calibration values, per-day weather records, and a
    station-code -> address mapping used by electromagnetic test records.

    Args:
        ocr_texts: OCR-recognized text fragments, in reading order.

    Returns:
        Dict with keys ``project``, ``standardReferences``,
        ``soundLevelMeterMode``, ``soundCalibratorMode``,
        ``calibrationValueBefore``, ``calibrationValueAfter``,
        ``weather_info`` (list of dicts) and ``address_mapping`` (dict).
        Missing fields stay as empty strings / containers.
    """
    keywords = {
        "project": "",
        "standardReferences": "",
        "soundLevelMeterMode": "",
        "soundCalibratorMode": "",
        "calibrationValueBefore": "",
        "calibrationValueAfter": "",
        "weather_info": [],  # one dict per detected date/weather row
        "address_mapping": {}  # station code -> address, for EM test records
    }
    if not ocr_texts:
        return keywords
    # Joined view of all fragments for whole-document regex matching.
    full_text = " ".join(ocr_texts)
    # --- Project name ---
    # NOTE(review): the bracketed part is a regex character class, so it
    # excludes the *individual characters* of the following labels, not the
    # whole words — TODO confirm this heuristic is intentional.
    project_match = re.search(r'项目名称[::]([^检测依据声级计声校准器检测前检测后气象条件日期]+)', full_text)
    if project_match:
        project = project_match.group(1).strip()
        # Trim anything from the next label onward.
        project = re.sub(r'检测依据.*$', '', project).strip()
        keywords["project"] = project
        logger.debug(f"[关键词提取] 提取到项目名称: {project}")
    else:
        # Label and value may be split across fragments: find the label
        # fragment, then look at the following fragments for the value.
        for i, text in enumerate(ocr_texts):
            if "项目名称" in text:
                # Value may follow a colon inside the same fragment.
                if ":" in text or ":" in text:
                    project_match = re.search(r'项目名称[::]([^检测依据声级计声校准器检测前检测后气象条件日期]+)', text)
                    if project_match:
                        project = project_match.group(1).strip()
                        project = re.sub(r'检测依据.*$', '', project).strip()
                        if project:
                            keywords["project"] = project
                            logger.debug(f"[关键词提取] 从当前文本提取到项目名称: {project}")
                            break
                # Bare label: scan up to two following fragments for the value.
                elif text.strip() == "项目名称" or text.strip().startswith("项目名称"):
                    for j in range(i + 1, min(i + 3, len(ocr_texts))):
                        next_text = ocr_texts[j].strip()
                        # Skip fragments that are themselves field labels.
                        if next_text and not re.match(r'^(检测依据|监测依据|检查依据|声级计|声校准器|检测前|检测后|气象条件|日期)', next_text):
                            # Project names are expected to contain CJK chars.
                            if re.search(r'[\u4e00-\u9fa5]', next_text):
                                # Cut at the next field label if fused together.
                                project = re.sub(r'(检测依据|监测依据|检查依据).*$', '', next_text).strip()
                                if project:
                                    keywords["project"] = project
                                    logger.debug(f"[关键词提取] 从后续文本提取到项目名称: {project}")
                                    break
                    if keywords["project"]:
                        break
    # --- Testing standard ---
    standard_match = re.search(r'检测依据[::]([^声级计声校准器检测前检测后气象条件日期]+)', full_text)
    if standard_match:
        standard = standard_match.group(1).strip()
        # Prefer explicit GB standard numbers when present.
        gb_standards = re.findall(r'GB\s*\d+[-\.]?\d*[-\.]?\d*', standard)
        if gb_standards:
            keywords["standardReferences"] = " ".join(gb_standards)
        else:
            keywords["standardReferences"] = standard.replace("□其他:", "").strip()
        logger.debug(f"[关键词提取] 提取到检测依据: {keywords['standardReferences']}")
    # --- Sound level meter model/serial ---
    sound_meter_match = re.search(r'声级计型号[/::]?(?:编号)?[::]\s*([A-Z0-9+/]+)', full_text)
    if sound_meter_match:
        keywords["soundLevelMeterMode"] = sound_meter_match.group(1).strip()
        logger.debug(f"[关键词提取] 提取到声级计型号: {keywords['soundLevelMeterMode']}")
    # --- Sound calibrator model/serial ---
    calibrator_match = re.search(r'声校准器型号[/::]?(?:编号)?[::]\s*([A-Z0-9+/]+)', full_text)
    if calibrator_match:
        keywords["soundCalibratorMode"] = calibrator_match.group(1).strip()
        logger.debug(f"[关键词提取] 提取到声校准器型号: {keywords['soundCalibratorMode']}")
    # --- Calibration values ---
    # Strategy: first try exact field labels; if that fails, fall back to
    # positional order (first dB(A) value = before, second = after).
    before_cal_found = False
    after_cal_found = False
    for i, text in enumerate(ocr_texts):
        if "检测前校准值" in text and not before_cal_found:
            # Label and value in the same fragment, e.g. "检测前校准值:93.8 dB(A)".
            before_cal_match = re.search(r'检测前校准值[::]\s*([0-9.]+)\s*dB[((]?A[))]?', text)
            if before_cal_match:
                cal_value = before_cal_match.group(1).strip()
                keywords["calibrationValueBefore"] = f"{cal_value} dB(A)"
                logger.debug(f"[关键词提取] 提取到检测前校准值: {keywords['calibrationValueBefore']}")
                before_cal_found = True
                continue
            # Bare label fragment: value is in one of the next fragments.
            elif re.search(r'检测前校准值[::]\s*$', text) or (text.strip() == "检测前校准值:"):
                # Scan up to 3 following fragments for a dB(A) value.
                for j in range(i + 1, min(i + 4, len(ocr_texts))):
                    next_text = ocr_texts[j]
                    db_match = re.search(r'([0-9.]+)\s*dB[((]?A[))]?', next_text)
                    if db_match:
                        cal_value = db_match.group(1).strip()
                        keywords["calibrationValueBefore"] = f"{cal_value} dB(A)"
                        logger.debug(f"[关键词提取] 从相邻文本提取到检测前校准值: {keywords['calibrationValueBefore']}")
                        before_cal_found = True
                        break
                if before_cal_found:
                    continue
        if "检测后校准值" in text and not after_cal_found:
            # Label followed by value, e.g. "检测后校准值:93.8 dB(A)".
            after_cal_match = re.search(r'检测后校准值[::]\s*([0-9.]+)\s*dB[((]?A[))]?', text)
            if after_cal_match:
                cal_value = after_cal_match.group(1).strip()
                keywords["calibrationValueAfter"] = f"{cal_value} dB(A)"
                logger.debug(f"[关键词提取] 提取到检测后校准值: {keywords['calibrationValueAfter']}")
                after_cal_found = True
                continue
            # Value *precedes* the label, e.g. "93.8dB(A)检测后校准值:".
            elif re.search(r'([0-9.]+)\s*dB[((]?A[))]?\s*检测后校准值', text):
                db_match = re.search(r'([0-9.]+)\s*dB[((]?A[))]?', text)
                if db_match:
                    cal_value = db_match.group(1).strip()
                    keywords["calibrationValueAfter"] = f"{cal_value} dB(A)"
                    logger.debug(f"[关键词提取] 从同一文本提取到检测后校准值: {keywords['calibrationValueAfter']}")
                    after_cal_found = True
                    continue
            # Bare label fragment: value is in one of the next fragments.
            elif re.search(r'检测后校准值[::]\s*$', text) or (text.strip() == "检测后校准值:"):
                # Scan up to 3 following fragments for a dB(A) value.
                for j in range(i + 1, min(i + 4, len(ocr_texts))):
                    next_text = ocr_texts[j]
                    db_match = re.search(r'([0-9.]+)\s*dB[((]?A[))]?', next_text)
                    if db_match:
                        cal_value = db_match.group(1).strip()
                        keywords["calibrationValueAfter"] = f"{cal_value} dB(A)"
                        logger.debug(f"[关键词提取] 从相邻文本提取到检测后校准值: {keywords['calibrationValueAfter']}")
                        after_cal_found = True
                        break
                if after_cal_found:
                    continue
    # Positional fallback: collect every dB(A) occurrence in document order.
    if not before_cal_found or not after_cal_found:
        db_a_matches = []  # (fragment index, value, fragment text)
        for i, text in enumerate(ocr_texts):
            db_matches = re.finditer(r'([0-9.]+)\s*dB[((]?A[))]?', text)
            for match in db_matches:
                cal_value = match.group(1).strip()
                db_a_matches.append((i, cal_value, text))
        # First occurrence -> "before" value.
        if db_a_matches and not before_cal_found:
            first_cal_value = db_a_matches[0][1]
            keywords["calibrationValueBefore"] = f"{first_cal_value} dB(A)"
            logger.debug(f"[关键词提取] 按出现顺序提取到检测前校准值(第一个dB(A)): {keywords['calibrationValueBefore']}")
            before_cal_found = True
        # Second occurrence -> "after" value.
        if len(db_a_matches) >= 2 and not after_cal_found:
            second_cal_value = db_a_matches[1][1]
            keywords["calibrationValueAfter"] = f"{second_cal_value} dB(A)"
            logger.debug(f"[关键词提取] 按出现顺序提取到检测后校准值(第二个dB(A)): {keywords['calibrationValueAfter']}")
            after_cal_found = True
        # Single occurrence and "before" already set: reuse it for "after".
        elif len(db_a_matches) == 1 and not after_cal_found and before_cal_found:
            if keywords["calibrationValueBefore"]:
                keywords["calibrationValueAfter"] = keywords["calibrationValueBefore"]
                logger.debug(f"[关键词提取] 检测前和检测后校准值相同: {keywords['calibrationValueAfter']}")
    # --- Weather records ---
    # A "日期:" fragment opens a record only if weather-ish tokens appear
    # within the next 10 fragments; fields then accumulate into the record.
    current_weather_info = None
    weather_start_idx = -1  # fragment index where the current record started
    for i, text in enumerate(ocr_texts):
        date_match = re.search(r'日期[::]\s*([\d.\-]+)', text)
        if date_match:
            # Require weather-related tokens nearby before opening a record.
            has_weather_info = False
            for j in range(i, min(i + 10, len(ocr_texts))):
                check_text = ocr_texts[j]
                if any(keyword in check_text for keyword in ["天气", "温度", "湿度", "风速", "风向", "℃", "%RH", "m/s"]):
                    has_weather_info = True
                    break
            if has_weather_info:
                # Flush the previous record if it captured anything.
                if current_weather_info and any([current_weather_info["monitorAt"], current_weather_info["weather"],
                                                 current_weather_info["temp"], current_weather_info["humidity"],
                                                 current_weather_info["windSpeed"], current_weather_info["windDirection"]]):
                    keywords["weather_info"].append(current_weather_info)
                current_weather_info = {
                    "monitorAt": date_match.group(1).strip(),
                    "weather": "",
                    "temp": "",
                    "humidity": "",
                    "windSpeed": "",
                    "windDirection": ""
                }
                weather_start_idx = i
        if current_weather_info:
            # Only accept fields within 10 fragments of the record's start,
            # to avoid attributing far-away values to this date.
            if weather_start_idx >= 0 and i <= weather_start_idx + 10:
                # Weather description, e.g. "天气晴".
                if not current_weather_info["weather"]:
                    weather_match = re.search(r'天气\s*([^\s温度湿度风速风向]+)', text)
                    if weather_match:
                        weather_value = weather_match.group(1).strip()
                        # Reject placeholders and bare numbers.
                        if weather_value and weather_value != "_" and not re.match(r'^[\d.\-]+$', weather_value):
                            current_weather_info["weather"] = weather_value
                # Temperature, e.g. "温度29.5-35.0".
                if not current_weather_info["temp"]:
                    temp_match = re.search(r'温度\s*([0-9.\-]+)', text)
                    if temp_match:
                        current_weather_info["temp"] = temp_match.group(1).strip()
                # Humidity: either inline ("湿度74.0-74.1") or in the next
                # fragment when this one ends with the label (e.g. "℃ 湿度").
                if not current_weather_info["humidity"]:
                    humidity_match = re.search(r'湿度\s*([0-9.\-]+)', text)
                    if humidity_match:
                        current_weather_info["humidity"] = humidity_match.group(1).strip()
                    elif "湿度" in text and i + 1 < len(ocr_texts):
                        next_text = ocr_texts[i + 1]
                        if re.match(r'^[0-9.\-]+', next_text):
                            current_weather_info["humidity"] = next_text.strip()
                # Wind speed: same inline / next-fragment pattern as humidity.
                if not current_weather_info["windSpeed"]:
                    wind_speed_match = re.search(r'风速\s*([0-9.\-]+)', text)
                    if wind_speed_match:
                        current_weather_info["windSpeed"] = wind_speed_match.group(1).strip()
                    elif "风速" in text and i + 1 < len(ocr_texts):
                        next_text = ocr_texts[i + 1]
                        if re.match(r'^[0-9.\-]+', next_text):
                            current_weather_info["windSpeed"] = next_text.strip()
                # Wind direction, attempt 1: inline "风向南风". The class must
                # not exclude 风, or "南风" would truncate to "南".
                if not current_weather_info["windDirection"]:
                    wind_dir_match = re.search(r'风向\s*([^\s日期温度湿度]+?)(?=\s|日期|温度|湿度|风速|$)', text)
                    if wind_dir_match:
                        wind_value = wind_dir_match.group(1).strip()
                        # Reject units and bare numbers.
                        if wind_value and wind_value != "m/s" and not re.match(r'^[0-9.\-]+$', wind_value):
                            # Single direction char: OCR may have split off a
                            # trailing "风" into the next fragment — rejoin it.
                            if len(wind_value) == 1 and i + 1 < len(ocr_texts):
                                next_text = ocr_texts[i + 1].strip()
                                if next_text == "风" or next_text.startswith("风"):
                                    wind_value = wind_value + "风"
                                    logger.debug(f"[关键词提取] 合并风向值: {wind_value}")
                            current_weather_info["windDirection"] = wind_value
                # Wind direction, attempt 2: fused form "_m/s风向南风".
                if not current_weather_info["windDirection"]:
                    wind_dir_match = re.search(r'[_\s]*m/s\s*风向\s*([^\s日期温度湿度]+?)(?=\s|日期|温度|湿度|风速|$)', text)
                    if wind_dir_match:
                        wind_value = wind_dir_match.group(1).strip()
                        if wind_value and not re.match(r'^[0-9.\-]+$', wind_value):
                            # Rejoin a split trailing "风" as above.
                            if len(wind_value) == 1 and i + 1 < len(ocr_texts):
                                next_text = ocr_texts[i + 1].strip()
                                if next_text == "风" or next_text.startswith("风"):
                                    wind_value = wind_value + "风"
                                    logger.debug(f"[关键词提取] 合并风向值: {wind_value}")
                            current_weather_info["windDirection"] = wind_value
                # Wind direction, attempt 3: label here, value in the next
                # fragment (possibly split again across the one after).
                if not current_weather_info["windDirection"]:
                    if ("m/s" in text or "风向" in text) and i + 1 < len(ocr_texts):
                        next_text = ocr_texts[i + 1].strip()
                        if next_text and not re.match(r'^[0-9.\-]+', next_text) and "风向" not in next_text:
                            wind_value = next_text
                            if len(wind_value) == 1 and i + 2 < len(ocr_texts):
                                next_next_text = ocr_texts[i + 2].strip()
                                if next_next_text == "风" or next_next_text.startswith("风"):
                                    wind_value = wind_value + "风"
                                    logger.debug(f"[关键词提取] 合并风向值: {wind_value}")
                            current_weather_info["windDirection"] = wind_value
    # Flush the final weather record if it captured anything.
    if current_weather_info and any([current_weather_info["monitorAt"], current_weather_info["weather"],
                                     current_weather_info["temp"], current_weather_info["humidity"],
                                     current_weather_info["windSpeed"], current_weather_info["windDirection"]]):
        keywords["weather_info"].append(current_weather_info)
    # --- Station code -> address mapping (EM test records) ---
    # Codes look like EB1/EZ2/ZB1/ZZ2; the address is a CJK place name in the
    # same fragment or in a nearby fragment before/after the code.
    for i, text in enumerate(ocr_texts):
        code_match = re.search(r'(E[ZB]\d+|Z[ZB]\d+)', text, re.IGNORECASE)
        if code_match:
            code = code_match.group(1).upper()  # normalize case
            address_candidates = []
            # Attempt 1: address trailing the code within the same fragment.
            code_pos = code_match.end()
            remaining_text = text[code_pos:].strip()
            # Drop leading whitespace/punctuation between code and address.
            remaining_text = re.sub(r'^[\s,,。、]+', '', remaining_text)
            # Reject fragments that are purely numeric / time-like.
            if remaining_text and not re.match(r'^[\d.\-:\s]+$', remaining_text):
                # Take text up to the next digit or known label.
                address_match = re.search(r'^([^\d\n]+?)(?=\d|时间|线高|$)', remaining_text)
                if address_match:
                    address = address_match.group(1).strip()
                    address = re.sub(r'[,。、\s]+$', '', address)
                    if address and len(address) > 0:
                        address_candidates.append(address)
            # Attempt 2: walk up to 5 fragments *backwards*, skipping codes,
            # numbers, times, heights, until a CJK place name appears.
            if not address_candidates:
                for j in range(i - 1, max(i - 6, -1), -1):
                    prev_text = ocr_texts[j].strip()
                    if not prev_text:
                        continue
                    if re.match(r'^(E[ZB]\d+|Z[ZB]\d+|\d+|时间|线高|编号|均值|24m|\d{4}[.\-]\d{1,2}[.\-]\d{1,2})', prev_text, re.IGNORECASE):
                        continue
                    # Require at least two consecutive CJK characters.
                    if re.search(r'[\u4e00-\u9fa5]{2,}', prev_text):
                        if not re.match(r'^[\d.\-:\s]+$', prev_text):
                            address_candidates.append(prev_text)
                            logger.debug(f"[关键词提取] 在编号{code}之前找到地址候选 (索引{j}): {prev_text}")
                            break  # first hit wins
            # Attempt 3: the fragment immediately after the code.
            if not address_candidates and i + 1 < len(ocr_texts):
                next_text = ocr_texts[i + 1].strip()
                if next_text and not re.match(r'^(E[ZB]\d+|Z[ZB]\d+|\d+|时间|线高|编号|均值|24m|\d{4}[.\-]\d{1,2}[.\-]\d{1,2})', next_text, re.IGNORECASE):
                    if re.search(r'[\u4e00-\u9fa5]{2,}', next_text):
                        address_candidates.append(next_text)
            # Keep the first candidate, trimmed of surrounding punctuation.
            if address_candidates:
                address = address_candidates[0]
                address = re.sub(r'^[,。、\s]+|[,。、\s]+$', '', address)
                if address:
                    keywords["address_mapping"][code] = address
                    logger.debug(f"[关键词提取] 提取到监测地点: {code} -> {address}")
    return keywords
def extract_keywords_from_markdown(markdown_content: str) -> Dict[str, Any]:
    """Extract key report fields directly from doc_parser markdown output.

    Counterpart of :func:`extract_keywords_from_ocr_texts` that works on one
    continuous markdown string (HTML tags may be embedded; they are stripped
    from individual extracted values, not from the whole document).

    Args:
        markdown_content: Raw markdown produced by the doc_parser CLI.

    Returns:
        Dict with keys ``project``, ``standardReferences``,
        ``soundLevelMeterMode``, ``soundCalibratorMode``,
        ``calibrationValueBefore``, ``calibrationValueAfter`` and
        ``weather_info`` (list of per-date dicts). Missing fields stay
        as empty strings / list. Note: no ``address_mapping`` key here,
        unlike the OCR-texts variant.
    """
    keywords = {
        "project": "",
        "standardReferences": "",
        "soundLevelMeterMode": "",
        "soundCalibratorMode": "",
        "calibrationValueBefore": "",
        "calibrationValueAfter": "",
        "weather_info": []  # one dict per detected date/weather row
    }
    if not markdown_content:
        return keywords
    # Work on the raw markdown; table structure is preserved and HTML tags
    # are stripped per extracted value below.
    text_content = markdown_content
    # --- Project name ---
    # NOTE(review): the bracketed part is a regex character class — it
    # excludes individual characters of the listed labels (and < >), not
    # whole words. TODO confirm this heuristic is intentional.
    project_match = re.search(r'项目名称[::]([^检测依据声级计声校准器检测前检测后气象条件日期<>]+)', text_content)
    if project_match:
        project = project_match.group(1).strip()
        # Trim trailing labels and any embedded HTML tags.
        project = re.sub(r'检测依据.*$', '', project).strip()
        project = re.sub(r'<[^>]+>', '', project).strip()
        if project:
            keywords["project"] = project
            logger.debug(f"[Markdown关键词提取] 提取到项目名称: {project}")
    # --- Testing standard ---
    standard_match = re.search(r'检测依据[::]([^声级计声校准器检测前检测后气象条件日期<>]+)', text_content)
    if standard_match:
        standard = standard_match.group(1).strip()
        # Prefer explicit GB standard numbers when present.
        gb_standards = re.findall(r'GB\s*\d+[-\.]?\d*[-\.]?\d*', standard)
        if gb_standards:
            keywords["standardReferences"] = " ".join(gb_standards)
        else:
            keywords["standardReferences"] = re.sub(r'<[^>]+>', '', standard).replace("□其他:", "").strip()
        logger.debug(f"[Markdown关键词提取] 提取到检测依据: {keywords['standardReferences']}")
    # --- Sound level meter model/serial ---
    sound_meter_match = re.search(r'声级计型号[/::]?(?:编号)?[::]\s*([A-Z0-9+/()()]+)', text_content)
    if sound_meter_match:
        sound_meter = sound_meter_match.group(1).strip()
        sound_meter = re.sub(r'<[^>]+>', '', sound_meter).strip()
        if sound_meter:
            keywords["soundLevelMeterMode"] = sound_meter
            logger.debug(f"[Markdown关键词提取] 提取到声级计型号: {keywords['soundLevelMeterMode']}")
    # --- Sound calibrator model/serial ---
    calibrator_match = re.search(r'声校准器型号[/::]?(?:编号)?[::]\s*([A-Z0-9+/()()]+)', text_content)
    if calibrator_match:
        calibrator = calibrator_match.group(1).strip()
        calibrator = re.sub(r'<[^>]+>', '', calibrator).strip()
        if calibrator:
            keywords["soundCalibratorMode"] = calibrator
            logger.debug(f"[Markdown关键词提取] 提取到声校准器型号: {keywords['soundCalibratorMode']}")
    # --- Calibration value before testing ---
    before_cal_match = re.search(r'检测前校准值[::]\s*([0-9.]+)\s*dB[((]?A[))]?', text_content)
    if before_cal_match:
        cal_value = before_cal_match.group(1).strip()
        keywords["calibrationValueBefore"] = f"{cal_value} dB(A)"
        logger.debug(f"[Markdown关键词提取] 提取到检测前校准值: {keywords['calibrationValueBefore']}")
    # --- Calibration value after testing ---
    after_cal_match = re.search(r'检测后校准值[::]\s*([0-9.]+)\s*dB[((]?A[))]?', text_content)
    if after_cal_match:
        cal_value = after_cal_match.group(1).strip()
        keywords["calibrationValueAfter"] = f"{cal_value} dB(A)"
        logger.debug(f"[Markdown关键词提取] 提取到检测后校准值: {keywords['calibrationValueAfter']}")
    # --- Weather records ---
    # Each "日期:" occurrence opens a record; its fields are searched within
    # the 500 characters following the date match.
    date_pattern = r'日期[::]\s*([\d.\-]+)'
    date_matches = list(re.finditer(date_pattern, text_content))
    for date_match in date_matches:
        date_value = date_match.group(1).strip()
        # Window after the date where this record's fields should live.
        start_pos = date_match.end()
        weather_section = text_content[start_pos:start_pos + 500]
        weather_info = {
            "monitorAt": date_value,
            "weather": "",
            "temp": "",
            "humidity": "",
            "windSpeed": "",
            "windDirection": ""
        }
        # Weather description, e.g. "天气晴".
        weather_match = re.search(r'天气\s*([^\s温度湿度风速风向<>]+)', weather_section)
        if weather_match:
            weather_value = weather_match.group(1).strip()
            weather_value = re.sub(r'<[^>]+>', '', weather_value).strip()
            # Reject placeholders and bare numbers.
            if weather_value and weather_value != "_" and not re.match(r'^[\d.\-]+$', weather_value):
                weather_info["weather"] = weather_value
        # Temperature, e.g. "温度29.5-35.0".
        temp_match = re.search(r'温度[::]?\s*([0-9.\-]+)', weather_section)
        if temp_match:
            weather_info["temp"] = temp_match.group(1).strip()
        # Humidity.
        humidity_match = re.search(r'湿度[::]?\s*([0-9.\-]+)', weather_section)
        if humidity_match:
            weather_info["humidity"] = humidity_match.group(1).strip()
        # Wind speed.
        wind_speed_match = re.search(r'风速[::]?\s*([0-9.\-]+)', weather_section)
        if wind_speed_match:
            weather_info["windSpeed"] = wind_speed_match.group(1).strip()
        # Wind direction — stop before the next label, tag, or whitespace.
        wind_dir_match = re.search(r'风向[::]?\s*([^\s日期温度湿度风速<>]+?)(?=\s|日期|温度|湿度|风速|$|<)', weather_section)
        if wind_dir_match:
            wind_value = wind_dir_match.group(1).strip()
            wind_value = re.sub(r'<[^>]+>', '', wind_value).strip()
            # Reject units and bare numbers.
            if wind_value and wind_value != "m/s" and not re.match(r'^[0-9.\-]+$', wind_value):
                weather_info["windDirection"] = wind_value
        # Keep the record only if at least one field was captured.
        if any([weather_info["monitorAt"], weather_info["weather"], weather_info["temp"],
                weather_info["humidity"], weather_info["windSpeed"], weather_info["windDirection"]]):
            keywords["weather_info"].append(weather_info)
            logger.debug(f"[Markdown关键词提取] 提取到天气记录: {weather_info}")
    return keywords
  1233. def supplement_missing_fields_from_ocr_json(
  1234. records: List[Dict[str, Any]],
  1235. ocr_json_path: str,
  1236. field_mapping: Dict[str, str] = None
  1237. ) -> List[Dict[str, Any]]:
  1238. """从OCR的JSON输出中补充缺失字段
  1239. 根据文本位置关系来补充缺失字段。例如,如果找到了maxReactivePower的值(如"-2.48"),
  1240. 那么minReactivePower的值就在它后面的位置("-4.75")。
  1241. Args:
  1242. records: 原始解析记录列表(OperationalConditionV2格式)
  1243. ocr_json_path: OCR输出的JSON文件路径
  1244. field_mapping: 字段映射关系,如{"maxReactivePower": "minReactivePower"},表示maxReactivePower后面是minReactivePower
  1245. Returns:
  1246. 补充后的记录列表
  1247. """
  1248. if not records or not ocr_json_path or not os.path.exists(ocr_json_path):
  1249. return records
  1250. try:
  1251. # 读取OCR JSON文件
  1252. with open(ocr_json_path, 'r', encoding='utf-8') as f:
  1253. ocr_data = json.load(f)
  1254. # 提取rec_texts数组
  1255. rec_texts = ocr_data.get("rec_texts", [])
  1256. if not rec_texts:
  1257. logger.warning("[OCR字段补充] JSON中未找到rec_texts字段")
  1258. return records
  1259. logger.info(f"[OCR字段补充] 从OCR JSON中提取到 {len(rec_texts)} 个文本片段")
  1260. # 默认字段映射:max字段后面是min字段
  1261. if field_mapping is None:
  1262. field_mapping = {
  1263. "maxVoltage": "minVoltage",
  1264. "maxCurrent": "minCurrent",
  1265. "maxActivePower": "minActivePower",
  1266. "maxReactivePower": "minReactivePower"
  1267. }
  1268. # 为每条记录补充缺失字段
  1269. for record in records:
  1270. record_name = record.get("name", "")
  1271. logger.debug(f"[OCR字段补充] 处理记录: {record_name}")
  1272. # 对于每个max字段,如果对应的min字段为空,尝试从OCR中补充
  1273. for max_field, min_field in field_mapping.items():
  1274. max_value = record.get(max_field, "").strip()
  1275. min_value = record.get(min_field, "").strip()
  1276. # 如果max字段有值但min字段为空,尝试从OCR中补充
  1277. if max_value and not min_value:
  1278. logger.debug(f"[OCR字段补充] 记录 {record_name}: {max_field}={max_value}, {min_field}为空,尝试从OCR补充")
  1279. # 在rec_texts中查找max_value
  1280. try:
  1281. max_value_float = float(max_value)
  1282. # 查找匹配的文本(允许小的数值差异)
  1283. found_max = False
  1284. for i, text in enumerate(rec_texts):
  1285. # 尝试将文本转换为数值
  1286. try:
  1287. text_float = float(text.strip())
  1288. # 如果数值匹配(允许小的误差)
  1289. if abs(text_float - max_value_float) < 0.01:
  1290. found_max = True
  1291. # 检查后续几个文本,找到第一个数值作为min_value
  1292. # 在表格中,max和min通常是相邻的,但中间可能有其他文本
  1293. for j in range(i + 1, min(i + 5, len(rec_texts))): # 检查后续最多4个文本
  1294. next_text = rec_texts[j].strip()
  1295. try:
  1296. next_value_float = float(next_text)
  1297. # 如果找到数值,且与max_value不同,则作为min_value
  1298. if abs(next_value_float - max_value_float) > 0.01:
  1299. record[min_field] = next_text
  1300. logger.info(f"[OCR字段补充] 从OCR补充 {min_field}: {next_text} (在 {max_field}={max_value} 之后,位置 {j})")
  1301. break
  1302. except ValueError:
  1303. # 不是数值,继续查找
  1304. continue
  1305. if record.get(min_field):
  1306. break
  1307. except ValueError:
  1308. # 文本不是数值,继续
  1309. pass
  1310. if not found_max:
  1311. logger.debug(f"[OCR字段补充] 未在OCR中找到 {max_field} 的值 '{max_value}'")
  1312. except ValueError:
  1313. # max_value不是数值,跳过
  1314. logger.debug(f"[OCR字段补充] {max_field}值 '{max_value}' 不是数值,跳过")
  1315. pass
  1316. logger.info("[OCR字段补充] 字段补充完成")
  1317. return records
  1318. except Exception as e:
  1319. logger.exception(f"[OCR字段补充] 补充过程出错: {e}")
  1320. return records
  1321. def extract_image_from_markdown(markdown_content: str, output_dir: str) -> Optional[str]:
  1322. """从markdown内容中提取第一张图片路径
  1323. Args:
  1324. markdown_content: markdown内容
  1325. output_dir: 输出目录
  1326. Returns:
  1327. 图片路径,如果未找到返回None
  1328. """
  1329. # 查找markdown中的图片引用
  1330. # 格式: ![alt](path) 或 <img src="path">
  1331. image_patterns = [
  1332. r'!\[.*?\]\((.*?)\)', # markdown图片格式
  1333. r'<img[^>]+src=["\'](.*?)["\']', # HTML img标签
  1334. r'<img[^>]+src=(.*?)(?:\s|>)', # HTML img标签(无引号)
  1335. ]
  1336. for pattern in image_patterns:
  1337. matches = re.findall(pattern, markdown_content)
  1338. if matches:
  1339. image_path = matches[0]
  1340. # 如果是相对路径,尝试在output_dir中查找
  1341. if not os.path.isabs(image_path):
  1342. # 尝试多个可能的路径
  1343. possible_paths = [
  1344. os.path.join(output_dir, image_path),
  1345. os.path.join(output_dir, "images", os.path.basename(image_path)),
  1346. os.path.join(output_dir, os.path.basename(image_path)),
  1347. ]
  1348. for full_path in possible_paths:
  1349. if os.path.exists(full_path):
  1350. return full_path
  1351. elif os.path.exists(image_path):
  1352. return image_path
  1353. return None
  1354. def fallback_parse_with_paddleocr(
  1355. json_data: Dict[str, Any],
  1356. markdown_content: str,
  1357. output_dir: Optional[str] = None,
  1358. document_type: Optional[str] = None,
  1359. input_file: Optional[str] = None
  1360. ) -> Optional[str]:
  1361. """当JSON数据缺失时,使用paddleocr进行备用解析
  1362. Args:
  1363. json_data: 原始JSON数据
  1364. markdown_content: 原始markdown内容
  1365. output_dir: 输出目录(用于查找图片)
  1366. document_type: 文档类型
  1367. input_file: 原始输入文件路径(PDF或图片),如果未找到图片则从PDF提取第一页
  1368. Returns:
  1369. 补充后的markdown内容,如果失败返回None
  1370. """
  1371. try:
  1372. # 注意:调用方已经检查过数据完整性,这里不再重复检查
  1373. # 直接进行备用解析,因为调用方已经确定需要备用解析
  1374. doc_type = document_type or json_data.get("document_type", "unknown")
  1375. logger.warning("[PaddleOCR备用] 启用PaddleOCR备用解析")
  1376. # 尝试从markdown中提取图片路径
  1377. image_path = None
  1378. if output_dir:
  1379. # 首先尝试从markdown中提取
  1380. image_path = extract_image_from_markdown(markdown_content, output_dir)
  1381. if image_path:
  1382. logger.info(f"[PaddleOCR备用] 从markdown中找到图片: {image_path}")
  1383. # 如果找不到,尝试在output_dir中查找png文件
  1384. if not image_path and os.path.exists(output_dir):
  1385. # 查找所有png文件
  1386. png_files = list(Path(output_dir).rglob("*.png"))
  1387. if png_files:
  1388. # 优先查找包含"粘贴"或"image"的文件名
  1389. for png_file in png_files:
  1390. if "粘贴" in png_file.name or "image" in png_file.name.lower():
  1391. image_path = str(png_file)
  1392. logger.info(f"[PaddleOCR备用] 使用找到的图片: {image_path}")
  1393. break
  1394. # 如果没找到特殊名称的,使用第一个
  1395. if not image_path:
  1396. image_path = str(png_files[0])
  1397. logger.info(f"[PaddleOCR备用] 使用找到的图片: {image_path}")
  1398. # 如果仍未找到图片,尝试从input_file处理
  1399. if not image_path:
  1400. logger.warning("[PaddleOCR备用] 未找到可用的图片文件,尝试从input_file处理")
  1401. if input_file and os.path.exists(input_file):
  1402. # 检测文件实际类型(不依赖扩展名)
  1403. file_type = detect_file_type(input_file)
  1404. if file_type == 'pdf':
  1405. # 文件是PDF,尝试提取第一页
  1406. pdf_path = input_file
  1407. logger.info(f"[PaddleOCR备用] 检测到PDF文件(通过内容): {pdf_path}")
  1408. image_path = extract_first_page_from_pdf(pdf_path, output_dir)
  1409. if image_path:
  1410. logger.info(f"[PaddleOCR备用] 成功从PDF提取第一页图片: {image_path}")
  1411. else:
  1412. logger.warning("[PaddleOCR备用] 从PDF提取图片失败(可能是PDF文件损坏或缺少必要的库)")
  1413. elif file_type in ['png', 'jpeg', 'jpg']:
  1414. # 文件是图片,直接使用
  1415. image_path = input_file
  1416. logger.info(f"[PaddleOCR备用] 检测到图片文件({file_type}): {image_path}")
  1417. else:
  1418. # 文件类型未知,尝试按PDF处理(可能是PDF但没有正确识别)
  1419. logger.debug(f"[PaddleOCR备用] input_file类型未知({file_type}),尝试按PDF处理: {input_file}")
  1420. if PDFIUM_AVAILABLE or PDF2IMAGE_AVAILABLE:
  1421. try:
  1422. # 尝试打开为PDF
  1423. pdf_path = input_file
  1424. image_path = extract_first_page_from_pdf(pdf_path, output_dir)
  1425. if image_path:
  1426. logger.info(f"[PaddleOCR备用] 成功将文件作为PDF处理并提取第一页: {image_path}")
  1427. except Exception as e:
  1428. logger.debug(f"[PaddleOCR备用] 无法将文件作为PDF处理: {e}")
  1429. # 如果input_file处理失败,尝试在output_dir中查找PDF文件
  1430. if not image_path and output_dir:
  1431. pdf_path = find_pdf_file(output_dir)
  1432. if pdf_path:
  1433. logger.info(f"[PaddleOCR备用] 在输出目录中找到PDF文件: {pdf_path}")
  1434. image_path = extract_first_page_from_pdf(pdf_path, output_dir)
  1435. if image_path:
  1436. logger.info(f"[PaddleOCR备用] 成功从PDF提取第一页图片: {image_path}")
  1437. # 如果仍未找到,尝试在input_file的父目录中查找
  1438. if not image_path and input_file:
  1439. parent_dir = os.path.dirname(input_file)
  1440. if parent_dir and os.path.exists(parent_dir):
  1441. pdf_path = find_pdf_file(parent_dir)
  1442. if pdf_path:
  1443. logger.info(f"[PaddleOCR备用] 在input_file父目录中找到PDF文件: {pdf_path}")
  1444. image_path = extract_first_page_from_pdf(pdf_path, output_dir)
  1445. if image_path:
  1446. logger.info(f"[PaddleOCR备用] 成功从PDF提取第一页图片: {image_path}")
  1447. if not image_path:
  1448. logger.warning(f"[PaddleOCR备用] 未找到可用的图片或PDF文件(input_file={input_file}, output_dir={output_dir}),无法进行备用解析")
  1449. logger.info("[PaddleOCR备用] 备用解析需要图片文件或PDF文件,如果都没有,将返回原始markdown内容")
  1450. if not image_path:
  1451. logger.warning("[PaddleOCR备用] 未找到可用的图片文件,备用解析无法进行,返回None(将使用原始解析结果)")
  1452. return None
  1453. # 使用doc_parser模式解析文档结构
  1454. logger.info("[PaddleOCR备用] 使用doc_parser模式解析文档结构")
  1455. paddleocr_result = call_paddleocr(image_path)
  1456. if not paddleocr_result:
  1457. logger.error("[PaddleOCR备用] PaddleOCR解析失败")
  1458. return None
  1459. # 检查返回结果格式
  1460. if "markdown_content" in paddleocr_result:
  1461. # 直接从MD文件读取的内容
  1462. paddleocr_markdown = paddleocr_result["markdown_content"]
  1463. logger.info(f"[PaddleOCR备用] 成功从MD文件读取,生成 {len(paddleocr_markdown)} 字符的markdown")
  1464. # 从markdown内容中提取关键词来补充数据
  1465. logger.info("[PaddleOCR备用] 从MD文件内容中提取关键词补充数据")
  1466. keywords = extract_keywords_from_markdown(paddleocr_markdown)
  1467. # 将关键词信息添加到markdown中(作为注释,供后续解析使用)
  1468. keywords_comment = "\n\n<!-- Markdown关键词补充:\n"
  1469. if keywords["project"]:
  1470. keywords_comment += f"项目名称:{keywords['project']}\n"
  1471. if keywords["standardReferences"]:
  1472. keywords_comment += f"检测依据:{keywords['standardReferences']}\n"
  1473. if keywords["soundLevelMeterMode"]:
  1474. keywords_comment += f"声级计型号/编号:{keywords['soundLevelMeterMode']}\n"
  1475. if keywords["soundCalibratorMode"]:
  1476. keywords_comment += f"声校准器型号/编号:{keywords['soundCalibratorMode']}\n"
  1477. if keywords["calibrationValueBefore"]:
  1478. keywords_comment += f"检测前校准值:{keywords['calibrationValueBefore']}\n"
  1479. if keywords["calibrationValueAfter"]:
  1480. keywords_comment += f"检测后校准值:{keywords['calibrationValueAfter']}\n"
  1481. if keywords.get("address_mapping"):
  1482. for code, address in keywords["address_mapping"].items():
  1483. keywords_comment += f"监测地点-{code}:{address}\n"
  1484. if keywords["weather_info"]:
  1485. for weather in keywords["weather_info"]:
  1486. keywords_comment += f"日期:{weather['monitorAt']} 天气:{weather['weather']} 温度:{weather['temp']} 湿度:{weather['humidity']} 风速:{weather['windSpeed']} 风向:{weather['windDirection']}\n"
  1487. keywords_comment += "-->\n"
  1488. # 将关键词信息合并到markdown中
  1489. paddleocr_markdown = paddleocr_markdown + keywords_comment
  1490. # 统计补充的字段数量(不包括weather_info列表)
  1491. field_count = sum(1 for k, v in keywords.items() if k != "weather_info" and v) + len(keywords.get("weather_info", []))
  1492. logger.info(f"[PaddleOCR备用] MD文件关键词提取完成,补充了 {field_count} 个字段")
  1493. elif "parsing_res_list" in paddleocr_result:
  1494. # 从JSON或stdout解析的结果,需要转换为markdown
  1495. paddleocr_markdown = paddleocr_to_markdown(paddleocr_result)
  1496. if not paddleocr_markdown:
  1497. logger.warning("[PaddleOCR备用] PaddleOCR未解析出有效内容")
  1498. return None
  1499. logger.info(f"[PaddleOCR备用] 成功解析,生成 {len(paddleocr_markdown)} 字符的markdown")
  1500. else:
  1501. logger.error("[PaddleOCR备用] PaddleOCR返回格式不正确")
  1502. return None
  1503. # 调用paddleocr ocr提取关键词来补充数据(作为doc_parser的补充)
  1504. logger.info("[PaddleOCR备用] 调用OCR提取关键词补充数据")
  1505. ocr_save_path = os.path.dirname(image_path) # 使用图片所在目录作为保存路径
  1506. ocr_texts, _ = call_paddleocr_ocr(image_path, ocr_save_path)
  1507. if ocr_texts:
  1508. # 从OCR文本中提取关键词
  1509. keywords = extract_keywords_from_ocr_texts(ocr_texts)
  1510. # 将关键词信息添加到markdown中(作为注释,供后续解析使用)
  1511. keywords_comment = "\n\n<!-- OCR关键词补充:\n"
  1512. if keywords["project"]:
  1513. keywords_comment += f"项目名称:{keywords['project']}\n"
  1514. if keywords["standardReferences"]:
  1515. keywords_comment += f"检测依据:{keywords['standardReferences']}\n"
  1516. if keywords["soundLevelMeterMode"]:
  1517. keywords_comment += f"声级计型号/编号:{keywords['soundLevelMeterMode']}\n"
  1518. if keywords["soundCalibratorMode"]:
  1519. keywords_comment += f"声校准器型号/编号:{keywords['soundCalibratorMode']}\n"
  1520. if keywords["calibrationValueBefore"]:
  1521. keywords_comment += f"检测前校准值:{keywords['calibrationValueBefore']}\n"
  1522. if keywords.get("address_mapping"):
  1523. for code, address in keywords["address_mapping"].items():
  1524. keywords_comment += f"监测地点-{code}:{address}\n"
  1525. if keywords["calibrationValueAfter"]:
  1526. keywords_comment += f"检测后校准值:{keywords['calibrationValueAfter']}\n"
  1527. if keywords["weather_info"]:
  1528. for weather in keywords["weather_info"]:
  1529. keywords_comment += f"日期:{weather['monitorAt']} 天气:{weather['weather']} 温度:{weather['temp']} 湿度:{weather['humidity']} 风速:{weather['windSpeed']} 风向:{weather['windDirection']}\n"
  1530. keywords_comment += "-->\n"
  1531. # 将关键词信息合并到markdown中
  1532. paddleocr_markdown = paddleocr_markdown + keywords_comment
  1533. logger.info(f"[PaddleOCR备用] OCR关键词提取完成,补充了 {len(keywords)} 个字段")
  1534. # 合并原始markdown和paddleocr结果
  1535. # 优先使用paddleocr的结果,因为它更完整
  1536. combined_markdown = f"{paddleocr_markdown}\n\n<!-- 原始内容(可能不完整) -->\n{markdown_content}"
  1537. return combined_markdown
  1538. except Exception as e:
  1539. logger.exception(f"[PaddleOCR备用] 备用解析过程出错: {e}")
  1540. return None
  1541. def extract_text_with_paragraphs_from_ocr_json(json_path: str, line_height_threshold: float = 1.5, paragraph_gap_threshold: float = 2.0) -> str:
  1542. """
  1543. 从PaddleOCR的JSON输出中提取带段落分割的纯文本
  1544. Args:
  1545. json_path: OCR输出的JSON文件路径
  1546. line_height_threshold: 行高倍数阈值,用于判断是否在同一行(默认1.5)
  1547. paragraph_gap_threshold: 段落间距倍数阈值,用于判断是否需要分段(默认2.0)
  1548. Returns:
  1549. 带段落分割的纯文本字符串
  1550. """
  1551. try:
  1552. with open(json_path, 'r', encoding='utf-8') as f:
  1553. ocr_data = json.load(f)
  1554. # 提取文本和坐标信息
  1555. rec_texts = ocr_data.get("rec_texts", [])
  1556. dt_polys = ocr_data.get("dt_polys", [])
  1557. if not rec_texts or not dt_polys:
  1558. logger.warning("[OCR文本提取] JSON中缺少rec_texts或dt_polys字段")
  1559. return ""
  1560. if len(rec_texts) != len(dt_polys):
  1561. logger.warning(f"[OCR文本提取] rec_texts长度({len(rec_texts)})与dt_polys长度({len(dt_polys)})不匹配")
  1562. # 取较小的长度
  1563. min_len = min(len(rec_texts), len(dt_polys))
  1564. rec_texts = rec_texts[:min_len]
  1565. dt_polys = dt_polys[:min_len]
  1566. # 计算每个文本块的边界框和中心点
  1567. text_blocks = []
  1568. for i, (text, poly) in enumerate(zip(rec_texts, dt_polys)):
  1569. if not text or not text.strip():
  1570. continue
  1571. # 从多边形坐标计算边界框
  1572. # poly格式: [[x1, y1], [x2, y2], [x3, y3], [x4, y4]]
  1573. if len(poly) >= 4:
  1574. xs = [point[0] for point in poly]
  1575. ys = [point[1] for point in poly]
  1576. x_min, x_max = min(xs), max(xs)
  1577. y_min, y_max = min(ys), max(ys)
  1578. # 计算中心点和高度
  1579. center_x = (x_min + x_max) / 2
  1580. center_y = (y_min + y_max) / 2
  1581. height = y_max - y_min
  1582. width = x_max - x_min
  1583. text_blocks.append({
  1584. 'text': text.strip(),
  1585. 'x_min': x_min,
  1586. 'x_max': x_max,
  1587. 'y_min': y_min,
  1588. 'y_max': y_max,
  1589. 'center_x': center_x,
  1590. 'center_y': center_y,
  1591. 'height': height,
  1592. 'width': width,
  1593. 'index': i
  1594. })
  1595. if not text_blocks:
  1596. logger.warning("[OCR文本提取] 没有有效的文本块")
  1597. return ""
  1598. # 按Y坐标(从上到下)排序
  1599. text_blocks.sort(key=lambda b: (b['y_min'], b['x_min']))
  1600. # 计算平均行高(用于判断行间距)
  1601. heights = [b['height'] for b in text_blocks]
  1602. avg_height = sum(heights) / len(heights) if heights else 20
  1603. # 将文本块按行分组
  1604. lines = []
  1605. current_line = [text_blocks[0]]
  1606. for i in range(1, len(text_blocks)):
  1607. prev_block = text_blocks[i - 1]
  1608. curr_block = text_blocks[i]
  1609. # 计算Y坐标重叠度
  1610. y_overlap = min(prev_block['y_max'], curr_block['y_max']) - max(prev_block['y_min'], curr_block['y_min'])
  1611. overlap_ratio = y_overlap / min(prev_block['height'], curr_block['height']) if min(prev_block['height'], curr_block['height']) > 0 else 0
  1612. # 计算Y坐标间距
  1613. y_gap = curr_block['y_min'] - prev_block['y_max']
  1614. gap_ratio = y_gap / avg_height if avg_height > 0 else 0
  1615. # 判断是否在同一行:有重叠或间距小于行高阈值
  1616. if overlap_ratio > 0.3 or (y_gap >= 0 and gap_ratio < line_height_threshold):
  1617. current_line.append(curr_block)
  1618. else:
  1619. # 新行开始,保存当前行
  1620. lines.append(current_line)
  1621. current_line = [curr_block]
  1622. # 添加最后一行
  1623. if current_line:
  1624. lines.append(current_line)
  1625. # 对每行内的文本块按X坐标排序(从左到右)
  1626. for line in lines:
  1627. line.sort(key=lambda b: b['x_min'])
  1628. # 生成文本,根据行间距判断段落分割
  1629. result_lines = []
  1630. prev_line_y = None
  1631. prev_line_height = None
  1632. for line_idx, line in enumerate(lines):
  1633. # 计算当前行的Y坐标和高度
  1634. line_y_min = min(b['y_min'] for b in line)
  1635. line_y_max = max(b['y_max'] for b in line)
  1636. line_height = line_y_max - line_y_min
  1637. line_center_y = (line_y_min + line_y_max) / 2
  1638. # 拼接当前行的文本
  1639. # 对于表格数据,使用制表符分隔;对于普通文本,使用空格
  1640. line_text = ""
  1641. prev_x_max = None
  1642. # 判断是否是表格行(如果一行中有多个文本块且X坐标分布较均匀)
  1643. is_table_row = len(line) > 2
  1644. for block in line:
  1645. if prev_x_max is not None:
  1646. x_gap = block['x_min'] - prev_x_max
  1647. # 如果间距较大,添加分隔符
  1648. if x_gap > avg_height * 0.3:
  1649. if is_table_row:
  1650. # 表格使用制表符
  1651. line_text += "\t"
  1652. else:
  1653. # 普通文本使用空格
  1654. line_text += " "
  1655. line_text += block['text']
  1656. prev_x_max = block['x_max']
  1657. # 判断是否需要换段
  1658. if prev_line_y is not None and prev_line_height is not None:
  1659. # 计算行间距
  1660. line_gap = line_y_min - prev_line_y
  1661. gap_ratio = line_gap / prev_line_height if prev_line_height > 0 else 0
  1662. # 如果行间距大于段落阈值,添加空行
  1663. if gap_ratio > paragraph_gap_threshold:
  1664. result_lines.append("") # 空行表示段落分隔
  1665. result_lines.append(line_text)
  1666. prev_line_y = line_y_max
  1667. prev_line_height = line_height
  1668. # 合并为最终文本
  1669. result_text = "\n".join(result_lines)
  1670. logger.info(f"[OCR文本提取] 成功提取文本,共 {len(lines)} 行,{len(result_lines)} 行(含段落分隔)")
  1671. return result_text
  1672. except Exception as e:
  1673. logger.exception(f"[OCR文本提取] 处理失败: {e}")
  1674. return ""