paddleocr_fallback.py 94 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075
  1. # Copyright (c) Opendatalab. All rights reserved.
  2. """PaddleOCR备用解析模块 - 当MinerU解析结果缺失时使用"""
  3. import json
  4. import os
  5. import subprocess
  6. import tempfile
  7. import time
  8. import random
  9. import fcntl
  10. from pathlib import Path
  11. from typing import Dict, Any, Optional, List, Tuple
  12. import ast
  13. import re
  14. from ..utils.logging_config import get_logger
  15. logger = get_logger("pdf_converter_v2.utils.paddleocr")
  16. try:
  17. import pypdfium2 as pdfium
  18. PDFIUM_AVAILABLE = True
  19. except ImportError:
  20. PDFIUM_AVAILABLE = False
  21. logger.warning("[PaddleOCR备用] pypdfium2未安装,无法从PDF提取图片")
  22. try:
  23. from pdf2image import convert_from_path
  24. PDF2IMAGE_AVAILABLE = True
  25. except ImportError:
  26. PDF2IMAGE_AVAILABLE = False
  27. logger.warning("[PaddleOCR备用] pdf2image未安装,无法使用备用方法从PDF提取图片")
  28. try:
  29. from PIL import Image
  30. PIL_AVAILABLE = True
  31. except ImportError:
  32. PIL_AVAILABLE = False
  33. logger.warning("[PaddleOCR备用] PIL未安装,无法处理图片")
  34. # 用于管理mineru服务状态的锁文件路径
  35. MINERU_LOCK_FILE = "/tmp/mineru_service_lock"
  36. MINERU_COUNT_FILE = "/tmp/mineru_service_count"
  37. # PaddleOCR 推理设备:NPU 环境下需设为 npu 或 npu:0,否则会走 CPU 并可能段错误
  38. # 通过环境变量 PADDLE_OCR_DEVICE 指定;未设置时根据设备环境自动选择(NPU 下默认 npu:0)
  39. def _paddle_ocr_device_args() -> list:
  40. """返回 PaddleOCR 命令的 --device 参数列表(若未设置则返回空列表)"""
  41. device = os.getenv("PADDLE_OCR_DEVICE", "").strip()
  42. if not device:
  43. from .device_env import is_npu
  44. if is_npu():
  45. device = "npu:0"
  46. if device:
  47. return ["--device", device]
  48. return []
  49. def _acquire_service_lock() -> Optional[object]:
  50. """获取服务操作锁(文件锁)
  51. Returns:
  52. 文件对象(用于释放锁),如果失败返回None
  53. """
  54. try:
  55. lock_file = open(MINERU_LOCK_FILE, 'w')
  56. fcntl.flock(lock_file.fileno(), fcntl.LOCK_EX | fcntl.LOCK_NB)
  57. return lock_file
  58. except (IOError, OSError) as e:
  59. logger.debug(f"[PaddleOCR] 获取服务锁失败(可能其他进程正在操作): {e}")
  60. return None
  61. def _release_service_lock(lock_file: object) -> None:
  62. """释放服务操作锁
  63. Args:
  64. lock_file: 锁文件对象
  65. """
  66. try:
  67. if lock_file:
  68. fcntl.flock(lock_file.fileno(), fcntl.LOCK_UN)
  69. lock_file.close()
  70. except Exception as e:
  71. logger.warning(f"[PaddleOCR] 释放服务锁失败: {e}")
  72. def _increment_service_count(lock_file: object) -> int:
  73. """增加服务使用计数(需要在锁保护下调用)
  74. Args:
  75. lock_file: 已获取的锁文件对象
  76. Returns:
  77. 当前计数
  78. """
  79. try:
  80. count = 0
  81. if os.path.exists(MINERU_COUNT_FILE):
  82. with open(MINERU_COUNT_FILE, 'r') as f:
  83. count = int(f.read().strip() or '0')
  84. count += 1
  85. with open(MINERU_COUNT_FILE, 'w') as f:
  86. f.write(str(count))
  87. return count
  88. except Exception as e:
  89. logger.warning(f"[PaddleOCR] 增加服务计数失败: {e}")
  90. return 1
  91. def _decrement_service_count(lock_file: object) -> int:
  92. """减少服务使用计数(需要在锁保护下调用)
  93. Args:
  94. lock_file: 已获取的锁文件对象
  95. Returns:
  96. 当前计数
  97. """
  98. try:
  99. count = 0
  100. if os.path.exists(MINERU_COUNT_FILE):
  101. with open(MINERU_COUNT_FILE, 'r') as f:
  102. count = int(f.read().strip() or '0')
  103. count = max(0, count - 1)
  104. with open(MINERU_COUNT_FILE, 'w') as f:
  105. f.write(str(count))
  106. return count
  107. except Exception as e:
  108. logger.warning(f"[PaddleOCR] 减少服务计数失败: {e}")
  109. return 0
  110. def stop_mineru_service() -> bool:
  111. """停止mineru-api.service以释放GPU内存(线程安全)
  112. Returns:
  113. True表示成功停止或已停止,False表示失败
  114. """
  115. lock_file = _acquire_service_lock()
  116. if not lock_file:
  117. # 如果无法获取锁,等待一小段时间后检查服务状态
  118. time.sleep(0.5)
  119. try:
  120. result = subprocess.run(
  121. ["systemctl", "is-active", "mineru-api.service"],
  122. capture_output=True,
  123. text=True,
  124. timeout=5,
  125. check=False
  126. )
  127. if result.returncode != 0 or result.stdout.strip() != "active":
  128. # 服务已经停止
  129. logger.debug("[PaddleOCR] 服务已停止(其他进程已处理)")
  130. return True
  131. except Exception:
  132. pass
  133. return False
  134. try:
  135. # 检查服务当前状态
  136. result = subprocess.run(
  137. ["systemctl", "is-active", "mineru-api.service"],
  138. capture_output=True,
  139. text=True,
  140. timeout=5,
  141. check=False
  142. )
  143. is_active = result.returncode == 0 and result.stdout.strip() == "active"
  144. if not is_active:
  145. logger.debug("[PaddleOCR] mineru-api.service已经停止")
  146. return True
  147. # 增加使用计数(在锁保护下)
  148. count = _increment_service_count(lock_file)
  149. logger.debug(f"[PaddleOCR] 服务使用计数: {count}")
  150. # 停止服务
  151. result = subprocess.run(
  152. ["systemctl", "stop", "mineru-api.service"],
  153. capture_output=True,
  154. text=True,
  155. timeout=30,
  156. check=False
  157. )
  158. if result.returncode == 0:
  159. logger.info("[PaddleOCR] 成功停止mineru-api.service以释放GPU内存")
  160. return True
  161. else:
  162. logger.warning(f"[PaddleOCR] 停止mineru-api.service失败: {result.stderr}")
  163. _decrement_service_count(lock_file) # 回滚计数
  164. return False
  165. except Exception as e:
  166. logger.warning(f"[PaddleOCR] 停止mineru-api.service时出错: {e}")
  167. if lock_file:
  168. _decrement_service_count(lock_file) # 回滚计数
  169. return False
  170. finally:
  171. _release_service_lock(lock_file)
  172. def start_mineru_service() -> bool:
  173. """启动mineru-api.service(线程安全)
  174. Returns:
  175. True表示成功启动或已启动,False表示失败
  176. """
  177. lock_file = _acquire_service_lock()
  178. if not lock_file:
  179. # 如果无法获取锁,等待一小段时间后检查服务状态
  180. time.sleep(0.5)
  181. try:
  182. result = subprocess.run(
  183. ["systemctl", "is-active", "mineru-api.service"],
  184. capture_output=True,
  185. text=True,
  186. timeout=5,
  187. check=False
  188. )
  189. if result.returncode == 0 and result.stdout.strip() == "active":
  190. # 服务已经启动
  191. logger.debug("[PaddleOCR] 服务已启动(其他进程已处理)")
  192. return True
  193. except Exception:
  194. pass
  195. return False
  196. try:
  197. # 减少使用计数(在锁保护下)
  198. count = _decrement_service_count(lock_file)
  199. logger.debug(f"[PaddleOCR] 服务使用计数: {count}")
  200. # 如果还有其他进程在使用,不启动服务
  201. if count > 0:
  202. logger.info(f"[PaddleOCR] 还有其他进程在使用GPU(计数={count}),暂不启动mineru-api.service")
  203. return True
  204. # 检查服务当前状态
  205. result = subprocess.run(
  206. ["systemctl", "is-active", "mineru-api.service"],
  207. capture_output=True,
  208. text=True,
  209. timeout=5,
  210. check=False
  211. )
  212. is_active = result.returncode == 0 and result.stdout.strip() == "active"
  213. if is_active:
  214. logger.debug("[PaddleOCR] mineru-api.service已经启动")
  215. return True
  216. # 启动服务
  217. result = subprocess.run(
  218. ["systemctl", "start", "mineru-api.service"],
  219. capture_output=True,
  220. text=True,
  221. timeout=30,
  222. check=False
  223. )
  224. if result.returncode == 0:
  225. logger.info("[PaddleOCR] 成功启动mineru-api.service")
  226. return True
  227. else:
  228. logger.warning(f"[PaddleOCR] 启动mineru-api.service失败: {result.stderr}")
  229. return False
  230. except Exception as e:
  231. logger.warning(f"[PaddleOCR] 启动mineru-api.service时出错: {e}")
  232. return False
  233. finally:
  234. _release_service_lock(lock_file)
  235. def detect_file_type(file_path: str) -> Optional[str]:
  236. """通过文件内容(魔数)检测文件类型,不依赖扩展名
  237. Args:
  238. file_path: 文件路径
  239. Returns:
  240. 文件类型:'pdf', 'png', 'jpeg', 'jpg' 或 None
  241. """
  242. if not file_path or not os.path.exists(file_path):
  243. return None
  244. try:
  245. with open(file_path, 'rb') as f:
  246. # 读取文件头部(前16字节足够识别常见格式)
  247. header = f.read(16)
  248. if not header:
  249. return None
  250. # PDF文件:以 %PDF 开头
  251. if header.startswith(b'%PDF'):
  252. return 'pdf'
  253. # PNG图片:以 \x89PNG\r\n\x1a\n 开头
  254. if header.startswith(b'\x89PNG\r\n\x1a\n'):
  255. return 'png'
  256. # JPEG图片:以 \xff\xd8\xff 开头
  257. if header.startswith(b'\xff\xd8\xff'):
  258. return 'jpeg'
  259. # 其他格式可以继续扩展
  260. return None
  261. except Exception as e:
  262. logger.debug(f"[PaddleOCR备用] 检测文件类型失败: {e}")
  263. return None
  264. def check_json_data_completeness(json_data: Dict[str, Any], document_type: str) -> bool:
  265. """检查JSON数据是否大面积缺失
  266. Args:
  267. json_data: 解析后的JSON数据
  268. document_type: 文档类型
  269. Returns:
  270. True表示数据完整,False表示数据缺失
  271. """
  272. if not json_data or "data" not in json_data:
  273. return False
  274. data = json_data["data"]
  275. # 根据文档类型检查关键字段
  276. if document_type == "noiseMonitoringRecord":
  277. # 检查噪声检测记录的关键字段(不包括noise数组,noise数组由表格解析生成,不依赖OCR)
  278. required_fields = ["project", "standardReferences", "soundLevelMeterMode", "soundCalibratorMode"]
  279. missing_count = sum(1 for field in required_fields if not data.get(field))
  280. # 如果超过一半的关键字段缺失,认为数据缺失
  281. if missing_count >= len(required_fields) / 2:
  282. logger.warning(f"[数据完整性检查] 关键字段缺失过多: {missing_count}/{len(required_fields)}")
  283. return False
  284. # 检查天气字段是否异常(例如解析成“天气”标签或风向全部缺失)
  285. weather_list = data.get("weather") or []
  286. if weather_list:
  287. weather_label_tokens = {"天气", "天气状况", "天气情况"}
  288. has_label_as_value = any(
  289. (item.get("weather") or "").strip() in weather_label_tokens for item in weather_list
  290. )
  291. all_wind_direction_missing = all(
  292. not (item.get("windDirection") or "").strip() for item in weather_list
  293. )
  294. if has_label_as_value:
  295. logger.warning("[数据完整性检查] 天气字段疑似被解析为标签,触发备用解析")
  296. return False
  297. if all_wind_direction_missing:
  298. logger.warning("[数据完整性检查] 风向字段全部缺失,触发备用解析")
  299. return False
  300. return True
  301. elif document_type == "electromagneticTestRecord":
  302. # 检查电磁检测记录的关键字段
  303. # 区分必需字段和可选字段:
  304. # - deviceName 和 deviceMode 是必需字段(仪器信息)
  305. # - project 和 standardReferences 可能为空(某些文档可能没有填写)
  306. required_fields = ["deviceName", "deviceMode"] # 必需字段
  307. optional_fields = ["project", "standardReferences"] # 可选字段
  308. # 检查必需字段
  309. missing_required = sum(1 for field in required_fields if not data.get(field) or not str(data.get(field)).strip())
  310. # 检查可选字段(如果所有可选字段都为空,也算缺失)
  311. missing_optional = sum(1 for field in optional_fields if not data.get(field) or not str(data.get(field)).strip())
  312. # 检查电磁数据
  313. em_list = data.get("electricMagnetic", [])
  314. if len(em_list) == 0:
  315. logger.warning("[数据完整性检查] 电磁数据列表为空")
  316. return False
  317. # 如果必需字段缺失,认为数据不完整
  318. if missing_required > 0:
  319. logger.warning(f"[数据完整性检查] 必需字段缺失: {missing_required}/{len(required_fields)} (deviceName, deviceMode)")
  320. return False
  321. # 如果所有字段(必需+可选)都缺失,也认为数据不完整
  322. if missing_required + missing_optional >= len(required_fields) + len(optional_fields):
  323. logger.warning(f"[数据完整性检查] 所有关键字段都缺失: {missing_required + missing_optional}/{len(required_fields) + len(optional_fields)}")
  324. return False
  325. # 检查project和address字段:如果project为空且所有address都为空,说明minerU和Paddle doc_parser都丢失了,需要运行Paddle OCR
  326. project_empty = not data.get("project") or not str(data.get("project")).strip()
  327. if project_empty:
  328. # 检查所有电磁数据项的address字段是否都为空
  329. all_address_empty = True
  330. for em_item in em_list:
  331. address = em_item.get("address", "")
  332. if address and str(address).strip():
  333. all_address_empty = False
  334. break
  335. if all_address_empty:
  336. logger.warning("[数据完整性检查] project为空且所有address字段都为空,说明minerU和Paddle doc_parser都丢失了,需要运行Paddle OCR")
  337. return False
  338. return True
  339. elif document_type == "operatingConditionInfo":
  340. # 检查工况信息
  341. op_list = data.get("operationalConditions", [])
  342. if len(op_list) == 0:
  343. logger.warning("[数据完整性检查] 工况信息列表为空")
  344. return False
  345. return True
  346. # 未知类型,默认认为完整
  347. return True
  348. def parse_paddleocr_output(output_text: str) -> Dict[str, Any]:
  349. """解析paddleocr的输出文本
  350. Args:
  351. output_text: paddleocr命令的输出文本
  352. Returns:
  353. 解析后的字典,包含parsing_res_list
  354. """
  355. try:
  356. # 清理输出文本,移除可能的额外空白
  357. output_text = output_text.strip()
  358. # 尝试直接eval(因为输出是Python字典格式)
  359. # 先处理np.float32等numpy类型
  360. output_text = output_text.replace('np.float32', 'float')
  361. output_text = output_text.replace('np.int32', 'int')
  362. output_text = output_text.replace('np.int64', 'int')
  363. # 尝试使用ast.literal_eval安全解析
  364. try:
  365. result = ast.literal_eval(output_text)
  366. except (ValueError, SyntaxError):
  367. # 如果literal_eval失败,尝试使用eval(不推荐,但paddleocr输出可能需要)
  368. logger.warning("[PaddleOCR解析] literal_eval失败,尝试使用eval")
  369. # 创建一个安全的eval环境
  370. safe_dict = {"__builtins__": {}}
  371. result = eval(output_text, safe_dict)
  372. if isinstance(result, dict):
  373. # 检查是否有res键
  374. if "res" in result:
  375. parsing_res_list = result.get("res", {}).get("parsing_res_list", [])
  376. return {"parsing_res_list": parsing_res_list}
  377. # 也可能直接包含parsing_res_list
  378. elif "parsing_res_list" in result:
  379. return {"parsing_res_list": result.get("parsing_res_list", [])}
  380. return {"parsing_res_list": []}
  381. except Exception as e:
  382. logger.error(f"[PaddleOCR解析] 解析输出失败: {e}")
  383. logger.debug(f"[PaddleOCR解析] 输出内容: {output_text[:500]}")
  384. return {"parsing_res_list": []}
  385. def paddleocr_to_markdown(paddleocr_result: Dict[str, Any]) -> str:
  386. """将paddleocr的解析结果转换为markdown格式
  387. Args:
  388. paddleocr_result: paddleocr解析结果
  389. Returns:
  390. markdown格式的文本
  391. """
  392. markdown_parts = []
  393. parsing_res_list = paddleocr_result.get("parsing_res_list", [])
  394. for item in parsing_res_list:
  395. block_label = item.get("block_label", "")
  396. block_content = item.get("block_content", "")
  397. if block_label == "table":
  398. # 表格直接使用HTML格式
  399. markdown_parts.append(block_content)
  400. elif block_label in ["header", "title", "figure_title"]:
  401. # 标题使用markdown标题格式
  402. markdown_parts.append(f"# {block_content}")
  403. elif block_label == "text":
  404. # 普通文本
  405. markdown_parts.append(block_content)
  406. else:
  407. # 其他类型直接添加内容
  408. markdown_parts.append(block_content)
  409. return "\n\n".join(markdown_parts)
  410. def call_paddleocr(image_path: str) -> Optional[Dict[str, Any]]:
  411. """调用paddleocr命令解析图片
  412. Args:
  413. image_path: 图片路径
  414. Returns:
  415. paddleocr解析结果,如果失败返回None
  416. """
  417. # 在调用PaddleOCR前停止mineru服务以释放GPU内存
  418. mineru_stopped = stop_mineru_service()
  419. try:
  420. # 检查图片文件是否存在
  421. if not os.path.exists(image_path):
  422. logger.error(f"[PaddleOCR] 图片文件不存在: {image_path}")
  423. return None
  424. # 生成输出目录和基础文件名
  425. image_dir = os.path.dirname(image_path)
  426. image_basename = os.path.splitext(os.path.basename(image_path))[0]
  427. save_path_base = os.path.join(image_dir, image_basename)
  428. # 构建paddleocr命令,添加所有参数(NPU 下需加 --device npu:0,否则走 CPU 易段错误)
  429. # PaddleOCR会在save_path下创建目录,文件保存在该目录内
  430. cmd = [
  431. "paddleocr", "doc_parser", "-i", image_path,
  432. "--precision", "fp32",
  433. "--use_doc_unwarping", "False",
  434. "--use_doc_orientation_classify", "True",
  435. "--use_chart_recognition", "True",
  436. "--save_path", save_path_base
  437. ] + _paddle_ocr_device_args()
  438. # 设置环境变量,限制GPU内存使用
  439. # env = os.environ.copy()
  440. # 设置PaddlePaddle的GPU内存分配策略,使用更保守的内存分配
  441. # env["FLAGS_fraction_of_gpu_memory_to_use"] = "0.3" # 只使用30%的GPU内存
  442. # env["FLAGS_allocator_strategy"] = "auto_growth" # 使用自动增长策略,避免一次性分配过多内存
  443. logger.info(f"[PaddleOCR] 执行命令: {' '.join(cmd)}")
  444. # 执行命令
  445. result = subprocess.run(
  446. cmd,
  447. capture_output=True,
  448. text=True,
  449. timeout=300, # 5分钟超时
  450. check=False,
  451. )
  452. if result.returncode != 0:
  453. logger.error(f"[PaddleOCR] 命令执行失败,返回码: {result.returncode}")
  454. logger.error(f"[PaddleOCR] 错误输出: {result.stderr}")
  455. return None
  456. # 从保存的Markdown文件中读取结果
  457. # PaddleOCR会在save_path下创建目录,文件路径为: {save_path}/{basename}.md
  458. md_file = os.path.join(save_path_base, f"{image_basename}.md")
  459. if os.path.exists(md_file):
  460. logger.info(f"[PaddleOCR] 从Markdown文件读取结果: {md_file}")
  461. try:
  462. with open(md_file, 'r', encoding='utf-8') as f:
  463. markdown_content = f.read()
  464. if markdown_content.strip():
  465. # 将markdown内容转换为标准格式
  466. # 为了兼容现有代码,我们需要将markdown转换回parsing_res_list格式
  467. # 但实际上,我们可以直接返回markdown内容,让调用方处理
  468. # 这里我们返回一个特殊标记,表示这是markdown格式
  469. logger.info(f"[PaddleOCR] 成功读取Markdown文件,内容长度: {len(markdown_content)} 字符")
  470. # 返回markdown内容,使用特殊键标记
  471. return {"markdown_content": markdown_content}
  472. else:
  473. logger.warning("[PaddleOCR] Markdown文件内容为空")
  474. except Exception as e:
  475. logger.exception(f"[PaddleOCR] 读取Markdown文件失败: {e}")
  476. else:
  477. logger.warning(f"[PaddleOCR] Markdown文件不存在: {md_file}")
  478. # 如果Markdown文件不存在或读取失败,尝试从stdout解析
  479. output_text = result.stdout.strip()
  480. if output_text:
  481. logger.info("[PaddleOCR] 从stdout解析输出")
  482. parsed_result = parse_paddleocr_output(output_text)
  483. logger.info(f"[PaddleOCR] 解析成功,获得 {len(parsed_result.get('parsing_res_list', []))} 个区块")
  484. return parsed_result
  485. else:
  486. logger.warning("[PaddleOCR] stdout输出为空,且未找到Markdown文件")
  487. return None
  488. except subprocess.TimeoutExpired:
  489. logger.error("[PaddleOCR] 命令执行超时")
  490. return None
  491. except Exception as e:
  492. logger.exception(f"[PaddleOCR] 调用失败: {e}")
  493. return None
  494. finally:
  495. # 无论成功或失败,都尝试重启mineru服务
  496. if mineru_stopped:
  497. start_mineru_service()
  498. def extract_first_page_from_pdf(pdf_path: str, output_dir: str) -> Optional[str]:
  499. """从PDF文件中提取第一页作为图片
  500. 优先使用pypdfium2,如果不可用则使用pdf2image作为后备方案。
  501. Args:
  502. pdf_path: PDF文件路径
  503. output_dir: 输出目录,用于保存提取的图片
  504. Returns:
  505. 提取的图片路径,如果失败返回None
  506. """
  507. if not PIL_AVAILABLE:
  508. logger.error("[PaddleOCR备用] 缺少必要的库(PIL/Pillow),无法处理图片")
  509. return None
  510. if not os.path.exists(pdf_path):
  511. logger.error(f"[PaddleOCR备用] PDF文件不存在: {pdf_path}")
  512. return None
  513. # 方法1: 尝试使用pypdfium2(优先方法)
  514. if PDFIUM_AVAILABLE:
  515. try:
  516. pdf = pdfium.PdfDocument(pdf_path)
  517. try:
  518. if len(pdf) == 0:
  519. logger.error("[PaddleOCR备用] PDF文件为空")
  520. return None
  521. page = pdf[0]
  522. bitmap = page.render(scale=150 / 72) # 150 DPI
  523. pil_image = bitmap.to_pil()
  524. os.makedirs(output_dir, exist_ok=True)
  525. image_filename = f"paddleocr_fallback_page0_{int(time.time() * 1000)}_{random.randint(1000, 9999)}.png"
  526. image_path = os.path.join(output_dir, image_filename)
  527. pil_image.save(image_path, "PNG", optimize=True, compress_level=6)
  528. logger.info(f"[PaddleOCR备用] 使用pypdfium2从PDF提取第一页图片: {image_path}")
  529. bitmap.close()
  530. return image_path
  531. finally:
  532. try:
  533. pdf.close()
  534. except Exception:
  535. pass
  536. except Exception as e:
  537. logger.warning(f"[PaddleOCR备用] 使用pypdfium2提取图片失败,尝试pdf2image: {e}")
  538. # 方法2: 使用 pdf2image 作为后备方案
  539. if PDF2IMAGE_AVAILABLE:
  540. try:
  541. images = convert_from_path(pdf_path, dpi=150, first_page=1, last_page=1)
  542. if not images:
  543. logger.error("[PaddleOCR备用] pdf2image未能提取到图片")
  544. return None
  545. os.makedirs(output_dir, exist_ok=True)
  546. image_filename = f"paddleocr_fallback_page0_{int(time.time() * 1000)}_{random.randint(1000, 9999)}.png"
  547. image_path = os.path.join(output_dir, image_filename)
  548. images[0].save(image_path, "PNG", optimize=True, compress_level=6)
  549. logger.info(f"[PaddleOCR备用] 使用pdf2image从PDF提取第一页图片: {image_path}")
  550. return image_path
  551. except Exception as e:
  552. logger.exception(f"[PaddleOCR备用] 使用pdf2image提取图片失败: {e}")
  553. # 如果两种方法都不可用
  554. missing_libs = []
  555. if not PDFIUM_AVAILABLE:
  556. missing_libs.append("pypdfium2")
  557. if not PDF2IMAGE_AVAILABLE:
  558. missing_libs.append("pdf2image")
  559. logger.error(
  560. f"[PaddleOCR备用] 缺少必要的库({'或'.join(missing_libs)}),无法从PDF提取图片。请安装: pip install {' '.join(missing_libs)}"
  561. )
  562. return None
  563. def find_pdf_file(output_dir: str) -> Optional[str]:
  564. """在输出目录中查找PDF文件
  565. Args:
  566. output_dir: 输出目录
  567. Returns:
  568. PDF文件路径,如果未找到返回None
  569. """
  570. if not os.path.exists(output_dir):
  571. return None
  572. # 查找PDF文件
  573. pdf_files = list(Path(output_dir).rglob("*.pdf"))
  574. if pdf_files:
  575. # 返回第一个找到的PDF文件
  576. return str(pdf_files[0])
  577. return None
  578. def markdown_to_plain_text(markdown_content: str) -> List[str]:
  579. """将Markdown内容转换为纯文本列表(按行分割)
  580. Args:
  581. markdown_content: Markdown格式的文本
  582. Returns:
  583. 纯文本列表,每行一个元素
  584. """
  585. if not markdown_content:
  586. return []
  587. lines = []
  588. in_code_block = False
  589. # 先处理HTML表格:提取整个表格,转换为文本行
  590. # 查找所有<table>...</table>块
  591. table_pattern = r'<table[^>]*>.*?</table>'
  592. tables = re.findall(table_pattern, markdown_content, re.DOTALL)
  593. # 将表格内容替换为占位符,稍后处理
  594. table_placeholders = []
  595. for i, table in enumerate(tables):
  596. placeholder = f"__TABLE_PLACEHOLDER_{i}__"
  597. table_placeholders.append((placeholder, table))
  598. markdown_content = markdown_content.replace(table, placeholder, 1)
  599. # 处理每一行
  600. for line in markdown_content.split('\n'):
  601. line = line.rstrip() # 只移除右侧空格
  602. # 检测代码块
  603. if line.strip().startswith('```'):
  604. in_code_block = not in_code_block
  605. continue
  606. if in_code_block:
  607. # 代码块内的内容保留原样
  608. if line.strip():
  609. lines.append(line)
  610. continue
  611. # 处理表格占位符
  612. if '__TABLE_PLACEHOLDER_' in line:
  613. # 找到对应的表格
  614. for placeholder, table_html in table_placeholders:
  615. if placeholder in line:
  616. # 提取表格中的所有单元格文本
  617. table_lines = extract_table_text(table_html)
  618. lines.extend(table_lines)
  619. break
  620. continue
  621. # 检测Markdown表格(以 | 开头)
  622. if '|' in line and line.strip().startswith('|'):
  623. # 处理表格行:移除首尾的 |,分割单元格
  624. cells = [cell.strip() for cell in line.split('|') if cell.strip()]
  625. # 移除表格分隔行(只包含 - 和 |)
  626. if all(c in ['-', ':', ' '] for c in ''.join(cells)):
  627. continue
  628. # 合并单元格内容,用空格分隔
  629. table_line = ' '.join(cells)
  630. if table_line.strip():
  631. lines.append(table_line)
  632. continue
  633. # 移除Markdown语法标记
  634. # 移除标题标记 (# ## ### 等)
  635. line = re.sub(r'^#+\s*', '', line)
  636. # 移除列表标记 (- * + 等)
  637. line = re.sub(r'^[-*+]\s+', '', line)
  638. # 移除数字列表标记
  639. line = re.sub(r'^\d+\.\s+', '', line)
  640. # 移除粗体和斜体标记
  641. line = re.sub(r'\*\*([^*]+)\*\*', r'\1', line) # **bold**
  642. line = re.sub(r'\*([^*]+)\*', r'\1', line) # *italic*
  643. line = re.sub(r'__([^_]+)__', r'\1', line) # __bold__
  644. line = re.sub(r'_([^_]+)_', r'\1', line) # _italic_
  645. # 移除链接格式 [text](url) -> text
  646. line = re.sub(r'\[([^\]]+)\]\([^\)]+\)', r'\1', line)
  647. # 移除图片格式 ![alt](url) -> alt
  648. line = re.sub(r'!\[([^\]]*)\]\([^\)]+\)', r'\1', line)
  649. # 移除行内代码标记
  650. line = re.sub(r'`([^`]+)`', r'\1', line)
  651. # 移除HTML标签(div、span等)
  652. line = re.sub(r'<div[^>]*>', '', line)
  653. line = re.sub(r'</div>', '', line)
  654. line = re.sub(r'<span[^>]*>', '', line)
  655. line = re.sub(r'</span>', '', line)
  656. line = re.sub(r'<[^>]+>', '', line) # 移除其他HTML标签
  657. # 清理多余空格
  658. line = line.strip()
  659. if line: # 只保留非空行
  660. lines.append(line)
  661. return lines
  662. def extract_table_text(table_html: str) -> List[str]:
  663. """从HTML表格中提取文本,每行一个元素
  664. Args:
  665. table_html: HTML表格字符串
  666. Returns:
  667. 文本行列表
  668. """
  669. table_lines = []
  670. try:
  671. # 提取所有<tr>标签
  672. tr_pattern = r'<tr[^>]*>(.*?)</tr>'
  673. tr_matches = re.findall(tr_pattern, table_html, re.DOTALL)
  674. for tr_content in tr_matches:
  675. # 提取所有<td>和<th>标签内的文本
  676. cell_pattern = r'<(?:td|th)[^>]*>(.*?)</(?:td|th)>'
  677. cells = re.findall(cell_pattern, tr_content, re.DOTALL)
  678. if cells:
  679. # 清理每个单元格的文本
  680. cleaned_cells = []
  681. for cell in cells:
  682. # 移除嵌套的HTML标签
  683. cleaned = re.sub(r'<[^>]+>', '', cell)
  684. # 移除HTML实体
  685. cleaned = cleaned.replace('&nbsp;', ' ')
  686. cleaned = cleaned.strip()
  687. if cleaned:
  688. cleaned_cells.append(cleaned)
  689. if cleaned_cells:
  690. # 合并单元格内容,用空格分隔
  691. table_line = ' '.join(cleaned_cells)
  692. if table_line.strip():
  693. table_lines.append(table_line)
  694. except Exception as e:
  695. logger.warning(f"[Markdown转换] 提取表格文本失败: {e}")
  696. return table_lines
def call_paddleocr_ocr(image_path: str, save_path: str) -> tuple[Optional[List[str]], Optional[str]]:
    """Run the `paddleocr ocr` CLI on an image and collect the recognized text (API-facing).

    Args:
        image_path: Path to the input image file.
        save_path: Directory where PaddleOCR writes its result files.

    Returns:
        ``(texts, json_path)`` where ``texts`` is the list of recognized text
        fragments and ``json_path`` the result JSON written by PaddleOCR.
        ``(None, None)`` when the image is missing, the command fails/times
        out, or no JSON was produced; ``(None, json_path)`` when the JSON
        exists but holds no usable text fields or cannot be read.
    """
    # Stop the mineru service before invoking PaddleOCR to free GPU memory.
    mineru_stopped = stop_mineru_service()
    try:
        if not os.path.exists(image_path):
            logger.error(f"[PaddleOCR OCR] 图片文件不存在: {image_path}")
            return None, None
        # Build the `paddleocr ocr` command. On NPU the explicit device args
        # (`--device npu:0`) are required; the CPU fallback tends to segfault.
        cmd = ["paddleocr", "ocr", "-i", image_path, "--save_path", save_path] + _paddle_ocr_device_args()
        logger.info(f"[PaddleOCR OCR] 执行命令: {' '.join(cmd)}")
        # Run the command without raising on non-zero exit (checked manually).
        result = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            timeout=300,  # 5-minute timeout
            check=False,
        )
        if result.returncode != 0:
            logger.error(f"[PaddleOCR OCR] 命令执行失败,返回码: {result.returncode}")
            logger.error(f"[PaddleOCR OCR] 错误输出: {result.stderr}")
            return None, None
        # Locate the saved JSON file: the OCR command writes
        # {basename}_res.json directly under save_path.
        image_basename = os.path.splitext(os.path.basename(image_path))[0]
        json_file = os.path.join(save_path, f"{image_basename}_res.json")
        if not os.path.exists(json_file):
            logger.warning(f"[PaddleOCR OCR] JSON文件不存在: {json_file}")
            return None, None
        # Read and interpret the JSON file.
        try:
            with open(json_file, 'r', encoding='utf-8') as f:
                ocr_data = json.load(f)
            # Prefer the flat `rec_texts` list when present.
            if "rec_texts" in ocr_data and isinstance(ocr_data["rec_texts"], list):
                texts = ocr_data["rec_texts"]
                logger.info(f"[PaddleOCR OCR] 成功提取 {len(texts)} 个文本片段(从rec_texts)")
                return texts, json_file
            # Otherwise fall back to block_content entries in parsing_res_list.
            if "parsing_res_list" in ocr_data and isinstance(ocr_data["parsing_res_list"], list):
                texts = []
                for item in ocr_data["parsing_res_list"]:
                    if isinstance(item, dict) and "block_content" in item:
                        block_content = item["block_content"]
                        if block_content and block_content.strip():
                            # Split multi-line block content into single lines.
                            if "\n" in block_content:
                                texts.extend([line.strip() for line in block_content.split("\n") if line.strip()])
                            else:
                                texts.append(block_content.strip())
                if texts:
                    logger.info(f"[PaddleOCR OCR] 成功提取 {len(texts)} 个文本片段(从parsing_res_list)")
                    return texts, json_file
            logger.warning("[PaddleOCR OCR] JSON文件中未找到rec_texts或parsing_res_list字段")
            return None, json_file
        except Exception as e:
            logger.exception(f"[PaddleOCR OCR] 读取JSON文件失败: {e}")
            return None, json_file
    except subprocess.TimeoutExpired:
        logger.error("[PaddleOCR OCR] 命令执行超时")
        return None, None
    except Exception as e:
        logger.exception(f"[PaddleOCR OCR] 调用失败: {e}")
        return None, None
    finally:
        # Restart the mineru service regardless of success or failure.
        if mineru_stopped:
            start_mineru_service()
  772. def call_paddleocr_doc_parser_for_text(image_path: str, save_path: str) -> tuple[Optional[List[str]], Optional[str]]:
  773. """调用paddleocr doc_parser命令,将markdown转换为纯文本(用于内部调用提取关键词)
  774. Args:
  775. image_path: 图片路径
  776. save_path: 保存路径(目录)
  777. Returns:
  778. (纯文本列表(按行分割), markdown文件路径),如果失败返回(None, None)
  779. """
  780. try:
  781. if not os.path.exists(image_path):
  782. logger.error(f"[PaddleOCR DocParser] 图片文件不存在: {image_path}")
  783. return None, None
  784. # 生成输出目录和基础文件名
  785. image_dir = os.path.dirname(image_path)
  786. image_basename = os.path.splitext(os.path.basename(image_path))[0]
  787. save_path_base = os.path.join(save_path, image_basename)
  788. os.makedirs(save_path_base, exist_ok=True)
  789. # 构建paddleocr doc_parser命令(NPU 下需加 --device npu:0,否则走 CPU 易段错误)
  790. cmd = [
  791. "paddleocr", "doc_parser", "-i", image_path,
  792. "--precision", "fp32",
  793. "--use_doc_unwarping", "False",
  794. "--use_doc_orientation_classify", "True",
  795. "--use_chart_recognition", "True",
  796. "--save_path", save_path_base
  797. ] + _paddle_ocr_device_args()
  798. logger.info(f"[PaddleOCR DocParser] 执行命令: {' '.join(cmd)}")
  799. # 执行命令
  800. result = subprocess.run(
  801. cmd,
  802. capture_output=True,
  803. text=True,
  804. timeout=300, # 5分钟超时
  805. check=False,
  806. )
  807. if result.returncode != 0:
  808. logger.error(f"[PaddleOCR DocParser] 命令执行失败,返回码: {result.returncode}")
  809. logger.error(f"[PaddleOCR DocParser] 错误输出: {result.stderr}")
  810. return None, None
  811. # 查找保存的Markdown文件
  812. # PaddleOCR会在save_path下创建目录,文件路径为: {save_path}/{basename}.md
  813. md_file = os.path.join(save_path_base, f"{image_basename}.md")
  814. # 也可能在子目录中
  815. if not os.path.exists(md_file):
  816. md_files = sorted(Path(save_path_base).rglob("*.md"))
  817. if md_files:
  818. md_file = str(md_files[0])
  819. logger.info(f"[PaddleOCR DocParser] 在子目录中找到Markdown文件: {md_file}")
  820. if not os.path.exists(md_file):
  821. logger.warning(f"[PaddleOCR DocParser] Markdown文件不存在: {md_file}")
  822. return None, None
  823. # 读取Markdown文件并转换为纯文本
  824. try:
  825. with open(md_file, 'r', encoding='utf-8') as f:
  826. markdown_content = f.read()
  827. if not markdown_content.strip():
  828. logger.warning("[PaddleOCR DocParser] Markdown文件内容为空")
  829. return [], md_file
  830. # 将Markdown转换为纯文本列表
  831. plain_text_lines = markdown_to_plain_text(markdown_content)
  832. logger.info(f"[PaddleOCR DocParser] 成功提取 {len(plain_text_lines)} 行纯文本,Markdown文件: {md_file}")
  833. return plain_text_lines, md_file
  834. except Exception as e:
  835. logger.exception(f"[PaddleOCR DocParser] 读取Markdown文件失败: {e}")
  836. return None, md_file
  837. except subprocess.TimeoutExpired:
  838. logger.error("[PaddleOCR DocParser] 命令执行超时")
  839. return None, None
  840. except Exception as e:
  841. logger.exception(f"[PaddleOCR DocParser] 调用失败: {e}")
  842. return None, None
def extract_keywords_from_ocr_texts(ocr_texts: List[str]) -> Dict[str, Any]:
    """Extract key fields from a list of OCR text fragments.

    Heuristically scans the fragments (and their space-joined concatenation)
    with regexes tuned to Chinese noise/EMF monitoring record sheets.

    Args:
        ocr_texts: Text fragments recognized by OCR, in reading order.

    Returns:
        Dict with the extracted fields: ``project``, ``standardReferences``,
        ``soundLevelMeterMode``, ``soundCalibratorMode``,
        ``calibrationValueBefore``, ``calibrationValueAfter``,
        ``weather_info`` (list of per-date dicts) and ``address_mapping``
        (monitoring-point code -> address, used for EMF records).
    """
    keywords = {
        "project": "",
        "standardReferences": "",
        "soundLevelMeterMode": "",
        "soundCalibratorMode": "",
        "calibrationValueBefore": "",
        "calibrationValueAfter": "",
        "weather_info": [],  # collected weather records
        "address_mapping": {}  # code -> address mapping, for EMF monitoring records
    }
    if not ocr_texts:
        return keywords
    # Join all fragments for whole-text matching.
    full_text = " ".join(ocr_texts)
    # --- Project name ---
    # First try the inline "项目名称:" (project name) label form.
    project_match = re.search(r'项目名称[::]([^检测依据声级计声校准器检测前检测后气象条件日期]+)', full_text)
    if project_match:
        project = project_match.group(1).strip()
        # Trim any trailing "检测依据" (test basis) section that leaked in.
        project = re.sub(r'检测依据.*$', '', project).strip()
        keywords["project"] = project
        logger.debug(f"[关键词提取] 提取到项目名称: {project}")
    else:
        # Fall back: locate the "项目名称" label and inspect nearby fragments.
        for i, text in enumerate(ocr_texts):
            if "项目名称" in text:
                # Value may follow a colon in the same fragment.
                if ":" in text or ":" in text:
                    project_match = re.search(r'项目名称[::]([^检测依据声级计声校准器检测前检测后气象条件日期]+)', text)
                    if project_match:
                        project = project_match.group(1).strip()
                        project = re.sub(r'检测依据.*$', '', project).strip()
                        if project:
                            keywords["project"] = project
                            logger.debug(f"[关键词提取] 从当前文本提取到项目名称: {project}")
                            break
                # Fragment holds only the label; look at the next fragments.
                elif text.strip() == "项目名称" or text.strip().startswith("项目名称"):
                    # Scan up to 2 following fragments for the value.
                    for j in range(i + 1, min(i + 3, len(ocr_texts))):
                        next_text = ocr_texts[j].strip()
                        # Skip other field labels; a candidate must contain Chinese.
                        if next_text and not re.match(r'^(检测依据|监测依据|检查依据|声级计|声校准器|检测前|检测后|气象条件|日期)', next_text):
                            # Project names are normally Chinese text.
                            if re.search(r'[\u4e00-\u9fa5]', next_text):
                                # Cut off at the next field label.
                                project = re.sub(r'(检测依据|监测依据|检查依据).*$', '', next_text).strip()
                                if project:
                                    keywords["project"] = project
                                    logger.debug(f"[关键词提取] 从后续文本提取到项目名称: {project}")
                                    break
                    if keywords["project"]:
                        break
    # --- Test basis / standard references ---
    standard_match = re.search(r'检测依据[::]([^声级计声校准器检测前检测后气象条件日期]+)', full_text)
    if standard_match:
        standard = standard_match.group(1).strip()
        # Prefer explicit GB standard numbers when present.
        gb_standards = re.findall(r'GB\s*\d+[-\.]?\d*[-\.]?\d*', standard)
        if gb_standards:
            keywords["standardReferences"] = " ".join(gb_standards)
        else:
            keywords["standardReferences"] = standard.replace("□其他:", "").strip()
        logger.debug(f"[关键词提取] 提取到检测依据: {keywords['standardReferences']}")
    # --- Sound level meter model/serial ---
    sound_meter_match = re.search(r'声级计型号[/::]?(?:编号)?[::]\s*([A-Z0-9+/]+)', full_text)
    if sound_meter_match:
        keywords["soundLevelMeterMode"] = sound_meter_match.group(1).strip()
        logger.debug(f"[关键词提取] 提取到声级计型号: {keywords['soundLevelMeterMode']}")
    # --- Sound calibrator model/serial ---
    calibrator_match = re.search(r'声校准器型号[/::]?(?:编号)?[::]\s*([A-Z0-9+/]+)', full_text)
    if calibrator_match:
        keywords["soundCalibratorMode"] = calibrator_match.group(1).strip()
        logger.debug(f"[关键词提取] 提取到声校准器型号: {keywords['soundCalibratorMode']}")
    # --- Calibration values ---
    # Strategy: try exact field-label matches first (pass 1); if that fails,
    # fall back to positional order — first dB(A) value is "before", second
    # is "after" (pass 2).
    before_cal_found = False
    after_cal_found = False
    # Pass 1: exact match on the field labels.
    for i, text in enumerate(ocr_texts):
        if "检测前校准值" in text and not before_cal_found:
            # Value in the same fragment (e.g. "检测前校准值:93.8 dB(A)").
            before_cal_match = re.search(r'检测前校准值[::]\s*([0-9.]+)\s*dB[((]?A[))]?', text)
            if before_cal_match:
                cal_value = before_cal_match.group(1).strip()
                keywords["calibrationValueBefore"] = f"{cal_value} dB(A)"
                logger.debug(f"[关键词提取] 提取到检测前校准值: {keywords['calibrationValueBefore']}")
                before_cal_found = True
                continue
            # Fragment holds only the label (e.g. "检测前校准值:"); look in
            # the following fragments.
            elif re.search(r'检测前校准值[::]\s*$', text) or (text.strip() == "检测前校准值:"):
                # Scan up to 3 subsequent fragments for a dB(A) value.
                for j in range(i + 1, min(i + 4, len(ocr_texts))):
                    next_text = ocr_texts[j]
                    # e.g. "93.8dB(A)"
                    db_match = re.search(r'([0-9.]+)\s*dB[((]?A[))]?', next_text)
                    if db_match:
                        cal_value = db_match.group(1).strip()
                        keywords["calibrationValueBefore"] = f"{cal_value} dB(A)"
                        logger.debug(f"[关键词提取] 从相邻文本提取到检测前校准值: {keywords['calibrationValueBefore']}")
                        before_cal_found = True
                        break
                if before_cal_found:
                    continue
        if "检测后校准值" in text and not after_cal_found:
            # Value after the label in the same fragment
            # (e.g. "检测后校准值:93.8 dB(A)").
            after_cal_match = re.search(r'检测后校准值[::]\s*([0-9.]+)\s*dB[((]?A[))]?', text)
            if after_cal_match:
                cal_value = after_cal_match.group(1).strip()
                keywords["calibrationValueAfter"] = f"{cal_value} dB(A)"
                logger.debug(f"[关键词提取] 提取到检测后校准值: {keywords['calibrationValueAfter']}")
                after_cal_found = True
                continue
            # Value BEFORE the label (e.g. "93.8dB(A)检测后校准值:").
            elif re.search(r'([0-9.]+)\s*dB[((]?A[))]?\s*检测后校准值', text):
                db_match = re.search(r'([0-9.]+)\s*dB[((]?A[))]?', text)
                if db_match:
                    cal_value = db_match.group(1).strip()
                    keywords["calibrationValueAfter"] = f"{cal_value} dB(A)"
                    logger.debug(f"[关键词提取] 从同一文本提取到检测后校准值: {keywords['calibrationValueAfter']}")
                    after_cal_found = True
                    continue
            # Fragment holds only the label; look in the following fragments.
            elif re.search(r'检测后校准值[::]\s*$', text) or (text.strip() == "检测后校准值:"):
                # Scan up to 3 subsequent fragments for a dB(A) value.
                for j in range(i + 1, min(i + 4, len(ocr_texts))):
                    next_text = ocr_texts[j]
                    db_match = re.search(r'([0-9.]+)\s*dB[((]?A[))]?', next_text)
                    if db_match:
                        cal_value = db_match.group(1).strip()
                        keywords["calibrationValueAfter"] = f"{cal_value} dB(A)"
                        logger.debug(f"[关键词提取] 从相邻文本提取到检测后校准值: {keywords['calibrationValueAfter']}")
                        after_cal_found = True
                        break
                if after_cal_found:
                    continue
    # Pass 2: positional fallback — collect every dB(A) value in order.
    if not before_cal_found or not after_cal_found:
        db_a_matches = []  # (fragment index, value, fragment) per dB(A) hit
        for i, text in enumerate(ocr_texts):
            db_matches = re.finditer(r'([0-9.]+)\s*dB[((]?A[))]?', text)
            for match in db_matches:
                cal_value = match.group(1).strip()
                db_a_matches.append((i, cal_value, text))
        # First dB(A) value is taken as the "before" calibration value.
        if db_a_matches and not before_cal_found:
            first_cal_value = db_a_matches[0][1]
            keywords["calibrationValueBefore"] = f"{first_cal_value} dB(A)"
            logger.debug(f"[关键词提取] 按出现顺序提取到检测前校准值(第一个dB(A)): {keywords['calibrationValueBefore']}")
            before_cal_found = True
        # Second dB(A) value is taken as the "after" calibration value.
        if len(db_a_matches) >= 2 and not after_cal_found:
            second_cal_value = db_a_matches[1][1]
            keywords["calibrationValueAfter"] = f"{second_cal_value} dB(A)"
            logger.debug(f"[关键词提取] 按出现顺序提取到检测后校准值(第二个dB(A)): {keywords['calibrationValueAfter']}")
            after_cal_found = True
        # Exactly one dB(A) found and "before" already set: reuse the same
        # value as the "after" calibration value.
        elif len(db_a_matches) == 1 and not after_cal_found and before_cal_found:
            if keywords["calibrationValueBefore"]:
                keywords["calibrationValueAfter"] = keywords["calibrationValueBefore"]
                logger.debug(f"[关键词提取] 检测前和检测后校准值相同: {keywords['calibrationValueAfter']}")
    # --- Weather records ---
    # A record starts at a "日期:" (date) fragment, but only when weather-like
    # tokens appear nearby; values may be scattered over several fragments.
    current_weather_info = None
    weather_start_idx = -1  # index where the current weather record started
    for i, text in enumerate(ocr_texts):
        # A "日期:" hit opens a new weather record...
        date_match = re.search(r'日期[::]\s*([\d.\-]+)', text)
        if date_match:
            # ...but only if a weather-related token (weather, temperature,
            # humidity, wind speed/direction or their units) appears within
            # the next 10 fragments.
            has_weather_info = False
            for j in range(i, min(i + 10, len(ocr_texts))):
                check_text = ocr_texts[j]
                if any(keyword in check_text for keyword in ["天气", "温度", "湿度", "风速", "风向", "℃", "%RH", "m/s"]):
                    has_weather_info = True
                    break
            if has_weather_info:
                # Flush any partially filled previous record first.
                if current_weather_info and any([current_weather_info["monitorAt"], current_weather_info["weather"],
                                                 current_weather_info["temp"], current_weather_info["humidity"],
                                                 current_weather_info["windSpeed"], current_weather_info["windDirection"]]):
                    keywords["weather_info"].append(current_weather_info)
                # Start a fresh record for this date.
                current_weather_info = {
                    "monitorAt": date_match.group(1).strip(),
                    "weather": "",
                    "temp": "",
                    "humidity": "",
                    "windSpeed": "",
                    "windDirection": ""
                }
                weather_start_idx = i
        # While a record is open, keep filling its empty fields.
        if current_weather_info:
            # Only look within 10 fragments of the record start to avoid
            # pulling values from unrelated parts of the sheet.
            if weather_start_idx >= 0 and i <= weather_start_idx + 10:
                # Weather description.
                if not current_weather_info["weather"]:
                    weather_match = re.search(r'天气\s*([^\s温度湿度风速风向]+)', text)
                    if weather_match:
                        weather_value = weather_match.group(1).strip()
                        # Reject placeholders ("_") and purely numeric hits.
                        if weather_value and weather_value != "_" and not re.match(r'^[\d.\-]+$', weather_value):
                            current_weather_info["weather"] = weather_value
                # Temperature (e.g. "温度29.5-35.0" or "温度 29.5-35.0").
                if not current_weather_info["temp"]:
                    temp_match = re.search(r'温度\s*([0-9.\-]+)', text)
                    if temp_match:
                        current_weather_info["temp"] = temp_match.group(1).strip()
                # Humidity — value may be in this fragment (e.g. "湿度74.0-74.1")
                # or in the next one when this fragment ends with the label
                # (e.g. "℃ 湿度").
                if not current_weather_info["humidity"]:
                    humidity_match = re.search(r'湿度\s*([0-9.\-]+)', text)
                    if humidity_match:
                        current_weather_info["humidity"] = humidity_match.group(1).strip()
                    elif "湿度" in text and i + 1 < len(ocr_texts):
                        next_text = ocr_texts[i + 1]
                        if re.match(r'^[0-9.\-]+', next_text):
                            current_weather_info["humidity"] = next_text.strip()
                # Wind speed — same same-fragment / next-fragment handling
                # (e.g. "风速0.4-0.5" or label-only "%RH 风速").
                if not current_weather_info["windSpeed"]:
                    wind_speed_match = re.search(r'风速\s*([0-9.\-]+)', text)
                    if wind_speed_match:
                        current_weather_info["windSpeed"] = wind_speed_match.group(1).strip()
                    elif "风速" in text and i + 1 < len(ocr_texts):
                        next_text = ocr_texts[i + 1]
                        if re.match(r'^[0-9.\-]+', next_text):
                            current_weather_info["windSpeed"] = next_text.strip()
                # Wind direction, attempt 1: value right after "风向".
                # The char class deliberately does NOT exclude "风": excluding
                # it would truncate values like "南风" to just "南".
                if not current_weather_info["windDirection"]:
                    wind_dir_match = re.search(r'风向\s*([^\s日期温度湿度]+?)(?=\s|日期|温度|湿度|风速|$)', text)
                    if wind_dir_match:
                        wind_value = wind_dir_match.group(1).strip()
                        # Must not be the "m/s" unit or a bare number.
                        if wind_value and wind_value != "m/s" and not re.match(r'^[0-9.\-]+$', wind_value):
                            # Single-character hit (e.g. "南"): merge a
                            # following "风" fragment into it ("南" + "风").
                            if len(wind_value) == 1 and i + 1 < len(ocr_texts):
                                next_text = ocr_texts[i + 1].strip()
                                if next_text == "风" or next_text.startswith("风"):
                                    wind_value = wind_value + "风"
                                    logger.debug(f"[关键词提取] 合并风向值: {wind_value}")
                            current_weather_info["windDirection"] = wind_value
                # Wind direction, attempt 2: "...m/s风向<value>" in one
                # fragment (e.g. "_m/s风向南风" or "m/s风向南风").
                if not current_weather_info["windDirection"]:
                    wind_dir_match = re.search(r'[_\s]*m/s\s*风向\s*([^\s日期温度湿度]+?)(?=\s|日期|温度|湿度|风速|$)', text)
                    if wind_dir_match:
                        wind_value = wind_dir_match.group(1).strip()
                        if wind_value and not re.match(r'^[0-9.\-]+$', wind_value):
                            # Merge a trailing "风" fragment as in attempt 1.
                            if len(wind_value) == 1 and i + 1 < len(ocr_texts):
                                next_text = ocr_texts[i + 1].strip()
                                if next_text == "风" or next_text.startswith("风"):
                                    wind_value = wind_value + "风"
                                    logger.debug(f"[关键词提取] 合并风向值: {wind_value}")
                            current_weather_info["windDirection"] = wind_value
                # Wind direction, attempt 3: value in the fragment AFTER an
                # "m/s"/"风向" fragment.
                if not current_weather_info["windDirection"]:
                    if ("m/s" in text or "风向" in text) and i + 1 < len(ocr_texts):
                        next_text = ocr_texts[i + 1].strip()
                        if next_text and not re.match(r'^[0-9.\-]+', next_text) and "风向" not in next_text:
                            wind_value = next_text
                            # Single direction character: check whether the
                            # fragment after next is the "风" suffix.
                            if len(wind_value) == 1 and i + 2 < len(ocr_texts):
                                next_next_text = ocr_texts[i + 2].strip()
                                if next_next_text == "风" or next_next_text.startswith("风"):
                                    wind_value = wind_value + "风"
                                    logger.debug(f"[关键词提取] 合并风向值: {wind_value}")
                            current_weather_info["windDirection"] = wind_value
    # Flush the last open weather record.
    if current_weather_info and any([current_weather_info["monitorAt"], current_weather_info["weather"],
                                     current_weather_info["temp"], current_weather_info["humidity"],
                                     current_weather_info["windSpeed"], current_weather_info["windDirection"]]):
        keywords["weather_info"].append(current_weather_info)
    # --- Monitoring-point addresses (for EMF monitoring records) ---
    # Pattern: a point code like EB1/ZB1 followed (or preceded) by address
    # text in the same or a neighbouring fragment.
    for i, text in enumerate(ocr_texts):
        # Look for codes of the form EB1, EZ1, ZB1, ZZ1, ...
        code_match = re.search(r'(E[ZB]\d+|Z[ZB]\d+)', text, re.IGNORECASE)
        if code_match:
            code = code_match.group(1).upper()  # normalize to upper case
            # The address is usually a Chinese place name near the code.
            address_candidates = []
            # Attempt 1: address directly after the code in this fragment.
            code_pos = code_match.end()
            remaining_text = text[code_pos:].strip()
            # Drop leading whitespace/punctuation.
            remaining_text = re.sub(r'^[\s,,。、]+', '', remaining_text)
            # Candidate only if not purely digits / time-like characters.
            if remaining_text and not re.match(r'^[\d.\-:\s]+$', remaining_text):
                # Take text up to the next digit or known label.
                address_match = re.search(r'^([^\d\n]+?)(?=\d|时间|线高|$)', remaining_text)
                if address_match:
                    address = address_match.group(1).strip()
                    # Trim trailing punctuation/whitespace.
                    address = re.sub(r'[,。、\s]+$', '', address)
                    if address and len(address) > 0:
                        address_candidates.append(address)
            # Attempt 2: search up to 5 fragments BEFORE the code, skipping
            # codes/numbers/times/heights/labels, for a Chinese place name.
            if not address_candidates:
                for j in range(i - 1, max(i - 6, -1), -1):
                    prev_text = ocr_texts[j].strip()
                    if not prev_text:
                        continue
                    # Skip codes, bare numbers, times, heights, labels, dates.
                    if re.match(r'^(E[ZB]\d+|Z[ZB]\d+|\d+|时间|线高|编号|均值|24m|\d{4}[.\-]\d{1,2}[.\-]\d{1,2})', prev_text, re.IGNORECASE):
                        continue
                    # Needs at least 2 consecutive Chinese characters.
                    if re.search(r'[\u4e00-\u9fa5]{2,}', prev_text):
                        # Double-check: not purely numeric/time-like.
                        if not re.match(r'^[\d.\-:\s]+$', prev_text):
                            address_candidates.append(prev_text)
                            logger.debug(f"[关键词提取] 在编号{code}之前找到地址候选 (索引{j}): {prev_text}")
                            break  # stop at the first address found
            # Attempt 3: the fragment right AFTER the code.
            if not address_candidates and i + 1 < len(ocr_texts):
                next_text = ocr_texts[i + 1].strip()
                # Candidate only if not another code/number/time/label.
                if next_text and not re.match(r'^(E[ZB]\d+|Z[ZB]\d+|\d+|时间|线高|编号|均值|24m|\d{4}[.\-]\d{1,2}[.\-]\d{1,2})', next_text, re.IGNORECASE):
                    # Needs at least 2 consecutive Chinese characters.
                    if re.search(r'[\u4e00-\u9fa5]{2,}', next_text):
                        address_candidates.append(next_text)
            # Use the first candidate after a final punctuation cleanup.
            if address_candidates:
                address = address_candidates[0]
                address = re.sub(r'^[,。、\s]+|[,。、\s]+$', '', address)
                if address:
                    keywords["address_mapping"][code] = address
                    logger.debug(f"[关键词提取] 提取到监测地点: {code} -> {address}")
    return keywords
  1197. def extract_keywords_from_markdown(markdown_content: str) -> Dict[str, Any]:
  1198. """从markdown内容中直接提取关键信息
  1199. Args:
  1200. markdown_content: markdown内容字符串
  1201. Returns:
  1202. 包含提取的关键信息的字典
  1203. """
  1204. keywords = {
  1205. "project": "",
  1206. "standardReferences": "",
  1207. "soundLevelMeterMode": "",
  1208. "soundCalibratorMode": "",
  1209. "calibrationValueBefore": "",
  1210. "calibrationValueAfter": "",
  1211. "weather_info": [] # 存储天气相关信息
  1212. }
  1213. if not markdown_content:
  1214. return keywords
  1215. # 移除HTML标签,保留文本内容(但保留表格结构信息)
  1216. # 先提取表格中的文本内容
  1217. text_content = markdown_content
  1218. # 提取项目名称
  1219. project_match = re.search(r'项目名称[::]([^检测依据声级计声校准器检测前检测后气象条件日期<>]+)', text_content)
  1220. if project_match:
  1221. project = project_match.group(1).strip()
  1222. # 清理可能的后续内容和HTML标签
  1223. project = re.sub(r'检测依据.*$', '', project).strip()
  1224. project = re.sub(r'<[^>]+>', '', project).strip()
  1225. if project:
  1226. keywords["project"] = project
  1227. logger.debug(f"[Markdown关键词提取] 提取到项目名称: {project}")
  1228. # 提取检测依据
  1229. standard_match = re.search(r'检测依据[::]([^声级计声校准器检测前检测后气象条件日期<>]+)', text_content)
  1230. if standard_match:
  1231. standard = standard_match.group(1).strip()
  1232. # 提取GB标准
  1233. gb_standards = re.findall(r'GB\s*\d+[-\.]?\d*[-\.]?\d*', standard)
  1234. if gb_standards:
  1235. keywords["standardReferences"] = " ".join(gb_standards)
  1236. else:
  1237. keywords["standardReferences"] = re.sub(r'<[^>]+>', '', standard).replace("□其他:", "").strip()
  1238. logger.debug(f"[Markdown关键词提取] 提取到检测依据: {keywords['standardReferences']}")
  1239. # 提取声级计型号/编号
  1240. sound_meter_match = re.search(r'声级计型号[/::]?(?:编号)?[::]\s*([A-Z0-9+/()()]+)', text_content)
  1241. if sound_meter_match:
  1242. sound_meter = sound_meter_match.group(1).strip()
  1243. sound_meter = re.sub(r'<[^>]+>', '', sound_meter).strip()
  1244. if sound_meter:
  1245. keywords["soundLevelMeterMode"] = sound_meter
  1246. logger.debug(f"[Markdown关键词提取] 提取到声级计型号: {keywords['soundLevelMeterMode']}")
  1247. # 提取声校准器型号/编号
  1248. calibrator_match = re.search(r'声校准器型号[/::]?(?:编号)?[::]\s*([A-Z0-9+/()()]+)', text_content)
  1249. if calibrator_match:
  1250. calibrator = calibrator_match.group(1).strip()
  1251. calibrator = re.sub(r'<[^>]+>', '', calibrator).strip()
  1252. if calibrator:
  1253. keywords["soundCalibratorMode"] = calibrator
  1254. logger.debug(f"[Markdown关键词提取] 提取到声校准器型号: {keywords['soundCalibratorMode']}")
  1255. # 提取检测前校准值
  1256. before_cal_match = re.search(r'检测前校准值[::]\s*([0-9.]+)\s*dB[((]?A[))]?', text_content)
  1257. if before_cal_match:
  1258. cal_value = before_cal_match.group(1).strip()
  1259. keywords["calibrationValueBefore"] = f"{cal_value} dB(A)"
  1260. logger.debug(f"[Markdown关键词提取] 提取到检测前校准值: {keywords['calibrationValueBefore']}")
  1261. # 提取检测后校准值
  1262. after_cal_match = re.search(r'检测后校准值[::]\s*([0-9.]+)\s*dB[((]?A[))]?', text_content)
  1263. if after_cal_match:
  1264. cal_value = after_cal_match.group(1).strip()
  1265. keywords["calibrationValueAfter"] = f"{cal_value} dB(A)"
  1266. logger.debug(f"[Markdown关键词提取] 提取到检测后校准值: {keywords['calibrationValueAfter']}")
  1267. # 提取天气信息
  1268. # 查找所有包含"日期:"的行或片段
  1269. date_pattern = r'日期[::]\s*([\d.\-]+)'
  1270. date_matches = list(re.finditer(date_pattern, text_content))
  1271. for date_match in date_matches:
  1272. date_value = date_match.group(1).strip()
  1273. # 获取日期匹配位置后的文本(最多500字符)
  1274. start_pos = date_match.end()
  1275. weather_section = text_content[start_pos:start_pos + 500]
  1276. weather_info = {
  1277. "monitorAt": date_value,
  1278. "weather": "",
  1279. "temp": "",
  1280. "humidity": "",
  1281. "windSpeed": "",
  1282. "windDirection": ""
  1283. }
  1284. # 提取天气
  1285. weather_match = re.search(r'天气\s*([^\s温度湿度风速风向<>]+)', weather_section)
  1286. if weather_match:
  1287. weather_value = weather_match.group(1).strip()
  1288. weather_value = re.sub(r'<[^>]+>', '', weather_value).strip()
  1289. if weather_value and weather_value != "_" and not re.match(r'^[\d.\-]+$', weather_value):
  1290. weather_info["weather"] = weather_value
  1291. # 提取温度
  1292. temp_match = re.search(r'温度[::]?\s*([0-9.\-]+)', weather_section)
  1293. if temp_match:
  1294. weather_info["temp"] = temp_match.group(1).strip()
  1295. # 提取湿度
  1296. humidity_match = re.search(r'湿度[::]?\s*([0-9.\-]+)', weather_section)
  1297. if humidity_match:
  1298. weather_info["humidity"] = humidity_match.group(1).strip()
  1299. # 提取风速
  1300. wind_speed_match = re.search(r'风速[::]?\s*([0-9.\-]+)', weather_section)
  1301. if wind_speed_match:
  1302. weather_info["windSpeed"] = wind_speed_match.group(1).strip()
  1303. # 提取风向
  1304. wind_dir_match = re.search(r'风向[::]?\s*([^\s日期温度湿度风速<>]+?)(?=\s|日期|温度|湿度|风速|$|<)', weather_section)
  1305. if wind_dir_match:
  1306. wind_value = wind_dir_match.group(1).strip()
  1307. wind_value = re.sub(r'<[^>]+>', '', wind_value).strip()
  1308. if wind_value and wind_value != "m/s" and not re.match(r'^[0-9.\-]+$', wind_value):
  1309. weather_info["windDirection"] = wind_value
  1310. # 如果至少有一个字段不为空,则添加这条记录
  1311. if any([weather_info["monitorAt"], weather_info["weather"], weather_info["temp"],
  1312. weather_info["humidity"], weather_info["windSpeed"], weather_info["windDirection"]]):
  1313. keywords["weather_info"].append(weather_info)
  1314. logger.debug(f"[Markdown关键词提取] 提取到天气记录: {weather_info}")
  1315. return keywords
  1316. def supplement_missing_fields_from_ocr_json(
  1317. records: List[Dict[str, Any]],
  1318. ocr_json_path: str,
  1319. field_mapping: Dict[str, str] = None
  1320. ) -> List[Dict[str, Any]]:
  1321. """从OCR的JSON输出中补充缺失字段
  1322. 根据文本位置关系来补充缺失字段。例如,如果找到了maxReactivePower的值(如"-2.48"),
  1323. 那么minReactivePower的值就在它后面的位置("-4.75")。
  1324. Args:
  1325. records: 原始解析记录列表(OperationalConditionV2格式)
  1326. ocr_json_path: OCR输出的JSON文件路径
  1327. field_mapping: 字段映射关系,如{"maxReactivePower": "minReactivePower"},表示maxReactivePower后面是minReactivePower
  1328. Returns:
  1329. 补充后的记录列表
  1330. """
  1331. if not records or not ocr_json_path or not os.path.exists(ocr_json_path):
  1332. return records
  1333. try:
  1334. # 读取OCR JSON文件
  1335. with open(ocr_json_path, 'r', encoding='utf-8') as f:
  1336. ocr_data = json.load(f)
  1337. # 提取rec_texts数组
  1338. rec_texts = ocr_data.get("rec_texts", [])
  1339. if not rec_texts:
  1340. logger.warning("[OCR字段补充] JSON中未找到rec_texts字段")
  1341. return records
  1342. logger.info(f"[OCR字段补充] 从OCR JSON中提取到 {len(rec_texts)} 个文本片段")
  1343. # 默认字段映射:max字段后面是min字段
  1344. if field_mapping is None:
  1345. field_mapping = {
  1346. "maxVoltage": "minVoltage",
  1347. "maxCurrent": "minCurrent",
  1348. "maxActivePower": "minActivePower",
  1349. "maxReactivePower": "minReactivePower"
  1350. }
  1351. # 为每条记录补充缺失字段
  1352. for record in records:
  1353. record_name = record.get("name", "")
  1354. logger.debug(f"[OCR字段补充] 处理记录: {record_name}")
  1355. # 对于每个max字段,如果对应的min字段为空,尝试从OCR中补充
  1356. for max_field, min_field in field_mapping.items():
  1357. max_value = record.get(max_field, "").strip()
  1358. min_value = record.get(min_field, "").strip()
  1359. # 如果max字段有值但min字段为空,尝试从OCR中补充
  1360. if max_value and not min_value:
  1361. logger.debug(f"[OCR字段补充] 记录 {record_name}: {max_field}={max_value}, {min_field}为空,尝试从OCR补充")
  1362. # 在rec_texts中查找max_value
  1363. try:
  1364. max_value_float = float(max_value)
  1365. # 查找匹配的文本(允许小的数值差异)
  1366. found_max = False
  1367. for i, text in enumerate(rec_texts):
  1368. # 尝试将文本转换为数值
  1369. try:
  1370. text_float = float(text.strip())
  1371. # 如果数值匹配(允许小的误差)
  1372. if abs(text_float - max_value_float) < 0.01:
  1373. found_max = True
  1374. # 检查后续几个文本,找到第一个数值作为min_value
  1375. # 在表格中,max和min通常是相邻的,但中间可能有其他文本
  1376. for j in range(i + 1, min(i + 5, len(rec_texts))): # 检查后续最多4个文本
  1377. next_text = rec_texts[j].strip()
  1378. try:
  1379. next_value_float = float(next_text)
  1380. # 如果找到数值,且与max_value不同,则作为min_value
  1381. if abs(next_value_float - max_value_float) > 0.01:
  1382. record[min_field] = next_text
  1383. logger.info(f"[OCR字段补充] 从OCR补充 {min_field}: {next_text} (在 {max_field}={max_value} 之后,位置 {j})")
  1384. break
  1385. except ValueError:
  1386. # 不是数值,继续查找
  1387. continue
  1388. if record.get(min_field):
  1389. break
  1390. except ValueError:
  1391. # 文本不是数值,继续
  1392. pass
  1393. if not found_max:
  1394. logger.debug(f"[OCR字段补充] 未在OCR中找到 {max_field} 的值 '{max_value}'")
  1395. except ValueError:
  1396. # max_value不是数值,跳过
  1397. logger.debug(f"[OCR字段补充] {max_field}值 '{max_value}' 不是数值,跳过")
  1398. pass
  1399. logger.info("[OCR字段补充] 字段补充完成")
  1400. return records
  1401. except Exception as e:
  1402. logger.exception(f"[OCR字段补充] 补充过程出错: {e}")
  1403. return records
  1404. def extract_image_from_markdown(markdown_content: str, output_dir: str) -> Optional[str]:
  1405. """从markdown内容中提取第一张图片路径
  1406. Args:
  1407. markdown_content: markdown内容
  1408. output_dir: 输出目录
  1409. Returns:
  1410. 图片路径,如果未找到返回None
  1411. """
  1412. # 查找markdown中的图片引用
  1413. # 格式: ![alt](path) 或 <img src="path">
  1414. image_patterns = [
  1415. r'!\[.*?\]\((.*?)\)', # markdown图片格式
  1416. r'<img[^>]+src=["\'](.*?)["\']', # HTML img标签
  1417. r'<img[^>]+src=(.*?)(?:\s|>)', # HTML img标签(无引号)
  1418. ]
  1419. for pattern in image_patterns:
  1420. matches = re.findall(pattern, markdown_content)
  1421. if matches:
  1422. image_path = matches[0]
  1423. # 如果是相对路径,尝试在output_dir中查找
  1424. if not os.path.isabs(image_path):
  1425. # 尝试多个可能的路径
  1426. possible_paths = [
  1427. os.path.join(output_dir, image_path),
  1428. os.path.join(output_dir, "images", os.path.basename(image_path)),
  1429. os.path.join(output_dir, os.path.basename(image_path)),
  1430. ]
  1431. for full_path in possible_paths:
  1432. if os.path.exists(full_path):
  1433. return full_path
  1434. elif os.path.exists(image_path):
  1435. return image_path
  1436. return None
def fallback_parse_with_paddleocr(
    json_data: Dict[str, Any],
    markdown_content: str,
    output_dir: Optional[str] = None,
    document_type: Optional[str] = None,
    input_file: Optional[str] = None
) -> Optional[str]:
    """Fallback parse via PaddleOCR when the primary JSON data is incomplete.

    Locates (or derives) an image of the document, runs PaddleOCR structure
    parsing on it, supplements the result with OCR-extracted keywords
    embedded as HTML comments, and appends the original markdown.

    Args:
        json_data: Original JSON data (only ``document_type`` is read here).
        markdown_content: Original markdown content.
        output_dir: Output directory used to look up images / PDFs.
        document_type: Document type; falls back to json_data["document_type"].
        input_file: Original input file path (PDF or image); if no image is
            found elsewhere, the first page is extracted from this PDF.

    Returns:
        The supplemented markdown content, or None on failure.
    """
    try:
        # NOTE: the caller has already verified data completeness; no
        # re-check here — reaching this function means fallback is needed.
        # doc_type is currently assigned but not used below — kept for
        # parity with the original; TODO(review): confirm it can be dropped.
        doc_type = document_type or json_data.get("document_type", "unknown")
        logger.warning("[PaddleOCR备用] 启用PaddleOCR备用解析")
        # Step 1: try to locate an image referenced by the markdown.
        image_path = None
        if output_dir:
            # First try to extract an image reference from the markdown.
            image_path = extract_image_from_markdown(markdown_content, output_dir)
            if image_path:
                logger.info(f"[PaddleOCR备用] 从markdown中找到图片: {image_path}")
            # If none found, scan output_dir recursively for PNG files.
            if not image_path and os.path.exists(output_dir):
                png_files = list(Path(output_dir).rglob("*.png"))
                if png_files:
                    # Prefer filenames containing "粘贴" (paste) or "image".
                    for png_file in png_files:
                        if "粘贴" in png_file.name or "image" in png_file.name.lower():
                            image_path = str(png_file)
                            logger.info(f"[PaddleOCR备用] 使用找到的图片: {image_path}")
                            break
                    # Otherwise just take the first PNG found.
                    if not image_path:
                        image_path = str(png_files[0])
                        logger.info(f"[PaddleOCR备用] 使用找到的图片: {image_path}")
        # Step 2: still no image — try to derive one from input_file.
        if not image_path:
            logger.warning("[PaddleOCR备用] 未找到可用的图片文件,尝试从input_file处理")
            if input_file and os.path.exists(input_file):
                # Detect the actual file type from content, not the extension.
                file_type = detect_file_type(input_file)
                if file_type == 'pdf':
                    # Input is a PDF: render its first page to an image.
                    pdf_path = input_file
                    logger.info(f"[PaddleOCR备用] 检测到PDF文件(通过内容): {pdf_path}")
                    image_path = extract_first_page_from_pdf(pdf_path, output_dir)
                    if image_path:
                        logger.info(f"[PaddleOCR备用] 成功从PDF提取第一页图片: {image_path}")
                    else:
                        logger.warning("[PaddleOCR备用] 从PDF提取图片失败(可能是PDF文件损坏或缺少必要的库)")
                elif file_type in ['png', 'jpeg', 'jpg']:
                    # Input is already an image: use it directly.
                    image_path = input_file
                    logger.info(f"[PaddleOCR备用] 检测到图片文件({file_type}): {image_path}")
                else:
                    # Unknown type: optimistically try to treat it as a PDF.
                    logger.debug(f"[PaddleOCR备用] input_file类型未知({file_type}),尝试按PDF处理: {input_file}")
                    if PDFIUM_AVAILABLE or PDF2IMAGE_AVAILABLE:
                        try:
                            pdf_path = input_file
                            image_path = extract_first_page_from_pdf(pdf_path, output_dir)
                            if image_path:
                                logger.info(f"[PaddleOCR备用] 成功将文件作为PDF处理并提取第一页: {image_path}")
                        except Exception as e:
                            logger.debug(f"[PaddleOCR备用] 无法将文件作为PDF处理: {e}")
            # input_file failed: search output_dir for any PDF file.
            if not image_path and output_dir:
                pdf_path = find_pdf_file(output_dir)
                if pdf_path:
                    logger.info(f"[PaddleOCR备用] 在输出目录中找到PDF文件: {pdf_path}")
                    image_path = extract_first_page_from_pdf(pdf_path, output_dir)
                    if image_path:
                        logger.info(f"[PaddleOCR备用] 成功从PDF提取第一页图片: {image_path}")
            # Last resort: search the parent directory of input_file.
            if not image_path and input_file:
                parent_dir = os.path.dirname(input_file)
                if parent_dir and os.path.exists(parent_dir):
                    pdf_path = find_pdf_file(parent_dir)
                    if pdf_path:
                        logger.info(f"[PaddleOCR备用] 在input_file父目录中找到PDF文件: {pdf_path}")
                        image_path = extract_first_page_from_pdf(pdf_path, output_dir)
                        if image_path:
                            logger.info(f"[PaddleOCR备用] 成功从PDF提取第一页图片: {image_path}")
            if not image_path:
                logger.warning(f"[PaddleOCR备用] 未找到可用的图片或PDF文件(input_file={input_file}, output_dir={output_dir}),无法进行备用解析")
                logger.info("[PaddleOCR备用] 备用解析需要图片文件或PDF文件,如果都没有,将返回原始markdown内容")
        # Give up if no source image could be obtained at all.
        if not image_path:
            logger.warning("[PaddleOCR备用] 未找到可用的图片文件,备用解析无法进行,返回None(将使用原始解析结果)")
            return None
        # Step 3: parse document structure with the doc_parser mode.
        logger.info("[PaddleOCR备用] 使用doc_parser模式解析文档结构")
        paddleocr_result = call_paddleocr(image_path)
        if not paddleocr_result:
            logger.error("[PaddleOCR备用] PaddleOCR解析失败")
            return None
        # The result can come in two shapes: pre-rendered markdown, or a
        # structured parsing result list that must be converted.
        if "markdown_content" in paddleocr_result:
            # Content read directly from the generated MD file.
            paddleocr_markdown = paddleocr_result["markdown_content"]
            logger.info(f"[PaddleOCR备用] 成功从MD文件读取,生成 {len(paddleocr_markdown)} 字符的markdown")
            # Extract keywords from the markdown to supplement the data.
            logger.info("[PaddleOCR备用] 从MD文件内容中提取关键词补充数据")
            keywords = extract_keywords_from_markdown(paddleocr_markdown)
            # Embed the keywords as an HTML comment for downstream parsing.
            keywords_comment = "\n\n<!-- Markdown关键词补充:\n"
            if keywords["project"]:
                keywords_comment += f"项目名称:{keywords['project']}\n"
            if keywords["standardReferences"]:
                keywords_comment += f"检测依据:{keywords['standardReferences']}\n"
            if keywords["soundLevelMeterMode"]:
                keywords_comment += f"声级计型号/编号:{keywords['soundLevelMeterMode']}\n"
            if keywords["soundCalibratorMode"]:
                keywords_comment += f"声校准器型号/编号:{keywords['soundCalibratorMode']}\n"
            if keywords["calibrationValueBefore"]:
                keywords_comment += f"检测前校准值:{keywords['calibrationValueBefore']}\n"
            if keywords["calibrationValueAfter"]:
                keywords_comment += f"检测后校准值:{keywords['calibrationValueAfter']}\n"
            if keywords.get("address_mapping"):
                for code, address in keywords["address_mapping"].items():
                    keywords_comment += f"监测地点-{code}:{address}\n"
            if keywords["weather_info"]:
                for weather in keywords["weather_info"]:
                    keywords_comment += f"日期:{weather['monitorAt']} 天气:{weather['weather']} 温度:{weather['temp']} 湿度:{weather['humidity']} 风速:{weather['windSpeed']} 风向:{weather['windDirection']}\n"
            keywords_comment += "-->\n"
            # Merge the keyword comment block into the markdown.
            paddleocr_markdown = paddleocr_markdown + keywords_comment
            # Count supplemented fields (weather_info entries counted individually).
            field_count = sum(1 for k, v in keywords.items() if k != "weather_info" and v) + len(keywords.get("weather_info", []))
            logger.info(f"[PaddleOCR备用] MD文件关键词提取完成,补充了 {field_count} 个字段")
        elif "parsing_res_list" in paddleocr_result:
            # Result parsed from JSON/stdout; convert it to markdown.
            paddleocr_markdown = paddleocr_to_markdown(paddleocr_result)
            if not paddleocr_markdown:
                logger.warning("[PaddleOCR备用] PaddleOCR未解析出有效内容")
                return None
            logger.info(f"[PaddleOCR备用] 成功解析,生成 {len(paddleocr_markdown)} 字符的markdown")
        else:
            logger.error("[PaddleOCR备用] PaddleOCR返回格式不正确")
            return None
        # Step 4: run plain OCR as a supplement to the doc_parser result.
        logger.info("[PaddleOCR备用] 调用OCR提取关键词补充数据")
        ocr_save_path = os.path.dirname(image_path)  # save beside the image
        ocr_texts, _ = call_paddleocr_ocr(image_path, ocr_save_path)
        if ocr_texts:
            # Extract keywords from the raw OCR text lines.
            keywords = extract_keywords_from_ocr_texts(ocr_texts)
            # Embed the OCR keywords as a second HTML comment block.
            keywords_comment = "\n\n<!-- OCR关键词补充:\n"
            if keywords["project"]:
                keywords_comment += f"项目名称:{keywords['project']}\n"
            if keywords["standardReferences"]:
                keywords_comment += f"检测依据:{keywords['standardReferences']}\n"
            if keywords["soundLevelMeterMode"]:
                keywords_comment += f"声级计型号/编号:{keywords['soundLevelMeterMode']}\n"
            if keywords["soundCalibratorMode"]:
                keywords_comment += f"声校准器型号/编号:{keywords['soundCalibratorMode']}\n"
            if keywords["calibrationValueBefore"]:
                keywords_comment += f"检测前校准值:{keywords['calibrationValueBefore']}\n"
            if keywords.get("address_mapping"):
                for code, address in keywords["address_mapping"].items():
                    keywords_comment += f"监测地点-{code}:{address}\n"
            if keywords["calibrationValueAfter"]:
                keywords_comment += f"检测后校准值:{keywords['calibrationValueAfter']}\n"
            if keywords["weather_info"]:
                for weather in keywords["weather_info"]:
                    keywords_comment += f"日期:{weather['monitorAt']} 天气:{weather['weather']} 温度:{weather['temp']} 湿度:{weather['humidity']} 风速:{weather['windSpeed']} 风向:{weather['windDirection']}\n"
            keywords_comment += "-->\n"
            # Merge the OCR keyword comment block into the markdown.
            paddleocr_markdown = paddleocr_markdown + keywords_comment
            # NOTE(review): len(keywords) counts all keys, including empty
            # ones — inconsistent with the field_count above; confirm intent.
            logger.info(f"[PaddleOCR备用] OCR关键词提取完成,补充了 {len(keywords)} 个字段")
        # Combine PaddleOCR output (preferred, more complete) with the
        # original markdown kept as a trailing reference.
        combined_markdown = f"{paddleocr_markdown}\n\n<!-- 原始内容(可能不完整) -->\n{markdown_content}"
        return combined_markdown
    except Exception as e:
        logger.exception(f"[PaddleOCR备用] 备用解析过程出错: {e}")
        return None
  1624. def extract_text_with_paragraphs_from_ocr_json(json_path: str, line_height_threshold: float = 1.5, paragraph_gap_threshold: float = 2.0) -> str:
  1625. """
  1626. 从PaddleOCR的JSON输出中提取带段落分割的纯文本
  1627. Args:
  1628. json_path: OCR输出的JSON文件路径
  1629. line_height_threshold: 行高倍数阈值,用于判断是否在同一行(默认1.5)
  1630. paragraph_gap_threshold: 段落间距倍数阈值,用于判断是否需要分段(默认2.0)
  1631. Returns:
  1632. 带段落分割的纯文本字符串
  1633. """
  1634. try:
  1635. with open(json_path, 'r', encoding='utf-8') as f:
  1636. ocr_data = json.load(f)
  1637. # 提取文本和坐标信息
  1638. rec_texts = ocr_data.get("rec_texts", [])
  1639. dt_polys = ocr_data.get("dt_polys", [])
  1640. if not rec_texts or not dt_polys:
  1641. logger.warning("[OCR文本提取] JSON中缺少rec_texts或dt_polys字段")
  1642. return ""
  1643. if len(rec_texts) != len(dt_polys):
  1644. logger.warning(f"[OCR文本提取] rec_texts长度({len(rec_texts)})与dt_polys长度({len(dt_polys)})不匹配")
  1645. # 取较小的长度
  1646. min_len = min(len(rec_texts), len(dt_polys))
  1647. rec_texts = rec_texts[:min_len]
  1648. dt_polys = dt_polys[:min_len]
  1649. # 计算每个文本块的边界框和中心点
  1650. text_blocks = []
  1651. for i, (text, poly) in enumerate(zip(rec_texts, dt_polys)):
  1652. if not text or not text.strip():
  1653. continue
  1654. # 从多边形坐标计算边界框
  1655. # poly格式: [[x1, y1], [x2, y2], [x3, y3], [x4, y4]]
  1656. if len(poly) >= 4:
  1657. xs = [point[0] for point in poly]
  1658. ys = [point[1] for point in poly]
  1659. x_min, x_max = min(xs), max(xs)
  1660. y_min, y_max = min(ys), max(ys)
  1661. # 计算中心点和高度
  1662. center_x = (x_min + x_max) / 2
  1663. center_y = (y_min + y_max) / 2
  1664. height = y_max - y_min
  1665. width = x_max - x_min
  1666. text_blocks.append({
  1667. 'text': text.strip(),
  1668. 'x_min': x_min,
  1669. 'x_max': x_max,
  1670. 'y_min': y_min,
  1671. 'y_max': y_max,
  1672. 'center_x': center_x,
  1673. 'center_y': center_y,
  1674. 'height': height,
  1675. 'width': width,
  1676. 'index': i
  1677. })
  1678. if not text_blocks:
  1679. logger.warning("[OCR文本提取] 没有有效的文本块")
  1680. return ""
  1681. # 按Y坐标(从上到下)排序
  1682. text_blocks.sort(key=lambda b: (b['y_min'], b['x_min']))
  1683. # 计算平均行高(用于判断行间距)
  1684. heights = [b['height'] for b in text_blocks]
  1685. avg_height = sum(heights) / len(heights) if heights else 20
  1686. # 将文本块按行分组
  1687. lines = []
  1688. current_line = [text_blocks[0]]
  1689. for i in range(1, len(text_blocks)):
  1690. prev_block = text_blocks[i - 1]
  1691. curr_block = text_blocks[i]
  1692. # 计算Y坐标重叠度
  1693. y_overlap = min(prev_block['y_max'], curr_block['y_max']) - max(prev_block['y_min'], curr_block['y_min'])
  1694. overlap_ratio = y_overlap / min(prev_block['height'], curr_block['height']) if min(prev_block['height'], curr_block['height']) > 0 else 0
  1695. # 计算Y坐标间距
  1696. y_gap = curr_block['y_min'] - prev_block['y_max']
  1697. gap_ratio = y_gap / avg_height if avg_height > 0 else 0
  1698. # 判断是否在同一行:有重叠或间距小于行高阈值
  1699. if overlap_ratio > 0.3 or (y_gap >= 0 and gap_ratio < line_height_threshold):
  1700. current_line.append(curr_block)
  1701. else:
  1702. # 新行开始,保存当前行
  1703. lines.append(current_line)
  1704. current_line = [curr_block]
  1705. # 添加最后一行
  1706. if current_line:
  1707. lines.append(current_line)
  1708. # 对每行内的文本块按X坐标排序(从左到右)
  1709. for line in lines:
  1710. line.sort(key=lambda b: b['x_min'])
  1711. # 生成文本,根据行间距判断段落分割
  1712. result_lines = []
  1713. prev_line_y = None
  1714. prev_line_height = None
  1715. for line_idx, line in enumerate(lines):
  1716. # 计算当前行的Y坐标和高度
  1717. line_y_min = min(b['y_min'] for b in line)
  1718. line_y_max = max(b['y_max'] for b in line)
  1719. line_height = line_y_max - line_y_min
  1720. line_center_y = (line_y_min + line_y_max) / 2
  1721. # 拼接当前行的文本
  1722. # 对于表格数据,使用制表符分隔;对于普通文本,使用空格
  1723. line_text = ""
  1724. prev_x_max = None
  1725. # 判断是否是表格行(如果一行中有多个文本块且X坐标分布较均匀)
  1726. is_table_row = len(line) > 2
  1727. for block in line:
  1728. if prev_x_max is not None:
  1729. x_gap = block['x_min'] - prev_x_max
  1730. # 如果间距较大,添加分隔符
  1731. if x_gap > avg_height * 0.3:
  1732. if is_table_row:
  1733. # 表格使用制表符
  1734. line_text += "\t"
  1735. else:
  1736. # 普通文本使用空格
  1737. line_text += " "
  1738. line_text += block['text']
  1739. prev_x_max = block['x_max']
  1740. # 判断是否需要换段
  1741. if prev_line_y is not None and prev_line_height is not None:
  1742. # 计算行间距
  1743. line_gap = line_y_min - prev_line_y
  1744. gap_ratio = line_gap / prev_line_height if prev_line_height > 0 else 0
  1745. # 如果行间距大于段落阈值,添加空行
  1746. if gap_ratio > paragraph_gap_threshold:
  1747. result_lines.append("") # 空行表示段落分隔
  1748. result_lines.append(line_text)
  1749. prev_line_y = line_y_max
  1750. prev_line_height = line_height
  1751. # 合并为最终文本
  1752. result_text = "\n".join(result_lines)
  1753. logger.info(f"[OCR文本提取] 成功提取文本,共 {len(lines)} 行,{len(result_lines)} 行(含段落分隔)")
  1754. return result_text
  1755. except Exception as e:
  1756. logger.exception(f"[OCR文本提取] 处理失败: {e}")
  1757. return ""