paddleocr_fallback.py 94 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988
  1. # Copyright (c) Opendatalab. All rights reserved.
  2. """PaddleOCR备用解析模块 - 当MinerU解析结果缺失时使用"""
  3. import json
  4. import os
  5. import subprocess
  6. import sys
  7. import tempfile
  8. import time
  9. import random
  10. from pathlib import Path
  11. from typing import Dict, Any, Optional, List, Tuple
  12. import ast
  13. import re
  14. from ..utils.logging_config import get_logger
  15. from ..config import (
  16. PADDLEOCR_CMD as _PADDLEOCR_CMD,
  17. PADDLE_OCR_DEVICE as _PADDLE_OCR_DEVICE,
  18. PADDLE_OCR_DEVICES as _PADDLE_OCR_DEVICES_CONFIG,
  19. VL_REC_BACKEND,
  20. VL_REC_SERVER_URL,
  21. )
  22. logger = get_logger("pdf_converter_v2.utils.paddleocr")
  23. try:
  24. import pypdfium2 as pdfium
  25. PDFIUM_AVAILABLE = True
  26. except ImportError:
  27. PDFIUM_AVAILABLE = False
  28. logger.warning("[PaddleOCR备用] pypdfium2未安装,无法从PDF提取图片")
  29. try:
  30. from pdf2image import convert_from_path
  31. PDF2IMAGE_AVAILABLE = True
  32. except ImportError:
  33. PDF2IMAGE_AVAILABLE = False
  34. logger.warning("[PaddleOCR备用] pdf2image未安装,无法使用备用方法从PDF提取图片")
  35. try:
  36. from PIL import Image
  37. PIL_AVAILABLE = True
  38. except ImportError:
  39. PIL_AVAILABLE = False
  40. logger.warning("[PaddleOCR备用] PIL未安装,无法处理图片")
  41. def _get_paddleocr_executable() -> str:
  42. """返回 paddleocr 可执行文件路径 or 命令名,供 subprocess 使用。
  43. 当以 systemd 等方式运行时 PATH 可能不包含 venv/bin,故优先使用当前 Python 同目录下的 paddleocr。
  44. 可通过配置 PADDLEOCR_CMD 显式指定(完整路径或命令名)。"""
  45. cmd = _PADDLEOCR_CMD.strip()
  46. if cmd:
  47. return cmd
  48. # 与当前 Python 同目录(venv/bin)下的 paddleocr
  49. bin_dir = os.path.dirname(os.path.abspath(sys.executable))
  50. candidate = os.path.join(bin_dir, "paddleocr")
  51. if os.path.isfile(candidate) and os.access(candidate, os.X_OK):
  52. return candidate
  53. return "paddleocr"
# PaddleOCR inference device(s): supports a single device or multi-device round-robin.
# Single: PADDLE_OCR_DEVICE=npu:0 (or, when unset, defaults to npu:0 on NPU hosts).
# Multi:  PADDLE_OCR_DEVICES=npu:0,npu:1 — devices are handed out round-robin per request.
import threading as _threading
# Lazily populated device-list cache (filled by _get_paddle_ocr_devices).
_PADDLE_OCR_DEVICES: List[str] = []
# Round-robin cursor into _PADDLE_OCR_DEVICES (advanced by _paddle_ocr_device_args).
_PADDLE_OCR_DEVICE_INDEX: int = 0
# Guards both the lazy initialization above and the round-robin cursor.
_PADDLE_OCR_DEVICE_LOCK = _threading.Lock()
  61. def _get_paddle_ocr_devices() -> List[str]:
  62. """解析 PADDLE_OCR_DEVICES 或 PADDLE_OCR_DEVICE,返回设备列表(惰性、线程安全)。"""
  63. global _PADDLE_OCR_DEVICES
  64. with _PADDLE_OCR_DEVICE_LOCK:
  65. if _PADDLE_OCR_DEVICES:
  66. return _PADDLE_OCR_DEVICES
  67. multi = _PADDLE_OCR_DEVICES_CONFIG.strip()
  68. if multi:
  69. _PADDLE_OCR_DEVICES[:] = [d.strip() for d in multi.split(",") if d.strip()]
  70. if not _PADDLE_OCR_DEVICES:
  71. single = _PADDLE_OCR_DEVICE.strip()
  72. if not single:
  73. from .device_env import is_npu
  74. if is_npu():
  75. single = "npu:0"
  76. if single:
  77. _PADDLE_OCR_DEVICES.append(single)
  78. return _PADDLE_OCR_DEVICES
  79. def get_paddle_ocr_devices() -> List[str]:
  80. """返回 PaddleOCR 设备列表(用于单任务多卡:按页拆分后并行使用各卡)。"""
  81. return list(_get_paddle_ocr_devices())
  82. def get_paddle_ocr_device_args_for_index(device_index: int) -> list:
  83. """返回指定设备索引的 --device 参数列表;用于多卡并行时显式指定每段用哪张卡。"""
  84. devices = _get_paddle_ocr_devices()
  85. if not devices:
  86. return []
  87. device = devices[device_index % len(devices)]
  88. return ["--device", device]
# Environment for PaddleOCR subprocesses (LD_PRELOAD avoids the sklearn libgomp
# static-TLS error; PADDLE_PDX skips the model-source connectivity check).
# Lazy cache, filled once by _get_paddleocr_subprocess_env.
_PADDLEOCR_ENV: Optional[Dict[str, str]] = None
def _get_paddleocr_subprocess_env() -> Dict[str, str]:
    """Return the environment for paddleocr subprocesses (computed once, then cached).

    Adds LD_PRELOAD (libgomp) and PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK on top
    of the current process environment.
    """
    global _PADDLEOCR_ENV
    if _PADDLEOCR_ENV is not None:
        return _PADDLEOCR_ENV
    env = dict(os.environ)
    # Skip the "Checking connectivity to the model hosters" step.
    env.setdefault("PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK", "True")
    # When doc_parser loads PaddleOCR-VL, safetensors triggers
    # view_dtype(CPU, Undefined(AnyLayout), uint8), a kernel that is not registered;
    # forcing the STRIDED layout avoids:
    # RuntimeError: kernel view_dtype (CPU, Undefined(AnyLayout), uint8) is not registered
    env.setdefault("FLAGS_use_stride_kernel", "1")
    # Without LD_PRELOAD the subprocess hits sklearn/paddlex's
    # "cannot allocate memory in static TLS block" error.
    if not env.get("LD_PRELOAD"):
        preload_paths: List[str] = []
        # Prefer the system libgomp; the first existing path wins.
        for p in (
            "/usr/lib/x86_64-linux-gnu/libgomp.so.1",
            "/usr/lib/aarch64-linux-gnu/libgomp.so.1",
            "/usr/lib/libgomp.so.1",
        ):
            if os.path.isfile(p):
                preload_paths.append(p)
                break
        # libgomp bundled under scikit_learn.libs / simsimd.libs
        # (path scan only — sklearn itself is never imported here).
        for sp in getattr(sys, "path", []):
            if not sp or not os.path.isdir(sp):
                continue
            for sub in ("scikit_learn.libs", "simsimd.libs"):
                d = os.path.join(sp, sub)
                if not os.path.isdir(d):
                    continue
                for name in os.listdir(d):
                    if name.startswith("libgomp") and (name.endswith(".so") or ".so." in name):
                        preload_paths.append(os.path.join(d, name))
        # Fixed paths seen in common container images.
        for p in (
            "/usr/local/lib/python3.10/dist-packages/scikit_learn.libs/libgomp-d22c30c5.so.1.0.0",
            "/usr/local/lib/python3.10/site-packages/scikit_learn.libs/libgomp-d22c30c5.so.1.0.0",
        ):
            if os.path.isfile(p) and p not in preload_paths:
                preload_paths.append(p)
        if preload_paths:
            env["LD_PRELOAD"] = ":".join(preload_paths)
            logger.debug("[PaddleOCR] 子进程 LD_PRELOAD 已设置,避免 static TLS 报错")
    _PADDLEOCR_ENV = env
    return env
  137. def _paddle_ocr_device_args() -> list:
  138. """返回 PaddleOCR 命令的 --device 参数列表;多卡时按请求轮询。"""
  139. devices = _get_paddle_ocr_devices()
  140. if not devices:
  141. return []
  142. global _PADDLE_OCR_DEVICE_INDEX
  143. with _PADDLE_OCR_DEVICE_LOCK:
  144. idx = _PADDLE_OCR_DEVICE_INDEX % len(devices)
  145. _PADDLE_OCR_DEVICE_INDEX += 1
  146. device = devices[idx]
  147. return ["--device", device]
  148. def detect_file_type(file_path: str) -> Optional[str]:
  149. """通过文件内容(魔数)检测文件类型,不依赖扩展名
  150. Args:
  151. file_path: 文件路径
  152. Returns:
  153. 文件类型:'pdf', 'png', 'jpeg', 'jpg' 或 None
  154. """
  155. if not file_path or not os.path.exists(file_path):
  156. return None
  157. try:
  158. with open(file_path, 'rb') as f:
  159. # 读取文件头部(前16字节足够识别常见格式)
  160. header = f.read(16)
  161. if not header:
  162. return None
  163. # PDF文件:以 %PDF 开头
  164. if header.startswith(b'%PDF'):
  165. return 'pdf'
  166. # PNG图片:以 \x89PNG\r\n\x1a\n 开头
  167. if header.startswith(b'\x89PNG\r\n\x1a\n'):
  168. return 'png'
  169. # JPEG图片:以 \xff\xd8\xff 开头
  170. if header.startswith(b'\xff\xd8\xff'):
  171. return 'jpeg'
  172. # 其他格式可以继续扩展
  173. return None
  174. except Exception as e:
  175. logger.debug(f"[PaddleOCR备用] 检测文件类型失败: {e}")
  176. return None
  177. def check_json_data_completeness(json_data: Dict[str, Any], document_type: str) -> bool:
  178. """检查JSON数据是否大面积缺失
  179. Args:
  180. json_data: 解析后的JSON数据
  181. document_type: 文档类型
  182. Returns:
  183. True表示数据完整,False表示数据缺失
  184. """
  185. if not json_data or "data" not in json_data:
  186. return False
  187. data = json_data["data"]
  188. # 根据文档类型检查关键字段
  189. if document_type == "noiseMonitoringRecord":
  190. # 检查噪声检测记录的关键字段(不包括noise数组,noise数组由表格解析生成,不依赖OCR)
  191. required_fields = ["project", "standardReferences", "soundLevelMeterMode", "soundCalibratorMode"]
  192. missing_count = sum(1 for field in required_fields if not data.get(field))
  193. # 如果超过一半的关键字段缺失,认为数据缺失
  194. if missing_count >= len(required_fields) / 2:
  195. logger.warning(f"[数据完整性检查] 关键字段缺失过多: {missing_count}/{len(required_fields)}")
  196. return False
  197. # 检查天气字段是否异常(例如解析成“天气”标签或风向全部缺失)
  198. weather_list = data.get("weather") or []
  199. if weather_list:
  200. weather_label_tokens = {"天气", "天气状况", "天气情况"}
  201. has_label_as_value = any(
  202. (item.get("weather") or "").strip() in weather_label_tokens for item in weather_list
  203. )
  204. all_wind_direction_missing = all(
  205. not (item.get("windDirection") or "").strip() for item in weather_list
  206. )
  207. if has_label_as_value:
  208. logger.warning("[数据完整性检查] 天气字段疑似被解析为标签,触发备用解析")
  209. return False
  210. if all_wind_direction_missing:
  211. logger.warning("[数据完整性检查] 风向字段全部缺失,触发备用解析")
  212. return False
  213. return True
  214. elif document_type == "electromagneticTestRecord":
  215. # 检查电磁检测记录的关键字段
  216. # 区分必需字段和可选字段:
  217. # - deviceName 和 deviceMode 是必需字段(仪器信息)
  218. # - project 和 standardReferences 可能为空(某些文档可能没有填写)
  219. required_fields = ["deviceName", "deviceMode"] # 必需字段
  220. optional_fields = ["project", "standardReferences"] # 可选字段
  221. # 检查必需字段
  222. missing_required = sum(1 for field in required_fields if not data.get(field) or not str(data.get(field)).strip())
  223. # 检查可选字段(如果所有可选字段都为空,也算缺失)
  224. missing_optional = sum(1 for field in optional_fields if not data.get(field) or not str(data.get(field)).strip())
  225. # 检查电磁数据
  226. em_list = data.get("electricMagnetic", [])
  227. if len(em_list) == 0:
  228. logger.warning("[数据完整性检查] 电磁数据列表为空")
  229. return False
  230. # 如果必需字段缺失,认为数据不完整
  231. if missing_required > 0:
  232. logger.warning(f"[数据完整性检查] 必需字段缺失: {missing_required}/{len(required_fields)} (deviceName, deviceMode)")
  233. return False
  234. # 如果所有字段(必需+可选)都缺失,也认为数据不完整
  235. if missing_required + missing_optional >= len(required_fields) + len(optional_fields):
  236. logger.warning(f"[数据完整性检查] 所有关键字段都缺失: {missing_required + missing_optional}/{len(required_fields) + len(optional_fields)}")
  237. return False
  238. # 检查project和address字段:如果project为空且所有address都为空,说明minerU和Paddle doc_parser都丢失了,需要运行Paddle OCR
  239. project_empty = not data.get("project") or not str(data.get("project")).strip()
  240. if project_empty:
  241. # 检查所有电磁数据项的address字段是否都为空
  242. all_address_empty = True
  243. for em_item in em_list:
  244. address = em_item.get("address", "")
  245. if address and str(address).strip():
  246. all_address_empty = False
  247. break
  248. if all_address_empty:
  249. logger.warning("[数据完整性检查] project为空且所有address字段都为空,说明minerU和Paddle doc_parser都丢失了,需要运行Paddle OCR")
  250. return False
  251. return True
  252. elif document_type == "operatingConditionInfo":
  253. # 检查工况信息
  254. op_list = data.get("operationalConditions", [])
  255. if len(op_list) == 0:
  256. logger.warning("[数据完整性检查] 工况信息列表为空")
  257. return False
  258. return True
  259. # 未知类型,默认认为完整
  260. return True
  261. def parse_paddleocr_output(output_text: str) -> Dict[str, Any]:
  262. """解析paddleocr的输出文本
  263. Args:
  264. output_text: paddleocr命令的输出文本
  265. Returns:
  266. 解析后的字典,包含parsing_res_list
  267. """
  268. try:
  269. # 清理输出文本,移除可能的额外空白
  270. output_text = output_text.strip()
  271. # 尝试直接eval(因为输出是Python字典格式)
  272. # 先处理np.float32等numpy类型
  273. output_text = output_text.replace('np.float32', 'float')
  274. output_text = output_text.replace('np.int32', 'int')
  275. output_text = output_text.replace('np.int64', 'int')
  276. # 尝试使用ast.literal_eval安全解析
  277. try:
  278. result = ast.literal_eval(output_text)
  279. except (ValueError, SyntaxError):
  280. # 如果literal_eval失败,尝试使用eval(不推荐,但paddleocr输出可能需要)
  281. logger.warning("[PaddleOCR解析] literal_eval失败,尝试使用eval")
  282. # 创建一个安全的eval环境
  283. safe_dict = {"__builtins__": {}}
  284. result = eval(output_text, safe_dict)
  285. if isinstance(result, dict):
  286. # 检查是否有res键
  287. if "res" in result:
  288. parsing_res_list = result.get("res", {}).get("parsing_res_list", [])
  289. return {"parsing_res_list": parsing_res_list}
  290. # 也可能直接包含parsing_res_list
  291. elif "parsing_res_list" in result:
  292. return {"parsing_res_list": result.get("parsing_res_list", [])}
  293. return {"parsing_res_list": []}
  294. except Exception as e:
  295. logger.error(f"[PaddleOCR解析] 解析输出失败: {e}")
  296. logger.debug(f"[PaddleOCR解析] 输出内容: {output_text[:500]}")
  297. return {"parsing_res_list": []}
  298. def paddleocr_to_markdown(paddleocr_result: Dict[str, Any]) -> str:
  299. """将paddleocr的解析结果转换为markdown格式
  300. Args:
  301. paddleocr_result: paddleocr解析结果
  302. Returns:
  303. markdown格式的文本
  304. """
  305. markdown_parts = []
  306. parsing_res_list = paddleocr_result.get("parsing_res_list", [])
  307. for item in parsing_res_list:
  308. block_label = item.get("block_label", "")
  309. block_content = item.get("block_content", "")
  310. if block_label == "table":
  311. # 表格直接使用HTML格式
  312. markdown_parts.append(block_content)
  313. elif block_label in ["header", "title", "figure_title"]:
  314. # 标题使用markdown标题格式
  315. markdown_parts.append(f"# {block_content}")
  316. elif block_label == "text":
  317. # 普通文本
  318. markdown_parts.append(block_content)
  319. else:
  320. # 其他类型直接添加内容
  321. markdown_parts.append(block_content)
  322. return "\n\n".join(markdown_parts)
def call_paddleocr(image_path: str) -> Optional[Dict[str, Any]]:
    """Run `paddleocr doc_parser` on an image via subprocess.

    Args:
        image_path: path of the image to parse
    Returns:
        On success either {"markdown_content": str} (read from the saved .md
        file) or {"parsing_res_list": [...]} (parsed from stdout); None on failure.
    """
    try:
        if not os.path.exists(image_path):
            logger.error(f"[PaddleOCR] 图片文件不存在: {image_path}")
            return None
        # Derive the output directory and base name for --save_path.
        image_dir = os.path.dirname(image_path)
        image_basename = os.path.splitext(os.path.basename(image_path))[0]
        save_path_base = os.path.join(image_dir, image_basename)
        # Build the doc_parser command (on NPU --device npu:0 is required,
        # otherwise it runs on CPU and tends to segfault).
        # PaddleOCR creates a directory under save_path and writes files inside it.
        cmd = [
            _get_paddleocr_executable(), "doc_parser", "-i", image_path,
            "--precision", "fp32",
            "--use_doc_unwarping", "False",
            "--use_doc_orientation_classify", "True",
            "--use_chart_recognition", "True",
            "--save_path", save_path_base
        ] + _paddle_ocr_device_args()
        # Optional VL recognition backend configuration.
        if VL_REC_BACKEND:
            cmd.extend(["--vl_rec_backend", VL_REC_BACKEND])
        if VL_REC_SERVER_URL:
            cmd.extend(["--vl_rec_server_url", VL_REC_SERVER_URL])
        # Historical GPU memory tuning, kept for reference:
        # env = os.environ.copy()
        # env["FLAGS_fraction_of_gpu_memory_to_use"] = "0.3"  # use only 30% of GPU memory
        # env["FLAGS_allocator_strategy"] = "auto_growth"  # grow allocations on demand
        logger.info(f"[PaddleOCR] 执行命令: {' '.join(cmd)}")
        # env carries LD_PRELOAD and PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK
        # (avoids the static-TLS error and the model-source connectivity check).
        result = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            timeout=300,  # 5-minute timeout
            check=False,
            env=_get_paddleocr_subprocess_env(),
        )
        if result.returncode != 0:
            logger.error(f"[PaddleOCR] 命令执行失败,返回码: {result.returncode}")
            # Known doc_parser issue: PP-DocLayoutV3 returns 3 values while the
            # pipeline unpacks 2 -> "too many values to unpack (expected 2)".
            if result.stderr and ("too many values to unpack" in result.stderr or "Exception from the 'cv' worker" in result.stderr):
                logger.warning(
                    "[PaddleOCR] doc_parser 报 cv worker 解包错误,多为 PaddleX 与 PP-DocLayoutV3 不兼容。"
                    " 可尝试: pip install -U paddlex;或仅需文字时改用 ocr 模式。详见 README_STARTUP.md。"
                )
            # Log full stderr for troubleshooting (NPU init logs are long; the
            # real error is usually at the end).
            if result.stderr:
                logger.error(f"[PaddleOCR] stderr: {result.stderr}")
            if result.stdout:
                logger.error(f"[PaddleOCR] stdout(末 2000 字符): {result.stdout[-2000:] if len(result.stdout) > 2000 else result.stdout}")
            return None
        # Read the result from the saved Markdown file.
        # PaddleOCR creates a directory at save_path; the file is {save_path}/{basename}.md.
        md_file = os.path.join(save_path_base, f"{image_basename}.md")
        if os.path.exists(md_file):
            logger.info(f"[PaddleOCR] 从Markdown文件读取结果: {md_file}")
            try:
                with open(md_file, 'r', encoding='utf-8') as f:
                    markdown_content = f.read()
                if markdown_content.strip():
                    # For compatibility with existing callers the markdown is
                    # returned as-is under a special key; the caller decides how
                    # to convert it (rather than rebuilding parsing_res_list here).
                    logger.info(f"[PaddleOCR] 成功读取Markdown文件,内容长度: {len(markdown_content)} 字符")
                    return {"markdown_content": markdown_content}
                else:
                    logger.warning("[PaddleOCR] Markdown文件内容为空")
            except Exception as e:
                logger.exception(f"[PaddleOCR] 读取Markdown文件失败: {e}")
        else:
            logger.warning(f"[PaddleOCR] Markdown文件不存在: {md_file}")
        # Fallback: parse stdout when the Markdown file is missing or unreadable.
        output_text = result.stdout.strip()
        if output_text:
            logger.info("[PaddleOCR] 从stdout解析输出")
            parsed_result = parse_paddleocr_output(output_text)
            logger.info(f"[PaddleOCR] 解析成功,获得 {len(parsed_result.get('parsing_res_list', []))} 个区块")
            return parsed_result
        else:
            logger.warning("[PaddleOCR] stdout输出为空,且未找到Markdown文件")
            return None
    except subprocess.TimeoutExpired:
        logger.error("[PaddleOCR] 命令执行超时")
        return None
    except Exception as e:
        logger.exception(f"[PaddleOCR] 调用失败: {e}")
        return None
def extract_first_page_from_pdf(pdf_path: str, output_dir: str) -> Optional[str]:
    """Extract the first page of a PDF as a PNG image.

    pypdfium2 is preferred; pdf2image is used as the fallback when pypdfium2 is
    unavailable or fails.

    Args:
        pdf_path: PDF file path
        output_dir: directory where the extracted image is written
    Returns:
        Path of the extracted image, or None on failure.
    """
    if not PIL_AVAILABLE:
        logger.error("[PaddleOCR备用] 缺少必要的库(PIL/Pillow),无法处理图片")
        return None
    if not os.path.exists(pdf_path):
        logger.error(f"[PaddleOCR备用] PDF文件不存在: {pdf_path}")
        return None
    # Method 1: pypdfium2 (preferred).
    if PDFIUM_AVAILABLE:
        try:
            pdf = pdfium.PdfDocument(pdf_path)
            try:
                if len(pdf) == 0:
                    logger.error("[PaddleOCR备用] PDF文件为空")
                    return None
                page = pdf[0]
                bitmap = page.render(scale=150 / 72)  # 150 DPI
                pil_image = bitmap.to_pil()
                os.makedirs(output_dir, exist_ok=True)
                # Timestamp + random suffix keeps concurrent extractions from colliding.
                image_filename = f"paddleocr_fallback_page0_{int(time.time() * 1000)}_{random.randint(1000, 9999)}.png"
                image_path = os.path.join(output_dir, image_filename)
                pil_image.save(image_path, "PNG", optimize=True, compress_level=6)
                logger.info(f"[PaddleOCR备用] 使用pypdfium2从PDF提取第一页图片: {image_path}")
                bitmap.close()
                return image_path
            finally:
                # Always release the document handle, even on failure.
                try:
                    pdf.close()
                except Exception:
                    pass
        except Exception as e:
            logger.warning(f"[PaddleOCR备用] 使用pypdfium2提取图片失败,尝试pdf2image: {e}")
    # Method 2: pdf2image as the fallback.
    if PDF2IMAGE_AVAILABLE:
        try:
            images = convert_from_path(pdf_path, dpi=150, first_page=1, last_page=1)
            if not images:
                logger.error("[PaddleOCR备用] pdf2image未能提取到图片")
                return None
            os.makedirs(output_dir, exist_ok=True)
            image_filename = f"paddleocr_fallback_page0_{int(time.time() * 1000)}_{random.randint(1000, 9999)}.png"
            image_path = os.path.join(output_dir, image_filename)
            images[0].save(image_path, "PNG", optimize=True, compress_level=6)
            logger.info(f"[PaddleOCR备用] 使用pdf2image从PDF提取第一页图片: {image_path}")
            return image_path
        except Exception as e:
            logger.exception(f"[PaddleOCR备用] 使用pdf2image提取图片失败: {e}")
    # Neither extraction library is usable: report what is missing.
    missing_libs = []
    if not PDFIUM_AVAILABLE:
        missing_libs.append("pypdfium2")
    if not PDF2IMAGE_AVAILABLE:
        missing_libs.append("pdf2image")
    logger.error(
        f"[PaddleOCR备用] 缺少必要的库({'或'.join(missing_libs)}),无法从PDF提取图片。请安装: pip install {' '.join(missing_libs)}"
    )
    return None
  486. def find_pdf_file(output_dir: str) -> Optional[str]:
  487. """在输出目录中查找PDF文件
  488. Args:
  489. output_dir: 输出目录
  490. Returns:
  491. PDF文件路径,如果未找到返回None
  492. """
  493. if not os.path.exists(output_dir):
  494. return None
  495. # 查找PDF文件
  496. pdf_files = list(Path(output_dir).rglob("*.pdf"))
  497. if pdf_files:
  498. # 返回第一个找到的PDF文件
  499. return str(pdf_files[0])
  500. return None
  501. def markdown_to_plain_text(markdown_content: str) -> List[str]:
  502. """将Markdown内容转换为纯文本列表(按行分割)
  503. Args:
  504. markdown_content: Markdown格式的文本
  505. Returns:
  506. 纯文本列表,每行一个元素
  507. """
  508. if not markdown_content:
  509. return []
  510. lines = []
  511. in_code_block = False
  512. # 先处理HTML表格:提取整个表格,转换为文本行
  513. # 查找所有<table>...</table>块
  514. table_pattern = r'<table[^>]*>.*?</table>'
  515. tables = re.findall(table_pattern, markdown_content, re.DOTALL)
  516. # 将表格内容替换为占位符,稍后处理
  517. table_placeholders = []
  518. for i, table in enumerate(tables):
  519. placeholder = f"__TABLE_PLACEHOLDER_{i}__"
  520. table_placeholders.append((placeholder, table))
  521. markdown_content = markdown_content.replace(table, placeholder, 1)
  522. # 处理每一行
  523. for line in markdown_content.split('\n'):
  524. line = line.rstrip() # 只移除右侧空格
  525. # 检测代码块
  526. if line.strip().startswith('```'):
  527. in_code_block = not in_code_block
  528. continue
  529. if in_code_block:
  530. # 代码块内的内容保留原样
  531. if line.strip():
  532. lines.append(line)
  533. continue
  534. # 处理表格占位符
  535. if '__TABLE_PLACEHOLDER_' in line:
  536. # 找到对应的表格
  537. for placeholder, table_html in table_placeholders:
  538. if placeholder in line:
  539. # 提取表格中的所有单元格文本
  540. table_lines = extract_table_text(table_html)
  541. lines.extend(table_lines)
  542. break
  543. continue
  544. # 检测Markdown表格(以 | 开头)
  545. if '|' in line and line.strip().startswith('|'):
  546. # 处理表格行:移除首尾的 |,分割单元格
  547. cells = [cell.strip() for cell in line.split('|') if cell.strip()]
  548. # 移除表格分隔行(只包含 - 和 |)
  549. if all(c in ['-', ':', ' '] for c in ''.join(cells)):
  550. continue
  551. # 合并单元格内容,用空格分隔
  552. table_line = ' '.join(cells)
  553. if table_line.strip():
  554. lines.append(table_line)
  555. continue
  556. # 移除Markdown语法标记
  557. # 移除标题标记 (# ## ### 等)
  558. line = re.sub(r'^#+\s*', '', line)
  559. # 移除列表标记 (- * + 等)
  560. line = re.sub(r'^[-*+]\s+', '', line)
  561. # 移除数字列表标记
  562. line = re.sub(r'^\d+\.\s+', '', line)
  563. # 移除粗体和斜体标记
  564. line = re.sub(r'\*\*([^*]+)\*\*', r'\1', line) # **bold**
  565. line = re.sub(r'\*([^*]+)\*', r'\1', line) # *italic*
  566. line = re.sub(r'__([^_]+)__', r'\1', line) # __bold__
  567. line = re.sub(r'_([^_]+)_', r'\1', line) # _italic_
  568. # 移除链接格式 [text](url) -> text
  569. line = re.sub(r'\[([^\]]+)\]\([^\)]+\)', r'\1', line)
  570. # 移除图片格式 ![alt](url) -> alt
  571. line = re.sub(r'!\[([^\]]*)\]\([^\)]+\)', r'\1', line)
  572. # 移除行内代码标记
  573. line = re.sub(r'`([^`]+)`', r'\1', line)
  574. # 移除HTML标签(div、span等)
  575. line = re.sub(r'<div[^>]*>', '', line)
  576. line = re.sub(r'</div>', '', line)
  577. line = re.sub(r'<span[^>]*>', '', line)
  578. line = re.sub(r'</span>', '', line)
  579. line = re.sub(r'<[^>]+>', '', line) # 移除其他HTML标签
  580. # 清理多余空格
  581. line = line.strip()
  582. if line: # 只保留非空行
  583. lines.append(line)
  584. return lines
  585. def extract_table_text(table_html: str) -> List[str]:
  586. """从HTML表格中提取文本,每行一个元素
  587. Args:
  588. table_html: HTML表格字符串
  589. Returns:
  590. 文本行列表
  591. """
  592. table_lines = []
  593. try:
  594. # 提取所有<tr>标签
  595. tr_pattern = r'<tr[^>]*>(.*?)</tr>'
  596. tr_matches = re.findall(tr_pattern, table_html, re.DOTALL)
  597. for tr_content in tr_matches:
  598. # 提取所有<td>和<th>标签内的文本
  599. cell_pattern = r'<(?:td|th)[^>]*>(.*?)</(?:td|th)>'
  600. cells = re.findall(cell_pattern, tr_content, re.DOTALL)
  601. if cells:
  602. # 清理每个单元格的文本
  603. cleaned_cells = []
  604. for cell in cells:
  605. # 移除嵌套的HTML标签
  606. cleaned = re.sub(r'<[^>]+>', '', cell)
  607. # 移除HTML实体
  608. cleaned = cleaned.replace('&nbsp;', ' ')
  609. cleaned = cleaned.strip()
  610. if cleaned:
  611. cleaned_cells.append(cleaned)
  612. if cleaned_cells:
  613. # 合并单元格内容,用空格分隔
  614. table_line = ' '.join(cleaned_cells)
  615. if table_line.strip():
  616. table_lines.append(table_line)
  617. except Exception as e:
  618. logger.warning(f"[Markdown转换] 提取表格文本失败: {e}")
  619. return table_lines
def call_paddleocr_ocr(image_path: str, save_path: str) -> tuple[Optional[List[str]], Optional[str]]:
    """Run `paddleocr ocr` on an image and extract the recognized text (API use).

    Args:
        image_path: image file path
        save_path: directory where paddleocr writes its result JSON
    Returns:
        (list of recognized text fragments, result-JSON path); (None, None) on
        failure, (None, json_path) when the JSON exists but yields no text.
    """
    try:
        if not os.path.exists(image_path):
            logger.error(f"[PaddleOCR OCR] 图片文件不存在: {image_path}")
            return None, None
        # Build the ocr command (on NPU --device npu:0 is required, otherwise
        # it runs on CPU and tends to segfault).
        # NOTE: the ocr subcommand does not accept --vl_rec_backend or other VL options.
        cmd = [_get_paddleocr_executable(), "ocr", "-i", image_path, "--save_path", save_path] + _paddle_ocr_device_args()
        logger.info(f"[PaddleOCR OCR] 执行命令: {' '.join(cmd)}")
        # env carries LD_PRELOAD and PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK.
        result = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            timeout=300,  # 5-minute timeout
            check=False,
            env=_get_paddleocr_subprocess_env(),
        )
        if result.returncode != 0:
            logger.error(f"[PaddleOCR OCR] 命令执行失败,返回码: {result.returncode}")
            logger.error(f"[PaddleOCR OCR] 错误输出: {result.stderr}")
            return None, None
        # The ocr command writes {basename}_res.json under save_path.
        image_basename = os.path.splitext(os.path.basename(image_path))[0]
        json_file = os.path.join(save_path, f"{image_basename}_res.json")
        if not os.path.exists(json_file):
            logger.warning(f"[PaddleOCR OCR] JSON文件不存在: {json_file}")
            return None, None
        try:
            with open(json_file, 'r', encoding='utf-8') as f:
                ocr_data = json.load(f)
            # Preferred field: rec_texts (plain recognized strings).
            if "rec_texts" in ocr_data and isinstance(ocr_data["rec_texts"], list):
                texts = ocr_data["rec_texts"]
                logger.info(f"[PaddleOCR OCR] 成功提取 {len(texts)} 个文本片段(从rec_texts)")
                return texts, json_file
            # Fallback: collect block_content entries from parsing_res_list.
            if "parsing_res_list" in ocr_data and isinstance(ocr_data["parsing_res_list"], list):
                texts = []
                for item in ocr_data["parsing_res_list"]:
                    if isinstance(item, dict) and "block_content" in item:
                        block_content = item["block_content"]
                        if block_content and block_content.strip():
                            # Multi-line content is split into individual lines.
                            if "\n" in block_content:
                                texts.extend([line.strip() for line in block_content.split("\n") if line.strip()])
                            else:
                                texts.append(block_content.strip())
                if texts:
                    logger.info(f"[PaddleOCR OCR] 成功提取 {len(texts)} 个文本片段(从parsing_res_list)")
                    return texts, json_file
            logger.warning("[PaddleOCR OCR] JSON文件中未找到rec_texts或parsing_res_list字段")
            return None, json_file
        except Exception as e:
            logger.exception(f"[PaddleOCR OCR] 读取JSON文件失败: {e}")
            return None, json_file
    except subprocess.TimeoutExpired:
        logger.error("[PaddleOCR OCR] 命令执行超时")
        return None, None
    except Exception as e:
        logger.exception(f"[PaddleOCR OCR] 调用失败: {e}")
        return None, None
  691. def call_paddleocr_doc_parser_for_text(image_path: str, save_path: str) -> tuple[Optional[List[str]], Optional[str]]:
  692. """调用paddleocr doc_parser命令,将markdown转换为纯文本(用于内部调用提取关键词)
  693. Args:
  694. image_path: 图片路径
  695. save_path: 保存路径(目录)
  696. Returns:
  697. (纯文本列表(按行分割), markdown文件路径),如果失败返回(None, None)
  698. """
  699. try:
  700. if not os.path.exists(image_path):
  701. logger.error(f"[PaddleOCR DocParser] 图片文件不存在: {image_path}")
  702. return None, None
  703. # 生成输出目录和基础文件名
  704. image_dir = os.path.dirname(image_path)
  705. image_basename = os.path.splitext(os.path.basename(image_path))[0]
  706. save_path_base = os.path.join(save_path, image_basename)
  707. os.makedirs(save_path_base, exist_ok=True)
  708. # 构建paddleocr doc_parser命令(NPU 下需加 --device npu:0,否则走 CPU 易段错误)
  709. cmd = [
  710. _get_paddleocr_executable(), "doc_parser", "-i", image_path,
  711. "--precision", "fp32",
  712. "--use_doc_unwarping", "False",
  713. "--use_doc_orientation_classify", "True",
  714. "--use_chart_recognition", "True",
  715. "--save_path", save_path_base
  716. ] + _paddle_ocr_device_args()
  717. # 添加 VL 识别后端配置(如果已配置)
  718. if VL_REC_BACKEND:
  719. cmd.extend(["--vl_rec_backend", VL_REC_BACKEND])
  720. if VL_REC_SERVER_URL:
  721. cmd.extend(["--vl_rec_server_url", VL_REC_SERVER_URL])
  722. logger.info(f"[PaddleOCR DocParser] 执行命令: {' '.join(cmd)}")
  723. # 执行命令(env 含 LD_PRELOAD 与 PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK)
  724. result = subprocess.run(
  725. cmd,
  726. capture_output=True,
  727. text=True,
  728. timeout=300, # 5分钟超时
  729. check=False,
  730. env=_get_paddleocr_subprocess_env(),
  731. )
  732. if result.returncode != 0:
  733. logger.error(f"[PaddleOCR DocParser] 命令执行失败,返回码: {result.returncode}")
  734. if result.stderr and ("too many values to unpack" in result.stderr or "Exception from the 'cv' worker" in result.stderr):
  735. logger.warning(
  736. "[PaddleOCR DocParser] 报 cv worker 解包错误,多为 PaddleX 与 PP-DocLayoutV3 不兼容。"
  737. " 可尝试: pip install -U paddlex;或改用 ocr 模式提取文字。详见 README_STARTUP.md。"
  738. )
  739. logger.error(f"[PaddleOCR DocParser] 错误输出: {result.stderr}")
  740. return None, None
  741. # 查找保存的Markdown文件
  742. # PaddleOCR会在save_path下创建目录,文件路径为: {save_path}/{basename}.md
  743. md_file = os.path.join(save_path_base, f"{image_basename}.md")
  744. # 也可能在子目录中
  745. if not os.path.exists(md_file):
  746. md_files = sorted(Path(save_path_base).rglob("*.md"))
  747. if md_files:
  748. md_file = str(md_files[0])
  749. logger.info(f"[PaddleOCR DocParser] 在子目录中找到Markdown文件: {md_file}")
  750. if not os.path.exists(md_file):
  751. logger.warning(f"[PaddleOCR DocParser] Markdown文件不存在: {md_file}")
  752. return None, None
  753. # 读取Markdown文件并转换为纯文本
  754. try:
  755. with open(md_file, 'r', encoding='utf-8') as f:
  756. markdown_content = f.read()
  757. if not markdown_content.strip():
  758. logger.warning("[PaddleOCR DocParser] Markdown文件内容为空")
  759. return [], md_file
  760. # 将Markdown转换为纯文本列表
  761. plain_text_lines = markdown_to_plain_text(markdown_content)
  762. logger.info(f"[PaddleOCR DocParser] 成功提取 {len(plain_text_lines)} 行纯文本,Markdown文件: {md_file}")
  763. return plain_text_lines, md_file
  764. except Exception as e:
  765. logger.exception(f"[PaddleOCR DocParser] 读取Markdown文件失败: {e}")
  766. return None, md_file
  767. except subprocess.TimeoutExpired:
  768. logger.error("[PaddleOCR DocParser] 命令执行超时")
  769. return None, None
  770. except Exception as e:
  771. logger.exception(f"[PaddleOCR DocParser] 调用失败: {e}")
  772. return None, None
def extract_keywords_from_ocr_texts(ocr_texts: List[str]) -> Dict[str, Any]:
    """Extract key report fields from a list of OCR text fragments.

    Args:
        ocr_texts: Text fragments as recognized by OCR, in reading order.

    Returns:
        Dict with project name, standard references, instrument models,
        calibration values, a ``weather_info`` list of per-date records, and
        an ``address_mapping`` dict (monitoring-point code -> address, used
        for EMF measurement records).
    """
    keywords: Dict[str, Any] = {
        "project": "",
        "standardReferences": "",
        "soundLevelMeterMode": "",
        "soundCalibratorMode": "",
        "calibrationValueBefore": "",
        "calibrationValueAfter": "",
        "weather_info": [],  # weather-related records
        "address_mapping": {}  # code -> address, for EMF measurement records
    }
    if not ocr_texts:
        return keywords
    # Join all fragments for whole-text regex matching.
    full_text = " ".join(ocr_texts)
    # --- project name ---
    # First try the inline "项目名称:" form.
    # NOTE(review): the [^...] parts here are character classes, so matching
    # stops at any single one of those characters, not at the whole phrases —
    # looks intentional as a heuristic, but confirm.
    project_match = re.search(r'项目名称[：:]([^检测依据声级计声校准器检测前检测后气象条件日期]+)', full_text)
    if project_match:
        project = project_match.group(1).strip()
        # Trim anything belonging to the following field.
        project = re.sub(r'检测依据.*$', '', project).strip()
        keywords["project"] = project
        logger.debug(f"[关键词提取] 提取到项目名称: {project}")
    else:
        # Otherwise locate the "项目名称" label and inspect nearby fragments.
        for i, text in enumerate(ocr_texts):
            if "项目名称" in text:
                # The value may sit in the same fragment after a colon.
                if ":" in text or "：" in text:
                    project_match = re.search(r'项目名称[：:]([^检测依据声级计声校准器检测前检测后气象条件日期]+)', text)
                    if project_match:
                        project = project_match.group(1).strip()
                        project = re.sub(r'检测依据.*$', '', project).strip()
                        if project:
                            keywords["project"] = project
                            logger.debug(f"[关键词提取] 从当前文本提取到项目名称: {project}")
                            break
                # Bare label: the value is expected in the next fragment(s).
                elif text.strip() == "项目名称" or text.strip().startswith("项目名称"):
                    # Scan the next couple of fragments for the value.
                    for j in range(i + 1, min(i + 3, len(ocr_texts))):
                        next_text = ocr_texts[j].strip()
                        # Skip other field labels; a fragment with Chinese
                        # characters is a candidate project name.
                        if next_text and not re.match(r'^(检测依据|监测依据|检查依据|声级计|声校准器|检测前|检测后|气象条件|日期)', next_text):
                            # Project names are normally Chinese text.
                            if re.search(r'[\u4e00-\u9fa5]', next_text):
                                # Cut off at the next field label if present.
                                project = re.sub(r'(检测依据|监测依据|检查依据).*$', '', next_text).strip()
                                if project:
                                    keywords["project"] = project
                                    logger.debug(f"[关键词提取] 从后续文本提取到项目名称: {project}")
                                    break
                    if keywords["project"]:
                        break
    # --- testing standard ---
    standard_match = re.search(r'检测依据[：:]([^声级计声校准器检测前检测后气象条件日期]+)', full_text)
    if standard_match:
        standard = standard_match.group(1).strip()
        # Prefer explicit GB standard numbers when present.
        gb_standards = re.findall(r'GB\s*\d+[-\.]?\d*[-\.]?\d*', standard)
        if gb_standards:
            keywords["standardReferences"] = " ".join(gb_standards)
        else:
            keywords["standardReferences"] = standard.replace("□其他:", "").strip()
        logger.debug(f"[关键词提取] 提取到检测依据: {keywords['standardReferences']}")
    # --- sound level meter model/serial ---
    sound_meter_match = re.search(r'声级计型号[/：:]?(?:编号)?[：:]\s*([A-Z0-9+/]+)', full_text)
    if sound_meter_match:
        keywords["soundLevelMeterMode"] = sound_meter_match.group(1).strip()
        logger.debug(f"[关键词提取] 提取到声级计型号: {keywords['soundLevelMeterMode']}")
    # --- sound calibrator model/serial ---
    calibrator_match = re.search(r'声校准器型号[/：:]?(?:编号)?[：:]\s*([A-Z0-9+/]+)', full_text)
    if calibrator_match:
        keywords["soundCalibratorMode"] = calibrator_match.group(1).strip()
        logger.debug(f"[关键词提取] 提取到声校准器型号: {keywords['soundCalibratorMode']}")
    # --- calibration values ---
    # Strategy: match by field label first; otherwise fall back to order of
    # appearance (first dB(A) value = before, second = after).
    before_cal_found = False
    after_cal_found = False
    # Pass 1: label-based matching.
    for i, text in enumerate(ocr_texts):
        if "检测前校准值" in text and not before_cal_found:
            # Value in the same fragment, e.g. "检测前校准值:93.8 dB(A)".
            before_cal_match = re.search(r'检测前校准值[：:]\s*([0-9.]+)\s*dB[（(]?A[）)]?', text)
            if before_cal_match:
                cal_value = before_cal_match.group(1).strip()
                keywords["calibrationValueBefore"] = f"{cal_value} dB(A)"
                logger.debug(f"[关键词提取] 提取到检测前校准值: {keywords['calibrationValueBefore']}")
                before_cal_found = True
                continue
            # Bare label ("检测前校准值:"): look at the following fragments.
            elif re.search(r'检测前校准值[：:]\s*$', text) or (text.strip() == "检测前校准值："):
                # Scan up to three fragments ahead for a dB(A) value.
                for j in range(i + 1, min(i + 4, len(ocr_texts))):
                    next_text = ocr_texts[j]
                    # e.g. "93.8dB(A)"
                    db_match = re.search(r'([0-9.]+)\s*dB[（(]?A[）)]?', next_text)
                    if db_match:
                        cal_value = db_match.group(1).strip()
                        keywords["calibrationValueBefore"] = f"{cal_value} dB(A)"
                        logger.debug(f"[关键词提取] 从相邻文本提取到检测前校准值: {keywords['calibrationValueBefore']}")
                        before_cal_found = True
                        break
                if before_cal_found:
                    continue
        if "检测后校准值" in text and not after_cal_found:
            # Value in the same fragment, after the label
            # (e.g. "检测后校准值:93.8 dB(A)").
            after_cal_match = re.search(r'检测后校准值[：:]\s*([0-9.]+)\s*dB[（(]?A[）)]?', text)
            if after_cal_match:
                cal_value = after_cal_match.group(1).strip()
                keywords["calibrationValueAfter"] = f"{cal_value} dB(A)"
                logger.debug(f"[关键词提取] 提取到检测后校准值: {keywords['calibrationValueAfter']}")
                after_cal_found = True
                continue
            # Value BEFORE the label, e.g. "93.8dB(A)检测后校准值:".
            elif re.search(r'([0-9.]+)\s*dB[（(]?A[）)]?\s*检测后校准值', text):
                db_match = re.search(r'([0-9.]+)\s*dB[（(]?A[）)]?', text)
                if db_match:
                    cal_value = db_match.group(1).strip()
                    keywords["calibrationValueAfter"] = f"{cal_value} dB(A)"
                    logger.debug(f"[关键词提取] 从同一文本提取到检测后校准值: {keywords['calibrationValueAfter']}")
                    after_cal_found = True
                    continue
            # Bare label: look at the following fragments.
            elif re.search(r'检测后校准值[：:]\s*$', text) or (text.strip() == "检测后校准值："):
                # Scan up to three fragments ahead for a dB(A) value.
                for j in range(i + 1, min(i + 4, len(ocr_texts))):
                    next_text = ocr_texts[j]
                    db_match = re.search(r'([0-9.]+)\s*dB[（(]?A[）)]?', next_text)
                    if db_match:
                        cal_value = db_match.group(1).strip()
                        keywords["calibrationValueAfter"] = f"{cal_value} dB(A)"
                        logger.debug(f"[关键词提取] 从相邻文本提取到检测后校准值: {keywords['calibrationValueAfter']}")
                        after_cal_found = True
                        break
                if after_cal_found:
                    continue
    # Pass 2: positional fallback — first dB(A) is "before", second is "after".
    if not before_cal_found or not after_cal_found:
        db_a_matches = []  # every dB(A) value found, with its position
        for i, text in enumerate(ocr_texts):
            db_matches = re.finditer(r'([0-9.]+)\s*dB[（(]?A[）)]?', text)
            for match in db_matches:
                cal_value = match.group(1).strip()
                db_a_matches.append((i, cal_value, text))
        # First occurrence fills the "before" slot.
        if db_a_matches and not before_cal_found:
            first_cal_value = db_a_matches[0][1]
            keywords["calibrationValueBefore"] = f"{first_cal_value} dB(A)"
            logger.debug(f"[关键词提取] 按出现顺序提取到检测前校准值（第一个dB(A)）: {keywords['calibrationValueBefore']}")
            before_cal_found = True
        # Second occurrence fills the "after" slot.
        if len(db_a_matches) >= 2 and not after_cal_found:
            second_cal_value = db_a_matches[1][1]
            keywords["calibrationValueAfter"] = f"{second_cal_value} dB(A)"
            logger.debug(f"[关键词提取] 按出现顺序提取到检测后校准值（第二个dB(A)）: {keywords['calibrationValueAfter']}")
            after_cal_found = True
        # A single occurrence: assume before and after share the same value.
        elif len(db_a_matches) == 1 and not after_cal_found and before_cal_found:
            if keywords["calibrationValueBefore"]:
                keywords["calibrationValueAfter"] = keywords["calibrationValueBefore"]
                logger.debug(f"[关键词提取] 检测前和检测后校准值相同: {keywords['calibrationValueAfter']}")
    # --- weather info ---
    # Fields may be scattered over several fragments; a record is only opened
    # when a "日期:" label has weather-related data nearby.
    current_weather_info = None
    weather_start_idx = -1  # index where the current weather record started
    for i, text in enumerate(ocr_texts):
        # A "日期:" label starts a new weather record...
        date_match = re.search(r'日期[：:]\s*([\d.\-]+)', text)
        if date_match:
            # ...but only if a weather field (天气/温度/湿度/风速/风向/units)
            # appears within the next 10 fragments.
            has_weather_info = False
            for j in range(i, min(i + 10, len(ocr_texts))):
                check_text = ocr_texts[j]
                if any(keyword in check_text for keyword in ["天气", "温度", "湿度", "风速", "风向", "℃", "%RH", "m/s"]):
                    has_weather_info = True
                    break
            if has_weather_info:
                # Flush any previous, partially filled record first.
                if current_weather_info and any([current_weather_info["monitorAt"], current_weather_info["weather"],
                                                 current_weather_info["temp"], current_weather_info["humidity"],
                                                 current_weather_info["windSpeed"], current_weather_info["windDirection"]]):
                    keywords["weather_info"].append(current_weather_info)
                # Open a new weather record.
                current_weather_info = {
                    "monitorAt": date_match.group(1).strip(),
                    "weather": "",
                    "temp": "",
                    "humidity": "",
                    "windSpeed": "",
                    "windDirection": ""
                }
                weather_start_idx = i
        # Fill the open record from the current (and following) fragments.
        if current_weather_info:
            # Only look within 10 fragments of the record start, so a record
            # does not absorb values belonging to a later date.
            if weather_start_idx >= 0 and i <= weather_start_idx + 10:
                # Weather description (same or following fragment).
                if not current_weather_info["weather"]:
                    weather_match = re.search(r'天气\s*([^\s温度湿度风速风向]+)', text)
                    if weather_match:
                        weather_value = weather_match.group(1).strip()
                        if weather_value and weather_value != "_" and not re.match(r'^[\d.\-]+$', weather_value):
                            current_weather_info["weather"] = weather_value
                # Temperature, e.g. "温度29.5-35.0" or "温度 29.5-35.0".
                if not current_weather_info["temp"]:
                    temp_match = re.search(r'温度\s*([0-9.\-]+)', text)
                    if temp_match:
                        current_weather_info["temp"] = temp_match.group(1).strip()
                # Humidity, e.g. "湿度74.0-74.1", or following a "℃ 湿度" fragment.
                if not current_weather_info["humidity"]:
                    # Check whether the value sits in the current fragment.
                    humidity_match = re.search(r'湿度\s*([0-9.\-]+)', text)
                    if humidity_match:
                        current_weather_info["humidity"] = humidity_match.group(1).strip()
                    # Bare "湿度" label ("℃ 湿度" style): value is in the next fragment.
                    elif "湿度" in text and i + 1 < len(ocr_texts):
                        next_text = ocr_texts[i + 1]
                        if re.match(r'^[0-9.\-]+', next_text):
                            current_weather_info["humidity"] = next_text.strip()
                # Wind speed, e.g. "风速0.4-0.5", or following a "%RH 风速" fragment.
                if not current_weather_info["windSpeed"]:
                    # Check whether the value sits in the current fragment.
                    wind_speed_match = re.search(r'风速\s*([0-9.\-]+)', text)
                    if wind_speed_match:
                        current_weather_info["windSpeed"] = wind_speed_match.group(1).strip()
                    # Bare "风速" label ("%RH 风速" style): value is in the next fragment.
                    elif "风速" in text and i + 1 < len(ocr_texts):
                        next_text = ocr_texts[i + 1]
                        if re.match(r'^[0-9.\-]+', next_text):
                            current_weather_info["windSpeed"] = next_text.strip()
                # Wind direction, e.g. "风向南风", "_m/s风向南风", or split fragments.
                if not current_weather_info["windDirection"]:
                    # Value in the current fragment ("风向南风").
                    # The class deliberately does NOT exclude "风" — "风速"
                    # contains it, and excluding it would truncate "南风" to "南".
                    wind_dir_match = re.search(r'风向\s*([^\s日期温度湿度]+?)(?=\s|日期|温度|湿度|风速|$)', text)
                    if wind_dir_match:
                        wind_value = wind_dir_match.group(1).strip()
                        # Reject "m/s" and bare numbers.
                        if wind_value and wind_value != "m/s" and not re.match(r'^[0-9.\-]+$', wind_value):
                            # A single direction character ("南") may be
                            # completed by a following "风" fragment.
                            if len(wind_value) == 1 and i + 1 < len(ocr_texts):
                                next_text = ocr_texts[i + 1].strip()
                                # Merge into e.g. "南风".
                                if next_text == "风" or next_text.startswith("风"):
                                    wind_value = wind_value + "风"
                                    logger.debug(f"[关键词提取] 合并风向值: {wind_value}")
                            current_weather_info["windDirection"] = wind_value
                # Variant: "m/s风向…" / "_m/s风向…" with the value in the same fragment.
                if not current_weather_info["windDirection"]:
                    # Again, do not exclude "风" from the class (see above).
                    wind_dir_match = re.search(r'[_\s]*m/s\s*风向\s*([^\s日期温度湿度]+?)(?=\s|日期|温度|湿度|风速|$)', text)
                    if wind_dir_match:
                        wind_value = wind_dir_match.group(1).strip()
                        if wind_value and not re.match(r'^[0-9.\-]+$', wind_value):
                            # Single direction character: check the next fragment.
                            if len(wind_value) == 1 and i + 1 < len(ocr_texts):
                                next_text = ocr_texts[i + 1].strip()
                                if next_text == "风" or next_text.startswith("风"):
                                    wind_value = wind_value + "风"
                                    logger.debug(f"[关键词提取] 合并风向值: {wind_value}")
                            current_weather_info["windDirection"] = wind_value
                # Variant: bare "m/s"/"风向" fragment with the value in the next one.
                if not current_weather_info["windDirection"]:
                    if ("m/s" in text or "风向" in text) and i + 1 < len(ocr_texts):
                        next_text = ocr_texts[i + 1].strip()
                        if next_text and not re.match(r'^[0-9.\-]+', next_text) and "风向" not in next_text:
                            wind_value = next_text
                            # Single direction character: check the fragment after it.
                            if len(wind_value) == 1 and i + 2 < len(ocr_texts):
                                next_next_text = ocr_texts[i + 2].strip()
                                if next_next_text == "风" or next_next_text.startswith("风"):
                                    wind_value = wind_value + "风"
                                    logger.debug(f"[关键词提取] 合并风向值: {wind_value}")
                            current_weather_info["windDirection"] = wind_value
    # Flush the last open weather record.
    if current_weather_info and any([current_weather_info["monitorAt"], current_weather_info["weather"],
                                     current_weather_info["temp"], current_weather_info["humidity"],
                                     current_weather_info["windSpeed"], current_weather_info["windDirection"]]):
        keywords["weather_info"].append(current_weather_info)
    # --- monitoring addresses (for EMF measurement records) ---
    # Pattern: a point code (EB1, EB2, ZB1, ...) followed by address text,
    # either in the same fragment or in an adjacent one.
    for i, text in enumerate(ocr_texts):
        # Point codes: EB1, EZ1, ZB1, ZZ2, ...
        code_match = re.search(r'(E[ZB]\d+|Z[ZB]\d+)', text, re.IGNORECASE)
        if code_match:
            code = code_match.group(1).upper()  # normalize to upper case
            # Candidate addresses collected from the same / nearby fragments;
            # addresses are usually Chinese place names after the code.
            address_candidates = []
            # 1) Same fragment, after the code.
            code_pos = code_match.end()
            remaining_text = text[code_pos:].strip()
            # Drop leading whitespace/punctuation.
            remaining_text = re.sub(r'^[\s,，。、]+', '', remaining_text)
            # Non-empty and not purely numeric/time-like -> possible address.
            if remaining_text and not re.match(r'^[\d.\-:\s]+$', remaining_text):
                # Take text up to the next digit or known label.
                address_match = re.search(r'^([^\d\n]+?)(?=\d|时间|线高|$)', remaining_text)
                if address_match:
                    address = address_match.group(1).strip()
                    # Strip trailing punctuation/whitespace.
                    address = re.sub(r'[，。、\s]+$', '', address)
                    if address and len(address) > 0:
                        address_candidates.append(address)
            # 2) Fragments BEFORE the code (the address may precede it; skip
            #    numbers, times, heights etc. while looking back).
            if not address_candidates:
                # Look back at most 5 fragments for a Chinese place name.
                for j in range(i - 1, max(i - 6, -1), -1):
                    prev_text = ocr_texts[j].strip()
                    if not prev_text:
                        continue
                    # Skip codes, numbers, times, heights and similar labels.
                    if re.match(r'^(E[ZB]\d+|Z[ZB]\d+|\d+|时间|线高|编号|均值|24m|\d{4}[.\-]\d{1,2}[.\-]\d{1,2})', prev_text, re.IGNORECASE):
                        continue
                    # Needs at least two consecutive Chinese characters.
                    if re.search(r'[\u4e00-\u9fa5]{2,}', prev_text):
                        # Double-check: not purely numeric/time-like.
                        if not re.match(r'^[\d.\-:\s]+$', prev_text):
                            address_candidates.append(prev_text)
                            logger.debug(f"[关键词提取] 在编号{code}之前找到地址候选 (索引{j}): {prev_text}")
                            break  # first hit wins
            # 3) The fragment right AFTER the code.
            if not address_candidates and i + 1 < len(ocr_texts):
                next_text = ocr_texts[i + 1].strip()
                # Next fragment is a candidate unless it is a code/number/label.
                if next_text and not re.match(r'^(E[ZB]\d+|Z[ZB]\d+|\d+|时间|线高|编号|均值|24m|\d{4}[.\-]\d{1,2}[.\-]\d{1,2})', next_text, re.IGNORECASE):
                    # Must look like a Chinese place name.
                    if re.search(r'[\u4e00-\u9fa5]{2,}', next_text):
                        address_candidates.append(next_text)
            # Use the first (non-empty) candidate found.
            if address_candidates:
                address = address_candidates[0]
                # Final cleanup of surrounding punctuation.
                address = re.sub(r'^[，。、\s]+|[，。、\s]+$', '', address)
                if address:
                    keywords["address_mapping"][code] = address
                    logger.debug(f"[关键词提取] 提取到监测地点: {code} -> {address}")
    return keywords
  1127. def extract_keywords_from_markdown(markdown_content: str) -> Dict[str, Any]:
  1128. """从markdown内容中直接提取关键信息
  1129. Args:
  1130. markdown_content: markdown内容字符串
  1131. Returns:
  1132. 包含提取的关键信息的字典
  1133. """
  1134. keywords = {
  1135. "project": "",
  1136. "standardReferences": "",
  1137. "soundLevelMeterMode": "",
  1138. "soundCalibratorMode": "",
  1139. "calibrationValueBefore": "",
  1140. "calibrationValueAfter": "",
  1141. "weather_info": [] # 存储天气相关信息
  1142. }
  1143. if not markdown_content:
  1144. return keywords
  1145. # 移除HTML标签,保留文本内容(但保留表格结构信息)
  1146. # 先提取表格中的文本内容
  1147. text_content = markdown_content
  1148. # 提取项目名称
  1149. project_match = re.search(r'项目名称[::]([^检测依据声级计声校准器检测前检测后气象条件日期<>]+)', text_content)
  1150. if project_match:
  1151. project = project_match.group(1).strip()
  1152. # 清理可能的后续内容和HTML标签
  1153. project = re.sub(r'检测依据.*$', '', project).strip()
  1154. project = re.sub(r'<[^>]+>', '', project).strip()
  1155. if project:
  1156. keywords["project"] = project
  1157. logger.debug(f"[Markdown关键词提取] 提取到项目名称: {project}")
  1158. # 提取检测依据
  1159. standard_match = re.search(r'检测依据[::]([^声级计声校准器检测前检测后气象条件日期<>]+)', text_content)
  1160. if standard_match:
  1161. standard = standard_match.group(1).strip()
  1162. # 提取GB标准
  1163. gb_standards = re.findall(r'GB\s*\d+[-\.]?\d*[-\.]?\d*', standard)
  1164. if gb_standards:
  1165. keywords["standardReferences"] = " ".join(gb_standards)
  1166. else:
  1167. keywords["standardReferences"] = re.sub(r'<[^>]+>', '', standard).replace("□其他:", "").strip()
  1168. logger.debug(f"[Markdown关键词提取] 提取到检测依据: {keywords['standardReferences']}")
  1169. # 提取声级计型号/编号
  1170. sound_meter_match = re.search(r'声级计型号[/::]?(?:编号)?[::]\s*([A-Z0-9+/()()]+)', text_content)
  1171. if sound_meter_match:
  1172. sound_meter = sound_meter_match.group(1).strip()
  1173. sound_meter = re.sub(r'<[^>]+>', '', sound_meter).strip()
  1174. if sound_meter:
  1175. keywords["soundLevelMeterMode"] = sound_meter
  1176. logger.debug(f"[Markdown关键词提取] 提取到声级计型号: {keywords['soundLevelMeterMode']}")
  1177. # 提取声校准器型号/编号
  1178. calibrator_match = re.search(r'声校准器型号[/::]?(?:编号)?[::]\s*([A-Z0-9+/()()]+)', text_content)
  1179. if calibrator_match:
  1180. calibrator = calibrator_match.group(1).strip()
  1181. calibrator = re.sub(r'<[^>]+>', '', calibrator).strip()
  1182. if calibrator:
  1183. keywords["soundCalibratorMode"] = calibrator
  1184. logger.debug(f"[Markdown关键词提取] 提取到声校准器型号: {keywords['soundCalibratorMode']}")
  1185. # 提取检测前校准值
  1186. before_cal_match = re.search(r'检测前校准值[::]\s*([0-9.]+)\s*dB[((]?A[))]?', text_content)
  1187. if before_cal_match:
  1188. cal_value = before_cal_match.group(1).strip()
  1189. keywords["calibrationValueBefore"] = f"{cal_value} dB(A)"
  1190. logger.debug(f"[Markdown关键词提取] 提取到检测前校准值: {keywords['calibrationValueBefore']}")
  1191. # 提取检测后校准值
  1192. after_cal_match = re.search(r'检测后校准值[::]\s*([0-9.]+)\s*dB[((]?A[))]?', text_content)
  1193. if after_cal_match:
  1194. cal_value = after_cal_match.group(1).strip()
  1195. keywords["calibrationValueAfter"] = f"{cal_value} dB(A)"
  1196. logger.debug(f"[Markdown关键词提取] 提取到检测后校准值: {keywords['calibrationValueAfter']}")
  1197. # 提取天气信息
  1198. # 查找所有包含"日期:"的行或片段
  1199. date_pattern = r'日期[::]\s*([\d.\-]+)'
  1200. date_matches = list(re.finditer(date_pattern, text_content))
  1201. for date_match in date_matches:
  1202. date_value = date_match.group(1).strip()
  1203. # 获取日期匹配位置后的文本(最多500字符)
  1204. start_pos = date_match.end()
  1205. weather_section = text_content[start_pos:start_pos + 500]
  1206. weather_info = {
  1207. "monitorAt": date_value,
  1208. "weather": "",
  1209. "temp": "",
  1210. "humidity": "",
  1211. "windSpeed": "",
  1212. "windDirection": ""
  1213. }
  1214. # 提取天气
  1215. weather_match = re.search(r'天气\s*([^\s温度湿度风速风向<>]+)', weather_section)
  1216. if weather_match:
  1217. weather_value = weather_match.group(1).strip()
  1218. weather_value = re.sub(r'<[^>]+>', '', weather_value).strip()
  1219. if weather_value and weather_value != "_" and not re.match(r'^[\d.\-]+$', weather_value):
  1220. weather_info["weather"] = weather_value
  1221. # 提取温度
  1222. temp_match = re.search(r'温度[::]?\s*([0-9.\-]+)', weather_section)
  1223. if temp_match:
  1224. weather_info["temp"] = temp_match.group(1).strip()
  1225. # 提取湿度
  1226. humidity_match = re.search(r'湿度[::]?\s*([0-9.\-]+)', weather_section)
  1227. if humidity_match:
  1228. weather_info["humidity"] = humidity_match.group(1).strip()
  1229. # 提取风速
  1230. wind_speed_match = re.search(r'风速[::]?\s*([0-9.\-]+)', weather_section)
  1231. if wind_speed_match:
  1232. weather_info["windSpeed"] = wind_speed_match.group(1).strip()
  1233. # 提取风向
  1234. wind_dir_match = re.search(r'风向[::]?\s*([^\s日期温度湿度风速<>]+?)(?=\s|日期|温度|湿度|风速|$|<)', weather_section)
  1235. if wind_dir_match:
  1236. wind_value = wind_dir_match.group(1).strip()
  1237. wind_value = re.sub(r'<[^>]+>', '', wind_value).strip()
  1238. if wind_value and wind_value != "m/s" and not re.match(r'^[0-9.\-]+$', wind_value):
  1239. weather_info["windDirection"] = wind_value
  1240. # 如果至少有一个字段不为空,则添加这条记录
  1241. if any([weather_info["monitorAt"], weather_info["weather"], weather_info["temp"],
  1242. weather_info["humidity"], weather_info["windSpeed"], weather_info["windDirection"]]):
  1243. keywords["weather_info"].append(weather_info)
  1244. logger.debug(f"[Markdown关键词提取] 提取到天气记录: {weather_info}")
  1245. return keywords
  1246. def supplement_missing_fields_from_ocr_json(
  1247. records: List[Dict[str, Any]],
  1248. ocr_json_path: str,
  1249. field_mapping: Dict[str, str] = None
  1250. ) -> List[Dict[str, Any]]:
  1251. """从OCR的JSON输出中补充缺失字段
  1252. 根据文本位置关系来补充缺失字段。例如,如果找到了maxReactivePower的值(如"-2.48"),
  1253. 那么minReactivePower的值就在它后面的位置("-4.75")。
  1254. Args:
  1255. records: 原始解析记录列表(OperationalConditionV2格式)
  1256. ocr_json_path: OCR输出的JSON文件路径
  1257. field_mapping: 字段映射关系,如{"maxReactivePower": "minReactivePower"},表示maxReactivePower后面是minReactivePower
  1258. Returns:
  1259. 补充后的记录列表
  1260. """
  1261. if not records or not ocr_json_path or not os.path.exists(ocr_json_path):
  1262. return records
  1263. try:
  1264. # 读取OCR JSON文件
  1265. with open(ocr_json_path, 'r', encoding='utf-8') as f:
  1266. ocr_data = json.load(f)
  1267. # 提取rec_texts数组
  1268. rec_texts = ocr_data.get("rec_texts", [])
  1269. if not rec_texts:
  1270. logger.warning("[OCR字段补充] JSON中未找到rec_texts字段")
  1271. return records
  1272. logger.info(f"[OCR字段补充] 从OCR JSON中提取到 {len(rec_texts)} 个文本片段")
  1273. # 默认字段映射:max字段后面是min字段
  1274. if field_mapping is None:
  1275. field_mapping = {
  1276. "maxVoltage": "minVoltage",
  1277. "maxCurrent": "minCurrent",
  1278. "maxActivePower": "minActivePower",
  1279. "maxReactivePower": "minReactivePower"
  1280. }
  1281. # 为每条记录补充缺失字段
  1282. for record in records:
  1283. record_name = record.get("name", "")
  1284. logger.debug(f"[OCR字段补充] 处理记录: {record_name}")
  1285. # 对于每个max字段,如果对应的min字段为空,尝试从OCR中补充
  1286. for max_field, min_field in field_mapping.items():
  1287. max_value = record.get(max_field, "").strip()
  1288. min_value = record.get(min_field, "").strip()
  1289. # 如果max字段有值但min字段为空,尝试从OCR中补充
  1290. if max_value and not min_value:
  1291. logger.debug(f"[OCR字段补充] 记录 {record_name}: {max_field}={max_value}, {min_field}为空,尝试从OCR补充")
  1292. # 在rec_texts中查找max_value
  1293. try:
  1294. max_value_float = float(max_value)
  1295. # 查找匹配的文本(允许小的数值差异)
  1296. found_max = False
  1297. for i, text in enumerate(rec_texts):
  1298. # 尝试将文本转换为数值
  1299. try:
  1300. text_float = float(text.strip())
  1301. # 如果数值匹配(允许小的误差)
  1302. if abs(text_float - max_value_float) < 0.01:
  1303. found_max = True
  1304. # 检查后续几个文本,找到第一个数值作为min_value
  1305. # 在表格中,max和min通常是相邻的,但中间可能有其他文本
  1306. for j in range(i + 1, min(i + 5, len(rec_texts))): # 检查后续最多4个文本
  1307. next_text = rec_texts[j].strip()
  1308. try:
  1309. next_value_float = float(next_text)
  1310. # 如果找到数值,且与max_value不同,则作为min_value
  1311. if abs(next_value_float - max_value_float) > 0.01:
  1312. record[min_field] = next_text
  1313. logger.info(f"[OCR字段补充] 从OCR补充 {min_field}: {next_text} (在 {max_field}={max_value} 之后,位置 {j})")
  1314. break
  1315. except ValueError:
  1316. # 不是数值,继续查找
  1317. continue
  1318. if record.get(min_field):
  1319. break
  1320. except ValueError:
  1321. # 文本不是数值,继续
  1322. pass
  1323. if not found_max:
  1324. logger.debug(f"[OCR字段补充] 未在OCR中找到 {max_field} 的值 '{max_value}'")
  1325. except ValueError:
  1326. # max_value不是数值,跳过
  1327. logger.debug(f"[OCR字段补充] {max_field}值 '{max_value}' 不是数值,跳过")
  1328. pass
  1329. logger.info("[OCR字段补充] 字段补充完成")
  1330. return records
  1331. except Exception as e:
  1332. logger.exception(f"[OCR字段补充] 补充过程出错: {e}")
  1333. return records
  1334. def extract_image_from_markdown(markdown_content: str, output_dir: str) -> Optional[str]:
  1335. """从markdown内容中提取第一张图片路径
  1336. Args:
  1337. markdown_content: markdown内容
  1338. output_dir: 输出目录
  1339. Returns:
  1340. 图片路径,如果未找到返回None
  1341. """
  1342. # 查找markdown中的图片引用
  1343. # 格式: ![alt](path) 或 <img src="path">
  1344. image_patterns = [
  1345. r'!\[.*?\]\((.*?)\)', # markdown图片格式
  1346. r'<img[^>]+src=["\'](.*?)["\']', # HTML img标签
  1347. r'<img[^>]+src=(.*?)(?:\s|>)', # HTML img标签(无引号)
  1348. ]
  1349. for pattern in image_patterns:
  1350. matches = re.findall(pattern, markdown_content)
  1351. if matches:
  1352. image_path = matches[0]
  1353. # 如果是相对路径,尝试在output_dir中查找
  1354. if not os.path.isabs(image_path):
  1355. # 尝试多个可能的路径
  1356. possible_paths = [
  1357. os.path.join(output_dir, image_path),
  1358. os.path.join(output_dir, "images", os.path.basename(image_path)),
  1359. os.path.join(output_dir, os.path.basename(image_path)),
  1360. ]
  1361. for full_path in possible_paths:
  1362. if os.path.exists(full_path):
  1363. return full_path
  1364. elif os.path.exists(image_path):
  1365. return image_path
  1366. return None
  1367. def fallback_parse_with_paddleocr(
  1368. json_data: Dict[str, Any],
  1369. markdown_content: str,
  1370. output_dir: Optional[str] = None,
  1371. document_type: Optional[str] = None,
  1372. input_file: Optional[str] = None
  1373. ) -> Optional[str]:
  1374. """当JSON数据缺失时,使用paddleocr进行备用解析
  1375. Args:
  1376. json_data: 原始JSON数据
  1377. markdown_content: 原始markdown内容
  1378. output_dir: 输出目录(用于查找图片)
  1379. document_type: 文档类型
  1380. input_file: 原始输入文件路径(PDF或图片),如果未找到图片则从PDF提取第一页
  1381. Returns:
  1382. 补充后的markdown内容,如果失败返回None
  1383. """
  1384. try:
  1385. # 注意:调用方已经检查过数据完整性,这里不再重复检查
  1386. # 直接进行备用解析,因为调用方已经确定需要备用解析
  1387. doc_type = document_type or json_data.get("document_type", "unknown")
  1388. logger.warning("[PaddleOCR备用] 启用PaddleOCR备用解析")
  1389. # 尝试从markdown中提取图片路径
  1390. image_path = None
  1391. if output_dir:
  1392. # 首先尝试从markdown中提取
  1393. image_path = extract_image_from_markdown(markdown_content, output_dir)
  1394. if image_path:
  1395. logger.info(f"[PaddleOCR备用] 从markdown中找到图片: {image_path}")
  1396. # 如果找不到,尝试在output_dir中查找png文件
  1397. if not image_path and os.path.exists(output_dir):
  1398. # 查找所有png文件
  1399. png_files = list(Path(output_dir).rglob("*.png"))
  1400. if png_files:
  1401. # 优先查找包含"粘贴"或"image"的文件名
  1402. for png_file in png_files:
  1403. if "粘贴" in png_file.name or "image" in png_file.name.lower():
  1404. image_path = str(png_file)
  1405. logger.info(f"[PaddleOCR备用] 使用找到的图片: {image_path}")
  1406. break
  1407. # 如果没找到特殊名称的,使用第一个
  1408. if not image_path:
  1409. image_path = str(png_files[0])
  1410. logger.info(f"[PaddleOCR备用] 使用找到的图片: {image_path}")
  1411. # 如果仍未找到图片,尝试从input_file处理
  1412. if not image_path:
  1413. logger.warning("[PaddleOCR备用] 未找到可用的图片文件,尝试从input_file处理")
  1414. if input_file and os.path.exists(input_file):
  1415. # 检测文件实际类型(不依赖扩展名)
  1416. file_type = detect_file_type(input_file)
  1417. if file_type == 'pdf':
  1418. # 文件是PDF,尝试提取第一页
  1419. pdf_path = input_file
  1420. logger.info(f"[PaddleOCR备用] 检测到PDF文件(通过内容): {pdf_path}")
  1421. image_path = extract_first_page_from_pdf(pdf_path, output_dir)
  1422. if image_path:
  1423. logger.info(f"[PaddleOCR备用] 成功从PDF提取第一页图片: {image_path}")
  1424. else:
  1425. logger.warning("[PaddleOCR备用] 从PDF提取图片失败(可能是PDF文件损坏或缺少必要的库)")
  1426. elif file_type in ['png', 'jpeg', 'jpg']:
  1427. # 文件是图片,直接使用
  1428. image_path = input_file
  1429. logger.info(f"[PaddleOCR备用] 检测到图片文件({file_type}): {image_path}")
  1430. else:
  1431. # 文件类型未知,尝试按PDF处理(可能是PDF但没有正确识别)
  1432. logger.debug(f"[PaddleOCR备用] input_file类型未知({file_type}),尝试按PDF处理: {input_file}")
  1433. if PDFIUM_AVAILABLE or PDF2IMAGE_AVAILABLE:
  1434. try:
  1435. # 尝试打开为PDF
  1436. pdf_path = input_file
  1437. image_path = extract_first_page_from_pdf(pdf_path, output_dir)
  1438. if image_path:
  1439. logger.info(f"[PaddleOCR备用] 成功将文件作为PDF处理并提取第一页: {image_path}")
  1440. except Exception as e:
  1441. logger.debug(f"[PaddleOCR备用] 无法将文件作为PDF处理: {e}")
  1442. # 如果input_file处理失败,尝试在output_dir中查找PDF文件
  1443. if not image_path and output_dir:
  1444. pdf_path = find_pdf_file(output_dir)
  1445. if pdf_path:
  1446. logger.info(f"[PaddleOCR备用] 在输出目录中找到PDF文件: {pdf_path}")
  1447. image_path = extract_first_page_from_pdf(pdf_path, output_dir)
  1448. if image_path:
  1449. logger.info(f"[PaddleOCR备用] 成功从PDF提取第一页图片: {image_path}")
  1450. # 如果仍未找到,尝试在input_file的父目录中查找
  1451. if not image_path and input_file:
  1452. parent_dir = os.path.dirname(input_file)
  1453. if parent_dir and os.path.exists(parent_dir):
  1454. pdf_path = find_pdf_file(parent_dir)
  1455. if pdf_path:
  1456. logger.info(f"[PaddleOCR备用] 在input_file父目录中找到PDF文件: {pdf_path}")
  1457. image_path = extract_first_page_from_pdf(pdf_path, output_dir)
  1458. if image_path:
  1459. logger.info(f"[PaddleOCR备用] 成功从PDF提取第一页图片: {image_path}")
  1460. if not image_path:
  1461. logger.warning(f"[PaddleOCR备用] 未找到可用的图片或PDF文件(input_file={input_file}, output_dir={output_dir}),无法进行备用解析")
  1462. logger.info("[PaddleOCR备用] 备用解析需要图片文件或PDF文件,如果都没有,将返回原始markdown内容")
  1463. if not image_path:
  1464. logger.warning("[PaddleOCR备用] 未找到可用的图片文件,备用解析无法进行,返回None(将使用原始解析结果)")
  1465. return None
  1466. # 使用doc_parser模式解析文档结构
  1467. logger.info("[PaddleOCR备用] 使用doc_parser模式解析文档结构")
  1468. paddleocr_result = call_paddleocr(image_path)
  1469. if not paddleocr_result:
  1470. logger.error("[PaddleOCR备用] PaddleOCR解析失败")
  1471. return None
  1472. # 检查返回结果格式
  1473. if "markdown_content" in paddleocr_result:
  1474. # 直接从MD文件读取的内容
  1475. paddleocr_markdown = paddleocr_result["markdown_content"]
  1476. logger.info(f"[PaddleOCR备用] 成功从MD文件读取,生成 {len(paddleocr_markdown)} 字符的markdown")
  1477. # 从markdown内容中提取关键词来补充数据
  1478. logger.info("[PaddleOCR备用] 从MD文件内容中提取关键词补充数据")
  1479. keywords = extract_keywords_from_markdown(paddleocr_markdown)
  1480. # 将关键词信息添加到markdown中(作为注释,供后续解析使用)
  1481. keywords_comment = "\n\n<!-- Markdown关键词补充:\n"
  1482. if keywords["project"]:
  1483. keywords_comment += f"项目名称:{keywords['project']}\n"
  1484. if keywords["standardReferences"]:
  1485. keywords_comment += f"检测依据:{keywords['standardReferences']}\n"
  1486. if keywords["soundLevelMeterMode"]:
  1487. keywords_comment += f"声级计型号/编号:{keywords['soundLevelMeterMode']}\n"
  1488. if keywords["soundCalibratorMode"]:
  1489. keywords_comment += f"声校准器型号/编号:{keywords['soundCalibratorMode']}\n"
  1490. if keywords["calibrationValueBefore"]:
  1491. keywords_comment += f"检测前校准值:{keywords['calibrationValueBefore']}\n"
  1492. if keywords["calibrationValueAfter"]:
  1493. keywords_comment += f"检测后校准值:{keywords['calibrationValueAfter']}\n"
  1494. if keywords.get("address_mapping"):
  1495. for code, address in keywords["address_mapping"].items():
  1496. keywords_comment += f"监测地点-{code}:{address}\n"
  1497. if keywords["weather_info"]:
  1498. for weather in keywords["weather_info"]:
  1499. keywords_comment += f"日期:{weather['monitorAt']} 天气:{weather['weather']} 温度:{weather['temp']} 湿度:{weather['humidity']} 风速:{weather['windSpeed']} 风向:{weather['windDirection']}\n"
  1500. keywords_comment += "-->\n"
  1501. # 将关键词信息合并到markdown中
  1502. paddleocr_markdown = paddleocr_markdown + keywords_comment
  1503. # 统计补充的字段数量(不包括weather_info列表)
  1504. field_count = sum(1 for k, v in keywords.items() if k != "weather_info" and v) + len(keywords.get("weather_info", []))
  1505. logger.info(f"[PaddleOCR备用] MD文件关键词提取完成,补充了 {field_count} 个字段")
  1506. elif "parsing_res_list" in paddleocr_result:
  1507. # 从JSON或stdout解析的结果,需要转换为markdown
  1508. paddleocr_markdown = paddleocr_to_markdown(paddleocr_result)
  1509. if not paddleocr_markdown:
  1510. logger.warning("[PaddleOCR备用] PaddleOCR未解析出有效内容")
  1511. return None
  1512. logger.info(f"[PaddleOCR备用] 成功解析,生成 {len(paddleocr_markdown)} 字符的markdown")
  1513. else:
  1514. logger.error("[PaddleOCR备用] PaddleOCR返回格式不正确")
  1515. return None
  1516. # 调用paddleocr ocr提取关键词来补充数据(作为doc_parser的补充)
  1517. logger.info("[PaddleOCR备用] 调用OCR提取关键词补充数据")
  1518. ocr_save_path = os.path.dirname(image_path) # 使用图片所在目录作为保存路径
  1519. ocr_texts, _ = call_paddleocr_ocr(image_path, ocr_save_path)
  1520. if ocr_texts:
  1521. # 从OCR文本中提取关键词
  1522. keywords = extract_keywords_from_ocr_texts(ocr_texts)
  1523. # 将关键词信息添加到markdown中(作为注释,供后续解析使用)
  1524. keywords_comment = "\n\n<!-- OCR关键词补充:\n"
  1525. if keywords["project"]:
  1526. keywords_comment += f"项目名称:{keywords['project']}\n"
  1527. if keywords["standardReferences"]:
  1528. keywords_comment += f"检测依据:{keywords['standardReferences']}\n"
  1529. if keywords["soundLevelMeterMode"]:
  1530. keywords_comment += f"声级计型号/编号:{keywords['soundLevelMeterMode']}\n"
  1531. if keywords["soundCalibratorMode"]:
  1532. keywords_comment += f"声校准器型号/编号:{keywords['soundCalibratorMode']}\n"
  1533. if keywords["calibrationValueBefore"]:
  1534. keywords_comment += f"检测前校准值:{keywords['calibrationValueBefore']}\n"
  1535. if keywords.get("address_mapping"):
  1536. for code, address in keywords["address_mapping"].items():
  1537. keywords_comment += f"监测地点-{code}:{address}\n"
  1538. if keywords["calibrationValueAfter"]:
  1539. keywords_comment += f"检测后校准值:{keywords['calibrationValueAfter']}\n"
  1540. if keywords["weather_info"]:
  1541. for weather in keywords["weather_info"]:
  1542. keywords_comment += f"日期:{weather['monitorAt']} 天气:{weather['weather']} 温度:{weather['temp']} 湿度:{weather['humidity']} 风速:{weather['windSpeed']} 风向:{weather['windDirection']}\n"
  1543. keywords_comment += "-->\n"
  1544. # 将关键词信息合并到markdown中
  1545. paddleocr_markdown = paddleocr_markdown + keywords_comment
  1546. logger.info(f"[PaddleOCR备用] OCR关键词提取完成,补充了 {len(keywords)} 个字段")
  1547. # 合并原始markdown和paddleocr结果
  1548. # 优先使用paddleocr的结果,因为它更完整
  1549. combined_markdown = f"{paddleocr_markdown}\n\n<!-- 原始内容(可能不完整) -->\n{markdown_content}"
  1550. return combined_markdown
  1551. except Exception as e:
  1552. logger.exception(f"[PaddleOCR备用] 备用解析过程出错: {e}")
  1553. return None
  1554. def extract_text_with_paragraphs_from_ocr_json(json_path: str, line_height_threshold: float = 1.5, paragraph_gap_threshold: float = 2.0) -> str:
  1555. """
  1556. 从PaddleOCR的JSON输出中提取带段落分割的纯文本
  1557. Args:
  1558. json_path: OCR输出的JSON文件路径
  1559. line_height_threshold: 行高倍数阈值,用于判断是否在同一行(默认1.5)
  1560. paragraph_gap_threshold: 段落间距倍数阈值,用于判断是否需要分段(默认2.0)
  1561. Returns:
  1562. 带段落分割的纯文本字符串
  1563. """
  1564. try:
  1565. with open(json_path, 'r', encoding='utf-8') as f:
  1566. ocr_data = json.load(f)
  1567. # 提取文本和坐标信息
  1568. rec_texts = ocr_data.get("rec_texts", [])
  1569. dt_polys = ocr_data.get("dt_polys", [])
  1570. if not rec_texts or not dt_polys:
  1571. logger.warning("[OCR文本提取] JSON中缺少rec_texts或dt_polys字段")
  1572. return ""
  1573. if len(rec_texts) != len(dt_polys):
  1574. logger.warning(f"[OCR文本提取] rec_texts长度({len(rec_texts)})与dt_polys长度({len(dt_polys)})不匹配")
  1575. # 取较小的长度
  1576. min_len = min(len(rec_texts), len(dt_polys))
  1577. rec_texts = rec_texts[:min_len]
  1578. dt_polys = dt_polys[:min_len]
  1579. # 计算每个文本块的边界框和中心点
  1580. text_blocks = []
  1581. for i, (text, poly) in enumerate(zip(rec_texts, dt_polys)):
  1582. if not text or not text.strip():
  1583. continue
  1584. # 从多边形坐标计算边界框
  1585. # poly格式: [[x1, y1], [x2, y2], [x3, y3], [x4, y4]]
  1586. if len(poly) >= 4:
  1587. xs = [point[0] for point in poly]
  1588. ys = [point[1] for point in poly]
  1589. x_min, x_max = min(xs), max(xs)
  1590. y_min, y_max = min(ys), max(ys)
  1591. # 计算中心点和高度
  1592. center_x = (x_min + x_max) / 2
  1593. center_y = (y_min + y_max) / 2
  1594. height = y_max - y_min
  1595. width = x_max - x_min
  1596. text_blocks.append({
  1597. 'text': text.strip(),
  1598. 'x_min': x_min,
  1599. 'x_max': x_max,
  1600. 'y_min': y_min,
  1601. 'y_max': y_max,
  1602. 'center_x': center_x,
  1603. 'center_y': center_y,
  1604. 'height': height,
  1605. 'width': width,
  1606. 'index': i
  1607. })
  1608. if not text_blocks:
  1609. logger.warning("[OCR文本提取] 没有有效的文本块")
  1610. return ""
  1611. # 按Y坐标(从上到下)排序
  1612. text_blocks.sort(key=lambda b: (b['y_min'], b['x_min']))
  1613. # 计算平均行高(用于判断行间距)
  1614. heights = [b['height'] for b in text_blocks]
  1615. avg_height = sum(heights) / len(heights) if heights else 20
  1616. # 将文本块按行分组
  1617. lines = []
  1618. current_line = [text_blocks[0]]
  1619. for i in range(1, len(text_blocks)):
  1620. prev_block = text_blocks[i - 1]
  1621. curr_block = text_blocks[i]
  1622. # 计算Y坐标重叠度
  1623. y_overlap = min(prev_block['y_max'], curr_block['y_max']) - max(prev_block['y_min'], curr_block['y_min'])
  1624. overlap_ratio = y_overlap / min(prev_block['height'], curr_block['height']) if min(prev_block['height'], curr_block['height']) > 0 else 0
  1625. # 计算Y坐标间距
  1626. y_gap = curr_block['y_min'] - prev_block['y_max']
  1627. gap_ratio = y_gap / avg_height if avg_height > 0 else 0
  1628. # 判断是否在同一行:有重叠或间距小于行高阈值
  1629. if overlap_ratio > 0.3 or (y_gap >= 0 and gap_ratio < line_height_threshold):
  1630. current_line.append(curr_block)
  1631. else:
  1632. # 新行开始,保存当前行
  1633. lines.append(current_line)
  1634. current_line = [curr_block]
  1635. # 添加最后一行
  1636. if current_line:
  1637. lines.append(current_line)
  1638. # 对每行内的文本块按X坐标排序(从左到右)
  1639. for line in lines:
  1640. line.sort(key=lambda b: b['x_min'])
  1641. # 生成文本,根据行间距判断段落分割
  1642. result_lines = []
  1643. prev_line_y = None
  1644. prev_line_height = None
  1645. for line_idx, line in enumerate(lines):
  1646. # 计算当前行的Y坐标和高度
  1647. line_y_min = min(b['y_min'] for b in line)
  1648. line_y_max = max(b['y_max'] for b in line)
  1649. line_height = line_y_max - line_y_min
  1650. line_center_y = (line_y_min + line_y_max) / 2
  1651. # 拼接当前行的文本
  1652. # 对于表格数据,使用制表符分隔;对于普通文本,使用空格
  1653. line_text = ""
  1654. prev_x_max = None
  1655. # 判断是否是表格行(如果一行中有多个文本块且X坐标分布较均匀)
  1656. is_table_row = len(line) > 2
  1657. for block in line:
  1658. if prev_x_max is not None:
  1659. x_gap = block['x_min'] - prev_x_max
  1660. # 如果间距较大,添加分隔符
  1661. if x_gap > avg_height * 0.3:
  1662. if is_table_row:
  1663. # 表格使用制表符
  1664. line_text += "\t"
  1665. else:
  1666. # 普通文本使用空格
  1667. line_text += " "
  1668. line_text += block['text']
  1669. prev_x_max = block['x_max']
  1670. # 判断是否需要换段
  1671. if prev_line_y is not None and prev_line_height is not None:
  1672. # 计算行间距
  1673. line_gap = line_y_min - prev_line_y
  1674. gap_ratio = line_gap / prev_line_height if prev_line_height > 0 else 0
  1675. # 如果行间距大于段落阈值,添加空行
  1676. if gap_ratio > paragraph_gap_threshold:
  1677. result_lines.append("") # 空行表示段落分隔
  1678. result_lines.append(line_text)
  1679. prev_line_y = line_y_max
  1680. prev_line_height = line_height
  1681. # 合并为最终文本
  1682. result_text = "\n".join(result_lines)
  1683. logger.info(f"[OCR文本提取] 成功提取文本,共 {len(lines)} 行,{len(result_lines)} 行(含段落分隔)")
  1684. return result_text
  1685. except Exception as e:
  1686. logger.exception(f"[OCR文本提取] 处理失败: {e}")
  1687. return ""