electromagnetic_parser.py 31 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584
  1. # Copyright (c) Opendatalab. All rights reserved.
  2. """电磁检测记录解析模块 v2 - 独立版本"""
  3. from typing import List
  4. import re
  5. from ..utils.logging_config import get_logger
  6. from ..models.data_models import ElectromagneticDetectionRecord, ElectromagneticData
  7. from .table_parser import extract_table_with_rowspan_colspan
  8. logger = get_logger("pdf_converter_v2.parser.electromagnetic")
  9. def validate_height(value: str) -> str:
  10. """校验高度值格式
  11. 高度可以为空,但不应包含冒号(排除时间格式如 "14:50")
  12. Args:
  13. value: 原始高度值
  14. Returns:
  15. 校验后的高度值,如果包含冒号则返回空字符串
  16. """
  17. if not value or not value.strip():
  18. return ""
  19. value = value.strip()
  20. # 如果包含冒号,认为是时间格式,返回空字符串
  21. if ':' in value:
  22. logger.warning(f"[电磁检测] 高度值包含冒号(可能是时间格式),已忽略: '{value}'")
  23. return ""
  24. return value
  25. def calculate_average(values: List[str]) -> str:
  26. """计算平均值,处理空值和无效值"""
  27. numeric_values = []
  28. for val in values:
  29. if val and val.strip():
  30. # 尝试提取数字(可能包含单位)
  31. try:
  32. # 移除可能的单位(如V/m, T等)和空格
  33. cleaned = re.sub(r'[^\d.\-]', '', val.strip())
  34. if cleaned:
  35. num = float(cleaned)
  36. numeric_values.append(num)
  37. except (ValueError, AttributeError):
  38. continue
  39. if numeric_values:
  40. avg = sum(numeric_values) / len(numeric_values)
  41. # 保留原始格式,如果是整数则返回整数格式
  42. if avg == int(avg):
  43. return str(int(avg))
  44. else:
  45. # 保留适当的小数位
  46. return f"{avg:.3f}".rstrip('0').rstrip('.')
  47. return ""
def parse_electromagnetic_detection_record(markdown_content: str) -> ElectromagneticDetectionRecord:
    """Parse an electromagnetic detection record out of converted Markdown.

    Processing order:

    1. Read the project name from the ``<!-- Markdown关键词补充:... -->``
       comment, then from the ``<!-- OCR关键词补充:... -->`` comment only if
       the project name is still empty.
    2. Extract all tables with ``extract_table_with_rowspan_colspan``; if
       there are none, return the record as collected so far.
    3. Pick the header table (the first table with a cell containing
       项目名称 / 仪器名称 / 监测依据, otherwise the first table) and read
       the metadata fields from its label/value cells.
    4. Scan every table for measurement rows and build one
       ``ElectromagneticData`` per distinct point code.
    5. Renumber the points EB1, EB2, ... in the order they were collected.
    6. Fill still-empty addresses from ``监测地点-<code>:<address>`` entries
       in the two keyword comments (Markdown first, then OCR).

    Args:
        markdown_content: Markdown text of the converted document, possibly
            including the keyword-supplement HTML comments.

    Returns:
        The populated ``ElectromagneticDetectionRecord``.
    """
    # NOTE(review): block nesting in this function was reconstructed from a
    # listing whose indentation had been stripped - confirm the placement of
    # `continue`/`break` statements against the authoritative copy.
    record = ElectromagneticDetectionRecord()
    # First take the project name from the keyword comments, because they may
    # carry information that is missing from the table itself.
    # The Markdown keyword supplement is read first (higher priority).
    # NOTE(review): the character class `[::]` used in the regexes below lists
    # the same colon twice; presumably one was meant to be the full-width
    # colon - confirm.
    md_keywords_comment_match = re.search(r'<!--\s*Markdown关键词补充:(.*?)-->', markdown_content, re.DOTALL)
    if md_keywords_comment_match:
        keywords_text = md_keywords_comment_match.group(1)
        logger.info("[电磁检测] 发现Markdown关键词补充,开始提取(优先级高)")
        # Project name
        project_match = re.search(r'项目名称[::]([^\n]+)', keywords_text)
        if project_match:
            record.project = project_match.group(1).strip()
            logger.debug(f"[电磁检测] 从Markdown关键词补充提取到项目名称: {record.project}")
    # Then the OCR keyword supplement (lower priority: only fills an empty field).
    ocr_keywords_comment_match = re.search(r'<!--\s*OCR关键词补充:(.*?)-->', markdown_content, re.DOTALL)
    if ocr_keywords_comment_match:
        keywords_text = ocr_keywords_comment_match.group(1)
        logger.info("[电磁检测] 发现OCR关键词补充,开始提取(优先级低,仅在字段为空时补充)")
        # Project name (only when still empty)
        project_match = re.search(r'项目名称[::]([^\n]+)', keywords_text)
        if project_match and (not record.project or not record.project.strip()):
            record.project = project_match.group(1).strip()
            logger.debug(f"[电磁检测] 从OCR关键词补充提取到项目名称: {record.project}")
    tables = extract_table_with_rowspan_colspan(markdown_content)
    if not tables:
        logger.warning(f"[电磁检测] 未能提取出任何表格内容")
        return record
    # Metadata label texts, used to recognise a label cell so that it is never
    # mistaken for the value of the preceding label.
    # NOTE(review): "备注" and "备注:" - the second entry was presumably meant
    # to cover a colon variant (possibly full-width) - confirm.
    METADATA_LABELS = {"项目名称", "监测依据", "仪器名称", "仪器型号", "仪器编号",
                       "测量高度", "检测高度", "检测环境条件", "测点分布示意图",
                       "工况及工程信息", "备注", "备注:"}
    def find_next_non_empty_value(row: List[str], start_idx: int) -> tuple[str, int]:
        """Return the first non-empty cell after ``start_idx``, stopping at the next label.

        Args:
            row: Cells of one table row.
            start_idx: Index of the label cell whose value is wanted.

        Returns:
            ``(value, next_idx)``. ``value`` is the stripped cell text, or ""
            when the first non-empty cell is itself a label or the row ends.
            ``next_idx`` is where the caller should resume scanning: the index
            of the label that was hit, one past the value that was found, or
            ``len(row)`` when nothing was found.
        """
        for j in range(start_idx + 1, len(row)):
            cell_value = row[j].strip() if row[j] else ""
            if cell_value:
                # If the cell is another label, the current label has no value.
                # Label text may also occur inside a real value, so the match
                # is strict rather than a substring test.
                is_label = False
                for label in METADATA_LABELS:
                    # Strict match: the cell equals the label, or starts with
                    # the label followed by a colon.
                    # NOTE(review): the two startswith() checks are identical
                    # as written; presumably one was meant to test the
                    # full-width colon - confirm.
                    if cell_value == label or cell_value.startswith(label + ":") or cell_value.startswith(label + ":"):
                        is_label = True
                        break
                if is_label:
                    return "", j  # empty value; resume at the label just found
                # A non-label value: return it.
                return cell_value, j + 1
        return "", len(row)
    # Locate the table holding the header information. It is not necessarily
    # the first table (the original author notes that a fallback extraction
    # may yield several tables). It is recognised by a cell containing
    # "项目名称", "仪器名称" or "监测依据".
    header_table = None
    for table in tables:
        for row in table:
            if row and any("项目名称" in str(cell) or "仪器名称" in str(cell) or "监测依据" in str(cell) for cell in row if cell):
                header_table = table
                logger.debug(f"[电磁检测] 找到包含头部信息的表格,行数: {len(table)}")
                break
        if header_table:
            break
    # No table with header information: fall back to the first table.
    if not header_table:
        header_table = tables[0]
        logger.debug(f"[电磁检测] 未找到包含头部信息的表格,使用第一个表格")
    first_table = header_table
    for row in first_table:
        logger.debug(f"[电磁检测][ROW] len={len(row)}, content={row}")
        # Walk the row with an explicit index: after a label is handled the
        # scan jumps to the index returned by find_next_non_empty_value.
        i = 0
        while i < len(row):
            cell = row[i]
            if not cell or not cell.strip():
                i += 1
                continue
            if "项目名称" in cell:
                value, next_idx = find_next_non_empty_value(row, i)
                # Take the table value only when it is non-empty and no
                # project name was obtained earlier, so a name taken from the
                # keyword comments is preserved.
                if value and (not record.project or not record.project.strip()):
                    record.project = value
                    logger.debug(f"[电磁检测] 从表格中提取到项目名称: {record.project}")
                elif not record.project or not record.project.strip():
                    # The table has no value either: log a warning.
                    logger.warning(f"[电磁检测] 项目名称 为空,行数据: {row}")
                i = next_idx
                continue
            if "监测依据" in cell:
                value, next_idx = find_next_non_empty_value(row, i)
                record.standardReferences = value
                if not record.standardReferences.strip():
                    logger.warning(f"[电磁检测] 监测依据 为空,行数据: {row}")
                i = next_idx
                continue
            if "仪器名称" in cell:
                value, next_idx = find_next_non_empty_value(row, i)
                record.deviceName = value
                if not record.deviceName.strip():
                    logger.warning(f"[电磁检测] 仪器名称 为空,行数据: {row}")
                i = next_idx
                continue
            if "仪器型号" in cell:
                value, next_idx = find_next_non_empty_value(row, i)
                record.deviceMode = value
                if not record.deviceMode.strip():
                    logger.warning(f"[电磁检测] 仪器型号 为空,行数据: {row}")
                i = next_idx
                continue
            if "仪器编号" in cell:
                value, next_idx = find_next_non_empty_value(row, i)
                record.deviceCode = value
                if not record.deviceCode.strip():
                    logger.warning(f"[电磁检测] 仪器编号 为空,行数据: {row}")
                i = next_idx
                continue
            if any(k in cell for k in ["测量高度", "检测高度"]):
                value, next_idx = find_next_non_empty_value(row, i)
                record.monitorHeight = value
                if not record.monitorHeight.strip():
                    logger.warning(f"[电磁检测] 检测/测量高度 为空,行数据: {row}")
                i = next_idx
                continue
            if "检测环境条件" in cell:
                value, next_idx = find_next_non_empty_value(row, i)
                text = value
                # Parse the weather sub-fields. A field with no match is left
                # untouched (per the original author's note, the weather
                # object's __init__ initialises every field to "" - not
                # visible from here).
                # Temperature, e.g. "29.5-35.0℃" or "29.5-35.0 ℃"
                m = re.search(r'([0-9.\-]+)\s*℃', text)
                if m:
                    record.weather.temp = m.group(1)
                # Humidity, e.g. "74.0-74.1%RH" or "74.0-74.1 %RH"
                m = re.search(r'([0-9.\-]+)\s*%RH', text)
                if m:
                    record.weather.humidity = m.group(1)
                # Wind speed, e.g. "0.4-0.5 m/s" or "0.4-0.5m/s"
                m = re.search(r'([0-9.\-]+)\s*m/s', text)
                if m:
                    record.weather.windSpeed = m.group(1)
                # Weather, e.g. "天气:晴" or "天气 晴"
                # NOTE(review): the negated class excludes the individual
                # characters 温/度/湿/风/速, not the words; a value containing
                # any of them is cut at that character - confirm intended.
                m = re.search(r'天气[::]*\s*([^\s温度湿度风速]+)', text)
                if m:
                    record.weather.weather = m.group(1).strip()
                # Wind direction (same per-character exclusion as above, so
                # the capture stops before a trailing "风").
                m = re.search(r'风向[::]*\s*([^\s温度湿度风速天气]+)', text)
                if m: record.weather.windDirection = m.group(1).strip()
                # When the weather is empty or just ":" while at least one
                # other weather field is set, default the weather to "晴".
                weather_value = record.weather.weather.strip() if record.weather.weather else ""
                if (not weather_value or weather_value == ":") and any([
                    record.weather.temp, record.weather.humidity, record.weather.windSpeed, record.weather.windDirection
                ]):
                    record.weather.weather = "晴"
                i = next_idx
                continue
            i += 1
    # Header texts used to recognise header rows.
    EXCLUDED_HEADERS = {"编号", "备注"}  # a set, for fast membership tests
    HEADER_KEYWORDS = {"1", "2", "3", "4", "5", "均值", "工频电场强度", "工频磁感应强度",
                       "监测地点", "线高", "时间", "V/m", "μT"}  # texts commonly found in header rows
    # Metadata keywords: a row whose first cell contains one of these is a
    # metadata row and must be excluded from the measurement data.
    METADATA_KEYWORDS = {"项目名称", "监测依据", "仪器名称", "仪器型号", "仪器编号",
                         "测量高度", "检测高度", "检测环境条件", "测点分布示意图",
                         "工况及工程信息", "备注", "备注:"}
    def is_valid_data_row(row: List[str]) -> bool:
        """Decide whether ``row`` is a measurement row rather than a header or metadata row.

        A row is rejected when any of the following holds:

        1. it has fewer than 8 cells;
        2. its first cell is empty;
        3. its first cell equals an entry of EXCLUDED_HEADERS or
           METADATA_KEYWORDS, or contains a METADATA_KEYWORDS entry;
        4. its first cell is an entry of HEADER_KEYWORDS or a single digit;
        5. at least 3 of its first 16 cells are exactly HEADER_KEYWORDS
           entries.

        Args:
            row: Cells of one table row.

        Returns:
            True when the row should be parsed as measurement data.
        """
        if len(row) < 8:
            return False
        first_cell = row[0].strip() if row[0] else ""
        # Empty first cell: skip.
        if not first_cell:
            return False
        # First cell is a header or metadata keyword: skip.
        if first_cell in EXCLUDED_HEADERS or first_cell in METADATA_KEYWORDS:
            return False
        # First cell merely contains a metadata keyword (substring match): skip.
        for keyword in METADATA_KEYWORDS:
            if keyword in first_cell:
                logger.debug(f"[电磁检测] 跳过元数据行(第一列包含'{keyword}'): {row[0]}")
                return False
        # A point code is expected here (e.g. ZB1, EB1). A header keyword or a
        # single digit such as "1" or "2" is not a point code: skip.
        if first_cell in HEADER_KEYWORDS or (first_cell.isdigit() and len(first_cell) == 1):
            return False
        # Count cells among the first 16 that are exactly a header keyword
        # ("1", "2", "均值", ...); several of them indicate a header row.
        # NOTE(review): a measurement cell whose text is exactly "1".."5" is
        # counted too, so a data row with three such readings would be
        # dropped - confirm this is acceptable.
        header_keyword_count = 0
        for i in range(min(16, len(row))):
            cell = row[i].strip() if i < len(row) and row[i] else ""
            if cell in HEADER_KEYWORDS:
                header_keyword_count += 1
        # Three or more header keywords: very likely a header row.
        if header_keyword_count >= 3:
            logger.debug(f"[电磁检测] 跳过表头行(包含{header_keyword_count}个表头关键词): {row[:5]}")
            return False
        # If the first cell does not start with ZB/EB but the leading cells
        # are all header keywords, the row may be a header row.
        if not (first_cell.startswith("ZB") or first_cell.startswith("EB")):
            # Check whether the first four cells are all header keywords or single digits.
            # NOTE(review): row[0] is already known to be non-empty and
            # neither a header keyword nor a single digit here, so the flag is
            # cleared on the first iteration and the rejection below looks
            # unreachable - confirm.
            first_four_are_headers = True
            for i in range(min(4, len(row))):
                cell = row[i].strip() if i < len(row) and row[i] else ""
                if cell and cell not in HEADER_KEYWORDS and not (cell.isdigit() and len(cell) == 1):
                    first_four_are_headers = False
                    break
            if first_four_are_headers:
                logger.debug(f"[电磁检测] 跳过表头行(前4列都是表头关键词): {row[:5]}")
                return False
        return True
    # Point codes already collected, so that a repeated code (e.g. a row
    # repeated across a page break) is added only once.
    seen_codes = set()
    def get_address_continuation_strict(row: List[str]) -> str:
        """Return the second cell when the row is purely an address continuation line.

        The second cell is returned only if the first cell is empty, every
        cell from index 2 onwards is empty, and the second cell contains a
        CJK character and is not made up solely of digits, dots, dashes,
        colons and whitespace. In every other case "" is returned. The caller
        appends the result to the address of the previous entry.
        """
        if len(row) < 2:
            return ""
        first = (row[0] or "").strip()
        second = (row[1] or "").strip()
        if first:
            return ""
        if not second:
            return ""
        # Every cell other than the second must be empty.
        for i in range(2, len(row)):
            if (row[i] or "").strip():
                return ""
        # The second cell must look like a place name: contains a CJK
        # character and is not a pure number/time.
        if re.search(r"[\u4e00-\u9fa5]", second) and not re.match(r"^[\d.\-:\s]+$", second):
            return second
        return ""
    for table in tables:
        for row in table:
            # Address continuation line (first cell empty, only the second
            # cell filled with a place name): append to the previous entry.
            first_cell = (row[0] or "").strip() if len(row) > 0 else ""
            if not first_cell and record.electricMagnetic:
                continuation = get_address_continuation_strict(row)
                if continuation:
                    last_em = record.electricMagnetic[-1]
                    last_em.address = (last_em.address or "").strip()
                    if last_em.address:
                        last_em.address = last_em.address + continuation
                    else:
                        last_em.address = continuation
                    logger.debug(f"[电磁检测] 监测地点续行合并: ... + '{continuation}' -> {last_em.address}")
                    continue
            if is_valid_data_row(row):
                code = row[0].strip() if row[0] else ""
                # Skip a point code that was already collected.
                if code in seen_codes:
                    logger.debug(f"[电磁检测] 跳过重复的测点编号: {code}")
                    continue
                # NOTE(review): logs the raw row at INFO level with no message
                # prefix - looks like leftover debugging output.
                logger.info(row)
                em = ElectromagneticData()
                em.code = code
                # Locate the columns by content, because colspans make fixed
                # indices unreliable:
                # 1. address: the first cell after the code containing a CJK
                #    character (may be absent);
                # 2. height: a cell of the form number + "m" (e.g. "24m");
                # 3. time: a cell containing a date (e.g. "2025.7.21 10:35");
                # 4. data: the cells after the time column.
                address_idx = -1
                height_idx = -1
                monitor_at_idx = -1
                # Start at index 1 (index 0 is the code column).
                for i in range(1, len(row)):
                    cell = row[i].strip() if i < len(row) and row[i] else ""
                    if not cell:
                        continue
                    # Time column first: the date pattern is the most specific.
                    if re.search(r'\d{4}[.\-]\d{1,2}[.\-]\d{1,2}', cell):
                        if monitor_at_idx == -1:
                            monitor_at_idx = i
                            logger.debug(f"[电磁检测] 识别到时间列: 索引{i}, 值={cell}")
                        continue
                    # Height column: contains "m" and is not a date.
                    if "m" in cell and not re.search(r'\d{4}[.\-]\d{1,2}[.\-]\d{1,2}', cell):
                        # Confirm the shape: digits (with optional dots)
                        # immediately followed by "m", e.g. "24m".
                        if re.match(r'^\d+[.\d]*m', cell) and height_idx == -1:
                            height_idx = i
                            logger.debug(f"[电磁检测] 识别到高度列: 索引{i}, 值={cell}")
                            continue
                    # Neither time nor height: the first such cell that looks
                    # like a place name is taken as the address.
                    if address_idx == -1:
                        # Must contain a CJK character and not be a pure number/time.
                        if re.search(r'[\u4e00-\u9fa5]', cell) and not re.match(r'^[\d.\-:\s]+$', cell):
                            address_idx = i
                            logger.debug(f"[电磁检测] 识别到地址列: 索引{i}, 值={cell}")
                # Content-based detection failed: fall back to the fixed
                # default positions (backward compatibility).
                # NOTE(review): the defaults are not checked against the
                # indices already assigned, so index 2/3 may coincide with the
                # detected address or time column - confirm.
                if height_idx == -1:
                    # Default height position: index 2.
                    if len(row) > 2 and row[2]:
                        height_value = row[2].strip()
                        if height_value:
                            height_idx = 2
                            logger.debug(f"[电磁检测] 使用默认高度列位置: 索引2")
                if monitor_at_idx == -1:
                    # Default time position: index 3.
                    if len(row) > 3 and row[3]:
                        time_value = row[3].strip()
                        if time_value:
                            monitor_at_idx = 3
                            logger.debug(f"[电磁检测] 使用默认时间列位置: 索引3")
                # Copy the located cells into the entry.
                if address_idx >= 0 and address_idx < len(row):
                    em.address = row[address_idx].strip()
                if height_idx >= 0 and height_idx < len(row):
                    height_value = row[height_idx].strip()
                    em.height = validate_height(height_value)
                if monitor_at_idx >= 0 and monitor_at_idx < len(row):
                    em.monitorAt = row[monitor_at_idx].strip()
                # An empty line height defaults to 1.5 for these results.
                if not em.height or not em.height.strip():
                    em.height = "1.5"
                # The data columns start after the time column; failing that,
                # after the height column; failing that, after the address
                # column; otherwise at index 4.
                if monitor_at_idx >= 0:
                    data_start_idx = monitor_at_idx + 1
                elif height_idx >= 0:
                    data_start_idx = height_idx + 1
                elif address_idx >= 0:
                    data_start_idx = address_idx + 1
                else:
                    data_start_idx = 4
                # Skip blank cells up to the first value (expected to be the
                # first electric-field reading).
                while data_start_idx < len(row) and (not row[data_start_idx] or not row[data_start_idx].strip()):
                    data_start_idx += 1
                logger.debug(f"[电磁检测] 数据列起始索引: {data_start_idx}, 行数据: {row[data_start_idx:data_start_idx+12] if len(row) > data_start_idx else 'N/A'}")
                # Electric field strength: readings 1-5 from data_start_idx.
                # The mean may follow a "均值" label cell or be the sixth value
                # directly. The cells are stored as-is (not stripped).
                if len(row) > data_start_idx: em.powerFrequencyEFieldStrength1 = row[data_start_idx]
                if len(row) > data_start_idx + 1: em.powerFrequencyEFieldStrength2 = row[data_start_idx + 1]
                if len(row) > data_start_idx + 2: em.powerFrequencyEFieldStrength3 = row[data_start_idx + 2]
                if len(row) > data_start_idx + 3: em.powerFrequencyEFieldStrength4 = row[data_start_idx + 3]
                if len(row) > data_start_idx + 4: em.powerFrequencyEFieldStrength5 = row[data_start_idx + 4]
                # Electric-field mean: skip blanks and a "均值" label, then
                # look at the next cell.
                avg_field_idx = data_start_idx + 5
                while avg_field_idx < len(row) and (not row[avg_field_idx] or not row[avg_field_idx].strip() or row[avg_field_idx].strip() == "均值"):
                    avg_field_idx += 1
                if len(row) > avg_field_idx:
                    # The cell may be the electric-field mean, or already the
                    # first magnetic reading when the mean column is missing.
                    avg_value = row[avg_field_idx].strip()
                    # Heuristic by magnitude: a value above 1.0 is taken as
                    # the electric-field mean; anything else is treated as the
                    # first magnetic reading and the mean is computed later.
                    try:
                        avg_float = float(avg_value)
                        # Author's assumption: electric field strength is
                        # typically 1-1000, magnetic flux density 0-10.
                        if avg_float > 1.0:  # probably the electric-field mean
                            em.avgPowerFrequencyEFieldStrength = avg_value
                            magnetic_start_idx = avg_field_idx + 1
                        else:  # probably the first magnetic reading: do not consume it
                            magnetic_start_idx = avg_field_idx
                    except ValueError:
                        # Not a number: skip this cell.
                        magnetic_start_idx = avg_field_idx + 1
                else:
                    magnetic_start_idx = data_start_idx + 6
                # Magnetic mean: likewise skip blanks and a "均值" label.
                avg_magnetic_idx = magnetic_start_idx + 5
                while avg_magnetic_idx < len(row) and (not row[avg_magnetic_idx] or not row[avg_magnetic_idx].strip() or row[avg_magnetic_idx].strip() == "均值"):
                    avg_magnetic_idx += 1
                # Magnetic flux density: readings 1-5 from magnetic_start_idx, then the mean.
                if len(row) > magnetic_start_idx: em.powerFrequencyMagneticDensity1 = row[magnetic_start_idx]
                if len(row) > magnetic_start_idx + 1: em.powerFrequencyMagneticDensity2 = row[magnetic_start_idx + 1]
                if len(row) > magnetic_start_idx + 2: em.powerFrequencyMagneticDensity3 = row[magnetic_start_idx + 2]
                if len(row) > magnetic_start_idx + 3: em.powerFrequencyMagneticDensity4 = row[magnetic_start_idx + 3]
                if len(row) > magnetic_start_idx + 4: em.powerFrequencyMagneticDensity5 = row[magnetic_start_idx + 4]
                if len(row) > avg_magnetic_idx:
                    em.avgPowerFrequencyMagneticDensity = row[avg_magnetic_idx]
                elif len(row) > magnetic_start_idx + 5:
                    # Reached only when every cell from magnetic_start_idx + 5
                    # to the end of the row was blank or "均值".
                    em.avgPowerFrequencyMagneticDensity = row[magnetic_start_idx + 5]
                # No electric-field mean: compute it from the five readings.
                if not em.avgPowerFrequencyEFieldStrength or not em.avgPowerFrequencyEFieldStrength.strip():
                    field_values = [
                        em.powerFrequencyEFieldStrength1,
                        em.powerFrequencyEFieldStrength2,
                        em.powerFrequencyEFieldStrength3,
                        em.powerFrequencyEFieldStrength4,
                        em.powerFrequencyEFieldStrength5
                    ]
                    calculated_avg = calculate_average(field_values)
                    if calculated_avg:
                        em.avgPowerFrequencyEFieldStrength = calculated_avg
                        logger.debug(f"计算平均电场强度: {calculated_avg} (基于前5个值)")
                # No magnetic mean: compute it from the five readings.
                if not em.avgPowerFrequencyMagneticDensity or not em.avgPowerFrequencyMagneticDensity.strip():
                    density_values = [
                        em.powerFrequencyMagneticDensity1,
                        em.powerFrequencyMagneticDensity2,
                        em.powerFrequencyMagneticDensity3,
                        em.powerFrequencyMagneticDensity4,
                        em.powerFrequencyMagneticDensity5
                    ]
                    calculated_avg = calculate_average(density_values)
                    if calculated_avg:
                        em.avgPowerFrequencyMagneticDensity = calculated_avg
                        logger.debug(f"计算平均磁感应强度: {calculated_avg} (基于前5个值)")
                # Remember the code so a later duplicate is skipped.
                seen_codes.add(code)
                record.electricMagnetic.append(em)
    # Renumber the entries EB1, EB2, EB3, ... in collection order, and keep a
    # map from the original code to the new one for the address lookup below.
    code_mapping = {}  # original code (upper-cased) -> new code
    for idx, em in enumerate(record.electricMagnetic, start=1):
        original_code = em.code
        new_code = f"EB{idx}"
        code_mapping[original_code.upper()] = new_code
        em.code = new_code
        if original_code != new_code:
            logger.info(f"[电磁检测] 编号矫正: {original_code} -> {new_code}")
    # Fill addresses from the keyword comments into the matching entries.
    # The Markdown keyword supplement is applied first (higher priority).
    md_keywords_comment_match = re.search(r'<!--\s*Markdown关键词补充:(.*?)-->', markdown_content, re.DOTALL)
    if md_keywords_comment_match:
        keywords_text = md_keywords_comment_match.group(1)
        logger.info("[电磁检测] 发现Markdown关键词补充,开始提取地址信息(优先级高)")
        # Collect the "监测地点-<code>:<address>" pairs.
        address_matches = re.findall(r'监测地点-([A-Z0-9]+)[::]([^\n]+)', keywords_text)
        for code, address in address_matches:
            code_upper = code.upper()
            address = address.strip()
            if address:
                # Resolve through the original-code map; a code that is not
                # in the map is used as-is, i.e. treated as a new code.
                target_code = code_mapping.get(code_upper, code_upper)
                for em in record.electricMagnetic:
                    if em.code == target_code and (not em.address or not em.address.strip()):
                        em.address = address
                        logger.debug(f"[电磁检测] 从Markdown关键词补充提取到地址: {em.code} -> {address}")
    # Then the OCR keyword supplement (lower priority: only fills addresses
    # that are still empty).
    ocr_keywords_comment_match = re.search(r'<!--\s*OCR关键词补充:(.*?)-->', markdown_content, re.DOTALL)
    if ocr_keywords_comment_match:
        keywords_text = ocr_keywords_comment_match.group(1)
        logger.info("[电磁检测] 发现OCR关键词补充,开始提取地址信息(优先级低,仅在字段为空时补充)")
        # Collect the "监测地点-<code>:<address>" pairs.
        address_matches = re.findall(r'监测地点-([A-Z0-9]+)[::]([^\n]+)', keywords_text)
        for code, address in address_matches:
            code_upper = code.upper()
            address = address.strip()
            if address:
                # Resolve through the original-code map; a code that is not
                # in the map is used as-is, i.e. treated as a new code.
                target_code = code_mapping.get(code_upper, code_upper)
                for em in record.electricMagnetic:
                    if em.code == target_code and (not em.address or not em.address.strip()):
                        em.address = address
                        logger.debug(f"[电磁检测] 从OCR关键词补充提取到地址: {em.code} -> {address}")
    return record