# noise_parser.py (93 KB)
  1. # Copyright (c) Opendatalab. All rights reserved.
  2. """噪声检测记录解析模块 v2 - 独立版本,不依赖OCR"""
  3. from typing import Optional, List
  4. import re
  5. from ..utils.logging_config import get_logger
  6. from ..models.data_models import NoiseDetectionRecord, WeatherData, NoiseData
  7. from .table_parser import extract_table_with_rowspan_colspan, parse_operational_conditions, parse_operational_conditions_opstatus
  8. logger = get_logger("pdf_converter_v2.parser.noise")
  9. def clean_project_field(project: str) -> str:
  10. """清理project字段:如果包含"检测依据",删除"检测依据"及其后面的所有字符
  11. 同时清理末尾的标点符号(逗号、句号、分号等)
  12. Args:
  13. project: 原始project字段值
  14. Returns:
  15. 清理后的project字段值
  16. """
  17. if not project:
  18. return project
  19. # 查找"检测/监测/检查依据"的位置
  20. match = re.search(r'(检测|监测|检查)依据', project)
  21. if match:
  22. project = project[:match.start()].strip()
  23. logger.debug(f"[噪声检测] 清理project字段,删除'{match.group(0)}'及之后内容: {project}")
  24. # 清理末尾的标点符号(逗号、句号、分号、冒号等)
  25. project = re.sub(r'[,。;:,.;:]+$', '', project).strip()
  26. return project
  27. def correct_address_ocr_errors(address: str) -> str:
  28. """纠正address字段中的常见OCR识别错误
  29. 常见错误模式:
  30. 1. "厂界外lm" -> "厂界外1m" (手写的"1m"被识别为"lm")
  31. 2. "住户17" -> "住户1F" (手写的"1F"被识别为"17")
  32. 3. "住户47" -> "住户4F" (手写的"4F"被识别为"47")
  33. 4. "T界" -> "厂界" (手写的"厂"被识别为"T")
  34. 5. "群星木业17" -> "群星木业1F"
  35. 6. "东海花园137" -> "东海花园13F" (手写的"13F"被识别为"137")
  36. Args:
  37. address: 原始address字段值
  38. Returns:
  39. 纠正后的address字段值
  40. """
  41. if not address:
  42. return address
  43. original_address = address
  44. # 1. 纠正 "厂界外lm" -> "厂界外1m"
  45. # 匹配模式:厂界外 + lm(可能是手写的"1m"被识别为"lm")
  46. address = re.sub(r'厂界外lm\b', '厂界外1m', address)
  47. # 2. 纠正 "外lm" -> "外1m"(在"外"后面,如"五洲国际建材中心外lm")
  48. address = re.sub(r'外lm\b', '外1m', address)
  49. # 3. 纠正 "T界"/"t界" -> "厂界"(手写“厂”容易被识别为T/t,且后面可能紧跟其他汉字)
  50. address = re.sub(r'[Tt]界', '厂界', address)
  51. # 4. 纠正楼层号识别错误:数字+7 -> 数字+F
  52. # 模式:地址末尾的数字+7组合,很可能是楼层号(如1F、2F、13F等)
  53. # 根据实际案例:
  54. # - "住户17" -> "住户1F" (1楼)
  55. # - "住户47" -> "住户4F" (4楼)
  56. # - "群星木业17" -> "群星木业1F" (1楼)
  57. # - "群星木业47" -> "群星木业4F" (4楼)
  58. # - "东海花园17" -> "东海花园1F" (1楼)
  59. # - "东海花园137" -> "东海花园13F" (13楼)
  60. # - "东海花园177" -> "东海花园17F" (17楼)
  61. # - "东海花园217" -> "东海花园21F" (21楼)
  62. # - "卓维商务楼17" -> "卓维商务楼1F" (1楼)
  63. # - "卓维商务楼47" -> "卓维商务楼4F" (4楼)
  64. # 策略:先处理较长的模式(两位数+7),再处理较短的模式(单位数+7)
  65. # 这样可以避免误判,例如"137"应该被识别为"13F"而不是"1F"
  66. # 4.1 先处理两位数+7的情况(如137 -> 13F, 177 -> 17F, 217 -> 21F)
  67. # 匹配模式:地址末尾是两位数+7,且十位是1-2,个位是0-9
  68. address = re.sub(r'([1-2][0-9])7\b', r'\1F', address)
  69. # 4.2 再处理单位数+7的情况(如17 -> 1F, 27 -> 2F, 97 -> 9F)
  70. # 注意:这个规则在两位数规则之后执行,所以"137"已经被替换为"13F",不会再次匹配
  71. address = re.sub(r'([1-9])7\b', r'\1F', address)
  72. # 如果地址被修改了,记录日志
  73. if address != original_address:
  74. logger.info(f"[噪声检测] 纠正address字段OCR错误: '{original_address}' -> '{address}'")
  75. return address
  76. def _mark_auto_weather_default(record: NoiseDetectionRecord, weather_obj: Optional[WeatherData] = None) -> None:
  77. """标记天气字段使用了默认填充值"""
  78. setattr(record, "_auto_weather_default_used", True)
  79. if weather_obj is not None:
  80. weather_obj._auto_filled_weather = True
  81. def normalize_standard_text(text: str) -> str:
  82. """标准字段中可能包含数学/LaTeX格式,需先清理"""
  83. if not text:
  84. return text
  85. # 去掉美元符号和常见LaTeX指令(例如 \mathrm、\left、\right、\cdot 等)
  86. text = text.replace("$", "")
  87. text = re.sub(r"\\(mathrm|left|right|cdot|cdots|ldots|frac|overline|underline|mathbf|mathbf|mathit|mathsf|mathtt|mathcal)\b", "", text)
  88. # 删除其他未知的反斜杠命令,保留紧跟其后的文本
  89. text = re.sub(r"\\[a-zA-Z]+", "", text)
  90. # 去掉多余的大括号
  91. text = re.sub(r"[{}]", "", text)
  92. # 合并多余空白
  93. text = re.sub(r"\s+", " ", text).strip()
  94. return text
  95. def extract_standard_references(text: str) -> str:
  96. """解析检测/监测依据,支持包含数学格式的文本"""
  97. if not text:
  98. return ""
  99. text = normalize_standard_text(text.strip())
  100. text = text.replace("☐", "□").replace("■", "□")
  101. text = re.sub(r'□其他[::]?$', '', text).strip()
  102. gb_standards = re.findall(r'GB\s*\d+[-\.]?\d*[-\.]?\d*', text)
  103. if gb_standards:
  104. return " ".join(gb_standards)
  105. return re.sub(r'□\s*', '', text).strip()
  106. def _normalize_weather_text(weather_text: str) -> str:
  107. """标准化气象字段文本,插入缺失的分隔符,移除HTML"""
  108. if not weather_text:
  109. return weather_text
  110. text = weather_text
  111. text = re.sub(r'<[^>]+>', ' ', text) # 移除HTML标签
  112. text = text.replace("&nbsp;", " ")
  113. text = text.replace(".", ".")
  114. text = text.replace(",", " ")
  115. text = text.replace(":", ":")
  116. text = text.replace("℃C", "℃")
  117. # 为不同字段增加缺失的空格,避免如 "℃湿度" 无法拆分
  118. text = re.sub(r'([℃°C])\s*湿度', r'\1 湿度', text)
  119. text = re.sub(r'([℃°C])\s*风速', r'\1 风速', text)
  120. text = re.sub(r'(%RH)\s*风速', r'\1 风速', text, flags=re.IGNORECASE)
  121. text = re.sub(r'(%RH)\s*风速', r'%RH 风速', text)
  122. text = re.sub(r'(m/s)\s*风向', r'\1 风向', text, flags=re.IGNORECASE)
  123. text = re.sub(r'(M/S)\s*风向', r'm/s 风向', text)
  124. text = re.sub(r'风速([0-9])', r'风速 \1', text)
  125. # 保证冒号后有空格,便于分段
  126. text = re.sub(r'(日期|天气|温度|湿度|风速|风向)\s*[::]', r'\1: ', text)
  127. # 合并多余空白
  128. text = re.sub(r'\s+', ' ', text).strip()
  129. return text
  130. def parse_weather_from_text(weather_text: str, record: NoiseDetectionRecord) -> None:
  131. """从文本中解析天气数据,支持多条记录
  132. 文本格式示例:
  133. 日期:2025.9.23 天气 多云 温度26.7-27.4℃湿度66.6-67.4%RH 风速0.9-1.0 m/s 风向东偏北 日期:2025.9.24 天气 多云 温度24.5-28.6℃湿度65.3-67.1%RH 风速1.4-1.5 m/s 风向东偏北
  134. """
  135. weather_text = _normalize_weather_text(weather_text)
  136. if not weather_text or "日期" not in weather_text:
  137. return
  138. # 直接使用分段解析方式,因为第一个正则表达式太复杂且容易出错
  139. # 首先尝试分段解析,按日期分段,然后逐字段提取
  140. date_pattern = r'日期[::]\s*([\d.\-]+)'
  141. weather_pattern_simple = r'天气\s+([^\s温度湿度风速风向日期]+)'
  142. # 温度模式:温度后跟数字,直到遇到"℃"或"湿度"
  143. temp_pattern = r'温度\s*([0-9.\-]+)[℃°C]?'
  144. # 湿度模式:湿度后跟数字,直到遇到"%RH"或"风速"
  145. humidity_pattern = r'湿度\s*([0-9.\-]+)[%RH]?'
  146. # 风速模式:风速后跟数字和"m/s",支持~符号(如2.1~3.1)
  147. wind_speed_pattern = r'风速\s*([0-9.~\-]+)\s*m/s'
  148. # 风向模式:风向后跟方向描述,直到遇到"日期"、"气候条件"、"气候特征"或文本结束
  149. # 注意:不能排除"风"字,否则"南风"只能匹配到"南"
  150. # 需要处理"气候条件"或"气候特征"紧跟在风向后的情况(如"西北气候条件"、"西北气候特征")
  151. # 注意:字符类中不排除空格,因为风向值可能包含空格(如"东偏北"),空格会在前瞻断言中处理
  152. # 前瞻断言支持有空格或直接遇到关键词的情况
  153. wind_dir_pattern = r'风向\s*([^日期温度湿度风速气候条件气候特征<>]+?)(?=\s*(?:日期|温度|湿度|风速|气候条件|气候特征|<)|$)'
  154. # 找到所有日期位置,然后为每个日期解析一条记录
  155. dates = list(re.finditer(date_pattern, weather_text))
  156. if not dates:
  157. logger.warning(f"[噪声检测] 未找到日期信息: {weather_text[:100]}")
  158. return
  159. weather_found = False
  160. for idx, date_match in enumerate(dates):
  161. date_start = date_match.start()
  162. # 找到下一个日期位置或文本末尾
  163. if idx + 1 < len(dates):
  164. next_date_match = dates[idx + 1]
  165. section_end = next_date_match.start()
  166. else:
  167. section_end = len(weather_text)
  168. section = weather_text[date_start:section_end]
  169. logger.debug(f"[噪声检测] 解析天气段落: {section}")
  170. weather = WeatherData()
  171. weather.monitorAt = date_match.group(1).strip()
  172. # 提取天气(格式:天气 多云 或 天气多云)
  173. w_match = re.search(weather_pattern_simple, section)
  174. if w_match:
  175. weather_value = w_match.group(1).strip()
  176. # 如果提取到的值不是字段标签,则认为是天气值
  177. if weather_value and weather_value not in {"温度", "天气", "天气状况", "天气情况"}:
  178. weather.weather = weather_value
  179. logger.debug(f"[噪声检测] 提取到天气: {weather.weather}")
  180. # 提取温度(格式:温度26.7-27.4℃ 或 温度 26.7-27.4℃)
  181. t_match = re.search(temp_pattern, section)
  182. if t_match:
  183. temp = t_match.group(1).strip()
  184. weather.temp = temp
  185. logger.debug(f"[噪声检测] 提取到温度: {weather.temp}")
  186. # 提取湿度(格式:湿度66.6-67.4%RH 或 湿度 66.6-67.4%RH)
  187. h_match = re.search(humidity_pattern, section)
  188. if h_match:
  189. humidity = h_match.group(1).strip()
  190. weather.humidity = humidity
  191. logger.debug(f"[噪声检测] 提取到湿度: {weather.humidity}")
  192. # 提取风速(格式:风速0.9-1.0 m/s 或 风速 0.9-1.0 m/s,支持~符号但输出时转换为-)
  193. ws_match = re.search(wind_speed_pattern, section)
  194. if ws_match:
  195. wind_speed = ws_match.group(1).strip().replace("~", "-")
  196. weather.windSpeed = wind_speed
  197. logger.debug(f"[噪声检测] 提取到风速: {weather.windSpeed}")
  198. # 提取风向(格式:风向东北 或 风向 东北)
  199. # 注意:风向值不应该包含"日期"关键词,如果匹配到包含"日期"的内容,说明匹配错误
  200. wd_match = re.search(wind_dir_pattern, section)
  201. if wd_match:
  202. wind_dir_value = wd_match.group(1).strip()
  203. # 如果风向值包含"气候条件"或"气候特征",需要截断(处理"西北气候条件"、"西北气候特征"这种情况)
  204. if "气候条件" in wind_dir_value:
  205. wind_dir_value = wind_dir_value.split("气候条件")[0].strip()
  206. if "气候特征" in wind_dir_value:
  207. wind_dir_value = wind_dir_value.split("气候特征")[0].strip()
  208. # 验证风向值:不应该包含"日期"、"温度"、"湿度"、"风速"、"气候条件"、"气候特征"等关键词
  209. if wind_dir_value and "日期" not in wind_dir_value and "温度" not in wind_dir_value and \
  210. "湿度" not in wind_dir_value and "风速" not in wind_dir_value and \
  211. "气候条件" not in wind_dir_value and "气候特征" not in wind_dir_value and \
  212. not wind_dir_value.startswith("日期") and len(wind_dir_value) < 50: # 风向值不应该太长
  213. weather.windDirection = wind_dir_value
  214. logger.debug(f"[噪声检测] 提取到风向: {weather.windDirection}")
  215. else:
  216. logger.warning(f"[噪声检测] 风向值验证失败,跳过: {wind_dir_value}")
  217. # weather 为空且其它气象字段有任意一个不为空时,默认填入“晴”
  218. if not weather.weather.strip() and any([weather.temp, weather.humidity, weather.windSpeed, weather.windDirection]):
  219. weather.weather = "晴"
  220. _mark_auto_weather_default(record, weather)
  221. # 如果至少有一个字段不为空,则添加这条记录
  222. if any([weather.monitorAt, weather.weather, weather.temp, weather.humidity, weather.windSpeed, weather.windDirection]):
  223. record.weather.append(weather)
  224. weather_found = True
  225. logger.info(f"[噪声检测] 解析到天气记录: {weather.to_dict()}")
  226. else:
  227. logger.warning(f"[噪声检测] 天气记录字段全为空,跳过: {section}")
  228. # 如果分段解析成功,就不需要继续执行后面的代码
  229. if weather_found:
  230. return
  231. # 如果分段解析没有成功,尝试其他方式(向后兼容)
  232. if not weather_found:
  233. # 尝试分段解析,更精确地匹配格式:日期:2025.9.23 天气 多云 温度26.7-27.4℃湿度66.6-67.4%RH 风速0.9-1.0 m/s 风向东北
  234. # 注意:格式中字段和值之间可能没有空格,如"温度26.7-27.4℃"、"湿度66.6-67.4%RH"、"风速0.9-1.0 m/s"
  235. # 需要在遇到单位符号或下一个字段名时停止匹配
  236. date_pattern = r'日期[::]\s*([\d.\-]+)'
  237. weather_pattern_simple = r'天气\s+([^\s温度湿度风速风向日期]+)'
  238. # 温度模式:温度后跟数字和单位,直到遇到"湿度"或其他字段
  239. temp_pattern = r'温度\s*([0-9.\-]+)[℃°C]?'
  240. # 湿度模式:湿度后跟数字和单位,直到遇到"风速"或其他字段
  241. humidity_pattern = r'湿度\s*([0-9.\-]+)[%RH]?'
  242. # 风速模式:风速后跟数字和单位,直到遇到"风向"或其他字段,支持~符号(如2.1~3.1)
  243. wind_speed_pattern = r'风速\s*([0-9.~\-]+)\s*m/s'
  244. # 风向模式:风向后跟方向描述,直到遇到"日期"、"气候条件"或文本结束
  245. # 注意:不能排除"风"字,否则"南风"只能匹配到"南"
  246. # 需要处理"气候条件"紧跟在风向后的情况(如"西北气候条件")
  247. # 注意:字符类中不排除空格,因为风向值可能包含空格,空格会在前瞻断言中处理
  248. # 前瞻断言支持有空格或直接遇到关键词的情况
  249. wind_dir_pattern = r'风向\s*([^日期温度湿度风速气候条件<>]+?)(?=\s*(?:日期|温度|湿度|风速|气候条件|<)|$)'
  250. # 找到所有日期位置,然后为每个日期解析一条记录
  251. dates = list(re.finditer(date_pattern, weather_text))
  252. if not dates:
  253. logger.warning(f"[噪声检测] 未找到日期信息: {weather_text[:100]}")
  254. return
  255. for idx, date_match in enumerate(dates):
  256. date_start = date_match.start()
  257. # 找到下一个日期位置或文本末尾
  258. if idx + 1 < len(dates):
  259. next_date_match = dates[idx + 1]
  260. section_end = next_date_match.start()
  261. else:
  262. section_end = len(weather_text)
  263. section = weather_text[date_start:section_end]
  264. logger.debug(f"[噪声检测] 解析天气段落: {section}")
  265. weather = WeatherData()
  266. weather.monitorAt = date_match.group(1).strip()
  267. # 提取天气(格式:天气 多云 或 天气多云)
  268. w_match = re.search(weather_pattern_simple, section)
  269. if w_match:
  270. weather_value = w_match.group(1).strip()
  271. if weather_value and weather_value not in {"温度", "天气", "天气状况", "天气情况"}:
  272. weather.weather = weather_value
  273. logger.debug(f"[噪声检测] 提取到天气: {weather.weather}")
  274. # 提取温度(格式:温度26.7-27.4℃ 或 温度 26.7-27.4℃)
  275. t_match = re.search(temp_pattern, section)
  276. if t_match:
  277. temp = t_match.group(1).strip()
  278. weather.temp = temp
  279. logger.debug(f"[噪声检测] 提取到温度: {weather.temp}")
  280. # 提取湿度(格式:湿度66.6-67.4%RH 或 湿度 66.6-67.4%RH)
  281. h_match = re.search(humidity_pattern, section)
  282. if h_match:
  283. humidity = h_match.group(1).strip()
  284. weather.humidity = humidity
  285. logger.debug(f"[噪声检测] 提取到湿度: {weather.humidity}")
  286. # 提取风速(格式:风速0.9-1.0 m/s 或 风速 0.9-1.0 m/s,支持~符号但输出时转换为-)
  287. ws_match = re.search(wind_speed_pattern, section)
  288. if ws_match:
  289. wind_speed = ws_match.group(1).strip().replace("~", "-")
  290. weather.windSpeed = wind_speed
  291. logger.debug(f"[噪声检测] 提取到风速: {weather.windSpeed}")
  292. # 提取风向(格式:风向东北 或 风向 东北)
  293. # 注意:风向值不应该包含"日期"关键词,如果匹配到包含"日期"的内容,说明匹配错误
  294. wd_match = re.search(wind_dir_pattern, section)
  295. if wd_match:
  296. wind_dir_value = wd_match.group(1).strip()
  297. # 如果风向值包含"气候条件",需要截断(处理"西北气候条件"这种情况)
  298. if "气候条件" in wind_dir_value:
  299. wind_dir_value = wind_dir_value.split("气候条件")[0].strip()
  300. # 验证风向值:不应该包含"日期"、"温度"、"湿度"、"风速"、"气候条件"等关键词
  301. if wind_dir_value and "日期" not in wind_dir_value and "温度" not in wind_dir_value and \
  302. "湿度" not in wind_dir_value and "风速" not in wind_dir_value and \
  303. "气候条件" not in wind_dir_value and \
  304. not wind_dir_value.startswith("日期") and len(wind_dir_value) < 50: # 风向值不应该太长
  305. weather.windDirection = wind_dir_value
  306. logger.debug(f"[噪声检测] 提取到风向: {weather.windDirection}")
  307. else:
  308. logger.warning(f"[噪声检测] 风向值验证失败,跳过: {wind_dir_value}")
  309. # weather 为空且其它气象字段有任意一个不为空时,默认填入"晴"
  310. if not weather.weather.strip() and any([weather.temp, weather.humidity, weather.windSpeed, weather.windDirection]):
  311. weather.weather = "晴"
  312. _mark_auto_weather_default(record, weather)
  313. # 如果至少有一个字段不为空,则添加这条记录
  314. if any([weather.monitorAt, weather.weather, weather.temp, weather.humidity, weather.windSpeed, weather.windDirection]):
  315. record.weather.append(weather)
  316. logger.info(f"[噪声检测] 解析到天气记录(简化模式): {weather.to_dict()}")
  317. else:
  318. logger.warning(f"[噪声检测] 天气记录字段全为空,跳过: {section}")
  319. def parse_header_from_combined_cell(cell_text: str) -> dict:
  320. """从组合单元格中解析头部信息
  321. 单元格格式示例:
  322. 项目名称:武汉黄陂路102号南站改造工程竣工验收检测依据:GB 12348-2008 □GB3096-2008 □其他:声级计型号/编号:AY2201 声校准器型号/编号:AY2204 检测前校准值:93.8 dB(A) 检测后校准值:94.0 dB(A)气象条件...
  323. """
  324. result = {
  325. "project": "",
  326. "standardReferences": "",
  327. "soundLevelMeterMode": "",
  328. "soundCalibratorMode": "",
  329. "calibrationValueBefore": "",
  330. "calibrationValueAfter": ""
  331. }
  332. if not cell_text:
  333. return result
  334. # 检查是否包含任何需要解析的字段
  335. has_any_field = any(keyword in cell_text for keyword in [
  336. "项目名称", "检测依据", "监测依据", "检查依据", "声级计型号", "声校准器型号",
  337. "检测前校准值", "检测后校准值", "声纹计型号", "声级计校准器型号"
  338. ])
  339. if not has_any_field:
  340. return result
  341. # 解析项目名称:项目名称:xxx(后面跟着检测依据或其他字段,可能没有分隔符)
  342. # 匹配模式:项目名称:xxx(直到检测依据、监测依据、声级计、声校准器、检测前、检测后或气象条件,或到字符串末尾)
  343. # 注意:项目名称后面可能直接跟着"检测依据"没有分隔符,也可能后面没有其他字段
  344. project_match = re.search(r'项目名称[::](.+?)(?:检测依据|监测依据|检查依据|声级计|声校准器|检测前|检测后|气象条件|</td>|</tr>|$)', cell_text)
  345. if project_match:
  346. result["project"] = project_match.group(1).strip()
  347. # 如果提取到的项目名称为空,可能是正则表达式匹配到了但内容为空
  348. if not result["project"]:
  349. # 尝试更简单的匹配:项目名称:后面直到行尾或换行
  350. project_match2 = re.search(r'项目名称[::]([^<]+?)(?:</td>|</tr>|$)', cell_text)
  351. if project_match2:
  352. result["project"] = project_match2.group(1).strip()
  353. # 清理project字段,删除"检测依据"及之后的内容(防止正则表达式没有完全匹配到的情况)
  354. result["project"] = clean_project_field(result["project"])
  355. # 解析检测依据:检测依据:GB 12348-2008 □GB3096-2008 □其他:声级计...
  356. # 也可能格式为:检测依据:xxx或监测依据:xxx
  357. # 注意:检测依据后面可能跟着"□其他:",需要截断到"声级计"或"声校准器"或"检测前"或"检测后"或"气象条件"
  358. standard_match = re.search(r'(?:检测|监测|检查)依据[::](.+?)(?:声级计|声校准器|检测前|检测后|气象条件)', cell_text)
  359. if standard_match:
  360. standard_text = extract_standard_references(standard_match.group(1))
  361. if standard_text:
  362. result["standardReferences"] = standard_text
  363. else:
  364. # 如果第一个正则没有匹配到,尝试更宽松的匹配:匹配到行尾或下一个字段
  365. standard_match2 = re.search(r'(?:检测|监测|检查)依据[::]([^声级计声校准器检测前检测后气象条件]+?)(?:声级计|声校准器|检测前|检测后|气象条件|$)', cell_text)
  366. if standard_match2:
  367. standard_text = extract_standard_references(standard_match2.group(1))
  368. if standard_text:
  369. result["standardReferences"] = standard_text
  370. # 解析声级计型号/编号:声级计型号/编号:AY2201 或 声级计型号:AY2201 或 声级计型号/编号:AWA628+/AY2249
  371. # 支持包含+号和斜杠的型号,如 AWA628+/AY2249
  372. sound_meter_match = re.search(r'声级计型号[/::]?(?:编号)?[::]\s*([A-Z0-9+/]+)', cell_text)
  373. if sound_meter_match:
  374. result["soundLevelMeterMode"] = sound_meter_match.group(1).strip()
  375. # 解析声校准器型号/编号:声校准器型号/编号:AY2204 或 声校准器型号:AY2204
  376. # 支持包含+号和斜杠的型号
  377. calibrator_match = re.search(r'声校准器型号[/::]?(?:编号)?[::]\s*([A-Z0-9+/]+)', cell_text)
  378. if calibrator_match:
  379. result["soundCalibratorMode"] = calibrator_match.group(1).strip()
  380. # 解析检测前校准值:检测前校准值:93.8 dB(A)
  381. before_cal_match = re.search(r'检测前校准值[::]\s*([0-9.]+)\s*dB\(A\)', cell_text)
  382. if before_cal_match:
  383. cal_value = before_cal_match.group(1).strip()
  384. result["calibrationValueBefore"] = f"{cal_value} dB(A)"
  385. else:
  386. # 如果没有单位,只提取数值
  387. before_cal_match2 = re.search(r'检测前校准值[::]\s*([0-9.]+)', cell_text)
  388. if before_cal_match2:
  389. result["calibrationValueBefore"] = before_cal_match2.group(1).strip()
  390. # 解析检测后校准值:检测后校准值:94.0 dB(A)
  391. after_cal_match = re.search(r'检测后校准值[::]\s*([0-9.]+)\s*dB\(A\)', cell_text)
  392. if after_cal_match:
  393. cal_value = after_cal_match.group(1).strip()
  394. result["calibrationValueAfter"] = f"{cal_value} dB(A)"
  395. else:
  396. # 如果没有单位,只提取数值
  397. after_cal_match2 = re.search(r'检测后校准值[::]\s*([0-9.]+)', cell_text)
  398. if after_cal_match2:
  399. result["calibrationValueAfter"] = after_cal_match2.group(1).strip()
  400. return result
  401. def parse_noise_detection_record(markdown_content: str, first_page_image: Optional = None, output_dir: Optional[str] = None) -> NoiseDetectionRecord:
  402. """解析噪声检测记录 - v2版本不依赖OCR,只从markdown内容解析"""
  403. record = NoiseDetectionRecord()
  404. # 首先提取Markdown关键词补充(优先级高)
  405. md_keywords_comment_match = re.search(r'<!--\s*Markdown关键词补充:(.*?)-->', markdown_content, re.DOTALL)
  406. if md_keywords_comment_match:
  407. keywords_text = md_keywords_comment_match.group(1)
  408. logger.info("[噪声检测] 发现Markdown关键词补充,开始提取(优先级高)")
  409. # 提取项目名称
  410. project_match = re.search(r'项目名称[::]([^\n]+)', keywords_text)
  411. if project_match:
  412. record.project = clean_project_field(project_match.group(1).strip())
  413. logger.debug(f"[噪声检测] 从Markdown关键词补充提取到项目名称: {record.project}")
  414. # 提取检测依据
  415. standard_match = re.search(r'(?:检测|监测|检查)依据[::]([^\n]+)', keywords_text)
  416. if standard_match:
  417. record.standardReferences = extract_standard_references(standard_match.group(1))
  418. logger.debug(f"[噪声检测] 从Markdown关键词补充提取到检测依据: {record.standardReferences}")
  419. # 提取声级计型号/编号
  420. sound_meter_match = re.search(r'声级计型号/编号[::]([^\n]+)', keywords_text)
  421. if sound_meter_match:
  422. record.soundLevelMeterMode = sound_meter_match.group(1).strip()
  423. logger.debug(f"[噪声检测] 从Markdown关键词补充提取到声级计型号: {record.soundLevelMeterMode}")
  424. # 提取声校准器型号/编号
  425. calibrator_match = re.search(r'声校准器型号/编号[::]([^\n]+)', keywords_text)
  426. if calibrator_match:
  427. record.soundCalibratorMode = calibrator_match.group(1).strip()
  428. logger.debug(f"[噪声检测] 从Markdown关键词补充提取到声校准器型号: {record.soundCalibratorMode}")
  429. # 提取检测前校准值
  430. before_cal_match = re.search(r'检测前校准值[::]([^\n]+)', keywords_text)
  431. if before_cal_match:
  432. record.calibrationValueBefore = before_cal_match.group(1).strip()
  433. logger.debug(f"[噪声检测] 从Markdown关键词补充提取到检测前校准值: {record.calibrationValueBefore}")
  434. # 提取检测后校准值
  435. after_cal_match = re.search(r'检测后校准值[::]([^\n]+)', keywords_text)
  436. if after_cal_match:
  437. record.calibrationValueAfter = after_cal_match.group(1).strip()
  438. logger.debug(f"[噪声检测] 从Markdown关键词补充提取到检测后校准值: {record.calibrationValueAfter}")
  439. # 提取天气信息
  440. weather_lines = re.findall(r'日期[::]([^\n]+)', keywords_text)
  441. for weather_line in weather_lines:
  442. weather = WeatherData()
  443. # 解析天气行:日期:xxx 天气:xxx 温度:xxx 湿度:xxx 风速:xxx 风向:xxx
  444. date_match = re.search(r'日期[::]\s*([\d.\-]+)', weather_line)
  445. if date_match:
  446. weather.monitorAt = date_match.group(1).strip()
  447. weather_match = re.search(r'天气[::]\s*([^\s温度]+)', weather_line)
  448. if weather_match:
  449. weather.weather = weather_match.group(1).strip()
  450. temp_match = re.search(r'温度[::]\s*([0-9.\-]+)', weather_line)
  451. if temp_match:
  452. weather.temp = temp_match.group(1).strip()
  453. humidity_match = re.search(r'湿度[::]\s*([0-9.\-]+)', weather_line)
  454. if humidity_match:
  455. weather.humidity = humidity_match.group(1).strip()
  456. wind_speed_match = re.search(r'风速[::]\s*([0-9.~\-]+)', weather_line)
  457. if wind_speed_match:
  458. weather.windSpeed = wind_speed_match.group(1).strip().replace("~", "-")
  459. # 注意:不能排除"风"字,否则"南风"只能匹配到"南"
  460. # 使用非贪婪匹配,匹配到下一个字段名或行尾,需要处理"气候条件"
  461. wind_dir_match = re.search(r'风向[::]\s*([^\s日期温度湿度气候条件]+?)(?=\s*(?:日期|温度|湿度|风速|气候条件)|$)', weather_line)
  462. if wind_dir_match:
  463. wind_dir_value = wind_dir_match.group(1).strip()
  464. # 如果风向值包含"气候条件",需要截断(处理"西北气候条件"这种情况)
  465. if "气候条件" in wind_dir_value:
  466. wind_dir_value = wind_dir_value.split("气候条件")[0].strip()
  467. # 验证风向值:不应该包含"日期"、"温度"、"湿度"、"风速"、"气候条件"等关键词
  468. if wind_dir_value and "日期" not in wind_dir_value and "温度" not in wind_dir_value and \
  469. "湿度" not in wind_dir_value and "风速" not in wind_dir_value and \
  470. "气候条件" not in wind_dir_value and \
  471. not wind_dir_value.startswith("日期") and len(wind_dir_value) < 50: # 风向值不应该太长
  472. weather.windDirection = wind_dir_value
  473. else:
  474. logger.warning(f"[噪声检测] 风向值验证失败,跳过: {wind_dir_value}")
  475. # 如果天气为空但其他字段有值,默认为"晴"
  476. if not weather.weather or not weather.weather.strip():
  477. if any([weather.temp, weather.humidity, weather.windSpeed, weather.windDirection]):
  478. weather.weather = "晴"
  479. _mark_auto_weather_default(record, weather)
  480. _mark_auto_weather_default(record, weather)
  481. # 如果至少有一个字段不为空,添加到记录(即使monitorAt为空也先添加,后续会从表格中补充)
  482. if any([weather.weather, weather.temp, weather.humidity, weather.windSpeed, weather.windDirection]):
  483. record.weather.append(weather)
  484. logger.debug(f"[噪声检测] 从Markdown关键词补充提取到天气信息: {weather.to_dict()}")
  485. # 然后提取OCR关键词补充(优先级低,只在字段为空时补充)
  486. ocr_keywords_comment_match = re.search(r'<!--\s*OCR关键词补充:(.*?)-->', markdown_content, re.DOTALL)
  487. if ocr_keywords_comment_match:
  488. keywords_text = ocr_keywords_comment_match.group(1)
  489. logger.info("[噪声检测] 发现OCR关键词补充,开始提取(优先级低,仅在字段为空时补充)")
  490. # 提取项目名称(仅在字段为空时)
  491. project_match = re.search(r'项目名称[::]([^\n]+)', keywords_text)
  492. if project_match and (not record.project or not record.project.strip()):
  493. record.project = clean_project_field(project_match.group(1).strip())
  494. logger.debug(f"[噪声检测] 从OCR关键词补充提取到项目名称: {record.project}")
  495. # 提取检测依据(仅在字段为空时)
  496. standard_match = re.search(r'(?:检测|监测|检查)依据[::]([^\n]+)', keywords_text)
  497. if standard_match and (not record.standardReferences or not record.standardReferences.strip()):
  498. record.standardReferences = extract_standard_references(standard_match.group(1))
  499. logger.debug(f"[噪声检测] 从OCR关键词补充提取到检测依据: {record.standardReferences}")
  500. # 提取声级计型号/编号(仅在字段为空时)
  501. sound_meter_match = re.search(r'声级计型号/编号[::]([^\n]+)', keywords_text)
  502. if sound_meter_match and (not record.soundLevelMeterMode or not record.soundLevelMeterMode.strip()):
  503. record.soundLevelMeterMode = sound_meter_match.group(1).strip()
  504. logger.debug(f"[噪声检测] 从OCR关键词补充提取到声级计型号: {record.soundLevelMeterMode}")
  505. # 提取声校准器型号/编号(仅在字段为空时)
  506. calibrator_match = re.search(r'声校准器型号/编号[::]([^\n]+)', keywords_text)
  507. if calibrator_match and (not record.soundCalibratorMode or not record.soundCalibratorMode.strip()):
  508. record.soundCalibratorMode = calibrator_match.group(1).strip()
  509. logger.debug(f"[噪声检测] 从OCR关键词补充提取到声校准器型号: {record.soundCalibratorMode}")
  510. # 提取检测前校准值(仅在字段为空时)
  511. before_cal_match = re.search(r'检测前校准值[::]([^\n]+)', keywords_text)
  512. if before_cal_match and (not record.calibrationValueBefore or not record.calibrationValueBefore.strip()):
  513. record.calibrationValueBefore = before_cal_match.group(1).strip()
  514. logger.debug(f"[噪声检测] 从OCR关键词补充提取到检测前校准值: {record.calibrationValueBefore}")
  515. # 提取检测后校准值(仅在字段为空时)
  516. after_cal_match = re.search(r'检测后校准值[::]([^\n]+)', keywords_text)
  517. if after_cal_match and (not record.calibrationValueAfter or not record.calibrationValueAfter.strip()):
  518. record.calibrationValueAfter = after_cal_match.group(1).strip()
  519. logger.debug(f"[噪声检测] 从OCR关键词补充提取到检测后校准值: {record.calibrationValueAfter}")
  520. # 提取天气信息(仅在MD天气信息中没有对应日期或字段为空时补充)
  521. weather_lines = re.findall(r'日期[::]([^\n]+)', keywords_text)
  522. for weather_line in weather_lines:
  523. ocr_weather = WeatherData()
  524. # 解析天气行:日期:xxx 天气:xxx 温度:xxx 湿度:xxx 风速:xxx 风向:xxx
  525. date_match = re.search(r'日期[::]\s*([\d.\-]+)', weather_line)
  526. if date_match:
  527. ocr_weather.monitorAt = date_match.group(1).strip()
  528. weather_match = re.search(r'天气[::]\s*([^\s温度]+)', weather_line)
  529. if weather_match:
  530. ocr_weather.weather = weather_match.group(1).strip()
  531. temp_match = re.search(r'温度[::]\s*([0-9.\-]+)', weather_line)
  532. if temp_match:
  533. ocr_weather.temp = temp_match.group(1).strip()
  534. humidity_match = re.search(r'湿度[::]\s*([0-9.\-]+)', weather_line)
  535. if humidity_match:
  536. ocr_weather.humidity = humidity_match.group(1).strip()
  537. wind_speed_match = re.search(r'风速[::]\s*([0-9.~\-]+)', weather_line)
  538. if wind_speed_match:
  539. ocr_weather.windSpeed = wind_speed_match.group(1).strip().replace("~", "-")
  540. # 注意:不能排除"风"字,否则"南风"只能匹配到"南"
  541. # 使用非贪婪匹配,匹配到下一个字段名或行尾,需要处理"气候条件"
  542. wind_dir_match = re.search(r'风向[::]\s*([^\s日期温度湿度气候条件]+?)(?=\s*(?:日期|温度|湿度|风速|气候条件)|$)', weather_line)
  543. if wind_dir_match:
  544. wind_dir_value = wind_dir_match.group(1).strip()
  545. # 如果风向值包含"气候条件",需要截断(处理"西北气候条件"这种情况)
  546. if "气候条件" in wind_dir_value:
  547. wind_dir_value = wind_dir_value.split("气候条件")[0].strip()
  548. # 验证风向值:不应该包含"日期"、"温度"、"湿度"、"风速"、"气候条件"等关键词
  549. if wind_dir_value and "日期" not in wind_dir_value and "温度" not in wind_dir_value and \
  550. "湿度" not in wind_dir_value and "风速" not in wind_dir_value and \
  551. "气候条件" not in wind_dir_value and \
  552. not wind_dir_value.startswith("日期") and len(wind_dir_value) < 50: # 风向值不应该太长
  553. ocr_weather.windDirection = wind_dir_value
  554. else:
  555. logger.warning(f"[噪声检测] 风向值验证失败,跳过: {wind_dir_value}")
  556. # 如果天气为空但其他字段有值,默认为"晴"
  557. if not ocr_weather.weather or not ocr_weather.weather.strip():
  558. if any([ocr_weather.temp, ocr_weather.humidity, ocr_weather.windSpeed, ocr_weather.windDirection]):
  559. ocr_weather.weather = "晴"
  560. _mark_auto_weather_default(record, ocr_weather)
  561. # 检查是否已存在相同日期的MD天气记录
  562. if ocr_weather.monitorAt:
  563. ocr_date = ocr_weather.monitorAt.strip().rstrip('.')
  564. found_md_weather = None
  565. for md_weather in record.weather:
  566. md_date = md_weather.monitorAt.strip().rstrip('.') if md_weather.monitorAt else ""
  567. if md_date == ocr_date:
  568. found_md_weather = md_weather
  569. break
  570. if found_md_weather:
  571. # 如果找到MD天气记录,只在字段为空时用OCR补充
  572. if not found_md_weather.weather and ocr_weather.weather:
  573. found_md_weather.weather = ocr_weather.weather
  574. logger.debug(f"[噪声检测] 从OCR补充MD天气记录的天气字段: {found_md_weather.weather}")
  575. if not found_md_weather.temp and ocr_weather.temp:
  576. found_md_weather.temp = ocr_weather.temp
  577. logger.debug(f"[噪声检测] 从OCR补充MD天气记录的温度字段: {found_md_weather.temp}")
  578. if not found_md_weather.humidity and ocr_weather.humidity:
  579. found_md_weather.humidity = ocr_weather.humidity
  580. logger.debug(f"[噪声检测] 从OCR补充MD天气记录的湿度字段: {found_md_weather.humidity}")
  581. if not found_md_weather.windSpeed and ocr_weather.windSpeed:
  582. found_md_weather.windSpeed = ocr_weather.windSpeed
  583. logger.debug(f"[噪声检测] 从OCR补充MD天气记录的风速字段: {found_md_weather.windSpeed}")
  584. if not found_md_weather.windDirection and ocr_weather.windDirection:
  585. found_md_weather.windDirection = ocr_weather.windDirection
  586. logger.debug(f"[噪声检测] 从OCR补充MD天气记录的风向字段: {found_md_weather.windDirection}")
  587. else:
  588. # 如果没有找到MD天气记录,且OCR天气信息有值,则添加OCR天气记录
  589. if any([ocr_weather.weather, ocr_weather.temp, ocr_weather.humidity, ocr_weather.windSpeed, ocr_weather.windDirection]):
  590. record.weather.append(ocr_weather)
  591. logger.debug(f"[噪声检测] 从OCR关键词补充添加天气信息(MD中无对应日期): {ocr_weather.to_dict()}")
  592. else:
  593. # 如果OCR天气信息没有日期,但有其他字段,也添加(后续会从表格中补充日期)
  594. if any([ocr_weather.weather, ocr_weather.temp, ocr_weather.humidity, ocr_weather.windSpeed, ocr_weather.windDirection]):
  595. record.weather.append(ocr_weather)
  596. logger.debug(f"[噪声检测] 从OCR关键词补充添加天气信息(无日期): {ocr_weather.to_dict()}")
  597. # 保存OCR提取的天气信息(用于后续与表格解析结果合并)
  598. ocr_weather_list = record.weather.copy() if record.weather else []
  599. # 清空record.weather,让表格解析重新填充
  600. record.weather = []
  601. # 使用支持rowspan和colspan的函数提取表格,因为噪声检测表有复杂的表头结构
  602. tables = extract_table_with_rowspan_colspan(markdown_content)
  603. if not tables:
  604. logger.warning(f"[噪声检测] 未能提取出任何表格内容")
  605. return record
  606. first_table = tables[0]
  607. # 首先尝试从组合单元格中解析头部信息(这种情况是多个字段都在一个单元格中,或者单个字段也在同一单元格中)
  608. # 同时也支持字段名和值在不同单元格的情况(新格式)
  609. header_extracted = False
  610. weather_extracted = False
  611. for row_idx, row in enumerate(first_table):
  612. # 先尝试从同一单元格解析(旧格式)
  613. for cell in row:
  614. # 检查单元格是否包含头部字段的关键词(放宽条件,支持单个字段的情况)
  615. # 如果单元格包含字段名和冒号,说明值就在同一单元格中
  616. has_header_field = any(keyword in cell for keyword in [
  617. "项目名称", "检测依据", "监测依据", "检查依据", "声级计型号", "声校准器型号",
  618. "检测前校准值", "检测后校准值", "声纹计型号", "声级计校准器型号"
  619. ])
  620. has_colon = ":" in cell or ":" in cell
  621. if has_header_field and has_colon:
  622. logger.debug(f"[噪声检测] 发现包含字段信息的单元格,尝试解析: {cell[:100]}...")
  623. # 清理HTML标签,只保留文本内容
  624. cell_clean = re.sub(r'<[^>]+>', '', cell).strip()
  625. parsed_header = parse_header_from_combined_cell(cell_clean)
  626. # 更新字段(如果解析到值)
  627. if parsed_header["project"] and not record.project:
  628. record.project = clean_project_field(parsed_header["project"])
  629. header_extracted = True
  630. logger.debug(f"[噪声检测] 从单元格解析到项目名称: {record.project}")
  631. if parsed_header["standardReferences"] and not record.standardReferences:
  632. record.standardReferences = parsed_header["standardReferences"]
  633. logger.debug(f"[噪声检测] 从单元格解析到检测依据: {record.standardReferences}")
  634. if parsed_header["soundLevelMeterMode"] and not record.soundLevelMeterMode:
  635. record.soundLevelMeterMode = parsed_header["soundLevelMeterMode"]
  636. logger.debug(f"[噪声检测] 从单元格解析到声级计型号: {record.soundLevelMeterMode}")
  637. if parsed_header["soundCalibratorMode"] and not record.soundCalibratorMode:
  638. record.soundCalibratorMode = parsed_header["soundCalibratorMode"]
  639. logger.debug(f"[噪声检测] 从单元格解析到声校准器型号: {record.soundCalibratorMode}")
  640. if parsed_header["calibrationValueBefore"] and not record.calibrationValueBefore:
  641. record.calibrationValueBefore = parsed_header["calibrationValueBefore"]
  642. logger.debug(f"[噪声检测] 从单元格解析到检测前校准值: {record.calibrationValueBefore}")
  643. if parsed_header["calibrationValueAfter"] and not record.calibrationValueAfter:
  644. record.calibrationValueAfter = parsed_header["calibrationValueAfter"]
  645. logger.debug(f"[噪声检测] 从单元格解析到检测后校准值: {record.calibrationValueAfter}")
  646. # 如果单元格中包含气象条件,也从这里解析
  647. if "气象条件" in cell:
  648. weather_text = cell
  649. # 从"气象条件"之后的内容开始解析
  650. if "气象条件" in weather_text:
  651. # 提取气象条件部分(从"气象条件"开始到字符串末尾或下一个主要字段)
  652. weather_section = weather_text.split("气象条件")[-1] if "气象条件" in weather_text else weather_text
  653. parse_weather_from_text(weather_section, record)
  654. weather_extracted = True
  655. logger.info(f"[噪声检测] 从组合单元格解析到天气信息: {len(record.weather)} 条记录")
  656. # 尝试从不同单元格解析(新格式:字段名和值在不同单元格)
  657. for col_idx, cell in enumerate(row):
  658. cell_clean = re.sub(r'<[^>]+>', '', cell).strip()
  659. # 声级计型号/编号:在单元格中,值在下一列
  660. if "声级计型号" in cell_clean and (":" in cell_clean or ":" in cell_clean) and not record.soundLevelMeterMode:
  661. if col_idx + 1 < len(row) and row[col_idx + 1].strip():
  662. record.soundLevelMeterMode = row[col_idx + 1].strip()
  663. header_extracted = True
  664. logger.debug(f"[噪声检测] 从不同单元格解析到声级计型号: {record.soundLevelMeterMode}")
  665. # 声校准器型号/编号:在单元格中,值在下一列
  666. if "声校准器型号" in cell_clean and (":" in cell_clean or ":" in cell_clean) and not record.soundCalibratorMode:
  667. if col_idx + 1 < len(row) and row[col_idx + 1].strip():
  668. record.soundCalibratorMode = row[col_idx + 1].strip()
  669. header_extracted = True
  670. logger.debug(f"[噪声检测] 从不同单元格解析到声校准器型号: {record.soundCalibratorMode}")
  671. # 检测前校准值:在单元格中,值在下一列
  672. if "检测前校准值" in cell_clean and (":" in cell_clean or ":" in cell_clean) and not record.calibrationValueBefore:
  673. if col_idx + 1 < len(row) and row[col_idx + 1].strip():
  674. cal_value = row[col_idx + 1].strip()
  675. # 如果包含单位,保留;否则添加单位
  676. if "dB" in cal_value or "dB(A)" in cal_value or "dB(A)" in cal_value:
  677. record.calibrationValueBefore = cal_value
  678. else:
  679. record.calibrationValueBefore = f"{cal_value} dB(A)"
  680. header_extracted = True
  681. logger.debug(f"[噪声检测] 从不同单元格解析到检测前校准值: {record.calibrationValueBefore}")
  682. # 检测后校准值:在单元格中,值在下一列
  683. if "检测后校准值" in cell_clean and (":" in cell_clean or ":" in cell_clean) and not record.calibrationValueAfter:
  684. if col_idx + 1 < len(row) and row[col_idx + 1].strip():
  685. cal_value = row[col_idx + 1].strip()
  686. # 如果包含单位,保留;否则添加单位
  687. if "dB" in cal_value or "dB(A)" in cal_value or "dB(A)" in cal_value:
  688. record.calibrationValueAfter = cal_value
  689. else:
  690. record.calibrationValueAfter = f"{cal_value} dB(A)"
  691. header_extracted = True
  692. logger.debug(f"[噪声检测] 从不同单元格解析到检测后校准值: {record.calibrationValueAfter}")
  693. # 如果已经解析到所有需要的字段,可以提前结束
  694. if record.project and record.soundLevelMeterMode and record.calibrationValueBefore and record.calibrationValueAfter:
  695. logger.info(f"[噪声检测] 从单元格成功解析到所有头部信息: project={record.project}, "
  696. f"soundLevelMeterMode={record.soundLevelMeterMode}, "
  697. f"calibrationValueBefore={record.calibrationValueBefore}, "
  698. f"calibrationValueAfter={record.calibrationValueAfter}")
  699. if not weather_extracted:
  700. break
  701. # 如果还没有提取到头部信息,使用原来的方法(假设字段分布在不同的单元格中)
  702. # 但也要尝试从同一单元格中提取(如果单元格包含字段名和冒号)
  703. if not header_extracted:
  704. for row in first_table:
  705. logger.debug(f"[噪声检测][ROW] len={len(row)}, content={row}")
  706. for i, cell in enumerate(row):
  707. # 尝试从同一单元格中提取项目名称(如果包含冒号)
  708. if "项目名称" in cell and (":" in cell or ":" in cell) and not record.project:
  709. # 使用 parse_header_from_combined_cell 解析
  710. parsed = parse_header_from_combined_cell(cell)
  711. if parsed["project"]:
  712. record.project = clean_project_field(parsed["project"])
  713. header_extracted = True
  714. logger.debug(f"[噪声检测] 从单元格 {i} 解析到项目名称: {record.project}")
  715. break
  716. # 如果同一单元格没有值,尝试从下一个单元格获取(向后兼容)
  717. if "项目名称" in cell and i + 1 < len(row) and not record.project:
  718. # 检查下一个单元格是否有内容
  719. if row[i + 1].strip():
  720. record.project = clean_project_field(row[i + 1].strip())
  721. if not record.project.strip():
  722. logger.error(f"[噪声检测] 项目名称 为空,行数据: {row}")
  723. else:
  724. header_extracted = True
  725. logger.debug(f"[噪声检测] 从单元格 {i+1} 解析到项目名称: {record.project}")
  726. break
  727. if any(k in row[0] for k in ["检测依据", "监测依据", "检查依据"]):
  728. for i, cell in enumerate(row):
  729. if any(k in cell for k in ["检测依据", "监测依据", "检查依据"]) and i + 1 < len(row):
  730. candidate_standard = extract_standard_references(row[i + 1])
  731. if candidate_standard:
  732. record.standardReferences = candidate_standard
  733. logger.debug(f"[噪声检测] 从行数据解析到检测依据: {record.standardReferences}")
  734. else:
  735. logger.error(f"[噪声检测] 检测/监测依据 为空或无法解析,行数据: {row}")
  736. break
  737. # 尝试从同一单元格或下一个单元格提取声级计型号
  738. for i, cell in enumerate(row):
  739. if any(k in cell for k in ["声纹计型号", "声级计型号"]) and not record.soundLevelMeterMode:
  740. # 先尝试从同一单元格提取(如果包含冒号)
  741. if (":" in cell or ":" in cell):
  742. parsed = parse_header_from_combined_cell(cell)
  743. if parsed["soundLevelMeterMode"]:
  744. record.soundLevelMeterMode = parsed["soundLevelMeterMode"]
  745. logger.debug(f"[噪声检测] 从单元格 {i} 解析到声级计型号: {record.soundLevelMeterMode}")
  746. break
  747. # 如果同一单元格没有值,尝试从下一个单元格获取
  748. elif i + 1 < len(row) and row[i + 1].strip():
  749. record.soundLevelMeterMode = row[i + 1].strip()
  750. if not record.soundLevelMeterMode.strip():
  751. logger.error(f"[噪声检测] 声级计型号 为空,行数据: {row}")
  752. else:
  753. logger.debug(f"[噪声检测] 从单元格 {i+1} 解析到声级计型号: {record.soundLevelMeterMode}")
  754. break
  755. # 尝试从同一单元格或下一个单元格提取声校准器型号
  756. for i, cell in enumerate(row):
  757. if any(k in cell for k in ["声纹准器型号", "声校准器型号", "声级计校准器型号"]) and not record.soundCalibratorMode:
  758. # 先尝试从同一单元格提取(如果包含冒号)
  759. if (":" in cell or ":" in cell):
  760. parsed = parse_header_from_combined_cell(cell)
  761. if parsed["soundCalibratorMode"]:
  762. record.soundCalibratorMode = parsed["soundCalibratorMode"]
  763. logger.debug(f"[噪声检测] 从单元格 {i} 解析到声校准器型号: {record.soundCalibratorMode}")
  764. break
  765. # 如果同一单元格没有值,尝试从下一个单元格获取
  766. elif i + 1 < len(row) and row[i + 1].strip():
  767. record.soundCalibratorMode = row[i + 1].strip()
  768. if not record.soundCalibratorMode.strip():
  769. logger.error(f"[噪声检测] 声级计校准器型号 为空,行数据: {row}")
  770. else:
  771. logger.debug(f"[噪声检测] 从单元格 {i+1} 解析到声校准器型号: {record.soundCalibratorMode}")
  772. break
  773. # 尝试从同一单元格或下一个单元格提取检测前校准值
  774. for i, cell in enumerate(row):
  775. if "检测前校准值" in cell and not record.calibrationValueBefore:
  776. # 先尝试从同一单元格提取(如果包含冒号)
  777. if (":" in cell or ":" in cell):
  778. parsed = parse_header_from_combined_cell(cell)
  779. if parsed["calibrationValueBefore"]:
  780. record.calibrationValueBefore = parsed["calibrationValueBefore"]
  781. logger.debug(f"[噪声检测] 从单元格 {i} 解析到检测前校准值: {record.calibrationValueBefore}")
  782. break
  783. # 如果同一单元格没有值,尝试从下一个单元格获取
  784. elif i + 1 < len(row) and row[i + 1].strip():
  785. record.calibrationValueBefore = row[i + 1].strip()
  786. if not record.calibrationValueBefore.strip():
  787. logger.error(f"[噪声检测] 检测前校准值 为空,行数据: {row}")
  788. else:
  789. logger.debug(f"[噪声检测] 从单元格 {i+1} 解析到检测前校准值: {record.calibrationValueBefore}")
  790. break
  791. # 尝试从同一单元格或下一个单元格提取检测后校准值
  792. for i, cell in enumerate(row):
  793. if "检测后校准值" in cell and not record.calibrationValueAfter:
  794. # 先尝试从同一单元格提取(如果包含冒号)
  795. if (":" in cell or ":" in cell):
  796. parsed = parse_header_from_combined_cell(cell)
  797. if parsed["calibrationValueAfter"]:
  798. record.calibrationValueAfter = parsed["calibrationValueAfter"]
  799. logger.debug(f"[噪声检测] 从单元格 {i} 解析到检测后校准值: {record.calibrationValueAfter}")
  800. break
  801. # 如果同一单元格没有值,尝试从下一个单元格获取
  802. elif i + 1 < len(row) and row[i + 1].strip():
  803. record.calibrationValueAfter = row[i + 1].strip()
  804. if not record.calibrationValueAfter.strip():
  805. logger.error(f"[噪声检测] 检测后校准值 为空,行数据: {row}")
  806. else:
  807. logger.debug(f"[噪声检测] 从单元格 {i+1} 解析到检测后校准值: {record.calibrationValueAfter}")
  808. break
  809. # 解析气象条件 - 支持多条记录(如果还没有从组合单元格中提取到天气数据)
  810. if not weather_extracted:
  811. # 首先尝试从表格结构中解析(气象条件在第一列,日期在第二列,天气在第三列等)
  812. for row_idx, row in enumerate(first_table):
  813. if len(row) < 2:
  814. continue
  815. # 检查是否是气象条件行(第一列包含"气象条件")
  816. if "气象条件" in row[0]:
  817. # 尝试从表格单元格中解析天气信息
  818. # 格式:气象条件 | 日期:xxx | 天气 | 温度 | xxx | ...
  819. # 或者:气象条件 | 日期:xxx | 天气 | 多云 | 温度 | xxx | ...
  820. # 找到日期列(包含"日期"的列)
  821. date_col_idx = -1
  822. weather_col_idx = -1
  823. temp_col_idx = -1
  824. for col_idx, cell in enumerate(row):
  825. if "日期" in cell and ":" in cell:
  826. date_col_idx = col_idx
  827. elif cell.strip() == "天气" or "天气" in cell:
  828. weather_col_idx = col_idx
  829. elif cell.strip() == "温度" or "温度" in cell:
  830. temp_col_idx = col_idx
  831. # 如果找到日期列,尝试解析多行天气数据
  832. if date_col_idx >= 0:
  833. # 从当前行开始,查找所有包含日期的行
  834. for check_row_idx in range(row_idx, min(row_idx + 5, len(first_table))): # 最多检查5行
  835. check_row = first_table[check_row_idx]
  836. if len(check_row) <= date_col_idx:
  837. continue
  838. date_cell = check_row[date_col_idx]
  839. # 检查是否包含日期
  840. date_match = re.search(r'日期[::]\s*([\d.\-]+)', date_cell)
  841. if not date_match:
  842. continue
  843. weather = WeatherData()
  844. weather.monitorAt = date_match.group(1).strip()
  845. # 在当前行中重新查找列索引(因为不同行的列结构可能不同)
  846. current_weather_col_idx = -1
  847. current_temp_col_idx = -1
  848. for col_idx, cell in enumerate(check_row):
  849. if cell.strip() == "天气" or "天气" in cell:
  850. current_weather_col_idx = col_idx
  851. elif cell.strip() == "温度" or "温度" in cell:
  852. current_temp_col_idx = col_idx
  853. # 提取天气(在日期列之后查找)
  854. # 天气值在"天气"标签的下一列(如果下一列不是"温度")
  855. if current_weather_col_idx >= 0 and len(check_row) > current_weather_col_idx + 1:
  856. weather_value = check_row[current_weather_col_idx + 1].strip()
  857. # 如果下一列是天气值(不是"天气"标签,不是"温度"标签,且不是数字),则使用
  858. if weather_value and weather_value != "天气" and weather_value != "温度" and not re.match(r'^[\d.\-]+$', weather_value):
  859. weather.weather = weather_value
  860. else:
  861. # 尝试从日期列之后查找"天气"标签,然后取下一列
  862. for col_idx in range(date_col_idx + 1, min(date_col_idx + 5, len(check_row))):
  863. cell = check_row[col_idx].strip()
  864. if cell == "天气" and col_idx + 1 < len(check_row):
  865. # 找到"天气"标签,取下一列的值
  866. next_cell = check_row[col_idx + 1].strip()
  867. if next_cell and next_cell != "天气" and next_cell != "温度" and not re.match(r'^[\d.\-]+$', next_cell):
  868. weather.weather = next_cell
  869. break
  870. elif cell and cell != "天气" and cell != "温度" and not re.match(r'^[\d.\-]+$', cell) and col_idx == date_col_idx + 1:
  871. # 日期列之后的第一列可能是天气值(如果格式正确)
  872. weather.weather = cell
  873. break
  874. # 提取温度
  875. # 温度值在"温度"标签的下一列
  876. if current_temp_col_idx >= 0 and len(check_row) > current_temp_col_idx + 1:
  877. temp_value = check_row[current_temp_col_idx + 1].strip()
  878. # 如果下一列是温度值(包含数字和-),则使用
  879. if temp_value and re.match(r'[\d.\-]+', temp_value):
  880. weather.temp = temp_value
  881. else:
  882. # 尝试从日期列之后查找"温度"标签,然后取下一列
  883. for col_idx in range(date_col_idx + 1, min(date_col_idx + 6, len(check_row))):
  884. cell = check_row[col_idx].strip()
  885. if cell == "温度" and col_idx + 1 < len(check_row):
  886. # 找到"温度"标签,取下一列的值
  887. temp_value = check_row[col_idx + 1].strip()
  888. if temp_value and re.match(r'[\d.\-]+', temp_value):
  889. weather.temp = temp_value
  890. break
  891. elif "℃" in cell or (re.match(r'[\d.\-]+', cell) and "温度" not in cell):
  892. # 如果直接找到温度值(包含℃或数字),也使用
  893. weather.temp = cell.replace("℃", "").strip()
  894. break
  895. # 提取湿度
  896. # 注意:新格式中"℃ 湿度"可能在同一个单元格中
  897. for col_idx, cell in enumerate(check_row):
  898. if "湿度" in cell:
  899. # 如果单元格包含"℃ 湿度",湿度值在下一列
  900. if "℃ 湿度" in cell or ("℃" in cell and "湿度" in cell):
  901. if col_idx + 1 < len(check_row):
  902. humidity_value = check_row[col_idx + 1].strip()
  903. if humidity_value and humidity_value != "湿度":
  904. weather.humidity = humidity_value.replace("%RH", "").strip()
  905. break
  906. elif col_idx + 1 < len(check_row):
  907. humidity_value = check_row[col_idx + 1].strip()
  908. if humidity_value and humidity_value != "湿度":
  909. weather.humidity = humidity_value.replace("%RH", "").strip()
  910. break
  911. # 提取风速
  912. # 注意:新格式中"%RH 风速"可能在同一个单元格中
  913. for col_idx, cell in enumerate(check_row):
  914. if "风速" in cell:
  915. # 如果单元格包含"%RH 风速",风速值在下一列
  916. if "%RH 风速" in cell or ("%RH" in cell and "风速" in cell):
  917. if col_idx + 1 < len(check_row):
  918. wind_speed_value = check_row[col_idx + 1].strip()
  919. if wind_speed_value and wind_speed_value != "风速":
  920. weather.windSpeed = wind_speed_value.replace("m/s", "").strip()
  921. break
  922. elif col_idx + 1 < len(check_row):
  923. wind_speed_value = check_row[col_idx + 1].strip()
  924. if wind_speed_value and wind_speed_value != "风速":
  925. weather.windSpeed = wind_speed_value.replace("m/s", "").strip()
  926. break
  927. # 提取风向
  928. for col_idx, cell in enumerate(check_row):
  929. if "风向" in cell and col_idx + 1 < len(check_row):
  930. wind_dir_value = check_row[col_idx + 1].strip()
  931. # 验证风向值:不应该包含"日期"、"温度"、"湿度"、"风速"等关键词
  932. if wind_dir_value and wind_dir_value != "风向" and \
  933. "日期" not in wind_dir_value and "温度" not in wind_dir_value and \
  934. "湿度" not in wind_dir_value and "风速" not in wind_dir_value and \
  935. not wind_dir_value.startswith("日期") and len(wind_dir_value) < 50:
  936. weather.windDirection = wind_dir_value
  937. break
  938. # 如果天气为空但其他字段有值,默认为"晴"
  939. if not weather.weather or not weather.weather.strip():
  940. if any([weather.temp, weather.humidity, weather.windSpeed, weather.windDirection]):
  941. weather.weather = "晴"
  942. _mark_auto_weather_default(record, weather)
  943. logger.debug(f"[噪声检测] 天气字段为空,但其他字段有值,默认为'晴': {weather.monitorAt}")
  944. # 如果至少有一个字段不为空,则添加这条记录
  945. if any([weather.monitorAt, weather.weather, weather.temp, weather.humidity, weather.windSpeed, weather.windDirection]):
  946. record.weather.append(weather)
  947. weather_extracted = True
  948. logger.info(f"[噪声检测] 从表格解析到天气记录: {weather.to_dict()}")
  949. if weather_extracted:
  950. break
  951. # 如果表格解析失败,尝试文本解析
  952. if not weather_extracted:
  953. text = " ".join(row[1:])
  954. parse_weather_from_text(text, record)
  955. if record.weather:
  956. weather_extracted = True
  957. break
  958. # 将OCR提取的天气信息与表格解析的天气信息进行合并
  959. # 按顺序匹配风向:第一条OCR天气信息对应第一条表格天气信息,以此类推
  960. if ocr_weather_list:
  961. logger.debug(f"[噪声检测] 开始合并OCR和表格解析的天气信息,OCR提取了 {len(ocr_weather_list)} 条,表格解析了 {len(record.weather)} 条")
  962. # 提取OCR风向数组,按顺序匹配到表格解析的天气记录
  963. ocr_wind_directions = []
  964. for ocr_weather in ocr_weather_list:
  965. if ocr_weather.windDirection and ocr_weather.windDirection.strip():
  966. ocr_wind_directions.append(ocr_weather.windDirection.strip())
  967. else:
  968. ocr_wind_directions.append("") # 保持顺序,即使为空
  969. logger.debug(f"[噪声检测] 从OCR提取的风向数组: {ocr_wind_directions}")
  970. # 按顺序将OCR风向填充到表格解析的天气记录中
  971. for i, table_weather in enumerate(record.weather):
  972. if i < len(ocr_wind_directions) and ocr_wind_directions[i]:
  973. if not table_weather.windDirection or not table_weather.windDirection.strip():
  974. table_weather.windDirection = ocr_wind_directions[i]
  975. logger.debug(f"[噪声检测] 按顺序填充第{i}条表格天气记录的风向: {table_weather.windDirection}")
  976. # 原有的合并逻辑(用于补充其他字段,如日期、天气、温度等)
  977. for ocr_weather in ocr_weather_list:
  978. # 如果OCR提取的天气信息中monitorAt为空,尝试从表格解析的天气信息中匹配
  979. matched_in_first_branch = False
  980. if not ocr_weather.monitorAt or not ocr_weather.monitorAt.strip():
  981. # 根据温度、湿度、风速等字段匹配表格解析的天气信息
  982. matched = False
  983. for table_weather in record.weather:
  984. # 匹配条件:温度、湿度、风速、风向相同或相似
  985. # 确保返回布尔值,避免空字符串导致类型错误
  986. temp_match = bool(ocr_weather.temp and table_weather.temp and
  987. ocr_weather.temp.strip() == table_weather.temp.strip())
  988. humidity_match = bool(ocr_weather.humidity and table_weather.humidity and
  989. ocr_weather.humidity.strip() == table_weather.humidity.strip())
  990. wind_speed_match = bool(ocr_weather.windSpeed and table_weather.windSpeed and
  991. ocr_weather.windSpeed.strip() == table_weather.windSpeed.strip())
  992. wind_dir_match = bool(ocr_weather.windDirection and table_weather.windDirection and
  993. ocr_weather.windDirection.strip() == table_weather.windDirection.strip())
  994. # 如果至少有两个字段匹配,认为这是同一条天气记录
  995. # 或者如果表格解析的天气记录只有部分字段(如只有windDirection),也尝试合并
  996. match_count = sum([temp_match, humidity_match, wind_speed_match, wind_dir_match])
  997. # 如果表格解析的天气记录只有windDirection,也尝试合并(通过日期匹配)
  998. has_only_wind_dir = (table_weather.windDirection and not table_weather.weather and
  999. not table_weather.temp and not table_weather.humidity and
  1000. not table_weather.windSpeed)
  1001. if (match_count >= 2 and table_weather.monitorAt) or (has_only_wind_dir and table_weather.monitorAt):
  1002. ocr_weather.monitorAt = table_weather.monitorAt
  1003. logger.debug(f"[噪声检测] 从表格解析结果补充OCR天气信息的日期: {ocr_weather.monitorAt}")
  1004. # 将OCR提取的所有字段补充到表格解析的天气信息中
  1005. if not table_weather.weather and ocr_weather.weather:
  1006. table_weather.weather = ocr_weather.weather
  1007. logger.debug(f"[噪声检测] 从OCR补充表格解析的天气: {table_weather.weather}")
  1008. if not table_weather.temp and ocr_weather.temp:
  1009. table_weather.temp = ocr_weather.temp
  1010. logger.debug(f"[噪声检测] 从OCR补充表格解析的温度: {table_weather.temp}")
  1011. if not table_weather.humidity and ocr_weather.humidity:
  1012. table_weather.humidity = ocr_weather.humidity
  1013. logger.debug(f"[噪声检测] 从OCR补充表格解析的湿度: {table_weather.humidity}")
  1014. if not table_weather.windSpeed and ocr_weather.windSpeed:
  1015. table_weather.windSpeed = ocr_weather.windSpeed
  1016. logger.debug(f"[噪声检测] 从OCR补充表格解析的风速: {table_weather.windSpeed}")
  1017. if not table_weather.windDirection and ocr_weather.windDirection:
  1018. table_weather.windDirection = ocr_weather.windDirection
  1019. logger.debug(f"[噪声检测] 从OCR补充表格解析的风向: {table_weather.windDirection}")
  1020. matched = True
  1021. matched_in_first_branch = True # 标记已在第一个分支匹配成功
  1022. break # 匹配成功后立即退出循环,避免继续匹配
  1023. # 如果没有匹配到,但OCR天气信息有其他字段,也添加到记录中(日期为空)
  1024. if not matched:
  1025. logger.debug(f"[噪声检测] OCR天气信息未匹配到表格解析结果,保留原信息(日期为空)")
  1026. # 如果OCR天气信息有日期,且未在第一个分支匹配成功,检查是否与表格解析的天气信息重复
  1027. if ocr_weather.monitorAt and ocr_weather.monitorAt.strip() and not matched_in_first_branch:
  1028. # 检查是否已存在相同日期的天气记录
  1029. # 处理日期格式不一致的情况(如 205.7.10 vs 2025.7.10)
  1030. ocr_date = ocr_weather.monitorAt.strip()
  1031. # 如果日期格式是 205.7.10,尝试修正为 2025.7.10
  1032. if re.match(r'^205\.', ocr_date): # 匹配 205 开头的日期(OCR识别错误)
  1033. ocr_date_normalized = re.sub(r'^205\.', '2025.', ocr_date)
  1034. elif re.match(r'^20[0-4]\.', ocr_date): # 匹配其他 200-204 开头的日期
  1035. ocr_date_normalized = re.sub(r'^20[0-4]\.', '2025.', ocr_date)
  1036. else:
  1037. ocr_date_normalized = ocr_date
  1038. exists = False
  1039. for table_weather in record.weather:
  1040. table_date = table_weather.monitorAt.strip() if table_weather.monitorAt else ""
  1041. # 处理表格日期格式(可能包含末尾的点号,如 "2025.03.28.")
  1042. table_date_clean = table_date.rstrip('.')
  1043. ocr_date_clean = ocr_date.rstrip('.')
  1044. ocr_date_normalized_clean = ocr_date_normalized.rstrip('.')
  1045. # 直接比较或比较归一化后的日期(忽略末尾的点号)
  1046. if table_date_clean and (table_date_clean == ocr_date_clean or table_date_clean == ocr_date_normalized_clean):
  1047. exists = True
  1048. # 如果表格解析的天气信息不完整,用OCR信息补充
  1049. if not table_weather.weather and ocr_weather.weather:
  1050. table_weather.weather = ocr_weather.weather
  1051. logger.debug(f"[噪声检测] 从OCR补充表格解析的天气: {table_weather.weather}")
  1052. if not table_weather.temp and ocr_weather.temp:
  1053. table_weather.temp = ocr_weather.temp
  1054. logger.debug(f"[噪声检测] 从OCR补充表格解析的温度: {table_weather.temp}")
  1055. if not table_weather.humidity and ocr_weather.humidity:
  1056. table_weather.humidity = ocr_weather.humidity
  1057. logger.debug(f"[噪声检测] 从OCR补充表格解析的湿度: {table_weather.humidity}")
  1058. if not table_weather.windSpeed and ocr_weather.windSpeed:
  1059. table_weather.windSpeed = ocr_weather.windSpeed
  1060. logger.debug(f"[噪声检测] 从OCR补充表格解析的风速: {table_weather.windSpeed}")
  1061. if not table_weather.windDirection and ocr_weather.windDirection:
  1062. table_weather.windDirection = ocr_weather.windDirection
  1063. logger.debug(f"[噪声检测] 从OCR补充表格解析的风向: {table_weather.windDirection}")
  1064. logger.debug(f"[噪声检测] OCR天气信息与表格解析结果合并: {table_weather.to_dict()}")
  1065. break # 找到匹配的记录后立即退出
  1066. # 如果不存在相同日期的记录,且OCR信息完整,添加到记录中
  1067. if not exists and any([ocr_weather.weather, ocr_weather.temp, ocr_weather.humidity,
  1068. ocr_weather.windSpeed, ocr_weather.windDirection]):
  1069. record.weather.append(ocr_weather)
  1070. logger.debug(f"[噪声检测] 添加OCR天气信息到记录: {ocr_weather.to_dict()}")
  1071. elif not matched_in_first_branch and any([ocr_weather.weather, ocr_weather.temp, ocr_weather.humidity,
  1072. ocr_weather.windSpeed, ocr_weather.windDirection]):
  1073. # 如果OCR天气信息没有日期但有其他字段,且未在第一个分支匹配成功,也添加到记录中
  1074. record.weather.append(ocr_weather)
  1075. logger.debug(f"[噪声检测] 添加OCR天气信息到记录(无日期): {ocr_weather.to_dict()}")
  1076. # 最终去重和合并:按日期分组,合并相同日期的记录,补齐空白字段
  1077. if record.weather:
  1078. logger.debug(f"[噪声检测] 合并前天气记录数: {len(record.weather)}")
  1079. deduplicated_weather = {}
  1080. for weather in record.weather:
  1081. date_key = weather.monitorAt.strip() if weather.monitorAt else ""
  1082. if not date_key:
  1083. # 如果没有日期,跳过(不应该出现,但为了安全)
  1084. continue
  1085. # 处理日期格式不一致的情况(如 205.7.10 vs 2025.7.10)
  1086. if re.match(r'^205\.', date_key):
  1087. date_key = re.sub(r'^205\.', '2025.', date_key)
  1088. elif re.match(r'^20[0-4]\.', date_key):
  1089. date_key = re.sub(r'^20[0-4]\.', '2025.', date_key)
  1090. if date_key not in deduplicated_weather:
  1091. # 创建新的天气记录
  1092. merged_weather = WeatherData()
  1093. merged_weather.monitorAt = date_key
  1094. deduplicated_weather[date_key] = merged_weather
  1095. else:
  1096. merged_weather = deduplicated_weather[date_key]
  1097. # 合并字段:如果当前记录的字段有值且合并记录的字段为空,则补齐
  1098. if not merged_weather.weather and weather.weather:
  1099. merged_weather.weather = weather.weather
  1100. if not merged_weather.temp and weather.temp:
  1101. merged_weather.temp = weather.temp
  1102. if not merged_weather.humidity and weather.humidity:
  1103. merged_weather.humidity = weather.humidity
  1104. if not merged_weather.windSpeed and weather.windSpeed:
  1105. merged_weather.windSpeed = weather.windSpeed
  1106. if not merged_weather.windDirection and weather.windDirection:
  1107. merged_weather.windDirection = weather.windDirection
  1108. # 更新record.weather为去重后的列表
  1109. record.weather = list(deduplicated_weather.values())
  1110. logger.debug(f"[噪声检测] 合并后天气记录数: {len(record.weather)}")
  1111. for weather in record.weather:
  1112. logger.debug(f"[噪声检测] 最终天气记录: {weather.to_dict()}")
  1113. for table in tables:
  1114. # 首先识别表头,找到各列的索引
  1115. code_idx = -1
  1116. address_idx = -1
  1117. source_idx = -1
  1118. dayMonitorAt_idx = -1
  1119. dayMonitorValue_idx = -1
  1120. dayMonitorBackgroundValue_idx = -1
  1121. nightMonitorAt_idx = -1
  1122. nightMonitorValue_idx = -1
  1123. nightMonitorBackgroundValue_idx = -1
  1124. remark_idx = -1
  1125. header_start_row = -1
  1126. # 查找表头行(通常包含"编号"、"测点位置"、"昼间"、"夜间"等关键词)
  1127. for row_idx, row in enumerate(table):
  1128. row_text = " ".join(row).lower()
  1129. # 检查是否是表头行
  1130. if ("编号" in row_text or "测点位置" in row_text or "测点" in row_text) and \
  1131. ("昼间" in row_text or "夜间" in row_text or "测量值" in row_text or "检测时间" in row_text):
  1132. header_start_row = row_idx
  1133. logger.debug(f"[噪声检测] 找到表头行: 第{row_idx}行, 内容: {row}")
  1134. # 在第一行表头中查找列索引
  1135. for col_idx, cell in enumerate(row):
  1136. cell_lower = cell.lower().strip()
  1137. if "编号" in cell:
  1138. code_idx = col_idx
  1139. elif "测点位置" in cell or "测点" in cell:
  1140. address_idx = col_idx
  1141. elif "主要声源" in cell or "声源" in cell:
  1142. source_idx = col_idx
  1143. elif "昼间" in cell and ("检测时间" in cell or "时间" in cell):
  1144. dayMonitorAt_idx = col_idx
  1145. elif "昼间" in cell and ("测量值" in cell or "测量" in cell):
  1146. dayMonitorValue_idx = col_idx
  1147. elif "昼间" in cell and ("背景值" in cell or "背景" in cell):
  1148. dayMonitorBackgroundValue_idx = col_idx
  1149. elif "夜间" in cell and ("检测时间" in cell or "时间" in cell):
  1150. nightMonitorAt_idx = col_idx
  1151. elif "夜间" in cell and ("测量值" in cell or "测量" in cell):
  1152. nightMonitorValue_idx = col_idx
  1153. elif "夜间" in cell and ("背景值" in cell or "背景" in cell):
  1154. nightMonitorBackgroundValue_idx = col_idx
  1155. elif "备注" in cell:
  1156. remark_idx = col_idx
  1157. # 如果第一行表头没有找到所有列,检查下一行(如果是两行表头)
  1158. if row_idx + 1 < len(table):
  1159. next_row = table[row_idx + 1]
  1160. next_row_text = " ".join(next_row).lower()
  1161. # 如果是第二行表头(通常是详细的列名)
  1162. if "检测时间" in next_row_text or "测量值" in next_row_text or "背景值" in next_row_text:
  1163. logger.debug(f"[噪声检测] 找到第二行表头: 第{row_idx + 1}行, 内容: {next_row}")
  1164. # 检查第一行表头,找到"昼间"和"夜间"的列范围
  1165. day_start_col = -1
  1166. day_end_col = -1
  1167. night_start_col = -1
  1168. night_end_col = -1
  1169. for col_idx, cell in enumerate(row):
  1170. cell_lower = cell.lower().strip()
  1171. if "昼间" in cell_lower:
  1172. day_start_col = col_idx
  1173. # 查找昼间结束位置(通常是下一个非空单元格或"夜间"开始)
  1174. for next_col in range(col_idx + 1, len(row)):
  1175. if "夜间" in row[next_col].lower() or row[next_col].strip():
  1176. day_end_col = next_col - 1
  1177. break
  1178. if day_end_col == -1:
  1179. day_end_col = len(row) - 1
  1180. elif "夜间" in cell_lower:
  1181. night_start_col = col_idx
  1182. # 查找夜间结束位置
  1183. for next_col in range(col_idx + 1, len(row)):
  1184. if "备注" in row[next_col].lower() or (next_col == len(row) - 1):
  1185. night_end_col = next_col - 1
  1186. break
  1187. if night_end_col == -1:
  1188. night_end_col = len(row) - 1
  1189. # 在第二行表头中查找列索引
  1190. for col_idx, cell in enumerate(next_row):
  1191. cell_lower = cell.lower().strip()
  1192. if "检测时间" in cell or "时间" in cell:
  1193. # 根据列位置判断是昼间还是夜间
  1194. if day_start_col >= 0 and day_start_col <= col_idx <= day_end_col and dayMonitorAt_idx == -1:
  1195. dayMonitorAt_idx = col_idx
  1196. elif night_start_col >= 0 and night_start_col <= col_idx <= night_end_col and nightMonitorAt_idx == -1:
  1197. nightMonitorAt_idx = col_idx
  1198. elif dayMonitorAt_idx == -1:
  1199. dayMonitorAt_idx = col_idx
  1200. elif nightMonitorAt_idx == -1:
  1201. nightMonitorAt_idx = col_idx
  1202. elif "测量值" in cell or "测量" in cell:
  1203. if day_start_col >= 0 and day_start_col <= col_idx <= day_end_col and dayMonitorValue_idx == -1:
  1204. dayMonitorValue_idx = col_idx
  1205. elif night_start_col >= 0 and night_start_col <= col_idx <= night_end_col and nightMonitorValue_idx == -1:
  1206. nightMonitorValue_idx = col_idx
  1207. elif dayMonitorValue_idx == -1:
  1208. dayMonitorValue_idx = col_idx
  1209. elif nightMonitorValue_idx == -1:
  1210. nightMonitorValue_idx = col_idx
  1211. elif "背景值" in cell or "背景" in cell:
  1212. if day_start_col >= 0 and day_start_col <= col_idx <= day_end_col and dayMonitorBackgroundValue_idx == -1:
  1213. dayMonitorBackgroundValue_idx = col_idx
  1214. elif night_start_col >= 0 and night_start_col <= col_idx <= night_end_col and nightMonitorBackgroundValue_idx == -1:
  1215. nightMonitorBackgroundValue_idx = col_idx
  1216. elif dayMonitorBackgroundValue_idx == -1:
  1217. dayMonitorBackgroundValue_idx = col_idx
  1218. elif nightMonitorBackgroundValue_idx == -1:
  1219. nightMonitorBackgroundValue_idx = col_idx
  1220. # 如果仍然没有找到某些列,使用默认顺序
  1221. if code_idx == -1:
  1222. code_idx = 0
  1223. if address_idx == -1:
  1224. address_idx = 1
  1225. if source_idx == -1:
  1226. source_idx = 2
  1227. if dayMonitorAt_idx == -1:
  1228. dayMonitorAt_idx = 3
  1229. if dayMonitorValue_idx == -1:
  1230. dayMonitorValue_idx = 4
  1231. if dayMonitorBackgroundValue_idx == -1:
  1232. dayMonitorBackgroundValue_idx = 5
  1233. if nightMonitorAt_idx == -1:
  1234. nightMonitorAt_idx = 6
  1235. if nightMonitorValue_idx == -1:
  1236. nightMonitorValue_idx = 7
  1237. if nightMonitorBackgroundValue_idx == -1:
  1238. nightMonitorBackgroundValue_idx = 8
  1239. if remark_idx == -1:
  1240. remark_idx = 9
  1241. logger.info(f"[噪声检测] 列索引映射: 编号={code_idx}, 测点位置={address_idx}, 主要声源={source_idx}, "
  1242. f"昼间检测时间={dayMonitorAt_idx}, 昼间测量值={dayMonitorValue_idx}, 昼间背景值={dayMonitorBackgroundValue_idx}, "
  1243. f"夜间检测时间={nightMonitorAt_idx}, 夜间测量值={nightMonitorValue_idx}, 夜间背景值={nightMonitorBackgroundValue_idx}, "
  1244. f"备注={remark_idx}")
  1245. break
  1246. # 如果找到了表头,从表头之后开始解析数据行
  1247. data_start_row = header_start_row + 2 if header_start_row >= 0 and header_start_row + 1 < len(table) and \
  1248. any(k in " ".join(table[header_start_row + 1]).lower() for k in ["检测时间", "测量值", "背景值"]) else \
  1249. (header_start_row + 1 if header_start_row >= 0 else 0)
  1250. # 解析数据行
  1251. for row_idx in range(data_start_row, len(table)):
  1252. row = table[row_idx]
  1253. # 跳过空行和表头行
  1254. if not row or len(row) < 3:
  1255. continue
  1256. # 检查是否是数据行(第一列应该是编号,通常是N1、N2或M1、M2等格式)
  1257. first_cell = row[0].strip() if len(row) > 0 else ""
  1258. if not first_cell or first_cell in ["编号", "备注"] or not (first_cell[0].upper() in ['N', 'M'] and first_cell[1:].isdigit()):
  1259. # 如果不是标准编号格式,也可能是有编号但格式不同,继续检查
  1260. if not (first_cell and (first_cell[0].isalnum() or first_cell.startswith('N') or first_cell.startswith('M'))):
  1261. continue
  1262. logger.debug(f"[噪声检测] 解析数据行 {row_idx}: {row}")
  1263. nd = NoiseData()
  1264. # 使用识别的列索引来提取数据
  1265. if code_idx >= 0 and code_idx < len(row):
  1266. nd.code = row[code_idx].strip()
  1267. if address_idx >= 0 and address_idx < len(row):
  1268. raw_address = row[address_idx].strip()
  1269. # 纠正address字段中的常见OCR识别错误
  1270. nd.address = correct_address_ocr_errors(raw_address)
  1271. if source_idx >= 0 and source_idx < len(row):
  1272. nd.source = row[source_idx].strip()
  1273. if dayMonitorAt_idx >= 0 and dayMonitorAt_idx < len(row):
  1274. nd.dayMonitorAt = row[dayMonitorAt_idx].strip()
  1275. if dayMonitorValue_idx >= 0 and dayMonitorValue_idx < len(row):
  1276. nd.dayMonitorValue = row[dayMonitorValue_idx].strip()
  1277. if dayMonitorBackgroundValue_idx >= 0 and dayMonitorBackgroundValue_idx < len(row):
  1278. nd.dayMonitorBackgroundValue = row[dayMonitorBackgroundValue_idx].strip()
  1279. if nightMonitorAt_idx >= 0 and nightMonitorAt_idx < len(row):
  1280. nd.nightMonitorAt = row[nightMonitorAt_idx].strip()
  1281. if nightMonitorValue_idx >= 0 and nightMonitorValue_idx < len(row):
  1282. nd.nightMonitorValue = row[nightMonitorValue_idx].strip()
  1283. if nightMonitorBackgroundValue_idx >= 0 and nightMonitorBackgroundValue_idx < len(row):
  1284. nd.nightMonitorBackgroundValue = row[nightMonitorBackgroundValue_idx].strip()
  1285. if remark_idx >= 0 and remark_idx < len(row):
  1286. nd.remark = row[remark_idx].strip()
  1287. # 验证数据有效性(至少应该有编号和测点位置)
  1288. if nd.code and nd.address:
  1289. logger.info(f"[噪声检测] 解析到数据: {nd.to_dict()}")
  1290. record.noise.append(nd)
  1291. else:
  1292. logger.warning(f"[噪声检测] 跳过无效数据行: {row}")
  1293. # 矫正编号:按照数据顺序重新分配编号为 N1, N2, N3...
  1294. for idx, nd in enumerate(record.noise, start=1):
  1295. original_code = nd.code
  1296. nd.code = f"N{idx}"
  1297. if original_code != nd.code:
  1298. logger.info(f"[噪声检测] 编号矫正: {original_code} -> {nd.code}")
  1299. # 解析工况信息
  1300. # 优先使用opStatus格式解析(附件 工况及工程信息),如果失败则使用旧格式
  1301. if "附件" in markdown_content and "工况" in markdown_content:
  1302. operational_conditions = parse_operational_conditions_opstatus(markdown_content)
  1303. if operational_conditions:
  1304. logger.info(f"[噪声检测] 使用opStatus格式解析到 {len(operational_conditions)} 条工况信息")
  1305. record.operationalConditions = operational_conditions
  1306. else:
  1307. # 如果opStatus格式解析失败,尝试旧格式
  1308. operational_conditions = parse_operational_conditions(markdown_content)
  1309. record.operationalConditions = operational_conditions
  1310. else:
  1311. operational_conditions = parse_operational_conditions(markdown_content)
  1312. record.operationalConditions = operational_conditions
  1313. # v2版本不依赖OCR,只从markdown内容解析
  1314. # 如果某些字段为空,会在日志中记录警告,但不进行OCR补充识别
  1315. return record