json_converter.py 31 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525
  1. # Copyright (c) Opendatalab. All rights reserved.
  2. """JSON转换模块 v2 - 独立版本,不依赖v1"""
  3. from typing import Dict, Any, Optional, List
  4. import re
  5. import os
  6. from copy import deepcopy
  7. from PIL import Image
  8. from ..utils.logging_config import get_logger
  9. from ..utils.paddleocr_fallback import fallback_parse_with_paddleocr, call_paddleocr, has_recognition_garbage
  10. from .document_type import detect_document_type
  11. from .noise_parser import parse_noise_detection_record
  12. from .electromagnetic_parser import parse_electromagnetic_detection_record
  13. from .investment_parser import parse_investment_record
  14. from .table_parser import parse_operational_conditions, parse_operational_conditions_v2, parse_operational_conditions_opstatus, parse_operational_conditions_format3_5
  15. logger = get_logger("pdf_converter_v2.parser.json")
  16. NOISE_HEADER_FIELDS = [
  17. "project",
  18. "standardReferences",
  19. "soundLevelMeterMode",
  20. "soundCalibratorMode",
  21. "calibrationValueBefore",
  22. "calibrationValueAfter",
  23. ]
  24. WEATHER_VALUE_FIELDS = ["weather", "temp", "humidity", "windSpeed", "windDirection"]
  25. def _normalize_date(date: Optional[str]) -> str:
  26. if not date:
  27. return ""
  28. return date.strip().rstrip(".")
  29. def _merge_weather_lists(
  30. primary: Optional[List[Dict[str, Any]]],
  31. secondary: Optional[List[Dict[str, Any]]]
  32. ) -> List[Dict[str, Any]]:
  33. if not primary and not secondary:
  34. return []
  35. if not primary:
  36. return deepcopy(secondary or [])
  37. merged = deepcopy(primary)
  38. if not secondary:
  39. return merged
  40. date_to_index: Dict[str, int] = {}
  41. empty_indices: List[int] = []
  42. for idx, item in enumerate(merged):
  43. norm_date = _normalize_date(item.get("monitorAt"))
  44. if norm_date:
  45. date_to_index.setdefault(norm_date, idx)
  46. else:
  47. empty_indices.append(idx)
  48. for src in secondary:
  49. norm_date = _normalize_date(src.get("monitorAt"))
  50. target = None
  51. if norm_date and norm_date in date_to_index:
  52. target = merged[date_to_index[norm_date]]
  53. elif empty_indices:
  54. target = merged[empty_indices.pop(0)]
  55. if target:
  56. if not _normalize_date(target.get("monitorAt")) and src.get("monitorAt"):
  57. target["monitorAt"] = src["monitorAt"]
  58. for field in WEATHER_VALUE_FIELDS:
  59. if (not target.get(field) or not str(target.get(field)).strip()) and src.get(field):
  60. target[field] = src[field]
  61. else:
  62. merged.append(deepcopy(src))
  63. return merged
  64. def _merge_noise_records(
  65. primary: Optional[Dict[str, Any]],
  66. secondary: Optional[Dict[str, Any]],
  67. preserve_primary_noise: bool = True
  68. ) -> Dict[str, Any]:
  69. if not primary and not secondary:
  70. return {}
  71. merged = deepcopy(primary) if primary else {}
  72. secondary = secondary or {}
  73. for field in NOISE_HEADER_FIELDS:
  74. primary_value = merged.get(field) if merged else ""
  75. secondary_value = secondary.get(field)
  76. if (not primary_value or not str(primary_value).strip()) and secondary_value:
  77. merged[field] = secondary_value
  78. merged["weather"] = _merge_weather_lists(merged.get("weather"), secondary.get("weather"))
  79. if not merged.get("operationalConditions") and secondary.get("operationalConditions"):
  80. merged["operationalConditions"] = deepcopy(secondary["operationalConditions"])
  81. if preserve_primary_noise and primary and primary.get("noise"):
  82. merged["noise"] = deepcopy(primary["noise"])
  83. elif not merged.get("noise") and secondary.get("noise"):
  84. merged["noise"] = deepcopy(secondary["noise"])
  85. return merged
  86. def _merge_electromagnetic_records(
  87. primary: Optional[Dict[str, Any]],
  88. secondary: Optional[Dict[str, Any]],
  89. preserve_primary_electric_magnetic: bool = True
  90. ) -> Dict[str, Any]:
  91. """合并电磁检测记录的原始和fallback解析结果
  92. Args:
  93. primary: 原始解析结果
  94. secondary: fallback解析结果
  95. preserve_primary_electric_magnetic: 是否保留原始的电测数据(默认True)
  96. Returns:
  97. 合并后的数据
  98. """
  99. if not primary and not secondary:
  100. return {}
  101. merged = deepcopy(primary) if primary else {}
  102. secondary = secondary or {}
  103. logger.info(f"[合并数据] 开始合并,primary project: {repr(merged.get('project'))}, secondary project: {repr(secondary.get('project'))}")
  104. # 合并头部字段(如果原始结果中字段为空,使用fallback结果)
  105. header_fields = ["project", "standardReferences", "deviceName", "deviceMode", "deviceCode", "monitorHeight"]
  106. for field in header_fields:
  107. primary_value = merged.get(field) if merged else ""
  108. secondary_value = secondary.get(field)
  109. logger.info(f"[合并数据] 检查字段 {field}: primary={repr(primary_value)}, secondary={repr(secondary_value)}")
  110. if (not primary_value or not str(primary_value).strip()) and secondary_value:
  111. merged[field] = secondary_value
  112. logger.info(f"[合并数据] 从fallback结果补充头部字段: {field} = {secondary_value}")
  113. else:
  114. logger.info(f"[合并数据] 字段 {field} 不满足合并条件,跳过")
  115. # 合并天气信息
  116. primary_weather = merged.get("weather", {}) if merged else {}
  117. secondary_weather = secondary.get("weather", {}) or {}
  118. for field in ["weather", "temp", "humidity", "windSpeed", "windDirection"]:
  119. primary_value = primary_weather.get(field) if primary_weather else ""
  120. secondary_value = secondary_weather.get(field)
  121. if (not primary_value or not str(primary_value).strip()) and secondary_value:
  122. if "weather" not in merged:
  123. merged["weather"] = {}
  124. merged["weather"][field] = secondary_value
  125. # 合并电测数据:优先保留原始数据,如果原始数据为空则使用fallback数据
  126. # 但是需要合并每个数据项的address字段(如果原始数据中address为空,使用fallback数据)
  127. if preserve_primary_electric_magnetic and primary and primary.get("electricMagnetic"):
  128. merged["electricMagnetic"] = deepcopy(primary["electricMagnetic"])
  129. # 合并每个数据项的address字段
  130. secondary_electric_magnetic = secondary.get("electricMagnetic", [])
  131. if secondary_electric_magnetic:
  132. # 建立编号到数据项的映射
  133. code_to_em = {em.get("code", "").upper(): em for em in merged["electricMagnetic"]}
  134. # 从secondary中提取address并填充到merged中
  135. for sec_em in secondary_electric_magnetic:
  136. sec_code = sec_em.get("code", "").upper()
  137. sec_address = sec_em.get("address", "")
  138. if sec_code in code_to_em and sec_address:
  139. # 如果merged中对应数据项的address为空,使用secondary的address
  140. if not code_to_em[sec_code].get("address") or not str(code_to_em[sec_code].get("address")).strip():
  141. code_to_em[sec_code]["address"] = sec_address
  142. logger.info(f"[合并数据] 从fallback结果补充地址: {sec_code} -> {sec_address}")
  143. elif not merged.get("electricMagnetic") and secondary.get("electricMagnetic"):
  144. merged["electricMagnetic"] = deepcopy(secondary["electricMagnetic"])
  145. return merged
  146. def parse_markdown_to_json(markdown_content: str, first_page_image: Optional[Image.Image] = None, output_dir: Optional[str] = None, forced_document_type: Optional[str] = None, enable_paddleocr_fallback: bool = True, input_file: Optional[str] = None) -> Dict[str, Any]:
  147. """将Markdown内容转换为JSON - v2独立版本,不依赖v1和OCR
  148. 如果提供 forced_document_type(正式全称),则优先按指定类型解析。
  149. 支持映射:
  150. - noiseMonitoringRecord -> 使用噪声解析
  151. - electromagneticTestRecord -> 使用电磁解析
  152. - 其他类型:返回空数据占位
  153. Args:
  154. markdown_content: markdown内容
  155. first_page_image: 第一页图片(v2版本不使用)
  156. output_dir: 输出目录(用于查找图片进行备用解析)
  157. forced_document_type: 强制文档类型
  158. enable_paddleocr_fallback: 是否启用PaddleOCR备用解析(默认True)
  159. input_file: 原始输入文件路径(PDF或图片),用于从PDF提取第一页
  160. """
  161. original_markdown = markdown_content
  162. # 若检测到 MinerU 识别异常(如表格单元格内同一字符大量重复),用 Paddle doc_parser 结果替换后再解析
  163. if enable_paddleocr_fallback and (output_dir or input_file) and has_recognition_garbage(markdown_content):
  164. logger.warning("[JSON转换] 检测到MinerU识别异常(如重复字符),尝试使用Paddle doc_parser补充替换(全文档解析)")
  165. try:
  166. fallback_markdown = fallback_parse_with_paddleocr(
  167. json_data={"document_type": forced_document_type or "electromagneticTestRecord"},
  168. markdown_content=markdown_content,
  169. output_dir=output_dir,
  170. document_type=forced_document_type,
  171. input_file=input_file,
  172. full_document=True,
  173. )
  174. if fallback_markdown:
  175. markdown_content = fallback_markdown
  176. original_markdown = markdown_content
  177. logger.info("[JSON转换] 已用Paddle doc_parser结果替换Markdown,继续解析")
  178. except Exception as e:
  179. logger.warning(f"[JSON转换] Paddle doc_parser补充替换失败,继续使用原Markdown: {e}")
  180. logger.info(f"[JSON转换] 开始解析,forced_document_type={forced_document_type}")
  181. if forced_document_type:
  182. auto_weather_default = False
  183. if forced_document_type == "noiseMonitoringRecord":
  184. noise_record = parse_noise_detection_record(markdown_content, first_page_image=None, output_dir=output_dir)
  185. auto_weather_default = getattr(noise_record, "_auto_weather_default_used", False)
  186. data = noise_record.to_dict()
  187. result = {"document_type": forced_document_type, "data": data}
  188. elif forced_document_type == "electromagneticTestRecord":
  189. data = parse_electromagnetic_detection_record(markdown_content).to_dict()
  190. result = {"document_type": forced_document_type, "data": data}
  191. elif forced_document_type == "operatingConditionInfo":
  192. # 仅解析工况信息
  193. # 优先级:表1检测工况格式 > 格式3/5 > opStatus格式 > 旧格式
  194. # 1. 检查是否为"表1检测工况"格式(使用正则表达式,允许中间有空格)
  195. # 支持:表1检测工况、表 1 检测工况、表 1检测工况、表1 检测工况 等变体
  196. pattern = r'表\s*1\s*检测工况'
  197. if re.search(pattern, markdown_content):
  198. logger.info("[JSON转换] 检测到'表1检测工况'标识(包括空格变体),使用新格式解析")
  199. op_list = parse_operational_conditions_v2(markdown_content)
  200. serialized = [oc.to_dict() if hasattr(oc, "to_dict") else oc for oc in (op_list or [])]
  201. return {"document_type": forced_document_type, "data": {"operationalConditions": serialized}}
  202. # 2. 检查是否为格式3/5(附件 2 工况信息 或 附件 2 工况及工程信息,电压列第一列存储时间段)
  203. # 更精确的判断:必须包含"附件"和"2",且包含"工况信息"或"工况及工程信息"
  204. # 排除格式4("附件 工况及工程信息"没有"2")
  205. has_attachment_2 = re.search(r'附件\s*2', markdown_content) or ("附件2" in markdown_content)
  206. has_condition_info = "工况信息" in markdown_content or "工况及工程信息" in markdown_content
  207. if has_attachment_2 and has_condition_info:
  208. logger.info("[JSON转换] 检测到'附件 2 工况信息'或'附件 2 工况及工程信息'格式,尝试使用格式3/5解析")
  209. op_list = parse_operational_conditions_format3_5(markdown_content)
  210. if op_list:
  211. # 格式3/5返回OperationalConditionV2格式
  212. serialized = [oc.to_dict() if hasattr(oc, "to_dict") else oc for oc in op_list]
  213. logger.info(f"[JSON转换] 格式3/5解析成功,共解析到 {len(serialized)} 条记录")
  214. # 检查是否有缺失字段(如minReactivePower为空)
  215. has_missing_fields = False
  216. required_fields = ["maxVoltage", "minVoltage", "maxCurrent", "minCurrent",
  217. "maxActivePower", "minActivePower", "maxReactivePower", "minReactivePower"]
  218. for record in serialized:
  219. for field in required_fields:
  220. if not record.get(field) or record.get(field) == "":
  221. has_missing_fields = True
  222. logger.warning(f"[JSON转换] 检测到缺失字段: {field} 在记录 {record.get('name', 'unknown')} 中为空")
  223. break
  224. if has_missing_fields:
  225. break
  226. # 如果有缺失字段,调用paddle ocr获取JSON来补充缺失字段
  227. if has_missing_fields and enable_paddleocr_fallback and (output_dir or input_file):
  228. logger.info("[JSON转换] 检测到缺失字段,调用PaddleOCR OCR获取JSON来补充")
  229. try:
  230. # 查找图片路径
  231. image_path = None
  232. if output_dir:
  233. from ..utils.paddleocr_fallback import extract_image_from_markdown
  234. image_path = extract_image_from_markdown(markdown_content, output_dir)
  235. # 如果从markdown中找不到图片,尝试从input_file提取
  236. if not image_path and input_file:
  237. from ..utils.paddleocr_fallback import extract_first_page_from_pdf, detect_file_type
  238. file_type = detect_file_type(input_file)
  239. if file_type == 'pdf':
  240. image_path = extract_first_page_from_pdf(input_file, output_dir)
  241. elif file_type in ['png', 'jpeg', 'jpg']:
  242. image_path = input_file
  243. if image_path and os.path.exists(image_path):
  244. logger.info(f"[JSON转换] 使用PaddleOCR OCR解析图片: {image_path}")
  245. from ..utils.paddleocr_fallback import call_paddleocr_ocr, supplement_missing_fields_from_ocr_json
  246. # 调用OCR获取JSON
  247. ocr_save_path = os.path.dirname(image_path) if image_path else output_dir
  248. ocr_texts, ocr_json_path = call_paddleocr_ocr(image_path, ocr_save_path)
  249. if ocr_json_path and os.path.exists(ocr_json_path):
  250. logger.info(f"[JSON转换] 从OCR JSON文件补充缺失字段: {ocr_json_path}")
  251. # 使用OCR JSON补充缺失字段
  252. serialized = supplement_missing_fields_from_ocr_json(serialized, ocr_json_path)
  253. logger.info("[JSON转换] OCR字段补充完成")
  254. else:
  255. logger.warning("[JSON转换] 未找到OCR JSON文件,无法补充缺失字段")
  256. else:
  257. logger.warning("[JSON转换] 未找到可用的图片文件,无法使用PaddleOCR OCR补充")
  258. except Exception as e:
  259. logger.exception(f"[JSON转换] PaddleOCR OCR补充过程出错: {e}")
  260. return {"document_type": forced_document_type, "data": {"operationalConditions": serialized}}
  261. else:
  262. logger.debug("[JSON转换] 格式3/5解析未找到结果,继续尝试其他格式")
  263. # 3. 检查是否为opStatus格式(附件 工况及工程信息,没有"2",表格结构是U/I/P/Q)
  264. # 格式4:附件 工况及工程信息(没有"2"),且表格结构是U/I/P/Q(不是"检测时间 项目"格式)
  265. # 先检查表格结构,避免误判包含"检测时间"和"项目"列的格式2
  266. is_opstatus_format = False
  267. if "附件" in markdown_content and "工况" in markdown_content and not has_attachment_2:
  268. # 检查表格结构:opStatus格式的表头应该是"名称 时间 U (kV) I (A) P (MW) Q (Mvar)"
  269. # 而不是"检测时间 项目 电压 电流 有功功率 无功功率"
  270. from ..parser.table_parser import extract_table_with_rowspan_colspan
  271. tables = extract_table_with_rowspan_colspan(markdown_content)
  272. for table in tables:
  273. if table and len(table) > 0:
  274. first_row = table[0]
  275. first_row_text = " ".join(first_row).lower()
  276. # 如果包含"检测时间"和"项目",则不是opStatus格式(可能是格式2)
  277. if "检测时间" in first_row_text and "项目" in first_row_text:
  278. logger.debug("[JSON转换] 表格包含'检测时间'和'项目'列,不是opStatus格式,跳过")
  279. is_opstatus_format = False
  280. break
  281. # 如果包含"运行工况"或"U (kV)"、"I (A)"等,则是opStatus格式
  282. if "运行工况" in first_row_text or ("u" in first_row_text and "kv" in first_row_text):
  283. is_opstatus_format = True
  284. break
  285. if is_opstatus_format:
  286. logger.info("[JSON转换] 检测到'附件 工况及工程信息'格式(格式4),使用opStatus格式解析,返回OperationalCondition格式")
  287. op_list = parse_operational_conditions_opstatus(markdown_content)
  288. # 格式4直接返回OperationalCondition格式(旧格式),不转换为V2格式
  289. serialized = [oc.to_dict() if hasattr(oc, "to_dict") else oc for oc in (op_list or [])]
  290. return {"document_type": forced_document_type, "data": {"operationalConditions": serialized}}
  291. # 3. 使用旧格式解析(先尝试有标题模式,如果失败则尝试无标题模式)
  292. logger.info("[JSON转换] 未检测到特殊格式标识,使用旧格式解析")
  293. op_list = parse_operational_conditions(markdown_content, require_title=True)
  294. # 如果没有找到结果,尝试无标题模式(仅根据表格结构判断)
  295. if not op_list:
  296. logger.info("[JSON转换] 有标题模式未找到结果,尝试无标题模式解析")
  297. op_list = parse_operational_conditions(markdown_content, require_title=False)
  298. serialized = [oc.to_dict() if hasattr(oc, "to_dict") else oc for oc in (op_list or [])]
  299. result = {"document_type": forced_document_type, "data": {"operationalConditions": serialized}}
  300. elif forced_document_type in ["fsApproval", "fsReview", "pdApproval", "safetyFsApproval"]:
  301. # 投资估算类型处理
  302. logger.info(f"[JSON转换] 处理投资估算类型: {forced_document_type}")
  303. logger.debug(f"[JSON转换] Markdown内容长度: {len(markdown_content)} 字符")
  304. investment_record = parse_investment_record(markdown_content, forced_document_type)
  305. if investment_record:
  306. data = investment_record.to_dict()
  307. # safetyFsApproval 可能返回 {"projectInfo": {...}, "data": [...]},取列表用于条数与摘要
  308. record_list = data.get("data", []) if isinstance(data, dict) else (data if isinstance(data, list) else [])
  309. logger.info(f"[JSON转换] 投资估算解析成功,共 {len(record_list)} 条记录")
  310. if record_list:
  311. for idx, item in enumerate(record_list[:3]):
  312. if isinstance(item, dict):
  313. logger.debug(f"[JSON转换] 记录 {idx+1}: No={item.get('No', '')}, Name={item.get('name', '')}, Level={item.get('Level', '')}")
  314. result = {"document_type": forced_document_type, "data": data}
  315. else:
  316. logger.error("[JSON转换] 投资估算解析失败:parse_investment_record 返回 None")
  317. result = {"document_type": forced_document_type, "data": [], "error": "投资估算解析失败"}
  318. elif forced_document_type == "finalAccount":
  319. # 决算报告类型处理
  320. logger.info(f"[JSON转换] 处理决算报告类型: {forced_document_type}")
  321. logger.debug(f"[JSON转换] Markdown内容长度: {len(markdown_content)} 字符")
  322. from .investment_parser import parse_final_account_record
  323. final_account_record = parse_final_account_record(markdown_content)
  324. if final_account_record:
  325. data = final_account_record.to_dict()
  326. logger.info(f"[JSON转换] 决算报告解析成功,共 {len(data)} 条记录")
  327. # 输出前3条记录的摘要
  328. if data:
  329. for idx, item in enumerate(data[:3]):
  330. logger.debug(f"[JSON转换] 记录 {idx+1}: No={item.get('No', '')}, Name={item.get('name', '')}, feeName={item.get('feeName', '')}")
  331. result = {"document_type": forced_document_type, "data": data}
  332. else:
  333. logger.error("[JSON转换] 决算报告解析失败:parse_final_account_record 返回 None")
  334. result = {"document_type": forced_document_type, "data": [], "error": "决算报告解析失败"}
  335. else:
  336. result = {"document_type": forced_document_type, "data": {}}
  337. # 对于forced_document_type,也检查数据完整性
  338. if enable_paddleocr_fallback and result.get("document_type") in ["noiseMonitoringRecord", "electromagneticTestRecord"]:
  339. try:
  340. from ..utils.paddleocr_fallback import check_json_data_completeness
  341. is_complete = check_json_data_completeness(result, result.get("document_type"))
  342. if auto_weather_default and result.get("document_type") == "noiseMonitoringRecord":
  343. logger.warning("[JSON转换] 检测到天气字段使用默认值,尝试使用PaddleOCR备用解析")
  344. is_complete = False
  345. if not is_complete:
  346. logger.warning(f"[JSON转换] 检测到数据缺失,尝试使用PaddleOCR备用解析")
  347. fallback_markdown = fallback_parse_with_paddleocr(
  348. result,
  349. original_markdown,
  350. output_dir=output_dir,
  351. document_type=result.get("document_type"),
  352. input_file=input_file
  353. )
  354. if fallback_markdown:
  355. logger.info("[JSON转换] PaddleOCR备用解析成功,重新解析JSON")
  356. if result.get("document_type") == "noiseMonitoringRecord":
  357. original_data = result.get("data", {}) or {}
  358. fallback_data = parse_noise_detection_record(fallback_markdown, first_page_image=None, output_dir=output_dir).to_dict()
  359. merged_data = _merge_noise_records(
  360. primary=original_data,
  361. secondary=fallback_data,
  362. preserve_primary_noise=True
  363. )
  364. result = {"document_type": "noiseMonitoringRecord", "data": merged_data}
  365. elif result.get("document_type") == "electromagneticTestRecord":
  366. original_data = result.get("data", {}) or {}
  367. fallback_data = parse_electromagnetic_detection_record(fallback_markdown).to_dict()
  368. logger.info(f"[JSON转换] fallback_data project: {repr(fallback_data.get('project'))}, EB1 address: {repr(fallback_data.get('electricMagnetic', [{}])[0].get('address') if fallback_data.get('electricMagnetic') else '')}")
  369. merged_data = _merge_electromagnetic_records(
  370. primary=original_data,
  371. secondary=fallback_data,
  372. preserve_primary_electric_magnetic=True
  373. )
  374. logger.info(f"[JSON转换] merged_data project: {repr(merged_data.get('project'))}, EB1 address: {repr(merged_data.get('electricMagnetic', [{}])[0].get('address') if merged_data.get('electricMagnetic') else '')}")
  375. result = {"document_type": "electromagneticTestRecord", "data": merged_data}
  376. logger.info("[JSON转换] 使用PaddleOCR结果重新解析完成")
  377. except Exception as e:
  378. logger.exception(f"[JSON转换] PaddleOCR备用解析过程出错: {e}")
  379. return result
  380. auto_weather_default = False
  381. doc_type = detect_document_type(markdown_content)
  382. if doc_type == "noiseRec":
  383. # v2版本不依赖OCR,first_page_image参数会被忽略
  384. noise_record = parse_noise_detection_record(markdown_content, first_page_image=None, output_dir=output_dir)
  385. auto_weather_default = getattr(noise_record, "_auto_weather_default_used", False)
  386. data = noise_record.to_dict()
  387. result = {"document_type": doc_type, "data": data}
  388. elif doc_type == "emRec":
  389. data = parse_electromagnetic_detection_record(markdown_content).to_dict()
  390. result = {"document_type": doc_type, "data": data}
  391. elif doc_type in ["fsApproval", "fsReview", "pdApproval", "safetyFsApproval"]:
  392. # 新增:投资估算类型
  393. logger.info(f"[JSON转换] 检测到投资估算类型: {doc_type}")
  394. logger.debug(f"[JSON转换] Markdown内容长度: {len(markdown_content)} 字符")
  395. investment_record = parse_investment_record(markdown_content, doc_type)
  396. if investment_record:
  397. data = investment_record.to_dict()
  398. # safetyFsApproval 可能返回 {"projectInfo": {...}, "data": [...]},取列表用于条数与摘要
  399. record_list = data.get("data", []) if isinstance(data, dict) else (data if isinstance(data, list) else [])
  400. logger.info(f"[JSON转换] 投资估算解析成功,共 {len(record_list)} 条记录")
  401. if record_list:
  402. for idx, item in enumerate(record_list[:3]):
  403. if isinstance(item, dict):
  404. logger.debug(f"[JSON转换] 记录 {idx+1}: No={item.get('No', '')}, Name={item.get('name', '')}, Level={item.get('Level', '')}")
  405. result = {"document_type": doc_type, "data": data}
  406. else:
  407. logger.error("[JSON转换] 投资估算解析失败:parse_investment_record 返回 None")
  408. result = {"document_type": doc_type, "data": [], "error": "投资估算解析失败"}
  409. else:
  410. result = {"document_type": "unknown", "data": {}, "error": "无法识别的文档类型"}
  411. # 检查数据完整性,如果缺失则使用PaddleOCR备用解析
  412. if enable_paddleocr_fallback and result.get("document_type") != "unknown":
  413. try:
  414. # 检查是否需要备用解析
  415. from ..utils.paddleocr_fallback import check_json_data_completeness
  416. is_complete = check_json_data_completeness(result, result.get("document_type"))
  417. if auto_weather_default and result.get("document_type") in ["noiseMonitoringRecord", "noise_detection"]:
  418. logger.warning("[JSON转换] 检测到天气字段使用默认值,尝试使用PaddleOCR备用解析")
  419. is_complete = False
  420. if not is_complete:
  421. logger.warning(f"[JSON转换] 检测到数据缺失,尝试使用PaddleOCR备用解析")
  422. # 尝试使用PaddleOCR补充
  423. fallback_markdown = fallback_parse_with_paddleocr(
  424. result,
  425. original_markdown,
  426. output_dir=output_dir,
  427. document_type=result.get("document_type"),
  428. input_file=input_file
  429. )
  430. if fallback_markdown:
  431. logger.info("[JSON转换] PaddleOCR备用解析成功,重新解析JSON")
  432. # 使用PaddleOCR的结果重新解析
  433. if result.get("document_type") == "noiseMonitoringRecord" or doc_type == "noise_detection":
  434. original_data = result.get("data", {}) or {}
  435. fallback_data = parse_noise_detection_record(fallback_markdown, first_page_image=None, output_dir=output_dir).to_dict()
  436. merged_data = _merge_noise_records(
  437. primary=original_data,
  438. secondary=fallback_data,
  439. preserve_primary_noise=True
  440. )
  441. result = {"document_type": "noiseMonitoringRecord", "data": merged_data}
  442. elif result.get("document_type") == "electromagneticTestRecord" or doc_type == "electromagnetic_detection":
  443. original_data = result.get("data", {}) or {}
  444. fallback_data = parse_electromagnetic_detection_record(fallback_markdown).to_dict()
  445. merged_data = _merge_electromagnetic_records(
  446. primary=original_data,
  447. secondary=fallback_data,
  448. preserve_primary_electric_magnetic=True
  449. )
  450. result = {"document_type": "electromagneticTestRecord", "data": merged_data}
  451. logger.info("[JSON转换] 使用PaddleOCR结果重新解析完成")
  452. except Exception as e:
  453. logger.exception(f"[JSON转换] PaddleOCR备用解析过程出错: {e}")
  454. # 即使备用解析失败,也返回原始结果
  455. return result