investment_parser.py 58 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761277127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307
  1. # Copyright (c) Opendatalab. All rights reserved.
  2. """
  3. 投资估算表格解析模块
  4. 支持四种类型:
  5. 1. 可研批复投资估算
  6. 2. 可研评审投资估算
  7. 3. 初设批复概算投资
  8. 4. 安评可研批复投资估算
  9. """
  10. from typing import List, Optional
  11. import re
  12. from ..utils.logging_config import get_logger
  13. from ..models.data_models import (
  14. FeasibilityApprovalInvestment,
  15. FeasibilityReviewInvestment,
  16. PreliminaryApprovalInvestment,
  17. InvestmentItem,
  18. FinalAccountRecord,
  19. FinalAccountItem
  20. )
  21. from .table_parser import extract_table_with_rowspan_colspan
  22. logger = get_logger("pdf_converter_v2.parser.investment")
  23. def detect_investment_type(markdown_content: str) -> Optional[str]:
  24. """
  25. 检测投资估算表格类型
  26. Returns:
  27. str: 类型名称
  28. - "fsApproval" - 可研批复
  29. - "fsReview" - 可研评审
  30. - "pdApproval" - 初设批复
  31. - "safetyFsApproval" - 安评可研批复
  32. - None - 无法识别
  33. """
  34. # 安评可研批复(先于可研批复判断,结构同可研批复)
  35. if "安评可研批复" in markdown_content or ("安全评价" in markdown_content and "可行性研究报告的批复" in markdown_content):
  36. if "架空线" in markdown_content or "间隔" in markdown_content:
  37. logger.info("[投资估算] 检测到类型: 安评可研批复投资估算")
  38. return "safetyFsApproval"
  39. # 检查标题关键词
  40. if "可研批复" in markdown_content or "可行性研究报告的批复" in markdown_content:
  41. # 检查是否有建设规模相关列(可研批复特有)
  42. if "架空线" in markdown_content or "间隔" in markdown_content:
  43. logger.info("[投资估算] 检测到类型: 可研批复投资估算")
  44. return "fsApproval"
  45. if "可研评审" in markdown_content or "可行性研究报告的评审意见" in markdown_content:
  46. logger.info("[投资估算] 检测到类型: 可研评审投资估算")
  47. return "fsReview"
  48. if "初设批复" in markdown_content or "初步设计的批复" in markdown_content:
  49. logger.info("[投资估算] 检测到类型: 初设批复概算投资")
  50. return "pdApproval"
  51. logger.warning("[投资估算] 无法识别投资估算表格类型")
  52. return None
  53. def determine_level(text: str, name: str = "", strict_mode: bool = True) -> str:
  54. """
  55. 判断明细等级
  56. 规则:
  57. - 大写中文数字(一、二、三等) -> 第一级(顶级大类)
  58. - strict_mode=True: 需要名称包含电压等级+输变电工程才是一级,否则降为二级
  59. - strict_mode=False: 中文数字直接判断为一级(用于 fsReview、pdApproval)
  60. - 小写阿拉伯数字(1、2、3等) -> 第二级
  61. - 带括号的数字(1)、2)等) -> 第三级
  62. - 合计 -> 0
  63. Args:
  64. text: 序号或名称文本
  65. name: 可选,名称文本,用于辅助判断(区分顶级大类和子项)
  66. strict_mode: 是否使用严格模式(默认True,用于 fsApproval 区分顶级大类)
  67. Returns:
  68. str: "0"(合计), "1"(一级), "2"(二级), "3"(三级), ""(无法判断)
  69. """
  70. if not text:
  71. return ""
  72. text = text.strip()
  73. # 合计行(包含"合 计"这种带空格的情况)
  74. text_no_space = text.replace(" ", "")
  75. if "合计" in text_no_space or "小计" in text_no_space:
  76. return "0"
  77. # 第一级: 大写中文数字
  78. # 匹配: "一、", "一,", "一.", "一 ", "一" (后面可以跟任意字符或结束)
  79. # 注意:需要排除"十一"、"十二"等多位数字,只匹配单个中文数字
  80. is_chinese_numeral = False
  81. if re.match(r'^[一二三四五六七八九十]+[、,,.\s]', text):
  82. is_chinese_numeral = True
  83. # 如果序号后面直接跟汉字(没有标点),也可能是第一级
  84. # 例如: "一变电工程", "二线路工程"
  85. elif re.match(r'^[一二三四五六七八九十]+[\u4e00-\u9fa5]', text):
  86. is_chinese_numeral = True
  87. # 如果只是单独的中文数字(没有后续字符),也可能是第一级
  88. # 例如: "一", "二", "三"
  89. elif re.match(r'^[一二三四五六七八九十]+$', text):
  90. is_chinese_numeral = True
  91. if is_chinese_numeral:
  92. # 非严格模式:中文数字直接判断为一级(用于 fsReview、pdApproval)
  93. if not strict_mode:
  94. return "1"
  95. # 严格模式:进一步判断,区分顶级大类和子项目(用于 fsApproval)
  96. # 顶级大类特征:名称包含电压等级(如"220千伏"、"500kV")+ 输变电工程
  97. # 子项目特征:简单的工程类型名称(变电工程、线路工程、配套通信工程)
  98. name_to_check = name if name else text
  99. # 1. 检查是否是顶级大类(包含电压等级 + 输变电工程)
  100. # 电压等级模式:220千伏、500kV、110kv、35千伏等
  101. has_voltage = bool(re.search(r'\d+\s*(千伏|kV|KV|kv)', name_to_check, re.IGNORECASE))
  102. has_project_type = "输变电" in name_to_check or "变电站" in name_to_check or "送出工程" in name_to_check
  103. if has_voltage and has_project_type:
  104. # 包含电压等级和工程类型,是顶级大类
  105. return "1"
  106. # 2. 检查是否是子项目(固定名称,人为错误可能把"3"写成"三")
  107. # 子项目名称通常较短且是固定的工程类型
  108. subitem_exact = ["变电工程", "线路工程", "配套通信工程", "通信工程"]
  109. is_exact_subitem = name_to_check in subitem_exact
  110. if is_exact_subitem:
  111. # 完全匹配子项目名称,按二级处理
  112. logger.debug(f"[等级判断] 中文数字序号但名称是子项目,按二级处理: text={text}, name={name}")
  113. return "2"
  114. # 3. 其他情况:如果名称较长(>10字符),可能是顶级大类;否则按二级处理
  115. if len(name_to_check) > 10:
  116. # 较长的名称可能是顶级大类(即使没有匹配到电压等级模式)
  117. return "1"
  118. else:
  119. # 较短的名称,按二级处理
  120. logger.debug(f"[等级判断] 中文数字序号但名称较短,按二级处理: text={text}, name={name}")
  121. return "2"
  122. # 第二级: 小写阿拉伯数字
  123. # 匹配: "1、", "1,", "1.", "1 " (后面跟标点或空格)
  124. if re.match(r'^\d+[、,,.\s]', text) and not text.startswith('(') and not text.startswith('('):
  125. return "2"
  126. # 如果数字后面直接跟汉字(没有标点),也认为是第二级
  127. # 例如: "1周村220kV变电站"
  128. if re.match(r'^\d+[\u4e00-\u9fa5]', text) and not text.startswith('(') and not text.startswith('('):
  129. return "2"
  130. # 如果只是单独的阿拉伯数字(没有后续字符),也是第二级
  131. # 例如: "1", "2", "3"
  132. if re.match(r'^\d+$', text) and not text.startswith('(') and not text.startswith('('):
  133. return "2"
  134. # 第三级: 带括号的数字,或者数字后跟右括号
  135. # 匹配: "(1)", "(1)", "1)", "1)"
  136. if re.match(r'^[((]\d+[))]', text):
  137. return "3"
  138. # 数字后跟右括号,如 "1)", "2)"
  139. if re.match(r'^\d+[))]', text):
  140. return "3"
  141. return ""
  142. def clean_number_string(value: str) -> str:
  143. """
  144. 清理数字字符串
  145. - 移除千位分隔符
  146. - 移除单位
  147. - 保留小数点
  148. Args:
  149. value: 原始数字字符串
  150. Returns:
  151. str: 清理后的数字字符串
  152. """
  153. if not value or not value.strip():
  154. return ""
  155. value = value.strip()
  156. # 移除常见单位
  157. value = re.sub(r'[万元元]', '', value)
  158. # 移除千位分隔符
  159. value = value.replace(',', '').replace(',', '')
  160. # 移除空格
  161. value = value.replace(' ', '')
  162. return value
  163. def parse_feasibility_approval_investment(markdown_content: str) -> FeasibilityApprovalInvestment:
  164. """
  165. 解析可研批复投资估算
  166. 包含字段:
  167. - No: 序号
  168. - name: 工程或费用名称
  169. - Level: 明细等级
  170. - constructionScaleOverheadLine: 建设规模-架空线
  171. - constructionScaleBay: 建设规模-间隔
  172. - constructionScaleSubstation: 建设规模-变电
  173. - constructionScaleOpticalCable: 建设规模-光缆
  174. - staticInvestment: 静态投资(元)
  175. - dynamicInvestment: 动态投资(元)
  176. """
  177. record = FeasibilityApprovalInvestment()
  178. tables = extract_table_with_rowspan_colspan(markdown_content)
  179. if not tables:
  180. logger.warning("[可研批复投资] 未能提取出任何表格内容")
  181. return record
  182. # 找到所有投资估算表格并合并
  183. # 因为OCR可能将一个大表格拆分成多个<table>
  184. all_matching_tables = []
  185. for table_idx, table in enumerate(tables):
  186. table_text = ""
  187. for row in table:
  188. table_text += " ".join([str(cell) for cell in row])
  189. # 移除空格后再匹配
  190. table_text_no_space = table_text.replace(" ", "")
  191. # 选择包含"工程或费用名称"和"静态投资"的表格
  192. if "工程或费用名称" in table_text_no_space and "静态投资" in table_text_no_space:
  193. all_matching_tables.append((table_idx, table))
  194. logger.info(f"[可研批复投资] 找到投资估算表格 (表格{table_idx+1}), 行数: {len(table)}")
  195. if not all_matching_tables:
  196. logger.warning("[可研批复投资] 未找到包含投资估算的表格")
  197. return record
  198. # 如果只有一个表格,直接使用
  199. if len(all_matching_tables) == 1:
  200. target_table = all_matching_tables[0][1]
  201. else:
  202. # 多个表格:合并所有表格的数据行(跳过重复的表头行)
  203. logger.info(f"[可研批复投资] 发现 {len(all_matching_tables)} 个投资估算表格,将进行合并")
  204. target_table = []
  205. first_table = True
  206. for table_idx, table in all_matching_tables:
  207. if first_table:
  208. # 第一个表格:保留全部内容(包括表头)
  209. target_table.extend(table)
  210. first_table = False
  211. else:
  212. # 后续表格:跳过表头行(前几行包含"序号"、"工程或费用名称"等)
  213. header_end_idx = 0
  214. for row_idx, row in enumerate(table):
  215. row_text = " ".join([str(cell) for cell in row]).replace(" ", "")
  216. # 如果这行包含表头关键词,继续跳过
  217. if "序号" in row_text or "工程或费用名称" in row_text or "建设规模" in row_text:
  218. header_end_idx = row_idx + 1
  219. # 如果第一列是中文数字(一、二、三...),说明是数据行开始
  220. elif len(row) > 0:
  221. first_cell = str(row[0]).strip()
  222. if first_cell in ["一", "二", "三", "四", "五", "六", "七", "八", "九", "十"]:
  223. break
  224. # 只添加数据行
  225. target_table.extend(table[header_end_idx:])
  226. logger.debug(f"[可研批复投资] 表格{table_idx+1}: 跳过前{header_end_idx}行表头,添加{len(table)-header_end_idx}行数据")
  227. logger.info(f"[可研批复投资] 合并后总行数: {len(target_table)}")
  228. # 识别表头行和列索引
  229. # 注意:表格可能有多层表头(rowspan),需要扫描前几行来找到所有列名
  230. header_row_idx = -1
  231. no_idx = -1
  232. name_idx = -1
  233. overhead_line_idx = -1
  234. bay_idx = -1
  235. substation_idx = -1
  236. optical_cable_idx = -1
  237. static_investment_idx = -1
  238. dynamic_investment_idx = -1
  239. # 新增费用列索引
  240. construction_project_cost_idx = -1 # 建筑工程费
  241. equipment_purchase_cost_idx = -1 # 设备购置费
  242. installation_project_cost_idx = -1 # 安装工程费
  243. other_expenses_idx = -1 # 其他费用(合计)
  244. # 扫描前几行(最多5行)来识别列索引
  245. for row_idx in range(min(5, len(target_table))):
  246. row = target_table[row_idx]
  247. row_text = " ".join([str(cell) for cell in row])
  248. row_text_no_space = row_text.replace(" ", "")
  249. # 识别各列(遍历所有行的所有列)
  250. for col_idx, cell in enumerate(row):
  251. cell_text = str(cell).strip()
  252. cell_text_no_space = cell_text.replace(" ", "")
  253. if "序号" in cell_text and no_idx == -1:
  254. no_idx = col_idx
  255. elif ("工程或费用名称" in cell_text_no_space) and name_idx == -1:
  256. name_idx = col_idx
  257. elif "架空线" in cell_text_no_space and overhead_line_idx == -1:
  258. overhead_line_idx = col_idx
  259. elif "间隔" in cell_text and bay_idx == -1:
  260. bay_idx = col_idx
  261. elif "变电" in cell_text and substation_idx == -1:
  262. substation_idx = col_idx
  263. elif "光缆" in cell_text and optical_cable_idx == -1:
  264. optical_cable_idx = col_idx
  265. elif "静态投资" in cell_text_no_space and static_investment_idx == -1:
  266. static_investment_idx = col_idx
  267. elif "动态投资" in cell_text_no_space and dynamic_investment_idx == -1:
  268. dynamic_investment_idx = col_idx
  269. # 新增费用字段识别
  270. elif "建筑工程费" in cell_text_no_space and construction_project_cost_idx == -1:
  271. construction_project_cost_idx = col_idx
  272. elif "设备购置费" in cell_text_no_space and equipment_purchase_cost_idx == -1:
  273. equipment_purchase_cost_idx = col_idx
  274. elif "安装工程费" in cell_text_no_space and installation_project_cost_idx == -1:
  275. installation_project_cost_idx = col_idx
  276. elif ("其他费用" in cell_text_no_space or "合计" == cell_text_no_space) and other_expenses_idx == -1:
  277. # 其他费用列通常标题为"合计"或"其他费用"
  278. # 注意:表头可能有"合计"列在"其他费用"下面
  279. if "其他费用" in cell_text_no_space:
  280. other_expenses_idx = col_idx
  281. # 如果这一行包含"序号"或"工程或费用名称",记录为表头结束行
  282. if ("序号" in row_text or "工程或费用名称" in row_text_no_space) and header_row_idx == -1:
  283. header_row_idx = row_idx
  284. # 表头结束行应该是最后一个包含表头内容的行
  285. # 找到第一个数据行(通常是"一"、"二"等开头)
  286. for row_idx in range(min(5, len(target_table))):
  287. row = target_table[row_idx]
  288. if len(row) > 0:
  289. first_cell = str(row[0]).strip()
  290. # 如果第一列是中文数字或阿拉伯数字(不是"序号"),这是数据行
  291. if first_cell and first_cell not in ["序号", ""] and (first_cell in ["一", "二", "三", "四", "五"] or first_cell.isdigit()):
  292. header_row_idx = row_idx - 1
  293. logger.debug(f"[可研批复投资] 根据数据行确定表头结束于第{header_row_idx}行")
  294. break
  295. logger.info(f"[可研批复投资] 表头行: {header_row_idx}")
  296. logger.info(f"[可研批复投资] 列索引: 序号={no_idx}, 名称={name_idx}, "
  297. f"架空线={overhead_line_idx}, 间隔={bay_idx}, 变电={substation_idx}, "
  298. f"光缆={optical_cable_idx}, 静态投资={static_investment_idx}, 动态投资={dynamic_investment_idx}")
  299. logger.info(f"[可研批复投资] 费用列索引: 建筑工程费={construction_project_cost_idx}, "
  300. f"设备购置费={equipment_purchase_cost_idx}, 安装工程费={installation_project_cost_idx}, "
  301. f"其他费用={other_expenses_idx}")
  302. if header_row_idx == -1:
  303. logger.warning("[可研批复投资] 未找到表头行")
  304. return record
  305. # 解析数据行(输出全部数据,不再只筛选"四"区域)
  306. for row_idx in range(header_row_idx + 1, len(target_table)):
  307. row = target_table[row_idx]
  308. if len(row) < 3:
  309. continue
  310. # 检查是否是有效数据行(至少有名称)
  311. if name_idx >= 0 and name_idx < len(row):
  312. name = str(row[name_idx]).strip()
  313. if not name or name in ["", "nan", "None"]:
  314. continue
  315. # 提取序号
  316. no = ""
  317. if no_idx >= 0 and no_idx < len(row):
  318. no = str(row[no_idx]).strip()
  319. # 判断等级,传入 name 辅助区分顶级大类和子项
  320. level_input = (no + name) if no else name
  321. level = determine_level(level_input, name)
  322. item = InvestmentItem()
  323. item.no = no
  324. item.name = name
  325. item.level = level
  326. # 提取建设规模
  327. if overhead_line_idx >= 0 and overhead_line_idx < len(row):
  328. item.constructionScaleOverheadLine = str(row[overhead_line_idx]).strip()
  329. if bay_idx >= 0 and bay_idx < len(row):
  330. item.constructionScaleBay = str(row[bay_idx]).strip()
  331. if substation_idx >= 0 and substation_idx < len(row):
  332. item.constructionScaleSubstation = str(row[substation_idx]).strip()
  333. if optical_cable_idx >= 0 and optical_cable_idx < len(row):
  334. item.constructionScaleOpticalCable = str(row[optical_cable_idx]).strip()
  335. # 提取投资金额
  336. if static_investment_idx >= 0 and static_investment_idx < len(row):
  337. item.staticInvestment = clean_number_string(str(row[static_investment_idx]))
  338. if dynamic_investment_idx >= 0 and dynamic_investment_idx < len(row):
  339. item.dynamicInvestment = clean_number_string(str(row[dynamic_investment_idx]))
  340. # 提取费用明细
  341. if construction_project_cost_idx >= 0 and construction_project_cost_idx < len(row):
  342. item.constructionProjectCost = clean_number_string(str(row[construction_project_cost_idx]))
  343. if equipment_purchase_cost_idx >= 0 and equipment_purchase_cost_idx < len(row):
  344. item.equipmentPurchaseCost = clean_number_string(str(row[equipment_purchase_cost_idx]))
  345. if installation_project_cost_idx >= 0 and installation_project_cost_idx < len(row):
  346. item.installationProjectCost = clean_number_string(str(row[installation_project_cost_idx]))
  347. if other_expenses_idx >= 0 and other_expenses_idx < len(row):
  348. item.otherExpenses = clean_number_string(str(row[other_expenses_idx]))
  349. record.items.append(item)
  350. logger.info(f"[可研批复投资] 解析到数据: No={item.no}, Name={item.name}, Level={item.level}")
  351. logger.info(f"[可研批复投资] 共解析到 {len(record.items)} 条数据")
  352. return record
  353. def parse_safety_feasibility_approval_investment(markdown_content: str) -> FeasibilityApprovalInvestment:
  354. """
  355. 解析安全可研批复投资估算(湖北省格式)
  356. 特点:
  357. - 没有顶层大类(Level=1),直接从二级分类开始
  358. - 中文序号(一、二)表示二级分类(如"变电工程"、"线路工程")
  359. - 阿拉伯数字(1、2、3)表示具体项目
  360. - 列名使用"项目名称"和"静态合计/动态合计"
  361. - 先扫描项目信息表提取工程名称、项目单位、设计单位
  362. 返回结构:
  363. - Level 1: 二级分类(如"变电工程"、"线路工程")
  364. - Level 2: 具体项目(如"襄阳连云220千伏变电站新建工程")
  365. """
  366. record = FeasibilityApprovalInvestment()
  367. tables = extract_table_with_rowspan_colspan(markdown_content)
  368. if not tables:
  369. logger.warning("[安全可研批复投资] 未能提取出任何表格内容")
  370. return record
  371. # 首先尝试提取项目基本信息表格
  372. for table_idx, table in enumerate(tables):
  373. if len(table) < 2:
  374. continue
  375. table_text = ""
  376. for row in table:
  377. table_text += " ".join([str(cell) for cell in row])
  378. table_text_no_space = table_text.replace(" ", "").replace("(", "(").replace(")", ")")
  379. if "工程(项目)名称" in table_text_no_space or "工程项目名称" in table_text_no_space:
  380. logger.info(f"[安全可研批复投资] 找到项目信息表格 (表格{table_idx+1})")
  381. for row in table:
  382. if len(row) >= 2:
  383. key = str(row[0]).strip()
  384. value = str(row[1]).strip() if len(row) > 1 else ""
  385. if "工程" in key and "名称" in key:
  386. record.projectName = value
  387. logger.info(f"[安全可研批复投资] 提取工程名称: {value}")
  388. elif "项目单位" in key:
  389. record.projectUnit = value
  390. logger.info(f"[安全可研批复投资] 提取项目单位: {value}")
  391. elif "设计单位" in key:
  392. record.designUnit = value
  393. logger.info(f"[安全可研批复投资] 提取设计单位: {value}")
  394. break
  395. # 找到所有投资估算表格并合并
  396. all_matching_tables = []
  397. for table_idx, table in enumerate(tables):
  398. table_text = ""
  399. for row in table:
  400. table_text += " ".join([str(cell) for cell in row])
  401. table_text_no_space = table_text.replace(" ", "")
  402. has_name_col = "项目名称" in table_text_no_space
  403. has_investment_col = ("静态合计" in table_text_no_space or "静态投资" in table_text_no_space)
  404. if has_name_col and has_investment_col:
  405. all_matching_tables.append((table_idx, table))
  406. logger.info(f"[安全可研批复投资] 找到投资估算表格 (表格{table_idx+1}), 行数: {len(table)}")
  407. if not all_matching_tables:
  408. logger.warning("[安全可研批复投资] 未找到包含投资估算的表格")
  409. return record
  410. if len(all_matching_tables) == 1:
  411. target_table = all_matching_tables[0][1]
  412. else:
  413. logger.info(f"[安全可研批复投资] 发现 {len(all_matching_tables)} 个投资估算表格,将进行合并")
  414. target_table = []
  415. first_table = True
  416. for table_idx, table in all_matching_tables:
  417. if first_table:
  418. target_table.extend(table)
  419. first_table = False
  420. else:
  421. header_end_idx = 0
  422. for row_idx, row in enumerate(table):
  423. row_text = " ".join([str(cell) for cell in row]).replace(" ", "")
  424. if "序号" in row_text or "项目名称" in row_text or "建设规模" in row_text:
  425. header_end_idx = row_idx + 1
  426. elif len(row) > 0:
  427. first_cell = str(row[0]).strip()
  428. if first_cell in ["一", "二", "三", "四", "五", "六", "七", "八", "九", "十"]:
  429. break
  430. target_table.extend(table[header_end_idx:])
  431. logger.info(f"[安全可研批复投资] 合并后总行数: {len(target_table)}")
  432. header_row_idx = -1
  433. no_idx = name_idx = overhead_line_idx = bay_idx = substation_idx = optical_cable_idx = -1
  434. static_investment_idx = dynamic_investment_idx = -1
  435. construction_project_cost_idx = equipment_purchase_cost_idx = installation_project_cost_idx = other_expenses_idx = -1
  436. for row_idx in range(min(5, len(target_table))):
  437. row = target_table[row_idx]
  438. row_text = " ".join([str(cell) for cell in row])
  439. row_text_no_space = row_text.replace(" ", "")
  440. for col_idx, cell in enumerate(row):
  441. cell_text = str(cell).strip()
  442. cell_text_no_space = cell_text.replace(" ", "")
  443. if "序号" in cell_text and no_idx == -1:
  444. no_idx = col_idx
  445. elif "项目名称" in cell_text_no_space and name_idx == -1:
  446. name_idx = col_idx
  447. elif "架空线" in cell_text_no_space and overhead_line_idx == -1:
  448. overhead_line_idx = col_idx
  449. elif "间隔" in cell_text and bay_idx == -1:
  450. bay_idx = col_idx
  451. elif "变电" in cell_text and substation_idx == -1:
  452. substation_idx = col_idx
  453. elif "光缆" in cell_text and optical_cable_idx == -1:
  454. optical_cable_idx = col_idx
  455. elif ("静态投资" in cell_text_no_space or "静态合计" in cell_text_no_space) and static_investment_idx == -1:
  456. static_investment_idx = col_idx
  457. elif ("动态投资" in cell_text_no_space or "动态合计" in cell_text_no_space) and dynamic_investment_idx == -1:
  458. dynamic_investment_idx = col_idx
  459. elif "建筑工程费" in cell_text_no_space and construction_project_cost_idx == -1:
  460. construction_project_cost_idx = col_idx
  461. elif "设备购置费" in cell_text_no_space and equipment_purchase_cost_idx == -1:
  462. equipment_purchase_cost_idx = col_idx
  463. elif "安装工程费" in cell_text_no_space and installation_project_cost_idx == -1:
  464. installation_project_cost_idx = col_idx
  465. elif ("其他费用" in cell_text_no_space or "合计" == cell_text_no_space) and other_expenses_idx == -1:
  466. if "其他费用" in cell_text_no_space:
  467. other_expenses_idx = col_idx
  468. if ("序号" in row_text or "项目名称" in row_text_no_space) and header_row_idx == -1:
  469. header_row_idx = row_idx
  470. for row_idx in range(min(5, len(target_table))):
  471. row = target_table[row_idx]
  472. if len(row) > 0:
  473. first_cell = str(row[0]).strip()
  474. if first_cell and first_cell not in ["序号", ""] and (first_cell in ["一", "二", "三", "四", "五"] or first_cell.isdigit()):
  475. header_row_idx = row_idx - 1
  476. break
  477. if header_row_idx == -1:
  478. logger.warning("[安全可研批复投资] 未找到表头行")
  479. return record
  480. for row_idx in range(header_row_idx + 1, len(target_table)):
  481. row = target_table[row_idx]
  482. if len(row) < 3:
  483. continue
  484. name = str(row[name_idx]).strip() if name_idx >= 0 and name_idx < len(row) else ""
  485. if not name or name in ["", "nan", "None"]:
  486. continue
  487. no = str(row[no_idx]).strip() if no_idx >= 0 and no_idx < len(row) else ""
  488. level_input = (no + name) if no else name
  489. level = determine_level(level_input, name, strict_mode=False)
  490. item = InvestmentItem()
  491. item.no = no
  492. item.name = name
  493. item.level = level
  494. if overhead_line_idx >= 0 and overhead_line_idx < len(row):
  495. item.constructionScaleOverheadLine = str(row[overhead_line_idx]).strip()
  496. if bay_idx >= 0 and bay_idx < len(row):
  497. item.constructionScaleBay = str(row[bay_idx]).strip()
  498. if substation_idx >= 0 and substation_idx < len(row):
  499. item.constructionScaleSubstation = str(row[substation_idx]).strip()
  500. if optical_cable_idx >= 0 and optical_cable_idx < len(row):
  501. item.constructionScaleOpticalCable = str(row[optical_cable_idx]).strip()
  502. if static_investment_idx >= 0 and static_investment_idx < len(row):
  503. item.staticInvestment = clean_number_string(str(row[static_investment_idx]))
  504. if dynamic_investment_idx >= 0 and dynamic_investment_idx < len(row):
  505. item.dynamicInvestment = clean_number_string(str(row[dynamic_investment_idx]))
  506. if construction_project_cost_idx >= 0 and construction_project_cost_idx < len(row):
  507. item.constructionProjectCost = clean_number_string(str(row[construction_project_cost_idx]))
  508. if equipment_purchase_cost_idx >= 0 and equipment_purchase_cost_idx < len(row):
  509. item.equipmentPurchaseCost = clean_number_string(str(row[equipment_purchase_cost_idx]))
  510. if installation_project_cost_idx >= 0 and installation_project_cost_idx < len(row):
  511. item.installationProjectCost = clean_number_string(str(row[installation_project_cost_idx]))
  512. if other_expenses_idx >= 0 and other_expenses_idx < len(row):
  513. item.otherExpenses = clean_number_string(str(row[other_expenses_idx]))
  514. record.items.append(item)
  515. logger.info(f"[安全可研批复投资] 解析到数据: No={item.no}, Name={item.name}, Level={item.level}")
  516. logger.info(f"[安全可研批复投资] 共解析到 {len(record.items)} 条数据")
  517. return record
  518. def parse_feasibility_review_investment(markdown_content: str) -> FeasibilityReviewInvestment:
  519. """
  520. 解析可研评审投资估算
  521. 包含字段:
  522. - No: 序号
  523. - name: 工程或费用名称
  524. - Level: 明细等级
  525. - staticInvestment: 静态投资(元)
  526. - dynamicInvestment: 动态投资(元)
  527. 注意:文档中可能包含多个表格,只解析"输变电工程建设规模及投资估算表"
  528. 排除"总估算表"类型的表格
  529. """
  530. record = FeasibilityReviewInvestment()
  531. # 使用正则表达式查找表格及其前面的标题
  532. # 查找 "输变电工程" + "投资估算表" 的标题,排除 "总估算表"
  533. import re
  534. # 找到目标表格的标题位置
  535. # 标题格式如: # 山西晋城周村220kV输变电工程建设规模及投资估算表
  536. target_table_pattern = re.compile(
  537. r'#\s*[^#\n]*?(输变电工程|输变电|变电工程)[^#\n]*?(建设规模及)?投资估算表',
  538. re.IGNORECASE
  539. )
  540. # 排除"总估算表"的模式
  541. exclude_pattern = re.compile(r'总估算表', re.IGNORECASE)
  542. # 查找所有匹配的标题
  543. target_title_match = None
  544. for match in target_table_pattern.finditer(markdown_content):
  545. title_text = match.group(0)
  546. if not exclude_pattern.search(title_text):
  547. target_title_match = match
  548. logger.info(f"[可研评审投资] 找到目标表格标题: {title_text}")
  549. break
  550. if not target_title_match:
  551. logger.warning("[可研评审投资] 未找到'输变电工程投资估算表'标题")
  552. # 回退到原有逻辑
  553. tables = extract_table_with_rowspan_colspan(markdown_content)
  554. if not tables:
  555. logger.warning("[可研评审投资] 未能提取出任何表格内容")
  556. return record
  557. target_table = None
  558. for table in tables:
  559. for row in table:
  560. row_text = " ".join([str(cell) for cell in row])
  561. row_text_no_space = row_text.replace(" ", "")
  562. if "工程或费用名称" in row_text_no_space or ("序号" in row_text and "静态投资" in row_text_no_space):
  563. target_table = table
  564. logger.info(f"[可研评审投资] 回退: 找到投资估算表格, 行数: {len(table)}")
  565. break
  566. if target_table:
  567. break
  568. if not target_table:
  569. logger.warning("[可研评审投资] 未找到包含投资估算的表格")
  570. return record
  571. else:
  572. # 提取标题后面到下一个标题之间的内容(包含目标表格)
  573. title_end = target_title_match.end()
  574. # 找到下一个标题或文档结束
  575. next_title_pattern = re.compile(r'\n#\s+[^#]')
  576. next_title_match = next_title_pattern.search(markdown_content, title_end)
  577. if next_title_match:
  578. section_content = markdown_content[target_title_match.start():next_title_match.start()]
  579. else:
  580. section_content = markdown_content[target_title_match.start():]
  581. logger.debug(f"[可研评审投资] 提取表格区域内容长度: {len(section_content)} 字符")
  582. # 从该区域提取表格
  583. tables = extract_table_with_rowspan_colspan(section_content)
  584. if not tables:
  585. logger.warning("[可研评审投资] 目标区域未能提取出任何表格内容")
  586. return record
  587. # 选择第一个有效表格
  588. target_table = None
  589. for table in tables:
  590. for row in table:
  591. row_text = " ".join([str(cell) for cell in row])
  592. row_text_no_space = row_text.replace(" ", "")
  593. if "工程或费用名称" in row_text_no_space or ("序号" in row_text and "静态投资" in row_text_no_space):
  594. target_table = table
  595. logger.info(f"[可研评审投资] 找到目标投资估算表格, 行数: {len(table)}")
  596. break
  597. if target_table:
  598. break
  599. if not target_table:
  600. logger.warning("[可研评审投资] 目标区域未找到包含投资估算的表格")
  601. return record
  602. # 识别表头行和列索引(多行表头处理)
  603. # 这个表格有多行表头(rowspan/colspan),需要扫描前几行来找到所有列索引
  604. no_idx = -1
  605. name_idx = -1
  606. static_investment_idx = -1
  607. dynamic_investment_idx = -1
  608. header_row_idx = -1
  609. # 扫描前5行查找列索引
  610. scan_rows = min(5, len(target_table))
  611. for row_idx in range(scan_rows):
  612. row = target_table[row_idx]
  613. for col_idx, cell in enumerate(row):
  614. cell_text = str(cell).strip()
  615. cell_text_no_space = cell_text.replace(" ", "")
  616. if "序号" in cell_text and no_idx == -1:
  617. no_idx = col_idx
  618. elif ("工程或费用名称" in cell_text_no_space or "工程名称" in cell_text_no_space) and name_idx == -1:
  619. name_idx = col_idx
  620. elif "静态投资" in cell_text_no_space and static_investment_idx == -1:
  621. static_investment_idx = col_idx
  622. elif "动态投资" in cell_text_no_space and dynamic_investment_idx == -1:
  623. dynamic_investment_idx = col_idx
  624. logger.info(f"[可研评审投资] 列索引: 序号={no_idx}, 名称={name_idx}, "
  625. f"静态投资={static_investment_idx}, 动态投资={dynamic_investment_idx}")
  626. # 确定表头结束行(第一个数据行的前一行)
  627. # 数据行特征:第一列是中文数字(一、二、三)或阿拉伯数字
  628. for row_idx in range(len(target_table)):
  629. row = target_table[row_idx]
  630. if len(row) > 0:
  631. first_cell = str(row[0]).strip()
  632. # 检查是否是数据行(以中文数字或阿拉伯数字开头)
  633. if re.match(r'^[一二三四五六七八九十]+$', first_cell) or re.match(r'^\d+$', first_cell):
  634. # 排除表头行(检查第二列是否是表头关键词)
  635. if len(row) > 1:
  636. second_cell = str(row[1]).strip().replace(" ", "")
  637. if second_cell not in ["工程或费用名称", "工程名称", "名称", ""]:
  638. header_row_idx = row_idx - 1
  639. logger.debug(f"[可研评审投资] 确定表头结束行: 第{header_row_idx}行")
  640. break
  641. if header_row_idx == -1:
  642. header_row_idx = 2 # 默认假设前3行是表头
  643. logger.debug(f"[可研评审投资] 使用默认表头结束行: 第{header_row_idx}行")
  644. # 解析数据行
  645. for row_idx in range(header_row_idx + 1, len(target_table)):
  646. row = target_table[row_idx]
  647. if len(row) < 2:
  648. continue
  649. if name_idx >= 0 and name_idx < len(row):
  650. name = str(row[name_idx]).strip()
  651. if not name or name in ["", "nan", "None"]:
  652. continue
  653. # 跳过重复的表头行
  654. name_no_space = name.replace(" ", "")
  655. if name_no_space in ["工程或费用名称", "工程名称", "名称"]:
  656. logger.debug(f"[可研评审投资] 跳过表头行: {name}")
  657. continue
  658. item = InvestmentItem()
  659. if no_idx >= 0 and no_idx < len(row):
  660. item.no = str(row[no_idx]).strip()
  661. # 跳过表头中的序号列
  662. if item.no == "序号":
  663. continue
  664. item.name = name
  665. # 判断等级 - 使用 no 和 name 分别判断
  666. # fsReview 使用非严格模式,中文数字直接判断为一级
  667. if item.no:
  668. # 优先使用 no 判断等级
  669. item.level = determine_level(item.no, item.name, strict_mode=False)
  670. if not item.level:
  671. # 如果 no 没有匹配,尝试使用 name
  672. item.level = determine_level(item.name, item.name, strict_mode=False)
  673. else:
  674. item.level = determine_level(item.name, item.name, strict_mode=False)
  675. # 提取投资金额
  676. if static_investment_idx >= 0 and static_investment_idx < len(row):
  677. item.staticInvestment = clean_number_string(str(row[static_investment_idx]))
  678. if dynamic_investment_idx >= 0 and dynamic_investment_idx < len(row):
  679. item.dynamicInvestment = clean_number_string(str(row[dynamic_investment_idx]))
  680. record.items.append(item)
  681. logger.info(f"[可研评审投资] 解析到数据: No={item.no}, Name={item.name}, Level={item.level}, "
  682. f"静态投资={item.staticInvestment}, 动态投资={item.dynamicInvestment}")
  683. logger.info(f"[可研评审投资] 共解析到 {len(record.items)} 条数据")
  684. return record
  685. def parse_preliminary_approval_investment(markdown_content: str) -> PreliminaryApprovalInvestment:
  686. """
  687. 解析初设批复概算投资
  688. 包含字段:
  689. - No: 序号
  690. - name: 工程名称
  691. - Level: 明细等级
  692. - staticInvestment: 静态投资(元)
  693. - dynamicInvestment: 动态投资(元)
  694. Note: 需要包含合计行,合计的level为0
  695. """
  696. logger.info("[初设批复投资] ========== 开始解析初设批复概算投资 ==========")
  697. logger.debug(f"[初设批复投资] Markdown内容长度: {len(markdown_content)} 字符")
  698. record = PreliminaryApprovalInvestment()
  699. logger.info("[初设批复投资] 开始提取表格...")
  700. tables = extract_table_with_rowspan_colspan(markdown_content)
  701. logger.info(f"[初设批复投资] 提取到 {len(tables) if tables else 0} 个表格")
  702. if not tables:
  703. logger.warning("[初设批复投资] 未能提取出任何表格内容")
  704. return record
  705. # 找到包含投资估算的表格
  706. logger.info("[初设批复投资] 开始查找投资估算表格...")
  707. target_table = None
  708. for table_idx, table in enumerate(tables):
  709. logger.debug(f"[初设批复投资] 检查表格 {table_idx + 1}/{len(tables)}, 行数: {len(table)}")
  710. for row_idx, row in enumerate(table):
  711. row_text = " ".join([str(cell) for cell in row])
  712. # 移除空格后再匹配,以处理OCR可能产生的空格
  713. row_text_no_space = row_text.replace(" ", "")
  714. # 输出前几行用于调试
  715. if row_idx < 3:
  716. logger.debug(f"[初设批复投资] 表格{table_idx+1} 第{row_idx+1}行: {row_text[:100]}")
  717. if "工程名称" in row_text_no_space or ("序号" in row_text and "静态投资" in row_text_no_space):
  718. target_table = table
  719. logger.info(f"[初设批复投资] ✓ 找到投资估算表格 (表格{table_idx+1}), 行数: {len(table)}")
  720. logger.debug(f"[初设批复投资] 匹配行内容: {row_text}")
  721. break
  722. if target_table:
  723. break
  724. if not target_table:
  725. logger.warning("[初设批复投资] ✗ 未找到包含投资估算的表格")
  726. logger.warning("[初设批复投资] 查找条件: 包含'工程名称' 或 ('序号' 且 '静态投资')")
  727. return record
  728. # 识别表头行和列索引
  729. logger.info("[初设批复投资] 开始识别表头行和列索引...")
  730. header_row_idx = -1
  731. no_idx = -1
  732. name_idx = -1
  733. static_investment_idx = -1
  734. dynamic_investment_idx = -1
  735. for row_idx, row in enumerate(target_table):
  736. row_text = " ".join([str(cell) for cell in row])
  737. # 移除空格后再匹配,以处理OCR可能产生的空格
  738. row_text_no_space = row_text.replace(" ", "")
  739. logger.debug(f"[初设批复投资] 检查第{row_idx}行: {row_text[:80]}")
  740. if "工程名称" in row_text_no_space or "序号" in row_text:
  741. header_row_idx = row_idx
  742. logger.info(f"[初设批复投资] ✓ 找到表头行: 第{row_idx}行")
  743. logger.debug(f"[初设批复投资] 表头内容: {row}")
  744. for col_idx, cell in enumerate(row):
  745. cell_text = str(cell).strip()
  746. # 移除空格后再匹配,以处理OCR可能产生的空格
  747. cell_text_no_space = cell_text.replace(" ", "")
  748. logger.debug(f"[初设批复投资] 列{col_idx}: '{cell_text}' (去空格: '{cell_text_no_space}')")
  749. if "序号" in cell_text:
  750. no_idx = col_idx
  751. logger.debug(f"[初设批复投资] → 序号列: {col_idx}")
  752. elif "工程名称" in cell_text_no_space or "名称" in cell_text:
  753. name_idx = col_idx
  754. logger.debug(f"[初设批复投资] → 名称列: {col_idx}")
  755. elif "静态投资" in cell_text_no_space:
  756. static_investment_idx = col_idx
  757. logger.debug(f"[初设批复投资] → 静态投资列: {col_idx}")
  758. elif "动态投资" in cell_text_no_space:
  759. dynamic_investment_idx = col_idx
  760. logger.debug(f"[初设批复投资] → 动态投资列: {col_idx}")
  761. logger.info(f"[初设批复投资] ✓ 列索引识别完成: 序号={no_idx}, 名称={name_idx}, "
  762. f"静态投资={static_investment_idx}, 动态投资={dynamic_investment_idx}")
  763. break
  764. if header_row_idx == -1:
  765. logger.warning("[初设批复投资] ✗ 未找到表头行")
  766. logger.warning("[初设批复投资] 查找条件: 包含'工程名称' 或 '序号'")
  767. return record
  768. # 解析数据行
  769. logger.info(f"[初设批复投资] 开始解析数据行 (从第{header_row_idx + 1}行到第{len(target_table)}行)...")
  770. parsed_count = 0
  771. skipped_count = 0
  772. for row_idx in range(header_row_idx + 1, len(target_table)):
  773. row = target_table[row_idx]
  774. logger.debug(f"[初设批复投资] 处理第{row_idx}行, 列数: {len(row)}")
  775. if len(row) < 2:
  776. logger.debug(f"[初设批复投资] 跳过第{row_idx}行: 列数不足 ({len(row)} < 2)")
  777. skipped_count += 1
  778. continue
  779. if name_idx >= 0 and name_idx < len(row):
  780. name = str(row[name_idx]).strip()
  781. logger.debug(f"[初设批复投资] 第{row_idx}行名称: '{name}'")
  782. if not name or name in ["", "nan", "None"]:
  783. logger.debug(f"[初设批复投资] 跳过第{row_idx}行: 名称为空")
  784. skipped_count += 1
  785. continue
  786. item = InvestmentItem()
  787. if no_idx >= 0 and no_idx < len(row):
  788. item.no = str(row[no_idx]).strip()
  789. item.name = name
  790. # 判断等级 - pdApproval 使用非严格模式,中文数字直接判断为一级
  791. level_input = (item.no + item.name) if item.no else item.name
  792. item.level = determine_level(level_input, item.name, strict_mode=False)
  793. logger.debug(f"[初设批复投资] 等级判断: '{level_input}' -> Level={item.level}")
  794. # 提取投资金额
  795. if static_investment_idx >= 0 and static_investment_idx < len(row):
  796. raw_static = str(row[static_investment_idx])
  797. item.staticInvestment = clean_number_string(raw_static)
  798. logger.debug(f"[初设批复投资] 静态投资: '{raw_static}' -> '{item.staticInvestment}'")
  799. if dynamic_investment_idx >= 0 and dynamic_investment_idx < len(row):
  800. raw_dynamic = str(row[dynamic_investment_idx])
  801. item.dynamicInvestment = clean_number_string(raw_dynamic)
  802. logger.debug(f"[初设批复投资] 动态投资: '{raw_dynamic}' -> '{item.dynamicInvestment}'")
  803. record.items.append(item)
  804. parsed_count += 1
  805. logger.info(f"[初设批复投资] ✓ 解析到数据 #{parsed_count}: No={item.no}, Name={item.name}, Level={item.level}, "
  806. f"静态={item.staticInvestment}, 动态={item.dynamicInvestment}")
  807. else:
  808. logger.debug(f"[初设批复投资] 跳过第{row_idx}行: name_idx={name_idx} 超出范围 (行长度={len(row)})")
  809. skipped_count += 1
  810. logger.info(f"[初设批复投资] ========== 解析完成 ==========")
  811. logger.info(f"[初设批复投资] 成功解析: {parsed_count} 条")
  812. logger.info(f"[初设批复投资] 跳过: {skipped_count} 条")
  813. logger.info(f"[初设批复投资] 总计: {len(record.items)} 条数据")
  814. return record
  815. def parse_investment_record(markdown_content: str, investment_type: Optional[str] = None):
  816. """
  817. 解析投资估算记录(统一入口)
  818. Args:
  819. markdown_content: Markdown内容
  820. investment_type: 投资类型(可选,如果不提供则自动检测)
  821. - "fsApproval" - 可研批复
  822. - "fsReview" - 可研评审
  823. - "pdApproval" - 初设批复
  824. - "safetyFsApproval" - 安评可研批复
  825. Returns:
  826. 解析后的记录对象
  827. """
  828. logger.info("=" * 80)
  829. logger.info("[投资估算] 开始解析投资估算记录")
  830. logger.info(f"[投资估算] Markdown内容长度: {len(markdown_content)} 字符")
  831. # 如果没有指定类型,自动检测
  832. if not investment_type:
  833. logger.info("[投资估算] 未指定类型,开始自动检测...")
  834. investment_type = detect_investment_type(markdown_content)
  835. logger.info(f"[投资估算] 自动检测结果: {investment_type}")
  836. else:
  837. logger.info(f"[投资估算] 指定类型: {investment_type}")
  838. if not investment_type:
  839. logger.error("[投资估算] 无法识别投资估算类型")
  840. logger.error(f"[投资估算] Markdown前500字符: {markdown_content[:500]}")
  841. return None
  842. # 根据类型调用对应的解析函数
  843. logger.info(f"[投资估算] 调用解析函数: {investment_type}")
  844. result = None
  845. if investment_type == "fsApproval":
  846. result = parse_feasibility_approval_investment(markdown_content)
  847. elif investment_type == "safetyFsApproval":
  848. # 安评可研批复使用独立解析(湖北省格式,含项目信息表)
  849. result = parse_safety_feasibility_approval_investment(markdown_content)
  850. elif investment_type == "fsReview":
  851. result = parse_feasibility_review_investment(markdown_content)
  852. elif investment_type == "pdApproval":
  853. result = parse_preliminary_approval_investment(markdown_content)
  854. else:
  855. logger.error(f"[投资估算] 未知的投资估算类型: {investment_type}")
  856. return None
  857. if result:
  858. logger.info(f"[投资估算] 解析完成,返回对象类型: {type(result).__name__}")
  859. logger.info(f"[投资估算] 记录数量: {len(result.items)}")
  860. else:
  861. logger.error("[投资估算] 解析函数返回 None")
  862. logger.info("=" * 80)
  863. return result
  864. def parse_final_account_record(markdown_content: str) -> Optional[FinalAccountRecord]:
  865. """
  866. 解析决算报告中的单项工程投资完成情况表格
  867. 从OCR输出的Markdown中提取表格数据:
  868. - 表格结构:费用项目 | 概算金额 | 决算金额(审定-不含税) | 增值税额 | 超节支金额 | 超节支率
  869. - 需要提取4个单项工程的投资完成情况
  870. Args:
  871. markdown_content: OCR转换后的Markdown内容
  872. Returns:
  873. FinalAccountRecord 对象,包含所有单项工程的费用明细
  874. """
  875. logger.info("=" * 80)
  876. logger.info("[决算报告] 开始解析决算报告")
  877. logger.info(f"[决算报告] Markdown内容长度: {len(markdown_content)} 字符")
  878. record = FinalAccountRecord()
  879. # 使用正则表达式提取单项工程名称和对应的表格
  880. # 匹配模式:数字序号 + 工程名称(在"单项工程的投资完成情况"章节内)
  881. project_patterns = [
  882. # 匹配 "1、周村 220kV 输变电工程变电站新建工程" 格式
  883. (r'(\d+)[、\..]\s*(.+?(?:工程|扩建))(?:\n|$)', 1),
  884. # 匹配 "# 1、周村220kV变电站新建工程" 格式(带标题标记)
  885. (r'#\s*(\d+)[、\..]\s*(.+?(?:工程|扩建))(?:\n|$)', 2),
  886. ]
  887. # 找到"单项工程的投资完成情况"章节的起始位置
  888. section_start = 0
  889. section_patterns = [
  890. r'单项工程的?(?:投资)?完成情况',
  891. r'#\s*单项工程',
  892. ]
  893. for pattern in section_patterns:
  894. match = re.search(pattern, markdown_content)
  895. if match:
  896. section_start = match.start()
  897. logger.info(f"[决算报告] 找到单项工程章节起始位置: {section_start}")
  898. break
  899. # 找到所有项目标题及其位置
  900. project_positions = []
  901. for pattern, priority in project_patterns:
  902. for match in re.finditer(pattern, markdown_content):
  903. # 只处理单项工程章节内的项目
  904. if match.start() < section_start:
  905. continue
  906. project_no = int(match.group(1))
  907. project_name = match.group(2).strip()
  908. # 清理项目名称中的多余空格和特殊字符
  909. project_name = re.sub(r'\s+', '', project_name)
  910. project_name = re.sub(r'\\[()\[\]]', '', project_name)
  911. # 清理LaTeX数学公式格式
  912. project_name = re.sub(r'\\mathrm\{([^}]+)\}', r'\1', project_name)
  913. project_name = re.sub(r'\\[a-zA-Z]+', '', project_name)
  914. project_positions.append({
  915. "no": project_no,
  916. "name": project_name,
  917. "start": match.start(),
  918. "end": match.end(),
  919. "priority": priority
  920. })
  921. # 按位置排序并去重
  922. project_positions.sort(key=lambda x: x["start"])
  923. seen_positions = set()
  924. unique_projects = []
  925. for proj in project_positions:
  926. # 避免重复的项目(位置相近的同名项目)
  927. key = (proj["no"], proj["start"] // 100)
  928. if key not in seen_positions:
  929. seen_positions.add(key)
  930. unique_projects.append(proj)
  931. logger.info(f"[决算报告] 找到 {len(unique_projects)} 个单项工程")
  932. for proj in unique_projects:
  933. logger.debug(f"[决算报告] 项目 {proj['no']}: {proj['name']}")
  934. # 提取HTML表格及其位置
  935. table_pattern = r'<table[^>]*>(.*?)</table>'
  936. table_matches = list(re.finditer(table_pattern, markdown_content, re.DOTALL | re.IGNORECASE))
  937. logger.info(f"[决算报告] 找到 {len(table_matches)} 个HTML表格")
  938. # 解析每个表格
  939. for table_idx, table_match in enumerate(table_matches):
  940. table_html = table_match.group(1)
  941. table_pos = table_match.start()
  942. # 检查是否为单项工程投资完成情况表格
  943. if not _is_final_account_table(table_html, table_pos, section_start):
  944. logger.debug(f"[决算报告] 表格 {table_idx + 1} 不是单项工程投资完成情况表格,跳过")
  945. continue
  946. # 查找最近的项目
  947. matched_project = None
  948. for proj in unique_projects:
  949. if proj["end"] < table_pos:
  950. matched_project = proj
  951. if not matched_project:
  952. # 如果没有找到匹配的项目,使用表格索引作为项目序号
  953. logger.warning(f"[决算报告] 表格 {table_idx + 1} 未找到对应的项目名称")
  954. matched_project = {"no": table_idx + 1, "name": f"未知工程{table_idx + 1}"}
  955. logger.info(f"[决算报告] 解析表格 {table_idx + 1},关联项目: {matched_project['no']}-{matched_project['name']}")
  956. # 解析表格内容
  957. items = _parse_final_account_table_html(table_html, matched_project["no"], matched_project["name"])
  958. record.items.extend(items)
  959. logger.info(f"[决算报告] 解析完成,共 {len(record.items)} 条记录")
  960. logger.info("=" * 80)
  961. return record
  962. def _is_final_account_table(table_html: str, table_pos: int, section_start: int) -> bool:
  963. """
  964. 判断表格是否为单项工程投资完成情况表格
  965. 特征:
  966. 1. 位于"单项工程的投资完成情况"章节内
  967. 2. 包含"费用项目"、"概算金额"、"决算金额"、"超"、"节"等关键词
  968. Args:
  969. table_html: 表格HTML内容
  970. table_pos: 表格在Markdown中的位置
  971. section_start: 单项工程章节的起始位置
  972. """
  973. # 表格必须在单项工程章节内
  974. if table_pos < section_start:
  975. return False
  976. table_text = table_html.lower()
  977. # 必须包含的关键词
  978. required_keywords = ["概算金额", "决算金额"]
  979. # 至少包含一个的关键词
  980. optional_keywords = ["费用项目", "建筑安装", "设备购置", "其他费用", "审定金额"]
  981. has_required = all(kw.lower() in table_text for kw in required_keywords)
  982. has_optional = any(kw.lower() in table_text for kw in optional_keywords)
  983. return has_required and has_optional
  984. def _parse_final_account_table_html(table_html: str, project_no: int, project_name: str) -> List[FinalAccountItem]:
  985. """
  986. 解析HTML表格内容
  987. 表格结构:
  988. 费用项目 | 概算金额 | 审定金额(不含税) | 增值税额 | 超节支金额 | 超节支率
  989. Args:
  990. table_html: HTML表格内容
  991. project_no: 项目序号
  992. project_name: 项目名称
  993. Returns:
  994. FinalAccountItem 列表
  995. """
  996. items = []
  997. # 提取所有行
  998. row_pattern = r'<tr[^>]*>(.*?)</tr>'
  999. rows = re.findall(row_pattern, table_html, re.DOTALL | re.IGNORECASE)
  1000. if not rows:
  1001. return items
  1002. # 提取每行的单元格
  1003. cell_pattern = r'<td[^>]*>(.*?)</td>'
  1004. # 跳过表头行(通常前2-3行是表头)
  1005. data_start_idx = 0
  1006. for i, row in enumerate(rows):
  1007. cells = re.findall(cell_pattern, row, re.DOTALL | re.IGNORECASE)
  1008. row_text = " ".join(cells).lower()
  1009. # 检测数据开始行(包含"建筑安装"等费用项目名称)
  1010. if "建筑安装" in row_text or "设备购置" in row_text or "其他费用" in row_text:
  1011. data_start_idx = i
  1012. break
  1013. # 跳过表头行(包含"1"、"2"、"3"等列序号)
  1014. if re.match(r'^[\d\s=\-/]+$', row_text.replace(" ", "")):
  1015. continue
  1016. # 解析数据行
  1017. for row in rows[data_start_idx:]:
  1018. cells = re.findall(cell_pattern, row, re.DOTALL | re.IGNORECASE)
  1019. if len(cells) < 2:
  1020. continue
  1021. # 清理单元格内容
  1022. cells = [_clean_cell_text(cell) for cell in cells]
  1023. # 跳过空行
  1024. if not any(cells):
  1025. continue
  1026. # 获取费用项目名称(第一列)
  1027. fee_name = cells[0] if len(cells) > 0 else ""
  1028. # 跳过合计行
  1029. if any(kw in fee_name for kw in ["合计", "总计", "小计"]):
  1030. continue
  1031. # 只保留主要费用项目
  1032. valid_fee_names = ["建筑安装工程", "建筑安装", "设备购置", "其他费用"]
  1033. is_valid = any(kw in fee_name for kw in valid_fee_names)
  1034. if not is_valid:
  1035. continue
  1036. # 创建记录项
  1037. item = FinalAccountItem()
  1038. item.no = project_no
  1039. item.name = project_name
  1040. item.feeName = fee_name
  1041. # 解析数值列
  1042. # 根据列数确定索引
  1043. if len(cells) >= 6:
  1044. item.estimatedCost = _parse_number_str(cells[1])
  1045. item.approvedFinalAccountExcludingVat = _parse_number_str(cells[2])
  1046. item.vatAmount = _parse_number_str(cells[3])
  1047. item.costVariance = _parse_number_str(cells[4])
  1048. item.varianceRate = _parse_rate_str(cells[5])
  1049. elif len(cells) >= 5:
  1050. item.estimatedCost = _parse_number_str(cells[1])
  1051. item.approvedFinalAccountExcludingVat = _parse_number_str(cells[2])
  1052. item.vatAmount = _parse_number_str(cells[3])
  1053. item.costVariance = _parse_number_str(cells[4])
  1054. item.varianceRate = ""
  1055. items.append(item)
  1056. logger.debug(f"[决算报告] 解析记录: {project_name} - {fee_name} = {item.estimatedCost}")
  1057. return items
  1058. def _clean_cell_text(cell: str) -> str:
  1059. """清理单元格文本,移除HTML标签和多余空格"""
  1060. # 移除HTML标签
  1061. text = re.sub(r'<[^>]+>', '', cell)
  1062. # 移除多余空格
  1063. text = re.sub(r'\s+', ' ', text).strip()
  1064. return text
  1065. def _parse_number_str(value: str) -> str:
  1066. """解析数字字符串,保留原始精度"""
  1067. if not value or not value.strip():
  1068. return "0"
  1069. value = value.strip()
  1070. # 移除千分位逗号
  1071. value = value.replace(',', '')
  1072. # 移除非数字字符(保留负号和小数点)
  1073. cleaned = re.sub(r'[^\d.\-]', '', value)
  1074. if not cleaned or cleaned == '-':
  1075. return "0"
  1076. return cleaned
  1077. def _parse_rate_str(value: str) -> str:
  1078. """解析百分比字符串"""
  1079. if not value or not value.strip():
  1080. return "0%"
  1081. value = value.strip()
  1082. if '%' not in value:
  1083. # 提取数字部分并添加百分号
  1084. num_str = re.sub(r'[^\d.\-]', '', value)
  1085. if num_str and num_str != '-':
  1086. return f"{num_str}%"
  1087. return "0%"
  1088. return value