investment_parser.py 48 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117
  1. # Copyright (c) Opendatalab. All rights reserved.
  2. """
  3. 投资估算表格解析模块
  4. 支持三种类型:
  5. 1. 可研批复投资估算
  6. 2. 可研评审投资估算
  7. 3. 初设批复概算投资
  8. """
  9. from typing import List, Optional
  10. import re
  11. from ..utils.logging_config import get_logger
  12. from ..models.data_models import (
  13. FeasibilityApprovalInvestment,
  14. FeasibilityReviewInvestment,
  15. PreliminaryApprovalInvestment,
  16. InvestmentItem,
  17. FinalAccountRecord,
  18. FinalAccountItem
  19. )
  20. from .table_parser import extract_table_with_rowspan_colspan
  21. logger = get_logger("pdf_converter_v2.parser.investment")
  22. def detect_investment_type(markdown_content: str) -> Optional[str]:
  23. """
  24. 检测投资估算表格类型
  25. Returns:
  26. str: 类型名称
  27. - "fsApproval" - 可研批复
  28. - "fsReview" - 可研评审
  29. - "pdApproval" - 初设批复
  30. - None - 无法识别
  31. """
  32. # 检查标题关键词
  33. if "可研批复" in markdown_content or "可行性研究报告的批复" in markdown_content:
  34. # 检查是否有建设规模相关列(可研批复特有)
  35. if "架空线" in markdown_content or "间隔" in markdown_content:
  36. logger.info("[投资估算] 检测到类型: 可研批复投资估算")
  37. return "fsApproval"
  38. if "可研评审" in markdown_content or "可行性研究报告的评审意见" in markdown_content:
  39. logger.info("[投资估算] 检测到类型: 可研评审投资估算")
  40. return "fsReview"
  41. if "初设批复" in markdown_content or "初步设计的批复" in markdown_content:
  42. logger.info("[投资估算] 检测到类型: 初设批复概算投资")
  43. return "pdApproval"
  44. logger.warning("[投资估算] 无法识别投资估算表格类型")
  45. return None
  46. def determine_level(text: str, name: str = "", strict_mode: bool = True) -> str:
  47. """
  48. 判断明细等级
  49. 规则:
  50. - 大写中文数字(一、二、三等) -> 第一级(顶级大类)
  51. - strict_mode=True: 需要名称包含电压等级+输变电工程才是一级,否则降为二级
  52. - strict_mode=False: 中文数字直接判断为一级(用于 fsReview、pdApproval)
  53. - 小写阿拉伯数字(1、2、3等) -> 第二级
  54. - 带括号的数字(1)、2)等) -> 第三级
  55. - 合计 -> 0
  56. Args:
  57. text: 序号或名称文本
  58. name: 可选,名称文本,用于辅助判断(区分顶级大类和子项)
  59. strict_mode: 是否使用严格模式(默认True,用于 fsApproval 区分顶级大类)
  60. Returns:
  61. str: "0"(合计), "1"(一级), "2"(二级), "3"(三级), ""(无法判断)
  62. """
  63. if not text:
  64. return ""
  65. text = text.strip()
  66. # 合计行(包含"合 计"这种带空格的情况)
  67. text_no_space = text.replace(" ", "")
  68. if "合计" in text_no_space or "小计" in text_no_space:
  69. return "0"
  70. # 第一级: 大写中文数字
  71. # 匹配: "一、", "一,", "一.", "一 ", "一" (后面可以跟任意字符或结束)
  72. # 注意:需要排除"十一"、"十二"等多位数字,只匹配单个中文数字
  73. is_chinese_numeral = False
  74. if re.match(r'^[一二三四五六七八九十]+[、,,.\s]', text):
  75. is_chinese_numeral = True
  76. # 如果序号后面直接跟汉字(没有标点),也可能是第一级
  77. # 例如: "一变电工程", "二线路工程"
  78. elif re.match(r'^[一二三四五六七八九十]+[\u4e00-\u9fa5]', text):
  79. is_chinese_numeral = True
  80. # 如果只是单独的中文数字(没有后续字符),也可能是第一级
  81. # 例如: "一", "二", "三"
  82. elif re.match(r'^[一二三四五六七八九十]+$', text):
  83. is_chinese_numeral = True
  84. if is_chinese_numeral:
  85. # 非严格模式:中文数字直接判断为一级(用于 fsReview、pdApproval)
  86. if not strict_mode:
  87. return "1"
  88. # 严格模式:进一步判断,区分顶级大类和子项目(用于 fsApproval)
  89. # 顶级大类特征:名称包含电压等级(如"220千伏"、"500kV")+ 输变电工程
  90. # 子项目特征:简单的工程类型名称(变电工程、线路工程、配套通信工程)
  91. name_to_check = name if name else text
  92. # 1. 检查是否是顶级大类(包含电压等级 + 输变电工程)
  93. # 电压等级模式:220千伏、500kV、110kv、35千伏等
  94. has_voltage = bool(re.search(r'\d+\s*(千伏|kV|KV|kv)', name_to_check, re.IGNORECASE))
  95. has_project_type = "输变电" in name_to_check or "变电站" in name_to_check or "送出工程" in name_to_check
  96. if has_voltage and has_project_type:
  97. # 包含电压等级和工程类型,是顶级大类
  98. return "1"
  99. # 2. 检查是否是子项目(固定名称,人为错误可能把"3"写成"三")
  100. # 子项目名称通常较短且是固定的工程类型
  101. subitem_exact = ["变电工程", "线路工程", "配套通信工程", "通信工程"]
  102. is_exact_subitem = name_to_check in subitem_exact
  103. if is_exact_subitem:
  104. # 完全匹配子项目名称,按二级处理
  105. logger.debug(f"[等级判断] 中文数字序号但名称是子项目,按二级处理: text={text}, name={name}")
  106. return "2"
  107. # 3. 其他情况:如果名称较长(>10字符),可能是顶级大类;否则按二级处理
  108. if len(name_to_check) > 10:
  109. # 较长的名称可能是顶级大类(即使没有匹配到电压等级模式)
  110. return "1"
  111. else:
  112. # 较短的名称,按二级处理
  113. logger.debug(f"[等级判断] 中文数字序号但名称较短,按二级处理: text={text}, name={name}")
  114. return "2"
  115. # 第二级: 小写阿拉伯数字
  116. # 匹配: "1、", "1,", "1.", "1 " (后面跟标点或空格)
  117. if re.match(r'^\d+[、,,.\s]', text) and not text.startswith('(') and not text.startswith('('):
  118. return "2"
  119. # 如果数字后面直接跟汉字(没有标点),也认为是第二级
  120. # 例如: "1周村220kV变电站"
  121. if re.match(r'^\d+[\u4e00-\u9fa5]', text) and not text.startswith('(') and not text.startswith('('):
  122. return "2"
  123. # 如果只是单独的阿拉伯数字(没有后续字符),也是第二级
  124. # 例如: "1", "2", "3"
  125. if re.match(r'^\d+$', text) and not text.startswith('(') and not text.startswith('('):
  126. return "2"
  127. # 第三级: 带括号的数字,或者数字后跟右括号
  128. # 匹配: "(1)", "(1)", "1)", "1)"
  129. if re.match(r'^[((]\d+[))]', text):
  130. return "3"
  131. # 数字后跟右括号,如 "1)", "2)"
  132. if re.match(r'^\d+[))]', text):
  133. return "3"
  134. return ""
  135. def clean_number_string(value: str) -> str:
  136. """
  137. 清理数字字符串
  138. - 移除千位分隔符
  139. - 移除单位
  140. - 保留小数点
  141. Args:
  142. value: 原始数字字符串
  143. Returns:
  144. str: 清理后的数字字符串
  145. """
  146. if not value or not value.strip():
  147. return ""
  148. value = value.strip()
  149. # 移除常见单位
  150. value = re.sub(r'[万元元]', '', value)
  151. # 移除千位分隔符
  152. value = value.replace(',', '').replace(',', '')
  153. # 移除空格
  154. value = value.replace(' ', '')
  155. return value
  156. def parse_feasibility_approval_investment(markdown_content: str) -> FeasibilityApprovalInvestment:
  157. """
  158. 解析可研批复投资估算
  159. 包含字段:
  160. - No: 序号
  161. - name: 工程或费用名称
  162. - Level: 明细等级
  163. - constructionScaleOverheadLine: 建设规模-架空线
  164. - constructionScaleBay: 建设规模-间隔
  165. - constructionScaleSubstation: 建设规模-变电
  166. - constructionScaleOpticalCable: 建设规模-光缆
  167. - staticInvestment: 静态投资(元)
  168. - dynamicInvestment: 动态投资(元)
  169. """
  170. record = FeasibilityApprovalInvestment()
  171. tables = extract_table_with_rowspan_colspan(markdown_content)
  172. if not tables:
  173. logger.warning("[可研批复投资] 未能提取出任何表格内容")
  174. return record
  175. # 找到所有投资估算表格并合并
  176. # 因为OCR可能将一个大表格拆分成多个<table>
  177. all_matching_tables = []
  178. for table_idx, table in enumerate(tables):
  179. table_text = ""
  180. for row in table:
  181. table_text += " ".join([str(cell) for cell in row])
  182. # 移除空格后再匹配
  183. table_text_no_space = table_text.replace(" ", "")
  184. # 选择包含"工程或费用名称"和"静态投资"的表格
  185. if "工程或费用名称" in table_text_no_space and "静态投资" in table_text_no_space:
  186. all_matching_tables.append((table_idx, table))
  187. logger.info(f"[可研批复投资] 找到投资估算表格 (表格{table_idx+1}), 行数: {len(table)}")
  188. if not all_matching_tables:
  189. logger.warning("[可研批复投资] 未找到包含投资估算的表格")
  190. return record
  191. # 如果只有一个表格,直接使用
  192. if len(all_matching_tables) == 1:
  193. target_table = all_matching_tables[0][1]
  194. else:
  195. # 多个表格:合并所有表格的数据行(跳过重复的表头行)
  196. logger.info(f"[可研批复投资] 发现 {len(all_matching_tables)} 个投资估算表格,将进行合并")
  197. target_table = []
  198. first_table = True
  199. for table_idx, table in all_matching_tables:
  200. if first_table:
  201. # 第一个表格:保留全部内容(包括表头)
  202. target_table.extend(table)
  203. first_table = False
  204. else:
  205. # 后续表格:跳过表头行(前几行包含"序号"、"工程或费用名称"等)
  206. header_end_idx = 0
  207. for row_idx, row in enumerate(table):
  208. row_text = " ".join([str(cell) for cell in row]).replace(" ", "")
  209. # 如果这行包含表头关键词,继续跳过
  210. if "序号" in row_text or "工程或费用名称" in row_text or "建设规模" in row_text:
  211. header_end_idx = row_idx + 1
  212. # 如果第一列是中文数字(一、二、三...),说明是数据行开始
  213. elif len(row) > 0:
  214. first_cell = str(row[0]).strip()
  215. if first_cell in ["一", "二", "三", "四", "五", "六", "七", "八", "九", "十"]:
  216. break
  217. # 只添加数据行
  218. target_table.extend(table[header_end_idx:])
  219. logger.debug(f"[可研批复投资] 表格{table_idx+1}: 跳过前{header_end_idx}行表头,添加{len(table)-header_end_idx}行数据")
  220. logger.info(f"[可研批复投资] 合并后总行数: {len(target_table)}")
  221. # 识别表头行和列索引
  222. # 注意:表格可能有多层表头(rowspan),需要扫描前几行来找到所有列名
  223. header_row_idx = -1
  224. no_idx = -1
  225. name_idx = -1
  226. overhead_line_idx = -1
  227. bay_idx = -1
  228. substation_idx = -1
  229. optical_cable_idx = -1
  230. static_investment_idx = -1
  231. dynamic_investment_idx = -1
  232. # 新增费用列索引
  233. construction_project_cost_idx = -1 # 建筑工程费
  234. equipment_purchase_cost_idx = -1 # 设备购置费
  235. installation_project_cost_idx = -1 # 安装工程费
  236. other_expenses_idx = -1 # 其他费用(合计)
  237. # 扫描前几行(最多5行)来识别列索引
  238. for row_idx in range(min(5, len(target_table))):
  239. row = target_table[row_idx]
  240. row_text = " ".join([str(cell) for cell in row])
  241. row_text_no_space = row_text.replace(" ", "")
  242. # 识别各列(遍历所有行的所有列)
  243. for col_idx, cell in enumerate(row):
  244. cell_text = str(cell).strip()
  245. cell_text_no_space = cell_text.replace(" ", "")
  246. if "序号" in cell_text and no_idx == -1:
  247. no_idx = col_idx
  248. elif ("工程或费用名称" in cell_text_no_space) and name_idx == -1:
  249. name_idx = col_idx
  250. elif "架空线" in cell_text_no_space and overhead_line_idx == -1:
  251. overhead_line_idx = col_idx
  252. elif "间隔" in cell_text and bay_idx == -1:
  253. bay_idx = col_idx
  254. elif "变电" in cell_text and substation_idx == -1:
  255. substation_idx = col_idx
  256. elif "光缆" in cell_text and optical_cable_idx == -1:
  257. optical_cable_idx = col_idx
  258. elif "静态投资" in cell_text_no_space and static_investment_idx == -1:
  259. static_investment_idx = col_idx
  260. elif "动态投资" in cell_text_no_space and dynamic_investment_idx == -1:
  261. dynamic_investment_idx = col_idx
  262. # 新增费用字段识别
  263. elif "建筑工程费" in cell_text_no_space and construction_project_cost_idx == -1:
  264. construction_project_cost_idx = col_idx
  265. elif "设备购置费" in cell_text_no_space and equipment_purchase_cost_idx == -1:
  266. equipment_purchase_cost_idx = col_idx
  267. elif "安装工程费" in cell_text_no_space and installation_project_cost_idx == -1:
  268. installation_project_cost_idx = col_idx
  269. elif ("其他费用" in cell_text_no_space or "合计" == cell_text_no_space) and other_expenses_idx == -1:
  270. # 其他费用列通常标题为"合计"或"其他费用"
  271. # 注意:表头可能有"合计"列在"其他费用"下面
  272. if "其他费用" in cell_text_no_space:
  273. other_expenses_idx = col_idx
  274. # 如果这一行包含"序号"或"工程或费用名称",记录为表头结束行
  275. if ("序号" in row_text or "工程或费用名称" in row_text_no_space) and header_row_idx == -1:
  276. header_row_idx = row_idx
  277. # 表头结束行应该是最后一个包含表头内容的行
  278. # 找到第一个数据行(通常是"一"、"二"等开头)
  279. for row_idx in range(min(5, len(target_table))):
  280. row = target_table[row_idx]
  281. if len(row) > 0:
  282. first_cell = str(row[0]).strip()
  283. # 如果第一列是中文数字或阿拉伯数字(不是"序号"),这是数据行
  284. if first_cell and first_cell not in ["序号", ""] and (first_cell in ["一", "二", "三", "四", "五"] or first_cell.isdigit()):
  285. header_row_idx = row_idx - 1
  286. logger.debug(f"[可研批复投资] 根据数据行确定表头结束于第{header_row_idx}行")
  287. break
  288. logger.info(f"[可研批复投资] 表头行: {header_row_idx}")
  289. logger.info(f"[可研批复投资] 列索引: 序号={no_idx}, 名称={name_idx}, "
  290. f"架空线={overhead_line_idx}, 间隔={bay_idx}, 变电={substation_idx}, "
  291. f"光缆={optical_cable_idx}, 静态投资={static_investment_idx}, 动态投资={dynamic_investment_idx}")
  292. logger.info(f"[可研批复投资] 费用列索引: 建筑工程费={construction_project_cost_idx}, "
  293. f"设备购置费={equipment_purchase_cost_idx}, 安装工程费={installation_project_cost_idx}, "
  294. f"其他费用={other_expenses_idx}")
  295. if header_row_idx == -1:
  296. logger.warning("[可研批复投资] 未找到表头行")
  297. return record
  298. # 解析数据行(输出全部数据,不再只筛选"四"区域)
  299. for row_idx in range(header_row_idx + 1, len(target_table)):
  300. row = target_table[row_idx]
  301. if len(row) < 3:
  302. continue
  303. # 检查是否是有效数据行(至少有名称)
  304. if name_idx >= 0 and name_idx < len(row):
  305. name = str(row[name_idx]).strip()
  306. if not name or name in ["", "nan", "None"]:
  307. continue
  308. # 提取序号
  309. no = ""
  310. if no_idx >= 0 and no_idx < len(row):
  311. no = str(row[no_idx]).strip()
  312. # 判断等级,传入 name 辅助区分顶级大类和子项
  313. level_input = (no + name) if no else name
  314. level = determine_level(level_input, name)
  315. item = InvestmentItem()
  316. item.no = no
  317. item.name = name
  318. item.level = level
  319. # 提取建设规模
  320. if overhead_line_idx >= 0 and overhead_line_idx < len(row):
  321. item.constructionScaleOverheadLine = str(row[overhead_line_idx]).strip()
  322. if bay_idx >= 0 and bay_idx < len(row):
  323. item.constructionScaleBay = str(row[bay_idx]).strip()
  324. if substation_idx >= 0 and substation_idx < len(row):
  325. item.constructionScaleSubstation = str(row[substation_idx]).strip()
  326. if optical_cable_idx >= 0 and optical_cable_idx < len(row):
  327. item.constructionScaleOpticalCable = str(row[optical_cable_idx]).strip()
  328. # 提取投资金额
  329. if static_investment_idx >= 0 and static_investment_idx < len(row):
  330. item.staticInvestment = clean_number_string(str(row[static_investment_idx]))
  331. if dynamic_investment_idx >= 0 and dynamic_investment_idx < len(row):
  332. item.dynamicInvestment = clean_number_string(str(row[dynamic_investment_idx]))
  333. # 提取费用明细
  334. if construction_project_cost_idx >= 0 and construction_project_cost_idx < len(row):
  335. item.constructionProjectCost = clean_number_string(str(row[construction_project_cost_idx]))
  336. if equipment_purchase_cost_idx >= 0 and equipment_purchase_cost_idx < len(row):
  337. item.equipmentPurchaseCost = clean_number_string(str(row[equipment_purchase_cost_idx]))
  338. if installation_project_cost_idx >= 0 and installation_project_cost_idx < len(row):
  339. item.installationProjectCost = clean_number_string(str(row[installation_project_cost_idx]))
  340. if other_expenses_idx >= 0 and other_expenses_idx < len(row):
  341. item.otherExpenses = clean_number_string(str(row[other_expenses_idx]))
  342. record.items.append(item)
  343. logger.info(f"[可研批复投资] 解析到数据: No={item.no}, Name={item.name}, Level={item.level}")
  344. logger.info(f"[可研批复投资] 共解析到 {len(record.items)} 条数据")
  345. return record
  346. def parse_feasibility_review_investment(markdown_content: str) -> FeasibilityReviewInvestment:
  347. """
  348. 解析可研评审投资估算
  349. 包含字段:
  350. - No: 序号
  351. - name: 工程或费用名称
  352. - Level: 明细等级
  353. - staticInvestment: 静态投资(元)
  354. - dynamicInvestment: 动态投资(元)
  355. 注意:文档中可能包含多个表格,只解析"输变电工程建设规模及投资估算表"
  356. 排除"总估算表"类型的表格
  357. """
  358. record = FeasibilityReviewInvestment()
  359. # 使用正则表达式查找表格及其前面的标题
  360. # 查找 "输变电工程" + "投资估算表" 的标题,排除 "总估算表"
  361. import re
  362. # 找到目标表格的标题位置
  363. # 标题格式如: # 山西晋城周村220kV输变电工程建设规模及投资估算表
  364. target_table_pattern = re.compile(
  365. r'#\s*[^#\n]*?(输变电工程|输变电|变电工程)[^#\n]*?(建设规模及)?投资估算表',
  366. re.IGNORECASE
  367. )
  368. # 排除"总估算表"的模式
  369. exclude_pattern = re.compile(r'总估算表', re.IGNORECASE)
  370. # 查找所有匹配的标题
  371. target_title_match = None
  372. for match in target_table_pattern.finditer(markdown_content):
  373. title_text = match.group(0)
  374. if not exclude_pattern.search(title_text):
  375. target_title_match = match
  376. logger.info(f"[可研评审投资] 找到目标表格标题: {title_text}")
  377. break
  378. if not target_title_match:
  379. logger.warning("[可研评审投资] 未找到'输变电工程投资估算表'标题")
  380. # 回退到原有逻辑
  381. tables = extract_table_with_rowspan_colspan(markdown_content)
  382. if not tables:
  383. logger.warning("[可研评审投资] 未能提取出任何表格内容")
  384. return record
  385. target_table = None
  386. for table in tables:
  387. for row in table:
  388. row_text = " ".join([str(cell) for cell in row])
  389. row_text_no_space = row_text.replace(" ", "")
  390. if "工程或费用名称" in row_text_no_space or ("序号" in row_text and "静态投资" in row_text_no_space):
  391. target_table = table
  392. logger.info(f"[可研评审投资] 回退: 找到投资估算表格, 行数: {len(table)}")
  393. break
  394. if target_table:
  395. break
  396. if not target_table:
  397. logger.warning("[可研评审投资] 未找到包含投资估算的表格")
  398. return record
  399. else:
  400. # 提取标题后面到下一个标题之间的内容(包含目标表格)
  401. title_end = target_title_match.end()
  402. # 找到下一个标题或文档结束
  403. next_title_pattern = re.compile(r'\n#\s+[^#]')
  404. next_title_match = next_title_pattern.search(markdown_content, title_end)
  405. if next_title_match:
  406. section_content = markdown_content[target_title_match.start():next_title_match.start()]
  407. else:
  408. section_content = markdown_content[target_title_match.start():]
  409. logger.debug(f"[可研评审投资] 提取表格区域内容长度: {len(section_content)} 字符")
  410. # 从该区域提取表格
  411. tables = extract_table_with_rowspan_colspan(section_content)
  412. if not tables:
  413. logger.warning("[可研评审投资] 目标区域未能提取出任何表格内容")
  414. return record
  415. # 选择第一个有效表格
  416. target_table = None
  417. for table in tables:
  418. for row in table:
  419. row_text = " ".join([str(cell) for cell in row])
  420. row_text_no_space = row_text.replace(" ", "")
  421. if "工程或费用名称" in row_text_no_space or ("序号" in row_text and "静态投资" in row_text_no_space):
  422. target_table = table
  423. logger.info(f"[可研评审投资] 找到目标投资估算表格, 行数: {len(table)}")
  424. break
  425. if target_table:
  426. break
  427. if not target_table:
  428. logger.warning("[可研评审投资] 目标区域未找到包含投资估算的表格")
  429. return record
  430. # 识别表头行和列索引(多行表头处理)
  431. # 这个表格有多行表头(rowspan/colspan),需要扫描前几行来找到所有列索引
  432. no_idx = -1
  433. name_idx = -1
  434. static_investment_idx = -1
  435. dynamic_investment_idx = -1
  436. header_row_idx = -1
  437. # 扫描前5行查找列索引
  438. scan_rows = min(5, len(target_table))
  439. for row_idx in range(scan_rows):
  440. row = target_table[row_idx]
  441. for col_idx, cell in enumerate(row):
  442. cell_text = str(cell).strip()
  443. cell_text_no_space = cell_text.replace(" ", "")
  444. if "序号" in cell_text and no_idx == -1:
  445. no_idx = col_idx
  446. elif ("工程或费用名称" in cell_text_no_space or "工程名称" in cell_text_no_space) and name_idx == -1:
  447. name_idx = col_idx
  448. elif "静态投资" in cell_text_no_space and static_investment_idx == -1:
  449. static_investment_idx = col_idx
  450. elif "动态投资" in cell_text_no_space and dynamic_investment_idx == -1:
  451. dynamic_investment_idx = col_idx
  452. logger.info(f"[可研评审投资] 列索引: 序号={no_idx}, 名称={name_idx}, "
  453. f"静态投资={static_investment_idx}, 动态投资={dynamic_investment_idx}")
  454. # 确定表头结束行(第一个数据行的前一行)
  455. # 数据行特征:第一列是中文数字(一、二、三)或阿拉伯数字
  456. for row_idx in range(len(target_table)):
  457. row = target_table[row_idx]
  458. if len(row) > 0:
  459. first_cell = str(row[0]).strip()
  460. # 检查是否是数据行(以中文数字或阿拉伯数字开头)
  461. if re.match(r'^[一二三四五六七八九十]+$', first_cell) or re.match(r'^\d+$', first_cell):
  462. # 排除表头行(检查第二列是否是表头关键词)
  463. if len(row) > 1:
  464. second_cell = str(row[1]).strip().replace(" ", "")
  465. if second_cell not in ["工程或费用名称", "工程名称", "名称", ""]:
  466. header_row_idx = row_idx - 1
  467. logger.debug(f"[可研评审投资] 确定表头结束行: 第{header_row_idx}行")
  468. break
  469. if header_row_idx == -1:
  470. header_row_idx = 2 # 默认假设前3行是表头
  471. logger.debug(f"[可研评审投资] 使用默认表头结束行: 第{header_row_idx}行")
  472. # 解析数据行
  473. for row_idx in range(header_row_idx + 1, len(target_table)):
  474. row = target_table[row_idx]
  475. if len(row) < 2:
  476. continue
  477. if name_idx >= 0 and name_idx < len(row):
  478. name = str(row[name_idx]).strip()
  479. if not name or name in ["", "nan", "None"]:
  480. continue
  481. # 跳过重复的表头行
  482. name_no_space = name.replace(" ", "")
  483. if name_no_space in ["工程或费用名称", "工程名称", "名称"]:
  484. logger.debug(f"[可研评审投资] 跳过表头行: {name}")
  485. continue
  486. item = InvestmentItem()
  487. if no_idx >= 0 and no_idx < len(row):
  488. item.no = str(row[no_idx]).strip()
  489. # 跳过表头中的序号列
  490. if item.no == "序号":
  491. continue
  492. item.name = name
  493. # 判断等级 - 使用 no 和 name 分别判断
  494. # fsReview 使用非严格模式,中文数字直接判断为一级
  495. if item.no:
  496. # 优先使用 no 判断等级
  497. item.level = determine_level(item.no, item.name, strict_mode=False)
  498. if not item.level:
  499. # 如果 no 没有匹配,尝试使用 name
  500. item.level = determine_level(item.name, item.name, strict_mode=False)
  501. else:
  502. item.level = determine_level(item.name, item.name, strict_mode=False)
  503. # 提取投资金额
  504. if static_investment_idx >= 0 and static_investment_idx < len(row):
  505. item.staticInvestment = clean_number_string(str(row[static_investment_idx]))
  506. if dynamic_investment_idx >= 0 and dynamic_investment_idx < len(row):
  507. item.dynamicInvestment = clean_number_string(str(row[dynamic_investment_idx]))
  508. record.items.append(item)
  509. logger.info(f"[可研评审投资] 解析到数据: No={item.no}, Name={item.name}, Level={item.level}, "
  510. f"静态投资={item.staticInvestment}, 动态投资={item.dynamicInvestment}")
  511. logger.info(f"[可研评审投资] 共解析到 {len(record.items)} 条数据")
  512. return record
  513. def parse_preliminary_approval_investment(markdown_content: str) -> PreliminaryApprovalInvestment:
  514. """
  515. 解析初设批复概算投资
  516. 包含字段:
  517. - No: 序号
  518. - name: 工程名称
  519. - Level: 明细等级
  520. - staticInvestment: 静态投资(元)
  521. - dynamicInvestment: 动态投资(元)
  522. Note: 需要包含合计行,合计的level为0
  523. """
  524. logger.info("[初设批复投资] ========== 开始解析初设批复概算投资 ==========")
  525. logger.debug(f"[初设批复投资] Markdown内容长度: {len(markdown_content)} 字符")
  526. record = PreliminaryApprovalInvestment()
  527. logger.info("[初设批复投资] 开始提取表格...")
  528. tables = extract_table_with_rowspan_colspan(markdown_content)
  529. logger.info(f"[初设批复投资] 提取到 {len(tables) if tables else 0} 个表格")
  530. if not tables:
  531. logger.warning("[初设批复投资] 未能提取出任何表格内容")
  532. return record
  533. # 找到包含投资估算的表格
  534. logger.info("[初设批复投资] 开始查找投资估算表格...")
  535. target_table = None
  536. for table_idx, table in enumerate(tables):
  537. logger.debug(f"[初设批复投资] 检查表格 {table_idx + 1}/{len(tables)}, 行数: {len(table)}")
  538. for row_idx, row in enumerate(table):
  539. row_text = " ".join([str(cell) for cell in row])
  540. # 移除空格后再匹配,以处理OCR可能产生的空格
  541. row_text_no_space = row_text.replace(" ", "")
  542. # 输出前几行用于调试
  543. if row_idx < 3:
  544. logger.debug(f"[初设批复投资] 表格{table_idx+1} 第{row_idx+1}行: {row_text[:100]}")
  545. if "工程名称" in row_text_no_space or ("序号" in row_text and "静态投资" in row_text_no_space):
  546. target_table = table
  547. logger.info(f"[初设批复投资] ✓ 找到投资估算表格 (表格{table_idx+1}), 行数: {len(table)}")
  548. logger.debug(f"[初设批复投资] 匹配行内容: {row_text}")
  549. break
  550. if target_table:
  551. break
  552. if not target_table:
  553. logger.warning("[初设批复投资] ✗ 未找到包含投资估算的表格")
  554. logger.warning("[初设批复投资] 查找条件: 包含'工程名称' 或 ('序号' 且 '静态投资')")
  555. return record
  556. # 识别表头行和列索引
  557. logger.info("[初设批复投资] 开始识别表头行和列索引...")
  558. header_row_idx = -1
  559. no_idx = -1
  560. name_idx = -1
  561. static_investment_idx = -1
  562. dynamic_investment_idx = -1
  563. for row_idx, row in enumerate(target_table):
  564. row_text = " ".join([str(cell) for cell in row])
  565. # 移除空格后再匹配,以处理OCR可能产生的空格
  566. row_text_no_space = row_text.replace(" ", "")
  567. logger.debug(f"[初设批复投资] 检查第{row_idx}行: {row_text[:80]}")
  568. if "工程名称" in row_text_no_space or "序号" in row_text:
  569. header_row_idx = row_idx
  570. logger.info(f"[初设批复投资] ✓ 找到表头行: 第{row_idx}行")
  571. logger.debug(f"[初设批复投资] 表头内容: {row}")
  572. for col_idx, cell in enumerate(row):
  573. cell_text = str(cell).strip()
  574. # 移除空格后再匹配,以处理OCR可能产生的空格
  575. cell_text_no_space = cell_text.replace(" ", "")
  576. logger.debug(f"[初设批复投资] 列{col_idx}: '{cell_text}' (去空格: '{cell_text_no_space}')")
  577. if "序号" in cell_text:
  578. no_idx = col_idx
  579. logger.debug(f"[初设批复投资] → 序号列: {col_idx}")
  580. elif "工程名称" in cell_text_no_space or "名称" in cell_text:
  581. name_idx = col_idx
  582. logger.debug(f"[初设批复投资] → 名称列: {col_idx}")
  583. elif "静态投资" in cell_text_no_space:
  584. static_investment_idx = col_idx
  585. logger.debug(f"[初设批复投资] → 静态投资列: {col_idx}")
  586. elif "动态投资" in cell_text_no_space:
  587. dynamic_investment_idx = col_idx
  588. logger.debug(f"[初设批复投资] → 动态投资列: {col_idx}")
  589. logger.info(f"[初设批复投资] ✓ 列索引识别完成: 序号={no_idx}, 名称={name_idx}, "
  590. f"静态投资={static_investment_idx}, 动态投资={dynamic_investment_idx}")
  591. break
  592. if header_row_idx == -1:
  593. logger.warning("[初设批复投资] ✗ 未找到表头行")
  594. logger.warning("[初设批复投资] 查找条件: 包含'工程名称' 或 '序号'")
  595. return record
  596. # 解析数据行
  597. logger.info(f"[初设批复投资] 开始解析数据行 (从第{header_row_idx + 1}行到第{len(target_table)}行)...")
  598. parsed_count = 0
  599. skipped_count = 0
  600. for row_idx in range(header_row_idx + 1, len(target_table)):
  601. row = target_table[row_idx]
  602. logger.debug(f"[初设批复投资] 处理第{row_idx}行, 列数: {len(row)}")
  603. if len(row) < 2:
  604. logger.debug(f"[初设批复投资] 跳过第{row_idx}行: 列数不足 ({len(row)} < 2)")
  605. skipped_count += 1
  606. continue
  607. if name_idx >= 0 and name_idx < len(row):
  608. name = str(row[name_idx]).strip()
  609. logger.debug(f"[初设批复投资] 第{row_idx}行名称: '{name}'")
  610. if not name or name in ["", "nan", "None"]:
  611. logger.debug(f"[初设批复投资] 跳过第{row_idx}行: 名称为空")
  612. skipped_count += 1
  613. continue
  614. item = InvestmentItem()
  615. if no_idx >= 0 and no_idx < len(row):
  616. item.no = str(row[no_idx]).strip()
  617. item.name = name
  618. # 判断等级 - pdApproval 使用非严格模式,中文数字直接判断为一级
  619. level_input = (item.no + item.name) if item.no else item.name
  620. item.level = determine_level(level_input, item.name, strict_mode=False)
  621. logger.debug(f"[初设批复投资] 等级判断: '{level_input}' -> Level={item.level}")
  622. # 提取投资金额
  623. if static_investment_idx >= 0 and static_investment_idx < len(row):
  624. raw_static = str(row[static_investment_idx])
  625. item.staticInvestment = clean_number_string(raw_static)
  626. logger.debug(f"[初设批复投资] 静态投资: '{raw_static}' -> '{item.staticInvestment}'")
  627. if dynamic_investment_idx >= 0 and dynamic_investment_idx < len(row):
  628. raw_dynamic = str(row[dynamic_investment_idx])
  629. item.dynamicInvestment = clean_number_string(raw_dynamic)
  630. logger.debug(f"[初设批复投资] 动态投资: '{raw_dynamic}' -> '{item.dynamicInvestment}'")
  631. record.items.append(item)
  632. parsed_count += 1
  633. logger.info(f"[初设批复投资] ✓ 解析到数据 #{parsed_count}: No={item.no}, Name={item.name}, Level={item.level}, "
  634. f"静态={item.staticInvestment}, 动态={item.dynamicInvestment}")
  635. else:
  636. logger.debug(f"[初设批复投资] 跳过第{row_idx}行: name_idx={name_idx} 超出范围 (行长度={len(row)})")
  637. skipped_count += 1
  638. logger.info(f"[初设批复投资] ========== 解析完成 ==========")
  639. logger.info(f"[初设批复投资] 成功解析: {parsed_count} 条")
  640. logger.info(f"[初设批复投资] 跳过: {skipped_count} 条")
  641. logger.info(f"[初设批复投资] 总计: {len(record.items)} 条数据")
  642. return record
  643. def parse_investment_record(markdown_content: str, investment_type: Optional[str] = None):
  644. """
  645. 解析投资估算记录(统一入口)
  646. Args:
  647. markdown_content: Markdown内容
  648. investment_type: 投资类型(可选,如果不提供则自动检测)
  649. - "fsApproval" - 可研批复
  650. - "fsReview" - 可研评审
  651. - "pdApproval" - 初设批复
  652. Returns:
  653. 解析后的记录对象
  654. """
  655. logger.info("=" * 80)
  656. logger.info("[投资估算] 开始解析投资估算记录")
  657. logger.info(f"[投资估算] Markdown内容长度: {len(markdown_content)} 字符")
  658. # 如果没有指定类型,自动检测
  659. if not investment_type:
  660. logger.info("[投资估算] 未指定类型,开始自动检测...")
  661. investment_type = detect_investment_type(markdown_content)
  662. logger.info(f"[投资估算] 自动检测结果: {investment_type}")
  663. else:
  664. logger.info(f"[投资估算] 指定类型: {investment_type}")
  665. if not investment_type:
  666. logger.error("[投资估算] 无法识别投资估算类型")
  667. logger.error(f"[投资估算] Markdown前500字符: {markdown_content[:500]}")
  668. return None
  669. # 根据类型调用对应的解析函数
  670. logger.info(f"[投资估算] 调用解析函数: {investment_type}")
  671. result = None
  672. if investment_type == "fsApproval":
  673. result = parse_feasibility_approval_investment(markdown_content)
  674. elif investment_type == "fsReview":
  675. result = parse_feasibility_review_investment(markdown_content)
  676. elif investment_type == "pdApproval":
  677. result = parse_preliminary_approval_investment(markdown_content)
  678. else:
  679. logger.error(f"[投资估算] 未知的投资估算类型: {investment_type}")
  680. return None
  681. if result:
  682. logger.info(f"[投资估算] 解析完成,返回对象类型: {type(result).__name__}")
  683. logger.info(f"[投资估算] 记录数量: {len(result.items)}")
  684. else:
  685. logger.error("[投资估算] 解析函数返回 None")
  686. logger.info("=" * 80)
  687. return result
  688. def parse_final_account_record(markdown_content: str) -> Optional[FinalAccountRecord]:
  689. """
  690. 解析决算报告中的单项工程投资完成情况表格
  691. 从OCR输出的Markdown中提取表格数据:
  692. - 表格结构:费用项目 | 概算金额 | 决算金额(审定-不含税) | 增值税额 | 超节支金额 | 超节支率
  693. - 需要提取4个单项工程的投资完成情况
  694. Args:
  695. markdown_content: OCR转换后的Markdown内容
  696. Returns:
  697. FinalAccountRecord 对象,包含所有单项工程的费用明细
  698. """
  699. logger.info("=" * 80)
  700. logger.info("[决算报告] 开始解析决算报告")
  701. logger.info(f"[决算报告] Markdown内容长度: {len(markdown_content)} 字符")
  702. record = FinalAccountRecord()
  703. # 使用正则表达式提取单项工程名称和对应的表格
  704. # 匹配模式:数字序号 + 工程名称(在"单项工程的投资完成情况"章节内)
  705. project_patterns = [
  706. # 匹配 "1、周村 220kV 输变电工程变电站新建工程" 格式
  707. (r'(\d+)[、\..]\s*(.+?(?:工程|扩建))(?:\n|$)', 1),
  708. # 匹配 "# 1、周村220kV变电站新建工程" 格式(带标题标记)
  709. (r'#\s*(\d+)[、\..]\s*(.+?(?:工程|扩建))(?:\n|$)', 2),
  710. ]
  711. # 找到"单项工程的投资完成情况"章节的起始位置
  712. section_start = 0
  713. section_patterns = [
  714. r'单项工程的?(?:投资)?完成情况',
  715. r'#\s*单项工程',
  716. ]
  717. for pattern in section_patterns:
  718. match = re.search(pattern, markdown_content)
  719. if match:
  720. section_start = match.start()
  721. logger.info(f"[决算报告] 找到单项工程章节起始位置: {section_start}")
  722. break
  723. # 找到所有项目标题及其位置
  724. project_positions = []
  725. for pattern, priority in project_patterns:
  726. for match in re.finditer(pattern, markdown_content):
  727. # 只处理单项工程章节内的项目
  728. if match.start() < section_start:
  729. continue
  730. project_no = int(match.group(1))
  731. project_name = match.group(2).strip()
  732. # 清理项目名称中的多余空格和特殊字符
  733. project_name = re.sub(r'\s+', '', project_name)
  734. project_name = re.sub(r'\\[()\[\]]', '', project_name)
  735. # 清理LaTeX数学公式格式
  736. project_name = re.sub(r'\\mathrm\{([^}]+)\}', r'\1', project_name)
  737. project_name = re.sub(r'\\[a-zA-Z]+', '', project_name)
  738. project_positions.append({
  739. "no": project_no,
  740. "name": project_name,
  741. "start": match.start(),
  742. "end": match.end(),
  743. "priority": priority
  744. })
  745. # 按位置排序并去重
  746. project_positions.sort(key=lambda x: x["start"])
  747. seen_positions = set()
  748. unique_projects = []
  749. for proj in project_positions:
  750. # 避免重复的项目(位置相近的同名项目)
  751. key = (proj["no"], proj["start"] // 100)
  752. if key not in seen_positions:
  753. seen_positions.add(key)
  754. unique_projects.append(proj)
  755. logger.info(f"[决算报告] 找到 {len(unique_projects)} 个单项工程")
  756. for proj in unique_projects:
  757. logger.debug(f"[决算报告] 项目 {proj['no']}: {proj['name']}")
  758. # 提取HTML表格及其位置
  759. table_pattern = r'<table[^>]*>(.*?)</table>'
  760. table_matches = list(re.finditer(table_pattern, markdown_content, re.DOTALL | re.IGNORECASE))
  761. logger.info(f"[决算报告] 找到 {len(table_matches)} 个HTML表格")
  762. # 解析每个表格
  763. for table_idx, table_match in enumerate(table_matches):
  764. table_html = table_match.group(1)
  765. table_pos = table_match.start()
  766. # 检查是否为单项工程投资完成情况表格
  767. if not _is_final_account_table(table_html, table_pos, section_start):
  768. logger.debug(f"[决算报告] 表格 {table_idx + 1} 不是单项工程投资完成情况表格,跳过")
  769. continue
  770. # 查找最近的项目
  771. matched_project = None
  772. for proj in unique_projects:
  773. if proj["end"] < table_pos:
  774. matched_project = proj
  775. if not matched_project:
  776. # 如果没有找到匹配的项目,使用表格索引作为项目序号
  777. logger.warning(f"[决算报告] 表格 {table_idx + 1} 未找到对应的项目名称")
  778. matched_project = {"no": table_idx + 1, "name": f"未知工程{table_idx + 1}"}
  779. logger.info(f"[决算报告] 解析表格 {table_idx + 1},关联项目: {matched_project['no']}-{matched_project['name']}")
  780. # 解析表格内容
  781. items = _parse_final_account_table_html(table_html, matched_project["no"], matched_project["name"])
  782. record.items.extend(items)
  783. logger.info(f"[决算报告] 解析完成,共 {len(record.items)} 条记录")
  784. logger.info("=" * 80)
  785. return record
  786. def _is_final_account_table(table_html: str, table_pos: int, section_start: int) -> bool:
  787. """
  788. 判断表格是否为单项工程投资完成情况表格
  789. 特征:
  790. 1. 位于"单项工程的投资完成情况"章节内
  791. 2. 包含"费用项目"、"概算金额"、"决算金额"、"超"、"节"等关键词
  792. Args:
  793. table_html: 表格HTML内容
  794. table_pos: 表格在Markdown中的位置
  795. section_start: 单项工程章节的起始位置
  796. """
  797. # 表格必须在单项工程章节内
  798. if table_pos < section_start:
  799. return False
  800. table_text = table_html.lower()
  801. # 必须包含的关键词
  802. required_keywords = ["概算金额", "决算金额"]
  803. # 至少包含一个的关键词
  804. optional_keywords = ["费用项目", "建筑安装", "设备购置", "其他费用", "审定金额"]
  805. has_required = all(kw.lower() in table_text for kw in required_keywords)
  806. has_optional = any(kw.lower() in table_text for kw in optional_keywords)
  807. return has_required and has_optional
  808. def _parse_final_account_table_html(table_html: str, project_no: int, project_name: str) -> List[FinalAccountItem]:
  809. """
  810. 解析HTML表格内容
  811. 表格结构:
  812. 费用项目 | 概算金额 | 审定金额(不含税) | 增值税额 | 超节支金额 | 超节支率
  813. Args:
  814. table_html: HTML表格内容
  815. project_no: 项目序号
  816. project_name: 项目名称
  817. Returns:
  818. FinalAccountItem 列表
  819. """
  820. items = []
  821. # 提取所有行
  822. row_pattern = r'<tr[^>]*>(.*?)</tr>'
  823. rows = re.findall(row_pattern, table_html, re.DOTALL | re.IGNORECASE)
  824. if not rows:
  825. return items
  826. # 提取每行的单元格
  827. cell_pattern = r'<td[^>]*>(.*?)</td>'
  828. # 跳过表头行(通常前2-3行是表头)
  829. data_start_idx = 0
  830. for i, row in enumerate(rows):
  831. cells = re.findall(cell_pattern, row, re.DOTALL | re.IGNORECASE)
  832. row_text = " ".join(cells).lower()
  833. # 检测数据开始行(包含"建筑安装"等费用项目名称)
  834. if "建筑安装" in row_text or "设备购置" in row_text or "其他费用" in row_text:
  835. data_start_idx = i
  836. break
  837. # 跳过表头行(包含"1"、"2"、"3"等列序号)
  838. if re.match(r'^[\d\s=\-/]+$', row_text.replace(" ", "")):
  839. continue
  840. # 解析数据行
  841. for row in rows[data_start_idx:]:
  842. cells = re.findall(cell_pattern, row, re.DOTALL | re.IGNORECASE)
  843. if len(cells) < 2:
  844. continue
  845. # 清理单元格内容
  846. cells = [_clean_cell_text(cell) for cell in cells]
  847. # 跳过空行
  848. if not any(cells):
  849. continue
  850. # 获取费用项目名称(第一列)
  851. fee_name = cells[0] if len(cells) > 0 else ""
  852. # 跳过合计行
  853. if any(kw in fee_name for kw in ["合计", "总计", "小计"]):
  854. continue
  855. # 只保留主要费用项目
  856. valid_fee_names = ["建筑安装工程", "建筑安装", "设备购置", "其他费用"]
  857. is_valid = any(kw in fee_name for kw in valid_fee_names)
  858. if not is_valid:
  859. continue
  860. # 创建记录项
  861. item = FinalAccountItem()
  862. item.no = project_no
  863. item.name = project_name
  864. item.feeName = fee_name
  865. # 解析数值列
  866. # 根据列数确定索引
  867. if len(cells) >= 6:
  868. item.estimatedCost = _parse_number_str(cells[1])
  869. item.approvedFinalAccountExcludingVat = _parse_number_str(cells[2])
  870. item.vatAmount = _parse_number_str(cells[3])
  871. item.costVariance = _parse_number_str(cells[4])
  872. item.varianceRate = _parse_rate_str(cells[5])
  873. elif len(cells) >= 5:
  874. item.estimatedCost = _parse_number_str(cells[1])
  875. item.approvedFinalAccountExcludingVat = _parse_number_str(cells[2])
  876. item.vatAmount = _parse_number_str(cells[3])
  877. item.costVariance = _parse_number_str(cells[4])
  878. item.varianceRate = ""
  879. items.append(item)
  880. logger.debug(f"[决算报告] 解析记录: {project_name} - {fee_name} = {item.estimatedCost}")
  881. return items
  882. def _clean_cell_text(cell: str) -> str:
  883. """清理单元格文本,移除HTML标签和多余空格"""
  884. # 移除HTML标签
  885. text = re.sub(r'<[^>]+>', '', cell)
  886. # 移除多余空格
  887. text = re.sub(r'\s+', ' ', text).strip()
  888. return text
  889. def _parse_number_str(value: str) -> str:
  890. """解析数字字符串,保留原始精度"""
  891. if not value or not value.strip():
  892. return "0"
  893. value = value.strip()
  894. # 移除千分位逗号
  895. value = value.replace(',', '')
  896. # 移除非数字字符(保留负号和小数点)
  897. cleaned = re.sub(r'[^\d.\-]', '', value)
  898. if not cleaned or cleaned == '-':
  899. return "0"
  900. return cleaned
  901. def _parse_rate_str(value: str) -> str:
  902. """解析百分比字符串"""
  903. if not value or not value.strip():
  904. return "0%"
  905. value = value.strip()
  906. if '%' not in value:
  907. # 提取数字部分并添加百分号
  908. num_str = re.sub(r'[^\d.\-]', '', value)
  909. if num_str and num_str != '-':
  910. return f"{num_str}%"
  911. return "0%"
  912. return value