table_extractor.py 138 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
727782779278027812782278327842785278627872788278927902791279227932794279527962797279827992800280128022803280428052806280728082809281028112812281328142815281628172818281928202821282228232824282528262827282828292830283128322833283428352836283728382839284028412842284328442845284628472848284928502851285228532854285528562857285828592860286128622863286428652866286728682869287028712872287328742875287628772878287928802881288228832884288528862887288828892890289128922893289428952896289728982899290029012902290329042905290629072908290929102911291229132914291529162917291829192920292129222923292429252926292729282929293029312932293329342935293629372938293929402941294229432944294529462947294829492950295129522953295429552956295729582959296029612962296329642965296629672968296929702971297229732974297529762977297829792980298129822983298429852986298729882989299029912992299329942995299629972998299930003001300230033004300530063007300830093010301130123013301430153016301730183019302030213022302330243025302630273028302930303031303230333034303530363037303830393040304130423043304430453046304730483049305030513052305330543055305630573058305930603061306230633064306530663067306830693070307130723073307430753076307730783079308030813082308330843085308630873088308930903091309230933094309530963097309830993100310131023103310431053106310731083109311031113112311331143115311631173118311931203121312231233124312531263127312831293130313131323133313431353136313731383139314031413142314331443145314631473148314931503151315231533154315531563157315831593160316131623163316431653166316731683169317031713172317331743175317631773178317931803181318231833184318531863187318831893190319131923193319431953196319731983199320032013202320332043205320632073208320932103211321232133214321532163217321832193220322132223223322432253226322732283229323032313232323332343235323632373238323932403241324232433244324532463247324832493250325132523253325432553256
from pathlib import Path
from typing import List, Tuple, Dict, Any, Optional, Literal
import re
import json
import logging
import sys
import pandas as pd

# Try to use the project's unified logging system first.
try:
    from ..utils.logging_config import get_logger
    logger = get_logger("pdf_converter_v2.utils.table_extractor")
except (ImportError, ValueError):
    # Fall back to the standard logging module when the unified system
    # cannot be imported (e.g. when this module is run standalone).
    logger = logging.getLogger(__name__)
    # Make sure the fallback logger is actually configured.
    if not logger.handlers:
        handler = logging.StreamHandler(sys.stderr)
        handler.setFormatter(logging.Formatter('%(asctime)s | %(levelname)-8s | %(name)s:%(funcName)s:%(lineno)d - %(message)s'))
        logger.addHandler(handler)
        logger.setLevel(logging.INFO)

# Probe pdfplumber availability at import time; a failed import is recorded
# (not raised) so extraction can retry / report it at call time.
try:
    import pdfplumber
    _PDFPLUMBER_AVAILABLE = True
except ImportError as e:
    _PDFPLUMBER_AVAILABLE = False
    _PDFPLUMBER_IMPORT_ERROR = str(e)
except Exception as e:
    # Catch any other import-time failure (e.g. broken native dependency).
    _PDFPLUMBER_AVAILABLE = False
    _PDFPLUMBER_IMPORT_ERROR = str(e)
else:
    _PDFPLUMBER_IMPORT_ERROR = None
# Document type -> header-matching rules.
# Each rule names a table kind and lists the header keywords a table must
# contain ("match_mode": "all") or may contain ("any") to match the rule.
TABLE_TYPE_RULES: Dict[str, List[dict]] = {
    # Settlement reports
    "settlementReport": [
        {
            "name": "审定结算汇总表",
            "keywords": ["序号", "审计内容", "送审金额(含税)", "审定金额(含税)", "审定金额(不含税)", "增减金额", "备注"],
            "match_mode": "all",
        },
        {
            "name": "合同执行情况",
            "keywords": ["施工单位", "中标通知书金额", "中标通知书编号", "合同金额", "结算送审金额", "差额"],
            "match_mode": "all",
        },
        {
            "name": "赔偿合同",
            "keywords": ["合同对方", "赔偿事项", "合同金额", "结算送审金额", "差额"],
            "match_mode": "all",
        },
        {
            "name": "物资采购合同1",
            "keywords": ["物料名称", "合同数量", "施工图数量", "单价(不含税)", "差额"],
            "match_mode": "all",
        },
        {
            "name": "物资采购合同2",
            "keywords": ["物料名称", "合同金额(不含税)", "入账金额", "差额", "备注"],
            "match_mode": "all",
        },
        {
            "name": "其他服务类合同",
            "keywords": ["服务商", "中标通知书", "合同金额", "送审金额", "结算金额"],
            "match_mode": "all",
        },
    ],
    # Preliminary design review
    "designReview": [
        {
            "name": "初设评审的概算投资",
            "keywords": ["序号", "工程名称", "建设规模", "静态投资", "其中:建设场地征用及清理费", "动态投资"],
            "match_mode": "all",
        },
        {
            "name": "初设评审的概算投资明细",
            "keywords": ["序号", "工程或费用名称", "建筑工程费", "设备购置费", "安装工程费", "其他费用", "合计"],
            "match_mode": "all",
        },
        {
            "name": "初设评审的概算投资费用",
            "keywords": ["序号", "工程或费用名称", "费用金额", "各项占静态投资", "单位投资"],
            "match_mode": "all",
        },
    ],
    # Final account reports
    "finalAccount": [
        {
            "name": "单项工程投资完成情况",
            "keywords": ["费用项目", "概算金额", "决算金额", "审定金额", "增值税额", "超", "节"],
            "match_mode": "all",
        },
    ],
}
# Rule names to skip during header matching (empty = no exclusions).
EXCLUDE_RULES: List[str] = []
# Whether cross-page table merging is enabled.
ENABLE_MERGE_CROSS_PAGE_TABLES: bool = True
  98. def extract_table_title_from_page(page, table_bbox: tuple, max_lines: int = 10) -> str:
  99. """
  100. 从 PDF 页面中提取表格上方的文本作为表格标题。
  101. 优先查找包含工程名称的行(如 "XXX变电站新建工程总概算表")。
  102. Args:
  103. page: pdfplumber page 对象
  104. table_bbox: 表格的边界框 (x0, top, x1, bottom)
  105. max_lines: 最多向上搜索的行数
  106. Returns:
  107. str: 表格标题,如果未找到则返回空字符串
  108. """
  109. if not table_bbox:
  110. return ""
  111. table_top = table_bbox[1] # 表格顶部 y 坐标
  112. page_width = page.width
  113. # 在表格上方区域搜索文本(向上搜索到页面顶部)
  114. search_top = 0 # 从页面顶部开始搜索
  115. search_bottom = table_top - 5 # 留一点边距
  116. if search_bottom <= search_top:
  117. return ""
  118. # 提取表格上方区域的文本
  119. crop_box = (0, search_top, page_width, search_bottom)
  120. try:
  121. cropped = page.within_bbox(crop_box)
  122. text = cropped.extract_text() or ""
  123. except Exception:
  124. return ""
  125. if not text.strip():
  126. return ""
  127. # 取所有行
  128. lines = [line.strip() for line in text.strip().split('\n') if line.strip()]
  129. if not lines:
  130. return ""
  131. # 排除的关键词(这些不是工程名称)
  132. exclude_keywords = ["工程规模", "金额单位", "建设规模", "附表", "附件"]
  133. def is_valid_title(line: str) -> bool:
  134. """检查是否是有效的工程名称标题"""
  135. # 排除包含排除关键词的行
  136. if any(kw in line for kw in exclude_keywords):
  137. return False
  138. # 必须包含工程相关关键词
  139. if not any(kw in line for kw in ["工程", "概算表", "估算表"]):
  140. return False
  141. return True
  142. # 优先查找包含 "总概算表" 的行(最可能是工程名称)
  143. for line in reversed(lines):
  144. if "总概算表" in line and is_valid_title(line):
  145. # 进一步检查是否包含具体工程类型
  146. if any(kw in line for kw in ["变电站", "线路", "间隔", "kV", "KV", "千伏"]):
  147. return line.strip()
  148. # 次优先:查找包含 "概算表" 但不是 "总概算表" 的行
  149. for line in reversed(lines):
  150. if "概算表" in line and "总概算表" not in line and is_valid_title(line):
  151. if any(kw in line for kw in ["变电站", "线路", "间隔", "kV", "KV", "千伏"]):
  152. return line.strip()
  153. # 再次优先:查找包含 "变电站"/"线路"/"间隔" + "工程" 的行
  154. for line in reversed(lines):
  155. if is_valid_title(line):
  156. if any(kw in line for kw in ["变电站", "线路", "间隔"]) and "工程" in line:
  157. return line.strip()
  158. # 查找包含电压等级 + "工程" 的行
  159. for line in reversed(lines):
  160. if is_valid_title(line):
  161. if any(kw in line for kw in ["kV", "KV", "千伏", "kv"]) and "工程" in line:
  162. return line.strip()
  163. # 最后:返回任何包含工程关键词的有效行
  164. for line in reversed(lines):
  165. if is_valid_title(line):
  166. return line.strip()
  167. return ""
def extract_tables_with_pdfplumber(
    pdf_path: str,
    pages: str = "all",
    extract_titles: bool = False,
) -> List[Tuple[int, pd.DataFrame, tuple, str]]:
    """
    Extract tables from a PDF using pdfplumber.

    Args:
        pdf_path: path to the PDF file
        pages: page selection, either "all" or a range spec like "1-5,7,9-10"
        extract_titles: whether to also extract each table's title text

    Returns:
        List[Tuple[int, pd.DataFrame, tuple, str]]: [(page number, DataFrame, bbox, title), ...]
        When extract_titles=False the title element is always "".

    Raises:
        RuntimeError: if pdfplumber cannot be imported at call time either.
    """
    # Retry the import at call time: the module-load import may have failed
    # while the current runtime environment actually has pdfplumber.
    global _PDFPLUMBER_AVAILABLE, pdfplumber, _PDFPLUMBER_IMPORT_ERROR
    if not _PDFPLUMBER_AVAILABLE:
        try:
            import pdfplumber
            _PDFPLUMBER_AVAILABLE = True
            _PDFPLUMBER_IMPORT_ERROR = None
        except ImportError as e:
            error_msg = f"pdfplumber 库未安装,无法提取表格(请安装 pdfplumber)"
            if _PDFPLUMBER_IMPORT_ERROR:
                error_msg += f"\n模块加载时导入错误: {_PDFPLUMBER_IMPORT_ERROR}"
            error_msg += f"\n运行时导入错误: {e}"
            raise RuntimeError(error_msg)
        except Exception as e:
            error_msg = f"pdfplumber 库导入失败: {e}"
            if _PDFPLUMBER_IMPORT_ERROR:
                error_msg += f"\n模块加载时导入错误: {_PDFPLUMBER_IMPORT_ERROR}"
            raise RuntimeError(error_msg)
    tables_data: List[Tuple[int, pd.DataFrame, tuple, str]] = []
    with pdfplumber.open(pdf_path) as pdf:
        total_pages = len(pdf.pages)
        logger.info(f"[pdfplumber] 打开 PDF,共 {total_pages} 页")
        # Work out which pages to process.
        if pages == "all":
            pages_to_process = pdf.pages
            logger.info(f"[pdfplumber] 处理所有页面: 1-{total_pages}")
        else:
            # Parse a 1-based, inclusive page-range spec such as "1-5,7,9-10".
            page_numbers: List[int] = []
            for part in pages.split(","):
                part = part.strip()
                if not part:
                    continue
                if "-" in part:
                    start, end = map(int, part.split("-"))
                    page_numbers.extend(range(start, end + 1))
                else:
                    page_numbers.append(int(part))
            # Out-of-range page numbers are silently dropped.
            pages_to_process = [
                pdf.pages[i - 1] for i in page_numbers if 0 < i <= len(pdf.pages)
            ]
            logger.info(f"[pdfplumber] 处理指定页面: {page_numbers}")
        # Extract the tables of every selected page.
        for page in pages_to_process:
            page_num = page.page_number
            # Line-based detection: rely on ruled (drawn) table borders.
            table_settings = {
                "vertical_strategy": "lines",
                "horizontal_strategy": "lines",
                "intersection_tolerance": 3,
                "min_words_vertical": 1,
                "min_words_horizontal": 1,
            }
            tables = page.find_tables(table_settings=table_settings)
            page_table_count = 0
            for table in tables:
                table_data = table.extract()
                if table_data and len(table_data) > 0:
                    df = pd.DataFrame(table_data)
                    bbox = table.bbox  # (x0, top, x1, bottom)
                    # Optionally look above the table for a title line.
                    title = ""
                    if extract_titles:
                        title = extract_table_title_from_page(page, bbox)
                        if title:
                            logger.debug(f"[pdfplumber] 页面 {page_num}: 表格标题: {title}")
                    tables_data.append((page_num, df, bbox, title))
                    page_table_count += 1
            if page_table_count > 0:
                logger.debug(f"[pdfplumber] 页面 {page_num}: 提取到 {page_table_count} 个表格")
    logger.info(f"[pdfplumber] 提取完成: 共提取到 {len(tables_data)} 个表格")
    return tables_data
  254. def check_table_header(table_df: pd.DataFrame, rule: dict) -> Tuple[bool, str]:
  255. """
  256. 检查表格是否匹配指定的表头规则(固定匹配:必须包含所有关键词)
  257. 处理表头换行(多行表头)并对关键字做去空格匹配。
  258. """
  259. if table_df.empty:
  260. return False, ""
  261. rule_name = rule.get("name", "未知规则")
  262. if rule_name in EXCLUDE_RULES:
  263. return False, ""
  264. # 默认从第一行开始
  265. start_row = 0
  266. # 合并前几行作为完整表头(通常表头可能占 1-3 行)
  267. header_rows_to_check = min(3, len(table_df) - start_row)
  268. header_text_parts: List[str] = []
  269. for row_idx in range(start_row, start_row + header_rows_to_check):
  270. if row_idx >= len(table_df):
  271. break
  272. row = table_df.iloc[row_idx].astype(str).str.strip()
  273. for cell in row:
  274. cell_text = str(cell).strip()
  275. if cell_text and cell_text.lower() not in ["nan", "none", ""]:
  276. # 将单元格内的换行符替换为空格(处理 xlsx 中的换行)
  277. cell_text = cell_text.replace("\n", " ").replace("\r", " ")
  278. header_text_parts.append(cell_text)
  279. # 合并所有表头文本
  280. header_text = " ".join(header_text_parts)
  281. # 规范空白:多个空格合并
  282. header_text = re.sub(r"\s+", " ", header_text).strip()
  283. # 去掉所有空白,用于处理“中标通知\n书金额”这类情况
  284. header_text_no_space = re.sub(r"\s+", "", header_text)
  285. keywords = rule.get("keywords", [])
  286. match_mode = rule.get("match_mode", "all")
  287. if not keywords:
  288. return False, ""
  289. if match_mode == "all":
  290. all_match = True
  291. for keyword in keywords:
  292. keyword_no_space = re.sub(r"\s+", "", keyword)
  293. if keyword in header_text or keyword_no_space in header_text_no_space:
  294. continue
  295. all_match = False
  296. break
  297. if all_match:
  298. return True, rule_name
  299. elif match_mode == "any":
  300. for keyword in keywords:
  301. keyword_no_space = re.sub(r"\s+", "", keyword)
  302. if keyword in header_text or keyword_no_space in header_text_no_space:
  303. return True, rule_name
  304. return False, ""
  305. def is_likely_header_only(table_df: pd.DataFrame, min_data_rows: int = 1) -> bool:
  306. """
  307. 判断表格是否可能只包含表头(数据行很少)。
  308. """
  309. if table_df.empty:
  310. return True
  311. header_rows = min(3, len(table_df))
  312. return len(table_df) <= header_rows + min_data_rows
  313. def has_similar_structure(table1_df: pd.DataFrame, table2_df: pd.DataFrame, tolerance: int = 1) -> bool:
  314. """
  315. 判断两个表格是否有相似的结构(列数接近)。
  316. """
  317. if table1_df.empty or table2_df.empty:
  318. return False
  319. cols1 = len(table1_df.columns)
  320. cols2 = len(table2_df.columns)
  321. return abs(cols1 - cols2) <= tolerance
def _merge_all_tables(
    tables: List[Tuple[int, pd.DataFrame, int]],
) -> List[Tuple[int, pd.DataFrame, int]]:
    """
    Merge all cross-page tables (no rule-based filtering):
    - only handles the typical "header on this page, body on the next page" case;
    - strictly merges adjacent pages only, and only when the column
      structures are similar.

    Tuple layout: (original index, DataFrame, page number).
    """
    if not ENABLE_MERGE_CROSS_PAGE_TABLES or not tables:
        return tables
    # page -> [(idx, df)]
    page_map: Dict[int, List[Tuple[int, pd.DataFrame]]] = {}
    for orig_idx, df, page in tables:
        page_map.setdefault(page, []).append((orig_idx, df))
    merged: List[Tuple[int, pd.DataFrame, int]] = []
    processed: set[int] = set()  # original indices already emitted or consumed
    sorted_pages = sorted(page_map.keys())
    for page in sorted_pages:
        current_list = page_map[page]
        for orig_idx, df in current_list:
            if orig_idx in processed:
                continue
            current_df = df
            did_merge = False
            # Candidate case: the current table holds only header rows, so its
            # body is probably the first structurally-similar table on the next page.
            if is_likely_header_only(current_df):
                next_page = page + 1
                if next_page in page_map:
                    for next_orig_idx, next_df in page_map[next_page]:
                        if next_orig_idx in processed:
                            continue
                        if not has_similar_structure(current_df, next_df):
                            continue
                        # Merge: keep this page's header rows (at most 3),
                        # append the next page's rows as the body.
                        header_rows = min(3, len(current_df))
                        header_df = current_df.iloc[:header_rows].copy()
                        next_data_df = next_df.copy()
                        # Pad the next table with empty columns when it is
                        # narrower than the header table.
                        if len(header_df.columns) != len(next_data_df.columns):
                            if len(next_data_df.columns) < len(header_df.columns):
                                for _ in range(len(next_data_df.columns), len(header_df.columns)):
                                    next_data_df[len(next_data_df.columns)] = ""
                        merged_df = pd.concat([header_df, next_data_df], ignore_index=True)
                        merged.append((orig_idx, merged_df, page))
                        processed.add(orig_idx)
                        processed.add(next_orig_idx)
                        did_merge = True
                        break
            # Tables that were not merged pass through unchanged.
            if not did_merge and orig_idx not in processed:
                merged.append((orig_idx, current_df, page))
                processed.add(orig_idx)
    return merged
def _merge_cross_page_tables(
    tables: List[Tuple[int, pd.DataFrame, str, int, str]],
    header_rules: List[dict],
) -> List[Tuple[int, pd.DataFrame, str, int, str]]:
    """
    Simplified cross-page merge for tables that already matched a rule:
    - only handles the typical "header on this page, body on the next page" case;
    - strictly merges adjacent pages only, with similar column structure;
    - skips the merge when the next page's first row itself looks like a
      fresh header.

    Tuple layout: (orig_idx, df, rule_name, page, title)
    """
    if not ENABLE_MERGE_CROSS_PAGE_TABLES or not tables:
        return tables
    # page -> [(idx, df, rule_name, title)]
    page_map: Dict[int, List[Tuple[int, pd.DataFrame, str, str]]] = {}
    for orig_idx, df, rule_name, page, title in tables:
        page_map.setdefault(page, []).append((orig_idx, df, rule_name, title))
    merged: List[Tuple[int, pd.DataFrame, str, int, str]] = []
    processed: set[int] = set()
    sorted_pages = sorted(page_map.keys())
    for page in sorted_pages:
        current_list = page_map[page]
        for orig_idx, df, rule_name, title in current_list:
            if orig_idx in processed:
                continue
            current_df = df
            did_merge = False
            # Candidate case: header-only table; look for its body on the next page.
            if is_likely_header_only(current_df):
                next_page = page + 1
                if next_page in page_map:
                    for next_orig_idx, next_df, next_rule_name, next_title in page_map[next_page]:
                        if next_orig_idx in processed:
                            continue
                        # Never merge tables classified under different rules.
                        if rule_name and next_rule_name and rule_name != next_rule_name:
                            continue
                        if not has_similar_structure(current_df, next_df):
                            continue
                        # Flatten the next page's first row to check whether
                        # it looks like a header itself.
                        next_first_row_text = ""
                        if not next_df.empty:
                            next_first_row_text = " ".join(
                                next_df.iloc[0].astype(str).str.strip().tolist()
                            )
                        keyword_count = 0
                        if header_rules and rule_name:
                            for rule in header_rules:
                                if rule.get("name") == rule_name:
                                    kws = rule.get("keywords", [])
                                    keyword_count = sum(
                                        1 for kw in kws if kw in next_first_row_text
                                    )
                                    break
                        # Two or more keyword hits: treat the next page's first
                        # row as a new header and do not merge.
                        if keyword_count >= 2:
                            continue
                        # Merge: keep this page's header rows (at most 3),
                        # append the next page's rows as the body.
                        header_rows = min(3, len(current_df))
                        header_df = current_df.iloc[:header_rows].copy()
                        next_data_df = next_df.copy()
                        # Pad the next table with empty columns when it is
                        # narrower than the header table.
                        if len(header_df.columns) != len(next_data_df.columns):
                            if len(next_data_df.columns) < len(header_df.columns):
                                for _ in range(len(next_data_df.columns), len(header_df.columns)):
                                    next_data_df[len(next_data_df.columns)] = ""
                        merged_df = pd.concat([header_df, next_data_df], ignore_index=True)
                        merged.append((orig_idx, merged_df, rule_name, page, title))
                        processed.add(orig_idx)
                        processed.add(next_orig_idx)
                        did_merge = True
                        break
            # Tables that were not merged pass through unchanged.
            if not did_merge and orig_idx not in processed:
                merged.append((orig_idx, current_df, rule_name, page, title))
                processed.add(orig_idx)
    return merged
def _fix_broken_cells(table_df: pd.DataFrame, header_row_count: int = 1) -> pd.DataFrame:
    """
    Repair cells that were wrongly split across extracted rows (one logical
    cell recognized as several physical rows).

    Detection heuristic:
    1. a row whose columns are mostly empty (more than 50%) while only a few
       columns carry content, and
    2. whose previous row has content in the corresponding columns,
    is treated as a continuation of the previous row and merged into it.

    Args:
        table_df: the table to repair
        header_row_count: number of header rows to leave untouched

    Returns:
        pd.DataFrame: the repaired table

    NOTE(review): if three or more consecutive physical rows belong to one
    logical row, the third merges into the second, which is itself dropped —
    confirm whether chained splits occur in practice.
    """
    if table_df.empty or len(table_df) <= header_row_count:
        return table_df
    df = table_df.copy()
    rows_to_remove = []
    # Scan data rows (everything after the header).
    for i in range(header_row_count, len(df)):
        if i <= header_row_count:
            continue  # skip the first data row: it has no previous data row to merge into
        current_row = df.iloc[i]
        prev_row = df.iloc[i-1]
        # Classify each column of the current row as empty / non-empty.
        non_empty_cols = []
        empty_cols = []
        for j in range(len(df.columns)):
            val = current_row.iloc[j]
            val_str = str(val).strip()
            # NOTE(review): '0' / '0.0' are counted as empty here — presumably
            # intentional for these financial tables; confirm.
            is_empty = (val is None or pd.isna(val) or not val_str or
                        val_str.lower() in ['nan', 'none', '', '0', '0.0'])
            if is_empty:
                empty_cols.append(j)
            else:
                non_empty_cols.append(j)
        # Fully empty rows are left alone.
        if not non_empty_cols:
            continue
        empty_ratio = len(empty_cols) / len(df.columns) if len(df.columns) > 0 else 0
        # More than half the columns empty -> possibly a truncated fragment.
        if empty_ratio > 0.5:
            can_merge = True
            merge_cols = []
            for col_idx in non_empty_cols:
                prev_val = prev_row.iloc[col_idx]
                prev_val_str = str(prev_val).strip()
                curr_val = current_row.iloc[col_idx]
                curr_val_str = str(curr_val).strip()
                # Previous row has content in this column -> mergeable.
                if prev_val_str and prev_val_str.lower() not in ['nan', 'none', '', '0', '0.0']:
                    merge_cols.append(col_idx)
                else:
                    # Previous cell empty but current has content: may be a new
                    # row — unless it is leading-column text, which still looks
                    # like a continuation.
                    if col_idx < len(df.columns) * 0.5:  # within the first 50% of columns
                        # Text (not a pure number) counts as a continuation.
                        if not curr_val_str.replace('.', '').replace('-', '').isdigit():
                            merge_cols.append(col_idx)
                        else:
                            can_merge = False
                            break
                    else:
                        can_merge = False
                        break
            if can_merge and merge_cols:
                # Append each fragment cell onto the previous row's cell.
                for col_idx in merge_cols:
                    prev_val = str(prev_row.iloc[col_idx]).strip()
                    curr_val = str(current_row.iloc[col_idx]).strip()
                    # Concatenate, stripping line breaks from the fragment.
                    merged_val = prev_val + curr_val.replace('\n', '').replace('\r', '')
                    df.iloc[i-1, col_idx] = merged_val
                rows_to_remove.append(i)
                logger.debug(f"[跨行合并] 合并行 {i} 到行 {i-1},列: {merge_cols}")
    # Drop the rows that were folded into their predecessors.
    if rows_to_remove:
        df = df.drop(rows_to_remove).reset_index(drop=True)
        logger.info(f"[跨行合并] 修复了 {len(rows_to_remove)} 个被错误分割的单元格(跨行)")
    return df
  530. def _format_header_text(cell_val: str) -> str:
  531. """
  532. 格式化表头文本:移除换行符和多余空格,用于匹配。
  533. Args:
  534. cell_val: 原始单元格值
  535. Returns:
  536. 格式化后的文本(移除所有空格,用于精确匹配)
  537. """
  538. if not cell_val or str(cell_val).lower() in ['nan', 'none', '']:
  539. return ""
  540. # 清理换行符,直接移除
  541. text = str(cell_val).strip().replace('\n', '').replace('\r', '')
  542. # 移除所有空格,用于匹配
  543. text = re.sub(r'\s+', '', text)
  544. return text
  545. def _detect_header_rows(df: pd.DataFrame, header_row_idx: int, header_keywords: List[str]) -> int:
  546. """
  547. 智能检测表头行数,避免把数据行当作表头。
  548. Args:
  549. df: 数据框
  550. header_row_idx: 已识别的表头行索引
  551. header_keywords: 表头关键词列表
  552. Returns:
  553. 表头行数(从 header_row_idx 开始)
  554. """
  555. header_rows_to_check = 1
  556. # 检查后续行是否是表头的延续
  557. for i in range(header_row_idx + 1, min(header_row_idx + 3, len(df))):
  558. row = df.iloc[i]
  559. row_text = " ".join(row.astype(str).str.strip().tolist())
  560. # 检查是否包含表头关键词
  561. keyword_count = sum(1 for kw in header_keywords if kw in row_text)
  562. # 如果包含关键词,进一步判断
  563. if keyword_count >= 2: # 至少包含2个关键词才可能是表头
  564. # 检查是否主要是数字(如果是,则不是表头)
  565. numeric_count = sum(1 for cell in row if str(cell).strip().replace('.', '').replace('-', '').isdigit())
  566. numeric_ratio = numeric_count / len(row) if len(row) > 0 else 0
  567. # 检查是否包含明显的公司名称、人名等(如果是,则不是表头)
  568. has_company_name = any(
  569. '公司' in str(cell) or '有限' in str(cell) or '股份' in str(cell) or
  570. '工程' in str(cell) or '集团' in str(cell) or '局' in str(cell)
  571. for cell in row
  572. )
  573. # 如果数字占比小于30%且不包含公司名称,可能是表头
  574. if numeric_ratio < 0.3 and not has_company_name:
  575. header_rows_to_check += 1
  576. else:
  577. # 包含公司名称或数字占比高,是数据行,停止
  578. break
  579. else:
  580. # 关键词太少,不是表头,停止
  581. break
  582. return header_rows_to_check
  583. def parse_settlement_summary_table(df: pd.DataFrame) -> List[Dict[str, Any]]:
  584. """
  585. 解析"审定结算汇总表",提取数据并生成 JSON 格式。
  586. 返回格式:
  587. [{
  588. "No": int, # 序号
  589. "name": str, # 项目名称(审计内容)
  590. "settledVerifiedTaxExclusiveInvestment": float, # 结算审定不含税投资(元,两位小数)
  591. "settledVerifiedTaxInclusiveInvestment": float, # 结算审定含税投资(元,两位小数)
  592. }, ...]
  593. """
  594. if df.empty:
  595. return []
  596. # 尝试识别表头行(通常在前几行)
  597. header_row_idx = None
  598. for i in range(min(3, len(df))):
  599. row_text = " ".join(df.iloc[i].astype(str).str.strip().tolist())
  600. # 检查是否包含表头关键词
  601. if any(kw in row_text for kw in ["序号", "审计内容", "审定金额(含税)", "审定金额(不含税)"]):
  602. header_row_idx = i
  603. break
  604. if header_row_idx is None:
  605. # 如果没有找到明确的表头,假设第一行是表头
  606. header_row_idx = 0
  607. # 先修复跨行单元格(在识别表头之前)
  608. df = _fix_broken_cells(df, header_row_count=header_row_idx + 1)
  609. # 合并前几行作为表头(处理多行表头的情况)
  610. header_keywords = ["序号", "审计内容", "送审金额", "审定金额", "增减金额", "备注"]
  611. header_rows_to_check = _detect_header_rows(df, header_row_idx, header_keywords)
  612. header_texts = [] # 每列的合并文本
  613. num_cols = len(df.columns)
  614. # 打印原始表头行(用于调试)
  615. logger.info(f"[审定结算汇总表] 原始表头行 (从第 {header_row_idx + 1} 行开始,共检查 {header_rows_to_check} 行):")
  616. for row_idx in range(header_row_idx, min(header_row_idx + header_rows_to_check, len(df))):
  617. row_data = [str(df.iloc[row_idx, col_idx]).strip() for col_idx in range(num_cols)]
  618. logger.info(f"[审定结算汇总表] 第 {row_idx + 1} 行: {row_data}")
  619. for col_idx in range(num_cols):
  620. col_text_parts = []
  621. for row_idx in range(header_row_idx, header_row_idx + header_rows_to_check):
  622. if row_idx < len(df):
  623. cell_val = str(df.iloc[row_idx, col_idx]).strip()
  624. if cell_val and cell_val.lower() not in ['nan', 'none', '']:
  625. # 格式化表头文本(移除换行符和空格)
  626. formatted_text = _format_header_text(cell_val)
  627. if formatted_text:
  628. col_text_parts.append(formatted_text)
  629. # 合并该列的所有表头文本(已经是格式化后的,无空格)
  630. merged_text = ''.join(col_text_parts)
  631. header_texts.append(merged_text)
  632. logger.info(f"[审定结算汇总表] 合并后的表头文本(已格式化): {header_texts}")
  633. col_no = None # 序号列
  634. col_name = None # 审计内容列(项目名称)
  635. col_tax_exclusive = None # 审定金额(不含税)列
  636. col_tax_inclusive = None # 审定金额(含税)列
  637. for idx, header_text in enumerate(header_texts):
  638. cell_lower = header_text.lower()
  639. if "序号" in header_text or "no" in cell_lower:
  640. col_no = idx
  641. elif "审计内容" in header_text or "项目名称" in header_text or "name" in cell_lower:
  642. col_name = idx
  643. elif "审定金额(不含税)" in header_text or ("不含税" in header_text and "审定" in header_text):
  644. col_tax_exclusive = idx
  645. elif "审定金额(含税)" in header_text or ("审定金额" in header_text and "含税" in header_text):
  646. col_tax_inclusive = idx
  647. # 如果关键列未找到,尝试通过位置推断
  648. if col_no is None:
  649. col_no = 0
  650. if col_name is None:
  651. col_name = 1
  652. # 如果金额列未找到,尝试从后往前找(通常金额列在表格右侧)
  653. if col_tax_exclusive is None or col_tax_inclusive is None:
  654. for idx in range(len(header_texts) - 1, -1, -1):
  655. header_text = header_texts[idx]
  656. if "不含税" in header_text and col_tax_exclusive is None:
  657. col_tax_exclusive = idx
  658. elif "含税" in header_text and "审定" in header_text and col_tax_inclusive is None:
  659. col_tax_inclusive = idx
  660. if col_tax_exclusive is not None and col_tax_inclusive is not None:
  661. break
  662. logger.info(f"[审定结算汇总表] 列识别: 序号={col_no}, 项目名称={col_name}, 不含税={col_tax_exclusive}, 含税={col_tax_inclusive}")
  663. # 从数据行开始解析(跳过表头行)
  664. data_rows = df.iloc[header_row_idx + 1:].reset_index(drop=True)
  665. result = []
  666. def parse_number(value: Any) -> str:
  667. """解析数字,返回字符串保留原始精度"""
  668. if pd.isna(value):
  669. return "0"
  670. value_str = str(value).strip()
  671. # 移除常见的非数字字符(保留小数点、负号)
  672. value_str = re.sub(r'[^\d.\-]', '', value_str)
  673. if not value_str or value_str == '-':
  674. return "0"
  675. return value_str
  676. for idx, row in data_rows.iterrows():
  677. # 跳过空行
  678. if row.isna().all():
  679. continue
  680. # 提取各列数据
  681. no_val = row.iloc[col_no] if col_no is not None and col_no < len(row) else None
  682. name_val = row.iloc[col_name] if col_name is not None and col_name < len(row) else None
  683. tax_exclusive_val = row.iloc[col_tax_exclusive] if col_tax_exclusive is not None and col_tax_exclusive < len(row) else None
  684. tax_inclusive_val = row.iloc[col_tax_inclusive] if col_tax_inclusive is not None and col_tax_inclusive < len(row) else None
  685. # 解析序号
  686. no = None
  687. no_str = ""
  688. if no_val is not None and not pd.isna(no_val):
  689. no_str = str(no_val).strip()
  690. if no_str:
  691. try:
  692. no = int(float(no_str))
  693. except (ValueError, TypeError):
  694. pass
  695. # 跳过序号为空的行(这些通常是合计、其中等说明行)
  696. if not no_str or pd.isna(no_val):
  697. continue
  698. # 解析项目名称(审计内容),清理换行符
  699. name = str(name_val).strip() if name_val is not None and not pd.isna(name_val) else ""
  700. # 清理换行符,替换为空格
  701. name = name.replace('\n', '').replace('\r', '') # 直接移除换行符,不替换为空格
  702. # 清理多余空格
  703. name = re.sub(r'\s+', ' ', name).strip()
  704. # 跳过空行
  705. if not name or name == "":
  706. continue
  707. # 判断是否为合计行(合计行需要跳过)
  708. is_total = any(kw in name for kw in ["合计", "总计", "总计", "合计金额"])
  709. if is_total:
  710. continue
  711. # 解析投资金额
  712. settled_verified_tax_exclusive = parse_number(tax_exclusive_val)
  713. settled_verified_tax_inclusive = parse_number(tax_inclusive_val)
  714. # 添加到结果
  715. result.append({
  716. "No": no if no is not None else idx + 1,
  717. "name": name,
  718. "settledVerifiedTaxExclusiveInvestment": settled_verified_tax_exclusive,
  719. "settledVerifiedTaxInclusiveInvestment": settled_verified_tax_inclusive,
  720. })
  721. return result
  722. def parse_contract_execution_table(df: pd.DataFrame) -> List[Dict[str, Any]]:
  723. """
  724. 解析"合同执行情况"表,提取数据并生成 JSON 格式。
  725. 返回格式:
  726. [{
  727. "No": int, # 序号
  728. "constructionUnit": str, # 施工单位
  729. "bidNoticeAmount": float, # 中标通知书金额(元,两位小数)
  730. "bidNoticeNo": str, # 中标通知书编号
  731. "contractAmount": float, # 合同金额(元,两位小数)
  732. "settlementSubmittedAmount": float, # 结算送审金额(元,两位小数)
  733. "differenceAmount": float, # 差额(元,两位小数)
  734. }, ...]
  735. """
  736. if df.empty:
  737. return []
  738. # 尝试识别表头行(通常在前几行)
  739. header_row_idx = None
  740. for i in range(min(3, len(df))):
  741. row_text = " ".join(df.iloc[i].astype(str).str.strip().tolist())
  742. # 检查是否包含表头关键词
  743. if any(kw in row_text for kw in ["序号", "施工单位", "中标通知书金额", "中标通知书编号", "合同金额", "结算送审金额", "差额"]):
  744. header_row_idx = i
  745. break
  746. if header_row_idx is None:
  747. # 如果没有找到明确的表头,假设第一行是表头
  748. header_row_idx = 0
  749. # 先修复跨行单元格(在识别表头之前)
  750. df = _fix_broken_cells(df, header_row_count=header_row_idx + 1)
  751. # 合并前几行作为表头(处理多行表头的情况)
  752. # 智能判断表头行数:表头行通常不包含纯数字(除了序号列)
  753. header_rows_to_check = 1
  754. for i in range(header_row_idx + 1, min(header_row_idx + 3, len(df))):
  755. row = df.iloc[i]
  756. # 检查这一行是否像表头(包含关键词且不全是数字)
  757. row_text = " ".join(row.astype(str).str.strip().tolist())
  758. has_keywords = any(kw in row_text for kw in ["施工单位", "中标", "合同", "结算", "送审", "差额"])
  759. # 如果包含关键词且看起来不像数据行(不全是数字),则可能是表头的一部分
  760. if has_keywords:
  761. # 检查是否主要是数字(如果是,则不是表头)
  762. numeric_count = sum(1 for cell in row if str(cell).strip().replace('.', '').replace('-', '').isdigit())
  763. if numeric_count < len(row) * 0.5: # 如果数字占比小于50%,可能是表头
  764. header_rows_to_check += 1
  765. else:
  766. break
  767. else:
  768. break
  769. header_texts = [] # 每列的合并文本
  770. num_cols = len(df.columns)
  771. # 打印原始表头行(用于调试)
  772. logger.info(f"[合同执行情况] 原始表头行 (从第 {header_row_idx + 1} 行开始,共检查 {header_rows_to_check} 行):")
  773. for row_idx in range(header_row_idx, min(header_row_idx + header_rows_to_check, len(df))):
  774. row_data = [str(df.iloc[row_idx, col_idx]).strip() for col_idx in range(num_cols)]
  775. logger.info(f"[合同执行情况] 第 {row_idx + 1} 行: {row_data}")
  776. for col_idx in range(num_cols):
  777. col_text_parts = []
  778. for row_idx in range(header_row_idx, header_row_idx + header_rows_to_check):
  779. if row_idx < len(df):
  780. cell_val = str(df.iloc[row_idx, col_idx]).strip()
  781. if cell_val and cell_val.lower() not in ['nan', 'none', '']:
  782. # 格式化表头文本(移除换行符和空格)
  783. formatted_text = _format_header_text(cell_val)
  784. if formatted_text:
  785. col_text_parts.append(formatted_text)
  786. # 合并该列的所有表头文本(已经是格式化后的,无空格)
  787. merged_text = ''.join(col_text_parts)
  788. header_texts.append(merged_text)
  789. logger.info(f"[合同执行情况] 合并后的表头文本(已格式化): {header_texts}")
  790. col_no = None # 序号列
  791. col_construction_unit = None # 施工单位列
  792. col_bid_notice_amount = None # 中标通知书金额列
  793. col_bid_notice_no = None # 中标通知书编号列
  794. col_contract_amount = None # 合同金额列
  795. col_settlement_submitted = None # 结算送审金额列
  796. col_difference = None # 差额列
  797. for idx, header_text in enumerate(header_texts):
  798. cell_lower = header_text.lower()
  799. # header_text 已经是格式化后的(无空格),直接使用
  800. if "序号" in header_text or "no" in cell_lower:
  801. col_no = idx
  802. elif "施工单位" in header_text:
  803. col_construction_unit = idx
  804. elif "中标" in header_text and "金额" in header_text:
  805. col_bid_notice_amount = idx
  806. elif "中标" in header_text and "编号" in header_text:
  807. col_bid_notice_no = idx
  808. elif ("合同金额" in header_text or
  809. ("合同" in header_text and "金额" in header_text and
  810. "结算" not in header_text and "送审" not in header_text)):
  811. col_contract_amount = idx
  812. elif ("结算送审金额" in header_text or
  813. ("送审" in header_text and "金额" in header_text) or
  814. ("结算" in header_text and "送审" in header_text)):
  815. col_settlement_submitted = idx
  816. elif "差额" in header_text:
  817. col_difference = idx
  818. # 如果关键列未找到,尝试通过位置推断
  819. if col_no is None:
  820. col_no = 0
  821. if col_construction_unit is None:
  822. col_construction_unit = 1
  823. # 如果金额列未找到,尝试从后往前找(通常金额列在表格右侧)
  824. # 同时检查列名中是否包含关键词的部分匹配
  825. if col_bid_notice_amount is None:
  826. for idx, header_text in enumerate(header_texts):
  827. if "中标" in header_text and "金额" in header_text:
  828. col_bid_notice_amount = idx
  829. break
  830. if col_bid_notice_no is None:
  831. for idx, header_text in enumerate(header_texts):
  832. if "中标" in header_text and "编号" in header_text:
  833. col_bid_notice_no = idx
  834. break
  835. if col_contract_amount is None:
  836. # 尝试更灵活的匹配:合同金额可能在"合同"和"金额"分开的列中
  837. for idx, header_text in enumerate(header_texts):
  838. # header_text 已经是格式化后的(无空格),直接使用
  839. # 检查是否包含"合同"和"金额",且不包含"结算"、"送审"等
  840. if ("合同" in header_text and "金额" in header_text and
  841. "结算" not in header_text and "送审" not in header_text):
  842. col_contract_amount = idx
  843. break
  844. if col_settlement_submitted is None:
  845. # 尝试更灵活的匹配:结算送审金额可能在"结算"、"送审"、"金额"分开的列中
  846. for idx, header_text in enumerate(header_texts):
  847. # header_text 已经是格式化后的(无空格),直接使用
  848. # 检查是否包含"送审"和"金额",或者"结算"和"送审"
  849. if (("送审" in header_text and "金额" in header_text) or
  850. ("结算" in header_text and "送审" in header_text) or
  851. "结算送审金额" in header_text):
  852. col_settlement_submitted = idx
  853. break
  854. if col_difference is None:
  855. for idx, header_text in enumerate(header_texts):
  856. if "差额" in header_text:
  857. col_difference = idx
  858. break
  859. logger.info(f"[合同执行情况] 列识别: 序号={col_no}, 施工单位={col_construction_unit}, "
  860. f"中标金额={col_bid_notice_amount}, 中标编号={col_bid_notice_no}, "
  861. f"合同金额={col_contract_amount}, 送审金额={col_settlement_submitted}, 差额={col_difference}")
  862. # 从数据行开始解析(跳过表头行)
  863. data_rows = df.iloc[header_row_idx + 1:].reset_index(drop=True)
  864. result = []
  865. def parse_number(value: Any) -> str:
  866. """解析数字,返回字符串保留原始精度"""
  867. if pd.isna(value):
  868. return "0"
  869. value_str = str(value).strip()
  870. # 移除常见的非数字字符(保留小数点、负号)
  871. value_str = re.sub(r'[^\d.\-]', '', value_str)
  872. if not value_str or value_str == '-':
  873. return "0"
  874. return value_str
  875. for idx, row in data_rows.iterrows():
  876. # 跳过空行
  877. if row.isna().all():
  878. continue
  879. # 提取各列数据
  880. no_val = row.iloc[col_no] if col_no is not None and col_no < len(row) else None
  881. construction_unit_val = row.iloc[col_construction_unit] if col_construction_unit is not None and col_construction_unit < len(row) else None
  882. bid_notice_amount_val = row.iloc[col_bid_notice_amount] if col_bid_notice_amount is not None and col_bid_notice_amount < len(row) else None
  883. bid_notice_no_val = row.iloc[col_bid_notice_no] if col_bid_notice_no is not None and col_bid_notice_no < len(row) else None
  884. contract_amount_val = row.iloc[col_contract_amount] if col_contract_amount is not None and col_contract_amount < len(row) else None
  885. settlement_submitted_val = row.iloc[col_settlement_submitted] if col_settlement_submitted is not None and col_settlement_submitted < len(row) else None
  886. difference_val = row.iloc[col_difference] if col_difference is not None and col_difference < len(row) else None
  887. # 解析序号
  888. no = None
  889. no_str = ""
  890. if no_val is not None and not pd.isna(no_val):
  891. no_str = str(no_val).strip()
  892. if no_str:
  893. try:
  894. no = int(float(no_str))
  895. except (ValueError, TypeError):
  896. pass
  897. # 跳过序号为空的行(这些通常是合计、其中等说明行)
  898. if not no_str or pd.isna(no_val):
  899. continue
  900. # 解析施工单位,清理换行符
  901. construction_unit = str(construction_unit_val).strip() if construction_unit_val is not None and not pd.isna(construction_unit_val) else ""
  902. # 清理换行符,直接移除
  903. construction_unit = construction_unit.replace('\n', '').replace('\r', '')
  904. # 清理多余空格
  905. construction_unit = re.sub(r'\s+', ' ', construction_unit).strip()
  906. # 跳过空行
  907. if not construction_unit or construction_unit == "":
  908. continue
  909. # 判断是否为合计行(合计行需要跳过)
  910. is_total = any(kw in construction_unit for kw in ["合计", "总计", "总计", "合计金额"])
  911. if is_total:
  912. continue
  913. # 解析中标通知书编号,清理换行符
  914. bid_notice_no = str(bid_notice_no_val).strip() if bid_notice_no_val is not None and not pd.isna(bid_notice_no_val) else ""
  915. bid_notice_no = bid_notice_no.replace('\n', '').replace('\r', '')
  916. bid_notice_no = re.sub(r'\s+', ' ', bid_notice_no).strip()
  917. # 解析金额(保留两位小数)
  918. bid_notice_amount = parse_number(bid_notice_amount_val)
  919. contract_amount = parse_number(contract_amount_val)
  920. settlement_submitted_amount = parse_number(settlement_submitted_val)
  921. difference_amount = parse_number(difference_val)
  922. # 添加到结果
  923. result.append({
  924. "No": no if no is not None else idx + 1,
  925. "constructionUnit": construction_unit,
  926. "bidNoticeAmount": bid_notice_amount,
  927. "bidNoticeNo": bid_notice_no,
  928. "contractAmount": contract_amount,
  929. "settlementSubmittedAmount": settlement_submitted_amount,
  930. "differenceAmount": difference_amount,
  931. })
  932. return result
  933. def parse_compensation_contract_table(df: pd.DataFrame) -> List[Dict[str, Any]]:
  934. """
  935. 解析"赔偿合同"表,提取数据并生成 JSON 格式。
  936. 返回格式:
  937. [{
  938. "No": int, # 序号
  939. "contractCounterpartyName": str, # 合同对方名称
  940. "compensationItem": str, # 赔偿事项
  941. "contractAmount": float, # 合同金额(元,两位小数)
  942. "settlementSubmittedAmount": float, # 结算送审金额(元,两位小数)
  943. "differenceAmount": float, # 差额(元,两位小数)
  944. }, ...]
  945. """
  946. if df.empty:
  947. return []
  948. # 尝试识别表头行(通常在前几行)
  949. header_row_idx = None
  950. for i in range(min(3, len(df))):
  951. row_text = " ".join(df.iloc[i].astype(str).str.strip().tolist())
  952. # 检查是否包含表头关键词
  953. if any(kw in row_text for kw in ["序号", "合同对方", "赔偿事项", "合同金额", "结算送审金额", "差额"]):
  954. header_row_idx = i
  955. break
  956. if header_row_idx is None:
  957. # 如果没有找到明确的表头,假设第一行是表头
  958. header_row_idx = 0
  959. # 先修复跨行单元格(在识别表头之前)
  960. df = _fix_broken_cells(df, header_row_count=header_row_idx + 1)
  961. # 合并前几行作为表头(处理多行表头的情况)
  962. header_keywords = ["序号", "合同对方", "赔偿事项", "合同金额", "结算送审金额", "差额"]
  963. header_rows_to_check = _detect_header_rows(df, header_row_idx, header_keywords)
  964. header_texts = [] # 每列的合并文本
  965. num_cols = len(df.columns)
  966. # 打印原始表头行(用于调试)
  967. logger.info(f"[赔偿合同] 原始表头行 (从第 {header_row_idx + 1} 行开始,共检查 {header_rows_to_check} 行):")
  968. for row_idx in range(header_row_idx, min(header_row_idx + header_rows_to_check, len(df))):
  969. row_data = [str(df.iloc[row_idx, col_idx]).strip() for col_idx in range(num_cols)]
  970. logger.info(f"[赔偿合同] 第 {row_idx + 1} 行: {row_data}")
  971. for col_idx in range(num_cols):
  972. col_text_parts = []
  973. for row_idx in range(header_row_idx, header_row_idx + header_rows_to_check):
  974. if row_idx < len(df):
  975. cell_val = str(df.iloc[row_idx, col_idx]).strip()
  976. if cell_val and cell_val.lower() not in ['nan', 'none', '']:
  977. # 格式化表头文本(移除换行符和空格)
  978. formatted_text = _format_header_text(cell_val)
  979. if formatted_text:
  980. col_text_parts.append(formatted_text)
  981. # 合并该列的所有表头文本(已经是格式化后的,无空格)
  982. merged_text = ''.join(col_text_parts)
  983. header_texts.append(merged_text)
  984. logger.info(f"[赔偿合同] 合并后的表头文本(已格式化): {header_texts}")
  985. col_no = None # 序号列
  986. col_counterparty_name = None # 合同对方名称列
  987. col_compensation_item = None # 赔偿事项列
  988. col_contract_amount = None # 合同金额列
  989. col_settlement_submitted = None # 结算送审金额列
  990. col_difference = None # 差额列
  991. for idx, header_text in enumerate(header_texts):
  992. cell_lower = header_text.lower()
  993. if "序号" in header_text or "no" in cell_lower:
  994. col_no = idx
  995. elif "合同对方" in header_text:
  996. col_counterparty_name = idx
  997. elif "赔偿事项" in header_text:
  998. col_compensation_item = idx
  999. elif "合同金额" in header_text and "结算" not in header_text:
  1000. col_contract_amount = idx
  1001. elif "结算送审金额" in header_text or ("送审金额" in header_text and "结算" in header_text):
  1002. col_settlement_submitted = idx
  1003. elif "差额" in header_text:
  1004. col_difference = idx
  1005. # 如果关键列未找到,尝试通过位置推断
  1006. if col_no is None:
  1007. col_no = 0
  1008. if col_counterparty_name is None:
  1009. col_counterparty_name = 1
  1010. if col_compensation_item is None:
  1011. col_compensation_item = 2
  1012. # 如果金额列未找到,尝试从后往前找(通常金额列在表格右侧)
  1013. if col_contract_amount is None or col_settlement_submitted is None or col_difference is None:
  1014. for idx in range(len(header_texts) - 1, -1, -1):
  1015. header_text = header_texts[idx]
  1016. if "差额" in header_text and col_difference is None:
  1017. col_difference = idx
  1018. elif "送审" in header_text and "金额" in header_text and col_settlement_submitted is None:
  1019. col_settlement_submitted = idx
  1020. elif "合同金额" in header_text and col_contract_amount is None:
  1021. col_contract_amount = idx
  1022. if col_contract_amount is not None and col_settlement_submitted is not None and col_difference is not None:
  1023. break
  1024. logger.info(f"[赔偿合同] 列识别: 序号={col_no}, 合同对方={col_counterparty_name}, "
  1025. f"赔偿事项={col_compensation_item}, 合同金额={col_contract_amount}, "
  1026. f"送审金额={col_settlement_submitted}, 差额={col_difference}")
  1027. # 从数据行开始解析(跳过表头行)
  1028. data_rows = df.iloc[header_row_idx + 1:].reset_index(drop=True)
  1029. result = []
  1030. def parse_number(value: Any) -> str:
  1031. """解析数字,返回字符串保留原始精度"""
  1032. if pd.isna(value):
  1033. return "0"
  1034. value_str = str(value).strip()
  1035. # 移除常见的非数字字符(保留小数点、负号)
  1036. value_str = re.sub(r'[^\d.\-]', '', value_str)
  1037. if not value_str or value_str == '-':
  1038. return "0"
  1039. return value_str
  1040. for idx, row in data_rows.iterrows():
  1041. # 跳过空行
  1042. if row.isna().all():
  1043. continue
  1044. # 提取各列数据
  1045. no_val = row.iloc[col_no] if col_no is not None and col_no < len(row) else None
  1046. counterparty_name_val = row.iloc[col_counterparty_name] if col_counterparty_name is not None and col_counterparty_name < len(row) else None
  1047. compensation_item_val = row.iloc[col_compensation_item] if col_compensation_item is not None and col_compensation_item < len(row) else None
  1048. contract_amount_val = row.iloc[col_contract_amount] if col_contract_amount is not None and col_contract_amount < len(row) else None
  1049. settlement_submitted_val = row.iloc[col_settlement_submitted] if col_settlement_submitted is not None and col_settlement_submitted < len(row) else None
  1050. difference_val = row.iloc[col_difference] if col_difference is not None and col_difference < len(row) else None
  1051. # 解析序号
  1052. no = None
  1053. no_str = ""
  1054. if no_val is not None and not pd.isna(no_val):
  1055. no_str = str(no_val).strip()
  1056. if no_str:
  1057. try:
  1058. no = int(float(no_str))
  1059. except (ValueError, TypeError):
  1060. pass
  1061. # 跳过序号为空的行(这些通常是合计、其中等说明行)
  1062. if not no_str or pd.isna(no_val):
  1063. continue
  1064. # 解析合同对方名称,清理换行符
  1065. counterparty_name = str(counterparty_name_val).strip() if counterparty_name_val is not None and not pd.isna(counterparty_name_val) else ""
  1066. counterparty_name = counterparty_name.replace('\n', '').replace('\r', '')
  1067. counterparty_name = re.sub(r'\s+', ' ', counterparty_name).strip()
  1068. # 跳过空行
  1069. if not counterparty_name or counterparty_name == "":
  1070. continue
  1071. # 判断是否为合计行(合计行需要跳过)
  1072. is_total = any(kw in counterparty_name for kw in ["合计", "总计", "总计", "合计金额"])
  1073. if is_total:
  1074. continue
  1075. # 解析赔偿事项,清理换行符
  1076. compensation_item = str(compensation_item_val).strip() if compensation_item_val is not None and not pd.isna(compensation_item_val) else ""
  1077. compensation_item = compensation_item.replace('\n', '').replace('\r', '')
  1078. compensation_item = re.sub(r'\s+', ' ', compensation_item).strip()
  1079. # 解析金额(保留两位小数)
  1080. contract_amount = parse_number(contract_amount_val)
  1081. settlement_submitted_amount = parse_number(settlement_submitted_val)
  1082. difference_amount = parse_number(difference_val)
  1083. # 添加到结果
  1084. result.append({
  1085. "No": no if no is not None else idx + 1,
  1086. "contractCounterpartyName": counterparty_name,
  1087. "compensationItem": compensation_item,
  1088. "contractAmount": contract_amount,
  1089. "settlementSubmittedAmount": settlement_submitted_amount,
  1090. "differenceAmount": difference_amount,
  1091. })
  1092. return result
  1093. def parse_material_purchase_contract1_table(df: pd.DataFrame) -> List[Dict[str, Any]]:
  1094. """
  1095. 解析"物资采购合同1"表,提取数据并生成 JSON 格式。
  1096. 返回格式:
  1097. [{
  1098. "No": int, # 序号
  1099. "materialName": str, # 物料名称
  1100. "contractQuantity": float, # 合同数量
  1101. "drawingQuantity": float, # 施工图数量
  1102. "unitPriceExcludingTax": float, # 单价(不含税)(元,两位小数)
  1103. "differenceAmount": float, # 差额(元,两位小数)
  1104. }, ...]
  1105. """
  1106. if df.empty:
  1107. return []
  1108. # 尝试识别表头行(通常在前几行)
  1109. header_row_idx = None
  1110. for i in range(min(3, len(df))):
  1111. row_text = " ".join(df.iloc[i].astype(str).str.strip().tolist())
  1112. # 检查是否包含表头关键词
  1113. if any(kw in row_text for kw in ["序号", "物料名称", "合同数量", "施工图数量", "单价(不含税)", "差额"]):
  1114. header_row_idx = i
  1115. break
  1116. if header_row_idx is None:
  1117. # 如果没有找到明确的表头,假设第一行是表头
  1118. header_row_idx = 0
  1119. # 先修复跨行单元格(在识别表头之前)
  1120. df = _fix_broken_cells(df, header_row_count=header_row_idx + 1)
  1121. # 合并前几行作为表头(处理多行表头的情况)
  1122. header_keywords = ["序号", "物料名称", "合同数量", "施工图数量", "单价", "差额"]
  1123. header_rows_to_check = _detect_header_rows(df, header_row_idx, header_keywords)
  1124. header_texts = [] # 每列的合并文本
  1125. num_cols = len(df.columns)
  1126. # 打印原始表头行(用于调试)
  1127. logger.info(f"[物资采购合同1] 原始表头行 (从第 {header_row_idx + 1} 行开始,共检查 {header_rows_to_check} 行):")
  1128. for row_idx in range(header_row_idx, min(header_row_idx + header_rows_to_check, len(df))):
  1129. row_data = [str(df.iloc[row_idx, col_idx]).strip() for col_idx in range(num_cols)]
  1130. logger.info(f"[物资采购合同1] 第 {row_idx + 1} 行: {row_data}")
  1131. for col_idx in range(num_cols):
  1132. col_text_parts = []
  1133. for row_idx in range(header_row_idx, header_row_idx + header_rows_to_check):
  1134. if row_idx < len(df):
  1135. cell_val = str(df.iloc[row_idx, col_idx]).strip()
  1136. if cell_val and cell_val.lower() not in ['nan', 'none', '']:
  1137. # 格式化表头文本(移除换行符和空格)
  1138. formatted_text = _format_header_text(cell_val)
  1139. if formatted_text:
  1140. col_text_parts.append(formatted_text)
  1141. # 合并该列的所有表头文本(已经是格式化后的,无空格)
  1142. merged_text = ''.join(col_text_parts)
  1143. header_texts.append(merged_text)
  1144. logger.info(f"[物资采购合同1] 合并后的表头文本(已格式化): {header_texts}")
  1145. col_no = None # 序号列
  1146. col_material_name = None # 物料名称列
  1147. col_contract_quantity = None # 合同数量列
  1148. col_drawing_quantity = None # 施工图数量列
  1149. col_unit_price = None # 单价(不含税)列
  1150. col_difference = None # 差额列
  1151. for idx, header_text in enumerate(header_texts):
  1152. cell_lower = header_text.lower()
  1153. if "序号" in header_text or "no" in cell_lower:
  1154. col_no = idx
  1155. elif "物料名称" in header_text:
  1156. col_material_name = idx
  1157. elif "合同数量" in header_text:
  1158. col_contract_quantity = idx
  1159. elif "施工图数量" in header_text:
  1160. col_drawing_quantity = idx
  1161. elif "单价" in header_text and "不含税" in header_text:
  1162. col_unit_price = idx
  1163. elif "差额" in header_text:
  1164. col_difference = idx
  1165. # 如果关键列未找到,尝试通过位置推断
  1166. if col_no is None:
  1167. col_no = 0
  1168. if col_material_name is None:
  1169. col_material_name = 1
  1170. # 如果数量列未找到,尝试查找
  1171. if col_contract_quantity is None:
  1172. for idx, header_text in enumerate(header_texts):
  1173. if "合同" in header_text and "数量" in header_text:
  1174. col_contract_quantity = idx
  1175. break
  1176. if col_drawing_quantity is None:
  1177. for idx, header_text in enumerate(header_texts):
  1178. if "施工图" in header_text and "数量" in header_text:
  1179. col_drawing_quantity = idx
  1180. break
  1181. # 如果金额列未找到,尝试从后往前找(通常金额列在表格右侧)
  1182. if col_unit_price is None:
  1183. for idx, header_text in enumerate(header_texts):
  1184. if "单价" in header_text:
  1185. col_unit_price = idx
  1186. break
  1187. if col_difference is None:
  1188. for idx in range(len(header_texts) - 1, -1, -1):
  1189. if "差额" in header_texts[idx]:
  1190. col_difference = idx
  1191. break
  1192. logger.info(f"[物资采购合同1] 列识别: 序号={col_no}, 物料名称={col_material_name}, "
  1193. f"合同数量={col_contract_quantity}, 施工图数量={col_drawing_quantity}, "
  1194. f"单价={col_unit_price}, 差额={col_difference}")
  1195. # 从数据行开始解析(跳过表头行)
  1196. data_rows = df.iloc[header_row_idx + 1:].reset_index(drop=True)
  1197. result = []
  1198. def parse_number(value: Any) -> str:
  1199. """解析数字,返回字符串保留原始精度"""
  1200. if pd.isna(value):
  1201. return "0"
  1202. value_str = str(value).strip()
  1203. # 移除常见的非数字字符(保留小数点、负号)
  1204. value_str = re.sub(r'[^\d.\-]', '', value_str)
  1205. if not value_str or value_str == '-':
  1206. return "0"
  1207. return value_str
  1208. for idx, row in data_rows.iterrows():
  1209. # 跳过空行
  1210. if row.isna().all():
  1211. continue
  1212. # 提取各列数据
  1213. no_val = row.iloc[col_no] if col_no is not None and col_no < len(row) else None
  1214. material_name_val = row.iloc[col_material_name] if col_material_name is not None and col_material_name < len(row) else None
  1215. contract_quantity_val = row.iloc[col_contract_quantity] if col_contract_quantity is not None and col_contract_quantity < len(row) else None
  1216. drawing_quantity_val = row.iloc[col_drawing_quantity] if col_drawing_quantity is not None and col_drawing_quantity < len(row) else None
  1217. unit_price_val = row.iloc[col_unit_price] if col_unit_price is not None and col_unit_price < len(row) else None
  1218. difference_val = row.iloc[col_difference] if col_difference is not None and col_difference < len(row) else None
  1219. # 解析序号
  1220. no = None
  1221. no_str = ""
  1222. if no_val is not None and not pd.isna(no_val):
  1223. no_str = str(no_val).strip()
  1224. if no_str:
  1225. try:
  1226. no = int(float(no_str))
  1227. except (ValueError, TypeError):
  1228. pass
  1229. # 跳过序号为空的行(这些通常是合计、其中等说明行)
  1230. if not no_str or pd.isna(no_val):
  1231. continue
  1232. # 解析物料名称,清理换行符
  1233. material_name = str(material_name_val).strip() if material_name_val is not None and not pd.isna(material_name_val) else ""
  1234. material_name = material_name.replace('\n', '').replace('\r', '')
  1235. material_name = re.sub(r'\s+', ' ', material_name).strip()
  1236. # 跳过空行
  1237. if not material_name or material_name == "":
  1238. continue
  1239. # 判断是否为合计行(合计行需要跳过)
  1240. is_total = any(kw in material_name for kw in ["合计", "总计", "总计", "合计金额"])
  1241. if is_total:
  1242. continue
  1243. # 解析数量(合同数量和施工图数量可能是整数或小数)
  1244. contract_quantity = parse_number(contract_quantity_val)
  1245. drawing_quantity = parse_number(drawing_quantity_val)
  1246. # 解析金额(保留两位小数)
  1247. unit_price_excluding_tax = parse_number(unit_price_val)
  1248. difference_amount = parse_number(difference_val)
  1249. # 添加到结果
  1250. result.append({
  1251. "No": no if no is not None else idx + 1,
  1252. "materialName": material_name,
  1253. "contractQuantity": contract_quantity,
  1254. "drawingQuantity": drawing_quantity,
  1255. "unitPriceExcludingTax": unit_price_excluding_tax,
  1256. "differenceAmount": difference_amount,
  1257. })
  1258. return result
  1259. def parse_material_purchase_contract2_table(df: pd.DataFrame) -> List[Dict[str, Any]]:
  1260. """
  1261. 解析"物资采购合同2"表,提取数据并生成 JSON 格式。
  1262. 返回格式:
  1263. [{
  1264. "No": int, # 序号
  1265. "materialName": str, # 物料名称
  1266. "contractAmount": float, # 合同金额(元,两位小数)
  1267. "bookedAmount": float, # 入账金额(元,两位小数)
  1268. "differenceAmount": float, # 差额(元,两位小数)
  1269. "remark": str, # 备注
  1270. }, ...]
  1271. """
  1272. if df.empty:
  1273. return []
  1274. # 尝试识别表头行(通常在前几行)
  1275. header_row_idx = None
  1276. for i in range(min(3, len(df))):
  1277. row_text = " ".join(df.iloc[i].astype(str).str.strip().tolist())
  1278. # 检查是否包含表头关键词
  1279. if any(kw in row_text for kw in ["序号", "物料名称", "合同金额(不含税)", "入账金额", "差额", "备注"]):
  1280. header_row_idx = i
  1281. break
  1282. if header_row_idx is None:
  1283. # 如果没有找到明确的表头,假设第一行是表头
  1284. header_row_idx = 0
  1285. # 先修复跨行单元格(在识别表头之前)
  1286. df = _fix_broken_cells(df, header_row_count=header_row_idx + 1)
  1287. # 合并前几行作为表头(处理多行表头的情况)
  1288. header_keywords = ["序号", "物料名称", "合同金额", "入账金额", "差额", "备注"]
  1289. header_rows_to_check = _detect_header_rows(df, header_row_idx, header_keywords)
  1290. header_texts = [] # 每列的合并文本
  1291. num_cols = len(df.columns)
  1292. # 打印原始表头行(用于调试)
  1293. logger.info(f"[物资采购合同2] 原始表头行 (从第 {header_row_idx + 1} 行开始,共检查 {header_rows_to_check} 行):")
  1294. for row_idx in range(header_row_idx, min(header_row_idx + header_rows_to_check, len(df))):
  1295. row_data = [str(df.iloc[row_idx, col_idx]).strip() for col_idx in range(num_cols)]
  1296. logger.info(f"[物资采购合同2] 第 {row_idx + 1} 行: {row_data}")
  1297. for col_idx in range(num_cols):
  1298. col_text_parts = []
  1299. for row_idx in range(header_row_idx, header_row_idx + header_rows_to_check):
  1300. if row_idx < len(df):
  1301. cell_val = str(df.iloc[row_idx, col_idx]).strip()
  1302. if cell_val and cell_val.lower() not in ['nan', 'none', '']:
  1303. # 格式化表头文本(移除换行符和空格)
  1304. formatted_text = _format_header_text(cell_val)
  1305. if formatted_text:
  1306. col_text_parts.append(formatted_text)
  1307. # 合并该列的所有表头文本(已经是格式化后的,无空格)
  1308. merged_text = ''.join(col_text_parts)
  1309. header_texts.append(merged_text)
  1310. logger.info(f"[物资采购合同2] 合并后的表头文本(已格式化): {header_texts}")
  1311. col_no = None # 序号列
  1312. col_material_name = None # 物料名称列
  1313. col_contract_amount = None # 合同金额(不含税)列
  1314. col_booked_amount = None # 入账金额列
  1315. col_difference = None # 差额列
  1316. col_remark = None # 备注列
  1317. for idx, header_text in enumerate(header_texts):
  1318. cell_lower = header_text.lower()
  1319. if "序号" in header_text or "no" in cell_lower:
  1320. col_no = idx
  1321. elif "物料名称" in header_text:
  1322. col_material_name = idx
  1323. elif "合同金额" in header_text and "不含税" in header_text:
  1324. col_contract_amount = idx
  1325. elif "入账金额" in header_text:
  1326. col_booked_amount = idx
  1327. elif "差额" in header_text:
  1328. col_difference = idx
  1329. elif "备注" in header_text:
  1330. col_remark = idx
  1331. # 如果关键列未找到,尝试通过位置推断
  1332. if col_no is None:
  1333. col_no = 0
  1334. if col_material_name is None:
  1335. col_material_name = 1
  1336. # 如果金额列未找到,尝试查找
  1337. if col_contract_amount is None:
  1338. for idx, header_text in enumerate(header_texts):
  1339. if "合同金额" in header_text:
  1340. col_contract_amount = idx
  1341. break
  1342. if col_booked_amount is None:
  1343. for idx, header_text in enumerate(header_texts):
  1344. if "入账" in header_text and "金额" in header_text:
  1345. col_booked_amount = idx
  1346. break
  1347. # 如果差额列未找到,尝试从后往前找(通常差额列在表格右侧)
  1348. if col_difference is None:
  1349. for idx in range(len(header_texts) - 1, -1, -1):
  1350. if "差额" in header_texts[idx]:
  1351. col_difference = idx
  1352. break
  1353. # 备注列通常在最后
  1354. if col_remark is None:
  1355. for idx in range(len(header_texts) - 1, -1, -1):
  1356. if "备注" in header_texts[idx]:
  1357. col_remark = idx
  1358. break
  1359. logger.info(f"[物资采购合同2] 列识别: 序号={col_no}, 物料名称={col_material_name}, "
  1360. f"合同金额={col_contract_amount}, 入账金额={col_booked_amount}, "
  1361. f"差额={col_difference}, 备注={col_remark}")
  1362. # 从数据行开始解析(跳过表头行)
  1363. data_rows = df.iloc[header_row_idx + 1:].reset_index(drop=True)
  1364. result = []
  1365. def parse_number(value: Any) -> str:
  1366. """解析数字,返回字符串保留原始精度"""
  1367. if pd.isna(value):
  1368. return "0"
  1369. value_str = str(value).strip()
  1370. # 移除常见的非数字字符(保留小数点、负号)
  1371. value_str = re.sub(r'[^\d.\-]', '', value_str)
  1372. if not value_str or value_str == '-':
  1373. return "0"
  1374. return value_str
  1375. for idx, row in data_rows.iterrows():
  1376. # 跳过空行
  1377. if row.isna().all():
  1378. continue
  1379. # 提取各列数据
  1380. no_val = row.iloc[col_no] if col_no is not None and col_no < len(row) else None
  1381. material_name_val = row.iloc[col_material_name] if col_material_name is not None and col_material_name < len(row) else None
  1382. contract_amount_val = row.iloc[col_contract_amount] if col_contract_amount is not None and col_contract_amount < len(row) else None
  1383. booked_amount_val = row.iloc[col_booked_amount] if col_booked_amount is not None and col_booked_amount < len(row) else None
  1384. difference_val = row.iloc[col_difference] if col_difference is not None and col_difference < len(row) else None
  1385. remark_val = row.iloc[col_remark] if col_remark is not None and col_remark < len(row) else None
  1386. # 解析序号
  1387. no = None
  1388. no_str = ""
  1389. if no_val is not None and not pd.isna(no_val):
  1390. no_str = str(no_val).strip()
  1391. if no_str:
  1392. try:
  1393. no = int(float(no_str))
  1394. except (ValueError, TypeError):
  1395. pass
  1396. # 跳过序号为空的行(这些通常是合计、其中等说明行)
  1397. if not no_str or pd.isna(no_val):
  1398. continue
  1399. # 解析物料名称,清理换行符
  1400. material_name = str(material_name_val).strip() if material_name_val is not None and not pd.isna(material_name_val) else ""
  1401. material_name = material_name.replace('\n', '').replace('\r', '')
  1402. material_name = re.sub(r'\s+', ' ', material_name).strip()
  1403. # 跳过空行
  1404. if not material_name or material_name == "":
  1405. continue
  1406. # 判断是否为合计行(合计行需要跳过)
  1407. is_total = any(kw in material_name for kw in ["合计", "总计", "总计", "合计金额"])
  1408. if is_total:
  1409. continue
  1410. # 解析金额(保留两位小数)
  1411. contract_amount = parse_number(contract_amount_val)
  1412. booked_amount = parse_number(booked_amount_val)
  1413. difference_amount = parse_number(difference_val)
  1414. # 解析备注,清理换行符
  1415. remark = str(remark_val).strip() if remark_val is not None and not pd.isna(remark_val) else ""
  1416. remark = remark.replace('\n', '').replace('\r', '')
  1417. remark = re.sub(r'\s+', ' ', remark).strip()
  1418. # 添加到结果
  1419. result.append({
  1420. "No": no if no is not None else idx + 1,
  1421. "materialName": material_name,
  1422. "contractAmount": contract_amount,
  1423. "bookedAmount": booked_amount,
  1424. "differenceAmount": difference_amount,
  1425. "remark": remark,
  1426. })
  1427. return result
  1428. def parse_other_service_contract_table(df: pd.DataFrame) -> List[Dict[str, Any]]:
  1429. """
  1430. 解析"其他服务类合同"表,提取数据并生成 JSON 格式。
  1431. 返回格式:
  1432. [{
  1433. "No": int, # 序号
  1434. "serviceProvider": str, # 服务商
  1435. "bidNotice": str, # 中标通知书
  1436. "contractAmount": float, # 合同金额(元,两位小数)
  1437. "submittedAmount": float, # 送审金额(元,两位小数)
  1438. "settlementAmount": float, # 结算金额(元,两位小数)
  1439. }, ...]
  1440. """
  1441. if df.empty:
  1442. return []
  1443. # 尝试识别表头行(通常在前几行)
  1444. header_row_idx = None
  1445. for i in range(min(3, len(df))):
  1446. row_text = " ".join(df.iloc[i].astype(str).str.strip().tolist())
  1447. # 检查是否包含表头关键词
  1448. if any(kw in row_text for kw in ["序号", "服务商", "中标通知书", "合同金额", "送审金额", "结算金额"]):
  1449. header_row_idx = i
  1450. break
  1451. if header_row_idx is None:
  1452. # 如果没有找到明确的表头,假设第一行是表头
  1453. header_row_idx = 0
  1454. # 先修复跨行单元格(在识别表头之前)
  1455. df = _fix_broken_cells(df, header_row_count=header_row_idx + 1)
  1456. # 合并前几行作为表头(处理多行表头的情况)
  1457. header_keywords = ["序号", "服务商", "中标通知书", "合同金额", "送审金额", "结算金额"]
  1458. header_rows_to_check = _detect_header_rows(df, header_row_idx, header_keywords)
  1459. header_texts = [] # 每列的合并文本
  1460. num_cols = len(df.columns)
  1461. # 打印原始表头行(用于调试)
  1462. logger.info(f"[其他服务类合同] 原始表头行 (从第 {header_row_idx + 1} 行开始,共检查 {header_rows_to_check} 行):")
  1463. for row_idx in range(header_row_idx, min(header_row_idx + header_rows_to_check, len(df))):
  1464. row_data = [str(df.iloc[row_idx, col_idx]).strip() for col_idx in range(num_cols)]
  1465. logger.info(f"[其他服务类合同] 第 {row_idx + 1} 行: {row_data}")
  1466. for col_idx in range(num_cols):
  1467. col_text_parts = []
  1468. for row_idx in range(header_row_idx, header_row_idx + header_rows_to_check):
  1469. if row_idx < len(df):
  1470. cell_val = str(df.iloc[row_idx, col_idx]).strip()
  1471. if cell_val and cell_val.lower() not in ['nan', 'none', '']:
  1472. # 格式化表头文本(移除换行符和空格)
  1473. formatted_text = _format_header_text(cell_val)
  1474. if formatted_text:
  1475. col_text_parts.append(formatted_text)
  1476. # 合并该列的所有表头文本(已经是格式化后的,无空格)
  1477. merged_text = ''.join(col_text_parts)
  1478. header_texts.append(merged_text)
  1479. logger.info(f"[其他服务类合同] 合并后的表头文本(已格式化): {header_texts}")
  1480. col_no = None # 序号列
  1481. col_service_provider = None # 服务商列
  1482. col_bid_notice = None # 中标通知书列
  1483. col_contract_amount = None # 合同金额列
  1484. col_submitted_amount = None # 送审金额列
  1485. col_settlement_amount = None # 结算金额列
  1486. for idx, header_text in enumerate(header_texts):
  1487. cell_lower = header_text.lower()
  1488. if "序号" in header_text or "no" in cell_lower:
  1489. col_no = idx
  1490. elif "服务商" in header_text:
  1491. col_service_provider = idx
  1492. elif "中标通知书" in header_text:
  1493. col_bid_notice = idx
  1494. elif "合同金额" in header_text and "送审" not in header_text and "结算" not in header_text:
  1495. col_contract_amount = idx
  1496. elif "送审金额" in header_text:
  1497. col_submitted_amount = idx
  1498. elif "结算金额" in header_text:
  1499. col_settlement_amount = idx
  1500. # 如果关键列未找到,尝试通过位置推断
  1501. if col_no is None:
  1502. col_no = 0
  1503. if col_service_provider is None:
  1504. col_service_provider = 1
  1505. # 如果金额列未找到,尝试查找
  1506. if col_contract_amount is None:
  1507. for idx, header_text in enumerate(header_texts):
  1508. if "合同" in header_text and "金额" in header_text:
  1509. col_contract_amount = idx
  1510. break
  1511. if col_submitted_amount is None:
  1512. for idx, header_text in enumerate(header_texts):
  1513. if "送审" in header_text and "金额" in header_text:
  1514. col_submitted_amount = idx
  1515. break
  1516. if col_settlement_amount is None:
  1517. for idx, header_text in enumerate(header_texts):
  1518. if "结算" in header_text and "金额" in header_text:
  1519. col_settlement_amount = idx
  1520. break
  1521. # 如果中标通知书列未找到,尝试查找
  1522. if col_bid_notice is None:
  1523. for idx, header_text in enumerate(header_texts):
  1524. if "中标" in header_text and "通知" in header_text:
  1525. col_bid_notice = idx
  1526. break
  1527. logger.info(f"[其他服务类合同] 列识别: 序号={col_no}, 服务商={col_service_provider}, "
  1528. f"中标通知书={col_bid_notice}, 合同金额={col_contract_amount}, "
  1529. f"送审金额={col_submitted_amount}, 结算金额={col_settlement_amount}")
  1530. # 从数据行开始解析(跳过表头行)
  1531. data_rows = df.iloc[header_row_idx + 1:].reset_index(drop=True)
  1532. result = []
  1533. def parse_number(value: Any) -> str:
  1534. """解析数字,返回字符串保留原始精度"""
  1535. if pd.isna(value):
  1536. return "0"
  1537. value_str = str(value).strip()
  1538. # 移除常见的非数字字符(保留小数点、负号)
  1539. value_str = re.sub(r'[^\d.\-]', '', value_str)
  1540. if not value_str or value_str == '-':
  1541. return "0"
  1542. return value_str
  1543. for idx, row in data_rows.iterrows():
  1544. # 跳过空行
  1545. if row.isna().all():
  1546. continue
  1547. # 提取各列数据
  1548. no_val = row.iloc[col_no] if col_no is not None and col_no < len(row) else None
  1549. service_provider_val = row.iloc[col_service_provider] if col_service_provider is not None and col_service_provider < len(row) else None
  1550. bid_notice_val = row.iloc[col_bid_notice] if col_bid_notice is not None and col_bid_notice < len(row) else None
  1551. contract_amount_val = row.iloc[col_contract_amount] if col_contract_amount is not None and col_contract_amount < len(row) else None
  1552. submitted_amount_val = row.iloc[col_submitted_amount] if col_submitted_amount is not None and col_submitted_amount < len(row) else None
  1553. settlement_amount_val = row.iloc[col_settlement_amount] if col_settlement_amount is not None and col_settlement_amount < len(row) else None
  1554. # 解析序号
  1555. no = None
  1556. no_str = ""
  1557. if no_val is not None and not pd.isna(no_val):
  1558. no_str = str(no_val).strip()
  1559. if no_str:
  1560. try:
  1561. no = int(float(no_str))
  1562. except (ValueError, TypeError):
  1563. pass
  1564. # 跳过序号为空的行(这些通常是合计、其中等说明行)
  1565. if not no_str or pd.isna(no_val):
  1566. continue
  1567. # 解析服务商,清理换行符
  1568. service_provider = str(service_provider_val).strip() if service_provider_val is not None and not pd.isna(service_provider_val) else ""
  1569. service_provider = service_provider.replace('\n', '').replace('\r', '')
  1570. service_provider = re.sub(r'\s+', ' ', service_provider).strip()
  1571. # 跳过空行
  1572. if not service_provider or service_provider == "":
  1573. continue
  1574. # 判断是否为合计行(合计行需要跳过)
  1575. is_total = any(kw in service_provider for kw in ["合计", "总计", "总计", "合计金额"])
  1576. if is_total:
  1577. continue
  1578. # 解析中标通知书,清理换行符
  1579. bid_notice = str(bid_notice_val).strip() if bid_notice_val is not None and not pd.isna(bid_notice_val) else ""
  1580. bid_notice = bid_notice.replace('\n', '').replace('\r', '')
  1581. bid_notice = re.sub(r'\s+', ' ', bid_notice).strip()
  1582. # 解析金额(保留两位小数)
  1583. contract_amount = parse_number(contract_amount_val)
  1584. submitted_amount = parse_number(submitted_amount_val)
  1585. settlement_amount = parse_number(settlement_amount_val)
  1586. # 添加到结果
  1587. result.append({
  1588. "No": no if no is not None else idx + 1,
  1589. "serviceProvider": service_provider,
  1590. "bidNotice": bid_notice,
  1591. "contractAmount": contract_amount,
  1592. "submittedAmount": submitted_amount,
  1593. "settlementAmount": settlement_amount,
  1594. })
  1595. return result
  1596. def parse_design_review_table(df: pd.DataFrame) -> List[Dict[str, Any]]:
  1597. """
  1598. 解析 designReview 类型的表格,提取数据并生成结构化的 JSON 格式。
  1599. 返回格式:
  1600. [{
  1601. "name": str, # 大类名称(如"变电工程"、"线路工程")
  1602. "Level": 0, # 大类层级
  1603. "staticInvestment": float, # 静态投资总计
  1604. "dynamicInvestment": float, # 动态投资总计
  1605. "items": [ # 子项列表
  1606. {
  1607. "No": int, # 序号
  1608. "name": str, # 工程名称
  1609. "Level": 1, # 子项层级
  1610. "staticInvestment": float, # 静态投资
  1611. "dynamicInvestment": float, # 动态投资
  1612. },
  1613. ...
  1614. ]
  1615. }, ...]
  1616. """
  1617. if df.empty:
  1618. return []
  1619. # 中文数字映射(用于识别序号格式)
  1620. CHINESE_NUMBERS = {
  1621. '一': 1, '二': 2, '三': 3, '四': 4, '五': 5, '六': 6, '七': 7, '八': 8, '九': 9, '十': 10,
  1622. '十一': 11, '十二': 12, '十三': 13, '十四': 14, '十五': 15, '十六': 16, '十七': 17, '十八': 18, '十九': 19, '二十': 20
  1623. }
  1624. def is_category_by_serial(no_str: str) -> bool:
  1625. """
  1626. 通过序号格式判断是否为大类
  1627. 大类:序号为中文数字,如"一"、"二"、"三"(不含括号)
  1628. 子类:序号为带括号的中文数字,如"(一)"、"(二)"、"(三)"
  1629. """
  1630. if not no_str:
  1631. return False
  1632. no_str = str(no_str).strip()
  1633. # 如果包含括号,是子类
  1634. if '(' in no_str or '(' in no_str or ')' in no_str or ')' in no_str:
  1635. return False
  1636. # 检查是否是中文数字(大类)
  1637. # 移除可能的空格和标点
  1638. cleaned = no_str.replace(' ', '').replace('、', '').replace('.', '').replace('。', '')
  1639. # 检查是否以中文数字开头
  1640. for chinese_num in CHINESE_NUMBERS.keys():
  1641. if cleaned.startswith(chinese_num):
  1642. return True
  1643. return False
  1644. # 尝试识别表头行(通常在前几行)
  1645. header_row_idx = None
  1646. for i in range(min(3, len(df))):
  1647. row_text = " ".join(df.iloc[i].astype(str).str.strip().tolist())
  1648. # 检查是否包含表头关键词
  1649. if any(kw in row_text for kw in ["序号", "工程名称", "建设规模", "静态投资", "动态投资"]):
  1650. header_row_idx = i
  1651. break
  1652. if header_row_idx is None:
  1653. # 如果没有找到明确的表头,假设第一行是表头
  1654. header_row_idx = 0
  1655. # 从表头行识别列索引
  1656. header_row = df.iloc[header_row_idx].astype(str).str.strip()
  1657. col_no = None # 序号列
  1658. col_name = None # 工程名称列
  1659. col_scale = None # 建设规模列(用于判断层级)
  1660. col_static = None # 静态投资列
  1661. col_dynamic = None # 动态投资列
  1662. for idx, cell in enumerate(header_row):
  1663. cell_lower = cell.lower()
  1664. if "序号" in cell or "no" in cell_lower:
  1665. col_no = idx
  1666. elif "工程名称" in cell or "name" in cell_lower:
  1667. col_name = idx
  1668. elif "建设规模" in cell or "规模" in cell:
  1669. col_scale = idx
  1670. elif "静态投资" in cell or "static" in cell_lower:
  1671. col_static = idx
  1672. elif "动态投资" in cell or "dynamic" in cell_lower:
  1673. col_dynamic = idx
  1674. # 如果关键列未找到,尝试通过位置推断(通常顺序:序号、工程名称、建设规模、静态投资、动态投资)
  1675. if col_no is None:
  1676. col_no = 0
  1677. if col_name is None:
  1678. col_name = 1
  1679. if col_static is None:
  1680. # 从后往前找
  1681. for idx in range(len(df.columns) - 1, -1, -1):
  1682. if "投资" in str(df.iloc[header_row_idx, idx]) or "元" in str(df.iloc[header_row_idx, idx]):
  1683. if col_dynamic is None:
  1684. col_dynamic = idx
  1685. elif col_static is None:
  1686. col_static = idx
  1687. break
  1688. # 从数据行开始解析(跳过表头行)
  1689. data_rows = df.iloc[header_row_idx + 1:].reset_index(drop=True)
  1690. def parse_number(value: Any) -> str:
  1691. """解析数字,返回字符串保留原始精度"""
  1692. if pd.isna(value):
  1693. return "0"
  1694. value_str = str(value).strip()
  1695. # 移除常见的非数字字符(保留小数点、负号)
  1696. value_str = re.sub(r'[^\d.\-]', '', value_str)
  1697. if not value_str or value_str == '-':
  1698. return "0"
  1699. return value_str
  1700. # 先解析所有行
  1701. all_items = []
  1702. for idx, row in data_rows.iterrows():
  1703. # 跳过空行
  1704. if row.isna().all():
  1705. continue
  1706. # 提取各列数据
  1707. no_val = row.iloc[col_no] if col_no is not None and col_no < len(row) else None
  1708. name_val = row.iloc[col_name] if col_name is not None and col_name < len(row) else None
  1709. static_val = row.iloc[col_static] if col_static is not None and col_static < len(row) else None
  1710. dynamic_val = row.iloc[col_dynamic] if col_dynamic is not None and col_dynamic < len(row) else None
  1711. # 解析序号(保留原始字符串)
  1712. no = None
  1713. no_str = ""
  1714. if no_val is not None and not pd.isna(no_val):
  1715. no_str = str(no_val).strip()
  1716. if no_str:
  1717. # 先尝试解析为数字
  1718. try:
  1719. no = int(float(no_str))
  1720. except (ValueError, TypeError):
  1721. # 如果不是数字,尝试解析中文数字
  1722. cleaned = no_str.replace(' ', '').replace('、', '').replace('.', '').replace('。', '')
  1723. # 移除括号
  1724. cleaned_no_brackets = cleaned.replace('(', '').replace('(', '').replace(')', '').replace(')', '')
  1725. if cleaned_no_brackets in CHINESE_NUMBERS:
  1726. no = CHINESE_NUMBERS[cleaned_no_brackets]
  1727. else:
  1728. # 尝试匹配中文数字前缀
  1729. for chinese_num, num_val in CHINESE_NUMBERS.items():
  1730. if cleaned_no_brackets.startswith(chinese_num):
  1731. no = num_val
  1732. break
  1733. # 跳过序号为空的行(这些通常是"其中:"等说明行)
  1734. if not no_str or pd.isna(no_val):
  1735. continue
  1736. # 解析工程名称
  1737. name = str(name_val).strip() if name_val is not None and not pd.isna(name_val) else ""
  1738. # 跳过空行
  1739. if not name or name == "":
  1740. continue
  1741. # 判断是否为合计行(合计行需要跳过)
  1742. is_total = any(kw in name for kw in ["合计", "总计", "总计", "合计金额"])
  1743. if is_total:
  1744. continue
  1745. # 解析投资金额
  1746. static_investment = parse_number(static_val)
  1747. dynamic_investment = parse_number(dynamic_val)
  1748. # 判断是否为大类:通过序号格式识别
  1749. is_category = is_category_by_serial(no_str)
  1750. all_items.append({
  1751. "No": no if no is not None else idx + 1,
  1752. "name": name,
  1753. "isCategory": is_category,
  1754. "staticInvestment": static_investment,
  1755. "dynamicInvestment": dynamic_investment,
  1756. })
  1757. # 构建层级结构
  1758. result = []
  1759. current_category = None
  1760. for item in all_items:
  1761. if item["isCategory"]:
  1762. # 如果遇到新的大类,先保存之前的大类(如果有)
  1763. if current_category is not None:
  1764. result.append(current_category)
  1765. # 创建新的大类
  1766. current_category = {
  1767. "name": item["name"],
  1768. "Level": 0,
  1769. "staticInvestment": item["staticInvestment"],
  1770. "dynamicInvestment": item["dynamicInvestment"],
  1771. "items": []
  1772. }
  1773. else:
  1774. # 子项,添加到当前大类
  1775. if current_category is not None:
  1776. current_category["items"].append({
  1777. "No": item["No"],
  1778. "name": item["name"],
  1779. "Level": 1,
  1780. "staticInvestment": item["staticInvestment"],
  1781. "dynamicInvestment": item["dynamicInvestment"],
  1782. })
  1783. else:
  1784. # 如果没有大类,作为独立项(不应该发生,但容错处理)
  1785. result.append({
  1786. "name": item["name"],
  1787. "Level": 1,
  1788. "staticInvestment": item["staticInvestment"],
  1789. "dynamicInvestment": item["dynamicInvestment"],
  1790. "items": []
  1791. })
  1792. # 保存最后一个大类
  1793. if current_category is not None:
  1794. result.append(current_category)
  1795. return result
def parse_design_review_detail_table(df: pd.DataFrame, table_title: str) -> List[Dict[str, Any]]:
    """
    Parse a designReview estimate-detail table (rule 2).

    Expected header layout:
        序号 | 工程或费用名称 | 建筑工程费 | 设备购置费 | 安装工程费 | 其他费用 | 合计 | ...

    Args:
        df: table DataFrame
        table_title: table title (e.g. "周村 220kV 变电站新建工程总概算表"),
            used to derive the project name

    Returns:
        [{
            "No": int,                       # serial number
            "Level": int,                    # detail level (1 = category, 2 = sub-item)
            "name": str,                     # project name derived from the title
            "projectOrExpenseName": str,     # project or expense name
            "constructionProjectCost": str,  # construction cost, numeric string
            "equipmentPurchaseCost": str,    # equipment purchase cost, numeric string
            "installationProjectCost": str,  # installation cost, numeric string
            "otherExpenses": str,            # other expenses, numeric string
        }, ...]

    Note: cost values are numeric *strings* (original precision preserved,
    "0" for empty/invalid cells).
    """
    if df.empty:
        return []
    # Derive the project name from the title,
    # e.g. "周村 220kV 变电站新建工程总概算表" -> "周村220kV变电站新建工程".
    project_name = table_title
    if project_name:
        # Drop trailing suffixes such as "总概算表", "概算表", "估算表", "汇总表".
        project_name = re.sub(r'(总概算表|概算表|估算表|汇总表)$', '', project_name)
        # Remove all whitespace.
        project_name = re.sub(r'\s+', '', project_name)
    # Chinese numeral lookup (used for level detection).
    CHINESE_NUMBERS = {
        '一': 1, '二': 2, '三': 3, '四': 4, '五': 5, '六': 6, '七': 7, '八': 8, '九': 9, '十': 10,
        '十一': 11, '十二': 12, '十三': 13, '十四': 14, '十五': 15, '十六': 16, '十七': 17, '十八': 18, '十九': 19, '二十': 20
    }
    # Special "其中:" items that must be treated as Level 1 (standalone
    # categories) instead of sub-items.
    SPECIAL_LEVEL1_ITEMS = [
        "可抵扣固定资产增值税额",
    ]
    def determine_level(no_str: str, expense_name: str = "") -> int:
        """
        Decide the level from the serial-number format and the expense name:
        - Chinese numeral (一, 二, 三): Level 1 (category)
        - Arabic numeral (1, 2, 3): Level 1 (category)
        - Bracketed numeral ((1), (一)): Level 2 (sub-item)
        - Rows starting with "其中:": Level 2 (sub-item), except special items
        """
        no_str = str(no_str).strip() if no_str else ""
        expense_name = str(expense_name).strip() if expense_name else ""
        # Strip the "其中:" prefix (full- or half-width colon) for the special-item check.
        name_without_prefix = expense_name
        if expense_name.startswith("其中:"):
            name_without_prefix = expense_name[3:]
        elif expense_name.startswith("其中:"):
            name_without_prefix = expense_name[3:]
        # Special items are always Level 1.
        for special_item in SPECIAL_LEVEL1_ITEMS:
            if special_item in name_without_prefix:
                return 1
        # Any other row starting with "其中:" -> Level 2 (sub-item).
        if expense_name.startswith("其中:") or expense_name.startswith("其中:"):
            return 2
        # No serial number: default to Level 1.
        if not no_str:
            return 1
        # Brackets (full- or half-width) -> Level 2 (sub-item).
        if '(' in no_str or '(' in no_str or ')' in no_str or ')' in no_str:
            return 2
        # Chinese numeral (一, 二, 三, ...) -> Level 1 (category).
        cleaned = no_str.replace(' ', '').replace('、', '').replace('.', '').replace('。', '')
        for chinese_num in CHINESE_NUMBERS.keys():
            if cleaned == chinese_num or cleaned.startswith(chinese_num):
                return 1
        # Arabic numeral (1, 2, 3, ...) -> Level 1 (category).
        if re.match(r'^\d+\.?$', cleaned):
            return 1
        return 1  # default
    def parse_number(value: Any) -> str:
        """Parse a numeric cell; return it as a string to keep original precision."""
        if pd.isna(value):
            return "0"
        value_str = str(value).strip()
        # Drop everything but digits, decimal point and minus sign.
        value_str = re.sub(r'[^\d.\-]', '', value_str)
        if not value_str or value_str == '-':
            return "0"
        return value_str
    # Locate the header area.  The header may span several rows, so first find
    # the row that contains "序号" as the starting row.
    header_start_idx = None
    for i in range(min(5, len(df))):
        row_text = " ".join(df.iloc[i].astype(str).str.strip().tolist())
        row_text_no_space = row_text.replace(" ", "")
        if "序号" in row_text_no_space:
            header_start_idx = i
            break
    if header_start_idx is None:
        header_start_idx = 0
        logger.warning(f"[表格解析] 未找到包含'序号'的表头行,使用第一行作为表头")
    # Scan the header area (possibly multi-row) and collect column keywords.
    col_no = None
    col_name = None
    col_construction = None
    col_equipment = None
    col_installation = None
    col_other = None
    # Log the first few rows for debugging.
    logger.debug(f"[表格解析] DataFrame 形状: {df.shape}")
    for i in range(min(3, len(df))):
        row_vals = df.iloc[i].astype(str).tolist()
        logger.debug(f"[表格解析] 第{i}行: {row_vals}")
    # Walk at most 3 header rows (multi-row header case).
    header_end_idx = header_start_idx
    for i in range(header_start_idx, min(header_start_idx + 3, len(df))):
        row = df.iloc[i].astype(str).str.strip()
        for idx, cell in enumerate(row):
            # Remove all whitespace including line breaks.
            cell_no_space = re.sub(r'\s+', '', cell)
            if ("序号" in cell_no_space or cell_no_space == "No") and col_no is None:
                col_no = idx
            elif ("工程或费用名称" in cell_no_space or "费用名称" in cell_no_space) and col_name is None:
                col_name = idx
            # Looser keyword matching for the cost columns.
            elif col_construction is None and any(kw in cell_no_space for kw in ["建筑工程费", "建筑工程", "建筑"]):
                col_construction = idx
            elif col_equipment is None and any(kw in cell_no_space for kw in ["设备购置费", "设备购置", "设备"]):
                col_equipment = idx
            elif col_installation is None and any(kw in cell_no_space for kw in ["安装工程费", "安装工程", "安装"]):
                col_installation = idx
            elif col_other is None and "其他费用" in cell_no_space:
                col_other = idx
        # If this row looks like a data row (first cell is an Arabic or
        # Chinese numeral), the header ended on the previous row.
        first_cell = row.iloc[0] if len(row) > 0 else ""
        first_cell_clean = re.sub(r'\s+', '', first_cell).replace("、", "")
        chinese_nums = ['一', '二', '三', '四', '五', '六', '七', '八', '九', '十']
        if first_cell_clean and (first_cell_clean.isdigit() or first_cell_clean in chinese_nums):
            # Probably a data row: header ends just above it.
            header_end_idx = i - 1
            break
        header_end_idx = i
    # Positional fallbacks for the key columns.
    if col_no is None:
        col_no = 0
    if col_name is None:
        col_name = 1
    # If cost columns are still unknown, infer them by position, assuming the
    # order: 序号(0), 名称(1), 建筑(2), 设备(3), 安装(4), 其他(5), 合计(6)...
    logger.info(f"[表格解析] 列数: {len(df.columns)}, 当前列索引: 建筑={col_construction}, 设备={col_equipment}, 安装={col_installation}, 其他={col_other}")
    if col_construction is None or col_equipment is None or col_installation is None or col_other is None:
        num_cols = len(df.columns)
        if num_cols >= 7:
            # Infer by position.
            if col_construction is None:
                col_construction = 2
            if col_equipment is None:
                col_equipment = 3
            if col_installation is None:
                col_installation = 4
            if col_other is None:
                col_other = 5
            logger.info(f"[表格解析] 按位置推断列索引: 建筑={col_construction}, 设备={col_equipment}, 安装={col_installation}, 其他={col_other}")
        else:
            logger.warning(f"[表格解析] 列数不足({num_cols}<7),无法按位置推断费用列")
    logger.info(f"[表格解析] 列索引: 序号={col_no}, 名称={col_name}, 建筑={col_construction}, 设备={col_equipment}, 安装={col_installation}, 其他={col_other}")
    logger.info(f"[表格解析] 表头范围: 行 {header_start_idx} - {header_end_idx}")
    # Parse from the first data row onward (skip all header rows).
    data_rows = df.iloc[header_end_idx + 1:].reset_index(drop=True)
    result = []
    for idx, row in data_rows.iterrows():
        # Skip completely empty rows.
        if row.isna().all():
            continue
        # Pull the raw cell of each recognized column.
        no_val = row.iloc[col_no] if col_no is not None and col_no < len(row) else None
        name_val = row.iloc[col_name] if col_name is not None and col_name < len(row) else None
        construction_val = row.iloc[col_construction] if col_construction is not None and col_construction < len(row) else None
        equipment_val = row.iloc[col_equipment] if col_equipment is not None and col_equipment < len(row) else None
        installation_val = row.iloc[col_installation] if col_installation is not None and col_installation < len(row) else None
        other_val = row.iloc[col_other] if col_other is not None and col_other < len(row) else None
        # Parse the serial number.
        no = None
        no_str = ""
        if no_val is not None and not pd.isna(no_val):
            no_str = str(no_val).strip()
            if no_str:
                try:
                    no = int(float(no_str))
                except (ValueError, TypeError):
                    # Not Arabic; try a Chinese numeral (brackets removed).
                    cleaned = no_str.replace(' ', '').replace('、', '').replace('.', '').replace('。', '')
                    cleaned_no_brackets = cleaned.replace('(', '').replace('(', '').replace(')', '').replace(')', '')
                    if cleaned_no_brackets in CHINESE_NUMBERS:
                        no = CHINESE_NUMBERS[cleaned_no_brackets]
        # Parse the expense name early (needed for the "其中:" check below).
        expense_name = str(name_val).strip() if name_val is not None and not pd.isna(name_val) else ""
        # Skip rows without a name.
        if not expense_name:
            continue
        # Rows starting with "其中:" usually have no serial number but must be kept.
        is_sub_item = expense_name.startswith("其中:") or expense_name.startswith("其中:")
        # Skip rows with an empty/invalid serial number (except "其中:" rows).
        if (not no_str or pd.isna(no_val)) and not is_sub_item:
            continue
        # Skip total rows.
        if any(kw in expense_name for kw in ["合计", "总计", "小计"]):
            continue
        # Decide the level (the expense name drives the "其中:" rule).
        level = determine_level(no_str, expense_name)
        # Strip the "其中:" prefix from the stored name.
        clean_expense_name = expense_name
        if expense_name.startswith("其中:"):
            clean_expense_name = expense_name[3:]
        elif expense_name.startswith("其中:"):
            clean_expense_name = expense_name[3:]
        # Parse the cost values.
        construction_cost = parse_number(construction_val)
        equipment_cost = parse_number(equipment_val)
        installation_cost = parse_number(installation_val)
        other_cost = parse_number(other_val)
        result.append({
            "No": no if no is not None else idx + 1,
            "Level": level,
            "name": project_name,  # project name derived from the title
            "projectOrExpenseName": clean_expense_name,
            "constructionProjectCost": construction_cost,
            "equipmentPurchaseCost": equipment_cost,
            "installationProjectCost": installation_cost,
            "otherExpenses": other_cost,
        })
    logger.info(f"[表格解析] 解析完成: 工程名称={project_name}, 共 {len(result)} 条数据")
    return result
  2027. def parse_design_review_cost_table(df: pd.DataFrame, table_title: str) -> List[Dict[str, Any]]:
  2028. """
  2029. 解析 designReview 类型的概算投资费用表格(规则3)。
  2030. 表头格式:
  2031. 序号 | 工程或费用名称 | 费用金额 | 各项占静态投资% | 单位投资万元/km
  2032. Args:
  2033. df: 表格 DataFrame
  2034. table_title: 表格标题(如"周村 220kV 变电站新建工程总概算表"),用于提取工程名称
  2035. 返回格式:
  2036. [{
  2037. "No": int, # 序号
  2038. "Level": int, # 明细等级
  2039. "name": str, # 单项工程名称(从标题提取)
  2040. "projectOrExpenseName": str, # 工程或费用名称
  2041. "cost": float, # 费用金额(元)
  2042. }, ...]
  2043. """
  2044. if df.empty:
  2045. return []
  2046. # 从标题中提取工程名称
  2047. project_name = table_title
  2048. if project_name:
  2049. # 移除"总概算表"、"概算表"、"估算表"等后缀
  2050. project_name = re.sub(r'(总概算表|概算表|估算表|汇总表)$', '', project_name)
  2051. # 移除多余空格
  2052. project_name = re.sub(r'\s+', '', project_name)
  2053. # 中文数字映射(用于判断层级)
  2054. CHINESE_NUMBERS = {
  2055. '一': 1, '二': 2, '三': 3, '四': 4, '五': 5, '六': 6, '七': 7, '八': 8, '九': 9, '十': 10,
  2056. '十一': 11, '十二': 12, '十三': 13, '十四': 14, '十五': 15, '十六': 16, '十七': 17, '十八': 18, '十九': 19, '二十': 20
  2057. }
  2058. # 特殊的"其中:"项,应作为 Level 1(独立大类)而非子项
  2059. SPECIAL_LEVEL1_ITEMS = [
  2060. "可抵扣固定资产增值税额",
  2061. ]
  2062. def determine_level(no_str: str, expense_name: str = "") -> int:
  2063. """
  2064. 根据序号格式和费用名称判断层级:
  2065. - 中文数字(一、二、三):Level 1(大类)
  2066. - 阿拉伯数字(1、2、3):Level 1(大类)
  2067. - 带括号的数字((1)、(一)):Level 2(子项)
  2068. - 以"其中:"开头的行:Level 2(子项),但特殊项除外
  2069. """
  2070. no_str = str(no_str).strip() if no_str else ""
  2071. expense_name = str(expense_name).strip() if expense_name else ""
  2072. # 移除"其中:"前缀用于判断
  2073. name_without_prefix = expense_name
  2074. if expense_name.startswith("其中:"):
  2075. name_without_prefix = expense_name[3:]
  2076. elif expense_name.startswith("其中:"):
  2077. name_without_prefix = expense_name[3:]
  2078. # 检查是否为特殊的 Level 1 项
  2079. for special_item in SPECIAL_LEVEL1_ITEMS:
  2080. if special_item in name_without_prefix:
  2081. return 1
  2082. # 以"其中:"或"其中:"开头的 -> Level 2(子项)
  2083. if expense_name.startswith("其中:") or expense_name.startswith("其中:"):
  2084. return 2
  2085. # 序号为空的情况,默认 Level 1
  2086. if not no_str:
  2087. return 1
  2088. # 带括号的 -> Level 2(子项)
  2089. if '(' in no_str or '(' in no_str or ')' in no_str or ')' in no_str:
  2090. return 2
  2091. # 中文数字(一、二、三等)-> Level 1(大类)
  2092. cleaned = no_str.replace(' ', '').replace('、', '').replace('.', '').replace('。', '')
  2093. for chinese_num in CHINESE_NUMBERS.keys():
  2094. if cleaned == chinese_num or cleaned.startswith(chinese_num):
  2095. return 1
  2096. # 阿拉伯数字(1、2、3等)-> Level 1(大类)
  2097. if re.match(r'^\d+\.?$', cleaned):
  2098. return 1
  2099. return 1
  2100. def parse_number(value: Any) -> str:
  2101. """解析数字,返回字符串保留原始精度"""
  2102. if pd.isna(value):
  2103. return "0"
  2104. value_str = str(value).strip()
  2105. value_str = re.sub(r'[^\d.\-]', '', value_str)
  2106. if not value_str or value_str == '-':
  2107. return "0"
  2108. return value_str
  2109. # 识别表头行
  2110. header_row_idx = None
  2111. for i in range(min(5, len(df))):
  2112. row_text = " ".join(df.iloc[i].astype(str).str.strip().tolist())
  2113. row_text_no_space = row_text.replace(" ", "")
  2114. if "工程或费用名称" in row_text_no_space and "费用金额" in row_text_no_space:
  2115. header_row_idx = i
  2116. break
  2117. elif "序号" in row_text and "费用金额" in row_text:
  2118. header_row_idx = i
  2119. break
  2120. if header_row_idx is None:
  2121. header_row_idx = 0
  2122. logger.warning(f"[表格解析] 未找到明确表头,使用第一行作为表头")
  2123. # 从表头行识别列索引
  2124. header_row = df.iloc[header_row_idx].astype(str).str.strip()
  2125. col_no = None
  2126. col_name = None
  2127. col_cost = None
  2128. for idx, cell in enumerate(header_row):
  2129. cell_no_space = cell.replace(" ", "")
  2130. if "序号" in cell_no_space or cell_no_space == "No":
  2131. col_no = idx
  2132. elif "工程或费用名称" in cell_no_space or "费用名称" in cell_no_space:
  2133. col_name = idx
  2134. elif "费用金额" in cell_no_space:
  2135. col_cost = idx
  2136. if col_no is None:
  2137. col_no = 0
  2138. if col_name is None:
  2139. col_name = 1
  2140. if col_cost is None:
  2141. col_cost = 2
  2142. logger.info(f"[表格解析] 列索引: 序号={col_no}, 名称={col_name}, 费用金额={col_cost}")
  2143. # 从数据行开始解析
  2144. data_rows = df.iloc[header_row_idx + 1:].reset_index(drop=True)
  2145. result = []
  2146. for idx, row in data_rows.iterrows():
  2147. if row.isna().all():
  2148. continue
  2149. no_val = row.iloc[col_no] if col_no is not None and col_no < len(row) else None
  2150. name_val = row.iloc[col_name] if col_name is not None and col_name < len(row) else None
  2151. cost_val = row.iloc[col_cost] if col_cost is not None and col_cost < len(row) else None
  2152. # 解析序号
  2153. no = None
  2154. no_str = ""
  2155. if no_val is not None and not pd.isna(no_val):
  2156. no_str = str(no_val).strip()
  2157. if no_str:
  2158. try:
  2159. no = int(float(no_str))
  2160. except (ValueError, TypeError):
  2161. cleaned = no_str.replace(' ', '').replace('、', '').replace('.', '').replace('。', '')
  2162. cleaned_no_brackets = cleaned.replace('(', '').replace('(', '').replace(')', '').replace(')', '')
  2163. if cleaned_no_brackets in CHINESE_NUMBERS:
  2164. no = CHINESE_NUMBERS[cleaned_no_brackets]
  2165. # 解析工程或费用名称(提前解析,用于判断是否为"其中:"行)
  2166. expense_name = str(name_val).strip() if name_val is not None and not pd.isna(name_val) else ""
  2167. if not expense_name:
  2168. continue
  2169. # 判断是否为"其中:"开头的行(这类行序号通常为空,但需要保留)
  2170. is_sub_item = expense_name.startswith("其中:") or expense_name.startswith("其中:")
  2171. # 跳过序号为空或无效的行(但"其中:"行除外)
  2172. if (not no_str or pd.isna(no_val)) and not is_sub_item:
  2173. continue
  2174. if any(kw in expense_name for kw in ["合计", "总计", "小计"]):
  2175. continue
  2176. # 判断层级(传入费用名称用于判断"其中:")
  2177. level = determine_level(no_str, expense_name)
  2178. # 去除"其中:"前缀
  2179. clean_expense_name = expense_name
  2180. if expense_name.startswith("其中:"):
  2181. clean_expense_name = expense_name[3:]
  2182. elif expense_name.startswith("其中:"):
  2183. clean_expense_name = expense_name[3:]
  2184. cost = parse_number(cost_val)
  2185. result.append({
  2186. "No": no if no is not None else idx + 1,
  2187. "Level": level,
  2188. "name": project_name,
  2189. "projectOrExpenseName": clean_expense_name,
  2190. "cost": cost,
  2191. })
  2192. logger.info(f"[表格解析] 解析完成: 工程名称={project_name}, 共 {len(result)} 条数据")
  2193. return result
  2194. def _group_items_by_name(items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
  2195. """
  2196. 将平铺的项目列表按 name 字段分组,并按 Level 嵌套:
  2197. - Level 1 的项作为大类
  2198. - Level 2 的项作为大类的子项(放入 items 中)
  2199. 输入:
  2200. [
  2201. {"No": "一", "Level": 1, "name": "工程A", "projectOrExpenseName": "主辅生产工程", ...},
  2202. {"No": "(一)", "Level": 2, "name": "工程A", "projectOrExpenseName": "主要生产工程", ...},
  2203. {"No": "(二)", "Level": 2, "name": "工程A", "projectOrExpenseName": "辅助生产工程", ...},
  2204. {"No": "二", "Level": 1, "name": "工程A", "projectOrExpenseName": "其他费用", ...},
  2205. {"No": "", "Level": 2, "name": "工程A", "projectOrExpenseName": "其中:建设场地征用", ...},
  2206. ]
  2207. 输出:
  2208. [
  2209. {
  2210. "name": "工程A",
  2211. "items": [
  2212. {
  2213. "No": "一", "Level": 1, "projectOrExpenseName": "主辅生产工程", ...,
  2214. "items": [
  2215. {"No": "(一)", "Level": 2, "projectOrExpenseName": "主要生产工程", ...},
  2216. {"No": "(二)", "Level": 2, "projectOrExpenseName": "辅助生产工程", ...},
  2217. ]
  2218. },
  2219. {
  2220. "No": "二", "Level": 1, "projectOrExpenseName": "其他费用", ...,
  2221. "items": [
  2222. {"No": "", "Level": 2, "projectOrExpenseName": "其中:建设场地征用", ...},
  2223. ]
  2224. }
  2225. ]
  2226. }
  2227. ]
  2228. """
  2229. if not items:
  2230. return []
  2231. from collections import OrderedDict
  2232. # 第一步:按工程名称(name)分组
  2233. grouped_by_name: OrderedDict[str, List[Dict[str, Any]]] = OrderedDict()
  2234. for item in items:
  2235. name = item.get("name", "未知工程")
  2236. if name not in grouped_by_name:
  2237. grouped_by_name[name] = []
  2238. # 复制 item 并移除 name 字段
  2239. item_copy = {k: v for k, v in item.items() if k != "name"}
  2240. grouped_by_name[name].append(item_copy)
  2241. # 第二步:在每个工程组内,按 Level 建立父子关系
  2242. result = []
  2243. for name, group_items in grouped_by_name.items():
  2244. nested_items = []
  2245. current_parent = None
  2246. for item in group_items:
  2247. level = item.get("Level", 1)
  2248. if level == 1:
  2249. # Level 1 是大类,创建新的父项
  2250. item_with_children = dict(item)
  2251. item_with_children["items"] = []
  2252. nested_items.append(item_with_children)
  2253. current_parent = item_with_children
  2254. elif level == 2:
  2255. # Level 2 是子项,放入当前父项的 items 中
  2256. if current_parent is not None:
  2257. # 移除 Level 字段(子项统一在父项下,不需要重复标识)
  2258. child_item = {k: v for k, v in item.items()}
  2259. current_parent["items"].append(child_item)
  2260. else:
  2261. # 没有父项,作为独立项处理
  2262. nested_items.append(item)
  2263. else:
  2264. # 其他 Level,作为独立项
  2265. nested_items.append(item)
  2266. result.append({
  2267. "name": name,
  2268. "items": nested_items
  2269. })
  2270. return result
  2271. def parse_final_account_table(df: pd.DataFrame, project_name: str, project_no: int) -> List[Dict[str, Any]]:
  2272. """
  2273. 解析 finalAccount 类型的单项工程投资完成情况表格。
  2274. 表格结构:
  2275. 费用项目 | 概算金额 | 决算金额(审定金额-不含税) | 增值税额 | 超(-)节(+)支金额 | 超(-)节(+)支率
  2276. Args:
  2277. df: 表格 DataFrame
  2278. project_name: 项目名称(从标题提取,如"周村220kV输变电工程变电站新建工程")
  2279. project_no: 项目序号(如1、2、3、4)
  2280. 返回格式:
  2281. [{
  2282. "No": int, # 序号(项目序号)
  2283. "name": str, # 项目名称(审计内容)
  2284. "feeName": str, # 费用项目
  2285. "estimatedCost": str, # 概算金额
  2286. "approvedFinalAccountExcludingVat": str, # 决算金额审定不含税
  2287. "vatAmount": str, # 增值税额
  2288. "costVariance": str, # 超节支金额
  2289. "varianceRate": str, # 超节支率
  2290. }, ...]
  2291. """
  2292. if df.empty:
  2293. return []
  2294. def parse_number(value: Any) -> str:
  2295. """解析数字,返回字符串保留原始精度"""
  2296. if pd.isna(value):
  2297. return "0"
  2298. value_str = str(value).strip()
  2299. # 移除非数字字符(保留负号和小数点)
  2300. value_str = re.sub(r'[^\d.\-]', '', value_str)
  2301. if not value_str or value_str == '-':
  2302. return "0"
  2303. return value_str
  2304. def parse_rate(value: Any) -> str:
  2305. """解析百分比,返回字符串"""
  2306. if pd.isna(value):
  2307. return "0%"
  2308. value_str = str(value).strip()
  2309. # 移除非数字字符(保留负号、小数点和百分号)
  2310. if '%' not in value_str:
  2311. # 提取数字部分并添加百分号
  2312. num_str = re.sub(r'[^\d.\-]', '', value_str)
  2313. if num_str and num_str != '-':
  2314. return f"{num_str}%"
  2315. return "0%"
  2316. return value_str
  2317. # 识别表头行
  2318. header_row_idx = None
  2319. for i in range(min(5, len(df))):
  2320. row_text = " ".join(df.iloc[i].astype(str).str.strip().tolist())
  2321. row_text_no_space = row_text.replace(" ", "")
  2322. # 匹配表头特征
  2323. if "费用项目" in row_text_no_space and "概算金额" in row_text_no_space:
  2324. header_row_idx = i
  2325. break
  2326. elif "概算金额" in row_text_no_space and "决算金额" in row_text_no_space:
  2327. header_row_idx = i
  2328. break
  2329. if header_row_idx is None:
  2330. # 检查是否有多行表头(如"决算金额"跨行)
  2331. for i in range(min(3, len(df))):
  2332. row_text = " ".join(df.iloc[i].astype(str).str.strip().tolist())
  2333. if "1" in row_text and "2" in row_text and "3" in row_text:
  2334. # 这是列序号行,表头在前面
  2335. header_row_idx = i
  2336. break
  2337. if header_row_idx is None:
  2338. header_row_idx = 0
  2339. logger.warning(f"[表格解析] 未找到明确表头,使用第一行作为表头")
  2340. # 从表头行识别列索引
  2341. header_row = df.iloc[header_row_idx].astype(str).str.strip()
  2342. col_fee_name = None # 费用项目
  2343. col_estimated = None # 概算金额
  2344. col_approved = None # 审定金额(不含税)
  2345. col_vat = None # 增值税额
  2346. col_variance = None # 超节支金额
  2347. col_variance_rate = None # 超节支率
  2348. for idx, cell in enumerate(header_row):
  2349. cell_no_space = cell.replace(" ", "")
  2350. if "费用项目" in cell_no_space:
  2351. col_fee_name = idx
  2352. elif "概算金额" in cell_no_space or cell_no_space == "1":
  2353. col_estimated = idx
  2354. elif "审定金额" in cell_no_space or "不含税" in cell_no_space or cell_no_space == "2":
  2355. col_approved = idx
  2356. elif "增值税额" in cell_no_space or "增值税" in cell_no_space or cell_no_space == "3":
  2357. col_vat = idx
  2358. elif "超" in cell_no_space and "节" in cell_no_space and "金额" in cell_no_space:
  2359. col_variance = idx
  2360. elif cell_no_space == "4=1-2-3":
  2361. col_variance = idx
  2362. elif "超" in cell_no_space and "节" in cell_no_space and "率" in cell_no_space:
  2363. col_variance_rate = idx
  2364. elif cell_no_space == "5=4/1":
  2365. col_variance_rate = idx
  2366. # 如果无法识别列,使用默认索引(根据OCR输出的表格结构)
  2367. if col_fee_name is None:
  2368. col_fee_name = 0
  2369. if col_estimated is None:
  2370. col_estimated = 1
  2371. if col_approved is None:
  2372. col_approved = 2
  2373. if col_vat is None:
  2374. col_vat = 3
  2375. if col_variance is None:
  2376. col_variance = 4
  2377. if col_variance_rate is None:
  2378. col_variance_rate = 5
  2379. logger.info(f"[表格解析] 列索引: 费用项目={col_fee_name}, 概算金额={col_estimated}, "
  2380. f"审定不含税={col_approved}, 增值税={col_vat}, 超节支金额={col_variance}, 超节支率={col_variance_rate}")
  2381. # 跳过表头行(可能有多行表头)
  2382. # 找到数据开始行(第一个包含"建筑安装"或"设备购置"的行)
  2383. data_start_idx = header_row_idx + 1
  2384. for i in range(header_row_idx + 1, min(header_row_idx + 5, len(df))):
  2385. row_text = " ".join(df.iloc[i].astype(str).str.strip().tolist())
  2386. if "建筑安装" in row_text or "设备购置" in row_text or "其他费用" in row_text:
  2387. data_start_idx = i
  2388. break
  2389. # 跳过列序号行(如 "1 | 2 | 3 | 4=1-2-3 | 5=4/1")
  2390. if re.match(r'^[\d\s=\-/]+$', row_text.replace(" ", "")):
  2391. continue
  2392. # 从数据行开始解析
  2393. data_rows = df.iloc[data_start_idx:].reset_index(drop=True)
  2394. result = []
  2395. for idx, row in data_rows.iterrows():
  2396. if row.isna().all():
  2397. continue
  2398. fee_name = row.iloc[col_fee_name] if col_fee_name < len(row) else None
  2399. estimated = row.iloc[col_estimated] if col_estimated < len(row) else None
  2400. approved = row.iloc[col_approved] if col_approved < len(row) else None
  2401. vat = row.iloc[col_vat] if col_vat < len(row) else None
  2402. variance = row.iloc[col_variance] if col_variance < len(row) else None
  2403. variance_rate = row.iloc[col_variance_rate] if col_variance_rate < len(row) else None
  2404. # 解析费用项目名称
  2405. fee_name_str = str(fee_name).strip() if fee_name is not None and not pd.isna(fee_name) else ""
  2406. if not fee_name_str:
  2407. continue
  2408. # 跳过合计行
  2409. if any(kw in fee_name_str for kw in ["合计", "总计", "小计"]):
  2410. continue
  2411. # 只保留主要费用项目:建筑安装工程、设备购置、其他费用
  2412. valid_fee_names = ["建筑安装工程", "建筑安装", "设备购置", "其他费用"]
  2413. is_valid = any(kw in fee_name_str for kw in valid_fee_names)
  2414. if not is_valid:
  2415. continue
  2416. result.append({
  2417. "No": project_no,
  2418. "name": project_name,
  2419. "feeName": fee_name_str,
  2420. "estimatedCost": parse_number(estimated),
  2421. "approvedFinalAccountExcludingVat": parse_number(approved),
  2422. "vatAmount": parse_number(vat),
  2423. "costVariance": parse_number(variance),
  2424. "varianceRate": parse_rate(variance_rate),
  2425. })
  2426. logger.info(f"[表格解析] 解析完成: 项目名称={project_name}, 共 {len(result)} 条数据")
  2427. return result
  2428. def parse_settlement_report_tables(
  2429. merged_tables: List[Tuple[int, pd.DataFrame, str, int]]
  2430. ) -> Dict[str, List[Dict[str, Any]]]:
  2431. """
  2432. 解析 settlementReport 类型的所有表格,按表名组织返回。
  2433. 返回格式:
  2434. {
  2435. "审定结算汇总表": [...],
  2436. "合同执行情况": [],
  2437. "赔偿合同": [],
  2438. "物资采购合同1": [],
  2439. "物资采购合同2": [],
  2440. "其他服务类合同": [],
  2441. }
  2442. """
  2443. result = {
  2444. "审定结算汇总表": [],
  2445. "合同执行情况": [],
  2446. "赔偿合同": [],
  2447. "物资采购合同1": [],
  2448. "物资采购合同2": [],
  2449. "其他服务类合同": [],
  2450. }
  2451. for orig_idx, df, rule_name, page in merged_tables:
  2452. try:
  2453. logger.info(f"[表格解析] 开始解析表格: {rule_name} (页面 {page}, 行数: {len(df)})")
  2454. if rule_name == "审定结算汇总表":
  2455. parsed_data = parse_settlement_summary_table(df)
  2456. if parsed_data:
  2457. result[rule_name] = parsed_data
  2458. logger.info(f"[表格解析] {rule_name}: 解析成功,共 {len(parsed_data)} 条数据")
  2459. elif rule_name == "合同执行情况":
  2460. parsed_data = parse_contract_execution_table(df)
  2461. if parsed_data:
  2462. result[rule_name] = parsed_data
  2463. logger.info(f"[表格解析] {rule_name}: 解析成功,共 {len(parsed_data)} 条数据")
  2464. elif rule_name == "赔偿合同":
  2465. parsed_data = parse_compensation_contract_table(df)
  2466. if parsed_data:
  2467. result[rule_name] = parsed_data
  2468. logger.info(f"[表格解析] {rule_name}: 解析成功,共 {len(parsed_data)} 条数据")
  2469. elif rule_name == "物资采购合同1":
  2470. parsed_data = parse_material_purchase_contract1_table(df)
  2471. if parsed_data:
  2472. result[rule_name] = parsed_data
  2473. logger.info(f"[表格解析] {rule_name}: 解析成功,共 {len(parsed_data)} 条数据")
  2474. elif rule_name == "物资采购合同2":
  2475. parsed_data = parse_material_purchase_contract2_table(df)
  2476. if parsed_data:
  2477. result[rule_name] = parsed_data
  2478. logger.info(f"[表格解析] {rule_name}: 解析成功,共 {len(parsed_data)} 条数据")
  2479. elif rule_name == "其他服务类合同":
  2480. parsed_data = parse_other_service_contract_table(df)
  2481. if parsed_data:
  2482. result[rule_name] = parsed_data
  2483. logger.info(f"[表格解析] {rule_name}: 解析成功,共 {len(parsed_data)} 条数据")
  2484. else:
  2485. logger.warning(f"[表格解析] 未知的表格类型: {rule_name}")
  2486. except Exception as e:
  2487. # 如果解析失败,记录错误但不影响其他表格
  2488. logger.warning(f"[表格解析] 解析 {rule_name} 表格失败: {e}", exc_info=True)
  2489. return result
def extract_and_filter_tables_for_pdf(
    pdf_path: str,
    base_output_dir: str,
    doc_type: Literal["settlementReport", "designReview", "finalAccount"],
) -> Dict[str, Any]:
    """
    Extract all tables from the given PDF, merge cross-page tables, filter the
    ones matching the doc-type's header rules, and persist every stage to disk.

    Returns:
        {
            "tables_root": str,
            "extracted_dir": str,
            "merged_dir": str,
            "filtered_dir": str,
            "all_tables": [
                {"page": int, "index_on_page": int, "excel_path": str}
            ],
            "merged_tables": [
                {"page": int, "index_on_page": int, "excel_path": str}
            ],
            "filtered_tables": [
                {"page": int, "index_on_page": int, "rule_name": str, "excel_path": str}
            ],
        }
        Plus, when parsing ran, "parsed_data" and "parsed_data_json".

    NOTE(review): there is no parsing branch for doc_type == "finalAccount",
    so for that type parsed_data stays None and no "parsed_data" key is added
    — confirm this is intentional.
    """
    pdf_path_obj = Path(pdf_path)
    base_output = Path(base_output_dir)
    # Three on-disk stages: raw extraction, cross-page merge, rule-filtered.
    tables_root = base_output / "tables"
    extracted_dir = tables_root / "extracted_tables"
    merged_dir = tables_root / "merged_tables"
    filtered_dir = tables_root / "filtered_tables"
    extracted_dir.mkdir(parents=True, exist_ok=True)
    merged_dir.mkdir(parents=True, exist_ok=True)
    filtered_dir.mkdir(parents=True, exist_ok=True)
    # Log immediately so we can confirm the logging system works.
    logger.info(f"[表格提取] ========== 开始处理 PDF ==========")
    logger.info(f"[表格提取] PDF 路径: {pdf_path}")
    logger.info(f"[表格提取] 文档类型: {doc_type}")
    logger.info(f"[表格提取] PDF 文件存在: {Path(pdf_path).exists()}")
    if Path(pdf_path).exists():
        logger.info(f"[表格提取] PDF 文件大小: {Path(pdf_path).stat().st_size} bytes")
    logger.info(f"[表格提取] 输出目录: {base_output_dir}")
    # Step 1: extract every table from the PDF with pdfplumber (no page limit).
    # Title extraction is enabled only for designReview, where titles are
    # needed to tell multiple estimate tables apart.
    extract_titles = (doc_type == "designReview")
    logger.info("[表格提取] 步骤1: 使用 pdfplumber 提取所有表格...")
    tables_data = extract_tables_with_pdfplumber(str(pdf_path_obj), pages="all", extract_titles=extract_titles)
    logger.info(f"[表格提取] 步骤1完成: 共提取到 {len(tables_data)} 个表格")
    # Step 2: save every raw table as xlsx, named table_page{page}_{index}.xlsx.
    logger.info("[表格提取] 步骤2: 保存所有原始表格到 extracted_tables...")
    page_table_count: Dict[int, int] = {}
    all_tables_meta: List[Dict[str, Any]] = []
    # Table-title map used later during parsing.
    table_titles: Dict[int, str] = {}  # orig_idx -> title
    # Give every table a global index to support later merging/dedup.
    all_tables: List[Tuple[int, pd.DataFrame, int]] = []  # (orig_idx, df, page)
    for orig_idx, (page, df, _bbox, title) in enumerate(tables_data):
        # Remember the title (if pdfplumber found one above the table).
        if title:
            table_titles[orig_idx] = title
        page_table_count[page] = page_table_count.get(page, 0) + 1
        idx_on_page = page_table_count[page]
        excel_path = extracted_dir / f"table_page{page}_{idx_on_page}.xlsx"
        df.to_excel(str(excel_path), index=False, header=False)
        all_tables_meta.append(
            {
                "page": page,
                "index_on_page": idx_on_page,
                "excel_path": str(excel_path),
            }
        )
        all_tables.append((orig_idx, df.copy(), page))
    logger.info(f"[表格提取] 步骤2完成: 已保存 {len(all_tables)} 个原始表格")
    # Step 3: merge all cross-page tables (no filtering yet).
    logger.info("[表格提取] 步骤3: 合并跨页表格...")
    merged_all_tables = _merge_all_tables(all_tables)
    logger.info(f"[表格提取] 步骤3完成: 从 {len(all_tables)} 个原始表格合并为 {len(merged_all_tables)} 个表格")
    merged_page_table_count: Dict[int, int] = {}
    merged_meta: List[Dict[str, Any]] = []
    logger.info("[表格提取] 步骤3.1: 保存合并后的表格到 merged_tables...")
    for merged_idx, (orig_idx, df, page) in enumerate(merged_all_tables):
        merged_page_table_count[page] = merged_page_table_count.get(page, 0) + 1
        idx_on_page = merged_page_table_count[page]
        # Files are numbered globally (table_{n}.xlsx); idx_on_page is kept
        # only for the returned metadata.
        excel_path = merged_dir / f"table_{merged_idx + 1}.xlsx"
        df.to_excel(str(excel_path), index=False, header=False)
        merged_meta.append(
            {
                "page": page,
                "index_on_page": idx_on_page,
                "excel_path": str(excel_path),
            }
        )
    logger.info(f"[表格提取] 步骤3.1完成: 已保存 {len(merged_meta)} 个合并后的表格")
    # Step 4: pick the header rules for this doc_type.
    logger.info(f"[表格提取] 步骤4: 加载表头规则,文档类型: {doc_type}")
    header_rules = TABLE_TYPE_RULES.get(doc_type, [])
    logger.info(f"[表格提取] 步骤4完成: 找到 {len(header_rules)} 个表头规则")
    # No rules: return early, keeping extracted_tables and merged_tables on disk.
    if not header_rules:
        logger.warning("[表格提取] 未找到表头规则,跳过筛选步骤")
        return {
            "tables_root": str(tables_root),
            "extracted_dir": str(extracted_dir),
            "merged_dir": str(merged_dir),
            "filtered_dir": str(filtered_dir),
            "all_tables": all_tables_meta,
            "merged_tables": merged_meta,
            "filtered_tables": [],
        }
    # Step 5: keep only merged tables whose header matches a rule.
    logger.info("[表格提取] 步骤5: 从合并后的表格中筛选匹配规则的表格...")
    # Tuples now carry the title too: (orig_idx, df, rule_name, page, title).
    matched_for_merge: List[Tuple[int, pd.DataFrame, str, int, str]] = []
    for merged_idx, (orig_idx, df, page) in enumerate(merged_all_tables):
        rule_name: Optional[str] = None
        for rule in header_rules:
            is_match, rn = check_table_header(df, rule)
            if is_match:
                rule_name = rn
                # Fetch this table's title for logging.
                title = table_titles.get(orig_idx, "")
                logger.info(f"[表格提取] 表格 {merged_idx + 1} (页面 {page}) 匹配规则: {rule_name}, 标题: {title}")
                break
        if rule_name:
            title = table_titles.get(orig_idx, "")
            matched_for_merge.append((orig_idx, df.copy(), rule_name, page, title))
    logger.info(f"[表格提取] 步骤5完成: 共匹配到 {len(matched_for_merge)} 个表格")
    # No match: return early, keeping extracted_tables and merged_tables.
    if not matched_for_merge:
        logger.warning("[表格提取] 未匹配到任何表格,跳过后续处理")
        return {
            "tables_root": str(tables_root),
            "extracted_dir": str(extracted_dir),
            "merged_dir": str(merged_dir),
            "filtered_dir": str(filtered_dir),
            "all_tables": all_tables_meta,
            "merged_tables": merged_meta,
            "filtered_tables": [],
        }
    # Step 6: one more cross-page merge pass on the matched tables
    # (handles edge cases that only appear after rule matching).
    logger.info("[表格提取] 步骤6: 对已匹配规则的表格进行跨页合并...")
    merged_tables = _merge_cross_page_tables(matched_for_merge, header_rules)
    logger.info(f"[表格提取] 步骤6完成: 跨页合并后剩余 {len(merged_tables)} 个表格")
    # Step 7: save the filtered+merged tables, named by page + per-page index.
    logger.info("[表格提取] 步骤7: 保存筛选+合并后的表格到 filtered_tables...")
    filtered_page_table_count: Dict[int, int] = {}
    filtered_meta: List[Dict[str, Any]] = []
    parsed_data = None
    for orig_idx, df, rule_name, page, title in merged_tables:
        filtered_page_table_count[page] = filtered_page_table_count.get(page, 0) + 1
        idx_on_page = filtered_page_table_count[page]
        excel_path = filtered_dir / f"table_page{page}_{idx_on_page}.xlsx"
        df.to_excel(str(excel_path), index=False, header=False)
        filtered_meta.append(
            {
                "page": page,
                "index_on_page": idx_on_page,
                "rule_name": rule_name,
                "title": title,
                "excel_path": str(excel_path),
            }
        )
    logger.info(f"[表格提取] 步骤7完成: 已保存 {len(filtered_meta)} 个筛选后的表格")
    # Step 8: parse the table data according to the document type.
    logger.info(f"[表格提取] 步骤8: 解析表格数据,文档类型: {doc_type}...")
    logger.info(f"[表格提取] 待解析的表格列表: {[(rule_name, page, title) for _, _, rule_name, page, title in merged_tables]}")
    if doc_type == "designReview":
        # designReview returns a settlementReport-like structure grouped by rule:
        # - 初设评审的概算投资 (rule 1): nested structure
        # - 初设评审的概算投资明细 (rule 2): flat rows, possibly many tables
        # - 初设评审的概算投资费用 (rule 3): flat rows, possibly many tables
        parsed_data = {
            "初设评审的概算投资": [],
            "初设评审的概算投资明细": [],
            "初设评审的概算投资费用": [],
        }
        for orig_idx, df, rule_name, page, title in merged_tables:
            if rule_name == "初设评审的概算投资":
                # Rule 1: nested summary table (last match wins, not extended).
                try:
                    logger.info(f"[表格提取] 解析 designReview 表格(规则1): {rule_name} (页面 {page}, 行数: {len(df)})")
                    summary_data = parse_design_review_table(df)
                    if summary_data:
                        parsed_data["初设评审的概算投资"] = summary_data
                        logger.info(f"[表格提取] 解析完成: 共 {len(summary_data)} 条数据")
                except Exception as e:
                    logger.warning(f"[表格提取] 解析 designReview 表格(规则1)失败: {e}", exc_info=True)
            elif rule_name == "初设评审的概算投资明细":
                # Rule 2: the table title doubles as the project name.
                try:
                    logger.info(f"[表格提取] 解析 designReview 表格(规则2): {rule_name} (页面 {page}, 标题: {title}, 行数: {len(df)})")
                    detail_data = parse_design_review_detail_table(df, title)
                    if detail_data:
                        parsed_data["初设评审的概算投资明细"].extend(detail_data)
                        logger.info(f"[表格提取] 解析完成: 共 {len(detail_data)} 条数据")
                except Exception as e:
                    logger.warning(f"[表格提取] 解析 designReview 表格(规则2)失败: {e}", exc_info=True)
            elif rule_name == "初设评审的概算投资费用":
                # Rule 3: the table title doubles as the project name.
                try:
                    logger.info(f"[表格提取] 解析 designReview 表格(规则3): {rule_name} (页面 {page}, 标题: {title}, 行数: {len(df)})")
                    cost_data = parse_design_review_cost_table(df, title)
                    if cost_data:
                        parsed_data["初设评审的概算投资费用"].extend(cost_data)
                        logger.info(f"[表格提取] 解析完成: 共 {len(cost_data)} 条数据")
                except Exception as e:
                    logger.warning(f"[表格提取] 解析 designReview 表格(规则3)失败: {e}", exc_info=True)
        # Regroup the flat rule-2/rule-3 rows into per-project nested structures.
        for rule_key in ["初设评审的概算投资明细", "初设评审的概算投资费用"]:
            if rule_key in parsed_data and parsed_data[rule_key]:
                flat_items = parsed_data[rule_key]
                grouped = _group_items_by_name(flat_items)
                parsed_data[rule_key] = grouped
        # Log a summary of what was parsed.
        logger.info(f"[表格提取] 解析结果统计:")
        total_records = 0
        for table_type, table_data in parsed_data.items():
            if table_data:
                if isinstance(table_data, list):
                    # Nested structures count their child "items"; flat lists count rows.
                    record_count = sum(len(item.get("items", [])) for item in table_data) if table_data and isinstance(table_data[0], dict) and "items" in table_data[0] else len(table_data)
                else:
                    record_count = len(table_data)
                total_records += record_count
                logger.info(f"[表格提取] - {table_type}: {len(table_data)} 个工程,共 {record_count} 条明细")
            else:
                logger.info(f"[表格提取] - {table_type}: 未匹配到数据")
        logger.info(f"[表格提取] 总计: {total_records} 条数据")
    elif doc_type == "settlementReport":
        # settlementReport: parse every matched table, organized by table name.
        try:
            logger.info(f"[表格提取] 解析 settlementReport 表格,共 {len(merged_tables)} 个表格")
            # Drop the title to match the 4-tuple signature expected by
            # parse_settlement_report_tables.
            tables_4tuple = [(orig_idx, df, rule_name, page) for orig_idx, df, rule_name, page, title in merged_tables]
            parsed_data = parse_settlement_report_tables(tables_4tuple)
            # Log a per-table summary of the parse results.
            logger.info(f"[表格提取] 解析结果统计:")
            total_records = 0
            for table_name, table_data in parsed_data.items():
                if table_data:
                    record_count = len(table_data)
                    total_records += record_count
                    logger.info(f"[表格提取] - {table_name}: {record_count} 条数据")
                else:
                    logger.info(f"[表格提取] - {table_name}: 未匹配到数据")
            logger.info(f"[表格提取] 总计: {total_records} 条数据")
        except Exception as e:
            logger.warning(f"[表格提取] 解析 settlementReport 表格失败: {e}", exc_info=True)
    logger.info("[表格提取] 步骤8完成: 表格数据解析完成")
    result = {
        "tables_root": str(tables_root),
        "extracted_dir": str(extracted_dir),
        "merged_dir": str(merged_dir),
        "filtered_dir": str(filtered_dir),
        "all_tables": all_tables_meta,
        "merged_tables": merged_meta,
        "filtered_tables": filtered_meta,
    }
    # Attach the parsed JSON payload when any parsing branch ran.
    if parsed_data is not None:
        result["parsed_data"] = parsed_data
        result["parsed_data_json"] = json.dumps(parsed_data, ensure_ascii=False, indent=2)
        logger.info("[表格提取] JSON 数据已生成")
    logger.info(f"[表格提取] 处理完成: 原始表格 {len(all_tables_meta)} 个, 合并后 {len(merged_meta)} 个, 筛选后 {len(filtered_meta)} 个")
    return result