|
|
@@ -180,25 +180,30 @@ def parse_operational_conditions(markdown_content: str, require_title: bool = Tr
|
|
|
logger.warning("[工况信息] 未能提取出任何表格内容")
|
|
|
return conditions
|
|
|
|
|
|
- # 查找工况信息表格(通常包含"检测时间"、"电压"、"电流"等关键词)
|
|
|
+ # 查找工况信息表格(通常包含"检测时间"/"时间"、"电压"/"U"、"电流"/"I"等关键词;支持两行表头)
|
|
|
for table in tables:
|
|
|
if not table or len(table) < 2:
|
|
|
continue
|
|
|
|
|
|
- # 检查表头是否包含工况信息的关键词
|
|
|
header_row = table[0]
|
|
|
- has_operational_keywords = any(
|
|
|
- keyword in " ".join(header_row)
|
|
|
- for keyword in ["检测时间", "电压", "电流", "有功功率", "无功功率", "项目"]
|
|
|
+ header_text = " ".join(header_row)
|
|
|
+ # 第一行表头:检测时间/电压/电流/项目 或 名称/时间/运行工况
|
|
|
+ has_row0 = any(
|
|
|
+ k in header_text for k in ["检测时间", "电压", "电流", "有功功率", "无功功率", "项目", "时间", "名称", "运行工况"]
|
|
|
)
|
|
|
-
|
|
|
+ # 两行表头时第二行常有 U(kV)、I(A)、P(MW)、Q(Mvar)
|
|
|
+ header_row2 = table[1] if len(table) > 1 else []
|
|
|
+ header2_text = " ".join(header_row2).lower()
|
|
|
+ has_row1 = any(
|
|
|
+ k in header2_text for k in ["u(", "i(", "p(", "q(", "电压", "电流", "有功", "无功", "kv", "mw", "mvar"]
|
|
|
+ )
|
|
|
+ has_operational_keywords = has_row0 or (len(header_row2) >= 4 and has_row1)
|
|
|
if not has_operational_keywords:
|
|
|
continue
|
|
|
|
|
|
logger.info(f"[工况信息] 找到工况信息表格,行数: {len(table)}")
|
|
|
|
|
|
- # 找到表头行的列索引
|
|
|
- header_row = table[0]
|
|
|
+ # 列索引:优先用第二行表头(U/I/P/Q),否则用第一行
|
|
|
monitor_at_idx = -1
|
|
|
project_idx = -1
|
|
|
name_idx = -1
|
|
|
@@ -207,38 +212,62 @@ def parse_operational_conditions(markdown_content: str, require_title: bool = Tr
|
|
|
active_power_idx = -1
|
|
|
reactive_power_idx = -1
|
|
|
|
|
|
+ # 若第二行表头存在且含 U/I/P/Q,用其确定电压/电流/功率列
|
|
|
+ if len(table) > 1 and has_row1:
|
|
|
+ row2 = table[1]
|
|
|
+ for idx, cell in enumerate(row2):
|
|
|
+ cell_n = normalize_text(cell)
|
|
|
+ if "u" in cell_n and ("kv" in cell_n or "k v" in cell_n or "电压" in cell):
|
|
|
+ voltage_idx = idx
|
|
|
+ elif "i" in cell_n and ("a)" in cell_n or "a )" in cell_n or "电流" in cell):
|
|
|
+ current_idx = idx
|
|
|
+ elif "p" in cell_n and ("mw" in cell_n or "m w" in cell_n or "有功" in cell):
|
|
|
+ active_power_idx = idx
|
|
|
+ elif "q" in cell_n and ("mvar" in cell_n or "无功" in cell):
|
|
|
+ reactive_power_idx = idx
|
|
|
+ elif ("时间" in cell or "检测时间" in cell or "监测时间" in cell) and monitor_at_idx == -1:
|
|
|
+ monitor_at_idx = idx
|
|
|
+ elif ("名称" in cell or "主变" in cell) and name_idx == -1:
|
|
|
+ name_idx = idx
|
|
|
+
|
|
|
+ # 用第一行表头补全未识别的列(名称、时间、项目等)
|
|
|
for idx, cell in enumerate(header_row):
|
|
|
cell_lower = cell.lower()
|
|
|
- if "检测时间" in cell or "监测时间" in cell:
|
|
|
+ if ("检测时间" in cell or "监测时间" in cell or "时间" in cell) and monitor_at_idx == -1:
|
|
|
monitor_at_idx = idx
|
|
|
elif "项目" in cell:
|
|
|
- # 项目列可能有colspan,需要找到实际的列
|
|
|
if project_idx == -1:
|
|
|
project_idx = idx
|
|
|
- # 检查下一列是否是名称列(如果项目列colspan=2,下一列可能是名称)
|
|
|
if idx + 1 < len(header_row) and name_idx == -1:
|
|
|
next_cell = header_row[idx + 1]
|
|
|
if not any(k in next_cell.lower() for k in ["电压", "电流", "有功", "无功", "检测"]):
|
|
|
name_idx = idx + 1
|
|
|
- elif "电压" in cell or "电压(kv)" in cell_lower:
|
|
|
+ elif "电压" in cell or "电压(kv)" in cell_lower and voltage_idx == -1:
|
|
|
voltage_idx = idx
|
|
|
- elif "电流" in cell or "电流(a)" in cell_lower:
|
|
|
+ elif "电流" in cell or "电流(a)" in cell_lower and current_idx == -1:
|
|
|
current_idx = idx
|
|
|
- elif "有功功率" in cell or ("有功" in cell and "功率" in cell):
|
|
|
+ elif ("有功功率" in cell or ("有功" in cell and "功率" in cell)) and active_power_idx == -1:
|
|
|
active_power_idx = idx
|
|
|
- elif "无功功率" in cell or ("无功" in cell and "功率" in cell):
|
|
|
+ elif ("无功功率" in cell or ("无功" in cell and "功率" in cell)) and reactive_power_idx == -1:
|
|
|
reactive_power_idx = idx
|
|
|
elif ("名称" in cell or "主变" in cell) and name_idx == -1:
|
|
|
name_idx = idx
|
|
|
|
|
|
+ # 默认名称列0、时间列1(常见两行表头:名称、时间、运行工况 | 名称、时间、U、I、P、Q)
|
|
|
+ if name_idx == -1 and len(header_row) > 0 and ("名称" in header_row[0] or not any("名称" in c for c in header_row2)):
|
|
|
+ name_idx = 0
|
|
|
+ if monitor_at_idx == -1 and len(header_row) > 1 and ("时间" in header_row[1] or (len(header_row2) > 1 and "时间" in header_row2[1])):
|
|
|
+ monitor_at_idx = 1
|
|
|
+
|
|
|
logger.debug(f"[工况信息] 列索引: 检测时间={monitor_at_idx}, 项目={project_idx}, 名称={name_idx}, "
|
|
|
f"电压={voltage_idx}, 电流={current_idx}, 有功功率={active_power_idx}, 无功功率={reactive_power_idx}")
|
|
|
|
|
|
- # 处理数据行(从第二行开始,第一行是表头)
|
|
|
+ # 数据行:两行表头时从第3行(索引2)开始,否则从第2行(索引1)开始
|
|
|
+ data_start = 2 if (len(table) > 1 and has_row1) else 1
|
|
|
current_monitor_at = ""
|
|
|
current_project = ""
|
|
|
|
|
|
- for row_idx in range(1, len(table)):
|
|
|
+ for row_idx in range(data_start, len(table)):
|
|
|
row = table[row_idx]
|
|
|
if len(row) < 4: # 至少需要检测时间、项目、名称等基本字段
|
|
|
continue
|