|
|
@@ -317,23 +317,31 @@ def parse_electromagnetic_detection_record(markdown_content: str) -> Electromagn
|
|
|
# 使用集合跟踪已添加的测点编号,避免重复添加(处理跨页重复的情况)
|
|
|
seen_codes = set()
|
|
|
|
|
|
- def get_address_continuation(row: List[str]) -> str:
|
|
|
- """首列为空时,取第一个像地名的非空格(与上一条合并用)。"""
|
|
|
- for i in range(1, min(4, len(row))):
|
|
|
- cell = (row[i] or "").strip()
|
|
|
- if not cell:
|
|
|
- continue
|
|
|
- # 像地名:含中文,且不是纯数字/时间
|
|
|
- if re.search(r"[\u4e00-\u9fa5]", cell) and not re.match(r"^[\d.\-:\s]+$", cell):
|
|
|
- return cell
|
|
|
def get_address_continuation_strict(row: List[str]) -> str:
    """Return the address-continuation text carried by ``row``, or ``""``.

    A row is treated as a continuation of the previous record's address
    only when all of the following hold:

    * it has at least two cells;
    * the first cell is empty (``None`` or whitespace only);
    * the second cell is non-empty and contains at least one CJK ideograph
      in the range U+4E00-U+9FA5;
    * every cell after the second one is empty.

    Args:
        row: Cells of one table row; a cell may be ``None``.

    Returns:
        The stripped second cell when the row qualifies, otherwise ``""``.
    """
    if len(row) < 2:
        return ""

    first = (row[0] or "").strip()
    second = (row[1] or "").strip()
    if first or not second:
        return ""

    # Strict mode: any other populated cell disqualifies the row, so only a
    # row whose sole content is the second cell can be merged.
    if any((cell or "").strip() for cell in row[2:]):
        return ""

    # A string containing a CJK ideograph can never be made up solely of
    # digits, '.', '-', ':' and whitespace, so checking for the ideograph
    # alone also rules out pure number/time cells.
    if re.search(r"[\u4e00-\u9fa5]", second):
        return second
    return ""
|
|
|
|
|
|
for table in tables:
|
|
|
for row in table:
|
|
|
- # 首列为空且第二列(或其后)有地名类内容:视为上一条的监测地点续行,合并到上一条
|
|
|
+ # 仅当首列为空、除第二列外都为空且第二列为地名时:合并到上一条监测地点
|
|
|
first_cell = (row[0] or "").strip() if len(row) > 0 else ""
|
|
|
if not first_cell and record.electricMagnetic:
|
|
|
- continuation = get_address_continuation(row)
|
|
|
+ continuation = get_address_continuation_strict(row)
|
|
|
if continuation:
|
|
|
last_em = record.electricMagnetic[-1]
|
|
|
last_em.address = (last_em.address or "").strip()
|