Pārlūkot izejas kodu

feat(电磁): 首列为空且第二列为地名时合并到上一条监测地点

Co-authored-by: Cursor <cursoragent@cursor.com>
何文松 2 nedēļas atpakaļ
vecāks
revīzija
bd336ace01
1 mainītis faili ar 25 papildinājumiem un 1 dzēšanām
  1. 25 1
      pdf_converter_v2/parser/electromagnetic_parser.py

+ 25 - 1
pdf_converter_v2/parser/electromagnetic_parser.py

@@ -316,9 +316,33 @@ def parse_electromagnetic_detection_record(markdown_content: str) -> Electromagn
     
     # 使用集合跟踪已添加的测点编号,避免重复添加(处理跨页重复的情况)
     seen_codes = set()
-    
+
+    def get_address_continuation(row: List[str]) -> str:
+        """Return the first place-name-like cell of a row, for merging into the previous record.
+
+        Intended for rows whose first column is empty. Only columns 1 through 3
+        (at most) are inspected; column 0 is never looked at.
+
+        Args:
+            row: Cells of one table row; a cell may be falsy (e.g. None or "").
+
+        Returns:
+            The stripped text of the first non-empty cell that contains at least
+            one character in U+4E00-U+9FA5 and does not consist solely of digits,
+            dots, hyphens, colons and whitespace; "" when no such cell exists.
+        """
+        for i in range(1, min(4, len(row))):
+            cell = (row[i] or "").strip()
+            if not cell:
+                continue
+            # Looks like a place name: contains Chinese characters and is not a
+            # purely numeric / time-like value.
+            if re.search(r"[\u4e00-\u9fa5]", cell) and not re.match(r"^[\d.\-:\s]+$", cell):
+                return cell
+        return ""
+
     for table in tables:
         for row in table:
+            # 首列为空且第二列(或其后)有地名类内容:视为上一条的监测地点续行,合并到上一条
+            first_cell = (row[0] or "").strip() if len(row) > 0 else ""
+            if not first_cell and record.electricMagnetic:
+                continuation = get_address_continuation(row)
+                if continuation:
+                    last_em = record.electricMagnetic[-1]
+                    last_em.address = (last_em.address or "").strip()
+                    if last_em.address:
+                        last_em.address = last_em.address + " " + continuation
+                    else:
+                        last_em.address = continuation
+                    logger.debug(f"[电磁检测] 监测地点续行合并: ... + '{continuation}' -> {last_em.address}")
+                    continue
             if is_valid_data_row(row):
                 code = row[0].strip() if row[0] else ""