何文松 3 settimane fa
parent
commit
8d9644a993

+ 1 - 4
pdf_converter_v2/converter.py

@@ -77,10 +77,7 @@ async def _convert_with_paddle(
         "--save_path",
         save_path_base,
     ]
-    # NPU 下需加 --device npu:0,否则 PaddleOCR 走 CPU 易段错误
-    paddle_device = os.getenv("PADDLE_OCR_DEVICE", "").strip()
-    if paddle_device:
-        cmd.extend(["--device", paddle_device])
+    
     try:
         return_code, _, stderr = await _run_paddle_doc_parser(cmd)
         if return_code != 0:

+ 1 - 45
pdf_converter_v2/models/data_models.py

@@ -283,18 +283,9 @@ class FeasibilityApprovalInvestment:
     - Level 0: 顶层大类(如"山西晋城周村220千伏输变电工程")
     - Level 1: 二级分类(如"变电工程"、"线路工程"),有自己的 items
     - Level 2: 具体项目(如"周村220千伏变电站新建工程")
-    
-    项目信息(可选,用于 safetyFsApproval 类型):
-    - projectName: 工程(项目)名称
-    - projectUnit: 项目单位
-    - designUnit: 设计单位
     """
     def __init__(self):
         self.items: List[InvestmentItem] = []
-        # 项目基本信息(safetyFsApproval 专用)
-        self.projectName: Optional[str] = None
-        self.projectUnit: Optional[str] = None
-        self.designUnit: Optional[str] = None
     
     def to_dict(self):
         """转换为嵌套结构,与 designReview 保持一致
@@ -303,38 +294,14 @@ class FeasibilityApprovalInvestment:
         Level="2" 的项目作为二级分类(Level: 1),有自己的 items
         Level="3" 的项目作为具体项目(Level: 2),放入二级分类的 items
         Level="0" 的项目(合计)跳过
-        
-        特殊处理:如果表格没有 Level=1 的顶层大类(如湖北省格式),
-        自动创建一个虚拟顶层大类来包含所有 Level=2 的项目
         """
         if not self.items:
             return []
         
-        # 检查是否有 Level=1 的顶层大类
-        has_level_1 = any(item.level == "1" for item in self.items)
-        
         result = []
         current_top_category = None  # Level 0 顶层大类
         current_sub_category = None  # Level 1 二级分类
         
-        # 如果没有 Level=1 的顶层大类,创建一个虚拟的
-        if not has_level_1:
-            current_top_category = {
-                "name": "项目总表",
-                "Level": 0,
-                "constructionScaleSubstation": "",
-                "constructionScaleBay": "",
-                "constructionScaleOverheadLine": "",
-                "constructionScaleOpticalCable": "",
-                "staticInvestment": "",
-                "dynamicInvestment": "",
-                "constructionProjectCost": "",
-                "equipmentPurchaseCost": "",
-                "installationProjectCost": "",
-                "otherExpenses": "",
-                "items": []
-            }
-        
         for item in self.items:
             if item.level == "1":
                 # 顶层大类(如"山西晋城周村220千伏输变电工程")
@@ -414,18 +381,7 @@ class FeasibilityApprovalInvestment:
         if current_top_category is not None:
             result.append(current_top_category)
         
-        # 如果有项目信息,返回包含项目信息的字典;否则直接返回数据列表
-        if self.projectName or self.projectUnit or self.designUnit:
-            return {
-                "projectInfo": {
-                    "projectName": self.projectName or "",
-                    "projectUnit": self.projectUnit or "",
-                    "designUnit": self.designUnit or ""
-                },
-                "data": result
-            }
-        else:
-            return result
+        return result
     
     @staticmethod
     def _parse_number(value: str) -> str:

+ 2 - 28
pdf_converter_v2/paddleocr_fallback.py

@@ -129,34 +129,12 @@ def _decrement_service_count(lock_file: object) -> int:
         return 0
 
 
-def _systemd_available() -> bool:
-    """检测当前环境是否有 systemd(容器内通常无 systemd,无法操作 mineru-api.service)"""
-    try:
-        r = subprocess.run(
-            ["systemctl", "status", "--no-pager"],
-            capture_output=True,
-            text=True,
-            timeout=3,
-            check=False,
-        )
-        err = (r.stderr or "") + (r.stdout or "")
-        if "Failed to connect to bus" in err or "not been booted with systemd" in err:
-            return False
-        return True
-    except (FileNotFoundError, subprocess.TimeoutExpired, Exception):
-        return False
-
-
 def stop_mineru_service() -> bool:
     """停止mineru-api.service以释放GPU内存(线程安全)
-    容器内无 systemd 时直接返回 True,不报错。
     
     Returns:
-        True表示成功停止或已停止/无需操作,False表示失败
+        True表示成功停止或已停止,False表示失败
     """
-    if not _systemd_available():
-        logger.debug("[PaddleOCR] 无 systemd,跳过停止 mineru-api.service")
-        return True
     lock_file = _acquire_service_lock()
     if not lock_file:
         # 如果无法获取锁,等待一小段时间后检查服务状态
@@ -222,14 +200,10 @@ def stop_mineru_service() -> bool:
 
 def start_mineru_service() -> bool:
     """启动mineru-api.service(线程安全)
-    容器内无 systemd 时直接返回 True,不报错。
     
     Returns:
-        True表示成功启动或已启动/无需操作,False表示失败
+        True表示成功启动或已启动,False表示失败
     """
-    if not _systemd_available():
-        logger.debug("[PaddleOCR] 无 systemd,跳过启动 mineru-api.service")
-        return True
     lock_file = _acquire_service_lock()
     if not lock_file:
         # 如果无法获取锁,等待一小段时间后检查服务状态

+ 7 - 260
pdf_converter_v2/parser/investment_parser.py

@@ -224,11 +224,8 @@ def parse_feasibility_approval_investment(markdown_content: str) -> FeasibilityA
             table_text += " ".join([str(cell) for cell in row])
         # 移除空格后再匹配
         table_text_no_space = table_text.replace(" ", "")
-        # 选择包含"工程或费用名称"或"项目名称",且包含"静态投资"或"静态合计"的表格
-        has_name_col = ("工程或费用名称" in table_text_no_space or "项目名称" in table_text_no_space)
-        has_investment_col = ("静态投资" in table_text_no_space or "静态合计" in table_text_no_space)
-        
-        if has_name_col and has_investment_col:
+        # 选择包含"工程或费用名称"和"静态投资"的表格
+        if "工程或费用名称" in table_text_no_space and "静态投资" in table_text_no_space:
             all_matching_tables.append((table_idx, table))
             logger.info(f"[可研批复投资] 找到投资估算表格 (表格{table_idx+1}), 行数: {len(table)}")
     
@@ -298,7 +295,7 @@ def parse_feasibility_approval_investment(markdown_content: str) -> FeasibilityA
             
             if "序号" in cell_text and no_idx == -1:
                 no_idx = col_idx
-            elif ("工程或费用名称" in cell_text_no_space or "项目名称" in cell_text_no_space) and name_idx == -1:
+            elif ("工程或费用名称" in cell_text_no_space) and name_idx == -1:
                 name_idx = col_idx
             elif "架空线" in cell_text_no_space and overhead_line_idx == -1:
                 overhead_line_idx = col_idx
@@ -308,9 +305,9 @@ def parse_feasibility_approval_investment(markdown_content: str) -> FeasibilityA
                 substation_idx = col_idx
             elif "光缆" in cell_text and optical_cable_idx == -1:
                 optical_cable_idx = col_idx
-            elif ("静态投资" in cell_text_no_space or "静态合计" in cell_text_no_space) and static_investment_idx == -1:
+            elif "静态投资" in cell_text_no_space and static_investment_idx == -1:
                 static_investment_idx = col_idx
-            elif ("动态投资" in cell_text_no_space or "动态合计" in cell_text_no_space) and dynamic_investment_idx == -1:
+            elif "动态投资" in cell_text_no_space and dynamic_investment_idx == -1:
                 dynamic_investment_idx = col_idx
             # 新增费用字段识别
             elif "建筑工程费" in cell_text_no_space and construction_project_cost_idx == -1:
@@ -325,8 +322,8 @@ def parse_feasibility_approval_investment(markdown_content: str) -> FeasibilityA
                 if "其他费用" in cell_text_no_space:
                     other_expenses_idx = col_idx
         
-        # 如果这一行包含"序号"或"工程或费用名称"或"项目名称",记录为表头结束行
-        if ("序号" in row_text or "工程或费用名称" in row_text_no_space or "项目名称" in row_text_no_space) and header_row_idx == -1:
+        # 如果这一行包含"序号"或"工程或费用名称",记录为表头结束行
+        if ("序号" in row_text or "工程或费用名称" in row_text_no_space) and header_row_idx == -1:
             header_row_idx = row_idx
     
     # 表头结束行应该是最后一个包含表头内容的行
@@ -420,253 +417,6 @@ def parse_feasibility_approval_investment(markdown_content: str) -> FeasibilityA
     return record
 
 
-def parse_safety_feasibility_approval_investment(markdown_content: str) -> FeasibilityApprovalInvestment:
-    """
-    解析安全可研批复投资估算(湖北省格式)
-    
-    特点:
-    - 没有顶层大类(Level=1),直接从二级分类开始
-    - 中文序号(一、二)表示二级分类(如"变电工程"、"线路工程")
-    - 阿拉伯数字(1、2、3)表示具体项目
-    - 列名使用"项目名称"和"静态合计/动态合计"
-    
-    返回结构:
-    - Level 1: 二级分类(如"变电工程"、"线路工程")
-    - Level 2: 具体项目(如"襄阳连云220千伏变电站新建工程")
-    """
-    record = FeasibilityApprovalInvestment()
-    
-    tables = extract_table_with_rowspan_colspan(markdown_content)
-    
-    if not tables:
-        logger.warning("[安全可研批复投资] 未能提取出任何表格内容")
-        return record
-    
-    # 首先尝试提取项目基本信息表格
-    for table_idx, table in enumerate(tables):
-        if len(table) < 2:
-            continue
-        
-        table_text = ""
-        for row in table:
-            table_text += " ".join([str(cell) for cell in row])
-        table_text_no_space = table_text.replace(" ", "").replace("(", "(").replace(")", ")")
-        
-        # 查找包含"工程(项目)名称"的表格
-        if "工程(项目)名称" in table_text_no_space or "工程项目名称" in table_text_no_space:
-            logger.info(f"[安全可研批复投资] 找到项目信息表格 (表格{table_idx+1})")
-            
-            # 提取项目信息
-            for row in table:
-                if len(row) >= 2:
-                    key = str(row[0]).strip()
-                    value = str(row[1]).strip() if len(row) > 1 else ""
-                    
-                    if "工程" in key and "名称" in key:
-                        record.projectName = value
-                        logger.info(f"[安全可研批复投资] 提取工程名称: {value}")
-                    elif "项目单位" in key:
-                        record.projectUnit = value
-                        logger.info(f"[安全可研批复投资] 提取项目单位: {value}")
-                    elif "设计单位" in key:
-                        record.designUnit = value
-                        logger.info(f"[安全可研批复投资] 提取设计单位: {value}")
-            break
-    
-    # 找到所有投资估算表格并合并
-    all_matching_tables = []
-    for table_idx, table in enumerate(tables):
-        table_text = ""
-        for row in table:
-            table_text += " ".join([str(cell) for cell in row])
-        table_text_no_space = table_text.replace(" ", "")
-        
-        # 选择包含"项目名称"且包含"静态合计"或"静态投资"的表格
-        has_name_col = "项目名称" in table_text_no_space
-        has_investment_col = ("静态合计" in table_text_no_space or "静态投资" in table_text_no_space)
-        
-        if has_name_col and has_investment_col:
-            all_matching_tables.append((table_idx, table))
-            logger.info(f"[安全可研批复投资] 找到投资估算表格 (表格{table_idx+1}), 行数: {len(table)}")
-    
-    if not all_matching_tables:
-        logger.warning("[安全可研批复投资] 未找到包含投资估算的表格")
-        return record
-    
-    # 如果只有一个表格,直接使用
-    if len(all_matching_tables) == 1:
-        target_table = all_matching_tables[0][1]
-    else:
-        # 多个表格:合并所有表格的数据行
-        logger.info(f"[安全可研批复投资] 发现 {len(all_matching_tables)} 个投资估算表格,将进行合并")
-        target_table = []
-        first_table = True
-        for table_idx, table in all_matching_tables:
-            if first_table:
-                target_table.extend(table)
-                first_table = False
-            else:
-                # 跳过表头行
-                header_end_idx = 0
-                for row_idx, row in enumerate(table):
-                    row_text = " ".join([str(cell) for cell in row]).replace(" ", "")
-                    if "序号" in row_text or "项目名称" in row_text or "建设规模" in row_text:
-                        header_end_idx = row_idx + 1
-                    elif len(row) > 0:
-                        first_cell = str(row[0]).strip()
-                        if first_cell in ["一", "二", "三", "四", "五", "六", "七", "八", "九", "十"]:
-                            break
-                target_table.extend(table[header_end_idx:])
-                logger.debug(f"[安全可研批复投资] 表格{table_idx+1}: 跳过前{header_end_idx}行表头,添加{len(table)-header_end_idx}行数据")
-        
-        logger.info(f"[安全可研批复投资] 合并后总行数: {len(target_table)}")
-    
-    # 识别表头行和列索引
-    header_row_idx = -1
-    no_idx = -1
-    name_idx = -1
-    overhead_line_idx = -1
-    bay_idx = -1
-    substation_idx = -1
-    optical_cable_idx = -1
-    static_investment_idx = -1
-    dynamic_investment_idx = -1
-    construction_project_cost_idx = -1
-    equipment_purchase_cost_idx = -1
-    installation_project_cost_idx = -1
-    other_expenses_idx = -1
-    
-    # 扫描前几行识别列索引
-    for row_idx in range(min(5, len(target_table))):
-        row = target_table[row_idx]
-        row_text = " ".join([str(cell) for cell in row])
-        row_text_no_space = row_text.replace(" ", "")
-        
-        for col_idx, cell in enumerate(row):
-            cell_text = str(cell).strip()
-            cell_text_no_space = cell_text.replace(" ", "")
-            
-            if "序号" in cell_text and no_idx == -1:
-                no_idx = col_idx
-            elif "项目名称" in cell_text_no_space and name_idx == -1:
-                name_idx = col_idx
-            elif "架空线" in cell_text_no_space and overhead_line_idx == -1:
-                overhead_line_idx = col_idx
-            elif "间隔" in cell_text and bay_idx == -1:
-                bay_idx = col_idx
-            elif "变电" in cell_text and substation_idx == -1:
-                substation_idx = col_idx
-            elif "光缆" in cell_text and optical_cable_idx == -1:
-                optical_cable_idx = col_idx
-            elif ("静态投资" in cell_text_no_space or "静态合计" in cell_text_no_space) and static_investment_idx == -1:
-                static_investment_idx = col_idx
-            elif ("动态投资" in cell_text_no_space or "动态合计" in cell_text_no_space) and dynamic_investment_idx == -1:
-                dynamic_investment_idx = col_idx
-            elif "建筑工程费" in cell_text_no_space and construction_project_cost_idx == -1:
-                construction_project_cost_idx = col_idx
-            elif "设备购置费" in cell_text_no_space and equipment_purchase_cost_idx == -1:
-                equipment_purchase_cost_idx = col_idx
-            elif "安装工程费" in cell_text_no_space and installation_project_cost_idx == -1:
-                installation_project_cost_idx = col_idx
-            elif ("其他费用" in cell_text_no_space or "合计" == cell_text_no_space) and other_expenses_idx == -1:
-                if "其他费用" in cell_text_no_space:
-                    other_expenses_idx = col_idx
-        
-        if ("序号" in row_text or "项目名称" in row_text_no_space) and header_row_idx == -1:
-            header_row_idx = row_idx
-    
-    # 找到第一个数据行
-    for row_idx in range(min(5, len(target_table))):
-        row = target_table[row_idx]
-        if len(row) > 0:
-            first_cell = str(row[0]).strip()
-            if first_cell and first_cell not in ["序号", ""] and (first_cell in ["一", "二", "三", "四", "五"] or first_cell.isdigit()):
-                header_row_idx = row_idx - 1
-                logger.debug(f"[安全可研批复投资] 根据数据行确定表头结束于第{header_row_idx}行")
-                break
-    
-    logger.info(f"[安全可研批复投资] 表头行: {header_row_idx}")
-    logger.info(f"[安全可研批复投资] 列索引: 序号={no_idx}, 名称={name_idx}, "
-               f"架空线={overhead_line_idx}, 间隔={bay_idx}, 变电={substation_idx}, "
-               f"光缆={optical_cable_idx}, 静态投资={static_investment_idx}, 动态投资={dynamic_investment_idx}")
-    
-    if header_row_idx == -1:
-        logger.warning("[安全可研批复投资] 未找到表头行")
-        return record
-    
-    # 解析数据行
-    for row_idx in range(header_row_idx + 1, len(target_table)):
-        row = target_table[row_idx]
-        
-        if len(row) < 3:
-            continue
-        
-        # 检查是否是有效数据行
-        if name_idx >= 0 and name_idx < len(row):
-            name = str(row[name_idx]).strip()
-            if not name or name in ["", "nan", "None"]:
-                continue
-            
-            # 提取序号
-            no = ""
-            if no_idx >= 0 and no_idx < len(row):
-                no = str(row[no_idx]).strip()
-            
-            # 判断等级 - 使用非严格模式,让中文数字直接返回 Level 1
-            level_input = (no + name) if no else name
-            level = determine_level(level_input, name, strict_mode=False)
-            
-            # 对于阿拉伯数字序号,如果当前是 Level 2,且不是具体项目名称,则判定为 Level 2
-            # 这样"1、襄阳连云..."会是 Level 2
-            if level == "2" and no.isdigit():
-                # 阿拉伯数字序号,是具体项目,保持 Level 2
-                pass
-            
-            item = InvestmentItem()
-            item.no = no
-            item.name = name
-            item.level = level
-            
-            # 提取建设规模
-            if overhead_line_idx >= 0 and overhead_line_idx < len(row):
-                item.constructionScaleOverheadLine = str(row[overhead_line_idx]).strip()
-            
-            if bay_idx >= 0 and bay_idx < len(row):
-                item.constructionScaleBay = str(row[bay_idx]).strip()
-            
-            if substation_idx >= 0 and substation_idx < len(row):
-                item.constructionScaleSubstation = str(row[substation_idx]).strip()
-            
-            if optical_cable_idx >= 0 and optical_cable_idx < len(row):
-                item.constructionScaleOpticalCable = str(row[optical_cable_idx]).strip()
-            
-            # 提取投资金额
-            if static_investment_idx >= 0 and static_investment_idx < len(row):
-                item.staticInvestment = str(row[static_investment_idx]).strip()
-            
-            if dynamic_investment_idx >= 0 and dynamic_investment_idx < len(row):
-                item.dynamicInvestment = str(row[dynamic_investment_idx]).strip()
-            
-            # 提取费用
-            if construction_project_cost_idx >= 0 and construction_project_cost_idx < len(row):
-                item.constructionProjectCost = str(row[construction_project_cost_idx]).strip()
-            
-            if equipment_purchase_cost_idx >= 0 and equipment_purchase_cost_idx < len(row):
-                item.equipmentPurchaseCost = str(row[equipment_purchase_cost_idx]).strip()
-            
-            if installation_project_cost_idx >= 0 and installation_project_cost_idx < len(row):
-                item.installationProjectCost = str(row[installation_project_cost_idx]).strip()
-            
-            if other_expenses_idx >= 0 and other_expenses_idx < len(row):
-                item.otherExpenses = str(row[other_expenses_idx]).strip()
-            
-            record.items.append(item)
-            logger.info(f"[安全可研批复投资] 解析到数据: No={item.no}, Name={item.name}, Level={item.level}")
-    
-    logger.info(f"[安全可研批复投资] 共解析到 {len(record.items)} 条数据")
-    return record
-
-
 def parse_feasibility_review_investment(markdown_content: str) -> FeasibilityReviewInvestment:
     """
     解析可研评审投资估算
@@ -1071,9 +821,6 @@ def parse_investment_record(markdown_content: str, investment_type: Optional[str
     result = None
     if investment_type == "fsApproval":
         result = parse_feasibility_approval_investment(markdown_content)
-    elif investment_type == "safetyFsApproval":
-        # safetyFsApproval 使用独立的解析逻辑(湖北省格式)
-        result = parse_safety_feasibility_approval_investment(markdown_content)
     elif investment_type == "fsReview":
         result = parse_feasibility_review_investment(markdown_content)
     elif investment_type == "pdApproval":

+ 9 - 23
pdf_converter_v2/parser/json_converter.py

@@ -329,8 +329,8 @@ def parse_markdown_to_json(markdown_content: str, first_page_image: Optional[Ima
                 op_list = parse_operational_conditions(markdown_content, require_title=False)
             serialized = [oc.to_dict() if hasattr(oc, "to_dict") else oc for oc in (op_list or [])]
             result = {"document_type": forced_document_type, "data": {"operationalConditions": serialized}}
-        elif forced_document_type in ["fsApproval", "fsReview", "pdApproval", "safetyFsApproval"]:
-            # 投资估算类型处理(包括安评类)
+        elif forced_document_type in ["fsApproval", "fsReview", "pdApproval"]:
+            # 投资估算类型处理
             logger.info(f"[JSON转换] 处理投资估算类型: {forced_document_type}")
             logger.debug(f"[JSON转换] Markdown内容长度: {len(markdown_content)} 字符")
             
@@ -338,26 +338,12 @@ def parse_markdown_to_json(markdown_content: str, first_page_image: Optional[Ima
             
             if investment_record:
                 data = investment_record.to_dict()
+                logger.info(f"[JSON转换] 投资估算解析成功,共 {len(data)} 条记录")
                 
-                # 检查返回的数据格式:可能是列表(旧格式)或字典(包含projectInfo的新格式)
-                if isinstance(data, dict) and "data" in data:
-                    # 新格式:包含 projectInfo 和 data
-                    logger.info(f"[JSON转换] 投资估算解析成功,共 {len(data['data'])} 条记录")
-                    if data.get("projectInfo"):
-                        logger.info(f"[JSON转换] 项目信息: {data['projectInfo'].get('projectName', '')}")
-                    
-                    # 输出前3条记录的摘要
-                    if data["data"]:
-                        for idx, item in enumerate(data["data"][:3]):
-                            logger.debug(f"[JSON转换] 记录 {idx+1}: No={item.get('No', '')}, Name={item.get('name', '')}, Level={item.get('Level', '')}")
-                else:
-                    # 旧格式:直接是数据列表
-                    logger.info(f"[JSON转换] 投资估算解析成功,共 {len(data)} 条记录")
-                    
-                    # 输出前3条记录的摘要
-                    if data:
-                        for idx, item in enumerate(data[:3]):
-                            logger.debug(f"[JSON转换] 记录 {idx+1}: No={item.get('No', '')}, Name={item.get('name', '')}, Level={item.get('Level', '')}")
+                # 输出前3条记录的摘要
+                if data:
+                    for idx, item in enumerate(data[:3]):
+                        logger.debug(f"[JSON转换] 记录 {idx+1}: No={item.get('No', '')}, Name={item.get('name', '')}, Level={item.get('Level', '')}")
                 
                 result = {"document_type": forced_document_type, "data": data}
             else:
@@ -446,8 +432,8 @@ def parse_markdown_to_json(markdown_content: str, first_page_image: Optional[Ima
     elif doc_type == "emRec":
         data = parse_electromagnetic_detection_record(markdown_content).to_dict()
         result = {"document_type": doc_type, "data": data}
-    elif doc_type in ["fsApproval", "fsReview", "pdApproval", "safetyFsApproval"]:
-        # 新增:投资估算类型(包括安评类)
+    elif doc_type in ["fsApproval", "fsReview", "pdApproval"]:
+        # 新增:投资估算类型
         logger.info(f"[JSON转换] 检测到投资估算类型: {doc_type}")
         logger.debug(f"[JSON转换] Markdown内容长度: {len(markdown_content)} 字符")
         

+ 3 - 2
pdf_converter_v2/test_api.py

@@ -1,4 +1,5 @@
 #!/usr/bin/env python3
+# -*- coding: utf-8 -*-
 """
 PDF Converter API 测试脚本
 
@@ -22,8 +23,8 @@ import requests
 from pathlib import Path
 from typing import Optional, Dict, Any, List
 
-# API 配置
-API_BASE_URL = "http://47.101.133.94:14213"
+# API 配置(默认本机 4214 端口;可通过环境变量 PDF_CONVERTER_API_URL 覆盖)
+API_BASE_URL = os.getenv("PDF_CONVERTER_API_URL", "http://127.0.0.1:4214")
 
 # 测试文件配置
 TEST_DIR = Path(__file__).parent / "test"