# NER实现示例代码

## 一、后端实现

### 1.1 扩展Constants常量

```java
// backend/lingyue-common/src/main/java/com/lingyue/common/core/Constants.java
public final class Constants {
    // ... existing constants

    // NER-related node types
    public static final String NODE_NER_ENTITY = "NER_ENTITY";
    public static final String NODE_NER_RELATION = "NER_RELATION";

    // NER-related edge types
    public static final String EDGE_HAS_NER_ENTITY = "HAS_NER_ENTITY";
    public static final String EDGE_ENTITY_RELATION = "ENTITY_RELATION";
    public static final String EDGE_ENTITY_TO_VALUE = "ENTITY_TO_VALUE";

    // NER extraction methods
    public static final String NER_METHOD_RULE = "rule";
    public static final String NER_METHOD_LLM = "llm";
    public static final String NER_METHOD_MANUAL = "manual";

    // NER processing states
    public static final String NER_PENDING = "pending";
    public static final String NER_PROCESSING = "processing";
    public static final String NER_COMPLETED = "completed";
    public static final String NER_FAILED = "failed";
}
```

### 1.2 NER实体DTO

```java
// backend/lingyue-ai/src/main/java/com/lingyue/ai/dto/NerEntityDTO.java
package com.lingyue.ai.dto;

import lombok.Data;
import java.math.BigDecimal;

/**
 * Transfer object describing one entity produced by NER extraction.
 */
@Data
public class NerEntityDTO {
    private Long id;
    private String entityType;      // e.g. ORG, DATE, PERSON, SCORE
    private String entityName;      // entity name
    private String entityValue;     // entity value
    private BigDecimal confidence;  // confidence score

    // Position information
    private Integer charStart;
    private Integer charEnd;
    private Integer line;
    private String context;         // surrounding context

    // Source information
    private Long attachmentId;
    private String attachmentName;
    private String extractMethod;   // rule/llm/manual
    private String extractTime;

    // Mapping information
    private String mappedElementKey; // key of the element this entity is mapped to
    private Boolean isMapped;
}
```

### 1.3 NER Service实现

```java
// backend/lingyue-ai/src/main/java/com/lingyue/ai/service/NerEntityService.java
package com.lingyue.ai.service;

import com.lingyue.ai.dto.NerEntityDTO;
import com.lingyue.ai.dto.NerExtractRequest;
import com.lingyue.ai.dto.NerExtractResponse;
import com.lingyue.common.core.Constants;
import
com.lingyue.graph.service.NodeService;
import com.lingyue.graph.service.EdgeService;
import com.lingyue.graph.service.PropertyService;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.springframework.stereotype.Service;
import org.springframework.transaction.annotation.Transactional;

import java.util.ArrayList;
import java.util.List;
import java.util.UUID;

/**
 * Runs NER extraction for an attachment and stores the resulting entities
 * as nodes, properties and edges through the graph services.
 */
@Slf4j
@Service
@RequiredArgsConstructor
public class NerEntityService {

    private final NerService nerService;
    private final NodeService nodeService;
    private final EdgeService edgeService;
    private final PropertyService propertyService;

    /**
     * Runs NER extraction on an attachment's text and saves every returned entity.
     *
     * @param attachmentId id of the attachment the text belongs to; also sent to the
     *                     NER service as the document id
     * @param text         text to run extraction on
     * @return the saved entities, in the order the NER service returned them;
     *         empty when the service returned no entities
     */
    @Transactional
    public List<NerEntityDTO> extractAndSaveEntities(Long attachmentId, String text) {
        log.info("开始NER提取: attachmentId={}", attachmentId);

        // 1. Call the NER service (the Python service, per the original note) to extract entities
        NerExtractRequest request = new NerExtractRequest();
        request.setText(text);
        request.setDocumentId(String.valueOf(attachmentId));

        NerExtractResponse response = nerService.extract(request);

        // 2. Save the entities to the graph store
        List<NerEntityDTO> savedEntities = new ArrayList<>();
        if (response == null || response.getEntities() == null) {
            // Treat a missing payload as "nothing extracted" instead of failing with an NPE.
            log.warn("NER服务未返回实体: attachmentId={}", attachmentId);
            return savedEntities;
        }
        for (NerExtractResponse.EntityItem item : response.getEntities()) {
            savedEntities.add(saveEntity(attachmentId, item));
        }

        log.info("NER提取完成: attachmentId={}, entityCount={}",
                attachmentId, savedEntities.size());

        return savedEntities;
    }

    /**
     * Saves a single entity: creates the entity node, writes its properties and
     * links it to the attachment.
     *
     * @param attachmentId id of the attachment the entity was found in
     * @param item         entity returned by the NER service
     * @return a DTO describing the saved entity
     */
    private NerEntityDTO saveEntity(Long attachmentId, NerExtractResponse.EntityItem item) {
        // Create the entity node. A random UUID is used for the key because the previous
        // "currentTimeMillis + abs(hashCode)" scheme produced identical keys when the same
        // text was extracted twice within one millisecond, and Math.abs(Integer.MIN_VALUE)
        // is still negative.
        String entityKey = "entity_" + UUID.randomUUID().toString().replace("-", "");
        Long entityId = nodeService.createNode(
                Constants.NODE_NER_ENTITY,
                entityKey,
                item.getText(),
                null  // createdBy
        );

        // Set the entity properties
        propertyService.setNodeProperty(entityId, "entity_type", item.getType());
        propertyService.setNodeProperty(entityId, "entity_value", item.getText());
        propertyService.setNodeProperty(entityId, "confidence",
                String.valueOf(item.getConfidence()));
        propertyService.setNodeProperty(entityId, "char_start",
                String.valueOf(item.getStartPos()));
        propertyService.setNodeProperty(entityId, "char_end",
                String.valueOf(item.getEndPos()));
        // NOTE(review): the method is always recorded as "rule" here, whatever the
        // NER service actually used - confirm this is intended.
        propertyService.setNodeProperty(entityId, "extract_method", Constants.NER_METHOD_RULE);

        // Create the attachment -> entity edge
        edgeService.createEdge(
                Constants.EDGE_HAS_NER_ENTITY,
                attachmentId,
                entityId,
                0  // sortOrder
        );

        // Build the DTO to return
        NerEntityDTO dto = new NerEntityDTO();
        dto.setId(entityId);
        dto.setEntityType(item.getType());
        dto.setEntityName(item.getText());
        dto.setEntityValue(item.getText());
        dto.setConfidence(item.getConfidence());
        dto.setCharStart(item.getStartPos());
        dto.setCharEnd(item.getEndPos());
        dto.setAttachmentId(attachmentId);
        dto.setExtractMethod(Constants.NER_METHOD_RULE);
        dto.setIsMapped(false);

        return dto;
    }

    /**
     * Returns all NER entities of an attachment.
     *
     * @param attachmentId id of the attachment
     * @return currently always an empty list (query not implemented yet)
     */
    public List<NerEntityDTO> getEntitiesByAttachment(Long attachmentId) {
        // TODO: implement the query logic
        return new ArrayList<>();
    }

    /**
     * Maps an entity to an element by recording the element key on the entity node.
     *
     * @param entityId   id of the entity node
     * @param elementKey key of the element to map the entity to
     */
    @Transactional
    public void mapEntityToElement(Long entityId, String elementKey) {
        log.info("映射实体到要素: entityId={}, elementKey={}", entityId, elementKey);

        // TODO 1. look up the element node
        // TODO 2. create the ENTITY_TO_VALUE edge
        // 3. update the entity's mapping properties
        propertyService.setNodeProperty(entityId, "mapped_element_key", elementKey);
        propertyService.setNodeProperty(entityId, "is_mapped", "true");
    }
}
```

### 1.4 NER Controller

```java
// backend/lingyue-ai/src/main/java/com/lingyue/ai/controller/NerController.java
package com.lingyue.ai.controller;

import com.lingyue.ai.dto.NerEntityDTO;
import com.lingyue.ai.service.NerEntityService;
import com.lingyue.common.core.Result;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.springframework.web.bind.annotation.*;

import java.util.List;

/**
 * REST endpoints for NER extraction, entity lookup and entity-to-element mapping.
 */
@Slf4j
@RestController
@RequestMapping("/api/v1/ner")
@RequiredArgsConstructor
public class NerController {

    private final NerEntityService nerEntityService;

    /**
     * Runs NER extraction on an attachment; the request body is passed on as the text.
     */
    @PostMapping("/attachments/{attachmentId}/extract")
    public Result<List<NerEntityDTO>> extractEntities(
            @PathVariable Long attachmentId,
            @RequestBody String text) {

        List<NerEntityDTO> entities = nerEntityService.extractAndSaveEntities(
                attachmentId, text
        );
        return Result.ok(entities);
    }

    /**
     * Returns the NER entities of an attachment.
     */
    @GetMapping("/attachments/{attachmentId}/entities")
    public Result<List<NerEntityDTO>> getEntities(@PathVariable Long attachmentId) {
        List<NerEntityDTO> entities = nerEntityService.getEntitiesByAttachment(
                attachmentId
        );
        return Result.ok(entities);
    }

    /**
     * Maps an entity to an element.
     */
    @PostMapping("/entities/{entityId}/map")
    public Result<Void> mapEntity(
            @PathVariable Long entityId,
            @RequestParam String elementKey) {

        nerEntityService.mapEntityToElement(entityId, elementKey);
        return Result.ok();
    }
}
```

## 二、Python NER服务扩展

### 2.1 扩展实体类型规则

```python
# python-services/ner-service/app/services/ner_service.py

# Add the Zhibao (智报) specific rules inside the _extract_by_rules method
async def _extract_by_rules(self, text: str, entity_types: Optional[List[str]] = None):
    """Rule-based NER extraction (Zhibao-enhanced version)."""

    rules = {
        # ...
# ... existing rules (unchanged)

        # === Zhibao (智报) specific rules ===
        "SCORE": [
            # Review score, e.g. 评审得分:93.33分
            r'(\d+\.?\d*分)',
            r'得分[::]\s*(\d+\.?\d*)',
        ],
        "LEVEL": [
            # Level, e.g. 一级、二级
            r'(一级|二级|三级)',
            r'级别[::]\s*(一级|二级|三级)',
        ],
        "CERTIFICATE_CODE": [
            # Certificate number, e.g. 证书编号:ZGDIDBOY-083
            r'(ZGDIDBOY-\d+)',
            r'([A-Z]+-\d+-\d+)',
            r'证书编号[::]\s*([A-Z0-9\-]+)',
        ],
        "REVIEW_CODE": [
            # Review code, e.g. 5.1.1.1
            r'(5\.\d+(?:\.\d+)*)',
        ],
        "COMPANY_ALIAS": [
            # Company short name (needs surrounding context)
            r'简称[::「『]([^」』::]{2,10})[」』]',
            r'以下简称[「『""]([^」』""]{2,10})[」』""]',
        ],
        "PROJECT_CODE": [
            # Project number, e.g. 项目编号:BZ-0092-2024
            # NOTE(review): the first pattern is identical to one under CERTIFICATE_CODE,
            # so the same text matches both types - confirm how that is disambiguated.
            r'([A-Z]+-\d+-\d+)',
            r'项目编号[::]\s*([A-Z0-9\-]+)',
        ],
        "REVIEW_ITEM": [
            # Review items, e.g. 目标职责、制度化管理
            r'(目标职责|制度化管理|教育培训|现场管理|安全风险管控|应急管理|事故管理|持续改进)',
        ],
    }

    # ... the rest of the extraction logic stays unchanged
```

### 2.2 添加表格提取功能

```python
# python-services/ner-service/app/services/table_extractor.py

from typing import List, Dict, Optional
import re


class TableExtractor:
    """Extracts table data from plain text."""

    # One cell of a Markdown delimiter row, e.g. "---", ":---" or ":---:".
    _DELIMITER_CELL = re.compile(r'^:?-+:?$')

    def extract_tables(self, text: str) -> List[Dict]:
        """
        Extract table data from text.

        Returns a list shaped like:
        [
            {
                "table_type": "review_project",  # table type
                "headers": ["项目名称", "简称", "类型"],
                "rows": [
                    ["大邑地勘项目", "大邑项目", "在建项目"],
                    ...
                ]
            }
        ]
        """
        tables = []

        # Method 1: delimiter-based detection (simple tables)
        tables.extend(self._extract_simple_tables(text))

        # Method 2: keyword-based detection (specific, known tables)
        tables.extend(self._extract_known_tables(text))

        return tables

    def _extract_simple_tables(self, text: str) -> List[Dict]:
        """
        Extract simple pipe-delimited tables.

        A table is two or more consecutive lines of the form ``|...|``. The first
        line supplies the headers. A Markdown delimiter row directly after the
        header (``|---|---|``) is skipped, and empty cells are kept as ``''`` so
        every row stays aligned with the headers. Tab-separated tables are not
        handled here.
        """
        tables = []

        # Locate table blocks
        table_pattern = r'(\|[^\n]+\|(?:\n\|[^\n]+\|)+)'

        for match in re.finditer(table_pattern, text):
            rows = match.group(1).strip().split('\n')

            # Parse the header and the data rows
            headers = self._split_row(rows[0])
            body = rows[1:]
            if body and self._is_delimiter_row(self._split_row(body[0])):
                # Alignment row of a Markdown table: carries no data.
                body = body[1:]

            data_rows = []
            for row in body:
                cells = self._split_row(row)
                if any(cells):  # ignore rows made only of empty cells
                    data_rows.append(cells)

            if any(headers) and data_rows:
                tables.append({
                    "table_type": "unknown",
                    "headers": headers,
                    "rows": data_rows
                })

        return tables

    @staticmethod
    def _split_row(row: str) -> List[str]:
        """Split one ``|a|b|`` line into stripped cells, keeping empty cells."""
        line = row.strip()
        # Drop only the outer pipes so interior empty cells keep their position.
        if line.startswith('|'):
            line = line[1:]
        if line.endswith('|'):
            line = line[:-1]
        return [cell.strip() for cell in line.split('|')]

    @classmethod
    def _is_delimiter_row(cls, cells: List[str]) -> bool:
        """Return True when every cell looks like ``---`` / ``:---:``."""
        return bool(cells) and all(cls._DELIMITER_CELL.match(cell) for cell in cells)

    def _extract_known_tables(self, text: str) -> List[Dict]:
        """Extract tables of known types, selected by keywords found in the text."""
        tables = []

        # Example: the review-project table
        if "复审项目" in text or "评审项目" in text:
            table = self._extract_review_project_table(text)
            if table:
                tables.append(table)

        # Example: the reviewer table
        if "评审组" in text or "评审人员" in text:
            table = self._extract_reviewer_table(text)
            if table:
                tables.append(table)

        return tables

    def _extract_review_project_table(self, text: str) -> Optional[Dict]:
        """Extract the review-project table; returns None until implemented."""
        # TODO: implement the actual logic
        return None

    def _extract_reviewer_table(self, text: str) -> Optional[Dict]:
        """Extract the reviewer table; returns None until implemented."""
        # TODO: implement the actual logic
        return None


# Module-level shared instance
table_extractor = TableExtractor()
```

## 三、前端实现

<!-- TODO: 3.1 与 3.2 的 Vue 示例代码在本文档中缺失(代码块为空),待补充 -->

### 3.1 NER分析页面

```vue
```

### 3.2 实体列表组件

```vue
```

## 四、数据库迁移脚本

```sql
-- database/migrations/003_add_ner_support.sql

-- Adds support for the NER-related node types and edge types.
-- Note: adjust to the existing schema before actually running this.

-- 1. If a node_types table exists, add the new types
INSERT INTO node_types (type_code, type_name, description) VALUES
    ('NER_ENTITY', 'NER实体', 'NER提取的命名实体'),
    ('NER_RELATION', 'NER关系', '实体间的关系')
ON DUPLICATE KEY UPDATE type_name = VALUES(type_name);

-- 2. If an edge_types table exists, add the new types
INSERT INTO edge_types (type_code, type_name, description) VALUES
    ('HAS_NER_ENTITY', '包含NER实体', '附件包含的NER实体'),
    ('ENTITY_RELATION', '实体关系', '实体之间的语义关系'),
    ('ENTITY_TO_VALUE', '实体到值', '实体映射到要素值')
ON DUPLICATE KEY UPDATE type_name = VALUES(type_name);

-- 3. Add NER-related indexes (if needed)
-- NOTE(review): the inserts above use MySQL's ON DUPLICATE KEY UPDATE, while the
-- partial index below (CREATE INDEX ... WHERE) is not MySQL syntax (it is, for
-- example, PostgreSQL syntax). Pick the dialect of the target database.
-- CREATE INDEX idx_ner_entity_type ON graph_properties(node_id, property_key)
--   WHERE property_key = 'entity_type';
```

---

以上代码提供了NER阶段的核心实现框架,可以根据实际需求进行调整和扩展。