NER实现示例代码.md 23 KB

NER实现示例代码

一、后端实现

1.1 扩展Constants常量

// backend/lingyue-common/src/main/java/com/lingyue/common/core/Constants.java

/**
 * Shared string constants. This excerpt shows only the NER-related additions;
 * the class's existing members are elided.
 */
public final class Constants {
    // ... existing constants
    
    // NER-related node types
    public static final String NODE_NER_ENTITY = "NER_ENTITY";
    public static final String NODE_NER_RELATION = "NER_RELATION";
    
    // NER-related edge types
    public static final String EDGE_HAS_NER_ENTITY = "HAS_NER_ENTITY";
    public static final String EDGE_ENTITY_RELATION = "ENTITY_RELATION";
    public static final String EDGE_ENTITY_TO_VALUE = "ENTITY_TO_VALUE";
    
    // NER extraction methods
    public static final String NER_METHOD_RULE = "rule";
    public static final String NER_METHOD_LLM = "llm";
    public static final String NER_METHOD_MANUAL = "manual";
    
    // NER processing states
    public static final String NER_PENDING = "pending";
    public static final String NER_PROCESSING = "processing";
    public static final String NER_COMPLETED = "completed";
    public static final String NER_FAILED = "failed";
}

1.2 NER实体DTO

// backend/lingyue-ai/src/main/java/com/lingyue/ai/dto/NerEntityDTO.java

package com.lingyue.ai.dto;

import lombok.Data;
import java.math.BigDecimal;

/**
 * Transfer object describing one entity produced by NER extraction.
 * Accessors are generated by Lombok's {@code @Data}.
 */
@Data
public class NerEntityDTO {
    private Long id;
    private String entityType;      // e.g. ORG, DATE, PERSON, SCORE
    private String entityName;      // entity name
    private String entityValue;     // entity value
    private BigDecimal confidence;  // confidence score
    
    // Position information
    private Integer charStart;
    private Integer charEnd;
    private Integer line;
    private String context;         // surrounding context text
    
    // Source information
    private Long attachmentId;
    private String attachmentName;
    private String extractMethod;   // rule/llm/manual
    private String extractTime;
    
    // Mapping information
    private String mappedElementKey;  // key of the element this entity is mapped to
    private Boolean isMapped;
}

1.3 NER Service实现

// backend/lingyue-ai/src/main/java/com/lingyue/ai/service/NerEntityService.java

package com.lingyue.ai.service;

import com.lingyue.ai.dto.NerEntityDTO;
import com.lingyue.ai.dto.NerExtractRequest;
import com.lingyue.ai.dto.NerExtractResponse;
import com.lingyue.common.core.Constants;
import com.lingyue.graph.service.NodeService;
import com.lingyue.graph.service.EdgeService;
import com.lingyue.graph.service.PropertyService;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.springframework.stereotype.Service;
import org.springframework.transaction.annotation.Transactional;

import java.util.ArrayList;
import java.util.List;

/**
 * Runs NER extraction for an attachment and stores the extracted entities as
 * graph nodes linked to that attachment.
 */
@Slf4j
@Service
@RequiredArgsConstructor
public class NerEntityService {
    
    private final NerService nerService;
    private final NodeService nodeService;
    private final EdgeService edgeService;
    private final PropertyService propertyService;
    
    /**
     * Extracts entities from {@code text} via the NER service and saves each
     * one as a node attached to the given attachment.
     *
     * @param attachmentId id of the attachment the text belongs to
     * @param text         text to run extraction on
     * @return the saved entities; empty when the NER service returns nothing
     */
    @Transactional
    public List<NerEntityDTO> extractAndSaveEntities(Long attachmentId, String text) {
        log.info("开始NER提取: attachmentId={}", attachmentId);
        
        // 1. Ask the NER service to extract entities
        NerExtractRequest request = new NerExtractRequest();
        request.setText(text);
        request.setDocumentId(String.valueOf(attachmentId));
        
        NerExtractResponse response = nerService.extract(request);
        
        // 2. Persist the entities
        List<NerEntityDTO> savedEntities = new ArrayList<>();
        
        // A missing response or entity list means "nothing extracted", not a failure.
        if (response == null || response.getEntities() == null) {
            log.warn("NER服务未返回实体: attachmentId={}", attachmentId);
            return savedEntities;
        }
        
        for (NerExtractResponse.EntityItem item : response.getEntities()) {
            // An item without text cannot be named or stored meaningfully; skip it.
            if (item == null || item.getText() == null) {
                log.warn("跳过无文本的NER实体: attachmentId={}", attachmentId);
                continue;
            }
            savedEntities.add(saveEntity(attachmentId, item));
        }
        
        log.info("NER提取完成: attachmentId={}, entityCount={}", 
                 attachmentId, savedEntities.size());
        
        return savedEntities;
    }
    
    /**
     * Saves a single extracted entity: creates its node, writes its
     * properties, and links it to the attachment.
     */
    private NerEntityDTO saveEntity(Long attachmentId, NerExtractResponse.EntityItem item) {
        // Random UUID keeps keys unique even when the same text is saved several
        // times within one millisecond; the previous timestamp + Math.abs(hashCode())
        // scheme collided in that case and could embed a negative number, because
        // Math.abs(Integer.MIN_VALUE) is still negative.
        // NOTE(review): the key is now 43 characters long — confirm it fits the key column.
        String entityKey = "entity_" + java.util.UUID.randomUUID();
        
        Long entityId = nodeService.createNode(
            Constants.NODE_NER_ENTITY,
            entityKey,
            item.getText(),
            null  // createdBy
        );
        
        // Entity properties
        propertyService.setNodeProperty(entityId, "entity_type", item.getType());
        propertyService.setNodeProperty(entityId, "entity_value", item.getText());
        propertyService.setNodeProperty(entityId, "confidence", 
                                       String.valueOf(item.getConfidence()));
        propertyService.setNodeProperty(entityId, "char_start", 
                                       String.valueOf(item.getStartPos()));
        propertyService.setNodeProperty(entityId, "char_end", 
                                       String.valueOf(item.getEndPos()));
        propertyService.setNodeProperty(entityId, "extract_method", 
                                       Constants.NER_METHOD_RULE);
        
        // Edge: attachment -> entity
        edgeService.createEdge(
            Constants.EDGE_HAS_NER_ENTITY,
            attachmentId,
            entityId,
            0  // sortOrder
        );
        
        // Build the DTO returned to the caller
        NerEntityDTO dto = new NerEntityDTO();
        dto.setId(entityId);
        dto.setEntityType(item.getType());
        dto.setEntityName(item.getText());
        dto.setEntityValue(item.getText());
        dto.setConfidence(item.getConfidence());
        dto.setCharStart(item.getStartPos());
        dto.setCharEnd(item.getEndPos());
        dto.setAttachmentId(attachmentId);
        dto.setExtractMethod(Constants.NER_METHOD_RULE);
        dto.setIsMapped(false);
        
        return dto;
    }
    
    /**
     * Returns all NER entities of an attachment.
     * Not implemented yet: currently always returns an empty list.
     */
    public List<NerEntityDTO> getEntitiesByAttachment(Long attachmentId) {
        // TODO: implement the query
        return new ArrayList<>();
    }
    
    /**
     * Marks an entity as mapped to the element identified by {@code elementKey}.
     * Only the entity's properties are written for now; see the TODO below.
     */
    @Transactional
    public void mapEntityToElement(Long entityId, String elementKey) {
        log.info("映射实体到要素: entityId={}, elementKey={}", entityId, elementKey);
        
        // TODO: 1. look up the element node
        // TODO: 2. create the ENTITY_TO_VALUE edge
        // 3. update the entity's mapping properties
        
        propertyService.setNodeProperty(entityId, "mapped_element_key", elementKey);
        propertyService.setNodeProperty(entityId, "is_mapped", "true");
    }
}

1.4 NER Controller

// backend/lingyue-ai/src/main/java/com/lingyue/ai/controller/NerController.java

package com.lingyue.ai.controller;

import com.lingyue.ai.dto.NerEntityDTO;
import com.lingyue.ai.service.NerEntityService;
import com.lingyue.common.core.Result;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.springframework.web.bind.annotation.*;

import java.util.List;

/**
 * REST endpoints for NER extraction, entity lookup and entity-to-element mapping.
 */
@Slf4j
@RestController
@RequestMapping("/api/v1/ner")
@RequiredArgsConstructor
public class NerController {
    
    private final NerEntityService nerEntityService;
    
    /**
     * Runs NER extraction on the request body text for the given attachment
     * and returns the saved entities.
     */
    @PostMapping("/attachments/{attachmentId}/extract")
    public Result<List<NerEntityDTO>> extractEntities(
            @PathVariable Long attachmentId,
            @RequestBody String text) {
        return Result.ok(nerEntityService.extractAndSaveEntities(attachmentId, text));
    }
    
    /**
     * Returns the NER entities of the given attachment.
     */
    @GetMapping("/attachments/{attachmentId}/entities")
    public Result<List<NerEntityDTO>> getEntities(@PathVariable Long attachmentId) {
        return Result.ok(nerEntityService.getEntitiesByAttachment(attachmentId));
    }
    
    /**
     * Maps the given entity to the element identified by {@code elementKey}.
     */
    @PostMapping("/entities/{entityId}/map")
    public Result<?> mapEntity(
            @PathVariable Long entityId,
            @RequestParam String elementKey) {
        nerEntityService.mapEntityToElement(entityId, elementKey);
        return Result.ok();
    }
}

二、Python NER服务扩展

2.1 扩展实体类型规则

# python-services/ner-service/app/services/ner_service.py

# 在 _extract_by_rules 方法中添加智报专用规则

async def _extract_by_rules(self, text: str, entity_types: Optional[List[str]] = None):
    """Rule-based NER extraction (extended with report-specific rules).

    This excerpt only shows the additional rule table; the existing rules and
    the extraction logic that consumes ``rules`` are elided.

    Label separators accept both the ASCII colon and the full-width colon
    (U+FF1A), and quoted aliases accept both straight and curly double quotes
    (U+201C / U+201D). They are written as ``\\uXXXX`` escapes, which ``re``
    resolves itself, so the patterns survive tools that normalise full-width
    punctuation in source text.
    """
    
    rules = {
        # ... existing rules
        
        # === Report-specific rules ===
        "SCORE": [
            # Review score, e.g. "93.33分" or "得分:93.33"
            r'(\d+\.?\d*分)',
            r'得分[:\uff1a]\s*(\d+\.?\d*)',
        ],
        
        "LEVEL": [
            # Level: 一级 / 二级 / 三级
            r'(一级|二级|三级)',
            r'级别[:\uff1a]\s*(一级|二级|三级)',
        ],
        
        "CERTIFICATE_CODE": [
            # Certificate number, e.g. ZGDIDBOY-083
            r'(ZGDIDBOY-\d+)',
            # NOTE(review): identical to the first PROJECT_CODE pattern, so the
            # same text is reported under both types — confirm this is intended.
            r'([A-Z]+-\d+-\d+)',
            r'证书编号[:\uff1a]\s*([A-Z0-9\-]+)',
        ],
        
        "REVIEW_CODE": [
            # Review item code, e.g. 5.1.1.1
            r'(5\.\d+(?:\.\d+)*)',
        ],
        
        "COMPANY_ALIAS": [
            # Company short name (needs surrounding context)
            r'简称[:\uff1a「『]([^」』:\uff1a]{2,10})[」』]',
            r'以下简称[「『"\u201c]([^」』"\u201c\u201d]{2,10})[」』"\u201d]',
        ],
        
        "PROJECT_CODE": [
            # Project number, e.g. BZ-0092-2024
            r'([A-Z]+-\d+-\d+)',
            r'项目编号[:\uff1a]\s*([A-Z0-9\-]+)',
        ],
        
        "REVIEW_ITEM": [
            # Review items: fixed list of known headings
            r'(目标职责|制度化管理|教育培训|现场管理|安全风险管控|应急管理|事故管理|持续改进)',
        ],
    }
    
    # ... the rest of the extraction logic is unchanged

2.2 添加表格提取功能

# python-services/ner-service/app/services/table_extractor.py

from typing import List, Dict
import re

class TableExtractor:
    """Extracts tabular data from plain text."""
    
    def extract_tables(self, text: str) -> List[Dict]:
        """
        Extract all tables found in ``text``.
        
        Returns a list shaped like:
        [
            {
                "table_type": "review_project",  # kind of table
                "headers": ["项目名称", "简称", "类型"],
                "rows": [
                    ["大邑地勘项目", "大邑项目", "在建项目"],
                    ...
                ]
            }
        ]
        """
        tables = []
        
        # Method 1: delimiter-based detection (simple tables)
        tables.extend(self._extract_simple_tables(text))
        
        # Method 2: keyword-based detection (known table kinds)
        tables.extend(self._extract_known_tables(text))
        
        return tables
    
    def _extract_simple_tables(self, text: str) -> List[Dict]:
        """Extract pipe-delimited tables.
        
        A table is a run of two or more consecutive lines that each start and
        end with ``|``. The first line is the header. Empty cells are kept as
        ``""`` so every value stays under its own column, and a Markdown
        delimiter row (``| --- | --- |``) directly below the header is skipped
        instead of being reported as data.
        """
        tables = []
        
        # Normalise CRLF; otherwise the "\r" before each "\n" prevents a match.
        normalised = text.replace('\r\n', '\n')
        
        # Locate table blocks
        table_pattern = r'(\|[^\n]+\|(?:\n\|[^\n]+\|)+)'
        
        for match in re.finditer(table_pattern, normalised):
            rows = match.group(1).strip().split('\n')
            
            headers = self._split_row(rows[0])
            data_rows = []
            
            for index, row in enumerate(rows[1:], start=1):
                cells = self._split_row(row)
                # Only the line right below the header can be the delimiter row.
                if index == 1 and self._is_separator_row(cells):
                    continue
                # Rows consisting solely of empty cells carry no data.
                if any(cells):
                    data_rows.append(cells)
            
            if any(headers) and data_rows:
                tables.append({
                    "table_type": "unknown",
                    "headers": headers,
                    "rows": data_rows
                })
        
        return tables
    
    @staticmethod
    def _split_row(row: str) -> List[str]:
        """Split one ``|...|`` line into stripped cells, keeping empty ones."""
        # The table pattern guarantees the line starts and ends with "|".
        return [cell.strip() for cell in row.strip()[1:-1].split('|')]
    
    @staticmethod
    def _is_separator_row(cells: List[str]) -> bool:
        """Return True when every cell looks like a Markdown delimiter (``---``, ``:--:``)."""
        return bool(cells) and all(re.fullmatch(r':?-+:?', cell) for cell in cells)
    
    def _extract_known_tables(self, text: str) -> List[Dict]:
        """Extract tables of known kinds, selected by keywords present in the text."""
        tables = []
        
        # Example: review-project table
        if "复审项目" in text or "评审项目" in text:
            table = self._extract_review_project_table(text)
            if table:
                tables.append(table)
        
        # Example: reviewer table
        if "评审组" in text or "评审人员" in text:
            table = self._extract_reviewer_table(text)
            if table:
                tables.append(table)
        
        return tables
    
    def _extract_review_project_table(self, text: str) -> Dict:
        """Extract the review-project table. Not implemented yet: returns None."""
        # TODO: implement
        return None
    
    def _extract_reviewer_table(self, text: str) -> Dict:
        """Extract the reviewer table. Not implemented yet: returns None."""
        # TODO: implement
        return None

# Module-level shared instance, created when this module is imported
table_extractor = TableExtractor()

三、前端实现

3.1 NER分析页面

<!-- frontend/vue-demo/src/views/NerAnalysis.vue -->

<template>
  <div class="ner-analysis-container">
    <!-- Top statistics cards -->
    <el-row :gutter="20" class="stats-row">
      <el-col :span="6">
        <el-card>
          <el-statistic title="实体总数" :value="statistics.totalEntities">
            <template #suffix>个</template>
          </el-statistic>
        </el-card>
      </el-col>
      <el-col :span="6">
        <el-card>
          <el-statistic title="已映射" :value="statistics.mappedEntities">
            <template #suffix>个</template>
          </el-statistic>
        </el-card>
      </el-col>
      <el-col :span="6">
        <el-card>
          <el-statistic title="关系数" :value="statistics.totalRelations">
            <template #suffix>个</template>
          </el-statistic>
        </el-card>
      </el-col>
      <el-col :span="6">
        <el-card>
          <el-statistic 
            title="平均置信度" 
            :value="statistics.avgConfidence"
            :precision="2">
            <template #suffix>%</template>
          </el-statistic>
        </el-card>
      </el-col>
    </el-row>

    <!-- Main content -->
    <el-card class="main-content">
      <el-tabs v-model="activeTab">
        <!-- Entity list. `elements` feeds the options of the list's mapping dialog;
             without it the dialog's select renders empty. -->
        <el-tab-pane label="实体列表" name="entities">
          <entity-list-view 
            :entities="entities"
            :elements="elements"
            @map="handleMapEntity"
            @delete="handleDeleteEntity" />
        </el-tab-pane>

        <!-- Text annotation -->
        <el-tab-pane label="文本标注" name="annotation">
          <text-annotation-view 
            :text="sourceText"
            :entities="entities"
            @entity-click="handleEntityClick" />
        </el-tab-pane>

        <!-- Element mapping -->
        <el-tab-pane label="要素映射" name="mapping">
          <entity-mapping-view 
            :entities="entities"
            :elements="elements"
            @map="handleMapEntity" />
        </el-tab-pane>

        <!-- Relation graph -->
        <el-tab-pane label="关系图谱" name="graph">
          <relation-graph-view 
            :entities="entities"
            :relations="relations" />
        </el-tab-pane>
      </el-tabs>
    </el-card>
  </div>
</template>

<script setup>
import { ref, computed, onMounted } from 'vue'
import { useRoute } from 'vue-router'
import { getNerEntities, mapEntityToElement } from '@/api/ner'
import EntityListView from './components/EntityListView.vue'
import TextAnnotationView from './components/TextAnnotationView.vue'
import EntityMappingView from './components/EntityMappingView.vue'
import RelationGraphView from './components/RelationGraphView.vue'

const route = useRoute()
const attachmentId = ref(route.params.attachmentId)
const activeTab = ref('entities')

const entities = ref([])
const relations = ref([])
const sourceText = ref('')
const elements = ref([])

// 统计数据
const statistics = computed(() => ({
  totalEntities: entities.value.length,
  mappedEntities: entities.value.filter(e => e.isMapped).length,
  totalRelations: relations.value.length,
  avgConfidence: entities.value.length > 0
    ? entities.value.reduce((sum, e) => sum + e.confidence, 0) / entities.value.length * 100
    : 0
}))

// 加载数据
const loadData = async () => {
  const res = await getNerEntities(attachmentId.value)
  entities.value = res.data
}

// 映射实体到要素
const handleMapEntity = async (entityId, elementKey) => {
  await mapEntityToElement(entityId, elementKey)
  await loadData()
}

// 删除实体
const handleDeleteEntity = async (entityId) => {
  // TODO: 实现删除逻辑
}

// 点击实体
const handleEntityClick = (entity) => {
  console.log('点击实体:', entity)
}

onMounted(() => {
  loadData()
})
</script>

<style scoped>
/* Page wrapper */
.ner-analysis-container {
  padding: 20px;
}

/* Row of statistic cards; spacing separates it from the main card below */
.stats-row {
  margin-bottom: 20px;
}

/* Main tabbed card */
.main-content {
  min-height: 600px;
}
</style>

3.2 实体列表组件

<!-- frontend/vue-demo/src/views/components/EntityListView.vue -->

<template>
  <div class="entity-list">
    <!-- Filter toolbar -->
    <el-row class="toolbar">
      <el-col :span="12">
        <el-input 
          v-model="searchText"
          placeholder="搜索实体..."
          clearable>
          <template #prefix>
            <el-icon><Search /></el-icon>
          </template>
        </el-input>
      </el-col>
      <el-col :span="12" class="filter-group">
        <el-select v-model="filterType" placeholder="实体类型" clearable>
          <el-option label="全部" value="" />
          <el-option label="机构" value="ORG" />
          <el-option label="日期" value="DATE" />
          <el-option label="人名" value="PERSON" />
          <el-option label="得分" value="SCORE" />
          <el-option label="级别" value="LEVEL" />
        </el-select>
        
        <el-select v-model="filterMapped" placeholder="映射状态" clearable>
          <el-option label="全部" value="" />
          <el-option label="已映射" value="true" />
          <el-option label="未映射" value="false" />
        </el-select>
      </el-col>
    </el-row>

    <!-- Entity table -->
    <el-table 
      :data="filteredEntities"
      stripe
      border
      height="500">
      
      <el-table-column prop="entityType" label="类型" width="100">
        <template #default="{ row }">
          <el-tag :type="getTypeColor(row.entityType)">
            {{ row.entityType }}
          </el-tag>
        </template>
      </el-table-column>
      
      <el-table-column prop="entityName" label="实体名称" width="200" />
      
      <el-table-column prop="confidence" label="置信度" width="100">
        <template #default="{ row }">
          <!-- Rounded to two decimals: a bare `confidence * 100` can render float
               artefacts such as 56.99999999999999, and is NaN when confidence is missing. -->
          <el-progress 
            :percentage="Math.round((Number(row.confidence) || 0) * 10000) / 100"
            :color="getConfidenceColor(row.confidence)" />
        </template>
      </el-table-column>
      
      <el-table-column prop="context" label="上下文" show-overflow-tooltip />
      
      <el-table-column prop="isMapped" label="映射状态" width="120">
        <template #default="{ row }">
          <el-tag v-if="row.isMapped" type="success">已映射</el-tag>
          <el-tag v-else type="info">未映射</el-tag>
        </template>
      </el-table-column>
      
      <el-table-column label="操作" width="200" fixed="right">
        <template #default="{ row }">
          <el-button 
            size="small"
            @click="handleMap(row)">
            映射
          </el-button>
          <el-button 
            size="small"
            type="danger"
            @click="handleDelete(row)">
            删除
          </el-button>
        </template>
      </el-table-column>
    </el-table>

    <!-- Mapping dialog -->
    <el-dialog v-model="mapDialogVisible" title="映射到要素">
      <el-select v-model="selectedElementKey" placeholder="选择要素">
        <el-option 
          v-for="elem in elements"
          :key="elem.key"
          :label="elem.label"
          :value="elem.key" />
      </el-select>
      
      <template #footer>
        <el-button @click="mapDialogVisible = false">取消</el-button>
        <el-button type="primary" @click="confirmMap">确定</el-button>
      </template>
    </el-dialog>
  </div>
</template>

<script setup>
import { ref, computed } from 'vue'
import { Search } from '@element-plus/icons-vue'

const props = defineProps({
  entities: Array,
  elements: Array
})

const emit = defineEmits(['map', 'delete'])

const searchText = ref('')
const filterType = ref('')
const filterMapped = ref('')
const mapDialogVisible = ref(false)
const currentEntity = ref(null)
const selectedElementKey = ref('')

// 过滤实体
const filteredEntities = computed(() => {
  let result = props.entities

  if (searchText.value) {
    result = result.filter(e => 
      e.entityName.includes(searchText.value) ||
      e.context.includes(searchText.value)
    )
  }

  if (filterType.value) {
    result = result.filter(e => e.entityType === filterType.value)
  }

  if (filterMapped.value) {
    const isMapped = filterMapped.value === 'true'
    result = result.filter(e => e.isMapped === isMapped)
  }

  return result
})

// 获取类型颜色
const getTypeColor = (type) => {
  const colors = {
    'ORG': 'primary',
    'DATE': 'success',
    'PERSON': 'warning',
    'SCORE': 'danger',
    'LEVEL': 'info'
  }
  return colors[type] || ''
}

// 获取置信度颜色
const getConfidenceColor = (confidence) => {
  if (confidence >= 0.8) return '#67C23A'
  if (confidence >= 0.6) return '#E6A23C'
  return '#F56C6C'
}

// 映射操作
const handleMap = (entity) => {
  currentEntity.value = entity
  mapDialogVisible.value = true
}

const confirmMap = () => {
  if (selectedElementKey.value) {
    emit('map', currentEntity.value.id, selectedElementKey.value)
    mapDialogVisible.value = false
  }
}

// 删除操作
const handleDelete = (entity) => {
  emit('delete', entity.id)
}
</script>

<style scoped>
/* Filter toolbar above the table */
.toolbar {
  margin-bottom: 20px;
}

/* Right-aligned group holding the two filter selects */
.filter-group {
  display: flex;
  gap: 10px;
  justify-content: flex-end;
}
</style>

四、数据库迁移脚本

-- database/migrations/003_add_ner_support.sql

-- Register the node types and edge types used by the NER feature.
-- Note: adjust to the existing schema before actually running this.

-- 1. If a node_types table exists, add the new types
--    (re-running only refreshes type_name; description is left as stored)
INSERT INTO node_types (type_code, type_name, description) VALUES
('NER_ENTITY', 'NER实体', 'NER提取的命名实体'),
('NER_RELATION', 'NER关系', '实体间的关系')
ON DUPLICATE KEY UPDATE type_name = VALUES(type_name);

-- 2. If an edge_types table exists, add the new types
INSERT INTO edge_types (type_code, type_name, description) VALUES
('HAS_NER_ENTITY', '包含NER实体', '附件包含的NER实体'),
('ENTITY_RELATION', '实体关系', '实体之间的语义关系'),
('ENTITY_TO_VALUE', '实体到值', '实体映射到要素值')
ON DUPLICATE KEY UPDATE type_name = VALUES(type_name);

-- 3. Add NER-related indexes (if needed)
-- NOTE(review): the statements above use ON DUPLICATE KEY UPDATE (MySQL syntax),
-- while the index below is a partial index (WHERE clause), which MySQL does not
-- support — confirm the target database before enabling it.
-- CREATE INDEX idx_ner_entity_type ON graph_properties(node_id, property_key)
--   WHERE property_key = 'entity_type';

以上代码提供了NER阶段的核心实现框架,可以根据实际需求进行调整和扩展。