""" NER 服务实现 支持多种模式: 1. rule - 基于规则的简单 NER(默认,用于开发测试) 2. spacy - 使用 spaCy 模型 3. transformers - 使用 Transformers 模型 4. api - 调用外部 API(如 DeepSeek/Qwen) """ import re import uuid from typing import List, Optional from loguru import logger from ..config import settings from ..models import EntityInfo, PositionInfo class NerService: """NER 服务""" def __init__(self): self.model_type = settings.ner_model logger.info(f"初始化 NER 服务: model_type={self.model_type}") async def extract_entities( self, text: str, entity_types: Optional[List[str]] = None ) -> List[EntityInfo]: """ 从文本中提取实体 Args: text: 待提取的文本 entity_types: 指定要提取的实体类型,为空则提取所有类型 Returns: 实体列表 """ if not text or not text.strip(): return [] if self.model_type == "rule": return await self._extract_by_rules(text, entity_types) elif self.model_type == "spacy": return await self._extract_by_spacy(text, entity_types) elif self.model_type == "transformers": return await self._extract_by_transformers(text, entity_types) elif self.model_type == "api": return await self._extract_by_api(text, entity_types) else: logger.warning(f"未知的模型类型: {self.model_type},使用规则模式") return await self._extract_by_rules(text, entity_types) async def _extract_by_rules( self, text: str, entity_types: Optional[List[str]] = None ) -> List[EntityInfo]: """ 基于规则的 NER 提取 用于开发测试阶段,后续可替换为更高级的模型 """ entities = [] # 规则定义 rules = { "DATE": [ # 中文日期格式 r'(\d{4}年\d{1,2}月\d{1,2}日)', r'(\d{4}年\d{1,2}月)', r'(\d{4}-\d{1,2}-\d{1,2})', r'(\d{4}/\d{1,2}/\d{1,2})', ], "NUMBER": [ # 带单位的数值 r'(\d+\.?\d*\s*(?:万元|元|米|公里|千米|平方米|㎡|吨|kg|g|个|台|套|件|次|人|天|小时|分钟|秒|%|百分比))', # 百分比 r'(\d+\.?\d*%)', # 纯数值(较大的数) r'(? 0: context = "..." + context if context_end < len(text): context = context + "..." entity = EntityInfo( name=entity_text, type=entity_type, value=entity_text, position=PositionInfo( char_start=match.start(), char_end=match.end(), line=line_num ), context=context, confidence=0.8, # 规则匹配默认置信度 temp_id=str(uuid.uuid4())[:8] ) entities.append(entity) logger.info(f"规则 NER 提取完成: entity_count={len(entities)}") return entities async def _extract_by_spacy( self, text: str, entity_types: Optional[List[str]] = None ) -> List[EntityInfo]: """ 使用 spaCy 进行 NER 提取 """ # TODO: 实现 spaCy NER logger.warning("spaCy NER 尚未实现,回退到规则模式") return await self._extract_by_rules(text, entity_types) async def _extract_by_transformers( self, text: str, entity_types: Optional[List[str]] = None ) -> List[EntityInfo]: """ 使用 Transformers 模型进行 NER 提取 """ # TODO: 实现 Transformers NER logger.warning("Transformers NER 尚未实现,回退到规则模式") return await self._extract_by_rules(text, entity_types) async def _extract_by_api( self, text: str, entity_types: Optional[List[str]] = None ) -> List[EntityInfo]: """ 调用外部 API 进行 NER 提取 """ # TODO: 实现 API NER(调用 DeepSeek/Qwen) logger.warning("API NER 尚未实现,回退到规则模式") return await self._extract_by_rules(text, entity_types) # 创建单例 ner_service = NerService()