"""
NER service implementation backed by HanLP.

HanLP is a Chinese NLP toolkit that provides high-quality named entity
recognition.
"""
import uuid
from typing import List, Optional, Tuple

from loguru import logger

from ..models import EntityInfo, PositionInfo

# Maximum number of characters per segment (HanLP has limits on long text).
MAX_SEGMENT_LENGTH = 500

# Preferred cut points when a single line is longer than MAX_SEGMENT_LENGTH.
_SENTENCE_ENDINGS = '。！？；!?;'

# Entity types that are dropped because they are usually noise.
FILTER_TYPES = {
    'INTEGER', 'DECIMAL', 'FRACTION', 'ORDINAL', 'CARDINAL',
    'RATE', 'DURATION', 'NUMBER', 'POSTALCODE'
}

# Core entity types that are kept.
# NOTE(review): TYPE_MAPPING below turns MONEY and PERCENT into NUMBER, which
# is in FILTER_TYPES, so those two entries can never be emitted. Confirm
# whether that is intended before changing either table.
KEEP_TYPES = {'PERSON', 'ORG', 'LOC', 'DATE', 'TIME', 'MONEY', 'PERCENT'}

# Entity texts that are too generic to be useful (blacklist).
BLACKLIST_ENTITIES = {
    '公司', '评审组', '部门', '单位', '组织', '机构', '中心', '委员会',
    '第一', '第二', '第三', '第四', '第五', '一级', '二级', '三级',
    '百千万', '十四五', '十三五'
}

# Mapping from model labels to the labels exposed by this service.
TYPE_MAPPING = {
    'PERSON': 'PERSON',
    'PER': 'PERSON',
    'NR': 'PERSON',
    'ORGANIZATION': 'ORG',
    'ORG': 'ORG',
    'NT': 'ORG',
    'LOCATION': 'LOC',
    'LOC': 'LOC',
    'GPE': 'LOC',
    'NS': 'LOC',
    'DATE': 'DATE',
    'TIME': 'DATE',
    'MONEY': 'NUMBER',
    'PERCENT': 'NUMBER',
    'QUANTITY': 'NUMBER',
    'CARDINAL': 'NUMBER',
}


class NerService:
    """NER service based on HanLP.

    The model is loaded lazily on the first call to ``extract_entities``.
    """

    def __init__(self):
        self._hanlp_ner = None
        self._hanlp_tokenizer = None
        logger.info("初始化 NER 服务: model=HanLP")

    def _load_model(self):
        """Load the HanLP model on first use; later calls are no-ops.

        Raises:
            ImportError: if the ``hanlp`` package is not installed.
            Exception: any error raised by ``hanlp.load`` is logged and
                re-raised unchanged.
        """
        if self._hanlp_ner is not None:
            return

        try:
            import hanlp
            logger.info("正在加载HanLP NER模型...")
            # Use the multi-task (MTL) model.
            self._hanlp_ner = hanlp.load(
                hanlp.pretrained.mtl.CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_SMALL_ZH
            )
            logger.info("HanLP NER模型加载完成")
        except ImportError:
            logger.error("HanLP未安装,请运行: pip install hanlp")
            raise
        except Exception as e:
            logger.error(f"HanLP模型加载失败: {e}")
            raise

    @staticmethod
    def _split_long_line(line: str, base_offset: int) -> List[Tuple[str, int]]:
        """Cut one overlong line into pieces of at most MAX_SEGMENT_LENGTH.

        Each piece ends right after the last sentence-ending punctuation mark
        inside the window when there is one, otherwise at the window boundary.

        Args:
            line: a single line (no newline characters).
            base_offset: character offset of ``line`` within the full text.

        Returns:
            ``[(piece, offset_in_full_text), ...]`` covering ``line`` exactly.
        """
        pieces = []
        start = 0
        while len(line) - start > MAX_SEGMENT_LENGTH:
            window_end = start + MAX_SEGMENT_LENGTH
            # rfind returns -1 when absent, so cut becomes 0 and we hard-cut.
            cut = max(line.rfind(ch, start, window_end) for ch in _SENTENCE_ENDINGS) + 1
            if cut <= start:
                cut = window_end
            pieces.append((line[start:cut], base_offset + start))
            start = cut
        if start < len(line):
            pieces.append((line[start:], base_offset + start))
        return pieces

    def _split_text(self, text: str) -> List[tuple]:
        """Split ``text`` into segments of bounded length.

        Consecutive lines are packed together (joined by ``'\\n'``) while the
        segment stays within MAX_SEGMENT_LENGTH. A single line longer than the
        limit is cut into several segments of its own.

        Returns:
            ``[(segment, offset), ...]`` where ``offset`` is the index of the
            segment's first character in ``text``.
        """
        segments = []
        current_segment = ""
        segment_start = 0
        current_offset = 0

        for line in text.split('\n'):
            if len(line) > MAX_SEGMENT_LENGTH:
                # Flush what has been packed so far, then emit the overlong
                # line as its own bounded pieces.
                if current_segment:
                    segments.append((current_segment, segment_start))
                    current_segment = ""
                segments.extend(self._split_long_line(line, current_offset))
            elif len(current_segment) + len(line) + 1 > MAX_SEGMENT_LENGTH:
                if current_segment:
                    segments.append((current_segment, segment_start))
                current_segment = line
                segment_start = current_offset
            elif current_segment:
                current_segment += '\n' + line
            else:
                current_segment = line
                segment_start = current_offset
            # +1 accounts for the newline removed by split().
            current_offset += len(line) + 1

        if current_segment:
            segments.append((current_segment, segment_start))

        return segments

    @staticmethod
    def _ner_items(result) -> list:
        """Pull the list of NER items out of a model result.

        A dict result yields the value of its first key containing ``'ner'``
        (case-insensitive); a list result is returned as is; anything else
        yields an empty list.
        """
        if isinstance(result, dict):
            for key in result:
                if 'ner' in key.lower():
                    return result[key]
            return []
        if isinstance(result, list):
            return result
        return []

    @staticmethod
    def _parse_item(item) -> Optional[tuple]:
        """Normalize one NER item to ``(text, label, start, end)``.

        Accepts ``[text, label]`` / ``[text, label, start, end]`` sequences
        and dicts with ``text``/``word``, ``type``/``label``, ``start``,
        ``end`` keys. ``start``/``end`` are segment-relative hints and may be
        ``None``. Returns ``None`` for unsupported or empty items.
        """
        if isinstance(item, (list, tuple)) and len(item) >= 2:
            entity_text, entity_type = item[0], item[1]
            start, end = (item[2], item[3]) if len(item) >= 4 else (None, None)
        elif isinstance(item, dict):
            entity_text = item.get('text', item.get('word', ''))
            entity_type = item.get('type', item.get('label', 'UNKNOWN'))
            start = item.get('start')
            end = item.get('end')
        else:
            return None

        if not entity_text or not entity_type:
            return None

        entity_text = str(entity_text)
        if isinstance(start, int) and end is None:
            end = start + len(entity_text)
        return entity_text, str(entity_type), start, end

    @staticmethod
    def _locate(segment: str, entity_text: str, start, end, search_from: int) -> Optional[Tuple[int, int]]:
        """Find the character span of ``entity_text`` inside ``segment``.

        The offsets reported by the model are only trusted when the slice they
        describe equals the entity text; HanLP MTL NER tuples are understood to
        carry token indices rather than character indices (see the HanLP docs),
        so they usually fail this check. Otherwise the text is searched for,
        first from ``search_from`` (entities arrive in reading order) and then
        from the start of the segment.

        Returns:
            ``(start, end)`` relative to ``segment``, or ``None`` if the text
            does not occur in the segment.
        """
        if (
            isinstance(start, int)
            and isinstance(end, int)
            and 0 <= start < end <= len(segment)
            and segment[start:end] == entity_text
        ):
            return start, end

        pos = segment.find(entity_text, search_from)
        if pos < 0:
            pos = segment.find(entity_text)
        if pos < 0:
            return None
        return pos, pos + len(entity_text)

    async def extract_entities(
        self,
        text: str,
        entity_types: Optional[List[str]] = None
    ) -> List[EntityInfo]:
        """Extract named entities from ``text``.

        The text is split into bounded segments, each segment is run through
        the HanLP model, and the results are mapped, filtered and de-duplicated
        by entity text (the first accepted occurrence wins).

        Args:
            text: the document to analyse.
            entity_types: optional whitelist of mapped types (e.g. ``'ORG'``);
                when given, entities of other types are dropped.

        Returns:
            The accepted entities with document-relative character positions.
            Empty when ``text`` is empty or whitespace only. A segment whose
            processing raises is logged and skipped.

        Raises:
            ImportError / Exception: propagated from model loading.
        """
        if not text or not text.strip():
            return []

        self._load_model()

        entities = []
        seen_entities = set()

        segments = self._split_text(text)
        total_segments = len(segments)
        logger.info(f"开始NER提取: 文本长度={len(text)}, 分段数={total_segments}")

        for seg_idx, (segment, offset) in enumerate(segments):
            if seg_idx % 10 == 0:
                logger.info(f"NER进度: {seg_idx}/{total_segments} 段")

            # Nothing to recognize in a blank segment.
            if not segment.strip():
                continue

            try:
                result = self._hanlp_ner(segment, tasks='ner')

                # Position after the previously located entity in this segment.
                cursor = 0
                for item in self._ner_items(result):
                    parsed = self._parse_item(item)
                    if parsed is None:
                        continue
                    entity_text, entity_type, start, end = parsed

                    span = self._locate(segment, entity_text, start, end, cursor)
                    if span is None:
                        # Text not present verbatim: fall back to segment start.
                        seg_start, seg_end = 0, len(entity_text)
                    else:
                        seg_start, seg_end = span
                        cursor = seg_end
                    char_start = seg_start + offset
                    char_end = seg_end + offset

                    raw_type = entity_type.upper()
                    mapped_type = TYPE_MAPPING.get(raw_type, raw_type)

                    # Drop noise types (numbers, ordinals, ...).
                    if mapped_type in FILTER_TYPES or raw_type in FILTER_TYPES:
                        continue

                    # Keep core types only.
                    if mapped_type not in KEEP_TYPES and raw_type not in KEEP_TYPES:
                        continue

                    # Caller-specified type filter.
                    if entity_types and mapped_type not in entity_types:
                        continue

                    if entity_text in BLACKLIST_ENTITIES:
                        continue

                    # De-duplicate on text only, ignoring the type.
                    if entity_text in seen_entities:
                        continue
                    seen_entities.add(entity_text)

                    # Skip entities that are too short.
                    if len(entity_text) < 2:
                        continue

                    # Skip purely numeric strings.
                    if entity_text.replace('.', '').replace('-', '').isdigit():
                        continue

                    # 1-based line number of the entity start.
                    line_num = text.count('\n', 0, char_start) + 1

                    # Up to 20 characters of context on either side.
                    context_start = max(0, char_start - 20)
                    context_end = min(len(text), char_end + 20)
                    context = text[context_start:context_end]

                    entity = EntityInfo(
                        name=entity_text,
                        type=mapped_type,
                        value=entity_text,
                        position=PositionInfo(
                            char_start=char_start,
                            char_end=char_end,
                            line=line_num
                        ),
                        context=context,
                        confidence=0.9,
                        temp_id=str(uuid.uuid4())[:8]
                    )
                    entities.append(entity)

            except Exception as e:
                # Best effort: one failing segment must not abort the document.
                logger.warning(f"分段 {seg_idx} NER失败: {e}")
                continue

        logger.info(f"HanLP NER 提取完成: entity_count={len(entities)}")
        return entities


# Module-level singleton.
ner_service = NerService()