| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234 |
"""
NER service implementation using HanLP.

HanLP is a Chinese NLP toolkit that supports high-quality named entity
recognition.
"""
- import uuid
- from typing import List, Optional
- from loguru import logger
- from ..models import EntityInfo, PositionInfo
# Maximum characters per segment (HanLP has limits on long inputs).
MAX_SEGMENT_LENGTH = 500

# Entity types to filter out (these types are usually noise).
FILTER_TYPES = {
    'INTEGER', 'DECIMAL', 'FRACTION', 'ORDINAL', 'CARDINAL',
    'RATE', 'DURATION', 'NUMBER', 'POSTALCODE'
}

# Core entity types to keep.
KEEP_TYPES = {'PERSON', 'ORG', 'LOC', 'DATE', 'TIME', 'MONEY', 'PERCENT'}

# Overly generic entities (blacklist). NOTE: the members are matched against
# model output at runtime and must stay in Chinese.
BLACKLIST_ENTITIES = {
    '公司', '评审组', '部门', '单位', '组织', '机构', '中心', '委员会',
    '第一', '第二', '第三', '第四', '第五', '一级', '二级', '三级',
    '百千万', '十四五', '十三五'
}
class NerService:
    """NER service backed by HanLP's multi-task (MTL) model.

    The model is loaded lazily on first use. Long texts are split into
    segments (HanLP limits input length), each segment is run through the
    model, and the raw output is normalized, filtered against the module's
    type/blacklist rules, de-duplicated, and returned as ``EntityInfo``
    objects with positions relative to the original text.
    """

    def __init__(self):
        # Both handles are populated lazily by _load_model().
        self._hanlp_ner = None
        self._hanlp_tokenizer = None
        logger.info("初始化 NER 服务: model=HanLP")

    def _load_model(self):
        """Lazily load the HanLP model; no-op if already loaded.

        Raises:
            ImportError: if the ``hanlp`` package is not installed.
            Exception: re-raised when model loading fails for any other reason.
        """
        if self._hanlp_ner is not None:
            return

        try:
            import hanlp
            logger.info("正在加载HanLP NER模型...")

            # Use the MTL multi-task model — more stable than single-task NER.
            self._hanlp_ner = hanlp.load(
                hanlp.pretrained.mtl.CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_SMALL_ZH
            )

            logger.info("HanLP NER模型加载完成")
        except ImportError:
            logger.error("HanLP未安装,请运行: pip install hanlp")
            raise
        except Exception as e:
            logger.error(f"HanLP模型加载失败: {e}")
            raise

    def _split_text(self, text: str) -> List[tuple]:
        """Split a long text into segments on line boundaries.

        Args:
            text: Full input text.

        Returns:
            List of ``(segment, offset)`` tuples where ``offset`` is the
            character index of the segment's start within ``text``.

        Note:
            A single line longer than MAX_SEGMENT_LENGTH is emitted as one
            over-length segment (it is never split mid-line).
        """
        segments = []
        lines = text.split('\n')
        current_segment = ""
        current_offset = 0
        segment_start = 0

        for line in lines:
            # +1 accounts for the '\n' that would join the line on.
            if len(current_segment) + len(line) + 1 > MAX_SEGMENT_LENGTH:
                if current_segment:
                    segments.append((current_segment, segment_start))
                current_segment = line
                segment_start = current_offset
            else:
                if current_segment:
                    current_segment += '\n' + line
                else:
                    current_segment = line
                    segment_start = current_offset
            current_offset += len(line) + 1

        if current_segment:
            segments.append((current_segment, segment_start))

        return segments

    async def extract_entities(
        self,
        text: str,
        entity_types: Optional[List[str]] = None
    ) -> List[EntityInfo]:
        """Extract named entities from ``text``.

        Args:
            text: Input text; may be arbitrarily long (segmented internally).
            entity_types: Optional whitelist of mapped entity types; when
                provided, only entities whose mapped type is listed are kept.

        Returns:
            De-duplicated list of ``EntityInfo`` with character positions
            relative to the original ``text``.
        """
        if not text or not text.strip():
            return []

        # Load the model on first use.
        self._load_model()

        # Map HanLP tag variants onto canonical type names.
        # BUGFIX: MONEY/PERCENT now map to themselves. Mapping them to
        # 'NUMBER' (as before) made FILTER_TYPES drop them, contradicting
        # their explicit presence in KEEP_TYPES.
        type_mapping = {
            'PERSON': 'PERSON', 'PER': 'PERSON', 'NR': 'PERSON',
            'ORGANIZATION': 'ORG', 'ORG': 'ORG', 'NT': 'ORG',
            'LOCATION': 'LOC', 'LOC': 'LOC', 'GPE': 'LOC', 'NS': 'LOC',
            'DATE': 'DATE', 'TIME': 'DATE',
            'MONEY': 'MONEY', 'PERCENT': 'PERCENT',
            'QUANTITY': 'NUMBER', 'CARDINAL': 'NUMBER',
        }

        entities: List[EntityInfo] = []
        seen_entities = set()

        # Process the text segment by segment.
        segments = self._split_text(text)
        total_segments = len(segments)
        logger.info(f"开始NER提取: 文本长度={len(text)}, 分段数={total_segments}")

        for seg_idx, (segment, offset) in enumerate(segments):
            if seg_idx % 10 == 0:
                logger.info(f"NER进度: {seg_idx}/{total_segments} 段")

            try:
                # Invoke the HanLP MTL model for the NER task only.
                result = self._hanlp_ner(segment, tasks='ner')

                # MTL models return {'ner/msra': [['text', 'TYPE', start, end], ...]};
                # some configurations return a bare list instead.
                ner_results = []
                if isinstance(result, dict):
                    for key in result:
                        if 'ner' in key.lower():
                            ner_results = result[key]
                            break
                elif isinstance(result, list):
                    ner_results = result

                for item in ner_results:
                    entity_text = None
                    entity_type = None
                    char_start = 0
                    char_end = 0

                    if isinstance(item, (list, tuple)) and len(item) >= 2:
                        entity_text = item[0]
                        entity_type = item[1]
                        if len(item) >= 4:
                            # Model offsets are segment-relative; shift to
                            # absolute positions in the original text.
                            char_start = item[2] + offset
                            char_end = item[3] + offset
                        else:
                            # No offsets supplied: locate first occurrence.
                            pos = segment.find(str(entity_text))
                            char_start = pos + offset if pos >= 0 else offset
                            char_end = char_start + len(str(entity_text))
                    elif isinstance(item, dict):
                        entity_text = item.get('text', item.get('word', ''))
                        entity_type = item.get('type', item.get('label', 'UNKNOWN'))
                        char_start = item.get('start', 0) + offset
                        # BUGFIX: 'end' is segment-relative too, so it must
                        # also be shifted by the segment offset (previously
                        # only the fallback value was absolute).
                        raw_end = item.get('end')
                        if raw_end is not None:
                            char_end = raw_end + offset
                        else:
                            char_end = char_start + len(entity_text)
                    else:
                        continue

                    if not entity_text or not entity_type:
                        continue

                    entity_text = str(entity_text)
                    entity_type = str(entity_type)

                    # Normalize the entity type.
                    mapped_type = type_mapping.get(entity_type.upper(), entity_type.upper())

                    # Drop noisy types (numbers, ordinals, etc.).
                    if mapped_type in FILTER_TYPES or entity_type.upper() in FILTER_TYPES:
                        continue

                    # Keep only the core types.
                    if mapped_type not in KEEP_TYPES and entity_type.upper() not in KEEP_TYPES:
                        continue

                    # Caller-supplied type whitelist.
                    if entity_types and mapped_type not in entity_types:
                        continue

                    # Blacklist of overly generic entities.
                    if entity_text in BLACKLIST_ENTITIES:
                        continue

                    # Skip entities that are too short.
                    if len(entity_text) < 2:
                        continue

                    # Skip pure numbers (allowing '.' and '-').
                    if entity_text.replace('.', '').replace('-', '').isdigit():
                        continue

                    # De-duplicate by text only (type is ignored). Done after
                    # the cheap filters so rejected items never enter the set.
                    if entity_text in seen_entities:
                        continue
                    seen_entities.add(entity_text)

                    # 1-based line number of the entity's start.
                    line_num = text[:char_start].count('\n') + 1 if char_start > 0 else 1

                    # Surrounding context window (20 chars on each side).
                    context_start = max(0, char_start - 20)
                    context_end = min(len(text), char_end + 20)
                    context = text[context_start:context_end]

                    entity = EntityInfo(
                        name=entity_text,
                        type=mapped_type,
                        value=entity_text,
                        position=PositionInfo(
                            char_start=char_start,
                            char_end=char_end,
                            line=line_num
                        ),
                        context=context,
                        confidence=0.9,  # fixed confidence; HanLP exposes no per-entity score here
                        temp_id=str(uuid.uuid4())[:8]
                    )
                    entities.append(entity)

            except Exception as e:
                # Best-effort: a failing segment is logged and skipped so the
                # remaining segments are still processed.
                logger.warning(f"分段 {seg_idx} NER失败: {e}")
                continue

        logger.info(f"HanLP NER 提取完成: entity_count={len(entities)}")
        return entities
# Module-level singleton instance shared by all importers.
ner_service = NerService()
|