| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234 |
"""
NER service implementation using HanLP.

HanLP is a Chinese NLP toolkit that supports high-quality named entity
recognition.
"""
- import uuid
- from typing import List, Optional
- from loguru import logger
- from ..models import EntityInfo, PositionInfo
# Maximum characters per segment (HanLP has limits on long inputs).
MAX_SEGMENT_LENGTH = 500

# Entity types to filter out (these types are usually noise).
FILTER_TYPES = {
    'INTEGER', 'DECIMAL', 'FRACTION', 'ORDINAL', 'CARDINAL',
    'RATE', 'DURATION', 'NUMBER', 'POSTALCODE'
}

# Core entity types to keep.
KEEP_TYPES = {'PERSON', 'ORG', 'LOC', 'DATE', 'TIME', 'MONEY', 'PERCENT'}

# Overly generic entities (blacklist). NOTE: the members are matched against
# model output at runtime and must stay in Chinese.
BLACKLIST_ENTITIES = {
    '公司', '评审组', '部门', '单位', '组织', '机构', '中心', '委员会',
    '第一', '第二', '第三', '第四', '第五', '一级', '二级', '三级',
    '百千万', '十四五', '十三五'
}
class NerService:
    """NER service backed by HanLP's multi-task (MTL) model.

    The model is loaded lazily on first use. Long texts are split into
    segments (HanLP limits input length), each segment is run through the
    model, and the raw output is normalized, filtered against the module's
    type/blacklist rules, de-duplicated, and returned as ``EntityInfo``
    objects with positions relative to the original text.
    """

    def __init__(self):
        # Both handles are populated lazily by _load_model().
        self._hanlp_ner = None
        self._hanlp_tokenizer = None
        logger.info("初始化 NER 服务: model=HanLP")

    def _load_model(self):
        """Lazily load the HanLP model; no-op if already loaded.

        Raises:
            ImportError: if the ``hanlp`` package is not installed.
            Exception: re-raised when model loading fails for any other reason.
        """
        if self._hanlp_ner is not None:
            return

        try:
            import hanlp
            logger.info("正在加载HanLP NER模型...")

            # Use the MTL multi-task model — more stable than single-task NER.
            self._hanlp_ner = hanlp.load(
                hanlp.pretrained.mtl.CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_SMALL_ZH
            )

            logger.info("HanLP NER模型加载完成")
        except ImportError:
            logger.error("HanLP未安装,请运行: pip install hanlp")
            raise
        except Exception as e:
            logger.error(f"HanLP模型加载失败: {e}")
            raise

    def _split_text(self, text: str) -> List[tuple]:
        """Split a long text into segments on line boundaries.

        Args:
            text: Full input text.

        Returns:
            List of ``(segment, offset)`` tuples where ``offset`` is the
            character index of the segment's start within ``text``.

        Note:
            A single line longer than MAX_SEGMENT_LENGTH is emitted as one
            over-length segment (it is never split mid-line).
        """
        segments = []
        lines = text.split('\n')
        current_segment = ""
        current_offset = 0
        segment_start = 0

        for line in lines:
            # +1 accounts for the '\n' that would join the line on.
            if len(current_segment) + len(line) + 1 > MAX_SEGMENT_LENGTH:
                if current_segment:
                    segments.append((current_segment, segment_start))
                current_segment = line
                segment_start = current_offset
            else:
                if current_segment:
                    current_segment += '\n' + line
                else:
                    current_segment = line
                    segment_start = current_offset
            current_offset += len(line) + 1

        if current_segment:
            segments.append((current_segment, segment_start))

        return segments

    async def extract_entities(
        self,
        text: str,
        entity_types: Optional[List[str]] = None
    ) -> List[EntityInfo]:
        """Extract named entities from ``text``.

        Args:
            text: Input text; may be arbitrarily long (segmented internally).
            entity_types: Optional whitelist of mapped entity types; when
                provided, only entities whose mapped type is listed are kept.

        Returns:
            De-duplicated list of ``EntityInfo`` with character positions
            relative to the original ``text``.
        """
        if not text or not text.strip():
            return []

        # Load the model on first use.
        self._load_model()

        # Map HanLP tag variants onto canonical type names.
        # BUGFIX: MONEY/PERCENT now map to themselves. Mapping them to
        # 'NUMBER' (as before) made FILTER_TYPES drop them, contradicting
        # their explicit presence in KEEP_TYPES.
        type_mapping = {
            'PERSON': 'PERSON', 'PER': 'PERSON', 'NR': 'PERSON',
            'ORGANIZATION': 'ORG', 'ORG': 'ORG', 'NT': 'ORG',
            'LOCATION': 'LOC', 'LOC': 'LOC', 'GPE': 'LOC', 'NS': 'LOC',
            'DATE': 'DATE', 'TIME': 'DATE',
            'MONEY': 'MONEY', 'PERCENT': 'PERCENT',
            'QUANTITY': 'NUMBER', 'CARDINAL': 'NUMBER',
        }

        entities: List[EntityInfo] = []
        seen_entities = set()

        # Process the text segment by segment.
        segments = self._split_text(text)
        total_segments = len(segments)
        logger.info(f"开始NER提取: 文本长度={len(text)}, 分段数={total_segments}")

        for seg_idx, (segment, offset) in enumerate(segments):
            if seg_idx % 10 == 0:
                logger.info(f"NER进度: {seg_idx}/{total_segments} 段")

            try:
                # Invoke the HanLP MTL model for the NER task only.
                result = self._hanlp_ner(segment, tasks='ner')

                # MTL models return {'ner/msra': [['text', 'TYPE', start, end], ...]};
                # some configurations return a bare list instead.
                ner_results = []
                if isinstance(result, dict):
                    for key in result:
                        if 'ner' in key.lower():
                            ner_results = result[key]
                            break
                elif isinstance(result, list):
                    ner_results = result

                for item in ner_results:
                    entity_text = None
                    entity_type = None
                    char_start = 0
                    char_end = 0

                    if isinstance(item, (list, tuple)) and len(item) >= 2:
                        entity_text = item[0]
                        entity_type = item[1]
                        if len(item) >= 4:
                            # Model offsets are segment-relative; shift to
                            # absolute positions in the original text.
                            char_start = item[2] + offset
                            char_end = item[3] + offset
                        else:
                            # No offsets supplied: locate first occurrence.
                            pos = segment.find(str(entity_text))
                            char_start = pos + offset if pos >= 0 else offset
                            char_end = char_start + len(str(entity_text))
                    elif isinstance(item, dict):
                        entity_text = item.get('text', item.get('word', ''))
                        entity_type = item.get('type', item.get('label', 'UNKNOWN'))
                        char_start = item.get('start', 0) + offset
                        # BUGFIX: 'end' is segment-relative too, so it must
                        # also be shifted by the segment offset (previously
                        # only the fallback value was absolute).
                        raw_end = item.get('end')
                        if raw_end is not None:
                            char_end = raw_end + offset
                        else:
                            char_end = char_start + len(entity_text)
                    else:
                        continue

                    if not entity_text or not entity_type:
                        continue

                    entity_text = str(entity_text)
                    entity_type = str(entity_type)

                    # Normalize the entity type.
                    mapped_type = type_mapping.get(entity_type.upper(), entity_type.upper())

                    # Drop noisy types (numbers, ordinals, etc.).
                    if mapped_type in FILTER_TYPES or entity_type.upper() in FILTER_TYPES:
                        continue

                    # Keep only the core types.
                    if mapped_type not in KEEP_TYPES and entity_type.upper() not in KEEP_TYPES:
                        continue

                    # Caller-supplied type whitelist.
                    if entity_types and mapped_type not in entity_types:
                        continue

                    # Blacklist of overly generic entities.
                    if entity_text in BLACKLIST_ENTITIES:
                        continue

                    # Skip entities that are too short.
                    if len(entity_text) < 2:
                        continue

                    # Skip pure numbers (allowing '.' and '-').
                    if entity_text.replace('.', '').replace('-', '').isdigit():
                        continue

                    # De-duplicate by text only (type is ignored). Done after
                    # the cheap filters so rejected items never enter the set.
                    if entity_text in seen_entities:
                        continue
                    seen_entities.add(entity_text)

                    # 1-based line number of the entity's start.
                    line_num = text[:char_start].count('\n') + 1 if char_start > 0 else 1

                    # Surrounding context window (20 chars on each side).
                    context_start = max(0, char_start - 20)
                    context_end = min(len(text), char_end + 20)
                    context = text[context_start:context_end]

                    entity = EntityInfo(
                        name=entity_text,
                        type=mapped_type,
                        value=entity_text,
                        position=PositionInfo(
                            char_start=char_start,
                            char_end=char_end,
                            line=line_num
                        ),
                        context=context,
                        confidence=0.9,  # fixed confidence; HanLP exposes no per-entity score here
                        temp_id=str(uuid.uuid4())[:8]
                    )
                    entities.append(entity)

            except Exception as e:
                # Best-effort: a failing segment is logged and skipped so the
                # remaining segments are still processed.
                logger.warning(f"分段 {seg_idx} NER失败: {e}")
                continue

        logger.info(f"HanLP NER 提取完成: entity_count={len(entities)}")
        return entities
# Module-level singleton instance shared by all importers.
ner_service = NerService()
|