# ner_service.py
  1. """
  2. NER 服务实现
  3. 支持多种模式:
  4. 1. rule - 基于规则的简单 NER(默认,用于开发测试)
  5. 2. spacy - 使用 spaCy 模型
  6. 3. transformers - 使用 Transformers 模型
  7. 4. api - 调用外部 API(如 DeepSeek/Qwen)
  8. """
  9. import re
  10. import uuid
  11. from typing import List, Optional
  12. from loguru import logger
  13. from ..config import settings
  14. from ..models import EntityInfo, PositionInfo


class NerService:
    """NER service."""

    def __init__(self):
        self.model_type = settings.ner_model
        logger.info(f"Initializing NER service: model_type={self.model_type}")

    async def extract_entities(
        self,
        text: str,
        entity_types: Optional[List[str]] = None
    ) -> List[EntityInfo]:
        """
        Extract entities from text.

        Args:
            text: the text to extract entities from
            entity_types: entity types to extract; if empty, all types are extracted

        Returns:
            List of entities
        """
        if not text or not text.strip():
            return []

        if self.model_type == "rule":
            return await self._extract_by_rules(text, entity_types)
        elif self.model_type == "ollama":
            return await self._extract_by_ollama(text, entity_types)
        elif self.model_type == "deepseek":
            return await self._extract_by_deepseek(text, entity_types)
        elif self.model_type == "spacy":
            return await self._extract_by_spacy(text, entity_types)
        elif self.model_type == "transformers":
            return await self._extract_by_transformers(text, entity_types)
        elif self.model_type == "api":
            return await self._extract_by_api(text, entity_types)
        else:
            logger.warning(f"Unknown model type: {self.model_type}; falling back to rule mode")
            return await self._extract_by_rules(text, entity_types)

    async def extract_entities_with_progress(
        self,
        text: str,
        entity_types: Optional[List[str]] = None
    ):
        """
        Extract entities from text, reporting progress along the way
        (a generator intended for SSE streaming responses).

        Yields:
            SSE event strings
        """
        if not text or not text.strip():
            yield f"event: entities_data\ndata: {json.dumps({'entities': [], 'total_entities': 0}, ensure_ascii=False)}\n\n"
            return

        if self.model_type == "deepseek":
            from .deepseek_service import deepseek_service
            async for event in deepseek_service.extract_entities_with_progress(text, entity_types):
                yield event
        else:
            # Other models fall back to plain extraction and return everything at once
            entities = await self.extract_entities(text, entity_types)
            yield f"event: chunk_complete\ndata: {json.dumps({'total_entities': len(entities), 'progress_percent': 100}, ensure_ascii=False)}\n\n"
            # Emit the entity data event
            yield f"event: entities_data\ndata: {json.dumps({'entities': [e.dict() for e in entities], 'total_entities': len(entities)}, ensure_ascii=False)}\n\n"
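
    # SSE event contract for extract_entities_with_progress, inferred from the
    # yields above (the deepseek branch is expected to emit the same shapes):
    #   event: chunk_complete   data: {"total_entities": <int>, "progress_percent": 100}
    #   event: entities_data    data: {"entities": [<entity dict>, ...], "total_entities": <int>}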

    async def _extract_by_rules(
        self,
        text: str,
        entity_types: Optional[List[str]] = None
    ) -> List[EntityInfo]:
        """
        Rule-based NER extraction.

        Intended for the development/testing phase; it can be replaced by a
        more advanced model later.
        """
        entities = []

        # Rule definitions
        rules = {
            "DATE": [
                # Chinese date formats
                r'(\d{4}年\d{1,2}月\d{1,2}日)',
                r'(\d{4}年\d{1,2}月)',
                r'(\d{4}-\d{1,2}-\d{1,2})',
                r'(\d{4}/\d{1,2}/\d{1,2})',
            ],
            "NUMBER": [
                # Numbers with units
                r'(\d+\.?\d*\s*(?:万元|元|米|公里|千米|平方米|㎡|吨|kg|g|个|台|套|件|次|人|天|小时|分钟|秒|%|百分比))',
                # Percentages
                r'(\d+\.?\d*%)',
                # Bare numbers (only fairly large ones)
                r'(?<![a-zA-Z])(\d{4,}(?:\.\d+)?)(?![a-zA-Z])',
            ],
            "ORG": [
                # Institution/company names
                r'([\u4e00-\u9fa5]{2,10}(?:公司|集团|院|所|局|部|厅|委|会|中心|协会|学会|银行|医院|学校|大学|学院))',
                # Province/city/county/district bodies
                r'([\u4e00-\u9fa5]{2,6}(?:省|市|县|区|镇|乡|村)(?:人民)?(?:政府|委员会)?)',
            ],
            "LOC": [
                # Locations
                r'([\u4e00-\u9fa5]{2,6}(?:省|市|县|区|镇|乡|村|路|街|巷|号|楼|栋|单元|室))',
                # Common place-name suffixes
                r'([\u4e00-\u9fa5]{2,8}(?:工业园|开发区|高新区|科技园|产业园))',
            ],
  113. "PERSON": [
  114. # 人名(简单规则:姓+名)
  115. r'(?:(?:张|王|李|赵|刘|陈|杨|黄|周|吴|徐|孙|马|朱|胡|郭|何|林|罗|高|郑|梁|谢|唐|许|邓|冯|韩|曹|曾|彭|萧|蔡|潘|田|董|袁|于|余|叶|蒋|杜|苏|魏|程|吕|丁|沈|任|姚|卢|傅|钟|姜|崔|谭|廖|范|汪|陆|金|石|戴|贾|韦|夏|邱|方|侯|邹|熊|孟|秦|白|江|阎|薛|尹|段|雷|黎|史|龙|陶|贺|顾|毛|郝|龚|邵|万|钱|严|赖|覃|洪|武|莫|孔)[\u4e00-\u9fa5]{1,2})(?:总|经理|主任|工程师|教授|博士|先生|女士|同志)?',
  116. ],
  117. "DEVICE": [
  118. # 设备名称
  119. r'([\u4e00-\u9fa5]{2,10}(?:设备|仪器|仪表|机器|装置|系统|探测器|传感器|检测仪|分析仪|监测仪))',
  120. ],
  121. "PROJECT": [
  122. # 项目名称 - 更严格的规则
  123. # 要求:项目名应该是完整的名词短语,通常有特定前缀
  124. # 带书名号的项目名
  125. r'《([\u4e00-\u9fa5a-zA-Z0-9]{2,30}(?:项目|工程|计划|方案|课题))》',
  126. # 明确的项目编号/名称格式
  127. r'([A-Z0-9\-]+(?:项目|工程))',
  128. # 地名/机构名 + 项目类型(更严格)
  129. r'((?:[\u4e00-\u9fa5]{2,6}(?:省|市|县|区|镇))?[\u4e00-\u9fa5]{2,15}(?:建设|改造|修复|治理|开发|研究|试点|示范)(?:项目|工程))',
  130. # xx项目部/项目组
  131. r'([\u4e00-\u9fa5]{2,15}项目(?:部|组|办))',
  132. ],
  133. }
  134. # 过滤实体类型
  135. if entity_types:
  136. rules = {k: v for k, v in rules.items() if k in entity_types}
  137. # 停用词/无效实体过滤(这些词虽然匹配规则但不是有效实体)
  138. stopwords = {
  139. # 常见无意义匹配
  140. "该项目", "本项目", "此项目", "各项目", "子公司和项目", "认真落实项目",
  141. "开展的培训项目", "年已经开展的培训项目",
  142. "该工程", "本工程", "此工程", "各工程",
  143. "该计划", "本计划", "此计划", "各计划",
  144. "该方案", "本方案", "此方案", "各方案",
  145. # 动词开头的无效匹配
  146. "落实项目", "开展项目", "推进项目", "完成项目", "实施项目",
  147. # 太短的无意义实体
  148. "项目", "工程", "计划", "方案", "课题",
  149. }
  150. # 执行规则匹配
  151. seen_entities = set() # 用于去重
  152. for entity_type, patterns in rules.items():
  153. for pattern in patterns:
  154. for match in re.finditer(pattern, text):
  155. entity_text = match.group(1) if match.groups() else match.group(0)
  156. entity_text = entity_text.strip()
  157. # 跳过停用词
  158. if entity_text in stopwords:
  159. continue
  160. # 跳过太短的实体(少于3个字符)
  161. if len(entity_text) < 3:
  162. continue
  163. # 去重
  164. entity_key = f"{entity_type}:{entity_text}"
  165. if entity_key in seen_entities:
  166. continue
  167. seen_entities.add(entity_key)
  168. # 计算行号
  169. line_num = text[:match.start()].count('\n') + 1
  170. # 获取上下文
  171. context_start = max(0, match.start() - 20)
  172. context_end = min(len(text), match.end() + 20)
  173. context = text[context_start:context_end]
  174. if context_start > 0:
  175. context = "..." + context
  176. if context_end < len(text):
  177. context = context + "..."
  178. entity = EntityInfo(
  179. name=entity_text,
  180. type=entity_type,
  181. value=entity_text,
  182. position=PositionInfo(
  183. char_start=match.start(),
  184. char_end=match.end(),
  185. line=line_num
  186. ),
  187. context=context,
  188. confidence=0.8, # 规则匹配默认置信度
  189. temp_id=str(uuid.uuid4())[:8]
  190. )
  191. entities.append(entity)
  192. logger.info(f"规则 NER 提取完成: entity_count={len(entities)}")
  193. return entities

    async def _extract_by_ollama(
        self,
        text: str,
        entity_types: Optional[List[str]] = None
    ) -> List[EntityInfo]:
        """
        NER extraction with a local Ollama LLM.

        Long texts are chunked automatically.
        """
        try:
            from .ollama_service import ollama_service
            return await ollama_service.extract_entities(text, entity_types)
        except Exception as e:
            logger.error(f"Ollama NER failed: {e}; falling back to rule mode")
            return await self._extract_by_rules(text, entity_types)

    async def _extract_by_deepseek(
        self,
        text: str,
        entity_types: Optional[List[str]] = None
    ) -> List[EntityInfo]:
        """
        NER extraction with the Alibaba Cloud Bailian DeepSeek API.
        """
        try:
            from .deepseek_service import deepseek_service
            return await deepseek_service.extract_entities(text, entity_types)
        except Exception as e:
            logger.error(f"DeepSeek NER failed: {e}; falling back to rule mode")
            return await self._extract_by_rules(text, entity_types)

    async def _extract_by_spacy(
        self,
        text: str,
        entity_types: Optional[List[str]] = None
    ) -> List[EntityInfo]:
        """
        NER extraction with spaCy.
        """
        # TODO: implement spaCy NER
        logger.warning("spaCy NER is not implemented yet; falling back to rule mode")
        return await self._extract_by_rules(text, entity_types)

    async def _extract_by_transformers(
        self,
        text: str,
        entity_types: Optional[List[str]] = None
    ) -> List[EntityInfo]:
        """
        NER extraction with a Transformers model.
        """
        # TODO: implement Transformers NER
        logger.warning("Transformers NER is not implemented yet; falling back to rule mode")
        return await self._extract_by_rules(text, entity_types)

    async def _extract_by_api(
        self,
        text: str,
        entity_types: Optional[List[str]] = None
    ) -> List[EntityInfo]:
        """
        NER extraction through an external API.
        """
        # TODO: implement API-based NER (call DeepSeek/Qwen)
        logger.warning("API NER is not implemented yet; falling back to rule mode")
        return await self._extract_by_rules(text, entity_types)


# Create the module-level singleton
ner_service = NerService()
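

# --- Illustrative usage sketch ---
# A minimal demo of both entry points: the one-shot extract_entities() call
# and the SSE-style progress generator. Assumptions: settings.ner_model is
# "rule" (so no external model or API key is needed), and the module is run
# inside its package, e.g. `python -m <package>.services.ner_service`, because
# of the relative imports above. The sample sentence is made up for the demo.
if __name__ == "__main__":
    import asyncio

    async def _demo() -> None:
        sample = "2024年3月,杭州市某公司启动了西湖生态修复项目,预算500万元。"

        # One-shot extraction: returns a list of EntityInfo objects.
        for entity in await ner_service.extract_entities(sample):
            print(f"{entity.type:8s} {entity.name} (confidence={entity.confidence})")

        # Streaming extraction: yields ready-to-send SSE event strings; a web
        # layer could forward them verbatim as a text/event-stream body.
        async for event in ner_service.extract_entities_with_progress(sample):
            print(event, end="")

    asyncio.run(_demo())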