ollama_service.py 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418
"""
Ollama LLM service.

Calls a locally hosted Ollama model to perform NER (named-entity) extraction.
"""
  5. import json
  6. import re
  7. import uuid
  8. import httpx
  9. from typing import List, Optional, Dict, Any
  10. from loguru import logger
  11. from ..config import settings
  12. from ..models import EntityInfo, PositionInfo
  13. class OllamaService:
  14. """Ollama LLM 服务"""
  15. def __init__(self):
  16. self.base_url = settings.ollama_url
  17. self.model = settings.ollama_model
  18. self.timeout = settings.ollama_timeout
  19. self.chunk_size = settings.chunk_size
  20. self.chunk_overlap = settings.chunk_overlap
  21. # 检测是否使用 UniversalNER
  22. self.is_universal_ner = "universal-ner" in self.model.lower()
  23. logger.info(f"初始化 Ollama 服务: url={self.base_url}, model={self.model}, universal_ner={self.is_universal_ner}")
  24. def _split_text(self, text: str) -> List[Dict[str, Any]]:
  25. """
  26. 将长文本分割成多个块
  27. Args:
  28. text: 原始文本
  29. Returns:
  30. 分块列表,每个块包含 text, start_pos, end_pos
  31. """
  32. if len(text) <= self.chunk_size:
  33. return [{"text": text, "start_pos": 0, "end_pos": len(text)}]
  34. chunks = []
  35. start = 0
  36. while start < len(text):
  37. end = min(start + self.chunk_size, len(text))
  38. # 尝试在句号、换行处分割,避免截断句子
  39. if end < len(text):
  40. # 向前查找最近的分隔符
  41. for sep in ['\n\n', '\n', '。', ';', '!', '?', '.']:
  42. sep_pos = text.rfind(sep, start + self.chunk_size // 2, end)
  43. if sep_pos > start:
  44. end = sep_pos + len(sep)
  45. break
  46. chunk_text = text[start:end]
  47. chunks.append({
  48. "text": chunk_text,
  49. "start_pos": start,
  50. "end_pos": end
  51. })
  52. # 下一个块的起始位置(考虑重叠)
  53. start = end - self.chunk_overlap if end < len(text) else end
  54. logger.info(f"文本分割完成: 总长度={len(text)}, 分块数={len(chunks)}")
  55. return chunks
  56. def _build_ner_prompt(self, text: str, entity_types: Optional[List[str]] = None) -> str:
  57. """
  58. 构建 NER 提取的 Prompt
  59. """
  60. types = entity_types or settings.entity_types
  61. types_desc = ", ".join(types)
  62. # 示例帮助模型理解格式
  63. example = '{"entities": [{"name": "成都市", "type": "LOC", "charStart": 10, "charEnd": 13}, {"name": "2024年5月", "type": "DATE", "charStart": 0, "charEnd": 7}]}'
  64. prompt = f"""从文本中提取命名实体,只输出JSON。
  65. 实体类型: {types_desc}
  66. 输出格式示例:
  67. {example}
  68. 文本内容:
  69. {text}
  70. JSON结果:
  71. ```json"""
  72. return prompt
  73. async def _call_ollama(self, prompt: str) -> Optional[str]:
  74. """
  75. 调用 Ollama API
  76. """
  77. url = f"{self.base_url}/api/generate"
  78. payload = {
  79. "model": self.model,
  80. "prompt": prompt,
  81. "stream": False,
  82. "options": {
  83. "temperature": 0.1, # 低温度,更确定性的输出
  84. "num_predict": 2048, # 最大输出 token
  85. }
  86. }
  87. # Qwen3 特殊处理:使用 /no_think 标签禁用思考模式
  88. if "qwen3" in self.model.lower():
  89. payload["prompt"] = "/no_think\n" + prompt
  90. try:
  91. async with httpx.AsyncClient(timeout=self.timeout) as client:
  92. response = await client.post(url, json=payload)
  93. response.raise_for_status()
  94. result = response.json()
  95. return result.get("response", "")
  96. except httpx.TimeoutException:
  97. logger.error(f"Ollama 请求超时: timeout={self.timeout}s")
  98. return None
  99. except Exception as e:
  100. logger.error(f"Ollama 请求失败: {e}")
  101. return None
  102. def _parse_llm_response(self, response: str, chunk_start_pos: int = 0) -> List[EntityInfo]:
  103. """
  104. 解析 LLM 返回的 JSON 结果
  105. Args:
  106. response: LLM 返回的文本
  107. chunk_start_pos: 当前分块在原文中的起始位置(用于位置校正)
  108. """
  109. entities = []
  110. try:
  111. # Qwen3 可能有 thinking 模式,需要移除 <think>...</think> 部分
  112. response = re.sub(r'<think>[\s\S]*?</think>', '', response)
  113. # 移除 markdown code block 标记
  114. response = re.sub(r'```json\s*', '', response)
  115. response = re.sub(r'```\s*', '', response)
  116. # 查找所有 JSON 对象,取最后一个(通常是实际结果)
  117. json_matches = re.findall(r'\{[^{}]*"entities"[^{}]*\[[\s\S]*?\]\s*\}', response)
  118. if not json_matches:
  119. # 回退:尝试匹配任意 JSON
  120. json_matches = re.findall(r'\{[\s\S]*?\}', response)
  121. if not json_matches:
  122. logger.warning(f"LLM 响应中未找到 JSON, response={response[:300]}...")
  123. return entities
  124. # 尝试解析每个匹配,使用第一个有效的
  125. data = None
  126. for json_str in json_matches:
  127. try:
  128. parsed = json.loads(json_str)
  129. if "entities" in parsed:
  130. data = parsed
  131. break
  132. except json.JSONDecodeError:
  133. continue
  134. if not data:
  135. logger.warning(f"未找到有效的 entities JSON, response={response[:300]}...")
  136. return entities
  137. entity_list = data.get("entities", [])
  138. for item in entity_list:
  139. name = item.get("name", "").strip()
  140. entity_type = item.get("type", "").upper()
  141. char_start = item.get("charStart", 0)
  142. char_end = item.get("charEnd", 0)
  143. if not name or len(name) < 2:
  144. continue
  145. # 校正位置(加上分块的起始位置)
  146. adjusted_start = char_start + chunk_start_pos
  147. adjusted_end = char_end + chunk_start_pos
  148. entity = EntityInfo(
  149. name=name,
  150. type=entity_type,
  151. value=name,
  152. position=PositionInfo(
  153. char_start=adjusted_start,
  154. char_end=adjusted_end,
  155. line=1 # LLM 模式不计算行号
  156. ),
  157. confidence=0.9, # LLM 模式默认较高置信度
  158. temp_id=str(uuid.uuid4())[:8]
  159. )
  160. entities.append(entity)
  161. except json.JSONDecodeError as e:
  162. logger.warning(f"JSON 解析失败: {e}, response={response[:200]}...")
  163. except Exception as e:
  164. logger.error(f"解析 LLM 响应失败: {e}")
  165. return entities
  166. async def extract_entities(
  167. self,
  168. text: str,
  169. entity_types: Optional[List[str]] = None
  170. ) -> List[EntityInfo]:
  171. """
  172. 使用 Ollama LLM 提取实体
  173. 支持长文本自动分块处理
  174. 自动检测是否使用 UniversalNER 并切换提取策略
  175. """
  176. if not text or not text.strip():
  177. return []
  178. # 根据模型类型选择提取策略
  179. if self.is_universal_ner:
  180. return await self._extract_with_universal_ner(text, entity_types)
  181. else:
  182. return await self._extract_with_general_llm(text, entity_types)
  183. async def _extract_with_general_llm(
  184. self,
  185. text: str,
  186. entity_types: Optional[List[str]] = None
  187. ) -> List[EntityInfo]:
  188. """
  189. 使用通用 LLM(如 Qwen)提取实体
  190. """
  191. # 分割长文本
  192. chunks = self._split_text(text)
  193. all_entities = []
  194. seen_entities = set() # 用于去重
  195. for i, chunk in enumerate(chunks):
  196. logger.info(f"处理分块 {i+1}/{len(chunks)}: 长度={len(chunk['text'])}")
  197. # 构建 prompt
  198. prompt = self._build_ner_prompt(chunk["text"], entity_types)
  199. # 调用 Ollama
  200. response = await self._call_ollama(prompt)
  201. if not response:
  202. logger.warning(f"分块 {i+1} Ollama 返回为空")
  203. continue
  204. # 打印前 500 字符用于调试
  205. logger.debug(f"分块 {i+1} LLM 响应: {response[:500]}...")
  206. # 解析结果
  207. entities = self._parse_llm_response(response, chunk["start_pos"])
  208. # 去重
  209. for entity in entities:
  210. entity_key = f"{entity.type}:{entity.name}"
  211. if entity_key not in seen_entities:
  212. seen_entities.add(entity_key)
  213. all_entities.append(entity)
  214. logger.info(f"分块 {i+1} 提取实体: {len(entities)} 个")
  215. logger.info(f"通用 LLM NER 提取完成: 总实体数={len(all_entities)}")
  216. return all_entities
  217. async def _extract_with_universal_ner(
  218. self,
  219. text: str,
  220. entity_types: Optional[List[str]] = None
  221. ) -> List[EntityInfo]:
  222. """
  223. 使用 UniversalNER 模型提取实体
  224. UniversalNER 的 Prompt 格式: "文本内容. 实体类型英文名"
  225. 返回格式: ["实体1", "实体2", ...]
  226. """
  227. # 实体类型映射(中文类型 -> UniversalNER 英文类型)
  228. type_mapping = {
  229. "PERSON": ["person", "people", "human"],
  230. "ORG": ["organization", "company", "institution"],
  231. "LOC": ["location", "place", "address"],
  232. "DATE": ["date", "time"],
  233. "NUMBER": ["number", "quantity", "measurement"],
  234. "DEVICE": ["device", "equipment", "instrument"],
  235. "PROJECT": ["project", "program"],
  236. "METHOD": ["method", "standard", "specification"],
  237. }
  238. types_to_extract = entity_types or list(type_mapping.keys())
  239. # 分割长文本
  240. chunks = self._split_text(text)
  241. all_entities = []
  242. seen_entities = set() # 用于去重
  243. for i, chunk in enumerate(chunks):
  244. chunk_text = chunk["text"]
  245. chunk_start = chunk["start_pos"]
  246. logger.info(f"UniversalNER 处理分块 {i+1}/{len(chunks)}: 长度={len(chunk_text)}")
  247. # 对每种实体类型分别提取
  248. for entity_type in types_to_extract:
  249. if entity_type not in type_mapping:
  250. continue
  251. # 使用第一个英文类型名
  252. english_type = type_mapping[entity_type][0]
  253. # UniversalNER 的 Prompt 格式
  254. prompt = f"{chunk_text} {english_type}"
  255. # 调用 Ollama
  256. response = await self._call_ollama(prompt)
  257. if not response:
  258. continue
  259. # 解析 UniversalNER 响应(返回格式如: ["实体1", "实体2"])
  260. entities = self._parse_universal_ner_response(
  261. response, entity_type, chunk_text, chunk_start
  262. )
  263. # 去重
  264. for entity in entities:
  265. entity_key = f"{entity.type}:{entity.name}"
  266. if entity_key not in seen_entities:
  267. seen_entities.add(entity_key)
  268. all_entities.append(entity)
  269. logger.info(f"分块 {i+1} UniversalNER 提取实体: {len([e for e in all_entities if e not in seen_entities])} 个")
  270. logger.info(f"UniversalNER 提取完成: 总实体数={len(all_entities)}")
  271. return all_entities
  272. def _parse_universal_ner_response(
  273. self,
  274. response: str,
  275. entity_type: str,
  276. original_text: str,
  277. chunk_start_pos: int = 0
  278. ) -> List[EntityInfo]:
  279. """
  280. 解析 UniversalNER 的响应
  281. UniversalNER 返回格式: ["实体1", "实体2", ...]
  282. """
  283. entities = []
  284. try:
  285. # 清理响应,提取 JSON 数组
  286. response = response.strip()
  287. # 尝试找到 JSON 数组
  288. json_match = re.search(r'\[[\s\S]*?\]', response)
  289. if not json_match:
  290. logger.debug(f"UniversalNER 响应中未找到数组: {response[:100]}")
  291. return entities
  292. json_str = json_match.group()
  293. entity_names = json.loads(json_str)
  294. if not isinstance(entity_names, list):
  295. return entities
  296. for name in entity_names:
  297. if not isinstance(name, str) or len(name) < 2:
  298. continue
  299. name = name.strip()
  300. # 在原文中查找位置
  301. pos = original_text.find(name)
  302. char_start = pos + chunk_start_pos if pos >= 0 else 0
  303. char_end = char_start + len(name) if pos >= 0 else 0
  304. entity = EntityInfo(
  305. name=name,
  306. type=entity_type,
  307. value=name,
  308. position=PositionInfo(
  309. char_start=char_start,
  310. char_end=char_end,
  311. line=1
  312. ),
  313. confidence=0.85, # UniversalNER 置信度
  314. temp_id=str(uuid.uuid4())[:8]
  315. )
  316. entities.append(entity)
  317. except json.JSONDecodeError as e:
  318. logger.debug(f"UniversalNER JSON 解析失败: {e}, response={response[:100]}")
  319. except Exception as e:
  320. logger.error(f"解析 UniversalNER 响应失败: {e}")
  321. return entities
  322. async def check_health(self) -> bool:
  323. """
  324. 检查 Ollama 服务是否可用
  325. """
  326. try:
  327. async with httpx.AsyncClient(timeout=5) as client:
  328. response = await client.get(f"{self.base_url}/api/tags")
  329. return response.status_code == 200
  330. except Exception:
  331. return False
# Create the singleton instance (constructed once, when this module is imported).
ollama_service = OllamaService()