|
|
@@ -22,7 +22,11 @@ class OllamaService:
|
|
|
self.timeout = settings.ollama_timeout
|
|
|
self.chunk_size = settings.chunk_size
|
|
|
self.chunk_overlap = settings.chunk_overlap
|
|
|
- logger.info(f"初始化 Ollama 服务: url={self.base_url}, model={self.model}")
|
|
|
+
|
|
|
+ # 检测是否使用 UniversalNER
|
|
|
+ self.is_universal_ner = "universal-ner" in self.model.lower()
|
|
|
+
|
|
|
+ logger.info(f"初始化 Ollama 服务: url={self.base_url}, model={self.model}, universal_ner={self.is_universal_ner}")
|
|
|
|
|
|
def _split_text(self, text: str) -> List[Dict[str, Any]]:
|
|
|
"""
|
|
|
@@ -200,10 +204,25 @@ class OllamaService:
|
|
|
使用 Ollama LLM 提取实体
|
|
|
|
|
|
支持长文本自动分块处理
|
|
|
+ 自动检测是否使用 UniversalNER 并切换提取策略
|
|
|
"""
|
|
|
if not text or not text.strip():
|
|
|
return []
|
|
|
|
|
|
+ # 根据模型类型选择提取策略
|
|
|
+ if self.is_universal_ner:
|
|
|
+ return await self._extract_with_universal_ner(text, entity_types)
|
|
|
+ else:
|
|
|
+ return await self._extract_with_general_llm(text, entity_types)
|
|
|
+
|
|
|
+ async def _extract_with_general_llm(
|
|
|
+ self,
|
|
|
+ text: str,
|
|
|
+ entity_types: Optional[List[str]] = None
|
|
|
+ ) -> List[EntityInfo]:
|
|
|
+ """
|
|
|
+ 使用通用 LLM(如 Qwen)提取实体
|
|
|
+ """
|
|
|
# 分割长文本
|
|
|
chunks = self._split_text(text)
|
|
|
|
|
|
@@ -235,9 +254,142 @@ class OllamaService:
|
|
|
|
|
|
logger.info(f"分块 {i+1} 提取实体: {len(entities)} 个")
|
|
|
|
|
|
- logger.info(f"Ollama NER 提取完成: 总实体数={len(all_entities)}")
|
|
|
+ logger.info(f"通用 LLM NER 提取完成: 总实体数={len(all_entities)}")
|
|
|
+ return all_entities
|
|
|
+
|
|
|
async def _extract_with_universal_ner(
    self,
    text: str,
    entity_types: Optional[List[str]] = None
) -> List[EntityInfo]:
    """
    Extract entities from ``text`` using a UniversalNER model.

    The text is split with ``self._split_text``. For every chunk, one
    request per entity type is sent through ``self._call_ollama`` and the
    reply is parsed by ``self._parse_universal_ner_response``. Entities are
    de-duplicated across chunks and types on the key ``"<type>:<name>"``;
    the first occurrence wins.

    Args:
        text: Text to analyse.
        entity_types: Internal type codes (e.g. ``"PERSON"``) to extract.
            ``None`` or an empty list selects every code in the mapping
            below; codes that are not in the mapping are skipped.

    Returns:
        The unique entities, in the order they were first found.
    """
    # Internal type code -> candidate English labels for UniversalNER.
    # Only the first label of each list is used in the prompt.
    type_mapping = {
        "PERSON": ["person", "people", "human"],
        "ORG": ["organization", "company", "institution"],
        "LOC": ["location", "place", "address"],
        "DATE": ["date", "time"],
        "NUMBER": ["number", "quantity", "measurement"],
        "DEVICE": ["device", "equipment", "instrument"],
        "PROJECT": ["project", "program"],
        "METHOD": ["method", "standard", "specification"],
    }

    types_to_extract = entity_types or list(type_mapping.keys())

    # Resolve the (code, English label) pairs once; this does not depend on
    # the chunk, so there is no reason to redo the lookup per chunk.
    prompt_types = [
        (entity_type, type_mapping[entity_type][0])
        for entity_type in types_to_extract
        if entity_type in type_mapping
    ]

    # Split long text into chunks.
    chunks = self._split_text(text)

    all_entities = []
    seen_entities = set()  # "<type>:<name>" keys already collected

    for i, chunk in enumerate(chunks):
        chunk_text = chunk["text"]
        chunk_start = chunk["start_pos"]

        logger.info(f"UniversalNER 处理分块 {i+1}/{len(chunks)}: 长度={len(chunk_text)}")

        # Counted directly: seen_entities holds string keys, so it cannot be
        # used to test entity objects for membership after the fact.
        new_in_chunk = 0

        # One model call per entity type.
        for entity_type, english_type in prompt_types:
            # NOTE(review): the original docstring described the prompt as
            # "<text>. <English type>", but the prompt built here joins the
            # two with a single space - confirm which form the model expects.
            prompt = f"{chunk_text} {english_type}"

            response = await self._call_ollama(prompt)

            if not response:
                continue

            # The reply is expected to look like ["entity1", "entity2"].
            entities = self._parse_universal_ner_response(
                response, entity_type, chunk_text, chunk_start
            )

            for entity in entities:
                entity_key = f"{entity.type}:{entity.name}"
                if entity_key not in seen_entities:
                    seen_entities.add(entity_key)
                    all_entities.append(entity)
                    new_in_chunk += 1

        logger.info(f"分块 {i+1} UniversalNER 提取实体: {new_in_chunk} 个")

    logger.info(f"UniversalNER 提取完成: 总实体数={len(all_entities)}")
    return all_entities
|
|
|
|
|
|
+ def _parse_universal_ner_response(
|
|
|
+ self,
|
|
|
+ response: str,
|
|
|
+ entity_type: str,
|
|
|
+ original_text: str,
|
|
|
+ chunk_start_pos: int = 0
|
|
|
+ ) -> List[EntityInfo]:
|
|
|
+ """
|
|
|
+ 解析 UniversalNER 的响应
|
|
|
+
|
|
|
+ UniversalNER 返回格式: ["实体1", "实体2", ...]
|
|
|
+ """
|
|
|
+ entities = []
|
|
|
+
|
|
|
+ try:
|
|
|
+ # 清理响应,提取 JSON 数组
|
|
|
+ response = response.strip()
|
|
|
+
|
|
|
+ # 尝试找到 JSON 数组
|
|
|
+ json_match = re.search(r'\[[\s\S]*?\]', response)
|
|
|
+ if not json_match:
|
|
|
+ logger.debug(f"UniversalNER 响应中未找到数组: {response[:100]}")
|
|
|
+ return entities
|
|
|
+
|
|
|
+ json_str = json_match.group()
|
|
|
+ entity_names = json.loads(json_str)
|
|
|
+
|
|
|
+ if not isinstance(entity_names, list):
|
|
|
+ return entities
|
|
|
+
|
|
|
+ for name in entity_names:
|
|
|
+ if not isinstance(name, str) or len(name) < 2:
|
|
|
+ continue
|
|
|
+
|
|
|
+ name = name.strip()
|
|
|
+
|
|
|
+ # 在原文中查找位置
|
|
|
+ pos = original_text.find(name)
|
|
|
+ char_start = pos + chunk_start_pos if pos >= 0 else 0
|
|
|
+ char_end = char_start + len(name) if pos >= 0 else 0
|
|
|
+
|
|
|
+ entity = EntityInfo(
|
|
|
+ name=name,
|
|
|
+ type=entity_type,
|
|
|
+ value=name,
|
|
|
+ position=PositionInfo(
|
|
|
+ char_start=char_start,
|
|
|
+ char_end=char_end,
|
|
|
+ line=1
|
|
|
+ ),
|
|
|
+ confidence=0.85, # UniversalNER 置信度
|
|
|
+ temp_id=str(uuid.uuid4())[:8]
|
|
|
+ )
|
|
|
+ entities.append(entity)
|
|
|
+
|
|
|
+ except json.JSONDecodeError as e:
|
|
|
+ logger.debug(f"UniversalNER JSON 解析失败: {e}, response={response[:100]}")
|
|
|
+ except Exception as e:
|
|
|
+ logger.error(f"解析 UniversalNER 响应失败: {e}")
|
|
|
+
|
|
|
+ return entities
|
|
|
+
|
|
|
async def check_health(self) -> bool:
|
|
|
"""
|
|
|
检查 Ollama 服务是否可用
|