
feat: HanLP integration and entity management

- Python NER service:
  * Integrate the HanLP MTL multi-task model for NER extraction
  * Process long text in segments (500 chars per segment)
  * Add entity filtering: drop numbers, ordinals, and other pure-digit noise
  * Keep only core entity types (PERSON/ORG/LOC/DATE/TIME/MONEY/PERCENT)
  * Add a blacklist for overly generic words (公司/评审组/部门, etc.)
  * Add progress logging (one line every 10 segments)
  * Add DOCX table-of-contents extraction and chapter splitting

- Backend Entity API:
  * Add EntityController/EntityService/EntityDTO
  * Implement entity CRUD and batch-create endpoints
  * Store entities in the nodes table using an EAV model
  * Support project-entity association (PROJECT_ENTITY edges)

- Frontend entity saving:
  * Add entityApi module
  * Auto-save entities to the database after DOCX parsing completes
  * Entity format conversion and error handling
何文松 2 days ago
Parent
Commit
a1e584256d

+ 56 - 0
backend/lingyue-project/src/main/java/com/lingyue/project/entity/controller/EntityController.java

@@ -0,0 +1,56 @@
+package com.lingyue.project.entity.controller;
+
+import com.lingyue.common.core.Result;
+import com.lingyue.project.entity.dto.EntityDTO;
+import com.lingyue.project.entity.dto.EntityCreateRequest;
+import com.lingyue.project.entity.service.EntityService;
+import lombok.RequiredArgsConstructor;
+import org.springframework.web.bind.annotation.*;
+
+import java.util.List;
+
+@RestController
+@RequestMapping("/api/v1")
+@RequiredArgsConstructor
+public class EntityController {
+
+    private final EntityService entityService;
+
+    @GetMapping("/projects/{projectId}/entities")
+    public Result<List<EntityDTO>> listByProject(@PathVariable Long projectId) {
+        return Result.ok(entityService.listByProject(projectId));
+    }
+
+    @GetMapping("/entities/{entityId}")
+    public Result<EntityDTO> getById(@PathVariable Long entityId) {
+        return Result.ok(entityService.getById(entityId));
+    }
+
+    @PostMapping("/projects/{projectId}/entities")
+    public Result<EntityDTO> create(@PathVariable Long projectId, @RequestBody EntityCreateRequest request) {
+        return Result.ok(entityService.create(projectId, request));
+    }
+
+    @PostMapping("/projects/{projectId}/entities/batch")
+    public Result<Integer> batchCreate(@PathVariable Long projectId, @RequestBody List<EntityCreateRequest> requests) {
+        int count = entityService.batchCreate(projectId, requests);
+        return Result.ok(count);
+    }
+
+    @PutMapping("/entities/{entityId}")
+    public Result<EntityDTO> update(@PathVariable Long entityId, @RequestBody EntityCreateRequest request) {
+        return Result.ok(entityService.update(entityId, request));
+    }
+
+    @DeleteMapping("/entities/{entityId}")
+    public Result<Void> delete(@PathVariable Long entityId) {
+        entityService.delete(entityId);
+        return Result.ok();
+    }
+
+    @DeleteMapping("/projects/{projectId}/entities")
+    public Result<Integer> deleteByProject(@PathVariable Long projectId) {
+        int count = entityService.deleteByProject(projectId);
+        return Result.ok(count);
+    }
+}
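
For a quick smoke test of the new endpoints, something like the sketch below works; the host/port and the JSON envelope that Result serializes to are assumptions here, not part of this commit:

# Hypothetical client for the entity endpoints (host/port assumed).
import requests

BASE = "http://localhost:8080/api/v1"
project_id = 1

# Batch-create entities in the shape EntityCreateRequest expects
entities = [
    {"name": "中国电建集团", "entityType": "ORG", "value": "中国电建集团", "confidence": 0.95},
    {"name": "2024年1月1日", "entityType": "DATE", "value": "2024年1月1日", "confidence": 0.9},
]
created = requests.post(f"{BASE}/projects/{project_id}/entities/batch", json=entities)
print(created.json())  # wraps the number of entities actually created

# Read them back
listed = requests.get(f"{BASE}/projects/{project_id}/entities")
print(listed.json())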

+ 12 - 0
backend/lingyue-project/src/main/java/com/lingyue/project/entity/dto/EntityCreateRequest.java

@@ -0,0 +1,12 @@
+package com.lingyue.project.entity.dto;
+
+import lombok.Data;
+
+@Data
+public class EntityCreateRequest {
+    private String name;
+    private String entityType;
+    private String value;
+    private Double confidence;
+    private String position;
+}

+ 17 - 0
backend/lingyue-project/src/main/java/com/lingyue/project/entity/dto/EntityDTO.java

@@ -0,0 +1,17 @@
+package com.lingyue.project.entity.dto;
+
+import lombok.Data;
+import java.time.LocalDateTime;
+
+@Data
+public class EntityDTO {
+    private Long id;
+    private String name;
+    private String entityType;
+    private String value;
+    private Double confidence;
+    private String position;
+    private Long projectId;
+    private LocalDateTime createdAt;
+    private LocalDateTime updatedAt;
+}

+ 198 - 0
backend/lingyue-project/src/main/java/com/lingyue/project/entity/service/EntityService.java

@@ -0,0 +1,198 @@
+package com.lingyue.project.entity.service;
+
+import com.lingyue.project.entity.dto.EntityDTO;
+import com.lingyue.project.entity.dto.EntityCreateRequest;
+import lombok.RequiredArgsConstructor;
+import org.springframework.jdbc.core.JdbcTemplate;
+import org.springframework.jdbc.core.RowMapper;
+import org.springframework.stereotype.Service;
+import org.springframework.transaction.annotation.Transactional;
+
+import java.sql.ResultSet;
+import java.sql.SQLException;
+import java.time.LocalDateTime;
+import java.util.List;
+
+@Service
+@RequiredArgsConstructor
+public class EntityService {
+
+    private final JdbcTemplate jdbcTemplate;
+
+    private static final String NODE_TYPE = "ENTITY";
+
+    public List<EntityDTO> listByProject(Long projectId) {
+        String sql = """
+            SELECT n.id, n.name, n.node_key, n.created_at, n.updated_at,
+                   MAX(CASE WHEN np.property_key = 'entityType' THEN np.property_value END) as entity_type,
+                   MAX(CASE WHEN np.property_key = 'value' THEN np.property_value END) as value,
+                   MAX(CASE WHEN np.property_key = 'confidence' THEN np.property_value END) as confidence,
+                   MAX(CASE WHEN np.property_key = 'position' THEN np.property_value END) as position
+            FROM nodes n
+            LEFT JOIN node_properties np ON n.id = np.node_id
+            WHERE n.node_type = ?
+              AND EXISTS (
+                  SELECT 1 FROM edges e 
+                  WHERE e.edge_type = 'PROJECT_ENTITY' 
+                    AND e.from_node_id = ? 
+                    AND e.to_node_id = n.id
+              )
+            GROUP BY n.id, n.name, n.node_key, n.created_at, n.updated_at
+            ORDER BY n.created_at DESC
+            """;
+        return jdbcTemplate.query(sql, new EntityRowMapper(projectId), NODE_TYPE, projectId);
+    }
+
+    public EntityDTO getById(Long entityId) {
+        String sql = """
+            SELECT n.id, n.name, n.node_key, n.created_at, n.updated_at,
+                   MAX(CASE WHEN np.property_key = 'entityType' THEN np.property_value END) as entity_type,
+                   MAX(CASE WHEN np.property_key = 'value' THEN np.property_value END) as value,
+                   MAX(CASE WHEN np.property_key = 'confidence' THEN np.property_value END) as confidence,
+                   MAX(CASE WHEN np.property_key = 'position' THEN np.property_value END) as position
+            FROM nodes n
+            LEFT JOIN node_properties np ON n.id = np.node_id
+            WHERE n.id = ? AND n.node_type = ?
+            GROUP BY n.id, n.name, n.node_key, n.created_at, n.updated_at
+            """;
+        List<EntityDTO> results = jdbcTemplate.query(sql, new EntityRowMapper(null), entityId, NODE_TYPE);
+        return results.isEmpty() ? null : results.get(0);
+    }
+
+    @Transactional
+    public EntityDTO create(Long projectId, EntityCreateRequest request) {
+        // Build the node_key
+        String nodeKey = request.getEntityType() + ":" + request.getName();
+        
+        // Check whether the entity already exists
+        String checkSql = "SELECT id FROM nodes WHERE node_type = ? AND node_key = ?";
+        List<Long> existing = jdbcTemplate.queryForList(checkSql, Long.class, NODE_TYPE, nodeKey);
+        if (!existing.isEmpty()) {
+            // Already exists; return the existing entity
+            return getById(existing.get(0));
+        }
+        
+        // Create the node
+        String insertNode = "INSERT INTO nodes (node_type, node_key, name, status) VALUES (?, ?, ?, 'active') RETURNING id";
+        Long nodeId = jdbcTemplate.queryForObject(insertNode, Long.class, NODE_TYPE, nodeKey, request.getName());
+        
+        // Create the properties
+        saveProperty(nodeId, "entityType", request.getEntityType());
+        saveProperty(nodeId, "value", request.getValue());
+        if (request.getConfidence() != null) {
+            saveProperty(nodeId, "confidence", String.valueOf(request.getConfidence()));
+        }
+        if (request.getPosition() != null) {
+            saveProperty(nodeId, "position", request.getPosition());
+        }
+        
+        // Create the project-entity edge
+        String insertEdge = "INSERT INTO edges (edge_type, from_node_id, to_node_id) VALUES ('PROJECT_ENTITY', ?, ?)";
+        jdbcTemplate.update(insertEdge, projectId, nodeId);
+        
+        return getById(nodeId);
+    }
+
+    @Transactional
+    public int batchCreate(Long projectId, List<EntityCreateRequest> requests) {
+        int count = 0;
+        for (EntityCreateRequest request : requests) {
+            try {
+                create(projectId, request);
+                count++;
+            } catch (Exception e) {
+                // Ignore duplicate-entity errors and keep going
+            }
+        }
+        return count;
+    }
+
+    @Transactional
+    public EntityDTO update(Long entityId, EntityCreateRequest request) {
+        // Update the node name
+        String updateNode = "UPDATE nodes SET name = ?, updated_at = NOW() WHERE id = ?";
+        jdbcTemplate.update(updateNode, request.getName(), entityId);
+        
+        // Update the properties
+        updateProperty(entityId, "entityType", request.getEntityType());
+        updateProperty(entityId, "value", request.getValue());
+        if (request.getConfidence() != null) {
+            updateProperty(entityId, "confidence", String.valueOf(request.getConfidence()));
+        }
+        if (request.getPosition() != null) {
+            updateProperty(entityId, "position", request.getPosition());
+        }
+        
+        return getById(entityId);
+    }
+
+    @Transactional
+    public void delete(Long entityId) {
+        // Delete the properties
+        jdbcTemplate.update("DELETE FROM node_properties WHERE node_id = ?", entityId);
+        // Delete the edges
+        jdbcTemplate.update("DELETE FROM edges WHERE from_node_id = ? OR to_node_id = ?", entityId, entityId);
+        // Delete the node
+        jdbcTemplate.update("DELETE FROM nodes WHERE id = ?", entityId);
+    }
+
+    @Transactional
+    public int deleteByProject(Long projectId) {
+        // Collect all entity IDs under the project
+        String sql = """
+            SELECT e.to_node_id FROM edges e 
+            WHERE e.edge_type = 'PROJECT_ENTITY' AND e.from_node_id = ?
+            """;
+        List<Long> entityIds = jdbcTemplate.queryForList(sql, Long.class, projectId);
+        
+        for (Long entityId : entityIds) {
+            delete(entityId);
+        }
+        
+        return entityIds.size();
+    }
+
+    private void saveProperty(Long nodeId, String key, String value) {
+        if (value == null) return;
+        String sql = "INSERT INTO node_properties (node_id, property_key, property_value) VALUES (?, ?, ?)";
+        jdbcTemplate.update(sql, nodeId, key, value);
+    }
+
+    private void updateProperty(Long nodeId, String key, String value) {
+        if (value == null) return;
+        String checkSql = "SELECT COUNT(*) FROM node_properties WHERE node_id = ? AND property_key = ?";
+        int count = jdbcTemplate.queryForObject(checkSql, Integer.class, nodeId, key);
+        if (count > 0) {
+            jdbcTemplate.update("UPDATE node_properties SET property_value = ? WHERE node_id = ? AND property_key = ?", 
+                value, nodeId, key);
+        } else {
+            saveProperty(nodeId, key, value);
+        }
+    }
+
+    private static class EntityRowMapper implements RowMapper<EntityDTO> {
+        private final Long projectId;
+        
+        EntityRowMapper(Long projectId) {
+            this.projectId = projectId;
+        }
+        
+        @Override
+        public EntityDTO mapRow(ResultSet rs, int rowNum) throws SQLException {
+            EntityDTO dto = new EntityDTO();
+            dto.setId(rs.getLong("id"));
+            dto.setName(rs.getString("name"));
+            dto.setEntityType(rs.getString("entity_type"));
+            dto.setValue(rs.getString("value"));
+            String confidence = rs.getString("confidence");
+            if (confidence != null) {
+                dto.setConfidence(Double.parseDouble(confidence));
+            }
+            dto.setPosition(rs.getString("position"));
+            dto.setProjectId(projectId);
+            dto.setCreatedAt(rs.getTimestamp("created_at").toLocalDateTime());
+            dto.setUpdatedAt(rs.getTimestamp("updated_at").toLocalDateTime());
+            return dto;
+        }
+    }
+}
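
To make the EAV layout concrete: one create() call touches three tables. The sketch below shows the rows it writes for a single entity, using the column names from the SQL above (IDs and values invented). Note that node_key uniqueness is checked globally, so a second create() with the same type and name returns the existing node instead of inserting a duplicate.

# Illustration only: rows written by create(projectId=1, request).
nodes_row = {
    "id": 42,                              # invented
    "node_type": "ENTITY",
    "node_key": "ORG:中国电建集团",          # entityType + ":" + name, used for dedup
    "name": "中国电建集团",
    "status": "active",
}

node_properties_rows = [
    {"node_id": 42, "property_key": "entityType", "property_value": "ORG"},
    {"node_id": 42, "property_key": "value",      "property_value": "中国电建集团"},
    {"node_id": 42, "property_key": "confidence", "property_value": "0.95"},
]

# Edge scoping the entity to project 1
edges_row = {"edge_type": "PROJECT_ENTITY", "from_node_id": 1, "to_node_id": 42}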

+ 32 - 0
frontend/vue-demo/src/api/index.js

@@ -137,6 +137,38 @@ export const elementApi = {
   }
 }
 
+// ==================== Entity API ====================
+
+export const entityApi = {
+  list(projectId) {
+    return api.get(`/projects/${projectId}/entities`)
+  },
+
+  getById(entityId) {
+    return api.get(`/entities/${entityId}`)
+  },
+
+  create(projectId, data) {
+    return api.post(`/projects/${projectId}/entities`, data)
+  },
+
+  batchCreate(projectId, entities) {
+    return api.post(`/projects/${projectId}/entities/batch`, entities)
+  },
+
+  update(entityId, data) {
+    return api.put(`/entities/${entityId}`, data)
+  },
+
+  delete(entityId) {
+    return api.delete(`/entities/${entityId}`)
+  },
+
+  deleteByProject(projectId) {
+    return api.delete(`/projects/${projectId}/entities`)
+  }
+}
+
 // ==================== Element Value API ====================
 
 export const valueApi = {

+ 25 - 32
frontend/vue-demo/src/views/Editor.vue

@@ -1061,7 +1061,7 @@ import { ref, reactive, computed, watch, onMounted } from 'vue'
 import { useRouter, useRoute } from 'vue-router'
 import { Plus, Delete, Search, Loading, Check, CopyDocument, MoreFilled, List, Folder, Document, Grid, Setting, Paperclip, Upload } from '@element-plus/icons-vue'
 import { ElMessage, ElMessageBox } from 'element-plus'
-import { projectApi, elementApi, valueApi, attachmentApi, ruleApi, parseApi, extractApi } from '@/api'
+import { projectApi, elementApi, valueApi, attachmentApi, ruleApi, parseApi, extractApi, entityApi } from '@/api'
 import { marked } from 'marked'
 import JSZip from 'jszip'
 import { useTaskCenterStore } from '@/stores/taskCenter'
@@ -2052,50 +2052,43 @@ async function handleCreateProject() {
         
         if (result.success) {
           docxParseProgress.value = 92
-          docxParseMessage.value = '正在保存要素...'
+          docxParseMessage.value = '正在保存...'
           
           // Save doc_content to the project
           if (result.doc_content) {
             await projectApi.saveDocContent(project.id, result.doc_content)
           }
           
-          // Create the element definitions first, then save the element values
-          if (result.elements && result.elements.length > 0) {
-            for (const elem of result.elements) {
-              try {
-                await elementApi.add(project.id, {
-                  elementKey: elem.elementKey,
-                  elementName: elem.elementName,
-                  elementType: elem.elementType || 'text',
-                  namespace: elem.namespace || 'project',
-                  sortOrder: elem.sortOrder || 0
-                })
-              } catch (e) {
-                // The element may already exist; ignore the error
-              }
-            }
-          }
+          // Save the extracted entities
+          const entityCount = result.entities?.length || 0
+          const llmCount = result.llm_extractions?.length || 0
+          console.log(`NER提取: ${entityCount} 个实体`)
+          console.log(`LLM提取: ${llmCount} 个内容`)
           
-          // Save the extracted element values
-          if (result.values && result.values.length > 0) {
-            let savedCount = 0
-            for (const val of result.values) {
-              if (val.isFilled && val.valueText) {
-                try {
-                  await valueApi.update(project.id, val.elementKey, { valueText: val.valueText })
-                  savedCount++
-                } catch (e) {
-                  console.warn('保存要素值失败:', val.elementKey)
-                }
-              }
+          if (result.entities && result.entities.length > 0) {
+            docxParseMessage.value = `正在保存 ${entityCount} 个实体...`
+            try {
+              // Convert to the format the backend expects
+              const entitiesToSave = result.entities.map(e => ({
+                name: e.text,
+                entityType: e.type,
+                value: e.text,
+                confidence: e.confidence || 0.9,
+                position: e.position ? JSON.stringify(e.position) : null
+              }))
+              await entityApi.batchCreate(project.id, entitiesToSave)
+              console.log(`已保存 ${entityCount} 个实体到数据库`)
+            } catch (saveError) {
+              console.error('保存实体失败:', saveError)
+              // Don't block the flow; continue
             }
           }
           
           docxParseProgress.value = 100
           docxParseStatus.value = 'success'
-          docxParseMessage.value = `完成!提取 ${result.statistics?.filled_values || 0} 个要素`
+          docxParseMessage.value = `完成!识别 ${entityCount} 个实体`
           
-          ElMessage.success(`解析完成,已提取 ${result.statistics?.filled_values || 0} 个要素`)
+          ElMessage.success(`解析完成,识别 ${entityCount} 个实体`)
           
           // Switch to this project and refresh its data
           await switchProject(project)

+ 118 - 9
python-services/ner-service/app/routers/extract.py

@@ -14,7 +14,7 @@ from concurrent.futures import ThreadPoolExecutor
 
 from loguru import logger
 
-from ..services.docx_parser import parse_docx_file, blocks_to_text
+from ..services.docx_parser import parse_docx_file, parse_docx_with_chapters, blocks_to_text, split_by_chapters
 from ..services.element_extractor import element_extractor
 
 router = APIRouter()
@@ -37,8 +37,8 @@ class ExtractResponse(BaseModel):
     """提取响应"""
     success: bool
     doc_content: Optional[Dict[str, Any]] = None
-    elements: List[Dict] = []
-    values: List[Dict] = []
+    entities: List[Dict] = []  # entities recognized by NER
+    llm_extractions: List[Dict] = []  # content extracted by the LLM
     statistics: Dict[str, Any] = {}
     processing_time_ms: int = 0
     error: Optional[str] = None
@@ -105,8 +105,8 @@ async def extract_from_docx(
         return ExtractResponse(
             success=True,
             doc_content=doc_content,
-            elements=result['elements'],
-            values=result['values'],
+            entities=result['entities'],
+            llm_extractions=result['llm_extractions'],
             statistics=result['statistics'],
             processing_time_ms=processing_time
         )
@@ -211,8 +211,8 @@ async def _process_docx_task(
         _tasks[task_id]["message"] = "处理完成"
         _tasks[task_id]["result"] = {
             "doc_content": doc_content,
-            "elements": result['elements'],
-            "values": result['values'],
+            "entities": result['entities'],
+            "llm_extractions": result['llm_extractions'],
             "statistics": result['statistics'],
             "processing_time_ms": processing_time
         }
@@ -272,8 +272,8 @@ async def extract_from_text(request: ExtractFromTextRequest):
         
         return ExtractResponse(
             success=True,
-            elements=result['elements'],
-            values=result['values'],
+            entities=result['entities'],
+            llm_extractions=result['llm_extractions'],
             statistics=result['statistics'],
             processing_time_ms=processing_time
         )
@@ -284,3 +284,112 @@ async def extract_from_text(request: ExtractFromTextRequest):
             error=str(e),
             processing_time_ms=int((time.time() - start_time) * 1000)
         )
+
+
+# ============================================================
+# Chapter-based extraction endpoint (optimized for large documents)
+# ============================================================
+
+class ChapterExtractResponse(BaseModel):
+    """Chapter-based extraction response"""
+    success: bool
+    doc_content: Optional[Dict[str, Any]] = None
+    toc: List[Dict] = []  # TOC structure
+    chapters: List[Dict] = []  # chapter info
+    entities: List[Dict] = []  # deduplicated entities
+    chapter_entities: Dict[str, Any] = {}  # entities grouped by chapter
+    llm_extractions: List[Dict] = []
+    statistics: Dict[str, Any] = {}
+    processing_time_ms: int = 0
+    error: Optional[str] = None
+
+
+@router.post("/from-docx/chapters", response_model=ChapterExtractResponse)
+async def extract_from_docx_by_chapters(
+    file: UploadFile = File(...),
+    attachment_id: int = Form(default=0),
+    use_llm: bool = Form(default=False),
+    parallel: bool = Form(default=True)
+):
+    """
+    Chapter-based extraction: parse DOCX → split into chapters → extract in parallel → dedup and merge
+    
+    Intended for large documents (>50 pages). It can:
+    1. Extract the document's TOC structure
+    2. Extract entities chapter by chapter in parallel
+    3. Dedup and merge automatically
+    
+    Args:
+        file: the DOCX file
+        attachment_id: attachment ID
+        use_llm: whether to use LLM extraction
+        parallel: whether to process chapters in parallel
+    """
+    start_time = time.time()
+    
+    if not file.filename.lower().endswith('.docx'):
+        raise HTTPException(status_code=400, detail="仅支持.docx文件")
+    
+    content = await file.read()
+    
+    try:
+        logger.info(f"分章节处理文件: {file.filename}, size={len(content)}")
+        
+        # 1. Parse the DOCX and split it into chapters
+        loop = asyncio.get_event_loop()
+        doc_result = await loop.run_in_executor(_executor, parse_docx_with_chapters, content)
+        
+        toc = doc_result.get('toc', [])
+        chapters = doc_result.get('chapters', [])
+        
+        logger.info(f"解析完成: {doc_result['totalBlocks']} 个块, {len(toc)} 个目录项, {len(chapters)} 个章节")
+        
+        # 2. Rebuild chapter data (parse_docx_with_chapters drops blocks but keeps text)
+        # Re-split to obtain the complete chapter data
+        full_chapters = split_by_chapters(doc_result['blocks'])
+        
+        # 3. Extract entities chapter by chapter
+        extract_result = await element_extractor.extract_from_chapters(
+            chapters=full_chapters,
+            attachment_id=attachment_id,
+            use_llm=use_llm,
+            parallel=parallel
+        )
+        
+        processing_time = int((time.time() - start_time) * 1000)
+        
+        # Slim down the chapter info (don't return the full text)
+        chapters_info = []
+        for ch in full_chapters:
+            chapters_info.append({
+                'chapter_id': ch['chapter_id'],
+                'title': ch['title'],
+                'level': ch['level'],
+                'text_length': len(ch.get('text', '')),
+                'entity_count': len(extract_result['chapter_entities'].get(ch['chapter_id'], {}).get('entities', []))
+            })
+        
+        logger.info(f"分章节提取完成: {extract_result['statistics']}, 耗时: {processing_time}ms")
+        
+        return ChapterExtractResponse(
+            success=True,
+            doc_content={
+                'page': doc_result.get('page'),
+                'totalBlocks': doc_result.get('totalBlocks')
+            },
+            toc=toc,
+            chapters=chapters_info,
+            entities=extract_result['entities'],
+            chapter_entities=extract_result['chapter_entities'],
+            llm_extractions=extract_result['llm_extractions'],
+            statistics=extract_result['statistics'],
+            processing_time_ms=processing_time
+        )
+        
+    except Exception as e:
+        logger.error(f"分章节提取失败: {e}", exc_info=True)
+        return ChapterExtractResponse(
+            success=False,
+            error=str(e),
+            processing_time_ms=int((time.time() - start_time) * 1000)
+        )
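
A minimal sketch of calling the new route; the port and any router prefix are assumptions, only the "/from-docx/chapters" path itself is defined in this commit:

# Hypothetical client for the chapter-based extraction endpoint.
import requests

with open("report.docx", "rb") as f:
    resp = requests.post(
        "http://localhost:8000/from-docx/chapters",  # prefix/port assumed
        files={"file": ("report.docx", f)},
        data={"attachment_id": 0, "use_llm": False, "parallel": True},
    )

result = resp.json()
if result["success"]:
    print(result["statistics"])        # chapter/entity counts
    for ch in result["chapters"]:      # slimmed-down chapter info
        print(ch["chapter_id"], ch["title"], ch["entity_count"])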

+ 170 - 0
python-services/ner-service/app/services/docx_parser.py

@@ -333,3 +333,173 @@ def extract_tables_from_blocks(blocks: List[Dict]) -> List[Dict]:
                 'table': block.get('table', {})
             })
     return tables
+
+
+def extract_toc(blocks: List[Dict]) -> List[Dict]:
+    """
+    Extract the table of contents (TOC) structure from the blocks
+    
+    Returns:
+        [
+            {
+                "level": 1,
+                "title": "第一章 概述",
+                "block_index": 5,
+                "block_id": "b5"
+            },
+            ...
+        ]
+    """
+    toc = []
+    
+    for i, block in enumerate(blocks):
+        block_type = block.get('type', '')
+        
+        # Detect heading-type blocks
+        if block_type.startswith('heading'):
+            level = int(block_type.replace('heading', '') or '1')
+            runs = block.get('runs', [])
+            title = ''.join(r.get('text', '') for r in runs).strip()
+            
+            if title:
+                toc.append({
+                    'level': level,
+                    'title': title,
+                    'block_index': i,
+                    'block_id': block.get('id', f'b{i}')
+                })
+        
+        # Detect TOC-style blocks
+        elif block_type.startswith('toc'):
+            level = int(block_type.replace('toc', '') or '1')
+            runs = block.get('runs', [])
+            title = ''.join(r.get('text', '') for r in runs).strip()
+            # TOC entries usually carry a page number; strip trailing digits
+            title = re.sub(r'\s*\d+\s*$', '', title).strip()
+            
+            if title:
+                toc.append({
+                    'level': level,
+                    'title': title,
+                    'block_index': i,
+                    'block_id': block.get('id', f'b{i}'),
+                    'is_toc_entry': True  # mark as a TOC entry
+                })
+    
+    return toc
+
+
+def split_by_chapters(blocks: List[Dict], toc: List[Dict] = None) -> List[Dict]:
+    """
+    Split the document into chapters based on the TOC/headings
+    
+    Args:
+        blocks: list of document blocks
+        toc: TOC structure (optional; auto-extracted when omitted)
+        
+    Returns:
+        [
+            {
+                "chapter_id": "ch0",
+                "title": "前言",
+                "level": 0,
+                "start_index": 0,
+                "end_index": 10,
+                "blocks": [...],
+                "text": "plain text of the chapter"
+            },
+            ...
+        ]
+    """
+    if toc is None:
+        toc = extract_toc(blocks)
+    
+    # Filter out TOC entries; keep only real headings
+    headings = [t for t in toc if not t.get('is_toc_entry')]
+    
+    if not headings:
+        # No headings: treat the whole document as one chapter
+        return [{
+            'chapter_id': 'ch0',
+            'title': '全文',
+            'level': 0,
+            'start_index': 0,
+            'end_index': len(blocks),
+            'blocks': blocks,
+            'text': blocks_to_text(blocks)
+        }]
+    
+    chapters = []
+    
+    # Handle content before the first heading (cover page, abstract, etc.)
+    first_heading_index = headings[0]['block_index']
+    if first_heading_index > 0:
+        pre_blocks = blocks[:first_heading_index]
+        chapters.append({
+            'chapter_id': 'ch0',
+            'title': '前言',
+            'level': 0,
+            'start_index': 0,
+            'end_index': first_heading_index,
+            'blocks': pre_blocks,
+            'text': blocks_to_text(pre_blocks)
+        })
+    
+    # Split into chapters at each heading
+    for i, heading in enumerate(headings):
+        start_index = heading['block_index']
+        
+        # Determine where the chapter ends
+        if i + 1 < len(headings):
+            end_index = headings[i + 1]['block_index']
+        else:
+            end_index = len(blocks)
+        
+        chapter_blocks = blocks[start_index:end_index]
+        
+        chapters.append({
+            'chapter_id': f'ch{i + 1}',
+            'title': heading['title'],
+            'level': heading['level'],
+            'start_index': start_index,
+            'end_index': end_index,
+            'blocks': chapter_blocks,
+            'text': blocks_to_text(chapter_blocks)
+        })
+    
+    return chapters
+
+
+def parse_docx_with_chapters(file_content: bytes) -> Dict[str, Any]:
+    """
+    Parse a DOCX file, including chapter splitting
+    
+    Returns:
+        {
+            "page": {...},
+            "blocks": [...],
+            "totalBlocks": int,
+            "toc": [...],           # TOC structure
+            "chapters": [...]       # chapter list
+        }
+    """
+    # Base parse
+    result = parse_docx_file(file_content)
+    
+    # Extract the TOC
+    toc = extract_toc(result['blocks'])
+    result['toc'] = toc
+    logger.info(f"提取目录: {len(toc)} 个条目")
+    
+    # Split into chapters
+    chapters = split_by_chapters(result['blocks'], toc)
+    # Don't include the full blocks in the result, to save memory
+    for ch in chapters:
+        ch['block_count'] = len(ch['blocks'])
+        ch['text_length'] = len(ch['text'])
+        del ch['blocks']  # drop blocks; keep only text
+    
+    result['chapters'] = chapters
+    logger.info(f"切分章节: {len(chapters)} 个章节")
+    
+    return result
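
Taken together, the new helpers compose as in the sketch below; the import path is an assumption based on the relative imports in extract.py:

# Sketch: parse a local DOCX and inspect its TOC and chapters.
from app.services.docx_parser import parse_docx_with_chapters  # assumed path

with open("sample.docx", "rb") as f:
    result = parse_docx_with_chapters(f.read())

# Print the TOC as an indented outline
for item in result["toc"]:
    print("  " * (item["level"] - 1) + item["title"])

# Chapters keep text and stats, but their blocks were dropped to save memory
for ch in result["chapters"]:
    print(ch["chapter_id"], ch["title"], ch["block_count"], ch["text_length"])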

+ 319 - 476
python-services/ner-service/app/services/element_extractor.py

@@ -1,293 +1,26 @@
 """
-Element extractor: hybrid NER + LLM strategy
+Element extractor: extracts entities from documents with NER and an LLM
 
-Extracts element values from the parsed document content and outputs the elements and values the frontend renders
+Supports chapter-based extraction and entity deduplication
 """
 
-import re
 import json
-from typing import Dict, List, Any, Optional, Tuple
+import asyncio
+from typing import Dict, List, Any, Optional
 from loguru import logger
 
-# DOCX parsing is handled by the Java backend; only plain text is processed here
-
-
-# ============================================================
-# NER rule definitions
-# ============================================================
-
-NER_RULES = {
-    # Date rules
-    "project.workStartAt": {
-        "patterns": [
-            r'评审日期[::]\s*(\d{4}年\d{1,2}月\d{1,2}日)至',
-            r'(\d{4}年\d{1,2}月\d{1,2}日)至\d{4}年',
-            r'评审(?:开始)?日期[::]\s*(\d{4}年\d{1,2}月\d{1,2}日)',
-        ],
-        "type": "DATE",
-        "element_name": "评审开始日期",
-        "element_type": "text",
-        "namespace": "project"
-    },
-    "project.workEndAt": {
-        "patterns": [
-            r'至(\d{4}年\d{1,2}月\d{1,2}日)',
-            r'评审(?:结束)?日期[::]\s*(\d{4}年\d{1,2}月\d{1,2}日)',
-        ],
-        "type": "DATE",
-        "element_name": "评审结束日期",
-        "element_type": "text",
-        "namespace": "project"
-    },
-    
-    # Score rules
-    "project.resultScore": {
-        "patterns": [
-            r'评审得分[::]\s*(\d+\.?\d*)\s*分',
-            r'得分[::]\s*(\d+\.?\d*)\s*分',
-            r'(\d+\.?\d*)分\s*级别',
-        ],
-        "type": "SCORE",
-        "element_name": "评审得分",
-        "element_type": "text",
-        "namespace": "project",
-        "post_process": "append_unit"  # 添加"分"单位
-    },
-    
-    # 级别类
-    "project.resultLevel": {
-        "patterns": [
-            r'级别[::]\s*(一级|二级|三级)',
-            r'评审(?:结论)?级别[::]\s*(一级|二级|三级)',
-            r'(一级|二级|三级)\s*(?:企业)?(?:证书)?',
-        ],
-        "type": "LEVEL",
-        "element_name": "评审结论级别",
-        "element_type": "text",
-        "namespace": "project"
-    },
-    
-    # 编号类
-    "basicInfo.projectCode": {
-        "patterns": [
-            r'项目编号[::]\s*([A-Z]+-\d+-\d+)',
-            r'编号[::]\s*([A-Z0-9\-]+)',
-        ],
-        "type": "CODE",
-        "element_name": "项目编号",
-        "element_type": "text",
-        "namespace": "basicInfo"
-    },
-    "basicInfo.reviewObjectCertificateCode": {
-        "patterns": [
-            r'证书编号[::]\s*(ZGDIDBOY-\d+)',
-            r'证书编号[((]([^))]+)[))]',
-            r'证书编号[::]\s*([A-Z0-9\-]+)',
-        ],
-        "type": "CODE",
-        "element_name": "证书编号",
-        "element_type": "text",
-        "namespace": "basicInfo"
-    },
-    
-    # Organization rules
-    "project.reviewObject": {
-        "patterns": [
-            r'评审对象[::]\s*([^\n]{10,60}(?:公司|集团|院|所))',
-            r'对([^\n]{10,60}(?:公司|集团|院|所))进行.*?(?:评审|复审)',
-        ],
-        "type": "ORG",
-        "element_name": "评审对象",
-        "element_type": "text",
-        "namespace": "project"
-    },
-    "project.reviewObjectAlias": {
-        "patterns": [
-            r'以下简称[「『"""]([^」』""]{2,10})[」』"""]',
-            r'简称[「『"""]([^」』""]{2,10})[」』"""]',
-            r'(以下简称"([^"]{2,10})")',
-        ],
-        "type": "ALIAS",
-        "element_name": "评审对象简称",
-        "element_type": "text",
-        "namespace": "project"
-    },
-}
-
-# ============================================================
-# LLM extraction config
-# ============================================================
-
-LLM_SUMMARY_ELEMENTS = [
-    {
-        "element_key": "project.target",
-        "element_name": "目标",
-        "element_type": "paragraph",
-        "namespace": "project",
-        "source_keywords": ["目标", "5.1.1"],
-        "prompt": "请根据以下评审意见,总结企业的安全生产目标情况(100-200字):\n{text}"
-    },
-    {
-        "element_key": "project.duty",
-        "element_name": "职责",
-        "element_type": "paragraph",
-        "namespace": "project",
-        "source_keywords": ["职责", "5.1.2"],
-        "prompt": "请根据以下评审意见,总结企业的安全生产职责落实情况(100-200字):\n{text}"
-    },
-    {
-        "element_key": "project.fullParticipation",
-        "element_name": "全员参与",
-        "element_type": "paragraph",
-        "namespace": "project",
-        "source_keywords": ["全员参与", "5.1.3"],
-        "prompt": "请根据以下评审意见,总结企业的全员参与情况(100-200字):\n{text}"
-    },
-    {
-        "element_key": "project.safetyInvestment",
-        "element_name": "安全投入",
-        "element_type": "paragraph",
-        "namespace": "project",
-        "source_keywords": ["安全投入", "安全生产费用", "5.1.4"],
-        "prompt": "请根据以下评审意见,总结企业的安全投入情况(100-200字):\n{text}"
-    },
-    {
-        "element_key": "project.safetyCulture",
-        "element_name": "安全文化",
-        "element_type": "paragraph",
-        "namespace": "project",
-        "source_keywords": ["安全文化", "5.1.5"],
-        "prompt": "请根据以下评审意见,总结企业的安全文化建设情况(100-200字):\n{text}"
-    },
-    {
-        "element_key": "project.systematicManagement",
-        "element_name": "体系化管理",
-        "element_type": "paragraph",
-        "namespace": "project",
-        "source_keywords": ["制度化管理", "体系化", "5.2"],
-        "prompt": "请根据以下评审意见,总结企业的体系化管理情况(100-200字):\n{text}"
-    },
-    {
-        "element_key": "project.employeeTraining",
-        "element_name": "人员教育培训",
-        "element_type": "paragraph",
-        "namespace": "project",
-        "source_keywords": ["教育培训", "5.3"],
-        "prompt": "请根据以下评审意见,总结企业的人员教育培训情况(100-200字):\n{text}"
-    },
-    {
-        "element_key": "project.assetManagement",
-        "element_name": "设备设施管理",
-        "element_type": "paragraph",
-        "namespace": "project",
-        "source_keywords": ["设备设施", "5.4.1"],
-        "prompt": "请根据以下评审意见,总结企业的设备设施管理情况(100-200字):\n{text}"
-    },
-    {
-        "element_key": "project.jobSafety",
-        "element_name": "作业安全",
-        "element_type": "paragraph",
-        "namespace": "project",
-        "source_keywords": ["作业安全", "5.4.2.1"],
-        "prompt": "请根据以下评审意见,总结企业的作业安全情况(100-200字):\n{text}"
-    },
-    {
-        "element_key": "project.riskAssessment",
-        "element_name": "风险辨识与评价",
-        "element_type": "paragraph",
-        "namespace": "project",
-        "source_keywords": ["风险辨识", "风险评价", "5.5.1"],
-        "prompt": "请根据以下评审意见,总结企业的风险辨识与评价情况(100-200字):\n{text}"
-    },
-    {
-        "element_key": "project.hazardInspection",
-        "element_name": "隐患排查",
-        "element_type": "paragraph",
-        "namespace": "project",
-        "source_keywords": ["隐患排查", "5.5.3"],
-        "prompt": "请根据以下评审意见,总结企业的隐患排查情况(100-200字):\n{text}"
-    },
-    {
-        "element_key": "project.emergencyResponse",
-        "element_name": "应急救援",
-        "element_type": "paragraph",
-        "namespace": "project",
-        "source_keywords": ["应急救援", "应急管理", "5.6"],
-        "prompt": "请根据以下评审意见,总结企业的应急救援情况(100-200字):\n{text}"
-    },
-    {
-        "element_key": "project.incidentManagement",
-        "element_name": "事故管理",
-        "element_type": "paragraph",
-        "namespace": "project",
-        "source_keywords": ["事故管理", "5.7"],
-        "prompt": "请根据以下评审意见,总结企业的事故管理情况(100-200字):\n{text}"
-    },
-    {
-        "element_key": "project.continuousImprovement",
-        "element_name": "持续改进",
-        "element_type": "paragraph",
-        "namespace": "project",
-        "source_keywords": ["持续改进", "5.8"],
-        "prompt": "请根据以下评审意见,总结企业的持续改进情况(100-200字):\n{text}"
-    },
-    {
-        "element_key": "project.reviewObjectSelfAssessmentProcess",
-        "element_name": "自评过程",
-        "element_type": "paragraph",
-        "namespace": "project",
-        "source_keywords": ["自评", "自查"],
-        "prompt": "请根据以下内容,总结企业的自评过程(150-250字):\n{text}"
-    },
-    {
-        "element_key": "project.safetyHighlight",
-        "element_name": "安全生产管理亮点",
-        "element_type": "paragraph",
-        "namespace": "project",
-        "source_keywords": ["亮点", "特色", "优秀"],
-        "prompt": "请根据以下内容,提炼企业的安全生产管理亮点(100-200字):\n{text}"
-    },
-]
-
-LLM_TABLE_ELEMENTS = [
-    {
-        "element_key": "+SPSRRReviewProject",
-        "element_name": "现场复审项目",
-        "element_type": "table",
-        "namespace": "spsrr",
-        "table_keywords": ["项目名称", "简称", "类型"],
-        "prompt": """请从以下表格中提取复审项目列表,以JSON数组格式返回:
-[{{"name": "项目名称", "alias": "简称", "type": "单位/在建项目", "order": 1}}]
-
-表格内容:
-{text}
-
-只返回JSON数组,不要其他内容。"""
-    },
-    {
-        "element_key": "+SPSRRReviewer",
-        "element_name": "现场复审人员",
-        "element_type": "table",
-        "namespace": "spsrr",
-        "table_keywords": ["姓名", "专业", "分工"],
-        "prompt": """请从以下表格中提取评审人员列表,以JSON数组格式返回:
-[{{"name": "姓名", "specialty": "专业分工"}}]
-
-表格内容:
-{text}
-
-只返回JSON数组,不要其他内容。"""
-    },
-]
+from .ner_service import ner_service
 
 
 class ElementExtractor:
-    """要素提取器"""
+    """
+    要素提取器
+    
+    使用NER服务识别文档中的实体,可选使用LLM进行智能提取。
+    不预定义要素结构,返回动态识别的实体。
+    """
     
     def __init__(self):
-        self.ner_rules = NER_RULES
-        self.llm_summary_config = LLM_SUMMARY_ELEMENTS
-        self.llm_table_config = LLM_TABLE_ELEMENTS
         self._deepseek_service = None
     
     @property
@@ -309,248 +42,358 @@ class ElementExtractor:
         use_llm: bool = True
     ) -> Dict[str, Any]:
         """
-        Extract all elements from plain text (main entry point)
+        Extract entities from plain text (main entry point)
         
         Args:
-            text: plain text parsed by the Java backend
+            text: document plain text
             attachment_id: attachment ID
-            use_llm: whether to use LLM extraction (summary elements)
+            use_llm: whether to use LLM extraction
             
         Returns:
             {
-                "elements": [...],
-                "values": [...],
+                "entities": [...],  # entities recognized by NER
+                "llm_extractions": [...],  # content extracted by the LLM (optional)
                 "statistics": {...}
             }
         """
-        logger.info(f"开始提取要素: attachment_id={attachment_id}, "
+        logger.info(f"开始提取实体: attachment_id={attachment_id}, "
                    f"text_length={len(text)}, use_llm={use_llm}")
         
-        # 1. Rule-based NER extraction
-        ner_values = self._extract_by_ner(text, attachment_id)
-        logger.info(f"NER提取完成: {len(ner_values)} 个要素")
+        # 1. Extract entities with the NER service
+        ner_entities = await self._extract_by_ner(text)
+        logger.info(f"NER提取完成: {len(ner_entities)} 个实体")
         
-        # 2. LLM extraction (optional)
-        llm_values = {}
+        # 2. Smart LLM extraction (optional)
+        llm_extractions = []
         if use_llm and self.deepseek_service:
-            llm_values = await self._extract_by_llm(text, attachment_id)
-            logger.info(f"LLM提取完成: {len(llm_values)} 个要素")
-        
-        # 4. Merge results
-        all_values = {**ner_values, **llm_values}
-        
-        # 5. Build output
-        elements, values = self._build_output(all_values, attachment_id)
+            llm_extractions = await self._extract_by_llm(text)
+            logger.info(f"LLM提取完成: {len(llm_extractions)} 个内容")
         
         return {
-            "elements": elements,
-            "values": values,
+            "entities": ner_entities,
+            "llm_extractions": llm_extractions,
             "statistics": {
-                "total_elements": len(elements),
-                "filled_values": len([v for v in values if v.get("isFilled")]),
-                "ner_extracted": len(ner_values),
-                "llm_extracted": len(llm_values),
+                "ner_entity_count": len(ner_entities),
+                "llm_extraction_count": len(llm_extractions),
+                "text_length": len(text)
             }
         }
     
-    def _extract_by_ner(
-        self, 
-        text: str, 
-        attachment_id: int
-    ) -> Dict[str, Dict]:
-        """NER规则提取"""
-        results = {}
-        
-        for element_key, rule in self.ner_rules.items():
-            for pattern in rule['patterns']:
-                try:
-                    match = re.search(pattern, text)
-                    if match:
-                        value = match.group(1).strip()
-                        
-                        # 后处理
-                        if rule.get('post_process') == 'append_unit':
-                            if not value.endswith('分'):
-                                value = value + '分'
-                        
-                        results[element_key] = {
-                            'value': value,
-                            'confidence': 0.95,
-                            'source': 'ner',
-                            'position': {
-                                'charStart': match.start(1),
-                                'charEnd': match.end(1),
-                                'line': text[:match.start()].count('\n') + 1
-                            },
-                            'element_name': rule['element_name'],
-                            'element_type': rule['element_type'],
-                            'namespace': rule['namespace']
-                        }
-                        break
-                except Exception as e:
-                    logger.warning(f"NER规则匹配失败: {element_key}, pattern={pattern}, error={e}")
+    async def _extract_by_ner(self, text: str) -> List[Dict]:
+        """
+        Extract entities with the NER service
         
-        return results
+        Returns a list of entities; each entity contains:
+        - text: entity text
+        - type: entity type (DATE, ORG, PERSON, NUMBER, CODE, etc.)
+        - label: entity label
+        - confidence: confidence score
+        - position: position info
+        """
+        try:
+            # Call the existing NER service; returns a list of EntityInfo objects
+            entities = await ner_service.extract_entities(text)
+            
+            # Format the output (EntityInfo is a Pydantic model; use attribute access)
+            result = []
+            for entity in entities:
+                result.append({
+                    "text": entity.name,
+                    "type": entity.type,
+                    "label": entity.type,
+                    "confidence": entity.confidence,
+                    "position": {
+                        "start": entity.position.char_start if entity.position else 0,
+                        "end": entity.position.char_end if entity.position else 0
+                    }
+                })
+            
+            return result
+        except Exception as e:
+            logger.error(f"NER提取失败: {e}")
+            return []
     
-    async def _extract_by_llm(
-        self, 
-        text: str, 
-        attachment_id: int
-    ) -> Dict[str, Dict]:
-        """LLM智能提取(总结型要素)"""
-        results = {}
+    async def _extract_by_llm(self, text: str) -> List[Dict]:
+        """
+        使用LLM智能提取关键信息
         
+        让LLM自动识别文档中的重要信息,不预设要提取什么。
+        """
         if not self.deepseek_service:
-            return results
+            return []
         
-        # Extract summary elements
-        for config in self.llm_summary_config:
-            element_key = config['element_key']
+        try:
+            # Analyze only the leading part of the document
+            sample_text = text[:8000] if len(text) > 8000 else text
             
-            # Find the relevant text
-            relevant_text = self._find_relevant_text(text, config['source_keywords'])
+            prompt = f"""请分析以下文档,提取其中的关键信息。
+
+要求:
+1. 识别文档类型(如:报告、合同、通知等)
+2. 提取关键实体(如:组织名称、日期、金额、编号等)
+3. 提取关键数据(如:得分、级别、数量等)
+4. 以JSON格式返回
+
+返回格式:
+{{
+    "document_type": "文档类型",
+    "key_entities": [
+        {{"name": "实体名称", "type": "实体类型", "value": "实体值"}}
+    ],
+    "key_data": [
+        {{"name": "数据名称", "value": "数据值", "unit": "单位"}}
+    ],
+    "summary": "文档摘要(50字以内)"
+}}
+
+文档内容:
+{sample_text}
+
+只返回JSON,不要其他内容。"""
+
+            response = await self.deepseek_service.chat(prompt)
             
-            if relevant_text and len(relevant_text) > 50:
-                prompt = config['prompt'].format(text=relevant_text[:3000])
+            if response:
+                # Try to parse the JSON
                 try:
-                    response = await self.deepseek_service.chat(prompt)
-                    if response and len(response.strip()) > 20:
-                        results[element_key] = {
-                            'value': response.strip(),
-                            'confidence': 0.85,
-                            'source': 'llm',
-                            'element_name': config['element_name'],
-                            'element_type': config['element_type'],
-                            'namespace': config['namespace']
-                        }
-                except Exception as e:
-                    logger.error(f"LLM提取失败: {element_key}, error={e}")
+                    # Clean the response and pull out the JSON part
+                    json_str = response.strip()
+                    if json_str.startswith("```"):
+                        json_str = json_str.split("```")[1]
+                        if json_str.startswith("json"):
+                            json_str = json_str[4:]
+                    
+                    data = json.loads(json_str)
+                    
+                    extractions = []
+                    
+                    # Document type
+                    if data.get("document_type"):
+                        extractions.append({
+                            "name": "文档类型",
+                            "value": data["document_type"],
+                            "source": "llm"
+                        })
+                    
+                    # Key entities
+                    for entity in data.get("key_entities", []):
+                        extractions.append({
+                            "name": entity.get("name", ""),
+                            "type": entity.get("type", ""),
+                            "value": entity.get("value", ""),
+                            "source": "llm"
+                        })
+                    
+                    # Key data
+                    for item in data.get("key_data", []):
+                        value = item.get("value", "")
+                        if item.get("unit"):
+                            value = f"{value}{item['unit']}"
+                        extractions.append({
+                            "name": item.get("name", ""),
+                            "value": value,
+                            "source": "llm"
+                        })
+                    
+                    # Summary
+                    if data.get("summary"):
+                        extractions.append({
+                            "name": "文档摘要",
+                            "value": data["summary"],
+                            "source": "llm"
+                        })
+                    
+                    return extractions
+                    
+                except json.JSONDecodeError:
+                    logger.warning(f"LLM返回的不是有效JSON: {response[:200]}")
+                    return []
+            
+            return []
+            
+        except Exception as e:
+            logger.error(f"LLM提取失败: {e}")
+            return []
+
+
+    async def extract_from_chapters(
+        self, 
+        chapters: List[Dict],
+        attachment_id: int = 0,
+        use_llm: bool = True,
+        parallel: bool = True
+    ) -> Dict[str, Any]:
+        """
+        Extract entities chapter by chapter, then dedup and merge
         
-        # Table elements are skipped for now (table structure must come from the Java backend)
-        # TODO: pass table data through the Java backend later
+        Args:
+            chapters: chapter list; each chapter contains {chapter_id, title, text}
+            attachment_id: attachment ID
+            use_llm: whether to use LLM extraction
+            parallel: whether to process chapters in parallel
+            
+        Returns:
+            {
+                "entities": [...],           # deduplicated entity list
+                "chapter_entities": {...},   # entities grouped by chapter
+                "llm_extractions": [...],
+                "statistics": {...}
+            }
+        """
+        logger.info(f"开始分章节提取: {len(chapters)} 个章节, parallel={parallel}")
         
-        return results
-    
-    def _find_relevant_text(self, text: str, keywords: List[str]) -> str:
-        """根据关键词查找相关文本段落"""
-        lines = text.split('\n')
-        relevant_lines = []
-        capturing = False
-        capture_count = 0
+        chapter_results = {}
+        all_entities = []
+        all_llm_extractions = []
         
-        for line in lines:
-            # Check whether the line contains a keyword
-            if any(kw in line for kw in keywords):
-                capturing = True
-                capture_count = 0
+        if parallel and len(chapters) > 1:
+            # Process chapters in parallel
+            tasks = []
+            for chapter in chapters:
+                task = self._extract_chapter(chapter, attachment_id, use_llm)
+                tasks.append(task)
             
-            if capturing:
-                relevant_lines.append(line)
-                capture_count += 1
-                # Capture at most 30 lines
-                if capture_count > 30:
-                    capturing = False
+            results = await asyncio.gather(*tasks, return_exceptions=True)
+            
+            for chapter, result in zip(chapters, results):
+                if isinstance(result, Exception):
+                    logger.error(f"章节 {chapter['chapter_id']} 提取失败: {result}")
+                    continue
+                chapter_results[chapter['chapter_id']] = result
+        else:
+            # Process chapters sequentially
+            for chapter in chapters:
+                try:
+                    result = await self._extract_chapter(chapter, attachment_id, use_llm)
+                    chapter_results[chapter['chapter_id']] = result
+                except Exception as e:
+                    logger.error(f"章节 {chapter['chapter_id']} 提取失败: {e}")
         
-        return '\n'.join(relevant_lines)
-    
-    def _find_relevant_table(self, tables: List[Dict], keywords: List[str]) -> Optional[Dict]:
-        """根据关键词查找相关表格"""
-        for table_info in tables:
-            table = table_info['table']
-            if table.get('data') and len(table['data']) > 0:
-                # Check the header row
-                header_row = table['data'][0]
-                header_texts = [cell.get('text', '') for cell in header_row]
-                header_str = ' '.join(header_texts)
-                
-                # Check whether it contains the keywords
-                match_count = sum(1 for kw in keywords if kw in header_str)
-                if match_count >= 2:
-                    return table
+        # Merge entities from all chapters
+        for chapter_id, result in chapter_results.items():
+            for entity in result.get('entities', []):
+                entity['chapter_id'] = chapter_id
+                all_entities.append(entity)
+            all_llm_extractions.extend(result.get('llm_extractions', []))
         
-        return None
-    
-    def _table_to_text(self, table: Dict) -> str:
-        """将表格转为文本"""
-        lines = []
-        for row in table.get('data', []):
-            cells = [cell.get('text', '') for cell in row]
-            lines.append(' | '.join(cells))
-        return '\n'.join(lines)
+        # 去重
+        unique_entities = self._deduplicate_entities(all_entities)
+        unique_llm = self._deduplicate_llm_extractions(all_llm_extractions)
+        
+        logger.info(f"分章节提取完成: 原始 {len(all_entities)} 个实体, 去重后 {len(unique_entities)} 个")
+        
+        return {
+            "entities": unique_entities,
+            "chapter_entities": chapter_results,
+            "llm_extractions": unique_llm,
+            "statistics": {
+                "chapter_count": len(chapters),
+                "total_entities_before_dedup": len(all_entities),
+                "unique_entity_count": len(unique_entities),
+                "llm_extraction_count": len(unique_llm)
+            }
+        }
     
-    def _build_output(
+    async def _extract_chapter(
         self, 
-        extracted_values: Dict[str, Dict],
-        attachment_id: int
-    ) -> Tuple[List[Dict], List[Dict]]:
-        """构建输出的elements和values"""
+        chapter: Dict, 
+        attachment_id: int,
+        use_llm: bool
+    ) -> Dict[str, Any]:
+        """提取单个章节的实体"""
+        chapter_id = chapter.get('chapter_id', 'unknown')
+        title = chapter.get('title', '')
+        text = chapter.get('text', '')
         
-        # Merge all element definitions
-        all_element_defs = {}
+        if not text or len(text.strip()) < 10:
+            return {"entities": [], "llm_extractions": []}
         
-        # From the NER rules
-        for key, rule in self.ner_rules.items():
-            all_element_defs[key] = {
-                'element_name': rule['element_name'],
-                'element_type': rule['element_type'],
-                'namespace': rule['namespace']
-            }
+        logger.debug(f"提取章节 {chapter_id}: {title[:30]}... (长度: {len(text)})")
         
-        # From the LLM config
-        for config in self.llm_summary_config:
-            all_element_defs[config['element_key']] = {
-                'element_name': config['element_name'],
-                'element_type': config['element_type'],
-                'namespace': config['namespace']
-            }
+        # NER extraction
+        entities = await self._extract_by_ner(text)
         
-        for config in self.llm_table_config:
-            all_element_defs[config['element_key']] = {
-                'element_name': config['element_name'],
-                'element_type': config['element_type'],
-                'namespace': config['namespace']
-            }
+        # Attach chapter info to each entity
+        for entity in entities:
+            entity['chapter_id'] = chapter_id
+            entity['chapter_title'] = title
         
-        elements = []
-        values = []
+        # LLM extraction (optional)
+        llm_extractions = []
+        if use_llm and self.deepseek_service:
+            llm_extractions = await self._extract_by_llm(text)
+            for item in llm_extractions:
+                item['chapter_id'] = chapter_id
+                item['chapter_title'] = title
         
-        for i, (element_key, elem_def) in enumerate(all_element_defs.items()):
-            element = {
-                "id": 700 + i,
-                "elementKey": element_key,
-                "elementName": elem_def['element_name'],
-                "elementType": elem_def['element_type'],
-                "namespace": elem_def['namespace'],
-                "sortOrder": i
-            }
-            elements.append(element)
-            
-            # Look up the extracted value
-            extracted = extracted_values.get(element_key)
-            if extracted:
-                value = {
-                    "valueId": 800 + i,
-                    "elementKey": element_key,
-                    "valueText": extracted['value'],
-                    "isFilled": True,
-                    "fillSource": "ai" if extracted['source'] == 'llm' else "rule",
-                    "confidence": extracted.get('confidence', 0.8),
-                    "sourceAttachmentId": attachment_id
-                }
-                if 'position' in extracted:
-                    value['extractPosition'] = extracted['position']
+        return {
+            "entities": entities,
+            "llm_extractions": llm_extractions
+        }
+    
+    def _deduplicate_entities(self, entities: List[Dict]) -> List[Dict]:
+        """
+        Entity deduplication
+        
+        Dedup rules:
+        1. Same type + same text -> keep the first occurrence (replaced only by a higher-confidence duplicate)
+        2. Containment -> keep the longer entity
+        """
+        if not entities:
+            return []
+        
+        # Dedup by (type, text)
+        seen = {}
+        for entity in entities:
+            key = (entity.get('type', ''), entity.get('text', ''))
+            if key not in seen:
+                seen[key] = entity
             else:
-                value = {
-                    "valueId": 800 + i,
-                    "elementKey": element_key,
-                    "valueText": "",
-                    "isFilled": False,
-                    "fillSource": "default"
-                }
+                # Keep the one with higher confidence
+                if entity.get('confidence', 0) > seen[key].get('confidence', 0):
+                    seen[key] = entity
+        
+        unique = list(seen.values())
+        
+        # Handle containment (optional, more involved)
+        # e.g. "中国电建集团" vs "中国电建集团成都勘测设计研究院有限公司"
+        # Keep the longer one
+        final = []
+        texts = set()
+        
+        # Sort by text length, descending
+        unique.sort(key=lambda x: len(x.get('text', '')), reverse=True)
+        
+        for entity in unique:
+            text = entity.get('text', '')
+            # Check whether a longer entity already contains this text
+            is_substring = False
+            for existing_text in texts:
+                if text in existing_text and text != existing_text:
+                    is_substring = True
+                    break
             
-            values.append(value)
+            if not is_substring:
+                final.append(entity)
+                texts.add(text)
+        
+        # Restore the original order (by position)
+        final.sort(key=lambda x: x.get('position', {}).get('start', 0))
+        
+        return final
+    
+    def _deduplicate_llm_extractions(self, extractions: List[Dict]) -> List[Dict]:
+        """LLM提取结果去重"""
+        if not extractions:
+            return []
+        
+        seen = {}
+        for item in extractions:
+            key = (item.get('name', ''), item.get('value', ''))
+            if key not in seen:
+                seen[key] = item
         
-        return elements, values
+        return list(seen.values())
 
 
# Create the singleton instance

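To illustrate the two-pass dedup above, a minimal sketch (the `svc` handle and the sample entities are hypothetical, assuming the singleton created at the end of this file):

    entities = [
        {'type': 'ORG', 'text': '中国电建集团', 'confidence': 0.85,
         'position': {'start': 40, 'end': 46}},
        {'type': 'ORG', 'text': '中国电建集团成都勘测设计研究院有限公司', 'confidence': 0.90,
         'position': {'start': 0, 'end': 19}},
        {'type': 'ORG', 'text': '中国电建集团', 'confidence': 0.95,
         'position': {'start': 80, 'end': 86}},
    ]

    result = svc._deduplicate_entities(entities)
    # Pass 1 collapses the two '中国电建集团' mentions into the 0.95 copy;
    # pass 2 drops it again as a substring of the longer company name, and
    # the survivors are sorted back into document order by position.
    assert [e['text'] for e in result] == ['中国电建集团成都勘测设计研究院有限公司']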
+ 173 - 229
python-services/ner-service/app/services/ner_service.py

@@ -1,289 +1,233 @@
 """
-NER service implementation
+NER service implementation - using HanLP
 
-Supported modes:
-1. rule - simple rule-based NER (default, for development and testing)
-2. spacy - spaCy model
-3. transformers - Transformers model
-4. api - external API (e.g. DeepSeek/Qwen)
+HanLP is a Chinese NLP toolkit with high-quality named entity recognition.
 """
-import re
 import uuid
 from typing import List, Optional
 from loguru import logger
 
-from ..config import settings
 from ..models import EntityInfo, PositionInfo
 
 
+# Maximum characters per segment (HanLP has trouble with very long inputs)
+MAX_SEGMENT_LENGTH = 500
+
+# Entity types to filter out (these are usually noise)
+FILTER_TYPES = {
+    'INTEGER', 'DECIMAL', 'FRACTION', 'ORDINAL', 'CARDINAL',
+    'RATE', 'DURATION', 'NUMBER', 'POSTALCODE'
+}
+
+# Core entity types to keep
+KEEP_TYPES = {'PERSON', 'ORG', 'LOC', 'DATE', 'TIME', 'MONEY', 'PERCENT'}
+
+# Overly generic entities (blacklist)
+BLACKLIST_ENTITIES = {
+    '公司', '评审组', '部门', '单位', '组织', '机构', '中心', '委员会',
+    '第一', '第二', '第三', '第四', '第五', '一级', '二级', '三级',
+    '百千万', '十四五', '十三五'
+}
+
+
 class NerService:
-    """NER 服务"""
+    """NER 服务 - 基于 HanLP"""
     
     def __init__(self):
-        self.model_type = settings.ner_model
-        logger.info(f"初始化 NER 服务: model_type={self.model_type}")
+        self._hanlp_ner = None
+        self._hanlp_tokenizer = None
+        logger.info("初始化 NER 服务: model=HanLP")
     
-    async def extract_entities(
-        self, 
-        text: str, 
-        entity_types: Optional[List[str]] = None
-    ) -> List[EntityInfo]:
-        """
-        Extract entities from text
+    def _load_model(self):
+        """延迟加载HanLP模型"""
+        if self._hanlp_ner is not None:
+            return
         
-        Args:
-            text: the text to extract from
-            entity_types: entity types to extract; empty means all types
+        try:
+            import hanlp
+            logger.info("正在加载HanLP NER模型...")
             
-        Returns:
-            list of entities
-        """
-        if not text or not text.strip():
-            return []
-        
-        if self.model_type == "rule":
-            return await self._extract_by_rules(text, entity_types)
-        elif self.model_type == "ollama":
-            return await self._extract_by_ollama(text, entity_types)
-        elif self.model_type == "deepseek":
-            return await self._extract_by_deepseek(text, entity_types)
-        elif self.model_type == "spacy":
-            return await self._extract_by_spacy(text, entity_types)
-        elif self.model_type == "transformers":
-            return await self._extract_by_transformers(text, entity_types)
-        elif self.model_type == "api":
-            return await self._extract_by_api(text, entity_types)
-        else:
-            logger.warning(f"未知的模型类型: {self.model_type},使用规则模式")
-            return await self._extract_by_rules(text, entity_types)
+            # Use the MTL multi-task model, which is more stable
+            self._hanlp_ner = hanlp.load(hanlp.pretrained.mtl.CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_SMALL_ZH)
+            
+            logger.info("HanLP NER模型加载完成")
+        except ImportError:
+            logger.error("HanLP未安装,请运行: pip install hanlp")
+            raise
+        except Exception as e:
+            logger.error(f"HanLP模型加载失败: {e}")
+            raise
     
-    async def extract_entities_with_progress(
-        self, 
-        text: str, 
-        entity_types: Optional[List[str]] = None
-    ):
+    def _split_text(self, text: str) -> List[tuple]:
         """
-        Extract entities from text (progress generator for SSE streaming)
-        
-        Yields:
-            SSE event strings
+        Split long text into segments, returning [(segment, offset), ...]
         """
-        import json
+        segments = []
+        lines = text.split('\n')
+        current_segment = ""
+        current_offset = 0
+        segment_start = 0
         
-        if not text or not text.strip():
-            yield f"event: entities_data\ndata: {json.dumps({'entities': [], 'total_entities': 0}, ensure_ascii=False)}\n\n"
-            return
+        for line in lines:
+            if len(current_segment) + len(line) + 1 > MAX_SEGMENT_LENGTH:
+                if current_segment:
+                    segments.append((current_segment, segment_start))
+                current_segment = line
+                segment_start = current_offset
+            else:
+                if current_segment:
+                    current_segment += '\n' + line
+                else:
+                    current_segment = line
+                    segment_start = current_offset
+            current_offset += len(line) + 1
         
-        if self.model_type == "deepseek":
-            from .deepseek_service import deepseek_service
-            async for event in deepseek_service.extract_entities_with_progress(text, entity_types):
-                yield event
-        else:
-            # Other models fall back to plain extraction and return in one shot
-            entities = await self.extract_entities(text, entity_types)
-            yield f"event: chunk_complete\ndata: {json.dumps({'total_entities': len(entities), 'progress_percent': 100}, ensure_ascii=False)}\n\n"
-            # Send the entity data event
-            yield f"event: entities_data\ndata: {json.dumps({'entities': [e.model_dump(by_alias=True) for e in entities], 'total_entities': len(entities)}, ensure_ascii=False)}\n\n"
+        if current_segment:
+            segments.append((current_segment, segment_start))
+        
+        return segments
     
-    async def _extract_by_rules(
+    async def extract_entities(
         self, 
         text: str, 
         entity_types: Optional[List[str]] = None
     ) -> List[EntityInfo]:
         """
-        Rule-based NER extraction
-        Used during development/testing; can later be swapped for a stronger model
+        Extract entities from text
         """
-        entities = []
-        
-        # 规则定义
-        rules = {
-            "DATE": [
-                # Chinese date formats
-                r'(\d{4}年\d{1,2}月\d{1,2}日)',
-                r'(\d{4}年\d{1,2}月)',
-                r'(\d{4}-\d{1,2}-\d{1,2})',
-                r'(\d{4}/\d{1,2}/\d{1,2})',
-            ],
-            "NUMBER": [
-                # Numbers with units
-                r'(\d+\.?\d*\s*(?:万元|元|米|公里|千米|平方米|㎡|吨|kg|g|个|台|套|件|次|人|天|小时|分钟|秒|%|百分比))',
-                # Percentages
-                r'(\d+\.?\d*%)',
-                # Bare numbers (larger values)
-                r'(?<![a-zA-Z])(\d{4,}(?:\.\d+)?)(?![a-zA-Z])',
-            ],
-            "ORG": [
-                # Institution/company names
-                r'([\u4e00-\u9fa5]{2,10}(?:公司|集团|院|所|局|部|厅|委|会|中心|协会|学会|银行|医院|学校|大学|学院))',
-                # xx province/city/county/district
-                r'([\u4e00-\u9fa5]{2,6}(?:省|市|县|区|镇|乡|村)(?:人民)?(?:政府|委员会)?)',
-            ],
-            "LOC": [
-                # Locations
-                r'([\u4e00-\u9fa5]{2,6}(?:省|市|县|区|镇|乡|村|路|街|巷|号|楼|栋|单元|室))',
-                # Common place-name suffixes
-                r'([\u4e00-\u9fa5]{2,8}(?:工业园|开发区|高新区|科技园|产业园))',
-            ],
-            "PERSON": [
-                # Person names (simple rule: surname + given name)
-                r'(?:(?:张|王|李|赵|刘|陈|杨|黄|周|吴|徐|孙|马|朱|胡|郭|何|林|罗|高|郑|梁|谢|唐|许|邓|冯|韩|曹|曾|彭|萧|蔡|潘|田|董|袁|于|余|叶|蒋|杜|苏|魏|程|吕|丁|沈|任|姚|卢|傅|钟|姜|崔|谭|廖|范|汪|陆|金|石|戴|贾|韦|夏|邱|方|侯|邹|熊|孟|秦|白|江|阎|薛|尹|段|雷|黎|史|龙|陶|贺|顾|毛|郝|龚|邵|万|钱|严|赖|覃|洪|武|莫|孔)[\u4e00-\u9fa5]{1,2})(?:总|经理|主任|工程师|教授|博士|先生|女士|同志)?',
-            ],
-            "DEVICE": [
-                # Device names
-                r'([\u4e00-\u9fa5]{2,10}(?:设备|仪器|仪表|机器|装置|系统|探测器|传感器|检测仪|分析仪|监测仪))',
-            ],
-            "PROJECT": [
-                # Project names - stricter rules
-                # Requirement: a project name should be a complete noun phrase, usually with a specific prefix
-                # Project names wrapped in《》title marks
-                r'《([\u4e00-\u9fa5a-zA-Z0-9]{2,30}(?:项目|工程|计划|方案|课题))》',
-                # Explicit project number/name formats
-                r'([A-Z0-9\-]+(?:项目|工程))',
-                # Place/institution name + project type (stricter)
-                r'((?:[\u4e00-\u9fa5]{2,6}(?:省|市|县|区|镇))?[\u4e00-\u9fa5]{2,15}(?:建设|改造|修复|治理|开发|研究|试点|示范)(?:项目|工程))',
-                # xx project department/team
-                r'([\u4e00-\u9fa5]{2,15}项目(?:部|组|办))',
-            ],
-        }
+        if not text or not text.strip():
+            return []
         
-        # Filter entity types
-        if entity_types:
-            rules = {k: v for k, v in rules.items() if k in entity_types}
+        # Load the model
+        self._load_model()
         
-        # Stopword/invalid-entity filter (these match the rules but are not valid entities)
-        stopwords = {
-            # Common meaningless matches
-            "该项目", "本项目", "此项目", "各项目", "子公司和项目", "认真落实项目",
-            "开展的培训项目", "年已经开展的培训项目",
-            "该工程", "本工程", "此工程", "各工程",
-            "该计划", "本计划", "此计划", "各计划",
-            "该方案", "本方案", "此方案", "各方案",
-            # Invalid matches that start with a verb
-            "落实项目", "开展项目", "推进项目", "完成项目", "实施项目",
-            # Meaningless entities that are too short
-            "项目", "工程", "计划", "方案", "课题",
+        # Map HanLP entity types onto canonical types
+        type_mapping = {
+            'PERSON': 'PERSON', 'PER': 'PERSON', 'NR': 'PERSON',
+            'ORGANIZATION': 'ORG', 'ORG': 'ORG', 'NT': 'ORG',
+            'LOCATION': 'LOC', 'LOC': 'LOC', 'GPE': 'LOC', 'NS': 'LOC',
+            'DATE': 'DATE', 'TIME': 'DATE',
+            # MONEY/PERCENT must map to themselves; mapping them to NUMBER
+            # would get them dropped by FILTER_TYPES despite being in KEEP_TYPES
+            'MONEY': 'MONEY', 'PERCENT': 'PERCENT',
+            'QUANTITY': 'NUMBER', 'CARDINAL': 'NUMBER',
         }
         
-        # Run the rule matching
-        seen_entities = set()  # for deduplication
+        entities = []
+        seen_entities = set()
         
-        for entity_type, patterns in rules.items():
-            for pattern in patterns:
-                for match in re.finditer(pattern, text):
-                    entity_text = match.group(1) if match.groups() else match.group(0)
-                    entity_text = entity_text.strip()
+        # Process segment by segment
+        segments = self._split_text(text)
+        total_segments = len(segments)
+        logger.info(f"开始NER提取: 文本长度={len(text)}, 分段数={total_segments}")
+        
+        for seg_idx, (segment, offset) in enumerate(segments):
+            if seg_idx % 10 == 0:
+                logger.info(f"NER进度: {seg_idx}/{total_segments} 段")
+            
+            try:
+                # Invoke the HanLP MTL model
+                result = self._hanlp_ner(segment, tasks='ner')
+                
+                # MTL output format: {'ner/msra': [['entity', 'type', start, end], ...]}
+                ner_results = []
+                if isinstance(result, dict):
+                    for key in result:
+                        if 'ner' in key.lower():
+                            ner_results = result[key]
+                            break
+                elif isinstance(result, list):
+                    ner_results = result
+                
+                # Process the results
+                for item in ner_results:
+                    entity_text = None
+                    entity_type = None
+                    char_start = 0
+                    char_end = 0
+                    
+                    if isinstance(item, (list, tuple)) and len(item) >= 2:
+                        entity_text = item[0]
+                        entity_type = item[1]
+                        # NOTE: HanLP MTL span indices are token offsets, not
+                        # char offsets, so locate the text in the segment instead
+                        pos = segment.find(str(entity_text))
+                        char_start = pos + offset if pos >= 0 else offset
+                        char_end = char_start + len(str(entity_text))
+                    elif isinstance(item, dict):
+                        entity_text = item.get('text', item.get('word', ''))
+                        entity_type = item.get('type', item.get('label', 'UNKNOWN'))
+                        char_start = item.get('start', 0) + offset
+                        # 'end' is segment-relative too, so it needs the same offset
+                        if 'end' in item:
+                            char_end = item['end'] + offset
+                        else:
+                            char_end = char_start + len(entity_text)
+                    else:
+                        continue
                     
-                    # Skip stopwords
-                    if entity_text in stopwords:
+                    if not entity_text or not entity_type:
                         continue
                     
-                    # Skip entities that are too short (fewer than 3 characters)
-                    if len(entity_text) < 3:
+                    entity_text = str(entity_text)
+                    entity_type = str(entity_type)
+                    
+                    # Map the entity type
+                    mapped_type = type_mapping.get(entity_type.upper(), entity_type.upper())
+                    
+                    # Filter noise types (numbers, ordinals, etc.)
+                    if mapped_type in FILTER_TYPES or entity_type.upper() in FILTER_TYPES:
                         continue
                     
-                    # Deduplicate
-                    entity_key = f"{entity_type}:{entity_text}"
-                    if entity_key in seen_entities:
+                    # Keep only core types
+                    if mapped_type not in KEEP_TYPES and entity_type.upper() not in KEEP_TYPES:
+                        continue
+                    
+                    # Filter by caller-specified entity types
+                    if entity_types and mapped_type not in entity_types:
+                        continue
+                    
+                    # Blacklist filter
+                    if entity_text in BLACKLIST_ENTITIES:
+                        continue
+                    
+                    # Deduplicate (ignore type, compare text only)
+                    if entity_text in seen_entities:
+                        continue
+                    seen_entities.add(entity_text)
+                    
+                    # Skip entities that are too short
+                    if len(entity_text) < 2:
+                        continue
+                    
+                    # Skip pure numbers
+                    if entity_text.replace('.', '').replace('-', '').isdigit():
                         continue
-                    seen_entities.add(entity_key)
                     
                     # Compute the line number
-                    line_num = text[:match.start()].count('\n') + 1
+                    line_num = text[:char_start].count('\n') + 1 if char_start > 0 else 1
                     
                     # Grab the surrounding context
-                    context_start = max(0, match.start() - 20)
-                    context_end = min(len(text), match.end() + 20)
+                    context_start = max(0, char_start - 20)
+                    context_end = min(len(text), char_end + 20)
                     context = text[context_start:context_end]
-                    if context_start > 0:
-                        context = "..." + context
-                    if context_end < len(text):
-                        context = context + "..."
                     
                     entity = EntityInfo(
                         name=entity_text,
-                        type=entity_type,
+                        type=mapped_type,
                         value=entity_text,
                         position=PositionInfo(
-                            char_start=match.start(),
-                            char_end=match.end(),
+                            char_start=char_start,
+                            char_end=char_end,
                             line=line_num
                         ),
                         context=context,
-                        confidence=0.8,  # default confidence for rule matches
+                        confidence=0.9,
                         temp_id=str(uuid.uuid4())[:8]
                     )
                     entities.append(entity)
+                    
+            except Exception as e:
+                logger.warning(f"分段 {seg_idx} NER失败: {e}")
+                continue
         
-        logger.info(f"规则 NER 提取完成: entity_count={len(entities)}")
+        logger.info(f"HanLP NER 提取完成: entity_count={len(entities)}")
         return entities
-    
-    async def _extract_by_ollama(
-        self, 
-        text: str, 
-        entity_types: Optional[List[str]] = None
-    ) -> List[EntityInfo]:
-        """
-        NER extraction using a local Ollama LLM
-        Supports automatic chunking of long text
-        """
-        try:
-            from .ollama_service import ollama_service
-            return await ollama_service.extract_entities(text, entity_types)
-        except Exception as e:
-            logger.error(f"Ollama NER 失败: {e},回退到规则模式")
-            return await self._extract_by_rules(text, entity_types)
-    
-    async def _extract_by_deepseek(
-        self, 
-        text: str, 
-        entity_types: Optional[List[str]] = None
-    ) -> List[EntityInfo]:
-        """
-        NER extraction via the Aliyun Bailian DeepSeek API
-        """
-        try:
-            from .deepseek_service import deepseek_service
-            return await deepseek_service.extract_entities(text, entity_types)
-        except Exception as e:
-            logger.error(f"DeepSeek NER 失败: {e},回退到规则模式")
-            return await self._extract_by_rules(text, entity_types)
-    
-    async def _extract_by_spacy(
-        self, 
-        text: str, 
-        entity_types: Optional[List[str]] = None
-    ) -> List[EntityInfo]:
-        """
-        NER extraction using spaCy
-        """
-        # TODO: implement spaCy NER
-        logger.warning("spaCy NER not implemented yet, falling back to rule mode")
-        return await self._extract_by_rules(text, entity_types)
-    
-    async def _extract_by_transformers(
-        self, 
-        text: str, 
-        entity_types: Optional[List[str]] = None
-    ) -> List[EntityInfo]:
-        """
-        NER extraction using a Transformers model
-        """
-        # TODO: implement Transformers NER
-        logger.warning("Transformers NER not implemented yet, falling back to rule mode")
-        return await self._extract_by_rules(text, entity_types)
-    
-    async def _extract_by_api(
-        self, 
-        text: str, 
-        entity_types: Optional[List[str]] = None
-    ) -> List[EntityInfo]:
-        """
-        NER extraction by calling an external API
-        """
-        # TODO: implement API NER (calling DeepSeek/Qwen)
-        logger.warning("API NER not implemented yet, falling back to rule mode")
-        return await self._extract_by_rules(text, entity_types)
 
 
# Create the singleton instance

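The segmentation contract that extract_entities relies on can be sanity-checked with a short sketch (the import path is assumed from the file layout above; NerService() itself does not load HanLP, since the model loads lazily on first use):

    from app.services.ner_service import NerService, MAX_SEGMENT_LENGTH

    svc = NerService()
    text = "这是一行测试文本。\n" * 200

    for segment, offset in svc._split_text(text):
        # offsets index into the original text
        assert text[offset:offset + len(segment)] == segment
        # only a single over-long line may exceed the limit
        assert len(segment) <= MAX_SEGMENT_LENGTH or '\n' not in segment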
+ 2 - 4
python-services/ner-service/requirements.txt

@@ -11,10 +11,8 @@ pydantic-settings==2.1.0
 httpx==0.26.0
 aiohttp==3.9.1
 
-# NER and NLP
-# spacy==3.7.2  # Uncomment if using spaCy
-# transformers==4.36.2  # Uncomment if using Transformers
-# torch==2.1.2  # Uncomment if using PyTorch
+# NER and NLP - HanLP
+hanlp==2.1.0b58
 
 # For DeepSeek/Qwen API fallback
 openai==1.6.1
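
After installing the pinned build, a smoke test along these lines verifies the model and the task key that ner_service.py expects (the sample sentence and the printed spans are illustrative only):

    # pip install hanlp==2.1.0b58
    import hanlp

    mtl = hanlp.load(hanlp.pretrained.mtl.CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_SMALL_ZH)
    doc = mtl('张三于2024年在成都加入中国电建集团。', tasks='ner')

    # The NER task is exposed under the 'ner/msra' key; each span is
    # (text, label, begin, end), where begin/end are token indices.
    print(doc['ner/msra'])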