|
|
@@ -102,8 +102,16 @@ class NerService:
|
|
|
r'([\u4e00-\u9fa5]{2,10}(?:设备|仪器|仪表|机器|装置|系统|探测器|传感器|检测仪|分析仪|监测仪))',
|
|
|
],
|
|
|
"PROJECT": [
|
|
|
- # 项目名称
|
|
|
- r'([\u4e00-\u9fa5]{2,20}(?:项目|工程|计划|方案|课题))',
|
|
|
+ # 项目名称 - 更严格的规则
|
|
|
+ # 要求:项目名应该是完整的名词短语,通常有特定前缀
|
|
|
+ # 带书名号的项目名
|
|
|
+ r'《([\u4e00-\u9fa5a-zA-Z0-9]{2,30}(?:项目|工程|计划|方案|课题))》',
|
|
|
+ # 明确的项目编号/名称格式
|
|
|
+ r'([A-Z0-9\-]+(?:项目|工程))',
|
|
|
+ # 地名/机构名 + 项目类型(更严格)
|
|
|
+ r'((?:[\u4e00-\u9fa5]{2,6}(?:省|市|县|区|镇))?[\u4e00-\u9fa5]{2,15}(?:建设|改造|修复|治理|开发|研究|试点|示范)(?:项目|工程))',
|
|
|
+ # xx项目部/项目组
|
|
|
+ r'([\u4e00-\u9fa5]{2,15}项目(?:部|组|办))',
|
|
|
],
|
|
|
}
|
|
|
|
|
|
@@ -111,6 +119,20 @@ class NerService:
|
|
|
if entity_types:
|
|
|
rules = {k: v for k, v in rules.items() if k in entity_types}
|
|
|
|
|
|
+ # 停用词/无效实体过滤(这些词虽然匹配规则但不是有效实体)
|
|
|
+ stopwords = {
|
|
|
+ # 常见无意义匹配
|
|
|
+ "该项目", "本项目", "此项目", "各项目", "子公司和项目", "认真落实项目",
|
|
|
+ "开展的培训项目", "年已经开展的培训项目",
|
|
|
+ "该工程", "本工程", "此工程", "各工程",
|
|
|
+ "该计划", "本计划", "此计划", "各计划",
|
|
|
+ "该方案", "本方案", "此方案", "各方案",
|
|
|
+ # 动词开头的无效匹配
|
|
|
+ "落实项目", "开展项目", "推进项目", "完成项目", "实施项目",
|
|
|
+ # 太短的无意义实体
|
|
|
+ "项目", "工程", "计划", "方案", "课题",
|
|
|
+ }
|
|
|
+
|
|
|
# 执行规则匹配
|
|
|
seen_entities = set() # 用于去重
|
|
|
|
|
|
@@ -120,6 +142,14 @@ class NerService:
|
|
|
entity_text = match.group(1) if match.groups() else match.group(0)
|
|
|
entity_text = entity_text.strip()
|
|
|
|
|
|
+ # 跳过停用词
|
|
|
+ if entity_text in stopwords:
|
|
|
+ continue
|
|
|
+
|
|
|
+ # 跳过太短的实体(少于3个字符)
|
|
|
+ if len(entity_text) < 3:
|
|
|
+ continue
|
|
|
+
|
|
|
# 去重
|
|
|
entity_key = f"{entity_type}:{entity_text}"
|
|
|
if entity_key in seen_entities:
|