Sfoglia il codice sorgente

feat(extract-service): 核心实体与 Repository 层实现

阶段二完成:

实体类(5个):
- Project: 数据提取项目,包含状态、配置
- SourceDocument: 来源文档,关联已解析的 Document
- ExtractRule: 提取规则,支持多种来源类型和提取类型
- ExtractResult: 提取结果,包含来源追溯信息
- RuleTemplate: 规则模板,用于配置复用

Repository(5个):
- ProjectRepository: 项目CRUD、按用户/状态查询
- SourceDocumentRepository: 来源文档CRUD、按项目查询
- ExtractRuleRepository: 规则CRUD、依赖分析查询
- ExtractResultRepository: 结果CRUD、状态统计
- RuleTemplateRepository: 模板CRUD、公开模板查询

配置DTO类(10个):
来源配置:
- LocationConfig: 定位配置(页码/章节/元素/单元格)
- TransformConfig: 值转换配置
- DocumentSourceConfig: 文档来源配置
- SelfReferenceSourceConfig: 自引用配置
- FixedSourceConfig: 固定内容配置
- ManualSourceConfig: 手动输入配置

提取配置:
- DirectExtractConfig: 直接提取配置
- AIExtractConfig: AI字段提取配置
- AISummarizeConfig: AI总结配置
- OcrExtractConfig: OCR提取配置
何文松 1 mese fa
parent
commit
cccd5afa42
20 ha cambiato i file con 1215 aggiunte e 0 eliminazioni
  1. 49 0
      backend/extract-service/src/main/java/com/lingyue/extract/dto/config/AIExtractConfig.java
  2. 50 0
      backend/extract-service/src/main/java/com/lingyue/extract/dto/config/AISummarizeConfig.java
  3. 36 0
      backend/extract-service/src/main/java/com/lingyue/extract/dto/config/DirectExtractConfig.java
  4. 49 0
      backend/extract-service/src/main/java/com/lingyue/extract/dto/config/DocumentSourceConfig.java
  5. 30 0
      backend/extract-service/src/main/java/com/lingyue/extract/dto/config/FixedSourceConfig.java
  6. 61 0
      backend/extract-service/src/main/java/com/lingyue/extract/dto/config/LocationConfig.java
  7. 43 0
      backend/extract-service/src/main/java/com/lingyue/extract/dto/config/ManualSourceConfig.java
  8. 58 0
      backend/extract-service/src/main/java/com/lingyue/extract/dto/config/OcrExtractConfig.java
  9. 35 0
      backend/extract-service/src/main/java/com/lingyue/extract/dto/config/SelfReferenceSourceConfig.java
  10. 44 0
      backend/extract-service/src/main/java/com/lingyue/extract/dto/config/TransformConfig.java
  11. 118 0
      backend/extract-service/src/main/java/com/lingyue/extract/entity/ExtractResult.java
  12. 125 0
      backend/extract-service/src/main/java/com/lingyue/extract/entity/ExtractRule.java
  13. 53 0
      backend/extract-service/src/main/java/com/lingyue/extract/entity/Project.java
  14. 67 0
      backend/extract-service/src/main/java/com/lingyue/extract/entity/RuleTemplate.java
  15. 65 0
      backend/extract-service/src/main/java/com/lingyue/extract/entity/SourceDocument.java
  16. 74 0
      backend/extract-service/src/main/java/com/lingyue/extract/repository/ExtractResultRepository.java
  17. 88 0
      backend/extract-service/src/main/java/com/lingyue/extract/repository/ExtractRuleRepository.java
  18. 49 0
      backend/extract-service/src/main/java/com/lingyue/extract/repository/ProjectRepository.java
  19. 65 0
      backend/extract-service/src/main/java/com/lingyue/extract/repository/RuleTemplateRepository.java
  20. 56 0
      backend/extract-service/src/main/java/com/lingyue/extract/repository/SourceDocumentRepository.java

+ 49 - 0
backend/extract-service/src/main/java/com/lingyue/extract/dto/config/AIExtractConfig.java

@@ -0,0 +1,49 @@
+package com.lingyue.extract.dto.config;
+
+import io.swagger.v3.oas.annotations.media.Schema;
+import lombok.Data;
+
+import java.util.List;
+
+/**
+ * Configuration for AI-based field extraction.
+ *
+ * Used by rules whose extractType = "ai_extract":
+ * an AI model extracts one specific field from the located content.
+ * The FIELD_TYPE_* constants below list the values documented for {@code fieldType}.
+ *
+ * @author lingyue
+ * @since 2026-01-22
+ */
+@Data
+@Schema(description = "AI字段提取配置")
+public class AIExtractConfig {
+    
+    @Schema(description = "提取目标描述,如'工程名称'、'建设日期'")
+    private String targetDescription;
+    
+    @Schema(description = "字段类型: text-文本, date-日期, number-数字, amount-金额, list-列表")
+    private String fieldType;
+    
+    @Schema(description = "预期格式,如'YYYY-MM-DD'、'XX万元'")
+    private String expectedFormat;
+    
+    @Schema(description = "示例值列表,帮助AI理解")
+    private List<String> examples;
+    
+    @Schema(description = "额外的提取说明")
+    private String additionalInstructions;
+    
+    @Schema(description = "使用的AI模型(可选,默认使用项目配置)")
+    private String modelName;
+    
+    @Schema(description = "温度参数(控制随机性,0-1)")
+    private Double temperature;
+    
+    // ==================== Field type constants ====================
+    
+    public static final String FIELD_TYPE_TEXT = "text";
+    public static final String FIELD_TYPE_DATE = "date";
+    public static final String FIELD_TYPE_NUMBER = "number";
+    public static final String FIELD_TYPE_AMOUNT = "amount";
+    public static final String FIELD_TYPE_LIST = "list";
+}

+ 50 - 0
backend/extract-service/src/main/java/com/lingyue/extract/dto/config/AISummarizeConfig.java

@@ -0,0 +1,50 @@
+package com.lingyue.extract.dto.config;
+
+import io.swagger.v3.oas.annotations.media.Schema;
+import lombok.Data;
+
+import java.util.List;
+
+/**
+ * Configuration for AI summarization.
+ *
+ * Used by rules whose extractType = "ai_summarize":
+ * an AI model summarizes / condenses the content.
+ * The STYLE_* constants below list the values documented for {@code style}.
+ *
+ * @author lingyue
+ * @since 2026-01-22
+ */
+@Data
+@Schema(description = "AI总结配置")
+public class AISummarizeConfig {
+    
+    @Schema(description = "总结要求/提示词")
+    private String summarizePrompt;
+    
+    @Schema(description = "关注维度列表")
+    private List<String> focusPoints;
+    
+    @Schema(description = "总结规则(如:必须包含XXX,不要提及XXX)")
+    private List<String> rules;
+    
+    @Schema(description = "输出风格: formal-正式, concise-简洁, detailed-详细")
+    private String style;
+    
+    @Schema(description = "最大字数限制")
+    private Integer maxLength;
+    
+    @Schema(description = "上下文字段Key列表(引用已提取的字段作为上下文)")
+    private List<String> contextFieldKeys;
+    
+    @Schema(description = "使用的AI模型(可选)")
+    private String modelName;
+    
+    @Schema(description = "温度参数")
+    private Double temperature;
+    
+    // ==================== Style constants ====================
+    
+    public static final String STYLE_FORMAL = "formal";
+    public static final String STYLE_CONCISE = "concise";
+    public static final String STYLE_DETAILED = "detailed";
+}

+ 36 - 0
backend/extract-service/src/main/java/com/lingyue/extract/dto/config/DirectExtractConfig.java

@@ -0,0 +1,36 @@
+package com.lingyue.extract.dto.config;
+
+import io.swagger.v3.oas.annotations.media.Schema;
+import lombok.Data;
+
+/**
+ * Configuration for direct extraction.
+ *
+ * Used by rules whose extractType = "direct":
+ * the located content is used as-is, without any AI processing.
+ * Holds whitespace clean-up flags, an optional regex (pattern + group index)
+ * and an optional {@link TransformConfig}.
+ *
+ * @author lingyue
+ * @since 2026-01-22
+ */
+@Data
+@Schema(description = "直接提取配置")
+public class DirectExtractConfig {
+    
+    @Schema(description = "去除首尾空白")
+    private Boolean trimWhitespace;
+    
+    @Schema(description = "移除换行符")
+    private Boolean removeLineBreaks;
+    
+    @Schema(description = "合并连续空格")
+    private Boolean mergeSpaces;
+    
+    @Schema(description = "正则提取模式(可选,用于从内容中提取特定部分)")
+    private String regexPattern;
+    
+    @Schema(description = "正则提取分组索引(默认0表示整个匹配)")
+    private Integer regexGroupIndex;
+    
+    @Schema(description = "值转换配置")
+    private TransformConfig transform;
+}

+ 49 - 0
backend/extract-service/src/main/java/com/lingyue/extract/dto/config/DocumentSourceConfig.java

@@ -0,0 +1,49 @@
+package com.lingyue.extract.dto.config;
+
+import io.swagger.v3.oas.annotations.media.Schema;
+import lombok.Data;
+
+/**
+ * Configuration for a document source.
+ *
+ * Used by rules whose sourceType = "document".
+ * Combines the source document id, a {@link LocationConfig} describing where
+ * to look, and optional content pre-processing settings.
+ *
+ * @author lingyue
+ * @since 2026-01-22
+ */
+@Data
+@Schema(description = "文档来源配置")
+public class DocumentSourceConfig {
+    
+    @Schema(description = "来源文档ID(SourceDocument.id)")
+    private String sourceDocId;
+    
+    @Schema(description = "定位配置")
+    private LocationConfig location;
+    
+    @Schema(description = "是否包含整个定位范围的内容")
+    private Boolean includeFullRange;
+    
+    @Schema(description = "内容预处理选项")
+    private PreprocessConfig preprocess;
+    
+    /**
+     * Content pre-processing settings: whitespace clean-up flags plus a
+     * maximum content length (described in the schema as used for truncation
+     * before AI calls).
+     */
+    @Data
+    @Schema(description = "内容预处理配置")
+    public static class PreprocessConfig {
+        
+        @Schema(description = "去除首尾空白")
+        private Boolean trimWhitespace;
+        
+        @Schema(description = "移除换行符")
+        private Boolean removeLineBreaks;
+        
+        @Schema(description = "合并连续空格")
+        private Boolean mergeSpaces;
+        
+        @Schema(description = "最大内容长度(用于AI调用时截断)")
+        private Integer maxLength;
+    }
+}

+ 30 - 0
backend/extract-service/src/main/java/com/lingyue/extract/dto/config/FixedSourceConfig.java

@@ -0,0 +1,30 @@
+package com.lingyue.extract.dto.config;
+
+import io.swagger.v3.oas.annotations.media.Schema;
+import lombok.Data;
+
+/**
+ * Configuration for a fixed-content source.
+ *
+ * Used by rules whose sourceType = "fixed":
+ * the configured fixed content is used directly as the value.
+ * The CONTENT_TYPE_* constants below list the values documented for {@code contentType}.
+ *
+ * @author lingyue
+ * @since 2026-01-22
+ */
+@Data
+@Schema(description = "固定内容来源配置")
+public class FixedSourceConfig {
+    
+    @Schema(description = "固定内容")
+    private String fixedContent;
+    
+    @Schema(description = "内容类型: text-文本, html-HTML, markdown-Markdown")
+    private String contentType;
+    
+    // ==================== Content type constants ====================
+    
+    public static final String CONTENT_TYPE_TEXT = "text";
+    public static final String CONTENT_TYPE_HTML = "html";
+    public static final String CONTENT_TYPE_MARKDOWN = "markdown";
+}

+ 61 - 0
backend/extract-service/src/main/java/com/lingyue/extract/dto/config/LocationConfig.java

@@ -0,0 +1,61 @@
+package com.lingyue.extract.dto.config;
+
+import io.swagger.v3.oas.annotations.media.Schema;
+import lombok.Data;
+
+import java.util.List;
+
+/**
+ * Location configuration.
+ *
+ * Describes how to locate content inside a document. {@code type} selects one
+ * of the TYPE_* modes below; the remaining fields are grouped by the mode they
+ * belong to (page, chapter, element id, cell).
+ *
+ * @author lingyue
+ * @since 2026-01-22
+ */
+@Data
+@Schema(description = "定位配置")
+public class LocationConfig {
+    
+    @Schema(description = "定位类型: page-按页码, chapter-按章节, element-按元素ID, cell-按单元格")
+    private String type;
+    
+    // ==================== Locate by page number ====================
+    
+    @Schema(description = "起始页码")
+    private Integer pageStart;
+    
+    @Schema(description = "结束页码")
+    private Integer pageEnd;
+    
+    @Schema(description = "段落关键词过滤")
+    private String paragraphKeyword;
+    
+    // ==================== Locate by chapter ====================
+    
+    @Schema(description = "章节路径,如 ['3', '5', '3', '3'] 表示 3.5.3.3")
+    private List<String> chapterPath;
+    
+    @Schema(description = "章节标题关键词")
+    private String chapterTitle;
+    
+    // ==================== Locate by element ID ====================
+    
+    @Schema(description = "元素ID列表")
+    private List<String> elementIds;
+    
+    // ==================== Locate by cell (Excel) ====================
+    
+    @Schema(description = "Sheet名称")
+    private String sheetName;
+    
+    @Schema(description = "单元格引用,如 'A1:C10' 或 '1.5.1'")
+    private String cellRef;
+    
+    // ==================== Location type constants ====================
+    
+    public static final String TYPE_PAGE = "page";
+    public static final String TYPE_CHAPTER = "chapter";
+    public static final String TYPE_ELEMENT = "element";
+    public static final String TYPE_CELL = "cell";
+}

+ 43 - 0
backend/extract-service/src/main/java/com/lingyue/extract/dto/config/ManualSourceConfig.java

@@ -0,0 +1,43 @@
+package com.lingyue.extract.dto.config;
+
+import io.swagger.v3.oas.annotations.media.Schema;
+import lombok.Data;
+
+/**
+ * Configuration for a manual-input source.
+ *
+ * Used by rules whose sourceType = "manual":
+ * the user has to fill in the content by hand.
+ * The INPUT_TYPE_* constants below list the values documented for {@code inputType}.
+ *
+ * @author lingyue
+ * @since 2026-01-22
+ */
+@Data
+@Schema(description = "手动输入来源配置")
+public class ManualSourceConfig {
+    
+    @Schema(description = "输入提示")
+    private String placeholder;
+    
+    @Schema(description = "默认值")
+    private String defaultValue;
+    
+    @Schema(description = "是否必填")
+    private Boolean required;
+    
+    @Schema(description = "验证规则(正则表达式)")
+    private String validationPattern;
+    
+    @Schema(description = "验证错误提示")
+    private String validationMessage;
+    
+    @Schema(description = "输入类型: text-单行文本, textarea-多行文本, date-日期, number-数字")
+    private String inputType;
+    
+    // ==================== Input type constants ====================
+    
+    public static final String INPUT_TYPE_TEXT = "text";
+    public static final String INPUT_TYPE_TEXTAREA = "textarea";
+    public static final String INPUT_TYPE_DATE = "date";
+    public static final String INPUT_TYPE_NUMBER = "number";
+}

+ 58 - 0
backend/extract-service/src/main/java/com/lingyue/extract/dto/config/OcrExtractConfig.java

@@ -0,0 +1,58 @@
+package com.lingyue.extract.dto.config;
+
+import io.swagger.v3.oas.annotations.media.Schema;
+import lombok.Data;
+
+/**
+ * Configuration for OCR extraction.
+ *
+ * Used by rules whose extractType = "ocr":
+ * OCR is used to recognize text inside images.
+ * The ENGINE_* constants below list the values documented for {@code ocrEngine}.
+ *
+ * @author lingyue
+ * @since 2026-01-22
+ */
+@Data
+@Schema(description = "OCR提取配置")
+public class OcrExtractConfig {
+    
+    @Schema(description = "OCR引擎: paddleocr-飞桨OCR, tesseract-Tesseract")
+    private String ocrEngine;
+    
+    @Schema(description = "识别语言: chi_sim-简体中文, eng-英文")
+    private String language;
+    
+    @Schema(description = "是否进行表格识别")
+    private Boolean detectTable;
+    
+    @Schema(description = "是否进行版面分析")
+    private Boolean layoutAnalysis;
+    
+    @Schema(description = "图片预处理选项")
+    private PreprocessOptions preprocessOptions;
+    
+    /**
+     * Image pre-processing options: grayscale, binarize, denoise and deskew flags.
+     */
+    @Data
+    @Schema(description = "图片预处理选项")
+    public static class PreprocessOptions {
+        
+        @Schema(description = "是否转为灰度图")
+        private Boolean grayscale;
+        
+        @Schema(description = "是否进行二值化")
+        private Boolean binarize;
+        
+        @Schema(description = "是否去噪")
+        private Boolean denoise;
+        
+        @Schema(description = "是否校正倾斜")
+        private Boolean deskew;
+    }
+    
+    // ==================== OCR engine constants ====================
+    
+    public static final String ENGINE_PADDLEOCR = "paddleocr";
+    public static final String ENGINE_TESSERACT = "tesseract";
+}

+ 35 - 0
backend/extract-service/src/main/java/com/lingyue/extract/dto/config/SelfReferenceSourceConfig.java

@@ -0,0 +1,35 @@
+package com.lingyue.extract.dto.config;
+
+import io.swagger.v3.oas.annotations.media.Schema;
+import lombok.Data;
+
+import java.util.List;
+
+/**
+ * Configuration for a self-reference source.
+ *
+ * Used by rules whose sourceType = "self_reference":
+ * the value is built from other, already extracted field values.
+ * Holds the referenced field keys, an optional combine template, an optional
+ * {@link TransformConfig}, a default value and a "require all" flag.
+ *
+ * @author lingyue
+ * @since 2026-01-22
+ */
+@Data
+@Schema(description = "自引用来源配置")
+public class SelfReferenceSourceConfig {
+    
+    @Schema(description = "引用的字段Key列表")
+    private List<String> referenceFieldKeys;
+    
+    @Schema(description = "组合模板,如 '{project_name} - {project_code}'")
+    private String combineTemplate;
+    
+    @Schema(description = "值转换配置")
+    private TransformConfig transform;
+    
+    @Schema(description = "当引用字段为空时的默认值")
+    private String defaultValue;
+    
+    @Schema(description = "是否必须所有引用字段都有值")
+    private Boolean requireAll;
+}

+ 44 - 0
backend/extract-service/src/main/java/com/lingyue/extract/dto/config/TransformConfig.java

@@ -0,0 +1,44 @@
+package com.lingyue.extract.dto.config;
+
+import io.swagger.v3.oas.annotations.media.Schema;
+import lombok.Data;
+
+import java.util.Map;
+
+/**
+ * Transform configuration.
+ *
+ * Describes how a value is transformed. {@code type} selects one of the
+ * TYPE_* modes below (mapping, format, concat, split); the other fields carry
+ * the parameters of the individual modes.
+ *
+ * @author lingyue
+ * @since 2026-01-22
+ */
+@Data
+@Schema(description = "转换配置")
+public class TransformConfig {
+    
+    @Schema(description = "转换类型: mapping-映射, format-格式化, concat-拼接, split-分割")
+    private String type;
+    
+    @Schema(description = "映射表,如 {'是': 'true', '否': 'false'}")
+    private Map<String, String> mapping;
+    
+    @Schema(description = "格式化模板,如 '{value}万元'")
+    private String formatTemplate;
+    
+    @Schema(description = "拼接分隔符")
+    private String concatSeparator;
+    
+    @Schema(description = "分割分隔符")
+    private String splitSeparator;
+    
+    @Schema(description = "取分割后的第几个(0-based)")
+    private Integer splitIndex;
+    
+    // ==================== Transform type constants ====================
+    
+    public static final String TYPE_MAPPING = "mapping";
+    public static final String TYPE_FORMAT = "format";
+    public static final String TYPE_CONCAT = "concat";
+    public static final String TYPE_SPLIT = "split";
+}

+ 118 - 0
backend/extract-service/src/main/java/com/lingyue/extract/entity/ExtractResult.java

@@ -0,0 +1,118 @@
+package com.lingyue.extract.entity;
+
+import com.baomidou.mybatisplus.annotation.TableField;
+import com.baomidou.mybatisplus.annotation.TableId;
+import com.baomidou.mybatisplus.annotation.TableName;
+import com.baomidou.mybatisplus.extension.handlers.JacksonTypeHandler;
+import com.fasterxml.jackson.annotation.JsonFormat;
+import io.swagger.v3.oas.annotations.media.Schema;
+import lombok.Data;
+
+import java.math.BigDecimal;
+import java.util.Date;
+import java.util.Map;
+
+/**
+ * 提取结果实体
+ * 
+ * 规则执行后的提取值,保存来源追溯信息
+ * 
+ * @author lingyue
+ * @since 2026-01-22
+ */
+@Data
+@TableName(value = "extract_results", autoResultMap = true)
+@Schema(description = "提取结果")
+public class ExtractResult {
+    
+    @Schema(description = "ID")
+    @TableId
+    private String id;
+    
+    @Schema(description = "规则ID")
+    @TableField("rule_id")
+    private String ruleId;
+    
+    @Schema(description = "项目ID")
+    @TableField("project_id")
+    private String projectId;
+    
+    // ==================== 提取结果 ====================
+    
+    @Schema(description = "提取出的值")
+    @TableField("extracted_value")
+    private String extractedValue;
+    
+    @Schema(description = "值类型: text-文本, table-表格, image-图片, list-列表")
+    @TableField("value_type")
+    private String valueType;
+    
+    // ==================== 来源追溯 ====================
+    
+    @Schema(description = "来源原文内容")
+    @TableField("source_content")
+    private String sourceContent;
+    
+    @Schema(description = "来源位置信息")
+    @TableField(value = "source_location", typeHandler = JacksonTypeHandler.class)
+    private Map<String, Object> sourceLocation;
+    
+    // ==================== 质量评估 ====================
+    
+    @Schema(description = "AI提取置信度 0-1")
+    @TableField("confidence")
+    private BigDecimal confidence;
+    
+    // ==================== 状态 ====================
+    
+    @Schema(description = "状态: extracted-已提取, confirmed-已确认, rejected-已拒绝, modified-已修正")
+    @TableField("status")
+    private String status;
+    
+    // ==================== 人工处理 ====================
+    
+    @Schema(description = "人工修正后的值")
+    @TableField("modified_value")
+    private String modifiedValue;
+    
+    @Schema(description = "确认时间")
+    @JsonFormat(pattern = "yyyy-MM-dd HH:mm:ss")
+    @TableField("confirmed_at")
+    private Date confirmedAt;
+    
+    @Schema(description = "确认人")
+    @TableField("confirmed_by")
+    private String confirmedBy;
+    
+    @Schema(description = "拒绝原因")
+    @TableField("reject_reason")
+    private String rejectReason;
+    
+    // ==================== 元数据 ====================
+    
+    @Schema(description = "元数据(AI输出、处理日志等)")
+    @TableField(value = "metadata", typeHandler = JacksonTypeHandler.class)
+    private Map<String, Object> metadata;
+    
+    @Schema(description = "创建时间")
+    @JsonFormat(pattern = "yyyy-MM-dd HH:mm:ss")
+    @TableField("create_time")
+    private Date createTime;
+    
+    // ==================== 状态常量 ====================
+    
+    public static final String STATUS_EXTRACTED = "extracted";
+    public static final String STATUS_CONFIRMED = "confirmed";
+    public static final String STATUS_REJECTED = "rejected";
+    public static final String STATUS_MODIFIED = "modified";
+    
+    /**
+     * 获取最终值(优先使用修正后的值)
+     */
+    public String getFinalValue() {
+        if (modifiedValue != null && !modifiedValue.isEmpty()) {
+            return modifiedValue;
+        }
+        return extractedValue;
+    }
+}

+ 125 - 0
backend/extract-service/src/main/java/com/lingyue/extract/entity/ExtractRule.java

@@ -0,0 +1,125 @@
+package com.lingyue.extract.entity;
+
+import com.baomidou.mybatisplus.annotation.TableField;
+import com.baomidou.mybatisplus.annotation.TableName;
+import com.baomidou.mybatisplus.extension.handlers.JacksonTypeHandler;
+import com.lingyue.common.domain.entity.SimpleModel;
+import io.swagger.v3.oas.annotations.media.Schema;
+import lombok.Data;
+import lombok.EqualsAndHashCode;
+
+import java.util.Map;
+
+/**
+ * Extraction rule entity, mapped to table {@code extract_rules}.
+ *
+ * Describes how data is extracted from a source: the target field, the
+ * source type and its configuration, the extraction type and its
+ * configuration, and the current result/status of the rule.
+ *
+ * @author lingyue
+ * @since 2026-01-22
+ */
+@EqualsAndHashCode(callSuper = true)
+@Data
+@TableName(value = "extract_rules", autoResultMap = true)
+@Schema(description = "数据提取规则")
+public class ExtractRule extends SimpleModel {
+    
+    @Schema(description = "项目ID")
+    @TableField("project_id")
+    private String projectId;
+    
+    @Schema(description = "来源文档ID(可为空,表示引用/固定/手动类型)")
+    @TableField("source_doc_id")
+    private String sourceDocId;
+    
+    // ==================== Target field ====================
+    
+    @Schema(description = "目标字段Key(程序用)")
+    @TableField("target_field_key")
+    private String targetFieldKey;
+    
+    @Schema(description = "目标字段名称(显示用)")
+    @TableField("target_field_name")
+    private String targetFieldName;
+    
+    @Schema(description = "字段分组")
+    @TableField("target_field_group")
+    private String targetFieldGroup;
+    
+    @Schema(description = "规则顺序")
+    @TableField("rule_index")
+    private Integer ruleIndex;
+    
+    // ==================== Source configuration ====================
+    
+    @Schema(description = "来源类型: document-文档, self_reference-引用已提取字段, fixed-固定值, manual-手动输入")
+    @TableField("source_type")
+    private String sourceType;
+    
+    @Schema(description = "来源配置")
+    @TableField(value = "source_config", typeHandler = JacksonTypeHandler.class)
+    private Map<String, Object> sourceConfig;
+    
+    // ==================== Extraction configuration ====================
+    
+    @Schema(description = "提取类型: direct-直接提取, ai_extract-AI字段提取, ai_summarize-AI总结, ocr-OCR识别")
+    @TableField("extract_type")
+    private String extractType;
+    
+    @Schema(description = "提取配置")
+    @TableField(value = "extract_config", typeHandler = JacksonTypeHandler.class)
+    private Map<String, Object> extractConfig;
+    
+    // ==================== Result ====================
+    
+    @Schema(description = "状态: pending-待提取, extracting-提取中, extracted-已提取, confirmed-已确认, error-错误")
+    @TableField("status")
+    private String status;
+    
+    @Schema(description = "提取出的值")
+    @TableField("extracted_value")
+    private String extractedValue;
+    
+    @Schema(description = "值类型: text-文本, table-表格, image-图片, list-列表")
+    @TableField("value_type")
+    private String valueType;
+    
+    @Schema(description = "错误信息")
+    @TableField("error_message")
+    private String errorMessage;
+    
+    // ==================== Metadata ====================
+    
+    @Schema(description = "元数据")
+    @TableField(value = "metadata", typeHandler = JacksonTypeHandler.class)
+    private Map<String, Object> metadata;
+    
+    // ==================== Source type constants ====================
+    
+    public static final String SOURCE_TYPE_DOCUMENT = "document";
+    public static final String SOURCE_TYPE_SELF_REFERENCE = "self_reference";
+    public static final String SOURCE_TYPE_FIXED = "fixed";
+    public static final String SOURCE_TYPE_MANUAL = "manual";
+    
+    // ==================== Extraction type constants ====================
+    
+    public static final String EXTRACT_TYPE_DIRECT = "direct";
+    public static final String EXTRACT_TYPE_AI_EXTRACT = "ai_extract";
+    public static final String EXTRACT_TYPE_AI_SUMMARIZE = "ai_summarize";
+    public static final String EXTRACT_TYPE_OCR = "ocr";
+    
+    // ==================== Status constants ====================
+    
+    public static final String STATUS_PENDING = "pending";
+    public static final String STATUS_EXTRACTING = "extracting";
+    public static final String STATUS_EXTRACTED = "extracted";
+    public static final String STATUS_CONFIRMED = "confirmed";
+    public static final String STATUS_ERROR = "error";
+    
+    // ==================== Value type constants ====================
+    
+    public static final String VALUE_TYPE_TEXT = "text";
+    public static final String VALUE_TYPE_TABLE = "table";
+    public static final String VALUE_TYPE_IMAGE = "image";
+    public static final String VALUE_TYPE_LIST = "list";
+}

+ 53 - 0
backend/extract-service/src/main/java/com/lingyue/extract/entity/Project.java

@@ -0,0 +1,53 @@
+package com.lingyue.extract.entity;
+
+import com.baomidou.mybatisplus.annotation.TableField;
+import com.baomidou.mybatisplus.annotation.TableName;
+import com.baomidou.mybatisplus.extension.handlers.JacksonTypeHandler;
+import com.lingyue.common.domain.entity.SimpleModel;
+import io.swagger.v3.oas.annotations.media.Schema;
+import lombok.Data;
+import lombok.EqualsAndHashCode;
+
+import java.util.Map;
+
+/**
+ * Data extraction project entity, mapped to table {@code extract_projects}.
+ *
+ * One project represents one report generation task and contains multiple
+ * source documents and extraction rules.
+ * The STATUS_* constants below list the values documented for {@code status}.
+ *
+ * @author lingyue
+ * @since 2026-01-22
+ */
+@EqualsAndHashCode(callSuper = true)
+@Data
+@TableName(value = "extract_projects", autoResultMap = true)
+@Schema(description = "数据提取项目")
+public class Project extends SimpleModel {
+    
+    @Schema(description = "用户ID")
+    @TableField("user_id")
+    private String userId;
+    
+    @Schema(description = "项目名称")
+    @TableField("name")
+    private String name;
+    
+    @Schema(description = "项目描述")
+    @TableField("description")
+    private String description;
+    
+    @Schema(description = "状态: draft-草稿, extracting-提取中, completed-已完成, archived-已归档")
+    @TableField("status")
+    private String status;
+    
+    @Schema(description = "项目配置")
+    @TableField(value = "config", typeHandler = JacksonTypeHandler.class)
+    private Map<String, Object> config;
+    
+    // ==================== Status constants ====================
+    
+    public static final String STATUS_DRAFT = "draft";
+    public static final String STATUS_EXTRACTING = "extracting";
+    public static final String STATUS_COMPLETED = "completed";
+    public static final String STATUS_ARCHIVED = "archived";
+}

+ 67 - 0
backend/extract-service/src/main/java/com/lingyue/extract/entity/RuleTemplate.java

@@ -0,0 +1,67 @@
+package com.lingyue.extract.entity;
+
+import com.baomidou.mybatisplus.annotation.TableField;
+import com.baomidou.mybatisplus.annotation.TableName;
+import com.baomidou.mybatisplus.extension.handlers.JacksonTypeHandler;
+import com.lingyue.common.domain.entity.SimpleModel;
+import io.swagger.v3.oas.annotations.media.Schema;
+import lombok.Data;
+import lombok.EqualsAndHashCode;
+
+import java.util.List;
+import java.util.Map;
+
+/**
+ * Rule template entity, mapped to table {@code extract_rule_templates}.
+ *
+ * Used to save and reuse rule configurations: stores a snapshot of rules,
+ * an applicable document type pattern, a public flag and a usage counter.
+ *
+ * @author lingyue
+ * @since 2026-01-22
+ */
+@EqualsAndHashCode(callSuper = true)
+@Data
+@TableName(value = "extract_rule_templates", autoResultMap = true)
+@Schema(description = "规则模板")
+public class RuleTemplate extends SimpleModel {
+    
+    @Schema(description = "用户ID")
+    @TableField("user_id")
+    private String userId;
+    
+    @Schema(description = "模板名称")
+    @TableField("name")
+    private String name;
+    
+    @Schema(description = "模板描述")
+    @TableField("description")
+    private String description;
+    
+    // ==================== Template content ====================
+    
+    @Schema(description = "规则配置快照")
+    @TableField(value = "rules_snapshot", typeHandler = JacksonTypeHandler.class)
+    private List<Map<String, Object>> rulesSnapshot;
+    
+    @Schema(description = "适用的文档类型模式")
+    @TableField(value = "doc_type_pattern", typeHandler = JacksonTypeHandler.class)
+    private Map<String, Object> docTypePattern;
+    
+    // ==================== Visibility ====================
+    
+    @Schema(description = "是否公开")
+    @TableField("is_public")
+    private Boolean isPublic;
+    
+    // ==================== Statistics ====================
+    
+    @Schema(description = "使用次数")
+    @TableField("use_count")
+    private Integer useCount;
+    
+    // ==================== Metadata ====================
+    
+    @Schema(description = "元数据")
+    @TableField(value = "metadata", typeHandler = JacksonTypeHandler.class)
+    private Map<String, Object> metadata;
+}

+ 65 - 0
backend/extract-service/src/main/java/com/lingyue/extract/entity/SourceDocument.java

@@ -0,0 +1,65 @@
+package com.lingyue.extract.entity;
+
+import com.baomidou.mybatisplus.annotation.TableField;
+import com.baomidou.mybatisplus.annotation.TableId;
+import com.baomidou.mybatisplus.annotation.TableName;
+import com.baomidou.mybatisplus.extension.handlers.JacksonTypeHandler;
+import com.fasterxml.jackson.annotation.JsonFormat;
+import io.swagger.v3.oas.annotations.media.Schema;
+import lombok.Data;
+
+import java.util.Date;
+import java.util.Map;
+
+/**
+ * Source document entity, mapped to table {@code extract_source_documents}.
+ *
+ * A document used within a project; it links to an already parsed Document
+ * through {@code documentId}.
+ * The DOC_TYPE_* constants below list the values documented for {@code docType}.
+ *
+ * @author lingyue
+ * @since 2026-01-22
+ */
+@Data
+@TableName(value = "extract_source_documents", autoResultMap = true)
+@Schema(description = "项目来源文档")
+public class SourceDocument {
+    
+    @Schema(description = "ID")
+    @TableId
+    private String id;
+    
+    @Schema(description = "项目ID")
+    @TableField("project_id")
+    private String projectId;
+    
+    @Schema(description = "关联的 Document ID")
+    @TableField("document_id")
+    private String documentId;
+    
+    @Schema(description = "文档别名,如'可研批复'")
+    @TableField("alias")
+    private String alias;
+    
+    @Schema(description = "文档类型: pdf/docx/xlsx")
+    @TableField("doc_type")
+    private String docType;
+    
+    @Schema(description = "显示顺序")
+    @TableField("display_order")
+    private Integer displayOrder;
+    
+    @Schema(description = "元数据")
+    @TableField(value = "metadata", typeHandler = JacksonTypeHandler.class)
+    private Map<String, Object> metadata;
+    
+    @Schema(description = "创建时间")
+    @JsonFormat(pattern = "yyyy-MM-dd HH:mm:ss")
+    @TableField("create_time")
+    private Date createTime;
+    
+    // ==================== Document type constants ====================
+    
+    public static final String DOC_TYPE_PDF = "pdf";
+    public static final String DOC_TYPE_DOCX = "docx";
+    public static final String DOC_TYPE_XLSX = "xlsx";
+}

+ 74 - 0
backend/extract-service/src/main/java/com/lingyue/extract/repository/ExtractResultRepository.java

@@ -0,0 +1,74 @@
+package com.lingyue.extract.repository;
+
+import com.baomidou.mybatisplus.core.mapper.BaseMapper;
+import com.lingyue.extract.entity.ExtractResult;
+import org.apache.ibatis.annotations.Delete;
+import org.apache.ibatis.annotations.Mapper;
+import org.apache.ibatis.annotations.Param;
+import org.apache.ibatis.annotations.ResultMap;
+import org.apache.ibatis.annotations.Select;
+
+import java.util.List;
+
+/**
+ * Repository (MyBatis mapper) for {@link ExtractResult}.
+ *
+ * Every custom query that returns {@link ExtractResult} rows is bound to the
+ * result map MyBatis-Plus generates for the entity
+ * ({@code mybatis-plus_ExtractResult}, produced because the entity declares
+ * {@code autoResultMap = true}). Hand-written {@code @Select} statements do
+ * not pick that map up on their own, so without the explicit binding the
+ * {@code JacksonTypeHandler} columns ({@code source_location},
+ * {@code metadata}) would not be read through their type handler.
+ *
+ * @author lingyue
+ * @since 2026-01-22
+ */
+@Mapper
+public interface ExtractResultRepository extends BaseMapper<ExtractResult> {
+    
+    /** Result map id generated by MyBatis-Plus for {@link ExtractResult}. */
+    String RESULT_MAP = "mybatis-plus_ExtractResult";
+    
+    /**
+     * Lists the results of a rule, newest first.
+     *
+     * @param ruleId id of the rule
+     * @return results ordered by {@code create_time} descending
+     */
+    @ResultMap(RESULT_MAP)
+    @Select("SELECT * FROM extract_results WHERE rule_id = #{ruleId} ORDER BY create_time DESC")
+    List<ExtractResult> findByRuleId(@Param("ruleId") String ruleId);
+    
+    /**
+     * Finds the most recent result of a rule.
+     *
+     * @param ruleId id of the rule
+     * @return the newest result, or {@code null} when the rule has none
+     */
+    @ResultMap(RESULT_MAP)
+    @Select("SELECT * FROM extract_results WHERE rule_id = #{ruleId} ORDER BY create_time DESC LIMIT 1")
+    ExtractResult findLatestByRuleId(@Param("ruleId") String ruleId);
+    
+    /**
+     * Lists all results of a project, newest first.
+     *
+     * @param projectId id of the project
+     * @return results ordered by {@code create_time} descending
+     */
+    @ResultMap(RESULT_MAP)
+    @Select("SELECT * FROM extract_results WHERE project_id = #{projectId} ORDER BY create_time DESC")
+    List<ExtractResult> findByProjectId(@Param("projectId") String projectId);
+    
+    /**
+     * Lists the results of a project that are in the given status, newest first.
+     *
+     * @param projectId id of the project
+     * @param status    status value to match
+     * @return matching results ordered by {@code create_time} descending
+     */
+    @ResultMap(RESULT_MAP)
+    @Select("SELECT * FROM extract_results WHERE project_id = #{projectId} AND status = #{status} ORDER BY create_time DESC")
+    List<ExtractResult> findByProjectIdAndStatus(@Param("projectId") String projectId, @Param("status") String status);
+    
+    /**
+     * Deletes all results of a rule.
+     *
+     * @param ruleId id of the rule
+     * @return number of deleted rows
+     */
+    @Delete("DELETE FROM extract_results WHERE rule_id = #{ruleId}")
+    int deleteByRuleId(@Param("ruleId") String ruleId);
+    
+    /**
+     * Deletes all results of a project.
+     *
+     * @param projectId id of the project
+     * @return number of deleted rows
+     */
+    @Delete("DELETE FROM extract_results WHERE project_id = #{projectId}")
+    int deleteByProjectId(@Param("projectId") String projectId);
+    
+    /**
+     * Counts the results of a project.
+     *
+     * @param projectId id of the project
+     * @return number of result rows
+     */
+    @Select("SELECT COUNT(*) FROM extract_results WHERE project_id = #{projectId}")
+    int countByProjectId(@Param("projectId") String projectId);
+    
+    /**
+     * Counts the results of a project grouped by status.
+     *
+     * @param projectId id of the project
+     * @return one map per status with the selected columns {@code status} and {@code count}
+     */
+    @Select("SELECT status, COUNT(*) as count FROM extract_results WHERE project_id = #{projectId} GROUP BY status")
+    List<java.util.Map<String, Object>> countByProjectIdGroupByStatus(@Param("projectId") String projectId);
+    
+    /**
+     * Lists the results of a project that still await confirmation
+     * (status {@code 'extracted'}), oldest first.
+     *
+     * @param projectId id of the project
+     * @return matching results ordered by {@code create_time} ascending
+     */
+    @ResultMap(RESULT_MAP)
+    @Select("SELECT * FROM extract_results WHERE project_id = #{projectId} AND status = 'extracted' ORDER BY create_time")
+    List<ExtractResult> findPendingConfirmByProjectId(@Param("projectId") String projectId);
+}

+ 88 - 0
backend/extract-service/src/main/java/com/lingyue/extract/repository/ExtractRuleRepository.java

@@ -0,0 +1,88 @@
+package com.lingyue.extract.repository;
+
+import com.baomidou.mybatisplus.core.mapper.BaseMapper;
+import com.lingyue.extract.entity.ExtractRule;
+import org.apache.ibatis.annotations.Delete;
+import org.apache.ibatis.annotations.Mapper;
+import org.apache.ibatis.annotations.Param;
+import org.apache.ibatis.annotations.Select;
+
+import java.util.List;
+
+/**
+ * MyBatis mapper for {@link ExtractRule} rows stored in the {@code extract_rules} table.
+ *
+ * <p>Extends {@link BaseMapper} and adds project-scoped lookups, counters and a
+ * dependency-analysis query on top of it.
+ *
+ * @author lingyue
+ * @since 2026-01-22
+ */
+@Mapper
+public interface ExtractRuleRepository extends BaseMapper<ExtractRule> {
+    
+    /**
+     * Lists the rules of a project in their configured order.
+     *
+     * @param projectId value compared with the {@code project_id} column
+     * @return matching rows ordered by {@code rule_index} ascending
+     */
+    @Select("SELECT * FROM extract_rules WHERE project_id = #{projectId} ORDER BY rule_index")
+    List<ExtractRule> findByProjectId(@Param("projectId") String projectId);
+    
+    /**
+     * Lists the rules of a project that are in a given status.
+     *
+     * @param projectId value compared with the {@code project_id} column
+     * @param status    value compared with the {@code status} column (exact match)
+     * @return matching rows ordered by {@code rule_index} ascending
+     */
+    @Select("SELECT * FROM extract_rules WHERE project_id = #{projectId} AND status = #{status} ORDER BY rule_index")
+    List<ExtractRule> findByProjectIdAndStatus(@Param("projectId") String projectId, @Param("status") String status);
+    
+    /**
+     * Looks up the rule of a project that writes to a given target field.
+     *
+     * <p>NOTE(review): the single-object return type assumes at most one rule per
+     * (project, target field key) pair - confirm a unique constraint exists.
+     *
+     * @param projectId      value compared with the {@code project_id} column
+     * @param targetFieldKey value compared with the {@code target_field_key} column
+     * @return the matching rule
+     */
+    @Select("SELECT * FROM extract_rules WHERE project_id = #{projectId} AND target_field_key = #{targetFieldKey}")
+    ExtractRule findByProjectIdAndTargetFieldKey(@Param("projectId") String projectId, 
+                                                  @Param("targetFieldKey") String targetFieldKey);
+    
+    /**
+     * Lists the rules that read from a given source document.
+     *
+     * @param sourceDocId value compared with the {@code source_doc_id} column
+     * @return matching rows ordered by {@code rule_index} ascending
+     */
+    @Select("SELECT * FROM extract_rules WHERE source_doc_id = #{sourceDocId} ORDER BY rule_index")
+    List<ExtractRule> findBySourceDocId(@Param("sourceDocId") String sourceDocId);
+    
+    /**
+     * Lists the rules of a project that use a given source type.
+     *
+     * @param projectId  value compared with the {@code project_id} column
+     * @param sourceType value compared with the {@code source_type} column (exact match)
+     * @return matching rows ordered by {@code rule_index} ascending
+     */
+    @Select("SELECT * FROM extract_rules WHERE project_id = #{projectId} AND source_type = #{sourceType} ORDER BY rule_index")
+    List<ExtractRule> findByProjectIdAndSourceType(@Param("projectId") String projectId, 
+                                                    @Param("sourceType") String sourceType);
+    
+    /**
+     * Deletes every rule that belongs to a project.
+     *
+     * @param projectId value compared with the {@code project_id} column
+     * @return number of rows deleted
+     */
+    @Delete("DELETE FROM extract_rules WHERE project_id = #{projectId}")
+    int deleteByProjectId(@Param("projectId") String projectId);
+    
+    /**
+     * Counts the rules that belong to a project.
+     *
+     * @param projectId value compared with the {@code project_id} column
+     * @return the {@code COUNT(*)} of matching rows
+     */
+    @Select("SELECT COUNT(*) FROM extract_rules WHERE project_id = #{projectId}")
+    int countByProjectId(@Param("projectId") String projectId);
+    
+    /**
+     * Counts the rules of a project, broken down by status.
+     *
+     * @param projectId value compared with the {@code project_id} column
+     * @return one map per distinct status, carrying the selected {@code status} and
+     *         {@code count} columns; no ordering is requested from the database
+     */
+    @Select("SELECT status, COUNT(*) as count FROM extract_rules WHERE project_id = #{projectId} GROUP BY status")
+    List<java.util.Map<String, Object>> countByProjectIdGroupByStatus(@Param("projectId") String projectId);
+    
+    /**
+     * Returns the highest {@code rule_index} used in a project.
+     *
+     * @param projectId value compared with the {@code project_id} column
+     * @return the maximum index, or {@code 0} when the project has no rules
+     *         ({@code COALESCE} replaces the {@code NULL} that {@code MAX} yields on no rows)
+     */
+    @Select("SELECT COALESCE(MAX(rule_index), 0) FROM extract_rules WHERE project_id = #{projectId}")
+    int getMaxRuleIndex(@Param("projectId") String projectId);
+    
+    /**
+     * Finds the self-referencing rules of a project whose source configuration mentions a
+     * given field key (used for dependency analysis).
+     *
+     * <p>The match is a literal substring search over the text form of {@code source_config}.
+     * The LIKE metacharacters {@code %} and {@code _} inside the key are escaped with
+     * {@code !} so that a key such as {@code unit_price} cannot match {@code unitXprice};
+     * the escape character itself is doubled first so that it stays literal too.
+     *
+     * <p>NOTE(review): this is still a substring test, so key {@code total} also matches a
+     * configuration that references {@code total_amount}; callers that need exact key
+     * matching must post-filter.
+     *
+     * @param projectId      value compared with the {@code project_id} column
+     * @param targetFieldKey field key searched for inside {@code source_config}
+     * @return rules with {@code source_type = 'self_reference'} whose configuration text
+     *         contains the key
+     */
+    @Select("""
+            SELECT * FROM extract_rules 
+            WHERE project_id = #{projectId} 
+            AND source_type = 'self_reference'
+            AND source_config::text LIKE
+                '%' || replace(replace(replace(#{targetFieldKey}, '!', '!!'), '%', '!%'), '_', '!_') || '%'
+                ESCAPE '!'
+            """)
+    List<ExtractRule> findRulesReferencingField(@Param("projectId") String projectId, 
+                                                 @Param("targetFieldKey") String targetFieldKey);
+}

+ 49 - 0
backend/extract-service/src/main/java/com/lingyue/extract/repository/ProjectRepository.java

@@ -0,0 +1,49 @@
+package com.lingyue.extract.repository;
+
+import com.baomidou.mybatisplus.core.mapper.BaseMapper;
+import com.lingyue.extract.entity.Project;
+import org.apache.ibatis.annotations.Mapper;
+import org.apache.ibatis.annotations.Param;
+import org.apache.ibatis.annotations.Select;
+
+import java.util.List;
+
+/**
+ * MyBatis mapper for {@link Project} rows stored in the {@code extract_projects} table.
+ *
+ * <p>Extends {@link BaseMapper} and adds lookups and counters filtered by owner and status.
+ *
+ * @author lingyue
+ * @since 2026-01-22
+ */
+@Mapper
+public interface ProjectRepository extends BaseMapper<Project> {
+    
+    /**
+     * Lists the projects owned by a user, newest first.
+     *
+     * @param userId value compared with the {@code user_id} column
+     * @return matching rows ordered by {@code create_time} descending
+     */
+    @Select("SELECT * FROM extract_projects WHERE user_id = #{userId} ORDER BY create_time DESC")
+    List<Project> findByUserId(@Param("userId") String userId);
+    
+    /**
+     * Lists all projects in a given status, regardless of owner, newest first.
+     *
+     * @param status value compared with the {@code status} column (exact match)
+     * @return matching rows ordered by {@code create_time} descending
+     */
+    @Select("SELECT * FROM extract_projects WHERE status = #{status} ORDER BY create_time DESC")
+    List<Project> findByStatus(@Param("status") String status);
+    
+    /**
+     * Lists the projects owned by a user that are in a given status, newest first.
+     *
+     * @param userId value compared with the {@code user_id} column
+     * @param status value compared with the {@code status} column (exact match)
+     * @return matching rows ordered by {@code create_time} descending
+     */
+    @Select("SELECT * FROM extract_projects WHERE user_id = #{userId} AND status = #{status} ORDER BY create_time DESC")
+    List<Project> findByUserIdAndStatus(@Param("userId") String userId, @Param("status") String status);
+    
+    /**
+     * Counts the projects owned by a user.
+     *
+     * @param userId value compared with the {@code user_id} column
+     * @return the {@code COUNT(*)} of matching rows
+     */
+    @Select("SELECT COUNT(*) FROM extract_projects WHERE user_id = #{userId}")
+    int countByUserId(@Param("userId") String userId);
+    
+    /**
+     * Counts the projects owned by a user, broken down by status.
+     *
+     * @param userId value compared with the {@code user_id} column
+     * @return one map per distinct status, carrying the selected {@code status} and
+     *         {@code count} columns; no ordering is requested from the database
+     */
+    @Select("SELECT status, COUNT(*) as count FROM extract_projects WHERE user_id = #{userId} GROUP BY status")
+    List<java.util.Map<String, Object>> countByUserIdGroupByStatus(@Param("userId") String userId);
+}

+ 65 - 0
backend/extract-service/src/main/java/com/lingyue/extract/repository/RuleTemplateRepository.java

@@ -0,0 +1,65 @@
+package com.lingyue.extract.repository;
+
+import com.baomidou.mybatisplus.core.mapper.BaseMapper;
+import com.lingyue.extract.entity.RuleTemplate;
+import org.apache.ibatis.annotations.Mapper;
+import org.apache.ibatis.annotations.Param;
+import org.apache.ibatis.annotations.Select;
+import org.apache.ibatis.annotations.Update;
+
+import java.util.List;
+
+/**
+ * MyBatis mapper for {@link RuleTemplate} rows stored in the {@code extract_rule_templates}
+ * table.
+ *
+ * <p>Extends {@link BaseMapper} and adds owner/public visibility lookups, a name search and
+ * a usage counter.
+ *
+ * @author lingyue
+ * @since 2026-01-22
+ */
+@Mapper
+public interface RuleTemplateRepository extends BaseMapper<RuleTemplate> {
+    
+    /**
+     * Lists the templates owned by a user, newest first.
+     *
+     * @param userId value compared with the {@code user_id} column
+     * @return matching rows ordered by {@code create_time} descending
+     */
+    @Select("SELECT * FROM extract_rule_templates WHERE user_id = #{userId} ORDER BY create_time DESC")
+    List<RuleTemplate> findByUserId(@Param("userId") String userId);
+    
+    /**
+     * Lists all public templates, most used first.
+     *
+     * @return rows with {@code is_public = true}, ordered by {@code use_count} descending and
+     *         then {@code create_time} descending
+     */
+    @Select("SELECT * FROM extract_rule_templates WHERE is_public = true ORDER BY use_count DESC, create_time DESC")
+    List<RuleTemplate> findPublicTemplates();
+    
+    /**
+     * Lists the templates a user can see: the user's own templates plus every public one.
+     *
+     * @param userId value compared with the {@code user_id} column
+     * @return matching rows ordered by {@code use_count} descending and then
+     *         {@code create_time} descending
+     */
+    @Select("""
+            SELECT * FROM extract_rule_templates 
+            WHERE user_id = #{userId} OR is_public = true 
+            ORDER BY use_count DESC, create_time DESC
+            """)
+    List<RuleTemplate> findAccessibleByUserId(@Param("userId") String userId);
+    
+    /**
+     * Searches the templates visible to a user (own plus public) by a substring of the name.
+     *
+     * <p>The keyword is matched literally: the LIKE metacharacters {@code %} and {@code _}
+     * inside it are escaped with {@code !} (and {@code !} itself is doubled first), so a
+     * keyword such as {@code 100%} or {@code a_b} only matches names that contain exactly
+     * those characters. An empty keyword still matches every visible template.
+     *
+     * @param userId  value compared with the {@code user_id} column
+     * @param keyword text that must occur inside {@code name} (case-sensitive {@code LIKE})
+     * @return matching rows ordered by {@code use_count} descending and then
+     *         {@code create_time} descending
+     */
+    @Select("""
+            SELECT * FROM extract_rule_templates 
+            WHERE (user_id = #{userId} OR is_public = true) 
+            AND name LIKE
+                '%' || replace(replace(replace(#{keyword}, '!', '!!'), '%', '!%'), '_', '!_') || '%'
+                ESCAPE '!'
+            ORDER BY use_count DESC, create_time DESC
+            """)
+    List<RuleTemplate> searchByName(@Param("userId") String userId, @Param("keyword") String keyword);
+    
+    /**
+     * Increments the usage counter of a template by one in a single UPDATE statement.
+     *
+     * @param id value compared with the {@code id} column
+     * @return number of rows updated
+     */
+    @Update("UPDATE extract_rule_templates SET use_count = use_count + 1 WHERE id = #{id}")
+    int incrementUseCount(@Param("id") String id);
+    
+    /**
+     * Counts the templates owned by a user.
+     *
+     * @param userId value compared with the {@code user_id} column
+     * @return the {@code COUNT(*)} of matching rows
+     */
+    @Select("SELECT COUNT(*) FROM extract_rule_templates WHERE user_id = #{userId}")
+    int countByUserId(@Param("userId") String userId);
+}

+ 56 - 0
backend/extract-service/src/main/java/com/lingyue/extract/repository/SourceDocumentRepository.java

@@ -0,0 +1,56 @@
+package com.lingyue.extract.repository;
+
+import com.baomidou.mybatisplus.core.mapper.BaseMapper;
+import com.lingyue.extract.entity.SourceDocument;
+import org.apache.ibatis.annotations.Delete;
+import org.apache.ibatis.annotations.Mapper;
+import org.apache.ibatis.annotations.Param;
+import org.apache.ibatis.annotations.Select;
+
+import java.util.List;
+
+/**
+ * MyBatis mapper for {@link SourceDocument} rows stored in the
+ * {@code extract_source_documents} table.
+ *
+ * <p>Extends {@link BaseMapper} and adds project-scoped lookups, a bulk delete and counters.
+ *
+ * @author lingyue
+ * @since 2026-01-22
+ */
+@Mapper
+public interface SourceDocumentRepository extends BaseMapper<SourceDocument> {
+    
+    /**
+     * Lists the source documents of a project in display order.
+     *
+     * @param projectId value compared with the {@code project_id} column
+     * @return matching rows ordered by {@code display_order} ascending
+     */
+    @Select("SELECT * FROM extract_source_documents WHERE project_id = #{projectId} ORDER BY display_order")
+    List<SourceDocument> findByProjectId(@Param("projectId") String projectId);
+    
+    /**
+     * Lists the source documents that are linked to a given Document.
+     *
+     * @param documentId value compared with the {@code document_id} column
+     * @return matching rows; no ordering is requested from the database
+     */
+    @Select("SELECT * FROM extract_source_documents WHERE document_id = #{documentId}")
+    List<SourceDocument> findByDocumentId(@Param("documentId") String documentId);
+    
+    /**
+     * Looks up a source document of a project by its alias.
+     *
+     * <p>NOTE(review): the single-object return type assumes an alias is unique within a
+     * project - confirm a unique constraint exists.
+     *
+     * @param projectId value compared with the {@code project_id} column
+     * @param alias     value compared with the {@code alias} column (exact match)
+     * @return the matching source document
+     */
+    @Select("SELECT * FROM extract_source_documents WHERE project_id = #{projectId} AND alias = #{alias}")
+    SourceDocument findByProjectIdAndAlias(@Param("projectId") String projectId, @Param("alias") String alias);
+    
+    /**
+     * Deletes every source document that belongs to a project.
+     *
+     * @param projectId value compared with the {@code project_id} column
+     * @return number of rows deleted
+     */
+    @Delete("DELETE FROM extract_source_documents WHERE project_id = #{projectId}")
+    int deleteByProjectId(@Param("projectId") String projectId);
+    
+    /**
+     * Counts the source documents that belong to a project.
+     *
+     * @param projectId value compared with the {@code project_id} column
+     * @return the {@code COUNT(*)} of matching rows
+     */
+    @Select("SELECT COUNT(*) FROM extract_source_documents WHERE project_id = #{projectId}")
+    int countByProjectId(@Param("projectId") String projectId);
+    
+    /**
+     * Returns the highest {@code display_order} used in a project.
+     *
+     * @param projectId value compared with the {@code project_id} column
+     * @return the maximum display order, or {@code 0} when the project has no source documents
+     *         ({@code COALESCE} replaces the {@code NULL} that {@code MAX} yields on no rows)
+     */
+    @Select("SELECT COALESCE(MAX(display_order), 0) FROM extract_source_documents WHERE project_id = #{projectId}")
+    int getMaxDisplayOrder(@Param("projectId") String projectId);
+}