|
|
@@ -0,0 +1,200 @@
|
|
|
+-- ============================================
|
|
|
+-- 数据提取规则系统核心表
|
|
|
+-- extract-service 模块
|
|
|
+-- ============================================
|
|
|
+
|
|
|
+-- ============================================
|
|
|
+-- 1. 项目表 (projects)
|
|
|
+-- ============================================
|
|
|
+CREATE TABLE IF NOT EXISTS extract_projects (
|
|
|
+ id VARCHAR(32) PRIMARY KEY,
|
|
|
+ user_id VARCHAR(32) NOT NULL,
|
|
|
+ name VARCHAR(255) NOT NULL,
|
|
|
+ description TEXT,
|
|
|
+ status VARCHAR(32) DEFAULT 'draft', -- draft/extracting/completed/archived
|
|
|
+ config JSONB DEFAULT '{}',
|
|
|
+ create_by VARCHAR(36),
|
|
|
+ create_by_name VARCHAR(100),
|
|
|
+ create_time TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
|
|
|
+ update_by VARCHAR(36),
|
|
|
+ update_by_name VARCHAR(100),
|
|
|
+ update_time TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP
|
|
|
+);
|
|
|
+
|
|
|
+CREATE INDEX IF NOT EXISTS idx_extract_projects_user_id ON extract_projects(user_id);
|
|
|
+CREATE INDEX IF NOT EXISTS idx_extract_projects_status ON extract_projects(status);
|
|
|
+
|
|
|
+COMMENT ON TABLE extract_projects IS '数据提取项目';
|
|
|
+COMMENT ON COLUMN extract_projects.status IS '状态: draft-草稿, extracting-提取中, completed-已完成, archived-已归档';
|
|
|
+COMMENT ON COLUMN extract_projects.config IS '项目配置: outputFormat, autoExtract, notifyOnComplete, aiModel';
|
|
|
+
|
|
|
+-- ============================================
|
|
|
+-- 2. 来源文档表 (source_documents)
|
|
|
+-- ============================================
|
|
|
+CREATE TABLE IF NOT EXISTS extract_source_documents (
|
|
|
+ id VARCHAR(32) PRIMARY KEY,
|
|
|
+ project_id VARCHAR(32) NOT NULL REFERENCES extract_projects(id) ON DELETE CASCADE,
|
|
|
+ document_id VARCHAR(36) NOT NULL, -- 关联 documents 表
|
|
|
+ alias VARCHAR(128) NOT NULL, -- 文档别名,如"可研批复"
|
|
|
+ doc_type VARCHAR(32) NOT NULL, -- pdf/docx/xlsx
|
|
|
+ display_order INT DEFAULT 0,
|
|
|
+ metadata JSONB DEFAULT '{}',
|
|
|
+ create_time TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
|
|
|
+
|
|
|
+ CONSTRAINT uq_source_doc_alias UNIQUE (project_id, alias)
|
|
|
+);
|
|
|
+
|
|
|
+CREATE INDEX IF NOT EXISTS idx_source_docs_project_id ON extract_source_documents(project_id);
|
|
|
+CREATE INDEX IF NOT EXISTS idx_source_docs_document_id ON extract_source_documents(document_id);
|
|
|
+
|
|
|
+COMMENT ON TABLE extract_source_documents IS '项目来源文档';
|
|
|
+COMMENT ON COLUMN extract_source_documents.alias IS '文档别名,如"可研批复"、"可行性研究报告"';
|
|
|
+COMMENT ON COLUMN extract_source_documents.metadata IS '元数据: fileName, fileSize, pageCount, parseStatus';
|
|
|
+
|
|
|
+-- ============================================
|
|
|
+-- 3. 提取规则表 (extract_rules)
|
|
|
+-- ============================================
|
|
|
+CREATE TABLE IF NOT EXISTS extract_rules (
|
|
|
+ id VARCHAR(32) PRIMARY KEY,
|
|
|
+ project_id VARCHAR(32) NOT NULL REFERENCES extract_projects(id) ON DELETE CASCADE,
|
|
|
+ source_doc_id VARCHAR(32), -- 可为空,表示引用/固定/手动类型
|
|
|
+
|
|
|
+ -- 目标字段
|
|
|
+ target_field_key VARCHAR(128) NOT NULL, -- 字段Key(程序用)
|
|
|
+ target_field_name VARCHAR(255) NOT NULL, -- 字段名称(显示用)
|
|
|
+ target_field_group VARCHAR(128), -- 字段分组
|
|
|
+ rule_index INT NOT NULL DEFAULT 0, -- 规则顺序
|
|
|
+
|
|
|
+ -- 来源配置
|
|
|
+ source_type VARCHAR(32) NOT NULL, -- document/self_reference/fixed/manual
|
|
|
+ source_config JSONB NOT NULL, -- 来源配置详情
|
|
|
+
|
|
|
+ -- 提取配置
|
|
|
+ extract_type VARCHAR(32) NOT NULL, -- direct/ai_extract/ai_summarize/ocr
|
|
|
+ extract_config JSONB, -- 提取配置详情
|
|
|
+
|
|
|
+ -- 结果
|
|
|
+ status VARCHAR(32) DEFAULT 'pending', -- pending/extracting/extracted/confirmed/error
|
|
|
+ extracted_value TEXT,
|
|
|
+ value_type VARCHAR(32) DEFAULT 'text', -- text/table/image/list
|
|
|
+ error_message TEXT,
|
|
|
+
|
|
|
+ -- 元数据
|
|
|
+ metadata JSONB DEFAULT '{}',
|
|
|
+ create_by VARCHAR(36),
|
|
|
+ create_by_name VARCHAR(100),
|
|
|
+ create_time TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
|
|
|
+ update_by VARCHAR(36),
|
|
|
+ update_by_name VARCHAR(100),
|
|
|
+ update_time TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
|
|
|
+
|
|
|
+ CONSTRAINT uq_rule_field_key UNIQUE (project_id, target_field_key)
|
|
|
+);
|
|
|
+
|
|
|
+CREATE INDEX IF NOT EXISTS idx_extract_rules_project_id ON extract_rules(project_id);
|
|
|
+CREATE INDEX IF NOT EXISTS idx_extract_rules_source_doc_id ON extract_rules(source_doc_id);
|
|
|
+CREATE INDEX IF NOT EXISTS idx_extract_rules_status ON extract_rules(status);
|
|
|
+CREATE INDEX IF NOT EXISTS idx_extract_rules_rule_index ON extract_rules(project_id, rule_index);
|
|
|
+
|
|
|
+COMMENT ON TABLE extract_rules IS '数据提取规则';
|
|
|
+COMMENT ON COLUMN extract_rules.source_type IS '来源类型: document-文档, self_reference-引用已提取字段, fixed-固定值, manual-手动输入';
|
|
|
+COMMENT ON COLUMN extract_rules.extract_type IS '提取类型: direct-直接提取, ai_extract-AI字段提取, ai_summarize-AI总结, ocr-OCR识别';
|
|
|
+COMMENT ON COLUMN extract_rules.source_config IS '来源配置: location, paragraphKeyword, referenceFieldKeys等';
|
|
|
+COMMENT ON COLUMN extract_rules.extract_config IS '提取配置: targetDescription, expectedFormat, summarizePrompt等';
|
|
|
+
|
|
|
+-- ============================================
|
|
|
+-- 4. 提取结果表 (extract_results)
|
|
|
+-- ============================================
|
|
|
+CREATE TABLE IF NOT EXISTS extract_results (
|
|
|
+ id VARCHAR(32) PRIMARY KEY,
|
|
|
+ rule_id VARCHAR(32) NOT NULL REFERENCES extract_rules(id) ON DELETE CASCADE,
|
|
|
+ project_id VARCHAR(32) NOT NULL REFERENCES extract_projects(id) ON DELETE CASCADE,
|
|
|
+
|
|
|
+ -- 提取结果
|
|
|
+ extracted_value TEXT NOT NULL,
|
|
|
+ value_type VARCHAR(32) DEFAULT 'text',
|
|
|
+
|
|
|
+ -- 来源追溯
|
|
|
+ source_content TEXT, -- 来源原文内容
|
|
|
+ source_location JSONB, -- 来源位置信息
|
|
|
+
|
|
|
+ -- 质量评估
|
|
|
+ confidence DECIMAL(5,4), -- AI提取置信度 0-1
|
|
|
+
|
|
|
+ -- 状态
|
|
|
+ status VARCHAR(32) DEFAULT 'extracted', -- extracted/confirmed/rejected/modified
|
|
|
+
|
|
|
+ -- 人工处理
|
|
|
+ modified_value TEXT, -- 人工修正后的值
|
|
|
+ confirmed_at TIMESTAMP,
|
|
|
+ confirmed_by VARCHAR(32),
|
|
|
+ reject_reason TEXT, -- 拒绝原因
|
|
|
+
|
|
|
+ -- 元数据
|
|
|
+ metadata JSONB DEFAULT '{}',
|
|
|
+ create_time TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP
|
|
|
+);
|
|
|
+
|
|
|
+CREATE INDEX IF NOT EXISTS idx_extract_results_rule_id ON extract_results(rule_id);
|
|
|
+CREATE INDEX IF NOT EXISTS idx_extract_results_project_id ON extract_results(project_id);
|
|
|
+CREATE INDEX IF NOT EXISTS idx_extract_results_status ON extract_results(status);
|
|
|
+
|
|
|
+COMMENT ON TABLE extract_results IS '提取结果历史';
|
|
|
+COMMENT ON COLUMN extract_results.source_location IS '来源位置: documentId, documentAlias, locationType, pageStart, pageEnd, elementIds, chapterPath';
|
|
|
+COMMENT ON COLUMN extract_results.status IS '状态: extracted-已提取, confirmed-已确认, rejected-已拒绝, modified-已修正';
|
|
|
+
|
|
|
+-- ============================================
|
|
|
+-- 5. 规则模板表 (rule_templates)
|
|
|
+-- ============================================
|
|
|
+CREATE TABLE IF NOT EXISTS extract_rule_templates (
|
|
|
+ id VARCHAR(32) PRIMARY KEY,
|
|
|
+ user_id VARCHAR(32) NOT NULL,
|
|
|
+ name VARCHAR(255) NOT NULL,
|
|
|
+ description TEXT,
|
|
|
+
|
|
|
+ -- 模板内容
|
|
|
+ rules_snapshot JSONB NOT NULL, -- 规则配置快照
|
|
|
+ doc_type_pattern JSONB, -- 适用的文档类型模式
|
|
|
+
|
|
|
+ -- 可见性
|
|
|
+ is_public BOOLEAN DEFAULT FALSE,
|
|
|
+
|
|
|
+ -- 统计
|
|
|
+ use_count INT DEFAULT 0,
|
|
|
+
|
|
|
+ -- 元数据
|
|
|
+ metadata JSONB DEFAULT '{}',
|
|
|
+ create_by VARCHAR(36),
|
|
|
+ create_by_name VARCHAR(100),
|
|
|
+ create_time TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
|
|
|
+ update_by VARCHAR(36),
|
|
|
+ update_by_name VARCHAR(100),
|
|
|
+ update_time TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP
|
|
|
+);
|
|
|
+
|
|
|
+CREATE INDEX IF NOT EXISTS idx_rule_templates_user_id ON extract_rule_templates(user_id);
|
|
|
+CREATE INDEX IF NOT EXISTS idx_rule_templates_is_public ON extract_rule_templates(is_public);
|
|
|
+
|
|
|
+COMMENT ON TABLE extract_rule_templates IS '规则模板';
|
|
|
+COMMENT ON COLUMN extract_rule_templates.rules_snapshot IS '规则配置快照(不含项目特定信息)';
|
|
|
+COMMENT ON COLUMN extract_rule_templates.doc_type_pattern IS '适用的文档类型模式,用于自动匹配';
|
|
|
+
|
|
|
+-- ============================================
|
|
|
+-- 更新时间触发器
|
|
|
+-- ============================================
|
|
|
+DROP TRIGGER IF EXISTS update_extract_projects_update_time ON extract_projects;
|
|
|
+CREATE TRIGGER update_extract_projects_update_time BEFORE UPDATE ON extract_projects
|
|
|
+ FOR EACH ROW EXECUTE FUNCTION update_update_time_column();
|
|
|
+
|
|
|
+DROP TRIGGER IF EXISTS update_extract_rules_update_time ON extract_rules;
|
|
|
+CREATE TRIGGER update_extract_rules_update_time BEFORE UPDATE ON extract_rules
|
|
|
+ FOR EACH ROW EXECUTE FUNCTION update_update_time_column();
|
|
|
+
|
|
|
+DROP TRIGGER IF EXISTS update_rule_templates_update_time ON extract_rule_templates;
|
|
|
+CREATE TRIGGER update_rule_templates_update_time BEFORE UPDATE ON extract_rule_templates
|
|
|
+ FOR EACH ROW EXECUTE FUNCTION update_update_time_column();
|
|
|
+
|
|
|
+-- ============================================
|
|
|
+-- 显示创建结果
|
|
|
+-- ============================================
|
|
|
+SELECT 'Extract Service 数据表创建成功' AS result;
|