-- ============================================ -- 数据提取规则系统核心表 -- extract-service 模块 -- ============================================ -- ============================================ -- 1. 项目表 (projects) -- ============================================ CREATE TABLE IF NOT EXISTS extract_projects ( id VARCHAR(32) PRIMARY KEY, user_id VARCHAR(32) NOT NULL, name VARCHAR(255) NOT NULL, description TEXT, status VARCHAR(32) DEFAULT 'draft', -- draft/extracting/completed/archived config JSONB DEFAULT '{}', create_by VARCHAR(36), create_by_name VARCHAR(100), create_time TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP, update_by VARCHAR(36), update_by_name VARCHAR(100), update_time TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP ); CREATE INDEX IF NOT EXISTS idx_extract_projects_user_id ON extract_projects(user_id); CREATE INDEX IF NOT EXISTS idx_extract_projects_status ON extract_projects(status); COMMENT ON TABLE extract_projects IS '数据提取项目'; COMMENT ON COLUMN extract_projects.status IS '状态: draft-草稿, extracting-提取中, completed-已完成, archived-已归档'; COMMENT ON COLUMN extract_projects.config IS '项目配置: outputFormat, autoExtract, notifyOnComplete, aiModel'; -- ============================================ -- 2. 来源文档表 (source_documents) -- ============================================ CREATE TABLE IF NOT EXISTS extract_source_documents ( id VARCHAR(32) PRIMARY KEY, project_id VARCHAR(32) NOT NULL REFERENCES extract_projects(id) ON DELETE CASCADE, document_id VARCHAR(36) NOT NULL, -- 关联 documents 表 alias VARCHAR(128) NOT NULL, -- 文档别名,如"可研批复" doc_type VARCHAR(32) NOT NULL, -- pdf/docx/xlsx display_order INT DEFAULT 0, metadata JSONB DEFAULT '{}', create_time TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP, CONSTRAINT uq_source_doc_alias UNIQUE (project_id, alias) ); CREATE INDEX IF NOT EXISTS idx_source_docs_project_id ON extract_source_documents(project_id); CREATE INDEX IF NOT EXISTS idx_source_docs_document_id ON extract_source_documents(document_id); COMMENT ON TABLE extract_source_documents IS '项目来源文档'; COMMENT ON COLUMN extract_source_documents.alias IS '文档别名,如"可研批复"、"可行性研究报告"'; COMMENT ON COLUMN extract_source_documents.metadata IS '元数据: fileName, fileSize, pageCount, parseStatus'; -- ============================================ -- 3. 提取规则表 (extract_rules) -- ============================================ CREATE TABLE IF NOT EXISTS extract_rules ( id VARCHAR(32) PRIMARY KEY, project_id VARCHAR(32) NOT NULL REFERENCES extract_projects(id) ON DELETE CASCADE, source_doc_id VARCHAR(32), -- 可为空,表示引用/固定/手动类型 -- 目标字段 target_field_key VARCHAR(128) NOT NULL, -- 字段Key(程序用) target_field_name VARCHAR(255) NOT NULL, -- 字段名称(显示用) target_field_group VARCHAR(128), -- 字段分组 rule_index INT NOT NULL DEFAULT 0, -- 规则顺序 -- 来源配置 source_type VARCHAR(32) NOT NULL, -- document/self_reference/fixed/manual source_config JSONB NOT NULL, -- 来源配置详情 -- 提取配置 extract_type VARCHAR(32) NOT NULL, -- direct/ai_extract/ai_summarize/ocr extract_config JSONB, -- 提取配置详情 -- 结果 status VARCHAR(32) DEFAULT 'pending', -- pending/extracting/extracted/confirmed/error extracted_value TEXT, value_type VARCHAR(32) DEFAULT 'text', -- text/table/image/list error_message TEXT, -- 元数据 metadata JSONB DEFAULT '{}', create_by VARCHAR(36), create_by_name VARCHAR(100), create_time TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP, update_by VARCHAR(36), update_by_name VARCHAR(100), update_time TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP, CONSTRAINT uq_rule_field_key UNIQUE (project_id, target_field_key) ); CREATE INDEX IF NOT EXISTS idx_extract_rules_project_id ON extract_rules(project_id); CREATE INDEX IF NOT EXISTS idx_extract_rules_source_doc_id ON extract_rules(source_doc_id); CREATE INDEX IF NOT EXISTS idx_extract_rules_status ON extract_rules(status); CREATE INDEX IF NOT EXISTS idx_extract_rules_rule_index ON extract_rules(project_id, rule_index); COMMENT ON TABLE extract_rules IS '数据提取规则'; COMMENT ON COLUMN extract_rules.source_type IS '来源类型: document-文档, self_reference-引用已提取字段, fixed-固定值, manual-手动输入'; COMMENT ON COLUMN extract_rules.extract_type IS '提取类型: direct-直接提取, ai_extract-AI字段提取, ai_summarize-AI总结, ocr-OCR识别'; COMMENT ON COLUMN extract_rules.source_config IS '来源配置: location, paragraphKeyword, referenceFieldKeys等'; COMMENT ON COLUMN extract_rules.extract_config IS '提取配置: targetDescription, expectedFormat, summarizePrompt等'; -- ============================================ -- 4. 提取结果表 (extract_results) -- ============================================ CREATE TABLE IF NOT EXISTS extract_results ( id VARCHAR(32) PRIMARY KEY, rule_id VARCHAR(32) NOT NULL REFERENCES extract_rules(id) ON DELETE CASCADE, project_id VARCHAR(32) NOT NULL REFERENCES extract_projects(id) ON DELETE CASCADE, -- 提取结果 extracted_value TEXT NOT NULL, value_type VARCHAR(32) DEFAULT 'text', -- 来源追溯 source_content TEXT, -- 来源原文内容 source_location JSONB, -- 来源位置信息 -- 质量评估 confidence DECIMAL(5,4), -- AI提取置信度 0-1 -- 状态 status VARCHAR(32) DEFAULT 'extracted', -- extracted/confirmed/rejected/modified -- 人工处理 modified_value TEXT, -- 人工修正后的值 confirmed_at TIMESTAMP, confirmed_by VARCHAR(32), reject_reason TEXT, -- 拒绝原因 -- 元数据 metadata JSONB DEFAULT '{}', create_time TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP ); CREATE INDEX IF NOT EXISTS idx_extract_results_rule_id ON extract_results(rule_id); CREATE INDEX IF NOT EXISTS idx_extract_results_project_id ON extract_results(project_id); CREATE INDEX IF NOT EXISTS idx_extract_results_status ON extract_results(status); COMMENT ON TABLE extract_results IS '提取结果历史'; COMMENT ON COLUMN extract_results.source_location IS '来源位置: documentId, documentAlias, locationType, pageStart, pageEnd, elementIds, chapterPath'; COMMENT ON COLUMN extract_results.status IS '状态: extracted-已提取, confirmed-已确认, rejected-已拒绝, modified-已修正'; -- ============================================ -- 5. 规则模板表 (rule_templates) -- ============================================ CREATE TABLE IF NOT EXISTS extract_rule_templates ( id VARCHAR(32) PRIMARY KEY, user_id VARCHAR(32) NOT NULL, name VARCHAR(255) NOT NULL, description TEXT, -- 模板内容 rules_snapshot JSONB NOT NULL, -- 规则配置快照 doc_type_pattern JSONB, -- 适用的文档类型模式 -- 可见性 is_public BOOLEAN DEFAULT FALSE, -- 统计 use_count INT DEFAULT 0, -- 元数据 metadata JSONB DEFAULT '{}', create_by VARCHAR(36), create_by_name VARCHAR(100), create_time TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP, update_by VARCHAR(36), update_by_name VARCHAR(100), update_time TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP ); CREATE INDEX IF NOT EXISTS idx_rule_templates_user_id ON extract_rule_templates(user_id); CREATE INDEX IF NOT EXISTS idx_rule_templates_is_public ON extract_rule_templates(is_public); COMMENT ON TABLE extract_rule_templates IS '规则模板'; COMMENT ON COLUMN extract_rule_templates.rules_snapshot IS '规则配置快照(不含项目特定信息)'; COMMENT ON COLUMN extract_rule_templates.doc_type_pattern IS '适用的文档类型模式,用于自动匹配'; -- ============================================ -- 更新时间触发器 -- ============================================ DROP TRIGGER IF EXISTS update_extract_projects_update_time ON extract_projects; CREATE TRIGGER update_extract_projects_update_time BEFORE UPDATE ON extract_projects FOR EACH ROW EXECUTE FUNCTION update_update_time_column(); DROP TRIGGER IF EXISTS update_extract_rules_update_time ON extract_rules; CREATE TRIGGER update_extract_rules_update_time BEFORE UPDATE ON extract_rules FOR EACH ROW EXECUTE FUNCTION update_update_time_column(); DROP TRIGGER IF EXISTS update_rule_templates_update_time ON extract_rule_templates; CREATE TRIGGER update_rule_templates_update_time BEFORE UPDATE ON extract_rule_templates FOR EACH ROW EXECUTE FUNCTION update_update_time_column(); -- ============================================ -- 显示创建结果 -- ============================================ SELECT 'Extract Service 数据表创建成功' AS result;