| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200 |
- -- ============================================
- -- 数据提取规则系统核心表
- -- extract-service 模块
- -- ============================================
- -- ============================================
- -- 1. 项目表 (projects)
- -- ============================================
- CREATE TABLE IF NOT EXISTS extract_projects (
- id VARCHAR(32) PRIMARY KEY,
- user_id VARCHAR(32) NOT NULL,
- name VARCHAR(255) NOT NULL,
- description TEXT,
- status VARCHAR(32) DEFAULT 'draft', -- draft/extracting/completed/archived
- config JSONB DEFAULT '{}',
- create_by VARCHAR(36),
- create_by_name VARCHAR(100),
- create_time TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
- update_by VARCHAR(36),
- update_by_name VARCHAR(100),
- update_time TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP
- );
- CREATE INDEX IF NOT EXISTS idx_extract_projects_user_id ON extract_projects(user_id);
- CREATE INDEX IF NOT EXISTS idx_extract_projects_status ON extract_projects(status);
- COMMENT ON TABLE extract_projects IS '数据提取项目';
- COMMENT ON COLUMN extract_projects.status IS '状态: draft-草稿, extracting-提取中, completed-已完成, archived-已归档';
- COMMENT ON COLUMN extract_projects.config IS '项目配置: outputFormat, autoExtract, notifyOnComplete, aiModel';
- -- ============================================
- -- 2. 来源文档表 (source_documents)
- -- ============================================
- CREATE TABLE IF NOT EXISTS extract_source_documents (
- id VARCHAR(32) PRIMARY KEY,
- project_id VARCHAR(32) NOT NULL REFERENCES extract_projects(id) ON DELETE CASCADE,
- document_id VARCHAR(36) NOT NULL, -- 关联 documents 表
- alias VARCHAR(128) NOT NULL, -- 文档别名,如"可研批复"
- doc_type VARCHAR(32) NOT NULL, -- pdf/docx/xlsx
- display_order INT DEFAULT 0,
- metadata JSONB DEFAULT '{}',
- create_time TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
-
- CONSTRAINT uq_source_doc_alias UNIQUE (project_id, alias)
- );
- CREATE INDEX IF NOT EXISTS idx_source_docs_project_id ON extract_source_documents(project_id);
- CREATE INDEX IF NOT EXISTS idx_source_docs_document_id ON extract_source_documents(document_id);
- COMMENT ON TABLE extract_source_documents IS '项目来源文档';
- COMMENT ON COLUMN extract_source_documents.alias IS '文档别名,如"可研批复"、"可行性研究报告"';
- COMMENT ON COLUMN extract_source_documents.metadata IS '元数据: fileName, fileSize, pageCount, parseStatus';
- -- ============================================
- -- 3. 提取规则表 (extract_rules)
- -- ============================================
- CREATE TABLE IF NOT EXISTS extract_rules (
- id VARCHAR(32) PRIMARY KEY,
- project_id VARCHAR(32) NOT NULL REFERENCES extract_projects(id) ON DELETE CASCADE,
- source_doc_id VARCHAR(32), -- 可为空,表示引用/固定/手动类型
-
- -- 目标字段
- target_field_key VARCHAR(128) NOT NULL, -- 字段Key(程序用)
- target_field_name VARCHAR(255) NOT NULL, -- 字段名称(显示用)
- target_field_group VARCHAR(128), -- 字段分组
- rule_index INT NOT NULL DEFAULT 0, -- 规则顺序
-
- -- 来源配置
- source_type VARCHAR(32) NOT NULL, -- document/self_reference/fixed/manual
- source_config JSONB NOT NULL, -- 来源配置详情
-
- -- 提取配置
- extract_type VARCHAR(32) NOT NULL, -- direct/ai_extract/ai_summarize/ocr
- extract_config JSONB, -- 提取配置详情
-
- -- 结果
- status VARCHAR(32) DEFAULT 'pending', -- pending/extracting/extracted/confirmed/error
- extracted_value TEXT,
- value_type VARCHAR(32) DEFAULT 'text', -- text/table/image/list
- error_message TEXT,
-
- -- 元数据
- metadata JSONB DEFAULT '{}',
- create_by VARCHAR(36),
- create_by_name VARCHAR(100),
- create_time TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
- update_by VARCHAR(36),
- update_by_name VARCHAR(100),
- update_time TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
-
- CONSTRAINT uq_rule_field_key UNIQUE (project_id, target_field_key)
- );
- CREATE INDEX IF NOT EXISTS idx_extract_rules_project_id ON extract_rules(project_id);
- CREATE INDEX IF NOT EXISTS idx_extract_rules_source_doc_id ON extract_rules(source_doc_id);
- CREATE INDEX IF NOT EXISTS idx_extract_rules_status ON extract_rules(status);
- CREATE INDEX IF NOT EXISTS idx_extract_rules_rule_index ON extract_rules(project_id, rule_index);
- COMMENT ON TABLE extract_rules IS '数据提取规则';
- COMMENT ON COLUMN extract_rules.source_type IS '来源类型: document-文档, self_reference-引用已提取字段, fixed-固定值, manual-手动输入';
- COMMENT ON COLUMN extract_rules.extract_type IS '提取类型: direct-直接提取, ai_extract-AI字段提取, ai_summarize-AI总结, ocr-OCR识别';
- COMMENT ON COLUMN extract_rules.source_config IS '来源配置: location, paragraphKeyword, referenceFieldKeys等';
- COMMENT ON COLUMN extract_rules.extract_config IS '提取配置: targetDescription, expectedFormat, summarizePrompt等';
- -- ============================================
- -- 4. 提取结果表 (extract_results)
- -- ============================================
- CREATE TABLE IF NOT EXISTS extract_results (
- id VARCHAR(32) PRIMARY KEY,
- rule_id VARCHAR(32) NOT NULL REFERENCES extract_rules(id) ON DELETE CASCADE,
- project_id VARCHAR(32) NOT NULL REFERENCES extract_projects(id) ON DELETE CASCADE,
-
- -- 提取结果
- extracted_value TEXT NOT NULL,
- value_type VARCHAR(32) DEFAULT 'text',
-
- -- 来源追溯
- source_content TEXT, -- 来源原文内容
- source_location JSONB, -- 来源位置信息
-
- -- 质量评估
- confidence DECIMAL(5,4), -- AI提取置信度 0-1
-
- -- 状态
- status VARCHAR(32) DEFAULT 'extracted', -- extracted/confirmed/rejected/modified
-
- -- 人工处理
- modified_value TEXT, -- 人工修正后的值
- confirmed_at TIMESTAMP,
- confirmed_by VARCHAR(32),
- reject_reason TEXT, -- 拒绝原因
-
- -- 元数据
- metadata JSONB DEFAULT '{}',
- create_time TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP
- );
- CREATE INDEX IF NOT EXISTS idx_extract_results_rule_id ON extract_results(rule_id);
- CREATE INDEX IF NOT EXISTS idx_extract_results_project_id ON extract_results(project_id);
- CREATE INDEX IF NOT EXISTS idx_extract_results_status ON extract_results(status);
- COMMENT ON TABLE extract_results IS '提取结果历史';
- COMMENT ON COLUMN extract_results.source_location IS '来源位置: documentId, documentAlias, locationType, pageStart, pageEnd, elementIds, chapterPath';
- COMMENT ON COLUMN extract_results.status IS '状态: extracted-已提取, confirmed-已确认, rejected-已拒绝, modified-已修正';
- -- ============================================
- -- 5. 规则模板表 (rule_templates)
- -- ============================================
- CREATE TABLE IF NOT EXISTS extract_rule_templates (
- id VARCHAR(32) PRIMARY KEY,
- user_id VARCHAR(32) NOT NULL,
- name VARCHAR(255) NOT NULL,
- description TEXT,
-
- -- 模板内容
- rules_snapshot JSONB NOT NULL, -- 规则配置快照
- doc_type_pattern JSONB, -- 适用的文档类型模式
-
- -- 可见性
- is_public BOOLEAN DEFAULT FALSE,
-
- -- 统计
- use_count INT DEFAULT 0,
-
- -- 元数据
- metadata JSONB DEFAULT '{}',
- create_by VARCHAR(36),
- create_by_name VARCHAR(100),
- create_time TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
- update_by VARCHAR(36),
- update_by_name VARCHAR(100),
- update_time TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP
- );
- CREATE INDEX IF NOT EXISTS idx_rule_templates_user_id ON extract_rule_templates(user_id);
- CREATE INDEX IF NOT EXISTS idx_rule_templates_is_public ON extract_rule_templates(is_public);
- COMMENT ON TABLE extract_rule_templates IS '规则模板';
- COMMENT ON COLUMN extract_rule_templates.rules_snapshot IS '规则配置快照(不含项目特定信息)';
- COMMENT ON COLUMN extract_rule_templates.doc_type_pattern IS '适用的文档类型模式,用于自动匹配';
- -- ============================================
- -- 更新时间触发器
- -- ============================================
- DROP TRIGGER IF EXISTS update_extract_projects_update_time ON extract_projects;
- CREATE TRIGGER update_extract_projects_update_time BEFORE UPDATE ON extract_projects
- FOR EACH ROW EXECUTE FUNCTION update_update_time_column();
- DROP TRIGGER IF EXISTS update_extract_rules_update_time ON extract_rules;
- CREATE TRIGGER update_extract_rules_update_time BEFORE UPDATE ON extract_rules
- FOR EACH ROW EXECUTE FUNCTION update_update_time_column();
- DROP TRIGGER IF EXISTS update_rule_templates_update_time ON extract_rule_templates;
- CREATE TRIGGER update_rule_templates_update_time BEFORE UPDATE ON extract_rule_templates
- FOR EACH ROW EXECUTE FUNCTION update_update_time_column();
- -- ============================================
- -- 显示创建结果
- -- ============================================
- SELECT 'Extract Service 数据表创建成功' AS result;
|