V2026_01_22_02__create_extract_tables.sql 8.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200
  1. -- ============================================
  2. -- 数据提取规则系统核心表
  3. -- extract-service 模块
  4. -- ============================================
  5. -- ============================================
  6. -- 1. 项目表 (projects)
  7. -- ============================================
  8. CREATE TABLE IF NOT EXISTS extract_projects (
  9. id VARCHAR(32) PRIMARY KEY,
  10. user_id VARCHAR(32) NOT NULL,
  11. name VARCHAR(255) NOT NULL,
  12. description TEXT,
  13. status VARCHAR(32) DEFAULT 'draft', -- draft/extracting/completed/archived
  14. config JSONB DEFAULT '{}',
  15. create_by VARCHAR(36),
  16. create_by_name VARCHAR(100),
  17. create_time TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
  18. update_by VARCHAR(36),
  19. update_by_name VARCHAR(100),
  20. update_time TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP
  21. );
  22. CREATE INDEX IF NOT EXISTS idx_extract_projects_user_id ON extract_projects(user_id);
  23. CREATE INDEX IF NOT EXISTS idx_extract_projects_status ON extract_projects(status);
  24. COMMENT ON TABLE extract_projects IS '数据提取项目';
  25. COMMENT ON COLUMN extract_projects.status IS '状态: draft-草稿, extracting-提取中, completed-已完成, archived-已归档';
  26. COMMENT ON COLUMN extract_projects.config IS '项目配置: outputFormat, autoExtract, notifyOnComplete, aiModel';
  27. -- ============================================
  28. -- 2. 来源文档表 (source_documents)
  29. -- ============================================
  30. CREATE TABLE IF NOT EXISTS extract_source_documents (
  31. id VARCHAR(32) PRIMARY KEY,
  32. project_id VARCHAR(32) NOT NULL REFERENCES extract_projects(id) ON DELETE CASCADE,
  33. document_id VARCHAR(36) NOT NULL, -- 关联 documents 表
  34. alias VARCHAR(128) NOT NULL, -- 文档别名,如"可研批复"
  35. doc_type VARCHAR(32) NOT NULL, -- pdf/docx/xlsx
  36. display_order INT DEFAULT 0,
  37. metadata JSONB DEFAULT '{}',
  38. create_time TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
  39. CONSTRAINT uq_source_doc_alias UNIQUE (project_id, alias)
  40. );
  41. CREATE INDEX IF NOT EXISTS idx_source_docs_project_id ON extract_source_documents(project_id);
  42. CREATE INDEX IF NOT EXISTS idx_source_docs_document_id ON extract_source_documents(document_id);
  43. COMMENT ON TABLE extract_source_documents IS '项目来源文档';
  44. COMMENT ON COLUMN extract_source_documents.alias IS '文档别名,如"可研批复"、"可行性研究报告"';
  45. COMMENT ON COLUMN extract_source_documents.metadata IS '元数据: fileName, fileSize, pageCount, parseStatus';
  46. -- ============================================
  47. -- 3. 提取规则表 (extract_rules)
  48. -- ============================================
  49. CREATE TABLE IF NOT EXISTS extract_rules (
  50. id VARCHAR(32) PRIMARY KEY,
  51. project_id VARCHAR(32) NOT NULL REFERENCES extract_projects(id) ON DELETE CASCADE,
  52. source_doc_id VARCHAR(32), -- 可为空,表示引用/固定/手动类型
  53. -- 目标字段
  54. target_field_key VARCHAR(128) NOT NULL, -- 字段Key(程序用)
  55. target_field_name VARCHAR(255) NOT NULL, -- 字段名称(显示用)
  56. target_field_group VARCHAR(128), -- 字段分组
  57. rule_index INT NOT NULL DEFAULT 0, -- 规则顺序
  58. -- 来源配置
  59. source_type VARCHAR(32) NOT NULL, -- document/self_reference/fixed/manual
  60. source_config JSONB NOT NULL, -- 来源配置详情
  61. -- 提取配置
  62. extract_type VARCHAR(32) NOT NULL, -- direct/ai_extract/ai_summarize/ocr
  63. extract_config JSONB, -- 提取配置详情
  64. -- 结果
  65. status VARCHAR(32) DEFAULT 'pending', -- pending/extracting/extracted/confirmed/error
  66. extracted_value TEXT,
  67. value_type VARCHAR(32) DEFAULT 'text', -- text/table/image/list
  68. error_message TEXT,
  69. -- 元数据
  70. metadata JSONB DEFAULT '{}',
  71. create_by VARCHAR(36),
  72. create_by_name VARCHAR(100),
  73. create_time TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
  74. update_by VARCHAR(36),
  75. update_by_name VARCHAR(100),
  76. update_time TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
  77. CONSTRAINT uq_rule_field_key UNIQUE (project_id, target_field_key)
  78. );
  79. CREATE INDEX IF NOT EXISTS idx_extract_rules_project_id ON extract_rules(project_id);
  80. CREATE INDEX IF NOT EXISTS idx_extract_rules_source_doc_id ON extract_rules(source_doc_id);
  81. CREATE INDEX IF NOT EXISTS idx_extract_rules_status ON extract_rules(status);
  82. CREATE INDEX IF NOT EXISTS idx_extract_rules_rule_index ON extract_rules(project_id, rule_index);
  83. COMMENT ON TABLE extract_rules IS '数据提取规则';
  84. COMMENT ON COLUMN extract_rules.source_type IS '来源类型: document-文档, self_reference-引用已提取字段, fixed-固定值, manual-手动输入';
  85. COMMENT ON COLUMN extract_rules.extract_type IS '提取类型: direct-直接提取, ai_extract-AI字段提取, ai_summarize-AI总结, ocr-OCR识别';
  86. COMMENT ON COLUMN extract_rules.source_config IS '来源配置: location, paragraphKeyword, referenceFieldKeys等';
  87. COMMENT ON COLUMN extract_rules.extract_config IS '提取配置: targetDescription, expectedFormat, summarizePrompt等';
  88. -- ============================================
  89. -- 4. 提取结果表 (extract_results)
  90. -- ============================================
  91. CREATE TABLE IF NOT EXISTS extract_results (
  92. id VARCHAR(32) PRIMARY KEY,
  93. rule_id VARCHAR(32) NOT NULL REFERENCES extract_rules(id) ON DELETE CASCADE,
  94. project_id VARCHAR(32) NOT NULL REFERENCES extract_projects(id) ON DELETE CASCADE,
  95. -- 提取结果
  96. extracted_value TEXT NOT NULL,
  97. value_type VARCHAR(32) DEFAULT 'text',
  98. -- 来源追溯
  99. source_content TEXT, -- 来源原文内容
  100. source_location JSONB, -- 来源位置信息
  101. -- 质量评估
  102. confidence DECIMAL(5,4), -- AI提取置信度 0-1
  103. -- 状态
  104. status VARCHAR(32) DEFAULT 'extracted', -- extracted/confirmed/rejected/modified
  105. -- 人工处理
  106. modified_value TEXT, -- 人工修正后的值
  107. confirmed_at TIMESTAMP,
  108. confirmed_by VARCHAR(32),
  109. reject_reason TEXT, -- 拒绝原因
  110. -- 元数据
  111. metadata JSONB DEFAULT '{}',
  112. create_time TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP
  113. );
  114. CREATE INDEX IF NOT EXISTS idx_extract_results_rule_id ON extract_results(rule_id);
  115. CREATE INDEX IF NOT EXISTS idx_extract_results_project_id ON extract_results(project_id);
  116. CREATE INDEX IF NOT EXISTS idx_extract_results_status ON extract_results(status);
  117. COMMENT ON TABLE extract_results IS '提取结果历史';
  118. COMMENT ON COLUMN extract_results.source_location IS '来源位置: documentId, documentAlias, locationType, pageStart, pageEnd, elementIds, chapterPath';
  119. COMMENT ON COLUMN extract_results.status IS '状态: extracted-已提取, confirmed-已确认, rejected-已拒绝, modified-已修正';
  120. -- ============================================
  121. -- 5. 规则模板表 (rule_templates)
  122. -- ============================================
  123. CREATE TABLE IF NOT EXISTS extract_rule_templates (
  124. id VARCHAR(32) PRIMARY KEY,
  125. user_id VARCHAR(32) NOT NULL,
  126. name VARCHAR(255) NOT NULL,
  127. description TEXT,
  128. -- 模板内容
  129. rules_snapshot JSONB NOT NULL, -- 规则配置快照
  130. doc_type_pattern JSONB, -- 适用的文档类型模式
  131. -- 可见性
  132. is_public BOOLEAN DEFAULT FALSE,
  133. -- 统计
  134. use_count INT DEFAULT 0,
  135. -- 元数据
  136. metadata JSONB DEFAULT '{}',
  137. create_by VARCHAR(36),
  138. create_by_name VARCHAR(100),
  139. create_time TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
  140. update_by VARCHAR(36),
  141. update_by_name VARCHAR(100),
  142. update_time TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP
  143. );
  144. CREATE INDEX IF NOT EXISTS idx_rule_templates_user_id ON extract_rule_templates(user_id);
  145. CREATE INDEX IF NOT EXISTS idx_rule_templates_is_public ON extract_rule_templates(is_public);
  146. COMMENT ON TABLE extract_rule_templates IS '规则模板';
  147. COMMENT ON COLUMN extract_rule_templates.rules_snapshot IS '规则配置快照(不含项目特定信息)';
  148. COMMENT ON COLUMN extract_rule_templates.doc_type_pattern IS '适用的文档类型模式,用于自动匹配';
  149. -- ============================================
  150. -- 更新时间触发器
  151. -- ============================================
  152. DROP TRIGGER IF EXISTS update_extract_projects_update_time ON extract_projects;
  153. CREATE TRIGGER update_extract_projects_update_time BEFORE UPDATE ON extract_projects
  154. FOR EACH ROW EXECUTE FUNCTION update_update_time_column();
  155. DROP TRIGGER IF EXISTS update_extract_rules_update_time ON extract_rules;
  156. CREATE TRIGGER update_extract_rules_update_time BEFORE UPDATE ON extract_rules
  157. FOR EACH ROW EXECUTE FUNCTION update_update_time_column();
  158. DROP TRIGGER IF EXISTS update_rule_templates_update_time ON extract_rule_templates;
  159. CREATE TRIGGER update_rule_templates_update_time BEFORE UPDATE ON extract_rule_templates
  160. FOR EACH ROW EXECUTE FUNCTION update_update_time_column();
  161. -- ============================================
  162. -- 显示创建结果
  163. -- ============================================
  164. SELECT 'Extract Service 数据表创建成功' AS result;