rag_tables.sql 4.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127
  1. -- ============================================
  2. -- RAG 向量化存储相关表
  3. -- 灵越智报 v2.0
  4. -- ============================================
  5. -- 启用 pgvector 扩展(需要先安装:apt install postgresql-15-pgvector)
  6. CREATE EXTENSION IF NOT EXISTS vector;
  7. -- ============================================
  8. -- 1. 文本分块表 (text_chunks)
  9. -- ============================================
  10. CREATE TABLE IF NOT EXISTS text_chunks (
  11. id VARCHAR(32) PRIMARY KEY,
  12. document_id VARCHAR(32) NOT NULL,
  13. text_storage_id VARCHAR(32),
  14. chunk_index INTEGER NOT NULL,
  15. content TEXT NOT NULL,
  16. token_count INTEGER,
  17. metadata JSONB DEFAULT '{}',
  18. created_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
  19. updated_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP
  20. );
  21. -- 索引
  22. CREATE INDEX IF NOT EXISTS idx_text_chunks_document_id ON text_chunks(document_id);
  23. CREATE INDEX IF NOT EXISTS idx_text_chunks_text_storage_id ON text_chunks(text_storage_id);
  24. CREATE INDEX IF NOT EXISTS idx_text_chunks_chunk_index ON text_chunks(document_id, chunk_index);
  25. -- 更新时间触发器
  26. CREATE TRIGGER update_text_chunks_updated_at BEFORE UPDATE ON text_chunks
  27. FOR EACH ROW EXECUTE FUNCTION update_updated_at_column();
  28. -- ============================================
  29. -- 2. 向量嵌入表 (vector_embeddings)
  30. -- ============================================
  31. CREATE TABLE IF NOT EXISTS vector_embeddings (
  32. id VARCHAR(32) PRIMARY KEY,
  33. chunk_id VARCHAR(32) NOT NULL REFERENCES text_chunks(id) ON DELETE CASCADE,
  34. embedding vector(768), -- nomic-embed-text 维度为 768
  35. model_name VARCHAR(100) DEFAULT 'nomic-embed-text',
  36. created_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP
  37. );
  38. -- 普通索引
  39. CREATE INDEX IF NOT EXISTS idx_vector_embeddings_chunk_id ON vector_embeddings(chunk_id);
  40. CREATE INDEX IF NOT EXISTS idx_vector_embeddings_model ON vector_embeddings(model_name);
  41. -- HNSW 向量索引(用于高效相似度检索)
  42. -- 使用余弦距离操作符
  43. CREATE INDEX IF NOT EXISTS idx_vector_embeddings_hnsw ON vector_embeddings
  44. USING hnsw (embedding vector_cosine_ops);
  45. -- ============================================
  46. -- 3. 辅助函数:向量相似度检索
  47. -- ============================================
  48. -- 按文档ID检索相似文本块
  49. CREATE OR REPLACE FUNCTION search_similar_chunks(
  50. query_embedding vector(768),
  51. target_document_id VARCHAR(32),
  52. result_limit INTEGER DEFAULT 3
  53. )
  54. RETURNS TABLE (
  55. chunk_id VARCHAR(32),
  56. document_id VARCHAR(32),
  57. content TEXT,
  58. chunk_index INTEGER,
  59. similarity FLOAT
  60. ) AS $$
  61. BEGIN
  62. RETURN QUERY
  63. SELECT
  64. tc.id AS chunk_id,
  65. tc.document_id,
  66. tc.content,
  67. tc.chunk_index,
  68. 1 - (ve.embedding <=> query_embedding) AS similarity
  69. FROM text_chunks tc
  70. JOIN vector_embeddings ve ON tc.id = ve.chunk_id
  71. WHERE tc.document_id = target_document_id
  72. ORDER BY ve.embedding <=> query_embedding
  73. LIMIT result_limit;
  74. END;
  75. $$ LANGUAGE plpgsql;
  76. -- 全局检索相似文本块(不限制文档)
  77. CREATE OR REPLACE FUNCTION search_similar_chunks_global(
  78. query_embedding vector(768),
  79. result_limit INTEGER DEFAULT 5
  80. )
  81. RETURNS TABLE (
  82. chunk_id VARCHAR(32),
  83. document_id VARCHAR(32),
  84. content TEXT,
  85. chunk_index INTEGER,
  86. similarity FLOAT
  87. ) AS $$
  88. BEGIN
  89. RETURN QUERY
  90. SELECT
  91. tc.id AS chunk_id,
  92. tc.document_id,
  93. tc.content,
  94. tc.chunk_index,
  95. 1 - (ve.embedding <=> query_embedding) AS similarity
  96. FROM text_chunks tc
  97. JOIN vector_embeddings ve ON tc.id = ve.chunk_id
  98. ORDER BY ve.embedding <=> query_embedding
  99. LIMIT result_limit;
  100. END;
  101. $$ LANGUAGE plpgsql;
  102. -- ============================================
  103. -- 4. 注释
  104. -- ============================================
  105. COMMENT ON TABLE text_chunks IS '文本分块表,存储文档分块后的文本片段';
  106. COMMENT ON COLUMN text_chunks.document_id IS '关联的文档ID';
  107. COMMENT ON COLUMN text_chunks.text_storage_id IS '关联的文本存储ID';
  108. COMMENT ON COLUMN text_chunks.chunk_index IS '分块在文档中的顺序索引';
  109. COMMENT ON COLUMN text_chunks.content IS '分块文本内容';
  110. COMMENT ON COLUMN text_chunks.token_count IS '估算的Token数量';
  111. COMMENT ON COLUMN text_chunks.metadata IS '元数据(如页码、段落位置等)';
  112. COMMENT ON TABLE vector_embeddings IS '向量嵌入表,存储文本块的向量表示';
  113. COMMENT ON COLUMN vector_embeddings.chunk_id IS '关联的文本分块ID';
  114. COMMENT ON COLUMN vector_embeddings.embedding IS '768维向量嵌入(nomic-embed-text)';
  115. COMMENT ON COLUMN vector_embeddings.model_name IS '使用的嵌入模型名称';