-- ============================================ -- RAG 向量化存储相关表 -- 灵越智报 v2.0 -- ============================================ -- 启用 pgvector 扩展(需要先安装:apt install postgresql-15-pgvector) CREATE EXTENSION IF NOT EXISTS vector; -- ============================================ -- 1. 文本分块表 (text_chunks) -- ============================================ CREATE TABLE IF NOT EXISTS text_chunks ( id VARCHAR(32) PRIMARY KEY, document_id VARCHAR(32) NOT NULL, text_storage_id VARCHAR(32), chunk_index INTEGER NOT NULL, content TEXT NOT NULL, token_count INTEGER, metadata JSONB DEFAULT '{}', created_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP, updated_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP ); -- 索引 CREATE INDEX IF NOT EXISTS idx_text_chunks_document_id ON text_chunks(document_id); CREATE INDEX IF NOT EXISTS idx_text_chunks_text_storage_id ON text_chunks(text_storage_id); CREATE INDEX IF NOT EXISTS idx_text_chunks_chunk_index ON text_chunks(document_id, chunk_index); -- 更新时间触发器 CREATE TRIGGER update_text_chunks_updated_at BEFORE UPDATE ON text_chunks FOR EACH ROW EXECUTE FUNCTION update_updated_at_column(); -- ============================================ -- 2. 向量嵌入表 (vector_embeddings) -- ============================================ CREATE TABLE IF NOT EXISTS vector_embeddings ( id VARCHAR(32) PRIMARY KEY, chunk_id VARCHAR(32) NOT NULL REFERENCES text_chunks(id) ON DELETE CASCADE, embedding vector(768), -- nomic-embed-text 维度为 768 model_name VARCHAR(100) DEFAULT 'nomic-embed-text', created_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP ); -- 普通索引 CREATE INDEX IF NOT EXISTS idx_vector_embeddings_chunk_id ON vector_embeddings(chunk_id); CREATE INDEX IF NOT EXISTS idx_vector_embeddings_model ON vector_embeddings(model_name); -- HNSW 向量索引(用于高效相似度检索) -- 使用余弦距离操作符 CREATE INDEX IF NOT EXISTS idx_vector_embeddings_hnsw ON vector_embeddings USING hnsw (embedding vector_cosine_ops); -- ============================================ -- 3. 辅助函数:向量相似度检索 -- ============================================ -- 按文档ID检索相似文本块 CREATE OR REPLACE FUNCTION search_similar_chunks( query_embedding vector(768), target_document_id VARCHAR(32), result_limit INTEGER DEFAULT 3 ) RETURNS TABLE ( chunk_id VARCHAR(32), document_id VARCHAR(32), content TEXT, chunk_index INTEGER, similarity FLOAT ) AS $$ BEGIN RETURN QUERY SELECT tc.id AS chunk_id, tc.document_id, tc.content, tc.chunk_index, 1 - (ve.embedding <=> query_embedding) AS similarity FROM text_chunks tc JOIN vector_embeddings ve ON tc.id = ve.chunk_id WHERE tc.document_id = target_document_id ORDER BY ve.embedding <=> query_embedding LIMIT result_limit; END; $$ LANGUAGE plpgsql; -- 全局检索相似文本块(不限制文档) CREATE OR REPLACE FUNCTION search_similar_chunks_global( query_embedding vector(768), result_limit INTEGER DEFAULT 5 ) RETURNS TABLE ( chunk_id VARCHAR(32), document_id VARCHAR(32), content TEXT, chunk_index INTEGER, similarity FLOAT ) AS $$ BEGIN RETURN QUERY SELECT tc.id AS chunk_id, tc.document_id, tc.content, tc.chunk_index, 1 - (ve.embedding <=> query_embedding) AS similarity FROM text_chunks tc JOIN vector_embeddings ve ON tc.id = ve.chunk_id ORDER BY ve.embedding <=> query_embedding LIMIT result_limit; END; $$ LANGUAGE plpgsql; -- ============================================ -- 4. 注释 -- ============================================ COMMENT ON TABLE text_chunks IS '文本分块表,存储文档分块后的文本片段'; COMMENT ON COLUMN text_chunks.document_id IS '关联的文档ID'; COMMENT ON COLUMN text_chunks.text_storage_id IS '关联的文本存储ID'; COMMENT ON COLUMN text_chunks.chunk_index IS '分块在文档中的顺序索引'; COMMENT ON COLUMN text_chunks.content IS '分块文本内容'; COMMENT ON COLUMN text_chunks.token_count IS '估算的Token数量'; COMMENT ON COLUMN text_chunks.metadata IS '元数据(如页码、段落位置等)'; COMMENT ON TABLE vector_embeddings IS '向量嵌入表,存储文本块的向量表示'; COMMENT ON COLUMN vector_embeddings.chunk_id IS '关联的文本分块ID'; COMMENT ON COLUMN vector_embeddings.embedding IS '768维向量嵌入(nomic-embed-text)'; COMMENT ON COLUMN vector_embeddings.model_name IS '使用的嵌入模型名称';