| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127 |
- -- ============================================
- -- RAG 向量化存储相关表
- -- 灵越智报 v2.0
- -- ============================================
- -- 启用 pgvector 扩展(需要先安装:apt install postgresql-15-pgvector)
- CREATE EXTENSION IF NOT EXISTS vector;
- -- ============================================
- -- 1. 文本分块表 (text_chunks)
- -- ============================================
- CREATE TABLE IF NOT EXISTS text_chunks (
- id VARCHAR(32) PRIMARY KEY,
- document_id VARCHAR(32) NOT NULL,
- text_storage_id VARCHAR(32),
- chunk_index INTEGER NOT NULL,
- content TEXT NOT NULL,
- token_count INTEGER,
- metadata JSONB DEFAULT '{}',
- created_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
- updated_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP
- );
- -- 索引
- CREATE INDEX IF NOT EXISTS idx_text_chunks_document_id ON text_chunks(document_id);
- CREATE INDEX IF NOT EXISTS idx_text_chunks_text_storage_id ON text_chunks(text_storage_id);
- CREATE INDEX IF NOT EXISTS idx_text_chunks_chunk_index ON text_chunks(document_id, chunk_index);
- -- 更新时间触发器
- CREATE TRIGGER update_text_chunks_updated_at BEFORE UPDATE ON text_chunks
- FOR EACH ROW EXECUTE FUNCTION update_updated_at_column();
- -- ============================================
- -- 2. 向量嵌入表 (vector_embeddings)
- -- ============================================
- CREATE TABLE IF NOT EXISTS vector_embeddings (
- id VARCHAR(32) PRIMARY KEY,
- chunk_id VARCHAR(32) NOT NULL REFERENCES text_chunks(id) ON DELETE CASCADE,
- embedding vector(768), -- nomic-embed-text 维度为 768
- model_name VARCHAR(100) DEFAULT 'nomic-embed-text',
- created_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP
- );
- -- 普通索引
- CREATE INDEX IF NOT EXISTS idx_vector_embeddings_chunk_id ON vector_embeddings(chunk_id);
- CREATE INDEX IF NOT EXISTS idx_vector_embeddings_model ON vector_embeddings(model_name);
- -- HNSW 向量索引(用于高效相似度检索)
- -- 使用余弦距离操作符
- CREATE INDEX IF NOT EXISTS idx_vector_embeddings_hnsw ON vector_embeddings
- USING hnsw (embedding vector_cosine_ops);
- -- ============================================
- -- 3. 辅助函数:向量相似度检索
- -- ============================================
- -- 按文档ID检索相似文本块
- CREATE OR REPLACE FUNCTION search_similar_chunks(
- query_embedding vector(768),
- target_document_id VARCHAR(32),
- result_limit INTEGER DEFAULT 3
- )
- RETURNS TABLE (
- chunk_id VARCHAR(32),
- document_id VARCHAR(32),
- content TEXT,
- chunk_index INTEGER,
- similarity FLOAT
- ) AS $$
- BEGIN
- RETURN QUERY
- SELECT
- tc.id AS chunk_id,
- tc.document_id,
- tc.content,
- tc.chunk_index,
- 1 - (ve.embedding <=> query_embedding) AS similarity
- FROM text_chunks tc
- JOIN vector_embeddings ve ON tc.id = ve.chunk_id
- WHERE tc.document_id = target_document_id
- ORDER BY ve.embedding <=> query_embedding
- LIMIT result_limit;
- END;
- $$ LANGUAGE plpgsql;
- -- 全局检索相似文本块(不限制文档)
- CREATE OR REPLACE FUNCTION search_similar_chunks_global(
- query_embedding vector(768),
- result_limit INTEGER DEFAULT 5
- )
- RETURNS TABLE (
- chunk_id VARCHAR(32),
- document_id VARCHAR(32),
- content TEXT,
- chunk_index INTEGER,
- similarity FLOAT
- ) AS $$
- BEGIN
- RETURN QUERY
- SELECT
- tc.id AS chunk_id,
- tc.document_id,
- tc.content,
- tc.chunk_index,
- 1 - (ve.embedding <=> query_embedding) AS similarity
- FROM text_chunks tc
- JOIN vector_embeddings ve ON tc.id = ve.chunk_id
- ORDER BY ve.embedding <=> query_embedding
- LIMIT result_limit;
- END;
- $$ LANGUAGE plpgsql;
- -- ============================================
- -- 4. 注释
- -- ============================================
- COMMENT ON TABLE text_chunks IS '文本分块表,存储文档分块后的文本片段';
- COMMENT ON COLUMN text_chunks.document_id IS '关联的文档ID';
- COMMENT ON COLUMN text_chunks.text_storage_id IS '关联的文本存储ID';
- COMMENT ON COLUMN text_chunks.chunk_index IS '分块在文档中的顺序索引';
- COMMENT ON COLUMN text_chunks.content IS '分块文本内容';
- COMMENT ON COLUMN text_chunks.token_count IS '估算的Token数量';
- COMMENT ON COLUMN text_chunks.metadata IS '元数据(如页码、段落位置等)';
- COMMENT ON TABLE vector_embeddings IS '向量嵌入表,存储文本块的向量表示';
- COMMENT ON COLUMN vector_embeddings.chunk_id IS '关联的文本分块ID';
- COMMENT ON COLUMN vector_embeddings.embedding IS '768维向量嵌入(nomic-embed-text)';
- COMMENT ON COLUMN vector_embeddings.model_name IS '使用的嵌入模型名称';
|