| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115 |
- -- ============================================
- -- RAG 向量化存储相关表(兼容 SimpleModel 版本)
- -- 灵越智报 v2.0
- -- 字段名与 SimpleModel 基类兼容
- -- ============================================
- -- 启用 pgvector 扩展(需要先安装:apt install postgresql-15-pgvector)
- CREATE EXTENSION IF NOT EXISTS vector;
- -- ============================================
- -- 1. 文本分块表 (text_chunks)
- -- ============================================
- CREATE TABLE IF NOT EXISTS text_chunks (
- id VARCHAR(36) PRIMARY KEY,
- document_id VARCHAR(36) NOT NULL,
- text_storage_id VARCHAR(36),
- chunk_index INTEGER NOT NULL,
- content TEXT NOT NULL,
- token_count INTEGER,
- metadata JSONB DEFAULT '{}',
- create_by VARCHAR(36),
- create_by_name VARCHAR(100),
- create_time TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
- update_by VARCHAR(36),
- update_by_name VARCHAR(100),
- update_time TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP
- );
- -- 索引
- CREATE INDEX IF NOT EXISTS idx_text_chunks_document_id ON text_chunks(document_id);
- CREATE INDEX IF NOT EXISTS idx_text_chunks_text_storage_id ON text_chunks(text_storage_id);
- CREATE INDEX IF NOT EXISTS idx_text_chunks_chunk_index ON text_chunks(document_id, chunk_index);
- -- ============================================
- -- 2. 向量嵌入表 (vector_embeddings)
- -- ============================================
- CREATE TABLE IF NOT EXISTS vector_embeddings (
- id VARCHAR(36) PRIMARY KEY,
- chunk_id VARCHAR(36) NOT NULL REFERENCES text_chunks(id) ON DELETE CASCADE,
- embedding vector(768), -- nomic-embed-text 维度为 768
- model_name VARCHAR(100) DEFAULT 'nomic-embed-text',
- create_time TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP
- );
- -- 普通索引
- CREATE INDEX IF NOT EXISTS idx_vector_embeddings_chunk_id ON vector_embeddings(chunk_id);
- CREATE INDEX IF NOT EXISTS idx_vector_embeddings_model ON vector_embeddings(model_name);
- -- HNSW 向量索引(用于高效相似度检索)
- -- 使用余弦距离操作符
- CREATE INDEX IF NOT EXISTS idx_vector_embeddings_hnsw ON vector_embeddings
- USING hnsw (embedding vector_cosine_ops);
- -- ============================================
- -- 3. 辅助函数:向量相似度检索
- -- ============================================
- -- 按文档ID检索相似文本块
- CREATE OR REPLACE FUNCTION search_similar_chunks(
- query_embedding vector(768),
- target_document_id VARCHAR(36),
- result_limit INTEGER DEFAULT 3
- )
- RETURNS TABLE (
- chunk_id VARCHAR(36),
- document_id VARCHAR(36),
- content TEXT,
- chunk_index INTEGER,
- similarity FLOAT
- ) AS $$
- BEGIN
- RETURN QUERY
- SELECT
- tc.id AS chunk_id,
- tc.document_id,
- tc.content,
- tc.chunk_index,
- 1 - (ve.embedding <=> query_embedding) AS similarity
- FROM text_chunks tc
- JOIN vector_embeddings ve ON tc.id = ve.chunk_id
- WHERE tc.document_id = target_document_id
- ORDER BY ve.embedding <=> query_embedding
- LIMIT result_limit;
- END;
- $$ LANGUAGE plpgsql;
- -- 全局检索相似文本块(不限制文档)
- CREATE OR REPLACE FUNCTION search_similar_chunks_global(
- query_embedding vector(768),
- result_limit INTEGER DEFAULT 5
- )
- RETURNS TABLE (
- chunk_id VARCHAR(36),
- document_id VARCHAR(36),
- content TEXT,
- chunk_index INTEGER,
- similarity FLOAT
- ) AS $$
- BEGIN
- RETURN QUERY
- SELECT
- tc.id AS chunk_id,
- tc.document_id,
- tc.content,
- tc.chunk_index,
- 1 - (ve.embedding <=> query_embedding) AS similarity
- FROM text_chunks tc
- JOIN vector_embeddings ve ON tc.id = ve.chunk_id
- ORDER BY ve.embedding <=> query_embedding
- LIMIT result_limit;
- END;
- $$ LANGUAGE plpgsql;
- -- 显示创建结果
- SELECT 'RAG 表创建成功(兼容 SimpleModel)' AS result;
|