Просмотр исходного кода

feat: 添加兼容 SimpleModel 的 RAG 表脚本

创建与 SimpleModel 基类字段兼容的 RAG 表:
- text_chunks: 使用 create_time/update_time
- vector_embeddings: 使用 create_time
- 添加审计字段(create_by, update_by 等)
何文松 1 месяц назад
Родитель
Коммит
7f3acc9c89
1 изменённый файл: 115 добавлено и 0 удалено
  1. 115 0
      backend/sql/rag_tables_compatible.sql

+ 115 - 0
backend/sql/rag_tables_compatible.sql

@@ -0,0 +1,115 @@
+-- ============================================
+-- RAG vector storage tables (SimpleModel-compatible version)
+-- Lingyue Zhibao (灵越智报) v2.0
+-- Column names are compatible with the SimpleModel base class
+-- ============================================
+
+-- Enable the pgvector extension (it must be installed first: apt install postgresql-15-pgvector)
+CREATE EXTENSION IF NOT EXISTS vector;  -- needed for the vector(768) column type and the hnsw index created later in this script
+
+-- ============================================
+-- 1. Text chunk table (text_chunks): one row per chunk of document text
+-- ============================================
+CREATE TABLE IF NOT EXISTS text_chunks (
+    id VARCHAR(32) PRIMARY KEY,  -- chunk identifier; referenced by vector_embeddings.chunk_id
+    document_id VARCHAR(32) NOT NULL,  -- document identifier used as the search filter; no foreign key is declared in this script
+    text_storage_id VARCHAR(32),  -- nullable; presumably points at a text-storage record (TODO confirm target table)
+    chunk_index INTEGER NOT NULL,  -- presumably the chunk's position within its document (TODO confirm whether 0- or 1-based)
+    content TEXT NOT NULL,  -- chunk text; returned by the search functions
+    token_count INTEGER,  -- nullable; presumably the token count of content (TODO confirm tokenizer)
+    metadata JSONB DEFAULT '{}',  -- free-form JSON, defaults to an empty object; column itself is nullable
+    create_by VARCHAR(36),  -- audit field: creator id
+    create_by_name VARCHAR(100),  -- audit field: creator display name
+    create_time TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,  -- audit field; NOTE(review): TIMESTAMP without time zone, confirm this is what SimpleModel expects
+    update_by VARCHAR(36),  -- audit field: last updater id
+    update_by_name VARCHAR(100),  -- audit field: last updater display name
+    update_time TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP  -- DEFAULT only applies on INSERT; this script defines no trigger that refreshes it on UPDATE
+);
+
+-- Indexes on text_chunks
+CREATE INDEX IF NOT EXISTS idx_text_chunks_document_id ON text_chunks(document_id);  -- NOTE(review): likely redundant, the composite (document_id, chunk_index) index below has the same leading column
+CREATE INDEX IF NOT EXISTS idx_text_chunks_text_storage_id ON text_chunks(text_storage_id);  -- lookup by text_storage_id
+CREATE INDEX IF NOT EXISTS idx_text_chunks_chunk_index ON text_chunks(document_id, chunk_index);  -- non-unique: duplicate (document_id, chunk_index) pairs are not prevented
+
+-- ============================================
+-- 2. Vector embedding table (vector_embeddings): embedding rows attached to text chunks
+-- ============================================
+CREATE TABLE IF NOT EXISTS vector_embeddings (
+    id VARCHAR(32) PRIMARY KEY,  -- embedding row identifier
+    chunk_id VARCHAR(32) NOT NULL REFERENCES text_chunks(id) ON DELETE CASCADE,  -- deleting a chunk deletes its embeddings; not UNIQUE, so one chunk may have several embedding rows
+    embedding vector(768),  -- nomic-embed-text produces 768-dimension vectors; NOTE(review): column is nullable
+    model_name VARCHAR(100) DEFAULT 'nomic-embed-text',  -- name of the embedding model; nullable, defaults to nomic-embed-text
+    create_time TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP  -- only create_time is kept here; no update/audit columns on this table
+);
+
+-- Regular indexes
+CREATE INDEX IF NOT EXISTS idx_vector_embeddings_chunk_id ON vector_embeddings(chunk_id);  -- matches the join column used by the search functions
+CREATE INDEX IF NOT EXISTS idx_vector_embeddings_model ON vector_embeddings(model_name);  -- lookup by embedding model name
+
+-- HNSW vector index (for efficient similarity search)
+-- Uses the cosine-distance operator class, matching the <=> operator used by the search functions
+CREATE INDEX IF NOT EXISTS idx_vector_embeddings_hnsw ON vector_embeddings 
+    USING hnsw (embedding vector_cosine_ops);
+
+-- ============================================
+-- 3. Helper functions: vector similarity search
+-- ============================================
+
+-- search_similar_chunks: find the chunks of ONE document closest to a query vector.
+--
+-- Parameters:
+--   query_embedding     768-dimension query vector.
+--   target_document_id  only rows with text_chunks.document_id equal to this value are searched.
+--   result_limit        maximum number of rows returned (default 3).
+--
+-- Returns one row per matching (chunk, embedding) pair, ordered by ascending
+-- cosine distance; similarity is computed as 1 - cosine distance.
+-- A chunk that has several rows in vector_embeddings appears once per row.
+--
+-- Rows whose embedding is NULL are skipped: they have no distance to rank by,
+-- and pgvector does not index NULL vectors, so including them would make the
+-- result depend on whether the planner picks the HNSW index or a sequential scan.
+--
+-- Declared STABLE because the function only reads tables; the default
+-- (VOLATILE) needlessly restricts how the planner may treat calls to it.
+CREATE OR REPLACE FUNCTION search_similar_chunks(
+    query_embedding vector(768),
+    target_document_id VARCHAR(32),
+    result_limit INTEGER DEFAULT 3
+)
+RETURNS TABLE (
+    chunk_id VARCHAR(32),
+    document_id VARCHAR(32),
+    content TEXT,
+    chunk_index INTEGER,
+    similarity FLOAT
+) AS $$
+BEGIN
+    -- Every column reference is table-qualified on purpose: the RETURNS TABLE
+    -- column names are also plpgsql variables inside this body.
+    RETURN QUERY
+    SELECT
+        tc.id AS chunk_id,
+        tc.document_id,
+        tc.content,
+        tc.chunk_index,
+        1 - (ve.embedding <=> query_embedding) AS similarity
+    FROM text_chunks AS tc
+    INNER JOIN vector_embeddings AS ve
+        ON ve.chunk_id = tc.id
+    WHERE tc.document_id = target_document_id
+      AND ve.embedding IS NOT NULL
+    -- Order by the raw distance expression (not the similarity alias) so the
+    -- ordering matches the operator class of the HNSW index.
+    ORDER BY ve.embedding <=> query_embedding
+    LIMIT result_limit;
+END;
+$$ LANGUAGE plpgsql STABLE;
+
+-- search_similar_chunks_global: find the chunks closest to a query vector
+-- across ALL documents (no document filter).
+--
+-- Parameters:
+--   query_embedding  768-dimension query vector.
+--   result_limit     maximum number of rows returned (default 5).
+--
+-- Returns one row per matching (chunk, embedding) pair, ordered by ascending
+-- cosine distance; similarity is computed as 1 - cosine distance.
+-- A chunk that has several rows in vector_embeddings appears once per row.
+--
+-- Rows whose embedding is NULL are skipped: they have no distance to rank by,
+-- and pgvector does not index NULL vectors, so including them would make the
+-- result depend on whether the planner picks the HNSW index or a sequential scan.
+--
+-- Declared STABLE because the function only reads tables; the default
+-- (VOLATILE) needlessly restricts how the planner may treat calls to it.
+CREATE OR REPLACE FUNCTION search_similar_chunks_global(
+    query_embedding vector(768),
+    result_limit INTEGER DEFAULT 5
+)
+RETURNS TABLE (
+    chunk_id VARCHAR(32),
+    document_id VARCHAR(32),
+    content TEXT,
+    chunk_index INTEGER,
+    similarity FLOAT
+) AS $$
+BEGIN
+    -- Every column reference is table-qualified on purpose: the RETURNS TABLE
+    -- column names are also plpgsql variables inside this body.
+    RETURN QUERY
+    SELECT
+        tc.id AS chunk_id,
+        tc.document_id,
+        tc.content,
+        tc.chunk_index,
+        1 - (ve.embedding <=> query_embedding) AS similarity
+    FROM text_chunks AS tc
+    INNER JOIN vector_embeddings AS ve
+        ON ve.chunk_id = tc.id
+    WHERE ve.embedding IS NOT NULL
+    -- Order by the raw distance expression (not the similarity alias) so the
+    -- ordering matches the operator class of the HNSW index.
+    ORDER BY ve.embedding <=> query_embedding
+    LIMIT result_limit;
+END;
+$$ LANGUAGE plpgsql STABLE;
+
+-- Report the creation result
+SELECT 'RAG 表创建成功(兼容 SimpleModel)' AS result;  -- message reads: "RAG tables created successfully (SimpleModel-compatible)"