-- File: openccb/services/lms-service/migrations/20260319000000_pgvector_knowledge_embeddings.sql
-- (file viewer metadata: 136 lines, 4.2 KiB, PL/pgSQL)

-- PGVector Embeddings for Knowledge Base (LMS)
-- Enables semantic search for AI tutor chat with RAG
-- Make sure the pgvector extension is present. It should already have been
-- enabled by the CMS migrations; IF NOT EXISTS keeps this statement a no-op then.
CREATE EXTENSION IF NOT EXISTS vector;

-- New knowledge_base columns:
--   embedding            768-dimension vector (sized for the nomic-embed-text model)
--   embedding_updated_at timestamp associated with the stored embedding
ALTER TABLE knowledge_base
    ADD COLUMN IF NOT EXISTS embedding vector(768),
    ADD COLUMN IF NOT EXISTS embedding_updated_at TIMESTAMPTZ;

-- Approximate nearest-neighbour index for cosine-distance search.
-- IVFFlat is intended for tables with more than ~10k rows; tune "lists" to the
-- expected data size (rows / 1000 while the table holds fewer than 1M rows).
-- NOTE(review): if this migration is what adds the embedding column, the index
-- is built before any vectors exist; pgvector recommends building IVFFlat
-- indexes on populated tables, so consider a REINDEX after the first backfill.
CREATE INDEX IF NOT EXISTS idx_knowledge_base_embeddings
    ON knowledge_base
    USING ivfflat (embedding vector_cosine_ops)
    WITH (lists = 100);

-- Supports filtering rows by embedding status via embedding_updated_at.
CREATE INDEX IF NOT EXISTS idx_knowledge_base_embedding_updated
    ON knowledge_base (embedding_updated_at);
-- Function to search the knowledge base by semantic similarity within one course.
--
-- Parameters:
--   p_course_id        only chunks belonging to this course are considered
--   p_query_embedding  768-dimension embedding of the search query
--   p_limit            maximum number of rows to return (default 10)
--   p_threshold        minimum similarity, inclusive (default 0.5)
--
-- Returns one row per matching chunk, most similar first. "similarity" is
-- 1 - cosine distance between the stored embedding and the query embedding.
-- Rows without an embedding are skipped.
CREATE OR REPLACE FUNCTION search_knowledge_semantic(
    p_course_id UUID,
    p_query_embedding vector(768),
    p_limit INTEGER DEFAULT 10,
    p_threshold REAL DEFAULT 0.5
)
RETURNS TABLE (
    id UUID,
    course_id UUID,
    lesson_id UUID,
    block_id UUID,
    content_chunk TEXT,
    similarity REAL,
    metadata JSONB
) AS $$
BEGIN
    RETURN QUERY
    SELECT
        kb.id,
        kb.course_id,
        kb.lesson_id,
        kb.block_id,
        kb.content_chunk,
        -- pgvector's <=> returns double precision, but RETURN QUERY requires the
        -- column type to match the declared REAL exactly; without this cast the
        -- call fails with "structure of query does not match function result type".
        (1 - (kb.embedding <=> p_query_embedding))::REAL AS similarity,
        kb.metadata
    FROM knowledge_base AS kb
    WHERE kb.course_id = p_course_id
      AND kb.embedding IS NOT NULL
      AND 1 - (kb.embedding <=> p_query_embedding) >= p_threshold
    -- Order by raw distance (ascending) so the ivfflat index can serve the scan.
    ORDER BY kb.embedding <=> p_query_embedding
    LIMIT p_limit;
END;
$$ LANGUAGE plpgsql STABLE;
-- Function to search the knowledge base across all courses (admin/global search).
--
-- Parameters:
--   p_query_embedding  768-dimension embedding of the search query
--   p_limit            maximum number of rows to return (default 20)
--   p_threshold        minimum similarity, inclusive (default 0.6)
--
-- Returns one row per matching chunk, most similar first, enriched with the
-- course name and lesson title. Both joins are LEFT JOINs, so course_name and
-- lesson_title are NULL when no matching course/lesson row exists.
CREATE OR REPLACE FUNCTION search_knowledge_global(
    p_query_embedding vector(768),
    p_limit INTEGER DEFAULT 20,
    p_threshold REAL DEFAULT 0.6
)
RETURNS TABLE (
    id UUID,
    course_id UUID,
    course_name VARCHAR,
    lesson_id UUID,
    lesson_title VARCHAR,
    content_chunk TEXT,
    similarity REAL
) AS $$
BEGIN
    RETURN QUERY
    SELECT
        kb.id,
        kb.course_id,
        -- NOTE(review): the column types of courses.name and lessons.title are
        -- not visible in this migration. RETURN QUERY needs an exact match with
        -- the declared VARCHAR, so cast explicitly (a no-op if already VARCHAR).
        c.name::VARCHAR AS course_name,
        kb.lesson_id,
        l.title::VARCHAR AS lesson_title,
        kb.content_chunk,
        -- pgvector's <=> returns double precision; cast to the declared REAL,
        -- otherwise the call fails with "structure of query does not match
        -- function result type".
        (1 - (kb.embedding <=> p_query_embedding))::REAL AS similarity
    FROM knowledge_base AS kb
    LEFT JOIN courses AS c
        ON c.id = kb.course_id
    LEFT JOIN lessons AS l
        ON l.id = kb.lesson_id
    WHERE kb.embedding IS NOT NULL
      AND 1 - (kb.embedding <=> p_query_embedding) >= p_threshold
    ORDER BY kb.embedding <=> p_query_embedding
    LIMIT p_limit;
END;
$$ LANGUAGE plpgsql STABLE;
-- Function to get contextual chunks for a specific lesson.
-- Combines semantic search with exact lesson matching.
--
-- Parameters:
--   p_lesson_id        lesson whose own chunks are always eligible and ranked first
--   p_query_embedding  768-dimension embedding of the query
--   p_limit            maximum number of rows to return (default 5)
--
-- A chunk qualifies when it has an embedding and either belongs to p_lesson_id
-- or has similarity >= 0.6 to the query. Results list the lesson's own chunks
-- first, then the rest, each group ordered by increasing cosine distance.
CREATE OR REPLACE FUNCTION get_lesson_context(
    p_lesson_id UUID,
    p_query_embedding vector(768),
    p_limit INTEGER DEFAULT 5
)
RETURNS TABLE (
    id UUID,
    content_chunk TEXT,
    similarity REAL,
    is_exact_lesson BOOLEAN,
    metadata JSONB
) AS $$
BEGIN
    RETURN QUERY
    SELECT
        kb.id,
        kb.content_chunk,
        -- pgvector's <=> returns double precision; cast to the declared REAL,
        -- otherwise the call fails with "structure of query does not match
        -- function result type".
        (1 - (kb.embedding <=> p_query_embedding))::REAL AS similarity,
        -- The comparison is NULL when either side is NULL; report that as FALSE.
        COALESCE(kb.lesson_id = p_lesson_id, FALSE) AS is_exact_lesson,
        kb.metadata
    FROM knowledge_base AS kb
    WHERE kb.embedding IS NOT NULL
      AND (
          kb.lesson_id = p_lesson_id
          OR 1 - (kb.embedding <=> p_query_embedding) >= 0.6
      )
    ORDER BY
        -- DESC sorts NULLs first in PostgreSQL, so an un-coalesced comparison
        -- would rank chunks with a NULL lesson_id ahead of exact lesson matches.
        COALESCE(kb.lesson_id = p_lesson_id, FALSE) DESC,
        kb.embedding <=> p_query_embedding
    LIMIT p_limit;
END;
$$ LANGUAGE plpgsql STABLE;
-- Catalog comments for the embedding columns and search functions.
-- The dimension quoted for the embedding column matches its vector(768) type.
COMMENT ON COLUMN knowledge_base.embedding IS 'Semantic embedding vector for RAG search (nomic-embed-text, 768 dimensions)';
COMMENT ON COLUMN knowledge_base.embedding_updated_at IS 'Timestamp when embedding was last generated';
COMMENT ON FUNCTION search_knowledge_semantic IS 'Search knowledge base by semantic similarity within a course';
COMMENT ON FUNCTION search_knowledge_global IS 'Search knowledge base across all courses (global admin search)';
COMMENT ON FUNCTION get_lesson_context IS 'Get contextual chunks for a lesson, prioritizing exact lesson match';