feat: implementing embedding AI
This commit is contained in:
@@ -0,0 +1,135 @@
|
||||
-- PGVector Embeddings for Knowledge Base (LMS)
|
||||
-- Enables semantic search for AI tutor chat with RAG
|
||||
|
||||
-- Enable pgvector extension (should already be enabled from CMS)
|
||||
CREATE EXTENSION IF NOT EXISTS vector;
|
||||
|
||||
-- Add embedding column to knowledge_base table
|
||||
-- Using 768 dimensions for nomic-embed-text model
|
||||
ALTER TABLE knowledge_base
|
||||
ADD COLUMN IF NOT EXISTS embedding vector(768);
|
||||
|
||||
-- Add embedding_updated_at timestamp
|
||||
ALTER TABLE knowledge_base
|
||||
ADD COLUMN IF NOT EXISTS embedding_updated_at TIMESTAMPTZ;
|
||||
|
||||
-- Create index for fast semantic search (IVFFlat for >10k rows)
|
||||
-- Adjust lists parameter based on expected data size:
|
||||
-- lists = rows / 1000 for < 1M rows
|
||||
CREATE INDEX IF NOT EXISTS idx_knowledge_base_embeddings
|
||||
ON knowledge_base
|
||||
USING ivfflat (embedding vector_cosine_ops)
|
||||
WITH (lists = 100);
|
||||
|
||||
-- Create index for filtering by embedding status
|
||||
CREATE INDEX IF NOT EXISTS idx_knowledge_base_embedding_updated
|
||||
ON knowledge_base (embedding_updated_at);
|
||||
|
||||
-- Function to search knowledge base by semantic similarity
|
||||
CREATE OR REPLACE FUNCTION search_knowledge_semantic(
|
||||
p_course_id UUID,
|
||||
p_query_embedding vector(768),
|
||||
p_limit INTEGER DEFAULT 10,
|
||||
p_threshold REAL DEFAULT 0.5
|
||||
)
|
||||
RETURNS TABLE (
|
||||
id UUID,
|
||||
course_id UUID,
|
||||
lesson_id UUID,
|
||||
block_id UUID,
|
||||
content_chunk TEXT,
|
||||
similarity REAL,
|
||||
metadata JSONB
|
||||
) AS $$
|
||||
BEGIN
|
||||
RETURN QUERY
|
||||
SELECT
|
||||
kb.id,
|
||||
kb.course_id,
|
||||
kb.lesson_id,
|
||||
kb.block_id,
|
||||
kb.content_chunk,
|
||||
1 - (kb.embedding <=> p_query_embedding) AS similarity,
|
||||
kb.metadata
|
||||
FROM knowledge_base kb
|
||||
WHERE kb.course_id = p_course_id
|
||||
AND kb.embedding IS NOT NULL
|
||||
AND 1 - (kb.embedding <=> p_query_embedding) >= p_threshold
|
||||
ORDER BY kb.embedding <=> p_query_embedding
|
||||
LIMIT p_limit;
|
||||
END;
|
||||
$$ LANGUAGE plpgsql STABLE;
|
||||
|
||||
-- Function to search knowledge base across all courses (for admin/global search)
|
||||
CREATE OR REPLACE FUNCTION search_knowledge_global(
|
||||
p_query_embedding vector(768),
|
||||
p_limit INTEGER DEFAULT 20,
|
||||
p_threshold REAL DEFAULT 0.6
|
||||
)
|
||||
RETURNS TABLE (
|
||||
id UUID,
|
||||
course_id UUID,
|
||||
course_name VARCHAR,
|
||||
lesson_id UUID,
|
||||
lesson_title VARCHAR,
|
||||
content_chunk TEXT,
|
||||
similarity REAL
|
||||
) AS $$
|
||||
BEGIN
|
||||
RETURN QUERY
|
||||
SELECT
|
||||
kb.id,
|
||||
kb.course_id,
|
||||
c.name AS course_name,
|
||||
kb.lesson_id,
|
||||
l.title AS lesson_title,
|
||||
kb.content_chunk,
|
||||
1 - (kb.embedding <=> p_query_embedding) AS similarity
|
||||
FROM knowledge_base kb
|
||||
LEFT JOIN courses c ON c.id = kb.course_id
|
||||
LEFT JOIN lessons l ON l.id = kb.lesson_id
|
||||
WHERE kb.embedding IS NOT NULL
|
||||
AND 1 - (kb.embedding <=> p_query_embedding) >= p_threshold
|
||||
ORDER BY kb.embedding <=> p_query_embedding
|
||||
LIMIT p_limit;
|
||||
END;
|
||||
$$ LANGUAGE plpgsql STABLE;
|
||||
|
||||
-- Function to get contextual chunks for a specific lesson
|
||||
-- Combines semantic search with exact lesson matching
|
||||
CREATE OR REPLACE FUNCTION get_lesson_context(
|
||||
p_lesson_id UUID,
|
||||
p_query_embedding vector(768),
|
||||
p_limit INTEGER DEFAULT 5
|
||||
)
|
||||
RETURNS TABLE (
|
||||
id UUID,
|
||||
content_chunk TEXT,
|
||||
similarity REAL,
|
||||
is_exact_lesson BOOLEAN,
|
||||
metadata JSONB
|
||||
) AS $$
|
||||
BEGIN
|
||||
RETURN QUERY
|
||||
SELECT
|
||||
kb.id,
|
||||
kb.content_chunk,
|
||||
1 - (kb.embedding <=> p_query_embedding) AS similarity,
|
||||
(kb.lesson_id = p_lesson_id) AS is_exact_lesson,
|
||||
kb.metadata
|
||||
FROM knowledge_base kb
|
||||
WHERE kb.embedding IS NOT NULL
|
||||
AND (kb.lesson_id = p_lesson_id OR 1 - (kb.embedding <=> p_query_embedding) >= 0.6)
|
||||
ORDER BY
|
||||
(kb.lesson_id = p_lesson_id) DESC,
|
||||
kb.embedding <=> p_query_embedding
|
||||
LIMIT p_limit;
|
||||
END;
|
||||
$$ LANGUAGE plpgsql STABLE;
|
||||
|
||||
-- Comments
|
||||
COMMENT ON COLUMN knowledge_base.embedding IS 'Semantic embedding vector for RAG search (nomic-embed-text, 384 dimensions)';
|
||||
COMMENT ON COLUMN knowledge_base.embedding_updated_at IS 'Timestamp when embedding was last generated';
|
||||
COMMENT ON FUNCTION search_knowledge_semantic IS 'Search knowledge base by semantic similarity within a course';
|
||||
COMMENT ON FUNCTION search_knowledge_global IS 'Search knowledge base across all courses (global admin search)';
|
||||
COMMENT ON FUNCTION get_lesson_context IS 'Get contextual chunks for a lesson, prioritizing exact lesson match';
|
||||
Reference in New Issue
Block a user