Files
openccb/services/cms-service/migrations/20260319000002_pgvector_embeddings.sql

168 lines
5.4 KiB
PL/PgSQL

-- PGVector Embeddings Integration
-- Enables semantic search for question bank and RAG generation
-- Enable pgvector extension
CREATE EXTENSION IF NOT EXISTS vector;
-- Add embedding column to question_bank table
-- Using 768 dimensions for nomic-embed-text model
ALTER TABLE question_bank
ADD COLUMN IF NOT EXISTS embedding vector(768);
-- Add embedding_updated_at timestamp
ALTER TABLE question_bank
ADD COLUMN IF NOT EXISTS embedding_updated_at TIMESTAMPTZ;
-- Create index for fast semantic search (IVFFlat for >10k rows)
CREATE INDEX IF NOT EXISTS idx_question_embeddings
ON question_bank
USING ivfflat (embedding vector_cosine_ops)
WITH (lists = 100);
-- Create index for filtering by embedding status
CREATE INDEX IF NOT EXISTS idx_question_embedding_updated
ON question_bank (embedding_updated_at);
-- Function to calculate cosine similarity between two embeddings
CREATE OR REPLACE FUNCTION question_similarity(
q1_id UUID,
q2_id UUID
)
RETURNS REAL AS $$
BEGIN
RETURN (
SELECT qb1.embedding <=> qb2.embedding
FROM question_bank qb1, question_bank qb2
WHERE qb1.id = q1_id AND qb2.id = q2_id
);
END;
$$ LANGUAGE plpgsql STABLE;
-- Function to find similar questions (for duplicate detection)
CREATE OR REPLACE FUNCTION find_similar_questions(
p_question_id UUID,
p_threshold REAL DEFAULT 0.85,
p_limit INTEGER DEFAULT 10
)
RETURNS TABLE (
id UUID,
question_text TEXT,
similarity REAL,
question_type question_bank_type
) AS $$
BEGIN
RETURN QUERY
SELECT
qb.id,
qb.question_text,
1 - (qb.embedding <=> (SELECT embedding FROM question_bank WHERE id = p_question_id)) AS similarity,
qb.question_type
FROM question_bank qb
WHERE qb.id != p_question_id
AND qb.organization_id = (SELECT organization_id FROM question_bank WHERE id = p_question_id)
AND qb.embedding IS NOT NULL
ORDER BY qb.embedding <=> (SELECT embedding FROM question_bank WHERE id = p_question_id)
LIMIT p_limit;
END;
$$ LANGUAGE plpgsql STABLE;
-- Function to search questions by semantic similarity
CREATE OR REPLACE FUNCTION search_questions_semantic(
p_organization_id UUID,
p_query_embedding vector(768),
p_limit INTEGER DEFAULT 20,
p_threshold DOUBLE PRECISION DEFAULT 0.5
)
RETURNS TABLE (
id UUID,
question_text TEXT,
question_type question_bank_type,
similarity DOUBLE PRECISION,
tags TEXT[],
difficulty VARCHAR,
points INTEGER
) AS $$
BEGIN
RETURN QUERY
SELECT
qb.id,
qb.question_text,
qb.question_type,
(1 - (qb.embedding <=> p_query_embedding))::DOUBLE PRECISION AS similarity,
qb.tags,
qb.difficulty,
qb.points
FROM question_bank qb
WHERE qb.organization_id = p_organization_id
AND qb.embedding IS NOT NULL
AND (1 - (qb.embedding <=> p_query_embedding))::DOUBLE PRECISION >= p_threshold
ORDER BY qb.embedding <=> p_query_embedding
LIMIT p_limit;
END;
$$ LANGUAGE plpgsql STABLE;
-- Function to get diverse questions covering multiple topics
-- Uses Maximal Marginal Relevance (MMR) to balance relevance and diversity
CREATE OR REPLACE FUNCTION get_diverse_questions(
p_organization_id UUID,
p_query_embedding vector(768),
p_limit INTEGER DEFAULT 10,
p_lambda DOUBLE PRECISION DEFAULT 0.7 -- 0 = max diversity, 1 = max relevance
)
RETURNS TABLE (
id UUID,
question_text TEXT,
question_type question_bank_type,
similarity DOUBLE PRECISION
) AS $$
DECLARE
selected_ids UUID[] := ARRAY[]::UUID[];
candidate_id UUID;
best_score REAL;
current_score REAL;
diversity_score REAL;
relevance_score REAL;
BEGIN
-- Simple MMR implementation: iteratively select questions
-- that are relevant but dissimilar to already selected ones
FOR i IN 1..p_limit LOOP
SELECT qb.id INTO candidate_id
FROM question_bank qb
WHERE qb.organization_id = p_organization_id
AND qb.id != ALL(selected_ids)
AND qb.embedding IS NOT NULL
ORDER BY
(1 - (qb.embedding <=> p_query_embedding)) * p_lambda -
(COALESCE((
SELECT MAX(1 - (qb.embedding <=> qb2.embedding))
FROM unnest(selected_ids) AS sid
JOIN question_bank qb2 ON qb2.id = sid
), 0)) * (1 - p_lambda)
DESC
LIMIT 1;
EXIT WHEN candidate_id IS NULL;
selected_ids := array_append(selected_ids, candidate_id);
END LOOP;
RETURN QUERY
SELECT
qb.id,
qb.question_text,
qb.question_type,
1 - (qb.embedding <=> p_query_embedding) AS similarity
FROM question_bank qb
WHERE qb.id = ANY(selected_ids)
ORDER BY similarity DESC;
END;
$$ LANGUAGE plpgsql STABLE;
-- Comments
COMMENT ON COLUMN question_bank.embedding IS 'Semantic embedding vector for similarity search (nomic-embed-text, 384 dimensions)';
COMMENT ON COLUMN question_bank.embedding_updated_at IS 'Timestamp when embedding was last generated';
COMMENT ON FUNCTION question_similarity IS 'Calculate cosine similarity between two questions';
COMMENT ON FUNCTION find_similar_questions IS 'Find questions similar to a given question (for duplicate detection)';
COMMENT ON FUNCTION search_questions_semantic IS 'Search questions by semantic similarity using embedding vector';
COMMENT ON FUNCTION get_diverse_questions IS 'Get diverse questions using Maximal Marginal Relevance (MMR)';