feat: Fix tokens Revenue stream

2026-03-19 14:54:31 -03:00
parent 686eb377f0
commit 0598fc4865
10 changed files with 1346 additions and 14 deletions
@@ -0,0 +1,167 @@
+-- PGVector Embeddings Integration
+-- Enables semantic search for question bank and RAG generation
+
+-- Enable pgvector extension
+CREATE EXTENSION IF NOT EXISTS vector;
+
+-- Add embedding column to question_bank table
+-- Using 768 dimensions for nomic-embed-text model
+ALTER TABLE question_bank
+ADD COLUMN IF NOT EXISTS embedding vector(768);
+
+-- Add embedding_updated_at timestamp
+ALTER TABLE question_bank
+ADD COLUMN IF NOT EXISTS embedding_updated_at TIMESTAMPTZ;
+
+-- Create index for fast semantic search (IVFFlat for >10k rows)
+CREATE INDEX IF NOT EXISTS idx_question_embeddings 
+ON question_bank 
+USING ivfflat (embedding vector_cosine_ops) 
+WITH (lists = 100);
+
+-- Create index for filtering by embedding status
+CREATE INDEX IF NOT EXISTS idx_question_embedding_updated 
+ON question_bank (embedding_updated_at);
+
+-- Function to calculate cosine similarity between two embeddings
+CREATE OR REPLACE FUNCTION question_similarity(
+    q1_id UUID,
+    q2_id UUID
+)
+RETURNS REAL AS $$
+BEGIN
+    RETURN (
+        SELECT qb1.embedding <=> qb2.embedding
+        FROM question_bank qb1, question_bank qb2
+        WHERE qb1.id = q1_id AND qb2.id = q2_id
+    );
+END;
+$$ LANGUAGE plpgsql STABLE;
+
+-- Function to find similar questions (for duplicate detection)
+CREATE OR REPLACE FUNCTION find_similar_questions(
+    p_question_id UUID,
+    p_threshold REAL DEFAULT 0.85,
+    p_limit INTEGER DEFAULT 10
+)
+RETURNS TABLE (
+    id UUID,
+    question_text TEXT,
+    similarity REAL,
+    question_type question_bank_type
+) AS $$
+BEGIN
+    RETURN QUERY
+    SELECT 
+        qb.id,
+        qb.question_text,
+        1 - (qb.embedding <=> (SELECT embedding FROM question_bank WHERE id = p_question_id)) AS similarity,
+        qb.question_type
+    FROM question_bank qb
+    WHERE qb.id != p_question_id
+      AND qb.organization_id = (SELECT organization_id FROM question_bank WHERE id = p_question_id)
+      AND qb.embedding IS NOT NULL
+    ORDER BY qb.embedding <=> (SELECT embedding FROM question_bank WHERE id = p_question_id)
+    LIMIT p_limit;
+END;
+$$ LANGUAGE plpgsql STABLE;
+
+-- Function to search questions by semantic similarity
+CREATE OR REPLACE FUNCTION search_questions_semantic(
+    p_organization_id UUID,
+    p_query_embedding vector(768),
+    p_limit INTEGER DEFAULT 20,
+    p_threshold DOUBLE PRECISION DEFAULT 0.5
+)
+RETURNS TABLE (
+    id UUID,
+    question_text TEXT,
+    question_type question_bank_type,
+    similarity DOUBLE PRECISION,
+    tags TEXT[],
+    difficulty VARCHAR,
+    points INTEGER
+) AS $$
+BEGIN
+    RETURN QUERY
+    SELECT 
+        qb.id,
+        qb.question_text,
+        qb.question_type,
+        (1 - (qb.embedding <=> p_query_embedding))::DOUBLE PRECISION AS similarity,
+        qb.tags,
+        qb.difficulty,
+        qb.points
+    FROM question_bank qb
+    WHERE qb.organization_id = p_organization_id
+      AND qb.embedding IS NOT NULL
+      AND (1 - (qb.embedding <=> p_query_embedding))::DOUBLE PRECISION >= p_threshold
+    ORDER BY qb.embedding <=> p_query_embedding
+    LIMIT p_limit;
+END;
+$$ LANGUAGE plpgsql STABLE;
+
+-- Function to get diverse questions covering multiple topics
+-- Uses Maximal Marginal Relevance (MMR) to balance relevance and diversity
+CREATE OR REPLACE FUNCTION get_diverse_questions(
+    p_organization_id UUID,
+    p_query_embedding vector(768),
+    p_limit INTEGER DEFAULT 10,
+    p_lambda DOUBLE PRECISION DEFAULT 0.7 -- 0 = max diversity, 1 = max relevance
+)
+RETURNS TABLE (
+    id UUID,
+    question_text TEXT,
+    question_type question_bank_type,
+    similarity DOUBLE PRECISION
+) AS $$
+DECLARE
+    selected_ids UUID[] := ARRAY[]::UUID[];
+    candidate_id UUID;
+    best_score REAL;
+    current_score REAL;
+    diversity_score REAL;
+    relevance_score REAL;
+BEGIN
+    -- Simple MMR implementation: iteratively select questions
+    -- that are relevant but dissimilar to already selected ones
+    FOR i IN 1..p_limit LOOP
+        SELECT qb.id INTO candidate_id
+        FROM question_bank qb
+        WHERE qb.organization_id = p_organization_id
+          AND qb.id != ALL(selected_ids)
+          AND qb.embedding IS NOT NULL
+        ORDER BY 
+            (1 - (qb.embedding <=> p_query_embedding)) * p_lambda - 
+            (COALESCE((
+                SELECT MAX(1 - (qb.embedding <=> qb2.embedding))
+                FROM unnest(selected_ids) AS sid
+                JOIN question_bank qb2 ON qb2.id = sid
+            ), 0)) * (1 - p_lambda)
+        DESC
+        LIMIT 1;
+        
+        EXIT WHEN candidate_id IS NULL;
+        
+        selected_ids := array_append(selected_ids, candidate_id);
+    END LOOP;
+    
+    RETURN QUERY
+    SELECT 
+        qb.id,
+        qb.question_text,
+        qb.question_type,
+        1 - (qb.embedding <=> p_query_embedding) AS similarity
+    FROM question_bank qb
+    WHERE qb.id = ANY(selected_ids)
+    ORDER BY similarity DESC;
+END;
+$$ LANGUAGE plpgsql STABLE;
+
+-- Comments
+COMMENT ON COLUMN question_bank.embedding IS 'Semantic embedding vector for similarity search (nomic-embed-text, 384 dimensions)';
+COMMENT ON COLUMN question_bank.embedding_updated_at IS 'Timestamp when embedding was last generated';
+COMMENT ON FUNCTION question_similarity IS 'Calculate cosine similarity between two questions';
+COMMENT ON FUNCTION find_similar_questions IS 'Find questions similar to a given question (for duplicate detection)';
+COMMENT ON FUNCTION search_questions_semantic IS 'Search questions by semantic similarity using embedding vector';
+COMMENT ON FUNCTION get_diverse_questions IS 'Get diverse questions using Maximal Marginal Relevance (MMR)';