feat: Fix tokens Revenue stream
This commit is contained in:
@@ -0,0 +1,167 @@
|
||||
-- PGVector Embeddings Integration
|
||||
-- Enables semantic search for question bank and RAG generation
|
||||
|
||||
-- Enable pgvector extension
|
||||
CREATE EXTENSION IF NOT EXISTS vector;
|
||||
|
||||
-- Add embedding column to question_bank table
|
||||
-- Using 768 dimensions for nomic-embed-text model
|
||||
ALTER TABLE question_bank
|
||||
ADD COLUMN IF NOT EXISTS embedding vector(768);
|
||||
|
||||
-- Add embedding_updated_at timestamp
|
||||
ALTER TABLE question_bank
|
||||
ADD COLUMN IF NOT EXISTS embedding_updated_at TIMESTAMPTZ;
|
||||
|
||||
-- Create index for fast semantic search (IVFFlat for >10k rows)
|
||||
CREATE INDEX IF NOT EXISTS idx_question_embeddings
|
||||
ON question_bank
|
||||
USING ivfflat (embedding vector_cosine_ops)
|
||||
WITH (lists = 100);
|
||||
|
||||
-- Create index for filtering by embedding status
|
||||
CREATE INDEX IF NOT EXISTS idx_question_embedding_updated
|
||||
ON question_bank (embedding_updated_at);
|
||||
|
||||
-- Function to calculate cosine similarity between two embeddings
|
||||
CREATE OR REPLACE FUNCTION question_similarity(
|
||||
q1_id UUID,
|
||||
q2_id UUID
|
||||
)
|
||||
RETURNS REAL AS $$
|
||||
BEGIN
|
||||
RETURN (
|
||||
SELECT qb1.embedding <=> qb2.embedding
|
||||
FROM question_bank qb1, question_bank qb2
|
||||
WHERE qb1.id = q1_id AND qb2.id = q2_id
|
||||
);
|
||||
END;
|
||||
$$ LANGUAGE plpgsql STABLE;
|
||||
|
||||
-- Function to find similar questions (for duplicate detection)
|
||||
CREATE OR REPLACE FUNCTION find_similar_questions(
|
||||
p_question_id UUID,
|
||||
p_threshold REAL DEFAULT 0.85,
|
||||
p_limit INTEGER DEFAULT 10
|
||||
)
|
||||
RETURNS TABLE (
|
||||
id UUID,
|
||||
question_text TEXT,
|
||||
similarity REAL,
|
||||
question_type question_bank_type
|
||||
) AS $$
|
||||
BEGIN
|
||||
RETURN QUERY
|
||||
SELECT
|
||||
qb.id,
|
||||
qb.question_text,
|
||||
1 - (qb.embedding <=> (SELECT embedding FROM question_bank WHERE id = p_question_id)) AS similarity,
|
||||
qb.question_type
|
||||
FROM question_bank qb
|
||||
WHERE qb.id != p_question_id
|
||||
AND qb.organization_id = (SELECT organization_id FROM question_bank WHERE id = p_question_id)
|
||||
AND qb.embedding IS NOT NULL
|
||||
ORDER BY qb.embedding <=> (SELECT embedding FROM question_bank WHERE id = p_question_id)
|
||||
LIMIT p_limit;
|
||||
END;
|
||||
$$ LANGUAGE plpgsql STABLE;
|
||||
|
||||
-- Function to search questions by semantic similarity
|
||||
CREATE OR REPLACE FUNCTION search_questions_semantic(
|
||||
p_organization_id UUID,
|
||||
p_query_embedding vector(768),
|
||||
p_limit INTEGER DEFAULT 20,
|
||||
p_threshold DOUBLE PRECISION DEFAULT 0.5
|
||||
)
|
||||
RETURNS TABLE (
|
||||
id UUID,
|
||||
question_text TEXT,
|
||||
question_type question_bank_type,
|
||||
similarity DOUBLE PRECISION,
|
||||
tags TEXT[],
|
||||
difficulty VARCHAR,
|
||||
points INTEGER
|
||||
) AS $$
|
||||
BEGIN
|
||||
RETURN QUERY
|
||||
SELECT
|
||||
qb.id,
|
||||
qb.question_text,
|
||||
qb.question_type,
|
||||
(1 - (qb.embedding <=> p_query_embedding))::DOUBLE PRECISION AS similarity,
|
||||
qb.tags,
|
||||
qb.difficulty,
|
||||
qb.points
|
||||
FROM question_bank qb
|
||||
WHERE qb.organization_id = p_organization_id
|
||||
AND qb.embedding IS NOT NULL
|
||||
AND (1 - (qb.embedding <=> p_query_embedding))::DOUBLE PRECISION >= p_threshold
|
||||
ORDER BY qb.embedding <=> p_query_embedding
|
||||
LIMIT p_limit;
|
||||
END;
|
||||
$$ LANGUAGE plpgsql STABLE;
|
||||
|
||||
-- Function to get diverse questions covering multiple topics
|
||||
-- Uses Maximal Marginal Relevance (MMR) to balance relevance and diversity
|
||||
CREATE OR REPLACE FUNCTION get_diverse_questions(
|
||||
p_organization_id UUID,
|
||||
p_query_embedding vector(768),
|
||||
p_limit INTEGER DEFAULT 10,
|
||||
p_lambda DOUBLE PRECISION DEFAULT 0.7 -- 0 = max diversity, 1 = max relevance
|
||||
)
|
||||
RETURNS TABLE (
|
||||
id UUID,
|
||||
question_text TEXT,
|
||||
question_type question_bank_type,
|
||||
similarity DOUBLE PRECISION
|
||||
) AS $$
|
||||
DECLARE
|
||||
selected_ids UUID[] := ARRAY[]::UUID[];
|
||||
candidate_id UUID;
|
||||
best_score REAL;
|
||||
current_score REAL;
|
||||
diversity_score REAL;
|
||||
relevance_score REAL;
|
||||
BEGIN
|
||||
-- Simple MMR implementation: iteratively select questions
|
||||
-- that are relevant but dissimilar to already selected ones
|
||||
FOR i IN 1..p_limit LOOP
|
||||
SELECT qb.id INTO candidate_id
|
||||
FROM question_bank qb
|
||||
WHERE qb.organization_id = p_organization_id
|
||||
AND qb.id != ALL(selected_ids)
|
||||
AND qb.embedding IS NOT NULL
|
||||
ORDER BY
|
||||
(1 - (qb.embedding <=> p_query_embedding)) * p_lambda -
|
||||
(COALESCE((
|
||||
SELECT MAX(1 - (qb.embedding <=> qb2.embedding))
|
||||
FROM unnest(selected_ids) AS sid
|
||||
JOIN question_bank qb2 ON qb2.id = sid
|
||||
), 0)) * (1 - p_lambda)
|
||||
DESC
|
||||
LIMIT 1;
|
||||
|
||||
EXIT WHEN candidate_id IS NULL;
|
||||
|
||||
selected_ids := array_append(selected_ids, candidate_id);
|
||||
END LOOP;
|
||||
|
||||
RETURN QUERY
|
||||
SELECT
|
||||
qb.id,
|
||||
qb.question_text,
|
||||
qb.question_type,
|
||||
1 - (qb.embedding <=> p_query_embedding) AS similarity
|
||||
FROM question_bank qb
|
||||
WHERE qb.id = ANY(selected_ids)
|
||||
ORDER BY similarity DESC;
|
||||
END;
|
||||
$$ LANGUAGE plpgsql STABLE;
|
||||
|
||||
-- Comments
|
||||
COMMENT ON COLUMN question_bank.embedding IS 'Semantic embedding vector for similarity search (nomic-embed-text, 384 dimensions)';
|
||||
COMMENT ON COLUMN question_bank.embedding_updated_at IS 'Timestamp when embedding was last generated';
|
||||
COMMENT ON FUNCTION question_similarity IS 'Calculate cosine similarity between two questions';
|
||||
COMMENT ON FUNCTION find_similar_questions IS 'Find questions similar to a given question (for duplicate detection)';
|
||||
COMMENT ON FUNCTION search_questions_semantic IS 'Search questions by semantic similarity using embedding vector';
|
||||
COMMENT ON FUNCTION get_diverse_questions IS 'Get diverse questions using Maximal Marginal Relevance (MMR)';
|
||||
Reference in New Issue
Block a user