feat: implementing embedding AI
This commit is contained in:
@@ -0,0 +1,106 @@
|
||||
-- MySQL Courses Integration
|
||||
-- Store imported course and study plan data from external MySQL database
|
||||
-- Used for test template creation with automatic level/course_type detection
|
||||
|
||||
-- Study plans imported from the external MySQL database.
-- One row per MySQL study plan, owned by an organization; rows are removed
-- automatically when the owning organization is deleted (ON DELETE CASCADE).
-- IF NOT EXISTS keeps this migration re-runnable, matching mysql_courses.
CREATE TABLE IF NOT EXISTS mysql_study_plans (
    id SERIAL PRIMARY KEY,
    mysql_id INTEGER NOT NULL UNIQUE, -- idPlanDeEstudios from MySQL
    organization_id UUID NOT NULL REFERENCES organizations(id) ON DELETE CASCADE,

    name VARCHAR(255) NOT NULL, -- Nombre from MySQL

    -- Course type detection
    course_type VARCHAR(20) NOT NULL DEFAULT 'regular', -- 'regular' (40h) or 'intensive' (80h)

    -- Metadata
    is_active BOOLEAN NOT NULL DEFAULT true,
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),

    -- NOTE(review): mysql_id is already UNIQUE on its own, so this composite
    -- constraint cannot reject any additional rows. It is kept because it may
    -- be used as an ON CONFLICT target by the importer -- confirm before removing.
    UNIQUE(organization_id, mysql_id)
);
|
||||
|
||||
-- Courses imported from the external MySQL database.
-- Each course belongs to one organization and one imported study plan; deleting
-- either parent removes the course (ON DELETE CASCADE on both foreign keys).
CREATE TABLE IF NOT EXISTS mysql_courses (
    id                SERIAL                   PRIMARY KEY,
    mysql_id          INT                      NOT NULL UNIQUE,  -- idCursos from MySQL
    organization_id   UUID                     NOT NULL REFERENCES organizations (id) ON DELETE CASCADE,
    study_plan_id     INT                      NOT NULL REFERENCES mysql_study_plans (id) ON DELETE CASCADE,

    -- Values copied from the MySQL source row
    name              CHARACTER VARYING(255)   NOT NULL,         -- NombreCurso from MySQL
    level             INT,                                       -- NivelCurso from MySQL (1-12+)
    duracion          INT,                                       -- Duracion from MySQL (40h or 80h)

    -- Derived values stored alongside the imported ones
    course_type       CHARACTER VARYING(20)    NOT NULL DEFAULT 'regular',  -- 'regular' (40h) or 'intensive' (80h)
    level_calculated  CHARACTER VARYING(20),                     -- From NivelCurso: beginner, beginner_1, etc.

    -- Row bookkeeping
    is_active         BOOLEAN                  NOT NULL DEFAULT TRUE,
    created_at        TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW(),
    updated_at        TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW(),

    UNIQUE (organization_id, mysql_id)
);
|
||||
|
||||
-- Indexes on the foreign-key columns used to look up courses by study plan
-- and to filter courses / study plans by organization.
-- IF NOT EXISTS makes the statements safe to re-run, consistent with the
-- other index definitions in these migrations.
CREATE INDEX IF NOT EXISTS idx_mysql_courses_study_plan ON mysql_courses(study_plan_id);
CREATE INDEX IF NOT EXISTS idx_mysql_courses_org ON mysql_courses(organization_id);
CREATE INDEX IF NOT EXISTS idx_mysql_plans_org ON mysql_study_plans(organization_id);
|
||||
|
||||
-- Row-level trigger function: stamps the row being written with the current
-- transaction timestamp in its updated_at column and lets the write proceed.
CREATE OR REPLACE FUNCTION update_mysql_integration_updated_at()
RETURNS TRIGGER
LANGUAGE plpgsql
AS $func$
BEGIN
    NEW.updated_at := NOW();
    RETURN NEW;
END;
$func$;
|
||||
|
||||
-- Triggers that refresh updated_at on every row update of the imported tables.
-- CREATE TRIGGER has no IF NOT EXISTS form, so each trigger is dropped first
-- to keep the migration re-runnable.
DROP TRIGGER IF EXISTS update_mysql_study_plans_updated_at ON mysql_study_plans;
CREATE TRIGGER update_mysql_study_plans_updated_at
    BEFORE UPDATE ON mysql_study_plans
    FOR EACH ROW
    EXECUTE FUNCTION update_mysql_integration_updated_at();

DROP TRIGGER IF EXISTS update_mysql_courses_updated_at ON mysql_courses;
CREATE TRIGGER update_mysql_courses_updated_at
    BEFORE UPDATE ON mysql_courses
    FOR EACH ROW
    EXECUTE FUNCTION update_mysql_integration_updated_at();
|
||||
|
||||
-- Maps a numeric course level (NivelCurso) to a level label.
--   nivel: numeric level; may be NULL.
-- Returns:
--   NULL    -> 'intermediate' (fallback when the level is unknown)
--   <= 2    -> 'beginner'
--   3..4    -> 'beginner_1'
--   5..6    -> 'beginner_2'
--   7..8    -> 'intermediate'
--   9..10   -> 'intermediate_1'
--   11..12  -> 'intermediate_2'
--   > 12    -> 'advanced'
-- The result depends only on the argument, so the function is declared
-- IMMUTABLE (usable in index expressions and foldable by the planner).
CREATE OR REPLACE FUNCTION calculate_course_level(nivel INTEGER)
RETURNS TEXT AS $$
BEGIN
    RETURN CASE
        WHEN nivel IS NULL THEN 'intermediate'
        WHEN nivel <= 2    THEN 'beginner'
        WHEN nivel <= 4    THEN 'beginner_1'
        WHEN nivel <= 6    THEN 'beginner_2'
        WHEN nivel <= 8    THEN 'intermediate'
        WHEN nivel <= 10   THEN 'intermediate_1'
        WHEN nivel <= 12   THEN 'intermediate_2'
        ELSE 'advanced'
    END;
END;
$$ LANGUAGE plpgsql IMMUTABLE;
|
||||
|
||||
-- Derives the course type from a study plan name.
--   plan_name: plan name; matched case-insensitively; may be NULL.
-- Returns 'intensive' when the name contains 'intensive' or 'intensivo',
-- otherwise 'regular' (including when plan_name is NULL, because the
-- comparison then yields NULL and falls through to ELSE).
-- The result depends only on the argument, so the function is declared IMMUTABLE.
CREATE OR REPLACE FUNCTION calculate_course_type(plan_name TEXT)
RETURNS TEXT AS $$
BEGIN
    RETURN CASE
        WHEN LOWER(plan_name) LIKE '%intensive%'
          OR LOWER(plan_name) LIKE '%intensivo%' THEN 'intensive'
        ELSE 'regular'
    END;
END;
$$ LANGUAGE plpgsql IMMUTABLE;
|
||||
@@ -0,0 +1,82 @@
|
||||
-- Fix test_templates to use mysql_course_id reference instead of level/course_type strings
|
||||
-- This ensures data consistency and leverages the imported MySQL course data in PostgreSQL
|
||||
|
||||
-- Link test_templates to an imported course and relax the legacy columns.
--   * mysql_course_id references mysql_courses.mysql_id (declared UNIQUE on
--     that table, which is what allows it to be a foreign-key target) and is
--     reset to NULL when the referenced course row is deleted.
--   * level / course_type become nullable so a template can rely on the
--     course reference alone.
-- ADD COLUMN IF NOT EXISTS keeps the statement re-runnable, consistent with
-- the IF NOT EXISTS index below; DROP NOT NULL is already idempotent.
ALTER TABLE test_templates
    ADD COLUMN IF NOT EXISTS mysql_course_id INTEGER REFERENCES mysql_courses(mysql_id) ON DELETE SET NULL,
    ALTER COLUMN level DROP NOT NULL,
    ALTER COLUMN course_type DROP NOT NULL;

-- Index for looking up templates by their linked course
CREATE INDEX IF NOT EXISTS idx_test_templates_mysql_course ON test_templates(mysql_course_id);

-- Column documentation stored in the catalog
COMMENT ON COLUMN test_templates.mysql_course_id IS 'Reference to imported MySQL course (mysql_courses.mysql_id). Preferred over level/course_type fields.';
|
||||
|
||||
-- Backward-compatibility view: every test_templates column plus the name,
-- calculated level, course type and duration of the linked imported course.
-- Templates without a matching course keep their row; the course columns are
-- NULL for them (outer join).
CREATE OR REPLACE VIEW test_templates_with_course_info AS
SELECT
    tpl.*,
    crs.name             AS course_name,
    crs.level_calculated AS level_calculated,
    crs.course_type      AS calculated_course_type,
    crs.duracion         AS course_duration
FROM test_templates AS tpl
LEFT OUTER JOIN mysql_courses AS crs
    ON crs.mysql_id = tpl.mysql_course_id;
|
||||
|
||||
-- Returns one test template together with data from its linked imported course.
--   p_template_id: test_templates.id of the template to fetch.
-- Returns zero rows when no template has that id, otherwise exactly one row
-- (mysql_courses.mysql_id is unique, so the join cannot multiply rows).
-- The course-derived columns (course_name, level_calculated,
-- calculated_course_type) are NULL when the template has no matching course.
--
-- RETURN QUERY requires each query column to have exactly the declared result
-- type. mysql_courses.level_calculated and mysql_courses.course_type are
-- VARCHAR(20) while the result columns are TEXT, so they are cast explicitly;
-- without the casts the call fails with "structure of query does not match
-- function result type".
-- NOTE(review): the test_templates column types are not visible in this
-- migration -- confirm they match the declared result types as well.
CREATE OR REPLACE FUNCTION get_test_template_with_course(p_template_id UUID)
RETURNS TABLE (
    id UUID,
    organization_id UUID,
    name VARCHAR,
    description TEXT,
    mysql_course_id INTEGER,
    course_name VARCHAR,
    level course_level,
    level_calculated TEXT,
    course_type course_type,
    calculated_course_type TEXT,
    test_type test_type,
    duration_minutes INTEGER,
    passing_score INTEGER,
    total_points INTEGER,
    instructions TEXT,
    template_data JSONB,
    tags TEXT[],
    is_active BOOLEAN,
    usage_count INTEGER,
    created_by UUID,
    created_at TIMESTAMPTZ,
    updated_at TIMESTAMPTZ
) AS $$
BEGIN
    -- Every column reference is table-qualified: the result column names
    -- above are also PL/pgSQL variables inside this body.
    RETURN QUERY
    SELECT
        tt.id,
        tt.organization_id,
        tt.name,
        tt.description,
        tt.mysql_course_id,
        mc.name,
        tt.level,
        mc.level_calculated::TEXT,
        tt.course_type,
        mc.course_type::TEXT,
        tt.test_type,
        tt.duration_minutes,
        tt.passing_score,
        tt.total_points,
        tt.instructions,
        tt.template_data,
        tt.tags,
        tt.is_active,
        tt.usage_count,
        tt.created_by,
        tt.created_at,
        tt.updated_at
    FROM test_templates AS tt
    LEFT JOIN mysql_courses AS mc
        ON tt.mysql_course_id = mc.mysql_id
    WHERE tt.id = p_template_id;
END;
$$ LANGUAGE plpgsql STABLE;
|
||||
@@ -0,0 +1,167 @@
|
||||
-- PGVector Embeddings Integration
|
||||
-- Enables semantic search for question bank and RAG generation
|
||||
|
||||
-- pgvector provides the vector type, the <=> operator and the ivfflat
-- index access method used below.
CREATE EXTENSION IF NOT EXISTS vector;

-- Embedding storage on question_bank:
--   embedding            768-dimensional vector (nomic-embed-text model)
--   embedding_updated_at when the embedding was last written
ALTER TABLE question_bank
    ADD COLUMN IF NOT EXISTS embedding vector(768),
    ADD COLUMN IF NOT EXISTS embedding_updated_at TIMESTAMPTZ;

-- Approximate nearest-neighbour index for cosine-distance search
-- (IVFFlat, intended for >10k rows).
-- NOTE(review): pgvector recommends building IVFFlat indexes once the table
-- already holds data; if embeddings are backfilled after this migration,
-- consider a REINDEX afterwards -- confirm against the rollout plan.
CREATE INDEX IF NOT EXISTS idx_question_embeddings
    ON question_bank
    USING ivfflat (embedding vector_cosine_ops)
    WITH (lists = 100);

-- Plain index on the embedding timestamp, for filtering by embedding status
CREATE INDEX IF NOT EXISTS idx_question_embedding_updated
    ON question_bank (embedding_updated_at);
|
||||
|
||||
-- Cosine similarity between the embeddings of two questions.
--   q1_id, q2_id: question_bank.id of the two questions to compare.
-- Returns 1 - cosine distance, the same similarity definition used by the
-- other search functions in this migration. The pgvector <=> operator yields
-- cosine *distance*, so returning it directly (as before) contradicted the
-- function's name and catalog comment.
-- Returns NULL when either question does not exist or has no embedding.
CREATE OR REPLACE FUNCTION question_similarity(
    q1_id UUID,
    q2_id UUID
)
RETURNS REAL AS $$
BEGIN
    RETURN (
        SELECT 1 - (qb1.embedding <=> qb2.embedding)
        FROM question_bank AS qb1
        CROSS JOIN question_bank AS qb2  -- both sides are pinned to one row by id
        WHERE qb1.id = q1_id
          AND qb2.id = q2_id
    );
END;
$$ LANGUAGE plpgsql STABLE;
|
||||
|
||||
-- Finds questions similar to a given question (for duplicate detection).
--   p_question_id: question_bank.id of the reference question.
--   p_threshold:   minimum cosine similarity (1 - cosine distance) a match
--                  must reach; NULL disables the filter.
--   p_limit:       maximum number of rows returned.
-- Returns the most similar questions of the same organization, closest first,
-- excluding the reference question and questions without an embedding.
-- Returns no rows when the reference question does not exist or has no
-- embedding.
--
-- Fixes relative to the previous version:
--   * the unqualified "id" in the sub-selects collided with the result
--     column "id" (a PL/pgSQL variable here) and raised "column reference
--     is ambiguous" at call time;
--   * the similarity expression is double precision but the result column is
--     REAL, which RETURN QUERY rejects -- it is now cast;
--   * p_threshold was accepted but never applied.
CREATE OR REPLACE FUNCTION find_similar_questions(
    p_question_id UUID,
    p_threshold REAL DEFAULT 0.85,
    p_limit INTEGER DEFAULT 10
)
RETURNS TABLE (
    id UUID,
    question_text TEXT,
    similarity REAL,
    question_type question_bank_type
) AS $$
DECLARE
    src_embedding question_bank.embedding%TYPE;
    src_organization_id question_bank.organization_id%TYPE;
BEGIN
    -- Load the reference row once instead of repeating the sub-select three
    -- times; plain variables also keep the ORDER BY below index-friendly.
    SELECT src.embedding, src.organization_id
    INTO src_embedding, src_organization_id
    FROM question_bank AS src
    WHERE src.id = p_question_id;

    IF src_embedding IS NULL THEN
        RETURN;  -- unknown question or no embedding yet: nothing to compare
    END IF;

    RETURN QUERY
    SELECT
        qb.id,
        qb.question_text,
        (1 - (qb.embedding <=> src_embedding))::REAL AS similarity,
        qb.question_type
    FROM question_bank AS qb
    WHERE qb.id <> p_question_id
      AND qb.organization_id = src_organization_id
      AND qb.embedding IS NOT NULL
      AND (p_threshold IS NULL
           OR 1 - (qb.embedding <=> src_embedding) >= p_threshold)
    ORDER BY qb.embedding <=> src_embedding
    LIMIT p_limit;
END;
$$ LANGUAGE plpgsql STABLE;
|
||||
|
||||
-- Semantic search over one organization's question bank.
--   p_organization_id: organization whose questions are searched.
--   p_query_embedding: embedding of the search query.
--   p_limit:           maximum number of rows returned.
--   p_threshold:       minimum similarity (1 - cosine distance) to include.
-- Returns matching questions that have an embedding, closest first.
CREATE OR REPLACE FUNCTION search_questions_semantic(
    p_organization_id UUID,
    p_query_embedding vector(768),
    p_limit INTEGER DEFAULT 20,
    p_threshold DOUBLE PRECISION DEFAULT 0.5
)
RETURNS TABLE (
    id UUID,
    question_text TEXT,
    question_type question_bank_type,
    similarity DOUBLE PRECISION,
    tags TEXT[],
    difficulty VARCHAR,
    points INTEGER
)
LANGUAGE plpgsql
STABLE
AS $func$
BEGIN
    RETURN QUERY
    SELECT
        q.id,
        q.question_text,
        q.question_type,
        CAST(1 - (q.embedding <=> p_query_embedding) AS DOUBLE PRECISION) AS similarity,
        q.tags,
        q.difficulty,
        q.points
    FROM question_bank AS q
    WHERE q.embedding IS NOT NULL
      AND q.organization_id = p_organization_id
      AND CAST(1 - (q.embedding <=> p_query_embedding) AS DOUBLE PRECISION) >= p_threshold
    ORDER BY q.embedding <=> p_query_embedding ASC  -- smallest distance first
    LIMIT p_limit;
END;
$func$;
|
||||
|
||||
-- Selects questions that are relevant to a query yet dissimilar to each other,
-- using a simple greedy Maximal Marginal Relevance (MMR) loop.
--   p_organization_id: organization whose questions are considered.
--   p_query_embedding: embedding of the query.
--   p_limit:           maximum number of questions to select.
--   p_lambda:          relevance/diversity trade-off;
--                      0 = max diversity, 1 = max relevance.
-- Returns the selected questions with their similarity to the query
-- (1 - cosine distance), most similar first.
-- Each iteration re-scores every remaining candidate of the organization, so
-- the work grows with p_limit times the number of embedded questions.
CREATE OR REPLACE FUNCTION get_diverse_questions(
    p_organization_id UUID,
    p_query_embedding vector(768),
    p_limit INTEGER DEFAULT 10,
    p_lambda DOUBLE PRECISION DEFAULT 0.7 -- 0 = max diversity, 1 = max relevance
)
RETURNS TABLE (
    id UUID,
    question_text TEXT,
    question_type question_bank_type,
    similarity DOUBLE PRECISION
) AS $$
DECLARE
    selected_ids UUID[] := ARRAY[]::UUID[];  -- ids picked so far
    candidate_id UUID;                       -- best candidate of the current round
BEGIN
    FOR i IN 1..p_limit LOOP
        -- MMR score = lambda * relevance - (1 - lambda) * redundancy, where
        -- redundancy is the highest similarity to any already selected
        -- question (0 while nothing has been selected).
        SELECT qb.id INTO candidate_id
        FROM question_bank AS qb
        WHERE qb.organization_id = p_organization_id
          AND qb.id != ALL(selected_ids)
          AND qb.embedding IS NOT NULL
        ORDER BY
            (1 - (qb.embedding <=> p_query_embedding)) * p_lambda -
            (COALESCE((
                SELECT MAX(1 - (qb.embedding <=> qb2.embedding))
                FROM unnest(selected_ids) AS sid
                INNER JOIN question_bank AS qb2 ON qb2.id = sid
            ), 0)) * (1 - p_lambda)
            DESC
        LIMIT 1;

        -- SELECT INTO leaves NULL when no candidate remains
        EXIT WHEN candidate_id IS NULL;

        selected_ids := array_append(selected_ids, candidate_id);
    END LOOP;

    RETURN QUERY
    SELECT
        qb.id,
        qb.question_text,
        qb.question_type,
        1 - (qb.embedding <=> p_query_embedding) AS similarity
    FROM question_bank AS qb
    WHERE qb.id = ANY(selected_ids)
    -- Ascending distance is descending similarity; ordering by the expression
    -- avoids a bare "similarity", which is also a result-column variable here.
    ORDER BY qb.embedding <=> p_query_embedding;
END;
$$ LANGUAGE plpgsql STABLE;
|
||||
|
||||
-- Catalog comments for the embedding columns and search functions.
-- The dimension stated for question_bank.embedding matches the vector(768)
-- column definition (it previously said 384).
COMMENT ON COLUMN question_bank.embedding IS 'Semantic embedding vector for similarity search (nomic-embed-text, 768 dimensions)';
COMMENT ON COLUMN question_bank.embedding_updated_at IS 'Timestamp when embedding was last generated';
COMMENT ON FUNCTION question_similarity IS 'Calculate cosine similarity between two questions';
COMMENT ON FUNCTION find_similar_questions IS 'Find questions similar to a given question (for duplicate detection)';
COMMENT ON FUNCTION search_questions_semantic IS 'Search questions by semantic similarity using embedding vector';
COMMENT ON FUNCTION get_diverse_questions IS 'Get diverse questions using Maximal Marginal Relevance (MMR)';
|
||||
Reference in New Issue
Block a user