feat: implementing embedding AI

This commit is contained in:
2026-03-18 17:15:39 -03:00
parent e8cdf61468
commit 64d3d5be91
32 changed files with 3568 additions and 174 deletions
@@ -0,0 +1,106 @@
-- MySQL Courses Integration
-- Store imported course and study plan data from external MySQL database
-- Used for test template creation with automatic level/course_type detection
-- Study plans imported from the external MySQL database.
-- One row per MySQL study plan, owned by an organization; rows are removed
-- automatically when the owning organization is deleted (ON DELETE CASCADE).
-- IF NOT EXISTS keeps this statement re-runnable, like the mysql_courses table.
CREATE TABLE IF NOT EXISTS mysql_study_plans (
    id SERIAL PRIMARY KEY,
    mysql_id INTEGER NOT NULL UNIQUE, -- idPlanDeEstudios from MySQL
    organization_id UUID NOT NULL REFERENCES organizations(id) ON DELETE CASCADE,
    name VARCHAR(255) NOT NULL, -- Nombre from MySQL
    -- Course type detection
    course_type VARCHAR(20) NOT NULL DEFAULT 'regular', -- 'regular' (40h) or 'intensive' (80h)
    -- Metadata
    is_active BOOLEAN NOT NULL DEFAULT true,
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    -- NOTE(review): mysql_id is already globally UNIQUE above, so this composite
    -- key adds no further restriction. Kept because importer upserts may name it
    -- as their conflict target -- confirm before dropping either constraint.
    UNIQUE (organization_id, mysql_id)
);
-- Courses imported from the external MySQL database.
-- Each course belongs to one organization and one imported study plan; deleting
-- either parent removes the course (ON DELETE CASCADE on both foreign keys).
CREATE TABLE IF NOT EXISTS mysql_courses (
    id               SERIAL,
    mysql_id         INTEGER      NOT NULL,                    -- idCursos from MySQL
    organization_id  UUID         NOT NULL,
    study_plan_id    INTEGER      NOT NULL,
    name             VARCHAR(255) NOT NULL,                    -- NombreCurso from MySQL
    level            INTEGER,                                  -- NivelCurso from MySQL (1-12+)
    duracion         INTEGER,                                  -- Duracion from MySQL (40h or 80h)
    -- Derived fields
    course_type      VARCHAR(20)  NOT NULL DEFAULT 'regular',  -- 'regular' (40h) or 'intensive' (80h)
    level_calculated VARCHAR(20),                              -- Derived from NivelCurso: beginner, beginner_1, etc.
    -- Bookkeeping
    is_active        BOOLEAN      NOT NULL DEFAULT TRUE,
    created_at       TIMESTAMPTZ  NOT NULL DEFAULT NOW(),
    updated_at       TIMESTAMPTZ  NOT NULL DEFAULT NOW(),
    -- Keys and references, declared at table level
    PRIMARY KEY (id),
    UNIQUE (mysql_id),
    FOREIGN KEY (organization_id) REFERENCES organizations (id) ON DELETE CASCADE,
    FOREIGN KEY (study_plan_id) REFERENCES mysql_study_plans (id) ON DELETE CASCADE,
    UNIQUE (organization_id, mysql_id)
);
-- Indexes for performance: look up courses by study plan, and courses/plans by
-- organization. IF NOT EXISTS keeps the statements re-runnable, consistent with
-- the other index definitions in these migrations.
-- NOTE(review): both tables also carry UNIQUE (organization_id, mysql_id), whose
-- index leads with organization_id; the two *_org indexes may be redundant --
-- confirm with EXPLAIN before removing.
CREATE INDEX IF NOT EXISTS idx_mysql_courses_study_plan ON mysql_courses (study_plan_id);
CREATE INDEX IF NOT EXISTS idx_mysql_courses_org ON mysql_courses (organization_id);
CREATE INDEX IF NOT EXISTS idx_mysql_plans_org ON mysql_study_plans (organization_id);
-- Trigger function: stamps the row being written with the current timestamp.
-- Takes no arguments; it overwrites NEW.updated_at and hands the modified row
-- back to the executor, so it must be attached as a row-level BEFORE trigger.
CREATE OR REPLACE FUNCTION update_mysql_integration_updated_at()
RETURNS TRIGGER
LANGUAGE plpgsql
AS $func$
BEGIN
    -- Any updated_at value supplied by the statement itself is replaced.
    NEW.updated_at := NOW();
    RETURN NEW;
END;
$func$;
-- Triggers for updated_at: run update_mysql_integration_updated_at() before
-- every row update on both imported tables.
-- Each trigger is dropped first so the migration can be re-run without failing
-- with "trigger already exists" (the function uses CREATE OR REPLACE).
DROP TRIGGER IF EXISTS update_mysql_study_plans_updated_at ON mysql_study_plans;
CREATE TRIGGER update_mysql_study_plans_updated_at
    BEFORE UPDATE ON mysql_study_plans
    FOR EACH ROW
    EXECUTE FUNCTION update_mysql_integration_updated_at();

DROP TRIGGER IF EXISTS update_mysql_courses_updated_at ON mysql_courses;
CREATE TRIGGER update_mysql_courses_updated_at
    BEFORE UPDATE ON mysql_courses
    FOR EACH ROW
    EXECUTE FUNCTION update_mysql_integration_updated_at();
-- Maps a MySQL NivelCurso number to a course level label.
--   nivel: course level number; may be NULL.
-- Returns:
--   NULL        -> 'intermediate' (fallback when the level is unknown)
--   <= 2        -> 'beginner'     (includes 0 and negative values)
--   3-4         -> 'beginner_1'
--   5-6         -> 'beginner_2'
--   7-8         -> 'intermediate'
--   9-10        -> 'intermediate_1'
--   11-12       -> 'intermediate_2'
--   > 12        -> 'advanced'
-- The result depends only on the argument, so the function is declared
-- IMMUTABLE (usable in indexes / generated columns and foldable by the planner).
CREATE OR REPLACE FUNCTION calculate_course_level(nivel INTEGER)
RETURNS TEXT AS $$
    SELECT CASE
        WHEN nivel IS NULL THEN 'intermediate'
        WHEN nivel <= 2 THEN 'beginner'
        WHEN nivel <= 4 THEN 'beginner_1'
        WHEN nivel <= 6 THEN 'beginner_2'
        WHEN nivel <= 8 THEN 'intermediate'
        WHEN nivel <= 10 THEN 'intermediate_1'
        WHEN nivel <= 12 THEN 'intermediate_2'
        ELSE 'advanced'
    END;
$$ LANGUAGE sql IMMUTABLE PARALLEL SAFE;
-- Derives the course type from a study plan name.
--   plan_name: plan name; may be NULL.
-- Returns 'intensive' when the name contains "intensive" or "intensivo"
-- (case-insensitive), otherwise 'regular'. A NULL name yields 'regular'
-- because the LIKE tests evaluate to NULL and fall through to ELSE.
-- The result depends only on the argument, so the function is IMMUTABLE.
CREATE OR REPLACE FUNCTION calculate_course_type(plan_name TEXT)
RETURNS TEXT AS $$
    SELECT CASE
        WHEN LOWER(plan_name) LIKE '%intensive%'
          OR LOWER(plan_name) LIKE '%intensivo%' THEN 'intensive'
        ELSE 'regular'
    END;
$$ LANGUAGE sql IMMUTABLE PARALLEL SAFE;
@@ -0,0 +1,82 @@
-- Fix test_templates to use mysql_course_id reference instead of level/course_type strings
-- This ensures data consistency and leverages the imported MySQL course data in PostgreSQL

-- Add mysql_course_id to test_templates and relax level/course_type to nullable.
-- The foreign key targets mysql_courses.mysql_id (the original MySQL id, declared
-- UNIQUE), not the surrogate mysql_courses.id; deleting a course clears the
-- reference instead of deleting the template (ON DELETE SET NULL).
-- IF NOT EXISTS keeps the column addition re-runnable, like the index below.
ALTER TABLE test_templates
    ADD COLUMN IF NOT EXISTS mysql_course_id INTEGER REFERENCES mysql_courses(mysql_id) ON DELETE SET NULL,
    ALTER COLUMN level DROP NOT NULL,
    ALTER COLUMN course_type DROP NOT NULL;

-- Create index for faster lookups
CREATE INDEX IF NOT EXISTS idx_test_templates_mysql_course ON test_templates(mysql_course_id);

-- Add comment for documentation
COMMENT ON COLUMN test_templates.mysql_course_id IS 'Reference to imported MySQL course (mysql_courses.mysql_id). Preferred over level/course_type fields.';
-- Backward-compatibility view: every test_templates column, followed by the
-- name, calculated level, course type and duration of the linked imported course.
-- Templates without a linked course still appear; their course columns are NULL.
CREATE OR REPLACE VIEW test_templates_with_course_info AS
SELECT
    tpl.*,
    crs.name             AS course_name,
    crs.level_calculated,
    crs.course_type      AS calculated_course_type,
    crs.duracion         AS course_duration
FROM test_templates AS tpl
LEFT OUTER JOIN mysql_courses AS crs
    ON crs.mysql_id = tpl.mysql_course_id;
-- Returns one test template together with the info of its linked imported course.
--   p_template_id: test_templates.id of the template to fetch.
-- Returns zero rows when no template has that id. When the template has no
-- linked course, course_name, level_calculated and calculated_course_type are NULL.
--
-- mysql_courses.level_calculated and mysql_courses.course_type are VARCHAR(20),
-- while the result columns are declared TEXT. RETURN QUERY requires the query's
-- column types to match the declared result types, so both are cast explicitly;
-- without the casts the call fails with "structure of query does not match
-- function result type".
-- NOTE(review): the test_templates column types are not visible here; they are
-- assumed to match the declared result columns -- confirm.
CREATE OR REPLACE FUNCTION get_test_template_with_course(p_template_id UUID)
RETURNS TABLE (
    id UUID,
    organization_id UUID,
    name VARCHAR,
    description TEXT,
    mysql_course_id INTEGER,
    course_name VARCHAR,
    level course_level,
    level_calculated TEXT,
    course_type course_type,
    calculated_course_type TEXT,
    test_type test_type,
    duration_minutes INTEGER,
    passing_score INTEGER,
    total_points INTEGER,
    instructions TEXT,
    template_data JSONB,
    tags TEXT[],
    is_active BOOLEAN,
    usage_count INTEGER,
    created_by UUID,
    created_at TIMESTAMPTZ,
    updated_at TIMESTAMPTZ
) AS $$
BEGIN
    -- Every column is table-qualified: the result column names above are also
    -- PL/pgSQL variables, so bare names would be ambiguous.
    RETURN QUERY
    SELECT
        tt.id,
        tt.organization_id,
        tt.name,
        tt.description,
        tt.mysql_course_id,
        mc.name,
        tt.level,
        mc.level_calculated::TEXT,
        tt.course_type,
        mc.course_type::TEXT,
        tt.test_type,
        tt.duration_minutes,
        tt.passing_score,
        tt.total_points,
        tt.instructions,
        tt.template_data,
        tt.tags,
        tt.is_active,
        tt.usage_count,
        tt.created_by,
        tt.created_at,
        tt.updated_at
    FROM test_templates AS tt
    LEFT JOIN mysql_courses AS mc
        ON tt.mysql_course_id = mc.mysql_id
    WHERE tt.id = p_template_id;
END;
$$ LANGUAGE plpgsql STABLE;
@@ -0,0 +1,167 @@
-- PGVector Embeddings Integration
-- Enables semantic search for question bank and RAG generation
-- The vector column type and its operators come from the pgvector extension.
CREATE EXTENSION IF NOT EXISTS vector;

-- Embedding storage on question_bank:
--   embedding            768-dimensional vector (nomic-embed-text model)
--   embedding_updated_at timestamp of the embedding
ALTER TABLE question_bank
    ADD COLUMN IF NOT EXISTS embedding vector(768),
    ADD COLUMN IF NOT EXISTS embedding_updated_at TIMESTAMPTZ;

-- Approximate nearest-neighbour index for cosine-distance search
-- (IVFFlat, intended for >10k rows).
-- NOTE(review): pgvector recommends building IVFFlat indexes once the table
-- already holds data; the column is new here, so a later REINDEX after
-- embeddings are backfilled may be needed -- confirm.
CREATE INDEX IF NOT EXISTS idx_question_embeddings
    ON question_bank
    USING ivfflat (embedding vector_cosine_ops)
    WITH (lists = 100);

-- Plain index for filtering rows by embedding status.
CREATE INDEX IF NOT EXISTS idx_question_embedding_updated
    ON question_bank (embedding_updated_at);
-- Cosine similarity between the embeddings of two questions.
--   q1_id, q2_id: question_bank.id of the questions to compare.
-- Returns 1 - cosine distance (1 = same direction, 0 = orthogonal), or NULL
-- when either question does not exist or has no embedding.
-- pgvector's <=> operator yields cosine DISTANCE, so it is converted here to
-- match the function's name and the similarity scale used by the other
-- search functions on question_bank.
CREATE OR REPLACE FUNCTION question_similarity(
    q1_id UUID,
    q2_id UUID
)
RETURNS REAL AS $$
BEGIN
    RETURN (
        SELECT 1 - (qb1.embedding <=> qb2.embedding)
        FROM question_bank AS qb1
        CROSS JOIN question_bank AS qb2
        WHERE qb1.id = q1_id
          AND qb2.id = q2_id
    );
END;
$$ LANGUAGE plpgsql STABLE;
-- Finds questions similar to a given question (for duplicate detection).
--   p_question_id: question_bank.id of the reference question.
--   p_threshold:   minimum cosine similarity (1 - cosine distance) a candidate
--                  must reach to be returned.
--   p_limit:       maximum number of rows returned.
-- Returns candidates from the same organization as the reference question,
-- excluding the reference itself and rows without an embedding, most similar
-- first. Returns no rows when the reference question does not exist or has no
-- embedding.
CREATE OR REPLACE FUNCTION find_similar_questions(
    p_question_id UUID,
    p_threshold REAL DEFAULT 0.85,
    p_limit INTEGER DEFAULT 10
)
RETURNS TABLE (
    id UUID,
    question_text TEXT,
    similarity REAL,
    question_type question_bank_type
) AS $$
DECLARE
    v_source_embedding vector;
    v_source_organization_id UUID;
BEGIN
    -- Load the reference row once. Columns are table-qualified because the
    -- result column "id" is also a PL/pgSQL variable; a bare "id" in a query
    -- raises "column reference is ambiguous".
    SELECT src.embedding, src.organization_id
    INTO v_source_embedding, v_source_organization_id
    FROM question_bank AS src
    WHERE src.id = p_question_id;

    -- Nothing to compare against: unknown question or embedding not generated.
    IF v_source_embedding IS NULL THEN
        RETURN;
    END IF;

    RETURN QUERY
    SELECT
        qb.id,
        qb.question_text,
        -- <=> yields double precision; cast to the declared REAL result type,
        -- which RETURN QUERY requires to match exactly.
        (1 - (qb.embedding <=> v_source_embedding))::REAL AS similarity,
        qb.question_type
    FROM question_bank AS qb
    WHERE qb.id <> p_question_id
      AND qb.organization_id = v_source_organization_id
      AND qb.embedding IS NOT NULL
      AND (1 - (qb.embedding <=> v_source_embedding)) >= p_threshold
    ORDER BY qb.embedding <=> v_source_embedding
    LIMIT p_limit;
END;
$$ LANGUAGE plpgsql STABLE;
-- Semantic search over one organization's question bank.
--   p_organization_id: organization whose questions are searched.
--   p_query_embedding: 768-dimensional embedding of the search query.
--   p_limit:           maximum number of rows returned.
--   p_threshold:       minimum similarity (1 - cosine distance) to qualify.
-- Rows without an embedding are skipped; results are ordered by ascending
-- cosine distance to the query embedding.
CREATE OR REPLACE FUNCTION search_questions_semantic(
    p_organization_id UUID,
    p_query_embedding vector(768),
    p_limit INTEGER DEFAULT 20,
    p_threshold DOUBLE PRECISION DEFAULT 0.5
)
RETURNS TABLE (
    id UUID,
    question_text TEXT,
    question_type question_bank_type,
    similarity DOUBLE PRECISION,
    tags TEXT[],
    difficulty VARCHAR,
    points INTEGER
)
LANGUAGE plpgsql
STABLE
AS $body$
BEGIN
    RETURN QUERY
    SELECT
        q.id,
        q.question_text,
        q.question_type,
        CAST(1 - (q.embedding <=> p_query_embedding) AS DOUBLE PRECISION) AS similarity,
        q.tags,
        q.difficulty,
        q.points
    FROM question_bank AS q
    WHERE q.embedding IS NOT NULL
      AND q.organization_id = p_organization_id
      AND CAST(1 - (q.embedding <=> p_query_embedding) AS DOUBLE PRECISION) >= p_threshold
    ORDER BY q.embedding <=> p_query_embedding
    LIMIT p_limit;
END;
$body$;
-- Returns questions that are relevant to a query yet dissimilar to each other,
-- using a greedy Maximal Marginal Relevance (MMR) selection.
--   p_organization_id: organization whose questions are considered.
--   p_query_embedding: 768-dimensional embedding of the query.
--   p_limit:           maximum number of questions to select.
--   p_lambda:          relevance/diversity trade-off
--                      (0 = max diversity, 1 = max relevance).
-- Each round picks the not-yet-selected question maximizing
--   relevance * p_lambda - redundancy * (1 - p_lambda)
-- where relevance is the similarity to the query and redundancy is the highest
-- similarity to any question already selected (0 in the first round).
-- The final result is ordered by similarity to the query, not by pick order.
CREATE OR REPLACE FUNCTION get_diverse_questions(
    p_organization_id UUID,
    p_query_embedding vector(768),
    p_limit INTEGER DEFAULT 10,
    p_lambda DOUBLE PRECISION DEFAULT 0.7 -- 0 = max diversity, 1 = max relevance
)
RETURNS TABLE (
    id UUID,
    question_text TEXT,
    question_type question_bank_type,
    similarity DOUBLE PRECISION
) AS $$
DECLARE
    selected_ids UUID[] := ARRAY[]::UUID[];
    candidate_id UUID;
BEGIN
    -- One question is selected per iteration; the loop runs at most p_limit times.
    FOR i IN 1..p_limit LOOP
        SELECT qb.id INTO candidate_id
        FROM question_bank AS qb
        WHERE qb.organization_id = p_organization_id
          AND qb.id != ALL(selected_ids)
          AND qb.embedding IS NOT NULL
        ORDER BY
            (1 - (qb.embedding <=> p_query_embedding)) * p_lambda -
            (COALESCE((
                SELECT MAX(1 - (qb.embedding <=> qb2.embedding))
                FROM unnest(selected_ids) AS sid
                INNER JOIN question_bank AS qb2 ON qb2.id = sid
            ), 0)) * (1 - p_lambda)
            DESC
        LIMIT 1;

        -- SELECT INTO leaves candidate_id NULL when no row qualifies, i.e. the
        -- pool of candidates is exhausted.
        EXIT WHEN candidate_id IS NULL;

        selected_ids := array_append(selected_ids, candidate_id);
    END LOOP;

    RETURN QUERY
    SELECT
        qb.id,
        qb.question_text,
        qb.question_type,
        1 - (qb.embedding <=> p_query_embedding) AS similarity
    FROM question_bank AS qb
    WHERE qb.id = ANY(selected_ids)
    ORDER BY similarity DESC;
END;
$$ LANGUAGE plpgsql STABLE;
-- Catalog comments for the embedding columns and search functions.
-- The embedding column is declared vector(768), so its comment states 768 dimensions.
COMMENT ON COLUMN question_bank.embedding IS 'Semantic embedding vector for similarity search (nomic-embed-text, 768 dimensions)';
COMMENT ON COLUMN question_bank.embedding_updated_at IS 'Timestamp when embedding was last generated';
COMMENT ON FUNCTION question_similarity IS 'Calculate cosine similarity between two questions';
COMMENT ON FUNCTION find_similar_questions IS 'Find questions similar to a given question (for duplicate detection)';
COMMENT ON FUNCTION search_questions_semantic IS 'Search questions by semantic similarity using embedding vector';
COMMENT ON FUNCTION get_diverse_questions IS 'Get diverse questions using Maximal Marginal Relevance (MMR)';