Files
game-library/app/models/database.py
Claude Agent 66ae831c36 Rebuild extraction pipeline infrastructure (Faza 0 prep)
Implements the approved plan to replace the broken regex/index-master
extraction with an LLM-subagent pipeline. Four parallel lanes:

Lane A — scripts/extract_common.py (PDF/docx/doc/pptx/html/zip, no
  max_pages truncation), normalize_sources.py, chunk_sources.py
  (~20pg chunks + overlap, manifest registry), activity_schema.json.
Lane B — app/config_taxonomy.py (16 fixed category slugs), schema
  rebuilt from scratch in app/models/ with content_type, language,
  source_files, source_excerpt, normalized_name, extraction_confidence,
  needs_review; FTS5 + 3 triggers extended with materials_list and
  skills_developed.
Lane C — build_database.py (--rebuild, atomic swap, schema + fuzzy
  source_excerpt validation, dedup with needs_review band),
  validate_extractions.py, review_queue.py, new run_extraction.py
  orchestrator, SUBAGENT_PROMPT.md.
Lane D — search.py content_type/language filters (default search
  excludes non-game content), E7 schema-compat audit; fixed a NULL
  keywords AttributeError in _boost_search_relevance.

Removes 8 orphaned/dead scripts and app/services/parser.py +
indexer.py. Adds tests/ (70 passing, 1 skipped — libreoffice absent).

Note: Lane D made one additive edit to app/models/database.py
(_update_category_counts) to surface content_type/language in
get_filter_options, outside its nominal lane boundary but after
Lane B completed.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-19 17:43:38 +00:00

373 lines
16 KiB
Python

"""
Database manager for INDEX-SISTEM-JOCURI v2.0
Implements SQLite with FTS5 for full-text search
"""
import sqlite3
import json
from pathlib import Path
from typing import List, Dict, Any, Optional, Tuple
from app.models.activity import Activity
class DatabaseManager:
"""Enhanced database manager with FTS5 support"""
def __init__(self, db_path: str):
"""Initialize database manager"""
self.db_path = Path(db_path)
self.db_path.parent.mkdir(parents=True, exist_ok=True)
self._init_database()
def _get_connection(self) -> sqlite3.Connection:
"""Get database connection with row factory"""
conn = sqlite3.connect(self.db_path)
conn.row_factory = sqlite3.Row
# Enable FTS5
conn.execute("PRAGMA table_info=sqlite_master")
return conn
def _init_database(self):
"""Initialize database with v2.0 schema"""
with self._get_connection() as conn:
# Main activities table
# NOTE: schema is rebuilt from scratch (plan §6) — no in-place
# migration. The old DB is deleted and recreated by build_database.
conn.execute("""
CREATE TABLE IF NOT EXISTS activities (
id INTEGER PRIMARY KEY AUTOINCREMENT,
name TEXT NOT NULL,
description TEXT NOT NULL,
rules TEXT,
variations TEXT,
category TEXT NOT NULL,
subcategory TEXT,
content_type TEXT,
source_file TEXT NOT NULL,
source_files TEXT,
page_reference TEXT,
source_excerpt TEXT,
-- Structured parameters
age_group_min INTEGER,
age_group_max INTEGER,
participants_min INTEGER,
participants_max INTEGER,
duration_min INTEGER,
duration_max INTEGER,
-- Categories for filtering
materials_category TEXT,
materials_list TEXT,
skills_developed TEXT,
difficulty_level TEXT,
-- Metadata
keywords TEXT,
tags TEXT,
popularity_score INTEGER DEFAULT 0,
-- Extraction / language metadata
language TEXT,
normalized_name TEXT,
extraction_confidence TEXT,
needs_review INTEGER DEFAULT 0,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
)
""")
# FTS5 virtual table for search
conn.execute("""
CREATE VIRTUAL TABLE IF NOT EXISTS activities_fts USING fts5(
name, description, rules, variations, keywords,
materials_list, skills_developed,
content='activities',
content_rowid='id'
)
""")
# Categories table for dynamic filters
conn.execute("""
CREATE TABLE IF NOT EXISTS categories (
id INTEGER PRIMARY KEY AUTOINCREMENT,
type TEXT NOT NULL,
value TEXT NOT NULL,
display_name TEXT,
usage_count INTEGER DEFAULT 0,
UNIQUE(type, value)
)
""")
# Create indexes for performance
indexes = [
"CREATE INDEX IF NOT EXISTS idx_activities_category ON activities(category)",
"CREATE INDEX IF NOT EXISTS idx_activities_age ON activities(age_group_min, age_group_max)",
"CREATE INDEX IF NOT EXISTS idx_activities_participants ON activities(participants_min, participants_max)",
"CREATE INDEX IF NOT EXISTS idx_activities_duration ON activities(duration_min, duration_max)",
"CREATE INDEX IF NOT EXISTS idx_activities_normalized_name ON activities(normalized_name)",
"CREATE INDEX IF NOT EXISTS idx_categories_type ON categories(type)"
]
for index_sql in indexes:
conn.execute(index_sql)
# Triggers to keep FTS in sync
conn.execute("""
CREATE TRIGGER IF NOT EXISTS activities_fts_insert AFTER INSERT ON activities
BEGIN
INSERT INTO activities_fts(rowid, name, description, rules, variations,
keywords, materials_list, skills_developed)
VALUES (new.id, new.name, new.description, new.rules, new.variations,
new.keywords, new.materials_list, new.skills_developed);
END
""")
conn.execute("""
CREATE TRIGGER IF NOT EXISTS activities_fts_delete AFTER DELETE ON activities
BEGIN
INSERT INTO activities_fts(activities_fts, rowid, name, description, rules,
variations, keywords, materials_list, skills_developed)
VALUES ('delete', old.id, old.name, old.description, old.rules,
old.variations, old.keywords, old.materials_list, old.skills_developed);
END
""")
conn.execute("""
CREATE TRIGGER IF NOT EXISTS activities_fts_update AFTER UPDATE ON activities
BEGIN
INSERT INTO activities_fts(activities_fts, rowid, name, description, rules,
variations, keywords, materials_list, skills_developed)
VALUES ('delete', old.id, old.name, old.description, old.rules,
old.variations, old.keywords, old.materials_list, old.skills_developed);
INSERT INTO activities_fts(rowid, name, description, rules, variations,
keywords, materials_list, skills_developed)
VALUES (new.id, new.name, new.description, new.rules, new.variations,
new.keywords, new.materials_list, new.skills_developed);
END
""")
conn.commit()
def insert_activity(self, activity: Activity) -> int:
"""Insert new activity and return ID"""
with self._get_connection() as conn:
data = activity.to_dict()
columns = ', '.join(data.keys())
placeholders = ', '.join(['?' for _ in data])
values = list(data.values())
cursor = conn.execute(
f"INSERT INTO activities ({columns}) VALUES ({placeholders})",
values
)
activity_id = cursor.lastrowid
# Update category counts
self._update_category_counts(conn, activity)
conn.commit()
return activity_id
def bulk_insert_activities(self, activities: List[Activity]) -> int:
"""Bulk insert activities for better performance"""
if not activities:
return 0
with self._get_connection() as conn:
data_list = [activity.to_dict() for activity in activities]
if not data_list:
return 0
columns = ', '.join(data_list[0].keys())
placeholders = ', '.join(['?' for _ in data_list[0]])
values_list = [list(data.values()) for data in data_list]
conn.executemany(
f"INSERT INTO activities ({columns}) VALUES ({placeholders})",
values_list
)
# Update category counts
for activity in activities:
self._update_category_counts(conn, activity)
conn.commit()
return len(activities)
def _update_category_counts(self, conn: sqlite3.Connection, activity: Activity):
"""Update category usage counts"""
categories_to_update = [
('category', activity.category),
('content_type', activity.content_type),
('language', activity.language),
('age_group', activity.get_age_range_display()),
('participants', activity.get_participants_display()),
('duration', activity.get_duration_display()),
('materials', activity.get_materials_display()),
('difficulty', activity.difficulty_level),
]
for cat_type, cat_value in categories_to_update:
if cat_value and cat_value.strip():
conn.execute("""
INSERT OR IGNORE INTO categories (type, value, display_name, usage_count)
VALUES (?, ?, ?, 0)
""", (cat_type, cat_value, cat_value))
conn.execute("""
UPDATE categories
SET usage_count = usage_count + 1
WHERE type = ? AND value = ?
""", (cat_type, cat_value))
def search_activities(self,
search_text: Optional[str] = None,
category: Optional[str] = None,
age_group_min: Optional[int] = None,
age_group_max: Optional[int] = None,
participants_min: Optional[int] = None,
participants_max: Optional[int] = None,
duration_min: Optional[int] = None,
duration_max: Optional[int] = None,
materials_category: Optional[str] = None,
difficulty_level: Optional[str] = None,
limit: int = 100) -> List[Dict[str, Any]]:
"""Enhanced search with FTS5 and filters"""
with self._get_connection() as conn:
if search_text and search_text.strip():
# Use FTS5 for text search
base_query = """
SELECT a.*,
activities_fts.rank as search_rank
FROM activities a
JOIN activities_fts ON a.id = activities_fts.rowid
WHERE activities_fts MATCH ?
"""
params = [search_text.strip()]
order_clause = "ORDER BY search_rank, a.popularity_score DESC"
else:
# Regular query without FTS
base_query = "SELECT * FROM activities WHERE 1=1"
params = []
order_clause = "ORDER BY popularity_score DESC, name ASC"
# Add filters
if category:
base_query += " AND category LIKE ?"
params.append(f"%{category}%")
if age_group_min is not None:
base_query += " AND (age_group_min IS NULL OR age_group_min <= ?)"
params.append(age_group_min)
if age_group_max is not None:
base_query += " AND (age_group_max IS NULL OR age_group_max >= ?)"
params.append(age_group_max)
if participants_min is not None:
base_query += " AND (participants_min IS NULL OR participants_min <= ?)"
params.append(participants_min)
if participants_max is not None:
base_query += " AND (participants_max IS NULL OR participants_max >= ?)"
params.append(participants_max)
if duration_min is not None:
base_query += " AND (duration_min IS NULL OR duration_min >= ?)"
params.append(duration_min)
if duration_max is not None:
base_query += " AND (duration_max IS NULL OR duration_max <= ?)"
params.append(duration_max)
if materials_category:
base_query += " AND materials_category LIKE ?"
params.append(f"%{materials_category}%")
if difficulty_level:
base_query += " AND difficulty_level = ?"
params.append(difficulty_level)
# Add ordering and limit
query = f"{base_query} {order_clause} LIMIT ?"
params.append(limit)
cursor = conn.execute(query, params)
return [dict(row) for row in cursor.fetchall()]
def get_activity_by_id(self, activity_id: int) -> Optional[Dict[str, Any]]:
"""Get single activity by ID"""
with self._get_connection() as conn:
cursor = conn.execute("SELECT * FROM activities WHERE id = ?", (activity_id,))
row = cursor.fetchone()
return dict(row) if row else None
def get_filter_options(self) -> Dict[str, List[str]]:
"""Get dynamic filter options from categories table"""
with self._get_connection() as conn:
cursor = conn.execute("""
SELECT type, value, usage_count
FROM categories
WHERE usage_count > 0
ORDER BY type, usage_count DESC, value ASC
""")
options = {}
for row in cursor.fetchall():
cat_type, value, count = row
if cat_type not in options:
options[cat_type] = []
options[cat_type].append(value)
return options
def get_statistics(self) -> Dict[str, Any]:
"""Get database statistics"""
with self._get_connection() as conn:
# Total activities
cursor = conn.execute("SELECT COUNT(*) FROM activities")
total_activities = cursor.fetchone()[0]
# Activities by category
cursor = conn.execute("""
SELECT category, COUNT(*) as count
FROM activities
GROUP BY category
ORDER BY count DESC
""")
categories = dict(cursor.fetchall())
# Database size
cursor = conn.execute("SELECT page_count * page_size as size FROM pragma_page_count(), pragma_page_size()")
size_row = cursor.fetchone()
db_size = size_row[0] if size_row else 0
return {
'total_activities': total_activities,
'categories': categories,
'database_size_bytes': db_size,
'database_path': str(self.db_path)
}
def clear_database(self):
"""Clear all data from database"""
with self._get_connection() as conn:
# Deleting from activities fires the delete trigger, which removes
# the matching FTS rows. The explicit 'delete-all' command then
# guarantees the external-content FTS index is fully cleared.
conn.execute("DELETE FROM activities")
conn.execute("INSERT INTO activities_fts(activities_fts) VALUES('delete-all')")
conn.execute("DELETE FROM categories")
conn.commit()
def rebuild_fts_index(self):
"""Rebuild FTS5 index"""
with self._get_connection() as conn:
conn.execute("INSERT INTO activities_fts(activities_fts) VALUES('rebuild')")
conn.commit()