Rebuild extraction pipeline infrastructure (Faza 0 prep)

Implements the approved plan to replace the broken regex/index-master
extraction with an LLM-subagent pipeline. Four parallel lanes:

Lane A — scripts/extract_common.py (PDF/docx/doc/pptx/html/zip, no
  max_pages truncation), normalize_sources.py, chunk_sources.py
  (~20pg chunks + overlap, manifest registry), activity_schema.json.
Lane B — app/config_taxonomy.py (16 fixed category slugs), schema
  rebuilt from scratch in app/models/ with content_type, language,
  source_files, source_excerpt, normalized_name, extraction_confidence,
  needs_review; FTS5 + 3 triggers extended with materials_list and
  skills_developed.
Lane C — build_database.py (--rebuild, atomic swap, schema + fuzzy
  source_excerpt validation, dedup with needs_review band),
  validate_extractions.py, review_queue.py, new run_extraction.py
  orchestrator, SUBAGENT_PROMPT.md.
Lane D — search.py content_type/language filters (default search
  excludes non-game content), E7 schema-compat audit; fixed a NULL
  keywords AttributeError in _boost_search_relevance.

Removes 8 orphaned/dead scripts and app/services/parser.py +
indexer.py. Adds tests/ (70 passing, 1 skipped — libreoffice absent).

Note: Lane D made one additive edit to app/models/database.py
(_update_category_counts) to surface content_type/language in
get_filter_options, outside its nominal lane boundary but after
Lane B completed.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Claude Agent
2026-05-19 17:43:38 +00:00
parent e0080edf85
commit 66ae831c36
37 changed files with 4101 additions and 1881 deletions

230
app/config_taxonomy.py Normal file
View File

@@ -0,0 +1,230 @@
"""
Controlled category taxonomy for game-library.
Single source of truth for activity categories. The DB stores the *slug*;
the UI displays the Romanian name. `category` (thematic domain) and
`content_type` (form of the content) are INDEPENDENT axes — see plan §2.
"""
import unicodedata
import re
from typing import Dict, List
# --- Categories (thematic domain) --------------------------------------------
# slug -> Romanian display name. ~16 fixed slugs; `altele` is the mandatory
# fallback and MUST always be present.
CATEGORIES: Dict[str, str] = {
"jocuri-cercetasesti": "Jocuri cercetășești",
"team-building": "Team-building",
"icebreakers": "Icebreakers / spargerea gheții",
"camp-outdoor": "Tabără și activități în aer liber",
"wide-games": "Wide games / jocuri de teren",
"orientare": "Orientare",
"prim-ajutor": "Prim ajutor",
"escape-room-puzzle": "Escape room și puzzle",
"creative-stem": "Creativitate și STEM",
"sports-active": "Sport și activități fizice",
"cantece-ceremonii": "Cântece și ceremonii",
"retete": "Rețete",
"supravietuire": "Supraviețuire",
"integrare-incluziune": "Integrare și incluziune",
"conflict-empatie": "Conflict și empatie",
"altele": "Altele",
}
# Mandatory fallback slug.
FALLBACK_CATEGORY = "altele"
# Ordered list of valid slugs.
CATEGORY_SLUGS: List[str] = list(CATEGORIES.keys())
# --- Content type (form of the content) --------------------------------------
# Independent axis from `category`. The UI default search excludes the
# non-game content types (see plan §6).
CONTENT_TYPES: Dict[str, str] = {
"joc": "Joc",
"activitate": "Activitate",
"reteta": "Rețetă",
"cantec": "Cântec",
"ceremonie": "Ceremonie",
}
CONTENT_TYPE_SLUGS: List[str] = list(CONTENT_TYPES.keys())
# Content types considered "non-game" — excluded from the default UI search.
NON_GAME_CONTENT_TYPES: List[str] = ["reteta", "cantec", "ceremonie"]
DEFAULT_CONTENT_TYPE = "activitate"
# --- Aliases -----------------------------------------------------------------
# Map of normalized arbitrary strings -> canonical slug. Keys are already
# diacritic-stripped, lowercased and hyphenated (see _slugify). This catches
# legacy / messy values from the old DB and common English/Romanian variants.
_CATEGORY_ALIASES: Dict[str, str] = {
# legacy junk
"general-activity": "altele",
"general": "altele",
"educational": "creative-stem",
"d": "altele",
"a": "altele",
"b": "altele",
"c": "altele",
# scouting
"cercetasie": "jocuri-cercetasesti",
"cercetasesti": "jocuri-cercetasesti",
"scout": "jocuri-cercetasesti",
"scouting": "jocuri-cercetasesti",
"scout-games": "jocuri-cercetasesti",
"jocuri-cercetasesti": "jocuri-cercetasesti",
# team building
"teambuilding": "team-building",
"team": "team-building",
"cooperare": "team-building",
# icebreakers
"icebreaker": "icebreakers",
"spargerea-ghetii": "icebreakers",
"cunoastere": "icebreakers",
"energizers": "icebreakers",
"energizer": "icebreakers",
# camp / outdoor
"camp": "camp-outdoor",
"tabara": "camp-outdoor",
"outdoor": "camp-outdoor",
"aer-liber": "camp-outdoor",
# wide games
"wide-game": "wide-games",
"jocuri-de-teren": "wide-games",
"joc-de-teren": "wide-games",
"big-games": "wide-games",
# orientare
"orienteering": "orientare",
"navigatie": "orientare",
# prim ajutor
"first-aid": "prim-ajutor",
"primul-ajutor": "prim-ajutor",
# escape room / puzzle
"escape-room": "escape-room-puzzle",
"escaperoom": "escape-room-puzzle",
"puzzle": "escape-room-puzzle",
"puzzles": "escape-room-puzzle",
"ghicitori": "escape-room-puzzle",
# creative / stem
"creative": "creative-stem",
"creativitate": "creative-stem",
"stem": "creative-stem",
"arts-and-crafts": "creative-stem",
"craft": "creative-stem",
"crafts": "creative-stem",
"stiinta": "creative-stem",
# sports
"sport": "sports-active",
"sports": "sports-active",
"sportive": "sports-active",
"active": "sports-active",
"miscare": "sports-active",
"physical": "sports-active",
# songs / ceremonies
"cantece": "cantece-ceremonii",
"cantec": "cantece-ceremonii",
"songs": "cantece-ceremonii",
"ceremonii": "cantece-ceremonii",
"ceremonie": "cantece-ceremonii",
"ceremony": "cantece-ceremonii",
# recipes
"reteta": "retete",
"recipe": "retete",
"recipes": "retete",
"cooking": "retete",
"gatit": "retete",
# survival
"survival": "supravietuire",
"supravietuire": "supravietuire",
# inclusion
"integrare": "integrare-incluziune",
"incluziune": "integrare-incluziune",
"inclusion": "integrare-incluziune",
# conflict / empathy
"conflict": "conflict-empatie",
"empatie": "conflict-empatie",
"empathy": "conflict-empatie",
"rezolvarea-conflictelor": "conflict-empatie",
# fallback
"altele": "altele",
"other": "altele",
"others": "altele",
"misc": "altele",
}
def _slugify(value: str) -> str:
"""Lowercase, strip diacritics, collapse non-alphanumerics to hyphens."""
if not value:
return ""
# Decompose accents (ă -> a, ș -> s, ț -> t, etc.)
decomposed = unicodedata.normalize("NFKD", value)
ascii_str = "".join(c for c in decomposed if not unicodedata.combining(c))
ascii_str = ascii_str.lower().strip()
ascii_str = re.sub(r"[^a-z0-9]+", "-", ascii_str)
return ascii_str.strip("-")
def normalize_category(value: str) -> str:
"""Map an arbitrary string to a valid category slug.
Returns one of CATEGORY_SLUGS, falling back to `altele` for anything
unrecognised or empty.
"""
if not value:
return FALLBACK_CATEGORY
slug = _slugify(str(value))
if not slug:
return FALLBACK_CATEGORY
# Exact slug match.
if slug in CATEGORIES:
return slug
# Alias match.
if slug in _CATEGORY_ALIASES:
return _CATEGORY_ALIASES[slug]
return FALLBACK_CATEGORY
def normalize_content_type(value: str) -> str:
"""Map an arbitrary string to a valid content_type slug.
Returns one of CONTENT_TYPE_SLUGS, falling back to `activitate`.
"""
if not value:
return DEFAULT_CONTENT_TYPE
slug = _slugify(str(value))
if slug in CONTENT_TYPES:
return slug
# Light alias handling for plural / English forms.
aliases = {
"jocuri": "joc",
"game": "joc",
"games": "joc",
"activitati": "activitate",
"activity": "activitate",
"retete": "reteta",
"recipe": "reteta",
"cantece": "cantec",
"song": "cantec",
"ceremonii": "ceremonie",
"ceremony": "ceremonie",
}
return aliases.get(slug, DEFAULT_CONTENT_TYPE)
def is_valid_category(slug: str) -> bool:
"""True if `slug` is a valid category slug."""
return slug in CATEGORIES
def category_display_name(slug: str) -> str:
"""Romanian display name for a slug (fallback to the slug itself)."""
return CATEGORIES.get(slug, slug)
def content_type_display_name(slug: str) -> str:
"""Romanian display name for a content_type slug."""
return CONTENT_TYPES.get(slug, slug)

View File

@@ -5,6 +5,22 @@ Activity data model for INDEX-SISTEM-JOCURI v2.0
from dataclasses import dataclass, field
from typing import List, Optional, Dict, Any
import json
import re
import unicodedata
def normalize_name(name: str) -> str:
"""Diacritic-free, lowercased, whitespace-collapsed form of a name.
Used as the exact-match key for dedup grouping (see plan §4).
"""
if not name:
return ""
decomposed = unicodedata.normalize("NFKD", name)
ascii_str = "".join(c for c in decomposed if not unicodedata.combining(c))
ascii_str = ascii_str.lower().strip()
ascii_str = re.sub(r"\s+", " ", ascii_str)
return ascii_str
@dataclass
class Activity:
@@ -19,10 +35,19 @@ class Activity:
# Categories
category: str = ""
subcategory: Optional[str] = None
# content_type is an axis INDEPENDENT of category:
# one of joc/activitate/reteta/cantec/ceremonie (see config_taxonomy).
content_type: Optional[str] = None
# Source information
source_file: str = ""
page_reference: Optional[str] = None
# source_files: JSON-encoded list of every source the activity was seen in.
# `source_file` (singular) stays as the primary/original source; build_database
# (Lane C) accumulates the full list here on dedup-merge.
source_files: List[str] = field(default_factory=list)
# Short verbatim quote from the source — anti-hallucination anchor.
source_excerpt: Optional[str] = None
# Age and participants
age_group_min: Optional[int] = None
@@ -44,11 +69,22 @@ class Activity:
keywords: Optional[str] = None
tags: List[str] = field(default_factory=list)
popularity_score: int = 0
# Extraction / language metadata
language: Optional[str] = None # 'ro' / 'en'
normalized_name: Optional[str] = None # dedup key; auto-derived from name
extraction_confidence: Optional[str] = None # 'high' / 'med' / 'low'
needs_review: int = 0
# Database fields
id: Optional[int] = None
created_at: Optional[str] = None
updated_at: Optional[str] = None
def __post_init__(self):
"""Derive normalized_name from name when not explicitly provided."""
if not self.normalized_name:
self.normalized_name = normalize_name(self.name)
def to_dict(self) -> Dict[str, Any]:
"""Convert activity to dictionary for database storage"""
@@ -59,8 +95,11 @@ class Activity:
'variations': self.variations,
'category': self.category,
'subcategory': self.subcategory,
'content_type': self.content_type,
'source_file': self.source_file,
'source_files': json.dumps(self.source_files) if self.source_files else None,
'page_reference': self.page_reference,
'source_excerpt': self.source_excerpt,
'age_group_min': self.age_group_min,
'age_group_max': self.age_group_max,
'participants_min': self.participants_min,
@@ -73,7 +112,11 @@ class Activity:
'difficulty_level': self.difficulty_level,
'keywords': self.keywords,
'tags': json.dumps(self.tags) if self.tags else None,
'popularity_score': self.popularity_score
'popularity_score': self.popularity_score,
'language': self.language,
'normalized_name': self.normalized_name or normalize_name(self.name),
'extraction_confidence': self.extraction_confidence,
'needs_review': self.needs_review,
}
@classmethod
@@ -86,7 +129,17 @@ class Activity:
tags = json.loads(data['tags'])
except (json.JSONDecodeError, TypeError):
tags = []
# source_files may arrive as a JSON string (DB) or a list (extraction)
source_files = data.get('source_files')
if isinstance(source_files, str):
try:
source_files = json.loads(source_files)
except (json.JSONDecodeError, TypeError):
source_files = []
elif source_files is None:
source_files = []
return cls(
id=data.get('id'),
name=data.get('name', ''),
@@ -95,8 +148,11 @@ class Activity:
variations=data.get('variations'),
category=data.get('category', ''),
subcategory=data.get('subcategory'),
content_type=data.get('content_type'),
source_file=data.get('source_file', ''),
source_files=source_files,
page_reference=data.get('page_reference'),
source_excerpt=data.get('source_excerpt'),
age_group_min=data.get('age_group_min'),
age_group_max=data.get('age_group_max'),
participants_min=data.get('participants_min'),
@@ -110,6 +166,10 @@ class Activity:
keywords=data.get('keywords'),
tags=tags,
popularity_score=data.get('popularity_score', 0),
language=data.get('language'),
normalized_name=data.get('normalized_name'),
extraction_confidence=data.get('extraction_confidence'),
needs_review=data.get('needs_review', 0) or 0,
created_at=data.get('created_at'),
updated_at=data.get('updated_at')
)

View File

@@ -30,6 +30,8 @@ class DatabaseManager:
"""Initialize database with v2.0 schema"""
with self._get_connection() as conn:
# Main activities table
# NOTE: schema is rebuilt from scratch (plan §6) — no in-place
# migration. The old DB is deleted and recreated by build_database.
conn.execute("""
CREATE TABLE IF NOT EXISTS activities (
id INTEGER PRIMARY KEY AUTOINCREMENT,
@@ -39,9 +41,12 @@ class DatabaseManager:
variations TEXT,
category TEXT NOT NULL,
subcategory TEXT,
content_type TEXT,
source_file TEXT NOT NULL,
source_files TEXT,
page_reference TEXT,
source_excerpt TEXT,
-- Structured parameters
age_group_min INTEGER,
age_group_max INTEGER,
@@ -49,26 +54,34 @@ class DatabaseManager:
participants_max INTEGER,
duration_min INTEGER,
duration_max INTEGER,
-- Categories for filtering
materials_category TEXT,
materials_list TEXT,
skills_developed TEXT,
difficulty_level TEXT,
-- Metadata
keywords TEXT,
tags TEXT,
popularity_score INTEGER DEFAULT 0,
-- Extraction / language metadata
language TEXT,
normalized_name TEXT,
extraction_confidence TEXT,
needs_review INTEGER DEFAULT 0,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
)
""")
# FTS5 virtual table for search
conn.execute("""
CREATE VIRTUAL TABLE IF NOT EXISTS activities_fts USING fts5(
name, description, rules, variations, keywords,
materials_list, skills_developed,
content='activities',
content_rowid='id'
)
@@ -92,6 +105,7 @@ class DatabaseManager:
"CREATE INDEX IF NOT EXISTS idx_activities_age ON activities(age_group_min, age_group_max)",
"CREATE INDEX IF NOT EXISTS idx_activities_participants ON activities(participants_min, participants_max)",
"CREATE INDEX IF NOT EXISTS idx_activities_duration ON activities(duration_min, duration_max)",
"CREATE INDEX IF NOT EXISTS idx_activities_normalized_name ON activities(normalized_name)",
"CREATE INDEX IF NOT EXISTS idx_categories_type ON categories(type)"
]
@@ -102,24 +116,34 @@ class DatabaseManager:
conn.execute("""
CREATE TRIGGER IF NOT EXISTS activities_fts_insert AFTER INSERT ON activities
BEGIN
INSERT INTO activities_fts(rowid, name, description, rules, variations, keywords)
VALUES (new.id, new.name, new.description, new.rules, new.variations, new.keywords);
INSERT INTO activities_fts(rowid, name, description, rules, variations,
keywords, materials_list, skills_developed)
VALUES (new.id, new.name, new.description, new.rules, new.variations,
new.keywords, new.materials_list, new.skills_developed);
END
""")
conn.execute("""
CREATE TRIGGER IF NOT EXISTS activities_fts_delete AFTER DELETE ON activities
BEGIN
DELETE FROM activities_fts WHERE rowid = old.id;
INSERT INTO activities_fts(activities_fts, rowid, name, description, rules,
variations, keywords, materials_list, skills_developed)
VALUES ('delete', old.id, old.name, old.description, old.rules,
old.variations, old.keywords, old.materials_list, old.skills_developed);
END
""")
conn.execute("""
CREATE TRIGGER IF NOT EXISTS activities_fts_update AFTER UPDATE ON activities
BEGIN
DELETE FROM activities_fts WHERE rowid = old.id;
INSERT INTO activities_fts(rowid, name, description, rules, variations, keywords)
VALUES (new.id, new.name, new.description, new.rules, new.variations, new.keywords);
INSERT INTO activities_fts(activities_fts, rowid, name, description, rules,
variations, keywords, materials_list, skills_developed)
VALUES ('delete', old.id, old.name, old.description, old.rules,
old.variations, old.keywords, old.materials_list, old.skills_developed);
INSERT INTO activities_fts(rowid, name, description, rules, variations,
keywords, materials_list, skills_developed)
VALUES (new.id, new.name, new.description, new.rules, new.variations,
new.keywords, new.materials_list, new.skills_developed);
END
""")
@@ -179,6 +203,8 @@ class DatabaseManager:
"""Update category usage counts"""
categories_to_update = [
('category', activity.category),
('content_type', activity.content_type),
('language', activity.language),
('age_group', activity.get_age_range_display()),
('participants', activity.get_participants_display()),
('duration', activity.get_duration_display()),
@@ -332,8 +358,11 @@ class DatabaseManager:
def clear_database(self):
"""Clear all data from database"""
with self._get_connection() as conn:
# Deleting from activities fires the delete trigger, which removes
# the matching FTS rows. The explicit 'delete-all' command then
# guarantees the external-content FTS index is fully cleared.
conn.execute("DELETE FROM activities")
conn.execute("DELETE FROM activities_fts")
conn.execute("INSERT INTO activities_fts(activities_fts) VALUES('delete-all')")
conn.execute("DELETE FROM categories")
conn.commit()

View File

@@ -2,8 +2,6 @@
Services for INDEX-SISTEM-JOCURI v2.0
"""
from .parser import IndexMasterParser
from .indexer import ActivityIndexer
from .search import SearchService
__all__ = ['IndexMasterParser', 'ActivityIndexer', 'SearchService']
__all__ = ['SearchService']

View File

@@ -1,248 +0,0 @@
"""
Activity indexer service for INDEX-SISTEM-JOCURI v2.0
Coordinates parsing and database indexing
"""
from typing import List, Dict, Any
from pathlib import Path
from app.models.database import DatabaseManager
from app.models.activity import Activity
from app.services.parser import IndexMasterParser
import time
class ActivityIndexer:
"""Service for indexing activities from INDEX_MASTER into database"""
def __init__(self, db_manager: DatabaseManager, index_master_path: str):
"""Initialize indexer with database manager and INDEX_MASTER path"""
self.db = db_manager
self.parser = IndexMasterParser(index_master_path)
self.indexing_stats = {}
def index_all_activities(self, clear_existing: bool = False) -> Dict[str, Any]:
"""Index all activities from INDEX_MASTER into database"""
print("🚀 Starting activity indexing process...")
start_time = time.time()
# Clear existing data if requested
if clear_existing:
print("🗑️ Clearing existing database...")
self.db.clear_database()
# Parse activities from INDEX_MASTER
print("📖 Parsing INDEX_MASTER file...")
activities = self.parser.parse_all_categories()
if not activities:
print("❌ No activities were parsed!")
return {'success': False, 'error': 'No activities parsed'}
# Filter valid activities
valid_activities = []
for activity in activities:
if self.parser.validate_activity_completeness(activity):
valid_activities.append(activity)
else:
print(f"⚠️ Skipping incomplete activity: {activity.name[:50]}...")
print(f"✅ Validated {len(valid_activities)} activities out of {len(activities)} parsed")
if len(valid_activities) < 100:
print(f"⚠️ Warning: Only {len(valid_activities)} valid activities found. Expected 500+")
# Bulk insert into database
print("💾 Inserting activities into database...")
try:
inserted_count = self.db.bulk_insert_activities(valid_activities)
# Rebuild FTS index for optimal search performance
print("🔍 Rebuilding search index...")
self.db.rebuild_fts_index()
end_time = time.time()
indexing_time = end_time - start_time
# Generate final statistics (with error handling)
try:
stats = self._generate_indexing_stats(valid_activities, indexing_time)
stats['inserted_count'] = inserted_count
stats['success'] = True
except Exception as e:
print(f"⚠️ Error generating statistics: {e}")
stats = {
'success': True,
'inserted_count': inserted_count,
'indexing_time_seconds': indexing_time,
'error': f'Stats generation failed: {str(e)}'
}
print(f"✅ Indexing complete! {inserted_count} activities indexed in {indexing_time:.2f}s")
# Verify database state (with error handling)
try:
db_stats = self.db.get_statistics()
print(f"📊 Database now contains {db_stats['total_activities']} activities")
except Exception as e:
print(f"⚠️ Error getting database statistics: {e}")
print(f"📊 Database insertion completed, statistics unavailable")
return stats
except Exception as e:
print(f"❌ Error during database insertion: {e}")
return {'success': False, 'error': str(e)}
def index_specific_category(self, category_code: str) -> Dict[str, Any]:
"""Index activities from a specific category only"""
print(f"🎯 Indexing specific category: {category_code}")
# Load content and parse specific category
if not self.parser.load_content():
return {'success': False, 'error': 'Could not load INDEX_MASTER'}
category_name = self.parser.category_mapping.get(category_code)
if not category_name:
return {'success': False, 'error': f'Unknown category code: {category_code}'}
activities = self.parser.parse_category_section(category_code, category_name)
if not activities:
return {'success': False, 'error': f'No activities found in category {category_code}'}
# Filter valid activities
valid_activities = [a for a in activities if self.parser.validate_activity_completeness(a)]
try:
inserted_count = self.db.bulk_insert_activities(valid_activities)
return {
'success': True,
'category': category_name,
'inserted_count': inserted_count,
'total_parsed': len(activities),
'valid_activities': len(valid_activities)
}
except Exception as e:
return {'success': False, 'error': str(e)}
def _generate_indexing_stats(self, activities: List[Activity], indexing_time: float) -> Dict[str, Any]:
"""Generate comprehensive indexing statistics"""
# Get parser statistics
parser_stats = self.parser.get_parsing_statistics()
# Calculate additional metrics
categories = {}
age_ranges = {}
durations = {}
materials = {}
for activity in activities:
# Category breakdown
if activity.category in categories:
categories[activity.category] += 1
else:
categories[activity.category] = 1
# Age range analysis (with safety check)
try:
age_key = activity.get_age_range_display() or "nespecificat"
age_ranges[age_key] = age_ranges.get(age_key, 0) + 1
except Exception as e:
print(f"Warning: Error getting age range for activity {activity.name}: {e}")
age_ranges["nespecificat"] = age_ranges.get("nespecificat", 0) + 1
# Duration analysis (with safety check)
try:
duration_key = activity.get_duration_display() or "nespecificat"
durations[duration_key] = durations.get(duration_key, 0) + 1
except Exception as e:
print(f"Warning: Error getting duration for activity {activity.name}: {e}")
durations["nespecificat"] = durations.get("nespecificat", 0) + 1
# Materials analysis (with safety check)
try:
materials_key = activity.get_materials_display() or "nespecificat"
materials[materials_key] = materials.get(materials_key, 0) + 1
except Exception as e:
print(f"Warning: Error getting materials for activity {activity.name}: {e}")
materials["nespecificat"] = materials.get("nespecificat", 0) + 1
return {
'indexing_time_seconds': indexing_time,
'parsing_stats': parser_stats,
'distribution': {
'categories': categories,
'age_ranges': age_ranges,
'durations': durations,
'materials': materials
},
'quality_metrics': {
'completion_rate': parser_stats.get('completion_rate', 0),
'average_description_length': parser_stats.get('average_description_length', 0),
'activities_with_metadata': sum(1 for a in activities if a.age_group_min or a.participants_min or a.duration_min)
}
}
def verify_indexing_quality(self) -> Dict[str, Any]:
"""Verify the quality of indexed data"""
try:
# Get database statistics
db_stats = self.db.get_statistics()
# Check for minimum activity count
total_activities = db_stats['total_activities']
meets_minimum = total_activities >= 500
# Check category distribution
categories = db_stats.get('categories', {})
category_coverage = len(categories)
# Sample some activities to check quality
sample_activities = self.db.search_activities(limit=10)
quality_issues = []
for activity in sample_activities:
if not activity.get('description') or len(activity['description']) < 10:
quality_issues.append(f"Activity {activity.get('name', 'Unknown')} has insufficient description")
if not activity.get('category'):
quality_issues.append(f"Activity {activity.get('name', 'Unknown')} missing category")
return {
'total_activities': total_activities,
'meets_minimum_requirement': meets_minimum,
'minimum_target': 500,
'category_coverage': category_coverage,
'expected_categories': len(self.parser.category_mapping),
'quality_issues': quality_issues,
'quality_score': max(0, 100 - len(quality_issues) * 10),
'database_stats': db_stats
}
except Exception as e:
return {'error': str(e), 'quality_score': 0}
def get_indexing_progress(self) -> Dict[str, Any]:
"""Get current indexing progress and status"""
try:
db_stats = self.db.get_statistics()
# Calculate progress towards 500+ activities goal
total_activities = db_stats['total_activities']
target_activities = 500
progress_percentage = min(100, (total_activities / target_activities) * 100)
return {
'current_activities': total_activities,
'target_activities': target_activities,
'progress_percentage': progress_percentage,
'status': 'completed' if total_activities >= target_activities else 'in_progress',
'categories_indexed': list(db_stats.get('categories', {}).keys()),
'database_size_mb': db_stats.get('database_size_bytes', 0) / (1024 * 1024)
}
except Exception as e:
return {'error': str(e), 'status': 'error'}

View File

@@ -1,340 +0,0 @@
"""
Advanced parser for INDEX_MASTER_JOCURI_ACTIVITATI.md
Extracts 500+ individual activities with full details
"""
import re
from pathlib import Path
from typing import List, Dict, Optional, Tuple
from app.models.activity import Activity
class IndexMasterParser:
"""Advanced parser for extracting real activities from INDEX_MASTER"""
def __init__(self, index_file_path: str):
"""Initialize parser with INDEX_MASTER file path"""
self.index_file_path = Path(index_file_path)
self.content = ""
self.activities = []
# Category mapping for main sections (exact match from file)
self.category_mapping = {
'[A]': 'JOCURI CERCETĂȘEȘTI ȘI SCOUT',
'[B]': 'TEAM BUILDING ȘI COMUNICARE',
'[C]': 'CAMPING ȘI ACTIVITĂȚI EXTERIOR',
'[D]': 'ESCAPE ROOM ȘI PUZZLE-URI',
'[E]': 'ORIENTARE ȘI BUSOLE',
'[F]': 'PRIMUL AJUTOR ȘI SIGURANȚA',
'[G]': 'ACTIVITĂȚI EDUCAȚIONALE',
'[H]': 'RESURSE SPECIALE'
}
def load_content(self) -> bool:
"""Load and validate INDEX_MASTER content"""
try:
if not self.index_file_path.exists():
print(f"❌ INDEX_MASTER file not found: {self.index_file_path}")
return False
with open(self.index_file_path, 'r', encoding='utf-8') as f:
self.content = f.read()
if len(self.content) < 1000: # Sanity check
print(f"⚠️ INDEX_MASTER file seems too small: {len(self.content)} chars")
return False
print(f"✅ Loaded INDEX_MASTER: {len(self.content)} characters")
return True
except Exception as e:
print(f"❌ Error loading INDEX_MASTER: {e}")
return False
def parse_all_categories(self) -> List[Activity]:
"""Parse all categories and extract individual activities"""
if not self.load_content():
return []
print("🔍 Starting comprehensive parsing of INDEX_MASTER...")
# Parse each main category
for category_code, category_name in self.category_mapping.items():
print(f"\n📂 Processing category {category_code}: {category_name}")
category_activities = self.parse_category_section(category_code, category_name)
self.activities.extend(category_activities)
print(f" ✅ Extracted {len(category_activities)} activities")
print(f"\n🎯 Total activities extracted: {len(self.activities)}")
return self.activities
def parse_category_section(self, category_code: str, category_name: str) -> List[Activity]:
"""Parse a specific category section"""
activities = []
# Find the category section - exact pattern match
# Look for the actual section, not the table of contents
pattern = rf"^## {re.escape(category_code)} {re.escape(category_name)}\s*$"
matches = list(re.finditer(pattern, self.content, re.MULTILINE | re.IGNORECASE))
if not matches:
print(f" ⚠️ Category section not found: {category_code}")
return activities
# Take the last match (should be the actual section, not TOC)
match = matches[-1]
print(f" 📍 Found section at position {match.start()}")
# Extract content until next main category or end
start_pos = match.end()
# Find next main category (look for complete header)
next_category_pattern = r"^## \[[A-H]\] [A-ZĂÂÎȘȚ]"
next_match = re.search(next_category_pattern, self.content[start_pos:], re.MULTILINE)
if next_match:
end_pos = start_pos + next_match.start()
section_content = self.content[start_pos:end_pos]
else:
section_content = self.content[start_pos:]
# Parse subsections within the category
activities.extend(self._parse_subsections(section_content, category_name))
return activities
def _parse_subsections(self, section_content: str, category_name: str) -> List[Activity]:
"""Parse subsections within a category"""
activities = []
# Find all subsections (### markers)
subsection_pattern = r"^### (.+?)$"
subsections = re.finditer(subsection_pattern, section_content, re.MULTILINE)
subsection_list = list(subsections)
for i, subsection in enumerate(subsection_list):
subsection_title = subsection.group(1).strip()
subsection_start = subsection.end()
# Find end of subsection
if i + 1 < len(subsection_list):
subsection_end = subsection_list[i + 1].start()
else:
subsection_end = len(section_content)
subsection_text = section_content[subsection_start:subsection_end]
# Parse individual games in this subsection
subsection_activities = self._parse_games_in_subsection(
subsection_text, category_name, subsection_title
)
activities.extend(subsection_activities)
return activities
def _parse_games_in_subsection(self, subsection_text: str, category_name: str, subsection_title: str) -> List[Activity]:
"""Parse individual games within a subsection"""
activities = []
# Look for "Exemple de jocuri:" sections
examples_pattern = r"\*\*Exemple de jocuri:\*\*\s*\n(.*?)(?=\n\*\*|$)"
examples_matches = re.finditer(examples_pattern, subsection_text, re.DOTALL)
for examples_match in examples_matches:
examples_text = examples_match.group(1)
# Extract individual games (numbered list)
game_pattern = r"^(\d+)\.\s*\*\*(.+?)\*\*\s*-\s*(.+?)$"
games = re.finditer(game_pattern, examples_text, re.MULTILINE)
for game_match in games:
game_number = game_match.group(1)
game_name = game_match.group(2).strip()
game_description = game_match.group(3).strip()
# Extract metadata from subsection
metadata = self._extract_subsection_metadata(subsection_text)
# Create activity
activity = Activity(
name=game_name,
description=game_description,
category=category_name,
subcategory=subsection_title,
source_file=f"INDEX_MASTER_JOCURI_ACTIVITATI.md",
page_reference=f"{category_name} > {subsection_title} > #{game_number}",
**metadata
)
activities.append(activity)
# Also extract from direct activity descriptions without "Exemple de jocuri"
activities.extend(self._parse_direct_activities(subsection_text, category_name, subsection_title))
return activities
def _extract_subsection_metadata(self, subsection_text: str) -> Dict:
"""Extract metadata from subsection text"""
metadata = {}
# Extract participants info
participants_pattern = r"\*\*Participanți:\*\*\s*(.+?)(?:\n|\*\*)"
participants_match = re.search(participants_pattern, subsection_text)
if participants_match:
participants_text = participants_match.group(1).strip()
participants = self._parse_participants(participants_text)
metadata.update(participants)
# Extract duration
duration_pattern = r"\*\*Durata:\*\*\s*(.+?)(?:\n|\*\*)"
duration_match = re.search(duration_pattern, subsection_text)
if duration_match:
duration_text = duration_match.group(1).strip()
duration = self._parse_duration(duration_text)
metadata.update(duration)
# Extract materials
materials_pattern = r"\*\*Materiale:\*\*\s*(.+?)(?:\n|\*\*)"
materials_match = re.search(materials_pattern, subsection_text)
if materials_match:
materials_text = materials_match.group(1).strip()
metadata['materials_list'] = materials_text
metadata['materials_category'] = self._categorize_materials(materials_text)
# Extract keywords
keywords_pattern = r"\*\*Cuvinte cheie:\*\*\s*(.+?)(?:\n|\*\*)"
keywords_match = re.search(keywords_pattern, subsection_text)
if keywords_match:
metadata['keywords'] = keywords_match.group(1).strip()
return metadata
def _parse_participants(self, participants_text: str) -> Dict:
"""Parse participants information"""
result = {}
# Look for number ranges like "8-30 copii" or "5-15 persoane"
range_pattern = r"(\d+)-(\d+)"
range_match = re.search(range_pattern, participants_text)
if range_match:
result['participants_min'] = int(range_match.group(1))
result['participants_max'] = int(range_match.group(2))
else:
# Look for single numbers
number_pattern = r"(\d+)\+"
number_match = re.search(number_pattern, participants_text)
if number_match:
result['participants_min'] = int(number_match.group(1))
# Extract age information
age_pattern = r"(\d+)-(\d+)\s*ani"
age_match = re.search(age_pattern, participants_text)
if age_match:
result['age_group_min'] = int(age_match.group(1))
result['age_group_max'] = int(age_match.group(2))
return result
def _parse_duration(self, duration_text: str) -> Dict:
"""Parse duration information"""
result = {}
# Look for time ranges like "5-20 minute" or "15-30min"
range_pattern = r"(\d+)-(\d+)\s*(?:minute|min)"
range_match = re.search(range_pattern, duration_text)
if range_match:
result['duration_min'] = int(range_match.group(1))
result['duration_max'] = int(range_match.group(2))
else:
# Look for single duration
single_pattern = r"(\d+)\+?\s*(?:minute|min)"
single_match = re.search(single_pattern, duration_text)
if single_match:
result['duration_min'] = int(single_match.group(1))
return result
def _categorize_materials(self, materials_text: str) -> str:
"""Categorize materials into simple categories"""
materials_lower = materials_text.lower()
if any(word in materials_lower for word in ['fără', 'nu necesare', 'nimic', 'minime']):
return 'Fără materiale'
elif any(word in materials_lower for word in ['hârtie', 'creion', 'marker', 'simple']):
return 'Materiale simple'
elif any(word in materials_lower for word in ['computer', 'proiector', 'echipament', 'complexe']):
return 'Materiale complexe'
else:
return 'Materiale variate'
def _parse_direct_activities(self, subsection_text: str, category_name: str, subsection_title: str) -> List[Activity]:
"""Parse activities that are described directly without 'Exemple de jocuri' section"""
activities = []
# Look for activity descriptions in sections that don't have "Exemple de jocuri"
if "**Exemple de jocuri:**" not in subsection_text:
# Try to extract from file descriptions
file_pattern = r"\*\*Fișier:\*\*\s*`([^`]+)`.*?\*\*(.+?)\*\*"
file_matches = re.finditer(file_pattern, subsection_text, re.DOTALL)
for file_match in file_matches:
file_name = file_match.group(1)
description_part = file_match.group(2)
# Create a general activity for this file
activity = Activity(
name=f"Activități din {file_name}",
description=f"Colecție de activități din fișierul {file_name}. {description_part[:200]}...",
category=category_name,
subcategory=subsection_title,
source_file=file_name,
page_reference=f"{category_name} > {subsection_title}",
**self._extract_subsection_metadata(subsection_text)
)
activities.append(activity)
return activities
def validate_activity_completeness(self, activity: Activity) -> bool:
"""Validate that an activity has all necessary fields"""
required_fields = ['name', 'description', 'category', 'source_file']
for field in required_fields:
if not getattr(activity, field) or not getattr(activity, field).strip():
return False
# Check minimum description length
if len(activity.description) < 10:
return False
return True
def get_parsing_statistics(self) -> Dict:
"""Get statistics about the parsing process"""
if not self.activities:
return {'total_activities': 0}
category_counts = {}
valid_activities = 0
for activity in self.activities:
# Count by category
if activity.category in category_counts:
category_counts[activity.category] += 1
else:
category_counts[activity.category] = 1
# Count valid activities
if self.validate_activity_completeness(activity):
valid_activities += 1
return {
'total_activities': len(self.activities),
'valid_activities': valid_activities,
'completion_rate': (valid_activities / len(self.activities)) * 100 if self.activities else 0,
'category_breakdown': category_counts,
'average_description_length': sum(len(a.description) for a in self.activities) / len(self.activities) if self.activities else 0
}

View File

@@ -5,8 +5,19 @@ Enhanced search with FTS5 and intelligent filtering
from typing import List, Dict, Any, Optional
from app.models.database import DatabaseManager
from app.config_taxonomy import NON_GAME_CONTENT_TYPES
import re
# Category slugs that are themselves "non-game" — selecting one of these as a
# category filter also lifts the default non-game content_type exclusion.
NON_GAME_CATEGORIES = {"retete", "cantece-ceremonii"}
# When a Python-side post-filter is active the DB LIMIT is applied *before*
# filtering, so we over-fetch to still satisfy the caller's `limit`.
_OVERSCAN_FACTOR = 5
_OVERSCAN_CAP = 2000
class SearchService:
"""Enhanced search service with intelligent query processing"""
@@ -24,22 +35,72 @@ class SearchService:
if filters is None:
filters = {}
# Process and normalize search text
processed_search = self._process_search_text(search_text)
# Map web filters to database fields
db_filters = self._map_filters_to_db_fields(filters)
# content_type and language are filtered in Python: the DB layer does
# not expose them as query parameters. The DEFAULT search excludes the
# non-game content types (rețete / cântece / ceremonii) — they surface
# only when the user explicitly filters that content_type, or picks a
# non-game category. See plan §6.
content_type, exclude_non_game = self._resolve_content_type_filter(filters)
language = (filters.get('language') or '').strip().lower() or None
post_filtering = bool(content_type or exclude_non_game or language)
# Over-fetch when post-filtering so the final list can still reach `limit`.
fetch_limit = min(limit * _OVERSCAN_FACTOR, _OVERSCAN_CAP) if post_filtering else limit
# Perform database search
results = self.db.search_activities(
search_text=processed_search,
**db_filters,
limit=limit
limit=fetch_limit
)
# Post-process results for relevance and ranking
return self._post_process_results(results, processed_search, filters)
# Apply content_type / language post-filters
results = self._apply_content_type_filter(results, content_type, exclude_non_game)
if language:
results = [r for r in results
if (r.get('language') or '').strip().lower() == language]
# Post-process results for relevance and ranking, then honour `limit`
results = self._post_process_results(results, processed_search, filters)
return results[:limit]
def _resolve_content_type_filter(self, filters: Dict[str, str]):
"""Determine the content_type post-filter.
Returns (explicit_content_type | None, exclude_non_game: bool):
- an explicit `content_type` filter → that value, no exclusion;
- a `category` filter on a non-game category → no exclusion;
- otherwise → default search, exclude non-game content types.
"""
content_type = (filters.get('content_type') or '').strip()
if content_type:
return content_type, False
category = (filters.get('category') or '').strip()
if category in NON_GAME_CATEGORIES:
return None, False
return None, True
def _apply_content_type_filter(self,
results: List[Dict[str, Any]],
content_type: Optional[str],
exclude_non_game: bool) -> List[Dict[str, Any]]:
"""Filter results by content_type (explicit include vs default exclude)."""
if content_type:
return [r for r in results
if (r.get('content_type') or '') == content_type]
if exclude_non_game:
# Rows with NULL/unknown content_type are kept — only the known
# non-game types are dropped from the default search.
return [r for r in results
if (r.get('content_type') or '') not in NON_GAME_CONTENT_TYPES]
return results
def _process_search_text(self, search_text: Optional[str]) -> Optional[str]:
"""Process and enhance search text for better FTS5 results"""
@@ -83,10 +144,16 @@ class SearchService:
if not filter_value or not filter_value.strip():
continue
# content_type / language are NOT database query params — they are
# applied as Python post-filters in search_activities(). Skip them
# here so they never reach DatabaseManager.search_activities().
if filter_key in ('content_type', 'language'):
continue
# Map filter types to database fields
if filter_key == 'category':
db_filters['category'] = filter_value
elif filter_key == 'age_group':
# Parse age range (e.g., "5-8 ani", "12+ ani")
age_match = re.search(r'(\d+)(?:-(\d+))?\s*ani?', filter_value)
@@ -177,21 +244,22 @@ class SearchService:
boost_score = 0
# Check name matches (highest priority)
name_lower = result.get('name', '').lower()
# NB: use `or ''` — nullable columns come back as None, not ''.
name_lower = (result.get('name') or '').lower()
for term in search_terms:
if term in name_lower:
boost_score += 10
if name_lower.startswith(term):
boost_score += 5 # Extra boost for name starts with term
# Check description matches
desc_lower = result.get('description', '').lower()
desc_lower = (result.get('description') or '').lower()
for term in search_terms:
if term in desc_lower:
boost_score += 3
# Check keywords matches
keywords_lower = result.get('keywords', '').lower()
keywords_lower = (result.get('keywords') or '').lower()
for term in search_terms:
if term in keywords_lower:
boost_score += 5
@@ -280,11 +348,14 @@ class SearchService:
return []
try:
# Search for activities that match the partial query
# Search for activities that match the partial query.
# Over-fetch then drop non-game content types so autocomplete
# mirrors the default search (no rețete / cântece / ceremonii).
results = self.db.search_activities(
search_text=f'"{partial_query}"',
limit=limit * 2
limit=limit * 6
)
results = self._apply_content_type_filter(results, None, True)
suggestions = []
seen = set()

View File

@@ -15,7 +15,13 @@
<header class="activity-detail-header">
<div class="activity-title-section">
<h1 class="activity-detail-title">{{ activity.name }}</h1>
<span class="activity-category-badge">{{ activity.category }}</span>
<span class="activity-category-badge">{{ display_names.get(activity.category, activity.category) }}</span>
{% if activity.content_type %}
<span class="activity-content-type-badge">{{ display_names.get(activity.content_type, activity.content_type) }}</span>
{% endif %}
{% if activity.needs_review %}
<span class="activity-badge needs-review" title="Această activitate necesită verificare">⚠ De verificat</span>
{% endif %}
</div>
{% if activity.subcategory %}

View File

@@ -36,7 +36,31 @@
<select name="category" id="category" class="filter-select">
<option value="">Toate categoriile</option>
{% for category in filters.category %}
<option value="{{ category }}">{{ category }}</option>
<option value="{{ category }}">{{ display_names.get(category, category) }}</option>
{% endfor %}
</select>
</div>
{% endif %}
{% if filters.content_type %}
<div class="filter-group">
<label for="content_type" class="filter-label">Tip conținut</label>
<select name="content_type" id="content_type" class="filter-select">
<option value="">Doar jocuri și activități</option>
{% for content_type in filters.content_type %}
<option value="{{ content_type }}">{{ display_names.get(content_type, content_type) }}</option>
{% endfor %}
</select>
</div>
{% endif %}
{% if filters.language %}
<div class="filter-group">
<label for="language" class="filter-label">Limbă</label>
<select name="language" id="language" class="filter-select">
<option value="">Toate limbile</option>
{% for language in filters.language %}
<option value="{{ language }}">{{ display_names.get(language, language) }}</option>
{% endfor %}
</select>
</div>

View File

@@ -24,7 +24,29 @@
<option value="">Toate categoriile</option>
{% for category in filters.category %}
<option value="{{ category }}" {% if applied_filters.category == category %}selected{% endif %}>
{{ category }}
{{ display_names.get(category, category) }}
</option>
{% endfor %}
</select>
{% endif %}
{% if filters.content_type %}
<select name="content_type" class="filter-select compact">
<option value="">Doar jocuri și activități</option>
{% for content_type in filters.content_type %}
<option value="{{ content_type }}" {% if applied_filters.content_type == content_type %}selected{% endif %}>
{{ display_names.get(content_type, content_type) }}
</option>
{% endfor %}
</select>
{% endif %}
{% if filters.language %}
<select name="language" class="filter-select compact">
<option value="">Toate limbile</option>
{% for language in filters.language %}
<option value="{{ language }}" {% if applied_filters.language == language %}selected{% endif %}>
{{ display_names.get(language, language) }}
</option>
{% endfor %}
</select>
@@ -109,7 +131,10 @@
{{ activity.name }}
</a>
</h3>
<span class="activity-category">{{ activity.category }}</span>
<span class="activity-category">{{ display_names.get(activity.category, activity.category) }}</span>
{% if activity.needs_review %}
<span class="activity-badge needs-review" title="Această activitate necesită verificare">⚠ De verificat</span>
{% endif %}
</header>
<div class="activity-content">

View File

@@ -7,11 +7,17 @@ from flask import Blueprint, request, render_template, jsonify, current_app
from app.models.database import DatabaseManager
from app.models.activity import Activity
from app.services.search import SearchService
from app.config_taxonomy import CATEGORIES, CONTENT_TYPES
import os
from pathlib import Path
bp = Blueprint('main', __name__)
# Slug -> Romanian display name. Category and content_type slugs never collide,
# so a single flat map is enough for the UI filter labels.
LANGUAGE_NAMES = {'ro': 'Română', 'en': 'Engleză'}
DISPLAY_NAMES = {**CATEGORIES, **CONTENT_TYPES, **LANGUAGE_NAMES}
# Initialize database manager (will be configured in application factory)
def get_db_manager():
"""Get database manager instance"""
@@ -36,15 +42,17 @@ def index():
# Get database statistics for the interface
stats = db.get_statistics()
return render_template('index.html',
return render_template('index.html',
filters=filter_options,
display_names=DISPLAY_NAMES,
stats=stats)
except Exception as e:
print(f"Error loading main page: {e}")
# Fallback with empty filters
return render_template('index.html',
return render_template('index.html',
filters={},
display_names=DISPLAY_NAMES,
stats={'total_activities': 0})
@bp.route('/search', methods=['GET', 'POST'])
@@ -82,8 +90,9 @@ def search():
search_query=search_query,
applied_filters=filters,
filters=filter_options,
display_names=DISPLAY_NAMES,
results_count=len(activities))
except Exception as e:
print(f"Search error: {e}")
return render_template('results.html',
@@ -91,6 +100,7 @@ def search():
search_query='',
applied_filters={},
filters={},
display_names=DISPLAY_NAMES,
results_count=0,
error=str(e))
@@ -121,6 +131,7 @@ def activity_detail(activity_id):
return render_template('activity.html',
activity=activity,
display_names=DISPLAY_NAMES,
similar_activities=similar_activities)
except Exception as e: