Rebuild extraction pipeline infrastructure (Faza 0 prep)
Implements the approved plan to replace the broken regex/index-master extraction with an LLM-subagent pipeline. Four parallel lanes: Lane A — scripts/extract_common.py (PDF/docx/doc/pptx/html/zip, no max_pages truncation), normalize_sources.py, chunk_sources.py (~20pg chunks + overlap, manifest registry), activity_schema.json. Lane B — app/config_taxonomy.py (16 fixed category slugs), schema rebuilt from scratch in app/models/ with content_type, language, source_files, source_excerpt, normalized_name, extraction_confidence, needs_review; FTS5 + 3 triggers extended with materials_list and skills_developed. Lane C — build_database.py (--rebuild, atomic swap, schema + fuzzy source_excerpt validation, dedup with needs_review band), validate_extractions.py, review_queue.py, new run_extraction.py orchestrator, SUBAGENT_PROMPT.md. Lane D — search.py content_type/language filters (default search excludes non-game content), E7 schema-compat audit; fixed a NULL keywords AttributeError in _boost_search_relevance. Removes 8 orphaned/dead scripts and app/services/parser.py + indexer.py. Adds tests/ (70 passing, 1 skipped — libreoffice absent). Note: Lane D made one additive edit to app/models/database.py (_update_category_counts) to surface content_type/language in get_filter_options, outside its nominal lane boundary but after Lane B completed. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
230
app/config_taxonomy.py
Normal file
230
app/config_taxonomy.py
Normal file
@@ -0,0 +1,230 @@
|
||||
"""
|
||||
Controlled category taxonomy for game-library.
|
||||
|
||||
Single source of truth for activity categories. The DB stores the *slug*;
|
||||
the UI displays the Romanian name. `category` (thematic domain) and
|
||||
`content_type` (form of the content) are INDEPENDENT axes — see plan §2.
|
||||
"""
|
||||
|
||||
import unicodedata
|
||||
import re
|
||||
from typing import Dict, List
|
||||
|
||||
# --- Categories (thematic domain) --------------------------------------------
# Maps each category slug (the value stored in the DB) to its Romanian display
# name (the value shown in the UI). The table holds exactly 16 fixed slugs;
# `altele` is the mandatory fallback and MUST always be present.
CATEGORIES: Dict[str, str] = {
    "jocuri-cercetasesti": "Jocuri cercetășești",
    "team-building": "Team-building",
    "icebreakers": "Icebreakers / spargerea gheții",
    "camp-outdoor": "Tabără și activități în aer liber",
    "wide-games": "Wide games / jocuri de teren",
    "orientare": "Orientare",
    "prim-ajutor": "Prim ajutor",
    "escape-room-puzzle": "Escape room și puzzle",
    "creative-stem": "Creativitate și STEM",
    "sports-active": "Sport și activități fizice",
    "cantece-ceremonii": "Cântece și ceremonii",
    "retete": "Rețete",
    "supravietuire": "Supraviețuire",
    "integrare-incluziune": "Integrare și incluziune",
    "conflict-empatie": "Conflict și empatie",
    "altele": "Altele",
}

# Mandatory fallback slug; must be a key of CATEGORIES. normalize_category
# returns it for empty or unrecognised input.
FALLBACK_CATEGORY = "altele"

# Valid slugs in the declaration order of CATEGORIES (dicts preserve
# insertion order, so this list is stable).
CATEGORY_SLUGS: List[str] = list(CATEGORIES.keys())

# --- Content type (form of the content) --------------------------------------
# Independent axis from `category`: slug -> Romanian display name. The UI
# default search excludes the non-game content types (see plan §6).
CONTENT_TYPES: Dict[str, str] = {
    "joc": "Joc",
    "activitate": "Activitate",
    "reteta": "Rețetă",
    "cantec": "Cântec",
    "ceremonie": "Ceremonie",
}

# Valid content_type slugs in the declaration order of CONTENT_TYPES.
CONTENT_TYPE_SLUGS: List[str] = list(CONTENT_TYPES.keys())

# Content types considered "non-game" — excluded from the default UI search.
# Every entry is also a key of CONTENT_TYPES.
NON_GAME_CONTENT_TYPES: List[str] = ["reteta", "cantec", "ceremonie"]

# Slug returned by normalize_content_type for empty or unrecognised input.
DEFAULT_CONTENT_TYPE = "activitate"
|
||||
|
||||
# --- Aliases -----------------------------------------------------------------
# Map of normalized arbitrary strings -> canonical slug. Keys are already
# diacritic-stripped, lowercased and hyphenated (i.e. in the form _slugify
# produces), so a lookup must slugify its input first. This catches
# legacy / messy values from the old DB and common English/Romanian variants.
#
# Every value is a key of CATEGORIES. normalize_category tests CATEGORIES
# before consulting this table, so the identity entries below
# ("jocuri-cercetasesti", "supravietuire", "altele") are redundant on that
# path; they are harmless and keep the table self-describing.
_CATEGORY_ALIASES: Dict[str, str] = {
    # legacy junk
    "general-activity": "altele",
    "general": "altele",
    "educational": "creative-stem",
    "d": "altele",
    "a": "altele",
    "b": "altele",
    "c": "altele",
    # scouting
    "cercetasie": "jocuri-cercetasesti",
    "cercetasesti": "jocuri-cercetasesti",
    "scout": "jocuri-cercetasesti",
    "scouting": "jocuri-cercetasesti",
    "scout-games": "jocuri-cercetasesti",
    "jocuri-cercetasesti": "jocuri-cercetasesti",
    # team building
    "teambuilding": "team-building",
    "team": "team-building",
    "cooperare": "team-building",
    # icebreakers
    "icebreaker": "icebreakers",
    "spargerea-ghetii": "icebreakers",
    "cunoastere": "icebreakers",
    "energizers": "icebreakers",
    "energizer": "icebreakers",
    # camp / outdoor
    "camp": "camp-outdoor",
    "tabara": "camp-outdoor",
    "outdoor": "camp-outdoor",
    "aer-liber": "camp-outdoor",
    # wide games
    "wide-game": "wide-games",
    "jocuri-de-teren": "wide-games",
    "joc-de-teren": "wide-games",
    "big-games": "wide-games",
    # orientare
    "orienteering": "orientare",
    "navigatie": "orientare",
    # prim ajutor
    "first-aid": "prim-ajutor",
    "primul-ajutor": "prim-ajutor",
    # escape room / puzzle
    "escape-room": "escape-room-puzzle",
    "escaperoom": "escape-room-puzzle",
    "puzzle": "escape-room-puzzle",
    "puzzles": "escape-room-puzzle",
    "ghicitori": "escape-room-puzzle",
    # creative / stem
    "creative": "creative-stem",
    "creativitate": "creative-stem",
    "stem": "creative-stem",
    "arts-and-crafts": "creative-stem",
    "craft": "creative-stem",
    "crafts": "creative-stem",
    "stiinta": "creative-stem",
    # sports
    "sport": "sports-active",
    "sports": "sports-active",
    "sportive": "sports-active",
    "active": "sports-active",
    "miscare": "sports-active",
    "physical": "sports-active",
    # songs / ceremonies
    "cantece": "cantece-ceremonii",
    "cantec": "cantece-ceremonii",
    "songs": "cantece-ceremonii",
    "ceremonii": "cantece-ceremonii",
    "ceremonie": "cantece-ceremonii",
    "ceremony": "cantece-ceremonii",
    # recipes
    "reteta": "retete",
    "recipe": "retete",
    "recipes": "retete",
    "cooking": "retete",
    "gatit": "retete",
    # survival
    "survival": "supravietuire",
    "supravietuire": "supravietuire",
    # inclusion
    "integrare": "integrare-incluziune",
    "incluziune": "integrare-incluziune",
    "inclusion": "integrare-incluziune",
    # conflict / empathy
    "conflict": "conflict-empatie",
    "empatie": "conflict-empatie",
    "empathy": "conflict-empatie",
    "rezolvarea-conflictelor": "conflict-empatie",
    # fallback
    "altele": "altele",
    "other": "altele",
    "others": "altele",
    "misc": "altele",
}
|
||||
|
||||
|
||||
def _slugify(value: str) -> str:
    """Turn free text into a lowercase ASCII slug with hyphen separators.

    Accented letters are reduced to their base letter, the text is
    lowercased, and every run of characters outside ``a-z0-9`` becomes a
    single hyphen. Leading and trailing hyphens are removed.

    Args:
        value: Text to convert. Empty or otherwise falsy input is allowed.

    Returns:
        The slug, or ``""`` when ``value`` is falsy or contains no ASCII
        letters or digits.
    """
    if not value:
        return ""
    # NFKD splits an accented letter into base letter + combining mark
    # (ă -> a + breve, ș -> s + comma below); dropping the marks leaves the
    # plain base letters.
    base_chars = [
        ch
        for ch in unicodedata.normalize("NFKD", value)
        if not unicodedata.combining(ch)
    ]
    lowered = "".join(base_chars).lower().strip()
    hyphenated = re.sub(r"[^a-z0-9]+", "-", lowered)
    return hyphenated.strip("-")
|
||||
|
||||
|
||||
def normalize_category(value: str) -> str:
    """Map an arbitrary string to a valid category slug.

    The input is slugified and then resolved in this order:

    1. exact match against the slugs in ``CATEGORIES``;
    2. lookup in ``_CATEGORY_ALIASES``;
    3. match against the slugified Romanian display names in
       ``CATEGORIES``, so a value such as "Cântece și ceremonii" or
       "Icebreakers / spargerea gheții" resolves to its own slug instead
       of the fallback.

    Args:
        value: Raw category text. Non-string values are converted with
            ``str()``; falsy values are treated as missing.

    Returns:
        One of ``CATEGORY_SLUGS``; ``FALLBACK_CATEGORY`` (`altele`) for
        anything empty or unrecognised.
    """
    if not value:
        return FALLBACK_CATEGORY
    slug = _slugify(str(value))
    if not slug:
        # Input held only punctuation/whitespace.
        return FALLBACK_CATEGORY
    # Exact slug match.
    if slug in CATEGORIES:
        return slug
    # Alias match (kept ahead of the display-name match so existing alias
    # resolutions are unchanged).
    if slug in _CATEGORY_ALIASES:
        return _CATEGORY_ALIASES[slug]
    # Display-name match: several display names contain connectives or
    # slashes ("... și ...", "... / ...") and therefore slugify to something
    # that is neither a slug nor an alias.
    for canonical_slug, display_name in CATEGORIES.items():
        if _slugify(display_name) == slug:
            return canonical_slug
    return FALLBACK_CATEGORY
|
||||
|
||||
|
||||
def normalize_content_type(value: str) -> str:
    """Map an arbitrary string to a valid content_type slug.

    The input is slugified, matched against ``CONTENT_TYPES``, and then
    against a small alias table of Romanian plurals and English
    singular/plural forms. The English plurals are covered for every
    content type, so "recipes", "songs" and "ceremonies" are not
    misclassified as the default type.

    Args:
        value: Raw content-type text. Non-string values are converted with
            ``str()``; falsy values are treated as missing.

    Returns:
        One of ``CONTENT_TYPE_SLUGS``, falling back to
        ``DEFAULT_CONTENT_TYPE`` (`activitate`).
    """
    if not value:
        return DEFAULT_CONTENT_TYPE
    slug = _slugify(str(value))
    if slug in CONTENT_TYPES:
        return slug
    # Light alias handling for plural / English forms.
    aliases = {
        "jocuri": "joc",
        "game": "joc",
        "games": "joc",
        "activitati": "activitate",
        "activity": "activitate",
        "activities": "activitate",
        "retete": "reteta",
        "recipe": "reteta",
        "recipes": "reteta",
        "cantece": "cantec",
        "song": "cantec",
        "songs": "cantec",
        "ceremonii": "ceremonie",
        "ceremony": "ceremonie",
        "ceremonies": "ceremonie",
    }
    return aliases.get(slug, DEFAULT_CONTENT_TYPE)
|
||||
|
||||
|
||||
def is_valid_category(slug: str) -> bool:
    """Report whether ``slug`` is one of the keys of ``CATEGORIES``.

    The check is an exact membership test; the argument is not slugified
    or alias-resolved first.
    """
    known_slugs = CATEGORIES.keys()
    return slug in known_slugs
|
||||
|
||||
|
||||
def category_display_name(slug: str) -> str:
    """Return the Romanian display name registered for a category slug.

    A slug that is not a key of ``CATEGORIES`` is returned unchanged.
    """
    try:
        return CATEGORIES[slug]
    except KeyError:
        return slug
|
||||
|
||||
|
||||
def content_type_display_name(slug: str) -> str:
    """Return the Romanian display name registered for a content_type slug.

    A slug that is not a key of ``CONTENT_TYPES`` is returned unchanged.
    """
    if slug in CONTENT_TYPES:
        return CONTENT_TYPES[slug]
    return slug
|
||||
@@ -5,6 +5,22 @@ Activity data model for INDEX-SISTEM-JOCURI v2.0
|
||||
from dataclasses import dataclass, field
|
||||
from typing import List, Optional, Dict, Any
|
||||
import json
|
||||
import re
|
||||
import unicodedata
|
||||
|
||||
|
||||
def normalize_name(name: str) -> str:
    """Build the dedup key for an activity name.

    The key is the name with diacritics removed, lowercased, trimmed, and
    with every run of whitespace collapsed to one space. It is used as the
    exact-match key for dedup grouping (see plan §4).

    Args:
        name: The activity name; falsy values are allowed.

    Returns:
        The normalized name, or ``""`` when ``name`` is falsy.
    """
    if not name:
        return ""
    # NFKD separates base letters from their combining marks so the marks
    # can be filtered out.
    without_marks = "".join(
        ch
        for ch in unicodedata.normalize("NFKD", name)
        if not unicodedata.combining(ch)
    )
    trimmed = without_marks.lower().strip()
    return re.sub(r"\s+", " ", trimmed)
|
||||
|
||||
@dataclass
|
||||
class Activity:
|
||||
@@ -19,10 +35,19 @@ class Activity:
|
||||
# Categories
|
||||
category: str = ""
|
||||
subcategory: Optional[str] = None
|
||||
|
||||
# content_type is an axis INDEPENDENT of category:
|
||||
# one of joc/activitate/reteta/cantec/ceremonie (see config_taxonomy).
|
||||
content_type: Optional[str] = None
|
||||
|
||||
# Source information
|
||||
source_file: str = ""
|
||||
page_reference: Optional[str] = None
|
||||
# source_files: JSON-encoded list of every source the activity was seen in.
|
||||
# `source_file` (singular) stays as the primary/original source; build_database
|
||||
# (Lane C) accumulates the full list here on dedup-merge.
|
||||
source_files: List[str] = field(default_factory=list)
|
||||
# Short verbatim quote from the source — anti-hallucination anchor.
|
||||
source_excerpt: Optional[str] = None
|
||||
|
||||
# Age and participants
|
||||
age_group_min: Optional[int] = None
|
||||
@@ -44,11 +69,22 @@ class Activity:
|
||||
keywords: Optional[str] = None
|
||||
tags: List[str] = field(default_factory=list)
|
||||
popularity_score: int = 0
|
||||
|
||||
|
||||
# Extraction / language metadata
|
||||
language: Optional[str] = None # 'ro' / 'en'
|
||||
normalized_name: Optional[str] = None # dedup key; auto-derived from name
|
||||
extraction_confidence: Optional[str] = None # 'high' / 'med' / 'low'
|
||||
needs_review: int = 0
|
||||
|
||||
# Database fields
|
||||
id: Optional[int] = None
|
||||
created_at: Optional[str] = None
|
||||
updated_at: Optional[str] = None
|
||||
|
||||
def __post_init__(self):
    """Fill in ``normalized_name`` from ``name`` unless one was supplied.

    An explicitly provided, non-empty ``normalized_name`` is left as is.
    """
    if self.normalized_name:
        return
    self.normalized_name = normalize_name(self.name)
|
||||
|
||||
def to_dict(self) -> Dict[str, Any]:
|
||||
"""Convert activity to dictionary for database storage"""
|
||||
@@ -59,8 +95,11 @@ class Activity:
|
||||
'variations': self.variations,
|
||||
'category': self.category,
|
||||
'subcategory': self.subcategory,
|
||||
'content_type': self.content_type,
|
||||
'source_file': self.source_file,
|
||||
'source_files': json.dumps(self.source_files) if self.source_files else None,
|
||||
'page_reference': self.page_reference,
|
||||
'source_excerpt': self.source_excerpt,
|
||||
'age_group_min': self.age_group_min,
|
||||
'age_group_max': self.age_group_max,
|
||||
'participants_min': self.participants_min,
|
||||
@@ -73,7 +112,11 @@ class Activity:
|
||||
'difficulty_level': self.difficulty_level,
|
||||
'keywords': self.keywords,
|
||||
'tags': json.dumps(self.tags) if self.tags else None,
|
||||
'popularity_score': self.popularity_score
|
||||
'popularity_score': self.popularity_score,
|
||||
'language': self.language,
|
||||
'normalized_name': self.normalized_name or normalize_name(self.name),
|
||||
'extraction_confidence': self.extraction_confidence,
|
||||
'needs_review': self.needs_review,
|
||||
}
|
||||
|
||||
@classmethod
|
||||
@@ -86,7 +129,17 @@ class Activity:
|
||||
tags = json.loads(data['tags'])
|
||||
except (json.JSONDecodeError, TypeError):
|
||||
tags = []
|
||||
|
||||
|
||||
# source_files may arrive as a JSON string (DB) or a list (extraction)
|
||||
source_files = data.get('source_files')
|
||||
if isinstance(source_files, str):
|
||||
try:
|
||||
source_files = json.loads(source_files)
|
||||
except (json.JSONDecodeError, TypeError):
|
||||
source_files = []
|
||||
elif source_files is None:
|
||||
source_files = []
|
||||
|
||||
return cls(
|
||||
id=data.get('id'),
|
||||
name=data.get('name', ''),
|
||||
@@ -95,8 +148,11 @@ class Activity:
|
||||
variations=data.get('variations'),
|
||||
category=data.get('category', ''),
|
||||
subcategory=data.get('subcategory'),
|
||||
content_type=data.get('content_type'),
|
||||
source_file=data.get('source_file', ''),
|
||||
source_files=source_files,
|
||||
page_reference=data.get('page_reference'),
|
||||
source_excerpt=data.get('source_excerpt'),
|
||||
age_group_min=data.get('age_group_min'),
|
||||
age_group_max=data.get('age_group_max'),
|
||||
participants_min=data.get('participants_min'),
|
||||
@@ -110,6 +166,10 @@ class Activity:
|
||||
keywords=data.get('keywords'),
|
||||
tags=tags,
|
||||
popularity_score=data.get('popularity_score', 0),
|
||||
language=data.get('language'),
|
||||
normalized_name=data.get('normalized_name'),
|
||||
extraction_confidence=data.get('extraction_confidence'),
|
||||
needs_review=data.get('needs_review', 0) or 0,
|
||||
created_at=data.get('created_at'),
|
||||
updated_at=data.get('updated_at')
|
||||
)
|
||||
|
||||
@@ -30,6 +30,8 @@ class DatabaseManager:
|
||||
"""Initialize database with v2.0 schema"""
|
||||
with self._get_connection() as conn:
|
||||
# Main activities table
|
||||
# NOTE: schema is rebuilt from scratch (plan §6) — no in-place
|
||||
# migration. The old DB is deleted and recreated by build_database.
|
||||
conn.execute("""
|
||||
CREATE TABLE IF NOT EXISTS activities (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
@@ -39,9 +41,12 @@ class DatabaseManager:
|
||||
variations TEXT,
|
||||
category TEXT NOT NULL,
|
||||
subcategory TEXT,
|
||||
content_type TEXT,
|
||||
source_file TEXT NOT NULL,
|
||||
source_files TEXT,
|
||||
page_reference TEXT,
|
||||
|
||||
source_excerpt TEXT,
|
||||
|
||||
-- Structured parameters
|
||||
age_group_min INTEGER,
|
||||
age_group_max INTEGER,
|
||||
@@ -49,26 +54,34 @@ class DatabaseManager:
|
||||
participants_max INTEGER,
|
||||
duration_min INTEGER,
|
||||
duration_max INTEGER,
|
||||
|
||||
|
||||
-- Categories for filtering
|
||||
materials_category TEXT,
|
||||
materials_list TEXT,
|
||||
skills_developed TEXT,
|
||||
difficulty_level TEXT,
|
||||
|
||||
|
||||
-- Metadata
|
||||
keywords TEXT,
|
||||
tags TEXT,
|
||||
popularity_score INTEGER DEFAULT 0,
|
||||
|
||||
-- Extraction / language metadata
|
||||
language TEXT,
|
||||
normalized_name TEXT,
|
||||
extraction_confidence TEXT,
|
||||
needs_review INTEGER DEFAULT 0,
|
||||
|
||||
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
|
||||
updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
|
||||
)
|
||||
""")
|
||||
|
||||
|
||||
# FTS5 virtual table for search
|
||||
conn.execute("""
|
||||
CREATE VIRTUAL TABLE IF NOT EXISTS activities_fts USING fts5(
|
||||
name, description, rules, variations, keywords,
|
||||
materials_list, skills_developed,
|
||||
content='activities',
|
||||
content_rowid='id'
|
||||
)
|
||||
@@ -92,6 +105,7 @@ class DatabaseManager:
|
||||
"CREATE INDEX IF NOT EXISTS idx_activities_age ON activities(age_group_min, age_group_max)",
|
||||
"CREATE INDEX IF NOT EXISTS idx_activities_participants ON activities(participants_min, participants_max)",
|
||||
"CREATE INDEX IF NOT EXISTS idx_activities_duration ON activities(duration_min, duration_max)",
|
||||
"CREATE INDEX IF NOT EXISTS idx_activities_normalized_name ON activities(normalized_name)",
|
||||
"CREATE INDEX IF NOT EXISTS idx_categories_type ON categories(type)"
|
||||
]
|
||||
|
||||
@@ -102,24 +116,34 @@ class DatabaseManager:
|
||||
conn.execute("""
|
||||
CREATE TRIGGER IF NOT EXISTS activities_fts_insert AFTER INSERT ON activities
|
||||
BEGIN
|
||||
INSERT INTO activities_fts(rowid, name, description, rules, variations, keywords)
|
||||
VALUES (new.id, new.name, new.description, new.rules, new.variations, new.keywords);
|
||||
INSERT INTO activities_fts(rowid, name, description, rules, variations,
|
||||
keywords, materials_list, skills_developed)
|
||||
VALUES (new.id, new.name, new.description, new.rules, new.variations,
|
||||
new.keywords, new.materials_list, new.skills_developed);
|
||||
END
|
||||
""")
|
||||
|
||||
|
||||
conn.execute("""
|
||||
CREATE TRIGGER IF NOT EXISTS activities_fts_delete AFTER DELETE ON activities
|
||||
BEGIN
|
||||
DELETE FROM activities_fts WHERE rowid = old.id;
|
||||
INSERT INTO activities_fts(activities_fts, rowid, name, description, rules,
|
||||
variations, keywords, materials_list, skills_developed)
|
||||
VALUES ('delete', old.id, old.name, old.description, old.rules,
|
||||
old.variations, old.keywords, old.materials_list, old.skills_developed);
|
||||
END
|
||||
""")
|
||||
|
||||
|
||||
conn.execute("""
|
||||
CREATE TRIGGER IF NOT EXISTS activities_fts_update AFTER UPDATE ON activities
|
||||
BEGIN
|
||||
DELETE FROM activities_fts WHERE rowid = old.id;
|
||||
INSERT INTO activities_fts(rowid, name, description, rules, variations, keywords)
|
||||
VALUES (new.id, new.name, new.description, new.rules, new.variations, new.keywords);
|
||||
INSERT INTO activities_fts(activities_fts, rowid, name, description, rules,
|
||||
variations, keywords, materials_list, skills_developed)
|
||||
VALUES ('delete', old.id, old.name, old.description, old.rules,
|
||||
old.variations, old.keywords, old.materials_list, old.skills_developed);
|
||||
INSERT INTO activities_fts(rowid, name, description, rules, variations,
|
||||
keywords, materials_list, skills_developed)
|
||||
VALUES (new.id, new.name, new.description, new.rules, new.variations,
|
||||
new.keywords, new.materials_list, new.skills_developed);
|
||||
END
|
||||
""")
|
||||
|
||||
@@ -179,6 +203,8 @@ class DatabaseManager:
|
||||
"""Update category usage counts"""
|
||||
categories_to_update = [
|
||||
('category', activity.category),
|
||||
('content_type', activity.content_type),
|
||||
('language', activity.language),
|
||||
('age_group', activity.get_age_range_display()),
|
||||
('participants', activity.get_participants_display()),
|
||||
('duration', activity.get_duration_display()),
|
||||
@@ -332,8 +358,11 @@ class DatabaseManager:
|
||||
def clear_database(self):
    """Clear all data from database.

    Empties the ``activities`` table, resets the ``activities_fts`` index
    and empties the ``categories`` table, then commits.
    """
    with self._get_connection() as conn:
        # Deleting from activities fires the delete trigger, which removes
        # the matching FTS rows. The explicit 'delete-all' command then
        # guarantees the external-content FTS index is fully cleared.
        # A plain "DELETE FROM activities_fts" is intentionally not issued:
        # on an external-content table it has to look the rows up in the
        # (already emptied) content table, so 'delete-all' is the reliable
        # way to reset the index.
        conn.execute("DELETE FROM activities")
        conn.execute("INSERT INTO activities_fts(activities_fts) VALUES('delete-all')")
        conn.execute("DELETE FROM categories")
        conn.commit()
|
||||
|
||||
|
||||
@@ -2,8 +2,6 @@
|
||||
Services for INDEX-SISTEM-JOCURI v2.0
|
||||
"""
|
||||
|
||||
from .parser import IndexMasterParser
|
||||
from .indexer import ActivityIndexer
|
||||
from .search import SearchService
|
||||
|
||||
__all__ = ['IndexMasterParser', 'ActivityIndexer', 'SearchService']
|
||||
__all__ = ['SearchService']
|
||||
|
||||
@@ -1,248 +0,0 @@
|
||||
"""
|
||||
Activity indexer service for INDEX-SISTEM-JOCURI v2.0
|
||||
Coordinates parsing and database indexing
|
||||
"""
|
||||
|
||||
from typing import List, Dict, Any
|
||||
from pathlib import Path
|
||||
from app.models.database import DatabaseManager
|
||||
from app.models.activity import Activity
|
||||
from app.services.parser import IndexMasterParser
|
||||
import time
|
||||
|
||||
class ActivityIndexer:
    """Service for indexing activities from INDEX_MASTER into database.

    Couples an ``IndexMasterParser`` (reads and validates activities) with a
    ``DatabaseManager`` (stores them), and reports statistics about each run.
    Progress is reported with ``print``; failures are returned as dicts with
    an ``'error'`` key rather than raised.
    """

    # Number of activities the index is expected to reach; drives the
    # quality gate, the progress percentage and the low-yield warning text.
    TARGET_ACTIVITY_COUNT = 500
    # A full run that yields fewer valid activities than this prints a warning.
    LOW_YIELD_WARNING_THRESHOLD = 100
    # Bucket used when an activity has no displayable value for a dimension.
    UNSPECIFIED_LABEL = "nespecificat"

    def __init__(self, db_manager: DatabaseManager, index_master_path: str):
        """Initialize indexer with database manager and INDEX_MASTER path.

        Args:
            db_manager: Database access object used for all reads and writes.
            index_master_path: Path handed to ``IndexMasterParser``.
        """
        self.db = db_manager
        self.parser = IndexMasterParser(index_master_path)
        self.indexing_stats = {}

    def index_all_activities(self, clear_existing: bool = False) -> Dict[str, Any]:
        """Index all activities from INDEX_MASTER into database.

        Args:
            clear_existing: When true, ``clear_database()`` is called on the
                database manager before parsing.

        Returns:
            On success, a statistics dict with ``'success': True`` and
            ``'inserted_count'``; if only the statistics step failed, a
            reduced dict that still has ``'success': True`` plus an
            ``'error'`` note. On failure, ``{'success': False, 'error': ...}``.
        """
        print("🚀 Starting activity indexing process...")
        start_time = time.time()

        # Clear existing data if requested
        if clear_existing:
            print("🗑️ Clearing existing database...")
            self.db.clear_database()

        # Parse activities from INDEX_MASTER
        print("📖 Parsing INDEX_MASTER file...")
        activities = self.parser.parse_all_categories()

        if not activities:
            print("❌ No activities were parsed!")
            return {'success': False, 'error': 'No activities parsed'}

        # Filter valid activities, reporting each one that is dropped
        valid_activities = []
        for activity in activities:
            if self.parser.validate_activity_completeness(activity):
                valid_activities.append(activity)
            else:
                print(f"⚠️ Skipping incomplete activity: {activity.name[:50]}...")

        print(f"✅ Validated {len(valid_activities)} activities out of {len(activities)} parsed")

        if len(valid_activities) < self.LOW_YIELD_WARNING_THRESHOLD:
            print(
                f"⚠️ Warning: Only {len(valid_activities)} valid activities found. "
                f"Expected {self.TARGET_ACTIVITY_COUNT}+"
            )

        # Bulk insert into database
        print("💾 Inserting activities into database...")
        try:
            inserted_count = self.db.bulk_insert_activities(valid_activities)

            # Rebuild FTS index for optimal search performance
            print("🔍 Rebuilding search index...")
            self.db.rebuild_fts_index()

            indexing_time = time.time() - start_time

            # Statistics are best-effort: the insert already succeeded, so a
            # failure here must not turn the run into a failure.
            try:
                stats = self._generate_indexing_stats(valid_activities, indexing_time)
                stats['inserted_count'] = inserted_count
                stats['success'] = True
            except Exception as e:
                print(f"⚠️ Error generating statistics: {e}")
                stats = {
                    'success': True,
                    'inserted_count': inserted_count,
                    'indexing_time_seconds': indexing_time,
                    'error': f'Stats generation failed: {str(e)}'
                }

            print(f"✅ Indexing complete! {inserted_count} activities indexed in {indexing_time:.2f}s")

            # Verify database state (best-effort as well)
            try:
                db_stats = self.db.get_statistics()
                print(f"📊 Database now contains {db_stats['total_activities']} activities")
            except Exception as e:
                print(f"⚠️ Error getting database statistics: {e}")
                print("📊 Database insertion completed, statistics unavailable")

            return stats

        except Exception as e:
            print(f"❌ Error during database insertion: {e}")
            return {'success': False, 'error': str(e)}

    def index_specific_category(self, category_code: str) -> Dict[str, Any]:
        """Index activities from a specific category only.

        Args:
            category_code: Key of the parser's ``category_mapping``.

        Returns:
            ``{'success': True, ...}`` with the category name and the parsed,
            valid and inserted counts, or ``{'success': False, 'error': ...}``
            when the file cannot be loaded, the code is unknown, nothing is
            parsed, or the insert raises.
        """
        print(f"🎯 Indexing specific category: {category_code}")

        # Load content and parse specific category
        if not self.parser.load_content():
            return {'success': False, 'error': 'Could not load INDEX_MASTER'}

        category_name = self.parser.category_mapping.get(category_code)
        if not category_name:
            return {'success': False, 'error': f'Unknown category code: {category_code}'}

        activities = self.parser.parse_category_section(category_code, category_name)

        if not activities:
            return {'success': False, 'error': f'No activities found in category {category_code}'}

        # Filter valid activities
        valid_activities = [a for a in activities if self.parser.validate_activity_completeness(a)]

        try:
            inserted_count = self.db.bulk_insert_activities(valid_activities)
            return {
                'success': True,
                'category': category_name,
                'inserted_count': inserted_count,
                'total_parsed': len(activities),
                'valid_activities': len(valid_activities)
            }
        except Exception as e:
            return {'success': False, 'error': str(e)}

    def _tally_display_value(self, counts: Dict[str, int], activity: Any,
                             getter_name: str, label: str) -> None:
        """Add one activity to ``counts`` under its display value.

        Calls ``activity.<getter_name>()``; an empty result, or any error
        while obtaining or counting it, is counted under
        ``UNSPECIFIED_LABEL``. ``label`` only appears in the warning text.
        """
        fallback = self.UNSPECIFIED_LABEL
        try:
            key = getattr(activity, getter_name)() or fallback
            counts[key] = counts.get(key, 0) + 1
        except Exception as e:
            print(f"Warning: Error getting {label} for activity {activity.name}: {e}")
            counts[fallback] = counts.get(fallback, 0) + 1

    def _generate_indexing_stats(self, activities: List[Activity], indexing_time: float) -> Dict[str, Any]:
        """Generate comprehensive indexing statistics.

        Args:
            activities: The activities that were indexed.
            indexing_time: Elapsed time of the run in seconds.

        Returns:
            A dict with the elapsed time, the parser's own statistics, the
            distribution of activities per category / age range / duration /
            materials, and a few quality metrics.
        """
        # Get parser statistics
        parser_stats = self.parser.get_parsing_statistics()

        categories: Dict[str, int] = {}
        age_ranges: Dict[str, int] = {}
        durations: Dict[str, int] = {}
        materials: Dict[str, int] = {}

        for activity in activities:
            # Category breakdown
            categories[activity.category] = categories.get(activity.category, 0) + 1
            # Display-based breakdowns; each tolerates a failing getter.
            self._tally_display_value(age_ranges, activity, 'get_age_range_display', 'age range')
            self._tally_display_value(durations, activity, 'get_duration_display', 'duration')
            self._tally_display_value(materials, activity, 'get_materials_display', 'materials')

        return {
            'indexing_time_seconds': indexing_time,
            'parsing_stats': parser_stats,
            'distribution': {
                'categories': categories,
                'age_ranges': age_ranges,
                'durations': durations,
                'materials': materials
            },
            'quality_metrics': {
                'completion_rate': parser_stats.get('completion_rate', 0),
                'average_description_length': parser_stats.get('average_description_length', 0),
                'activities_with_metadata': sum(1 for a in activities if a.age_group_min or a.participants_min or a.duration_min)
            }
        }

    def verify_indexing_quality(self) -> Dict[str, Any]:
        """Verify the quality of indexed data.

        Compares the stored activity count with ``TARGET_ACTIVITY_COUNT`` and
        inspects a sample of up to 10 activities for a missing category or a
        description shorter than 10 characters.

        Returns:
            A report dict; ``'quality_score'`` starts at 100 and loses 10 per
            issue (never below 0). On any error, ``{'error': ...,
            'quality_score': 0}``.
        """
        try:
            # Get database statistics
            db_stats = self.db.get_statistics()

            # Check for minimum activity count
            total_activities = db_stats['total_activities']
            meets_minimum = total_activities >= self.TARGET_ACTIVITY_COUNT

            # Check category distribution
            categories = db_stats.get('categories', {})
            category_coverage = len(categories)

            # Sample some activities to check quality
            sample_activities = self.db.search_activities(limit=10)

            quality_issues = []
            for activity in sample_activities:
                if not activity.get('description') or len(activity['description']) < 10:
                    quality_issues.append(f"Activity {activity.get('name', 'Unknown')} has insufficient description")

                if not activity.get('category'):
                    quality_issues.append(f"Activity {activity.get('name', 'Unknown')} missing category")

            return {
                'total_activities': total_activities,
                'meets_minimum_requirement': meets_minimum,
                'minimum_target': self.TARGET_ACTIVITY_COUNT,
                'category_coverage': category_coverage,
                'expected_categories': len(self.parser.category_mapping),
                'quality_issues': quality_issues,
                'quality_score': max(0, 100 - len(quality_issues) * 10),
                'database_stats': db_stats
            }

        except Exception as e:
            return {'error': str(e), 'quality_score': 0}

    def get_indexing_progress(self) -> Dict[str, Any]:
        """Get current indexing progress and status.

        Returns:
            A dict with the current and target activity counts, the progress
            percentage (capped at 100), a ``'completed'`` / ``'in_progress'``
            status, the indexed category names and the database size in MB.
            On any error, ``{'error': ..., 'status': 'error'}``.
        """
        try:
            db_stats = self.db.get_statistics()

            # Calculate progress towards the activity-count goal
            total_activities = db_stats['total_activities']
            target_activities = self.TARGET_ACTIVITY_COUNT
            progress_percentage = min(100, (total_activities / target_activities) * 100)

            return {
                'current_activities': total_activities,
                'target_activities': target_activities,
                'progress_percentage': progress_percentage,
                'status': 'completed' if total_activities >= target_activities else 'in_progress',
                'categories_indexed': list(db_stats.get('categories', {}).keys()),
                'database_size_mb': db_stats.get('database_size_bytes', 0) / (1024 * 1024)
            }

        except Exception as e:
            return {'error': str(e), 'status': 'error'}
|
||||
@@ -1,340 +0,0 @@
|
||||
"""
|
||||
Advanced parser for INDEX_MASTER_JOCURI_ACTIVITATI.md
|
||||
Extracts 500+ individual activities with full details
|
||||
"""
|
||||
|
||||
import re
|
||||
from pathlib import Path
|
||||
from typing import List, Dict, Optional, Tuple
|
||||
from app.models.activity import Activity
|
||||
|
||||
class IndexMasterParser:
    """Extract individual activities from INDEX_MASTER_JOCURI_ACTIVITATI.md.

    The parser looks for ``## [X] CATEGORY`` headers, splits each category
    into ``### subsection`` blocks, and inside every subsection reads:

    * bold-labelled metadata (``**Participanți:**``, ``**Durata:**``,
      ``**Materiale:**``, ``**Cuvinte cheie:**``);
    * the numbered list that follows ``**Exemple de jocuri:**``;
    * ``**Fișier:**`` references, when the subsection has no examples list.
    """

    # Participant-count range such as "8-30". The lookahead rejects a range
    # immediately followed by the word "ani" (years): that is an age range,
    # not a head count. The ``\b`` before the lookahead stops the regex from
    # backtracking to a shorter number ("8-1" out of "8-12 ani").
    _PARTICIPANT_RANGE_RE = re.compile(r"(\d+)-(\d+)\b(?!\s*ani\b)")
    # Open-ended participant count such as "10+".
    _PARTICIPANT_MIN_RE = re.compile(r"(\d+)\+")
    # Age range such as "7-12 ani"; ``\b`` keeps "animatori" from matching.
    _AGE_RANGE_RE = re.compile(r"(\d+)-(\d+)\s*ani\b")

    def __init__(self, index_file_path: str):
        """Remember the INDEX_MASTER path; nothing is read until parsing."""
        self.index_file_path = Path(index_file_path)
        self.content = ""
        self.activities: List["Activity"] = []

        # Category code -> section title, exactly as written in the file.
        self.category_mapping = {
            '[A]': 'JOCURI CERCETĂȘEȘTI ȘI SCOUT',
            '[B]': 'TEAM BUILDING ȘI COMUNICARE',
            '[C]': 'CAMPING ȘI ACTIVITĂȚI EXTERIOR',
            '[D]': 'ESCAPE ROOM ȘI PUZZLE-URI',
            '[E]': 'ORIENTARE ȘI BUSOLE',
            '[F]': 'PRIMUL AJUTOR ȘI SIGURANȚA',
            '[G]': 'ACTIVITĂȚI EDUCAȚIONALE',
            '[H]': 'RESURSE SPECIALE'
        }

    def load_content(self) -> bool:
        """Read the INDEX_MASTER file into ``self.content``.

        Returns:
            True when the file exists and holds at least 1000 characters;
            False (after printing a diagnostic) otherwise.
        """
        try:
            if not self.index_file_path.exists():
                print(f"❌ INDEX_MASTER file not found: {self.index_file_path}")
                return False

            with open(self.index_file_path, 'r', encoding='utf-8') as f:
                self.content = f.read()

            if len(self.content) < 1000:  # Sanity check
                print(f"⚠️ INDEX_MASTER file seems too small: {len(self.content)} chars")
                return False

            print(f"✅ Loaded INDEX_MASTER: {len(self.content)} characters")
            return True

        except Exception as e:
            # Best-effort loader: any failure is reported and mapped to False.
            print(f"❌ Error loading INDEX_MASTER: {e}")
            return False

    def parse_all_categories(self) -> List["Activity"]:
        """Parse every known category and return all extracted activities.

        Returns an empty list when the file cannot be loaded. The result is
        also stored in ``self.activities``.
        """
        if not self.load_content():
            return []

        # Start from a clean slate so calling this twice does not duplicate
        # every activity.
        self.activities = []

        print("🔍 Starting comprehensive parsing of INDEX_MASTER...")

        for category_code, category_name in self.category_mapping.items():
            print(f"\n📂 Processing category {category_code}: {category_name}")
            category_activities = self.parse_category_section(category_code, category_name)
            self.activities.extend(category_activities)
            print(f"   ✅ Extracted {len(category_activities)} activities")

        print(f"\n🎯 Total activities extracted: {len(self.activities)}")
        return self.activities

    def parse_category_section(self, category_code: str, category_name: str) -> List["Activity"]:
        """Parse the section headed ``## <category_code> <category_name>``.

        Returns an empty list when no such header exists in ``self.content``.
        """
        pattern = rf"^## {re.escape(category_code)} {re.escape(category_name)}\s*$"
        matches = list(re.finditer(pattern, self.content, re.MULTILINE | re.IGNORECASE))

        if not matches:
            print(f"   ⚠️ Category section not found: {category_code}")
            return []

        # The header may also appear in the table of contents; the last
        # occurrence is taken to be the real section.
        match = matches[-1]
        print(f"   📍 Found section at position {match.start()}")

        # The section runs until the next main-category header, or to EOF.
        start_pos = match.end()
        next_category_pattern = r"^## \[[A-H]\] [A-ZĂÂÎȘȚ]"
        next_match = re.search(next_category_pattern, self.content[start_pos:], re.MULTILINE)
        end_pos = start_pos + next_match.start() if next_match else len(self.content)

        return self._parse_subsections(self.content[start_pos:end_pos], category_name)

    def _parse_subsections(self, section_content: str, category_name: str) -> List["Activity"]:
        """Split a category section on ``###`` headers and parse each part."""
        activities: List["Activity"] = []
        headers = list(re.finditer(r"^### (.+?)$", section_content, re.MULTILINE))

        for i, header in enumerate(headers):
            # A subsection ends where the next header starts (or at the end
            # of the category section for the last one).
            if i + 1 < len(headers):
                subsection_end = headers[i + 1].start()
            else:
                subsection_end = len(section_content)

            activities.extend(
                self._parse_games_in_subsection(
                    section_content[header.end():subsection_end],
                    category_name,
                    header.group(1).strip(),
                )
            )

        return activities

    def _parse_games_in_subsection(self, subsection_text: str, category_name: str, subsection_title: str) -> List["Activity"]:
        """Build one activity per numbered entry under "Exemple de jocuri".

        Subsections without such a list are handed to
        ``_parse_direct_activities`` instead.
        """
        activities: List["Activity"] = []

        # Metadata belongs to the subsection, not to a single game, so it is
        # extracted once instead of once per game.
        metadata = self._extract_subsection_metadata(subsection_text)

        examples_pattern = r"\*\*Exemple de jocuri:\*\*\s*\n(.*?)(?=\n\*\*|$)"
        # Numbered entries shaped like: 1. **Name** - description
        game_pattern = r"^(\d+)\.\s*\*\*(.+?)\*\*\s*-\s*(.+?)$"

        for examples_match in re.finditer(examples_pattern, subsection_text, re.DOTALL):
            examples_text = examples_match.group(1)

            for game_match in re.finditer(game_pattern, examples_text, re.MULTILINE):
                game_number = game_match.group(1)

                activities.append(Activity(
                    name=game_match.group(2).strip(),
                    description=game_match.group(3).strip(),
                    category=category_name,
                    subcategory=subsection_title,
                    source_file="INDEX_MASTER_JOCURI_ACTIVITATI.md",
                    page_reference=f"{category_name} > {subsection_title} > #{game_number}",
                    **metadata
                ))

        # Also pick up subsections described only through file references.
        activities.extend(self._parse_direct_activities(subsection_text, category_name, subsection_title))

        return activities

    @staticmethod
    def _labelled_value(label: str, text: str) -> Optional[str]:
        """Return the stripped text after ``**<label>:**``, or None if absent.

        The value ends at the first newline, the next ``**`` marker, or the
        end of ``text``. The end-of-text alternative matters when the label
        is the last line of a subsection and has no trailing newline.
        """
        match = re.search(rf"\*\*{re.escape(label)}:\*\*\s*(.+?)(?:\n|\*\*|$)", text)
        return match.group(1).strip() if match else None

    def _extract_subsection_metadata(self, subsection_text: str) -> Dict:
        """Collect participants, duration, materials and keywords.

        Returns a dict holding only the keys that could be extracted; the
        keys match the keyword arguments passed on to ``Activity``.
        """
        metadata: Dict = {}

        participants_text = self._labelled_value("Participanți", subsection_text)
        if participants_text is not None:
            metadata.update(self._parse_participants(participants_text))

        duration_text = self._labelled_value("Durata", subsection_text)
        if duration_text is not None:
            metadata.update(self._parse_duration(duration_text))

        materials_text = self._labelled_value("Materiale", subsection_text)
        if materials_text is not None:
            metadata['materials_list'] = materials_text
            metadata['materials_category'] = self._categorize_materials(materials_text)

        keywords_text = self._labelled_value("Cuvinte cheie", subsection_text)
        if keywords_text is not None:
            metadata['keywords'] = keywords_text

        return metadata

    def _parse_participants(self, participants_text: str) -> Dict:
        """Parse participant counts and the age range from free text.

        Recognises "8-30 copii" (min/max), "10+" (min only) and "7-12 ani"
        (age range). A range followed by "ani" is never read as a
        participant count.
        """
        result: Dict = {}

        range_match = self._PARTICIPANT_RANGE_RE.search(participants_text)
        if range_match:
            result['participants_min'] = int(range_match.group(1))
            result['participants_max'] = int(range_match.group(2))
        else:
            number_match = self._PARTICIPANT_MIN_RE.search(participants_text)
            if number_match:
                result['participants_min'] = int(number_match.group(1))

        age_match = self._AGE_RANGE_RE.search(participants_text)
        if age_match:
            result['age_group_min'] = int(age_match.group(1))
            result['age_group_max'] = int(age_match.group(2))

        return result

    def _parse_duration(self, duration_text: str) -> Dict:
        """Parse a duration in minutes: "5-20 minute", "15-30min", "45+ min"."""
        result: Dict = {}

        range_match = re.search(r"(\d+)-(\d+)\s*(?:minute|min)", duration_text)
        if range_match:
            result['duration_min'] = int(range_match.group(1))
            result['duration_max'] = int(range_match.group(2))
        else:
            single_match = re.search(r"(\d+)\+?\s*(?:minute|min)", duration_text)
            if single_match:
                result['duration_min'] = int(single_match.group(1))

        return result

    def _categorize_materials(self, materials_text: str) -> str:
        """Map a free-text materials description to one of four labels.

        Keyword groups are checked in order; the first group with a
        substring hit wins.
        """
        materials_lower = materials_text.lower()

        keyword_groups = (
            ('Fără materiale', ('fără', 'nu necesare', 'nimic', 'minime')),
            ('Materiale simple', ('hârtie', 'creion', 'marker', 'simple')),
            ('Materiale complexe', ('computer', 'proiector', 'echipament', 'complexe')),
        )
        for label, keywords in keyword_groups:
            if any(word in materials_lower for word in keywords):
                return label

        return 'Materiale variate'

    def _parse_direct_activities(self, subsection_text: str, category_name: str, subsection_title: str) -> List["Activity"]:
        """Create one collection-level activity per ``**Fișier:**`` reference.

        Only applies to subsections that have no "Exemple de jocuri" list.
        """
        activities: List["Activity"] = []

        if "**Exemple de jocuri:**" in subsection_text:
            return activities

        # Group 1 is the back-quoted file name; group 2 is the next bold
        # span after it, which is used as a description snippet.
        file_pattern = r"\*\*Fișier:\*\*\s*`([^`]+)`.*?\*\*(.+?)\*\*"
        file_matches = list(re.finditer(file_pattern, subsection_text, re.DOTALL))
        if not file_matches:
            return activities

        metadata = self._extract_subsection_metadata(subsection_text)

        for file_match in file_matches:
            file_name = file_match.group(1)
            description_part = file_match.group(2)

            activities.append(Activity(
                name=f"Activități din {file_name}",
                description=f"Colecție de activități din fișierul {file_name}. {description_part[:200]}...",
                category=category_name,
                subcategory=subsection_title,
                source_file=file_name,
                page_reference=f"{category_name} > {subsection_title}",
                **metadata
            ))

        return activities

    def validate_activity_completeness(self, activity: "Activity") -> bool:
        """Return True when the activity has all required, non-blank fields.

        Required: ``name``, ``description``, ``category``, ``source_file``;
        the description must also be at least 10 characters long. A missing
        or non-string field counts as incomplete instead of raising.
        """
        required_fields = ['name', 'description', 'category', 'source_file']

        for field_name in required_fields:
            value = getattr(activity, field_name, None)
            if not isinstance(value, str) or not value.strip():
                return False

        return len(activity.description) >= 10

    def get_parsing_statistics(self) -> Dict:
        """Summarise ``self.activities``.

        Returns ``{'total_activities': 0}`` when nothing has been parsed;
        otherwise totals, the share of complete activities, a per-category
        count and the mean description length.
        """
        if not self.activities:
            return {'total_activities': 0}

        total = len(self.activities)
        category_counts: Dict = {}
        valid_activities = 0
        description_chars = 0

        for activity in self.activities:
            category_counts[activity.category] = category_counts.get(activity.category, 0) + 1
            if self.validate_activity_completeness(activity):
                valid_activities += 1
            # A missing description counts as zero length.
            description_chars += len(activity.description or '')

        return {
            'total_activities': total,
            'valid_activities': valid_activities,
            'completion_rate': (valid_activities / total) * 100,
            'category_breakdown': category_counts,
            'average_description_length': description_chars / total
        }
|
||||
@@ -5,8 +5,19 @@ Enhanced search with FTS5 and intelligent filtering
|
||||
|
||||
from typing import List, Dict, Any, Optional
|
||||
from app.models.database import DatabaseManager
|
||||
from app.config_taxonomy import NON_GAME_CONTENT_TYPES
|
||||
import re
|
||||
|
||||
# Category slugs that are themselves "non-game" — selecting one of these as a
|
||||
# category filter also lifts the default non-game content_type exclusion.
|
||||
NON_GAME_CATEGORIES = {"retete", "cantece-ceremonii"}
|
||||
|
||||
# When a Python-side post-filter is active the DB LIMIT is applied *before*
|
||||
# filtering, so we over-fetch to still satisfy the caller's `limit`.
|
||||
_OVERSCAN_FACTOR = 5
|
||||
_OVERSCAN_CAP = 2000
|
||||
|
||||
|
||||
class SearchService:
|
||||
"""Enhanced search service with intelligent query processing"""
|
||||
|
||||
@@ -24,22 +35,72 @@ class SearchService:
|
||||
|
||||
if filters is None:
|
||||
filters = {}
|
||||
|
||||
|
||||
# Process and normalize search text
|
||||
processed_search = self._process_search_text(search_text)
|
||||
|
||||
|
||||
# Map web filters to database fields
|
||||
db_filters = self._map_filters_to_db_fields(filters)
|
||||
|
||||
|
||||
# content_type and language are filtered in Python: the DB layer does
|
||||
# not expose them as query parameters. The DEFAULT search excludes the
|
||||
# non-game content types (rețete / cântece / ceremonii) — they surface
|
||||
# only when the user explicitly filters that content_type, or picks a
|
||||
# non-game category. See plan §6.
|
||||
content_type, exclude_non_game = self._resolve_content_type_filter(filters)
|
||||
language = (filters.get('language') or '').strip().lower() or None
|
||||
post_filtering = bool(content_type or exclude_non_game or language)
|
||||
|
||||
# Over-fetch when post-filtering so the final list can still reach `limit`.
|
||||
fetch_limit = min(limit * _OVERSCAN_FACTOR, _OVERSCAN_CAP) if post_filtering else limit
|
||||
|
||||
# Perform database search
|
||||
results = self.db.search_activities(
|
||||
search_text=processed_search,
|
||||
**db_filters,
|
||||
limit=limit
|
||||
limit=fetch_limit
|
||||
)
|
||||
|
||||
# Post-process results for relevance and ranking
|
||||
return self._post_process_results(results, processed_search, filters)
|
||||
|
||||
# Apply content_type / language post-filters
|
||||
results = self._apply_content_type_filter(results, content_type, exclude_non_game)
|
||||
if language:
|
||||
results = [r for r in results
|
||||
if (r.get('language') or '').strip().lower() == language]
|
||||
|
||||
# Post-process results for relevance and ranking, then honour `limit`
|
||||
results = self._post_process_results(results, processed_search, filters)
|
||||
return results[:limit]
|
||||
|
||||
def _resolve_content_type_filter(self, filters: Dict[str, str]):
|
||||
"""Determine the content_type post-filter.
|
||||
|
||||
Returns (explicit_content_type | None, exclude_non_game: bool):
|
||||
- an explicit `content_type` filter → that value, no exclusion;
|
||||
- a `category` filter on a non-game category → no exclusion;
|
||||
- otherwise → default search, exclude non-game content types.
|
||||
"""
|
||||
content_type = (filters.get('content_type') or '').strip()
|
||||
if content_type:
|
||||
return content_type, False
|
||||
category = (filters.get('category') or '').strip()
|
||||
if category in NON_GAME_CATEGORIES:
|
||||
return None, False
|
||||
return None, True
|
||||
|
||||
def _apply_content_type_filter(self,
|
||||
results: List[Dict[str, Any]],
|
||||
content_type: Optional[str],
|
||||
exclude_non_game: bool) -> List[Dict[str, Any]]:
|
||||
"""Filter results by content_type (explicit include vs default exclude)."""
|
||||
if content_type:
|
||||
return [r for r in results
|
||||
if (r.get('content_type') or '') == content_type]
|
||||
if exclude_non_game:
|
||||
# Rows with NULL/unknown content_type are kept — only the known
|
||||
# non-game types are dropped from the default search.
|
||||
return [r for r in results
|
||||
if (r.get('content_type') or '') not in NON_GAME_CONTENT_TYPES]
|
||||
return results
|
||||
|
||||
def _process_search_text(self, search_text: Optional[str]) -> Optional[str]:
|
||||
"""Process and enhance search text for better FTS5 results"""
|
||||
@@ -83,10 +144,16 @@ class SearchService:
|
||||
if not filter_value or not filter_value.strip():
|
||||
continue
|
||||
|
||||
# content_type / language are NOT database query params — they are
|
||||
# applied as Python post-filters in search_activities(). Skip them
|
||||
# here so they never reach DatabaseManager.search_activities().
|
||||
if filter_key in ('content_type', 'language'):
|
||||
continue
|
||||
|
||||
# Map filter types to database fields
|
||||
if filter_key == 'category':
|
||||
db_filters['category'] = filter_value
|
||||
|
||||
|
||||
elif filter_key == 'age_group':
|
||||
# Parse age range (e.g., "5-8 ani", "12+ ani")
|
||||
age_match = re.search(r'(\d+)(?:-(\d+))?\s*ani?', filter_value)
|
||||
@@ -177,21 +244,22 @@ class SearchService:
|
||||
boost_score = 0
|
||||
|
||||
# Check name matches (highest priority)
|
||||
name_lower = result.get('name', '').lower()
|
||||
# NB: use `or ''` — nullable columns come back as None, not ''.
|
||||
name_lower = (result.get('name') or '').lower()
|
||||
for term in search_terms:
|
||||
if term in name_lower:
|
||||
boost_score += 10
|
||||
if name_lower.startswith(term):
|
||||
boost_score += 5 # Extra boost for name starts with term
|
||||
|
||||
|
||||
# Check description matches
|
||||
desc_lower = result.get('description', '').lower()
|
||||
desc_lower = (result.get('description') or '').lower()
|
||||
for term in search_terms:
|
||||
if term in desc_lower:
|
||||
boost_score += 3
|
||||
|
||||
|
||||
# Check keywords matches
|
||||
keywords_lower = result.get('keywords', '').lower()
|
||||
keywords_lower = (result.get('keywords') or '').lower()
|
||||
for term in search_terms:
|
||||
if term in keywords_lower:
|
||||
boost_score += 5
|
||||
@@ -280,11 +348,14 @@ class SearchService:
|
||||
return []
|
||||
|
||||
try:
|
||||
# Search for activities that match the partial query
|
||||
# Search for activities that match the partial query.
|
||||
# Over-fetch then drop non-game content types so autocomplete
|
||||
# mirrors the default search (no rețete / cântece / ceremonii).
|
||||
results = self.db.search_activities(
|
||||
search_text=f'"{partial_query}"',
|
||||
limit=limit * 2
|
||||
limit=limit * 6
|
||||
)
|
||||
results = self._apply_content_type_filter(results, None, True)
|
||||
|
||||
suggestions = []
|
||||
seen = set()
|
||||
|
||||
@@ -15,7 +15,13 @@
|
||||
<header class="activity-detail-header">
|
||||
<div class="activity-title-section">
|
||||
<h1 class="activity-detail-title">{{ activity.name }}</h1>
|
||||
<span class="activity-category-badge">{{ activity.category }}</span>
|
||||
<span class="activity-category-badge">{{ display_names.get(activity.category, activity.category) }}</span>
|
||||
{% if activity.content_type %}
|
||||
<span class="activity-content-type-badge">{{ display_names.get(activity.content_type, activity.content_type) }}</span>
|
||||
{% endif %}
|
||||
{% if activity.needs_review %}
|
||||
<span class="activity-badge needs-review" title="Această activitate necesită verificare">⚠ De verificat</span>
|
||||
{% endif %}
|
||||
</div>
|
||||
|
||||
{% if activity.subcategory %}
|
||||
|
||||
@@ -36,7 +36,31 @@
|
||||
<select name="category" id="category" class="filter-select">
|
||||
<option value="">Toate categoriile</option>
|
||||
{% for category in filters.category %}
|
||||
<option value="{{ category }}">{{ category }}</option>
|
||||
<option value="{{ category }}">{{ display_names.get(category, category) }}</option>
|
||||
{% endfor %}
|
||||
</select>
|
||||
</div>
|
||||
{% endif %}
|
||||
|
||||
{% if filters.content_type %}
|
||||
<div class="filter-group">
|
||||
<label for="content_type" class="filter-label">Tip conținut</label>
|
||||
<select name="content_type" id="content_type" class="filter-select">
|
||||
<option value="">Doar jocuri și activități</option>
|
||||
{% for content_type in filters.content_type %}
|
||||
<option value="{{ content_type }}">{{ display_names.get(content_type, content_type) }}</option>
|
||||
{% endfor %}
|
||||
</select>
|
||||
</div>
|
||||
{% endif %}
|
||||
|
||||
{% if filters.language %}
|
||||
<div class="filter-group">
|
||||
<label for="language" class="filter-label">Limbă</label>
|
||||
<select name="language" id="language" class="filter-select">
|
||||
<option value="">Toate limbile</option>
|
||||
{% for language in filters.language %}
|
||||
<option value="{{ language }}">{{ display_names.get(language, language) }}</option>
|
||||
{% endfor %}
|
||||
</select>
|
||||
</div>
|
||||
|
||||
@@ -24,7 +24,29 @@
|
||||
<option value="">Toate categoriile</option>
|
||||
{% for category in filters.category %}
|
||||
<option value="{{ category }}" {% if applied_filters.category == category %}selected{% endif %}>
|
||||
{{ category }}
|
||||
{{ display_names.get(category, category) }}
|
||||
</option>
|
||||
{% endfor %}
|
||||
</select>
|
||||
{% endif %}
|
||||
|
||||
{% if filters.content_type %}
|
||||
<select name="content_type" class="filter-select compact">
|
||||
<option value="">Doar jocuri și activități</option>
|
||||
{% for content_type in filters.content_type %}
|
||||
<option value="{{ content_type }}" {% if applied_filters.content_type == content_type %}selected{% endif %}>
|
||||
{{ display_names.get(content_type, content_type) }}
|
||||
</option>
|
||||
{% endfor %}
|
||||
</select>
|
||||
{% endif %}
|
||||
|
||||
{% if filters.language %}
|
||||
<select name="language" class="filter-select compact">
|
||||
<option value="">Toate limbile</option>
|
||||
{% for language in filters.language %}
|
||||
<option value="{{ language }}" {% if applied_filters.language == language %}selected{% endif %}>
|
||||
{{ display_names.get(language, language) }}
|
||||
</option>
|
||||
{% endfor %}
|
||||
</select>
|
||||
@@ -109,7 +131,10 @@
|
||||
{{ activity.name }}
|
||||
</a>
|
||||
</h3>
|
||||
<span class="activity-category">{{ activity.category }}</span>
|
||||
<span class="activity-category">{{ display_names.get(activity.category, activity.category) }}</span>
|
||||
{% if activity.needs_review %}
|
||||
<span class="activity-badge needs-review" title="Această activitate necesită verificare">⚠ De verificat</span>
|
||||
{% endif %}
|
||||
</header>
|
||||
|
||||
<div class="activity-content">
|
||||
|
||||
@@ -7,11 +7,17 @@ from flask import Blueprint, request, render_template, jsonify, current_app
|
||||
from app.models.database import DatabaseManager
|
||||
from app.models.activity import Activity
|
||||
from app.services.search import SearchService
|
||||
from app.config_taxonomy import CATEGORIES, CONTENT_TYPES
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
bp = Blueprint('main', __name__)
|
||||
|
||||
# Slug -> Romanian display name. Category and content_type slugs never collide,
|
||||
# so a single flat map is enough for the UI filter labels.
|
||||
LANGUAGE_NAMES = {'ro': 'Română', 'en': 'Engleză'}
|
||||
DISPLAY_NAMES = {**CATEGORIES, **CONTENT_TYPES, **LANGUAGE_NAMES}
|
||||
|
||||
# Initialize database manager (will be configured in application factory)
|
||||
def get_db_manager():
|
||||
"""Get database manager instance"""
|
||||
@@ -36,15 +42,17 @@ def index():
|
||||
# Get database statistics for the interface
|
||||
stats = db.get_statistics()
|
||||
|
||||
return render_template('index.html',
|
||||
return render_template('index.html',
|
||||
filters=filter_options,
|
||||
display_names=DISPLAY_NAMES,
|
||||
stats=stats)
|
||||
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error loading main page: {e}")
|
||||
# Fallback with empty filters
|
||||
return render_template('index.html',
|
||||
return render_template('index.html',
|
||||
filters={},
|
||||
display_names=DISPLAY_NAMES,
|
||||
stats={'total_activities': 0})
|
||||
|
||||
@bp.route('/search', methods=['GET', 'POST'])
|
||||
@@ -82,8 +90,9 @@ def search():
|
||||
search_query=search_query,
|
||||
applied_filters=filters,
|
||||
filters=filter_options,
|
||||
display_names=DISPLAY_NAMES,
|
||||
results_count=len(activities))
|
||||
|
||||
|
||||
except Exception as e:
|
||||
print(f"Search error: {e}")
|
||||
return render_template('results.html',
|
||||
@@ -91,6 +100,7 @@ def search():
|
||||
search_query='',
|
||||
applied_filters={},
|
||||
filters={},
|
||||
display_names=DISPLAY_NAMES,
|
||||
results_count=0,
|
||||
error=str(e))
|
||||
|
||||
@@ -121,6 +131,7 @@ def activity_detail(activity_id):
|
||||
|
||||
return render_template('activity.html',
|
||||
activity=activity,
|
||||
display_names=DISPLAY_NAMES,
|
||||
similar_activities=similar_activities)
|
||||
|
||||
except Exception as e:
|
||||
|
||||
Reference in New Issue
Block a user