Rebuild extraction pipeline infrastructure (Faza 0 prep)

Implements the approved plan to replace the broken regex/index-master extraction with an LLM-subagent pipeline. Four parallel lanes: Lane A — scripts/extract_common.py (PDF/docx/doc/pptx/html/zip, no max_pages truncation), normalize_sources.py, chunk_sources.py (~20pg chunks + overlap, manifest registry), activity_schema.json. Lane B — app/config_taxonomy.py (16 fixed category slugs), schema rebuilt from scratch in app/models/ with content_type, language, source_files, source_excerpt, normalized_name, extraction_confidence, needs_review; FTS5 + 3 triggers extended with materials_list and skills_developed. Lane C — build_database.py (--rebuild, atomic swap, schema + fuzzy source_excerpt validation, dedup with needs_review band), validate_extractions.py, review_queue.py, new run_extraction.py orchestrator, SUBAGENT_PROMPT.md. Lane D — search.py content_type/language filters (default search excludes non-game content), E7 schema-compat audit; fixed a NULL keywords AttributeError in _boost_search_relevance. Removes 8 orphaned/dead scripts and app/services/parser.py + indexer.py. Adds tests/ (70 passing, 1 skipped — libreoffice absent). Note: Lane D made one additive edit to app/models/database.py (_update_category_counts) to surface content_type/language in get_filter_options, outside its nominal lane boundary but after Lane B completed. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-19 17:43:38 +00:00
parent e0080edf85
commit 66ae831c36
37 changed files with 4101 additions and 1881 deletions
--- a/app/config_taxonomy.py
+++ b/app/config_taxonomy.py
@@ -0,0 +1,230 @@
+"""
+Controlled category taxonomy for game-library.
+
+Single source of truth for activity categories. The DB stores the *slug*;
+the UI displays the Romanian name. `category` (thematic domain) and
+`content_type` (form of the content) are INDEPENDENT axes — see plan §2.
+"""
+
+import unicodedata
+import re
+from typing import Dict, List
+
+# --- Categories (thematic domain) --------------------------------------------
+# Canonical slug -> Romanian display name (16 entries). The catch-all
+# `altele` entry is required and must never be removed.
+CATEGORIES: Dict[str, str] = {
+    "jocuri-cercetasesti": "Jocuri cercetășești",
+    "team-building": "Team-building",
+    "icebreakers": "Icebreakers / spargerea gheții",
+    "camp-outdoor": "Tabără și activități în aer liber",
+    "wide-games": "Wide games / jocuri de teren",
+    "orientare": "Orientare",
+    "prim-ajutor": "Prim ajutor",
+    "escape-room-puzzle": "Escape room și puzzle",
+    "creative-stem": "Creativitate și STEM",
+    "sports-active": "Sport și activități fizice",
+    "cantece-ceremonii": "Cântece și ceremonii",
+    "retete": "Rețete",
+    "supravietuire": "Supraviețuire",
+    "integrare-incluziune": "Integrare și incluziune",
+    "conflict-empatie": "Conflict și empatie",
+    "altele": "Altele",
+}
+
+# Slug returned whenever a category cannot be resolved.
+FALLBACK_CATEGORY = "altele"
+
+# Valid slugs, in declaration order.
+CATEGORY_SLUGS: List[str] = list(CATEGORIES)
+
+# --- Content type (form of the content) --------------------------------------
+# Slug -> Romanian display name. This axis is independent of `category`: it
+# describes the form of an entry rather than its thematic domain.
+CONTENT_TYPES: Dict[str, str] = {
+    "joc": "Joc",
+    "activitate": "Activitate",
+    "reteta": "Rețetă",
+    "cantec": "Cântec",
+    "ceremonie": "Ceremonie",
+}
+
+# Valid slugs, in declaration order.
+CONTENT_TYPE_SLUGS: List[str] = list(CONTENT_TYPES)
+# Forms that are not games; the default UI search leaves these out (plan §6).
+NON_GAME_CONTENT_TYPES: List[str] = ["reteta", "cantec", "ceremonie"]
+# Slug used when a content type is missing or not recognised.
+DEFAULT_CONTENT_TYPE = "activitate"
+
+# --- Aliases -----------------------------------------------------------------
+# Normalized free-form string -> canonical category slug. Keys must already be
+# in _slugify() output form (lowercase, no diacritics, hyphen-separated).
+# Covers legacy values from the old DB and common English/Romanian variants.
+_CATEGORY_ALIASES: Dict[str, str] = {
+    # legacy junk (generic labels and single letters)
+    "general-activity": "altele",
+    "general": "altele",
+    "educational": "creative-stem",
+    "d": "altele",
+    "a": "altele",
+    "b": "altele",
+    "c": "altele",
+    # scouting games
+    "cercetasie": "jocuri-cercetasesti",
+    "cercetasesti": "jocuri-cercetasesti",
+    "scout": "jocuri-cercetasesti",
+    "scouting": "jocuri-cercetasesti",
+    "scout-games": "jocuri-cercetasesti",
+    "jocuri-cercetasesti": "jocuri-cercetasesti",
+    # team building
+    "teambuilding": "team-building",
+    "team": "team-building",
+    "cooperare": "team-building",
+    # icebreakers / energizers
+    "icebreaker": "icebreakers",
+    "spargerea-ghetii": "icebreakers",
+    "cunoastere": "icebreakers",
+    "energizers": "icebreakers",
+    "energizer": "icebreakers",
+    # camp / outdoor
+    "camp": "camp-outdoor",
+    "tabara": "camp-outdoor",
+    "outdoor": "camp-outdoor",
+    "aer-liber": "camp-outdoor",
+    # wide games
+    "wide-game": "wide-games",
+    "jocuri-de-teren": "wide-games",
+    "joc-de-teren": "wide-games",
+    "big-games": "wide-games",
+    # orienteering
+    "orienteering": "orientare",
+    "navigatie": "orientare",
+    # first aid
+    "first-aid": "prim-ajutor",
+    "primul-ajutor": "prim-ajutor",
+    # escape room / puzzle
+    "escape-room": "escape-room-puzzle",
+    "escaperoom": "escape-room-puzzle",
+    "puzzle": "escape-room-puzzle",
+    "puzzles": "escape-room-puzzle",
+    "ghicitori": "escape-room-puzzle",
+    # creativity / STEM
+    "creative": "creative-stem",
+    "creativitate": "creative-stem",
+    "stem": "creative-stem",
+    "arts-and-crafts": "creative-stem",
+    "craft": "creative-stem",
+    "crafts": "creative-stem",
+    "stiinta": "creative-stem",
+    # sports / physical activity
+    "sport": "sports-active",
+    "sports": "sports-active",
+    "sportive": "sports-active",
+    "active": "sports-active",
+    "miscare": "sports-active",
+    "physical": "sports-active",
+    # songs / ceremonies
+    "cantece": "cantece-ceremonii",
+    "cantec": "cantece-ceremonii",
+    "songs": "cantece-ceremonii",
+    "ceremonii": "cantece-ceremonii",
+    "ceremonie": "cantece-ceremonii",
+    "ceremony": "cantece-ceremonii",
+    # recipes
+    "reteta": "retete",
+    "recipe": "retete",
+    "recipes": "retete",
+    "cooking": "retete",
+    "gatit": "retete",
+    # survival
+    "survival": "supravietuire",
+    "supravietuire": "supravietuire",
+    # integration / inclusion
+    "integrare": "integrare-incluziune",
+    "incluziune": "integrare-incluziune",
+    "inclusion": "integrare-incluziune",
+    # conflict / empathy
+    "conflict": "conflict-empatie",
+    "empatie": "conflict-empatie",
+    "empathy": "conflict-empatie",
+    "rezolvarea-conflictelor": "conflict-empatie",
+    # explicit spellings of the fallback
+    "altele": "altele",
+    "other": "altele",
+    "others": "altele",
+    "misc": "altele",
+}
+
+
+def _slugify(value: str) -> str:
+    """Return `value` lowercased, accent-free and hyphen-separated ("" if empty)."""
+    if not value:
+        return ""
+    # NFKD splits an accented letter into base + combining mark; dropping the
+    # marks keeps only the base letters (ă -> a, ș -> s, ț -> t, etc.).
+    letters = [ch for ch in unicodedata.normalize("NFKD", value)
+               if not unicodedata.combining(ch)]
+    lowered = "".join(letters).lower().strip()
+    return re.sub(r"[^a-z0-9]+", "-", lowered).strip("-")
+
+
+def normalize_category(value: str) -> str:
+    """Resolve a free-form label to one of the canonical category slugs.
+
+    The label is slugified first, so case, diacritics and punctuation do not
+    matter. A slug that is already canonical is returned unchanged; otherwise
+    the alias table is consulted.
+
+    Returns a key of CATEGORIES, or FALLBACK_CATEGORY (`altele`) when the
+    input is empty, slugifies to nothing, or is not recognised.
+    """
+    slug = _slugify(str(value)) if value else ""
+    if not slug:
+        return FALLBACK_CATEGORY
+    if slug in CATEGORIES:
+        return slug
+    # Not canonical: try the legacy / English / Romanian variants, and give
+    # up with the fallback if the alias table does not know the slug either.
+    return _CATEGORY_ALIASES.get(slug, FALLBACK_CATEGORY)
+
+
+def normalize_content_type(value: str) -> str:
+    """Map an arbitrary string to a valid content_type slug.
+
+    Matching ignores case, diacritics and punctuation: canonical slugs are
+    tried first, then Romanian plurals and English singular/plural forms.
+
+    Args:
+        value: Free-form content type label; may be empty or None.
+
+    Returns:
+        A key of CONTENT_TYPES; DEFAULT_CONTENT_TYPE (`activitate`) when
+        `value` is empty or not recognised.
+    """
+    slug = _slugify(str(value)) if value else ""
+    if slug in CONTENT_TYPES:
+        return slug
+    # English plurals cover every type (not only "games"), e.g. "recipes".
+    aliases = {
+        "jocuri": "joc", "game": "joc", "games": "joc",
+        "activitati": "activitate", "activity": "activitate", "activities": "activitate",
+        "retete": "reteta", "recipe": "reteta", "recipes": "reteta",
+        "cantece": "cantec", "song": "cantec", "songs": "cantec",
+        "ceremonii": "ceremonie", "ceremony": "ceremonie", "ceremonies": "ceremonie",
+    }
+    return aliases.get(slug, DEFAULT_CONTENT_TYPE)
+
+
+def is_valid_category(slug: str) -> bool:
+    """Return True only if `slug` is exactly a key of CATEGORIES (no normalisation)."""
+    return slug in CATEGORIES
+
+
+def category_display_name(slug: str) -> str:
+    """Return the CATEGORIES display name for `slug`, or `slug` itself if unknown."""
+    return CATEGORIES.get(slug, slug)
+
+
+def content_type_display_name(slug: str) -> str:
+    """Return the CONTENT_TYPES display name for `slug`, or `slug` itself if unknown."""
+    return CONTENT_TYPES.get(slug, slug)