Implements the approved plan to replace the broken regex/index-master extraction with an LLM-subagent pipeline. Four parallel lanes: Lane A — scripts/extract_common.py (PDF/docx/doc/pptx/html/zip, no max_pages truncation), normalize_sources.py, chunk_sources.py (~20pg chunks + overlap, manifest registry), activity_schema.json. Lane B — app/config_taxonomy.py (16 fixed category slugs), schema rebuilt from scratch in app/models/ with content_type, language, source_files, source_excerpt, normalized_name, extraction_confidence, needs_review; FTS5 + 3 triggers extended with materials_list and skills_developed. Lane C — build_database.py (--rebuild, atomic swap, schema + fuzzy source_excerpt validation, dedup with needs_review band), validate_extractions.py, review_queue.py, new run_extraction.py orchestrator, SUBAGENT_PROMPT.md. Lane D — search.py content_type/language filters (default search excludes non-game content), E7 schema-compat audit; fixed a NULL keywords AttributeError in _boost_search_relevance. Removes 8 orphaned/dead scripts and app/services/parser.py + indexer.py. Adds tests/ (70 passing, 1 skipped — libreoffice absent). Note: Lane D made one additive edit to app/models/database.py (_update_category_counts) to surface content_type/language in get_filter_options, outside its nominal lane boundary but after Lane B completed. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
231 lines
7.2 KiB
Python
231 lines
7.2 KiB
Python
"""
|
|
Controlled category taxonomy for game-library.

Single source of truth for activity categories. The DB stores the *slug*;
the UI displays the Romanian name. `category` (thematic domain) and
`content_type` (form of the content) are INDEPENDENT axes — see plan §2.
|
|
"""
|
|
|
|
import unicodedata
|
|
import re
|
|
from typing import Dict, List
|
|
|
|
# --- Categories (thematic domain) --------------------------------------------
# Category slug -> Romanian display name. The set is fixed at 16 slugs and the
# declaration order is significant because CATEGORY_SLUGS is derived from it.
# `altele` is the mandatory fallback entry and must never be removed.
CATEGORIES: Dict[str, str] = {
    "jocuri-cercetasesti": "Jocuri cercetășești",
    "team-building": "Team-building",
    "icebreakers": "Icebreakers / spargerea gheții",
    "camp-outdoor": "Tabără și activități în aer liber",
    "wide-games": "Wide games / jocuri de teren",
    "orientare": "Orientare",
    "prim-ajutor": "Prim ajutor",
    "escape-room-puzzle": "Escape room și puzzle",
    "creative-stem": "Creativitate și STEM",
    "sports-active": "Sport și activități fizice",
    "cantece-ceremonii": "Cântece și ceremonii",
    "retete": "Rețete",
    "supravietuire": "Supraviețuire",
    "integrare-incluziune": "Integrare și incluziune",
    "conflict-empatie": "Conflict și empatie",
    "altele": "Altele",
}

# Slug handed back whenever a category cannot be recognised.
FALLBACK_CATEGORY = "altele"

# Every valid category slug, in declaration order.
CATEGORY_SLUGS: List[str] = list(CATEGORIES)
|
|
|
|
# --- Content type (form of the content) --------------------------------------
# Second classification axis, independent of `category`.
# Content-type slug -> Romanian display name.
CONTENT_TYPES: Dict[str, str] = {
    "joc": "Joc",
    "activitate": "Activitate",
    "reteta": "Rețetă",
    "cantec": "Cântec",
    "ceremonie": "Ceremonie",
}

# Every valid content-type slug, in declaration order.
CONTENT_TYPE_SLUGS: List[str] = list(CONTENT_TYPES)

# Slug used when a content type is missing or unrecognised.
DEFAULT_CONTENT_TYPE = "activitate"

# "Non-game" content types: the default UI search leaves these out
# (see plan §6).
NON_GAME_CONTENT_TYPES: List[str] = ["reteta", "cantec", "ceremonie"]
|
|
|
|
# --- Aliases -----------------------------------------------------------------
# Lookup table from an already-slugified string (lowercase, no diacritics,
# hyphen-separated — the form produced by _slugify) to a canonical category
# slug. Covers leftovers from the old DB plus common Romanian/English variants.
_CATEGORY_ALIASES: Dict[str, str] = {
    # --- legacy junk values ---
    "general-activity": "altele",
    "general": "altele",
    "educational": "creative-stem",
    "d": "altele",
    "a": "altele",
    "b": "altele",
    "c": "altele",
    # --- scouting ---
    "cercetasie": "jocuri-cercetasesti",
    "cercetasesti": "jocuri-cercetasesti",
    "scout": "jocuri-cercetasesti",
    "scouting": "jocuri-cercetasesti",
    "scout-games": "jocuri-cercetasesti",
    "jocuri-cercetasesti": "jocuri-cercetasesti",
    # --- team building ---
    "teambuilding": "team-building",
    "team": "team-building",
    "cooperare": "team-building",
    # --- icebreakers ---
    "icebreaker": "icebreakers",
    "spargerea-ghetii": "icebreakers",
    "cunoastere": "icebreakers",
    "energizers": "icebreakers",
    "energizer": "icebreakers",
    # --- camp / outdoor ---
    "camp": "camp-outdoor",
    "tabara": "camp-outdoor",
    "outdoor": "camp-outdoor",
    "aer-liber": "camp-outdoor",
    # --- wide games ---
    "wide-game": "wide-games",
    "jocuri-de-teren": "wide-games",
    "joc-de-teren": "wide-games",
    "big-games": "wide-games",
    # --- orienteering ---
    "orienteering": "orientare",
    "navigatie": "orientare",
    # --- first aid ---
    "first-aid": "prim-ajutor",
    "primul-ajutor": "prim-ajutor",
    # --- escape room / puzzle ---
    "escape-room": "escape-room-puzzle",
    "escaperoom": "escape-room-puzzle",
    "puzzle": "escape-room-puzzle",
    "puzzles": "escape-room-puzzle",
    "ghicitori": "escape-room-puzzle",
    # --- creative / STEM ---
    "creative": "creative-stem",
    "creativitate": "creative-stem",
    "stem": "creative-stem",
    "arts-and-crafts": "creative-stem",
    "craft": "creative-stem",
    "crafts": "creative-stem",
    "stiinta": "creative-stem",
    # --- sports ---
    "sport": "sports-active",
    "sports": "sports-active",
    "sportive": "sports-active",
    "active": "sports-active",
    "miscare": "sports-active",
    "physical": "sports-active",
    # --- songs / ceremonies ---
    "cantece": "cantece-ceremonii",
    "cantec": "cantece-ceremonii",
    "songs": "cantece-ceremonii",
    "ceremonii": "cantece-ceremonii",
    "ceremonie": "cantece-ceremonii",
    "ceremony": "cantece-ceremonii",
    # --- recipes ---
    "reteta": "retete",
    "recipe": "retete",
    "recipes": "retete",
    "cooking": "retete",
    "gatit": "retete",
    # --- survival ---
    "survival": "supravietuire",
    "supravietuire": "supravietuire",
    # --- inclusion ---
    "integrare": "integrare-incluziune",
    "incluziune": "integrare-incluziune",
    "inclusion": "integrare-incluziune",
    # --- conflict / empathy ---
    "conflict": "conflict-empatie",
    "empatie": "conflict-empatie",
    "empathy": "conflict-empatie",
    "rezolvarea-conflictelor": "conflict-empatie",
    # --- fallback ---
    "altele": "altele",
    "other": "altele",
    "others": "altele",
    "misc": "altele",
}
|
|
|
|
|
|
def _slugify(value: str) -> str:
    """Turn free text into a lowercase, hyphen-separated ASCII slug.

    Diacritics are removed (ă -> a, ș -> s, ț -> t, ...), the text is
    lowercased, and every run of characters outside ``a-z0-9`` is replaced by
    a single hyphen. Leading and trailing hyphens are dropped. A falsy
    ``value`` yields the empty string.
    """
    if not value:
        return ""
    # NFKD splits an accented letter into its base letter followed by
    # combining marks; discarding the marks leaves only the base letters.
    base_chars = [
        ch
        for ch in unicodedata.normalize("NFKD", value)
        if not unicodedata.combining(ch)
    ]
    lowered = "".join(base_chars).lower().strip()
    return re.sub(r"[^a-z0-9]+", "-", lowered).strip("-")
|
|
|
|
|
|
def normalize_category(value: str) -> str:
    """Resolve an arbitrary string to a canonical category slug.

    The input is slugified, then looked up first among the canonical slugs in
    CATEGORIES and then in the alias table. Anything falsy, anything that
    slugifies to an empty string, and anything unrecognised resolves to
    FALLBACK_CATEGORY.
    """
    slug = _slugify(str(value)) if value else ""
    if not slug:
        return FALLBACK_CATEGORY
    # A canonical slug wins over any alias with the same spelling.
    if slug in CATEGORIES:
        return slug
    return _CATEGORY_ALIASES.get(slug, FALLBACK_CATEGORY)
|
|
|
|
|
|
# Plural and English spellings accepted for each content-type slug. Keys are in
# slugified form (lowercase ASCII, hyphen-separated). Built once at import time
# rather than on every call.
_CONTENT_TYPE_ALIASES: Dict[str, str] = {
    "jocuri": "joc",
    "game": "joc",
    "games": "joc",
    "activitati": "activitate",
    "activity": "activitate",
    "activities": "activitate",
    "retete": "reteta",
    "recipe": "reteta",
    "recipes": "reteta",
    "cantece": "cantec",
    "song": "cantec",
    "songs": "cantec",
    "ceremonii": "ceremonie",
    "ceremony": "ceremonie",
    "ceremonies": "ceremonie",
}


def normalize_content_type(value: str) -> str:
    """Map an arbitrary string to a valid content_type slug.

    The input is slugified and matched against the canonical slugs in
    CONTENT_TYPES, then against the plural / English alias table. Falsy or
    unrecognised input resolves to DEFAULT_CONTENT_TYPE.

    English plurals ("songs", "recipes", "activities", "ceremonies") are
    accepted alongside the singular forms, so that e.g. "songs" is not
    silently classified as the default content type.
    """
    if not value:
        return DEFAULT_CONTENT_TYPE
    slug = _slugify(str(value))
    if slug in CONTENT_TYPES:
        return slug
    return _CONTENT_TYPE_ALIASES.get(slug, DEFAULT_CONTENT_TYPE)
|
|
|
|
|
|
def is_valid_category(slug: str) -> bool:
    """Report whether `slug` is one of the canonical category slugs.

    This is an exact key lookup in CATEGORIES; no slugification or alias
    resolution is applied to the argument.
    """
    is_known = slug in CATEGORIES
    return is_known
|
|
|
|
|
|
def category_display_name(slug: str) -> str:
    """Return the Romanian display name of a category slug.

    An unknown slug is returned unchanged.
    """
    try:
        return CATEGORIES[slug]
    except KeyError:
        return slug
|
|
|
|
|
|
def content_type_display_name(slug: str) -> str:
    """Return the Romanian display name of a content_type slug.

    An unknown slug is returned unchanged.
    """
    return CONTENT_TYPES[slug] if slug in CONTENT_TYPES else slug
|