Extraction finished (575/588 chunks; 6 content-filter-blocked, 7 await re-extraction). DB rebuilt and frozen at 9418 activities — content_keys are now stable for the enrichment overlay. Part A (plumbing + UI): - database.py: name_ro/description_ro/rules_ro/variations_ro, indoor_outdoor, space_needed, estimated_fields, source_id/source_ids/chunk_key columns; FTS5 indexes the 4 *_ro columns across CREATE + all 3 triggers; new equality filters + category counts for both axes. - activity.py: new fields + bilingual display helpers (get_display_*, is_estimated, axis displays). - config_taxonomy.py: INDOOR_OUTDOOR/SPACE_NEEDED enums + normalizers (None on unrecognised, no fabrication). - search.py / routes.py / config.py / templates / css: new dropdowns, RO-primary rendering with "(estimat)" markers and collapsible original text, and a /source/<id> download route shipped DARK behind SOURCE_DOWNLOAD_ENABLED (copyright opt-in). - build_database.py: source_id/chunk_key in dict_to_activity; merge_cluster unions source_ids without touching enrichment fields. Part B (enrichment pipeline, built not yet run): - build_database.py: load_enrichment + apply_enrichment (post-dedup, keyed on content_key) + --enrichment CLI + stated-vs-estimated QA. - run_enrichment.py (resumable, --source/--limit pilot scoping, --collect), ENRICHMENT_PROMPT.md. Repair: scripts/repair_extractions.py fixes the subagents' systematic unescaped-ASCII-quote bug with a faithful char-scanner (escapes, never truncates) + schema validation + a strictly-more-text guard. json_repair was tried first, truncated silently, and is NOT used. build_database has no repair dependency. Tests: tests/test_enrichment.py added; 99 pass. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
314 lines
9.6 KiB
Python
314 lines
9.6 KiB
Python
"""
|
|
Controlled category taxonomy for game-library.

Single source of truth for activity categories. The DB stores the *slug*;
the UI displays the Romanian name. `category` (thematic domain) and
`content_type` (form of the content) are INDEPENDENT axes — see plan §2.
|
|
"""
|
|
|
|
import unicodedata
|
|
import re
|
|
from typing import Dict, List, Optional
|
|
|
|
# --- Categories (thematic domain) --------------------------------------------
# Category slug -> Romanian display label (16 fixed entries). Declaration
# order matters because CATEGORY_SLUGS is derived from it. The `altele` entry
# is the mandatory fallback and must always be present.
CATEGORIES: Dict[str, str] = {
    "jocuri-cercetasesti": "Jocuri cercetășești",
    "team-building": "Team-building",
    "icebreakers": "Icebreakers / spargerea gheții",
    "camp-outdoor": "Tabără și activități în aer liber",
    "wide-games": "Wide games / jocuri de teren",
    "orientare": "Orientare",
    "prim-ajutor": "Prim ajutor",
    "escape-room-puzzle": "Escape room și puzzle",
    "creative-stem": "Creativitate și STEM",
    "sports-active": "Sport și activități fizice",
    "cantece-ceremonii": "Cântece și ceremonii",
    "retete": "Rețete",
    "supravietuire": "Supraviețuire",
    "integrare-incluziune": "Integrare și incluziune",
    "conflict-empatie": "Conflict și empatie",
    "altele": "Altele",
}

# The mandatory fallback slug (a key of CATEGORIES).
FALLBACK_CATEGORY = "altele"

# Every valid category slug, in the declaration order of CATEGORIES.
CATEGORY_SLUGS: List[str] = list(CATEGORIES)
|
|
|
|
# --- Content type (form of the content) --------------------------------------
# Content-type slug -> Romanian display label. This axis is independent of
# `category`. The default UI search leaves out the non-game content types
# (see plan §6).
CONTENT_TYPES: Dict[str, str] = {
    "joc": "Joc",
    "activitate": "Activitate",
    "reteta": "Rețetă",
    "cantec": "Cântec",
    "ceremonie": "Ceremonie",
}

# Every valid content-type slug, in the declaration order of CONTENT_TYPES.
CONTENT_TYPE_SLUGS: List[str] = list(CONTENT_TYPES)

# The "non-game" content types — excluded from the default UI search.
NON_GAME_CONTENT_TYPES: List[str] = ["reteta", "cantec", "ceremonie"]

# Content type used when none is given or recognised.
DEFAULT_CONTENT_TYPE = "activitate"
|
|
|
|
# --- Aliases -----------------------------------------------------------------
# Arbitrary (already normalized) string -> canonical category slug. Every key
# is in `_slugify` output form: no diacritics, lowercase, hyphen-separated.
# Covers legacy / messy values from the old DB plus common English and
# Romanian spellings.
_CATEGORY_ALIASES: Dict[str, str] = {
    # Legacy junk values.
    "general-activity": "altele",
    "general": "altele",
    "educational": "creative-stem",
    "d": "altele",
    "a": "altele",
    "b": "altele",
    "c": "altele",
    # Scouting.
    "cercetasie": "jocuri-cercetasesti",
    "cercetasesti": "jocuri-cercetasesti",
    "scout": "jocuri-cercetasesti",
    "scouting": "jocuri-cercetasesti",
    "scout-games": "jocuri-cercetasesti",
    "jocuri-cercetasesti": "jocuri-cercetasesti",
    # Team building.
    "teambuilding": "team-building",
    "team": "team-building",
    "cooperare": "team-building",
    # Icebreakers.
    "icebreaker": "icebreakers",
    "spargerea-ghetii": "icebreakers",
    "cunoastere": "icebreakers",
    "energizers": "icebreakers",
    "energizer": "icebreakers",
    # Camp / outdoor.
    "camp": "camp-outdoor",
    "tabara": "camp-outdoor",
    "outdoor": "camp-outdoor",
    "aer-liber": "camp-outdoor",
    # Wide games.
    "wide-game": "wide-games",
    "jocuri-de-teren": "wide-games",
    "joc-de-teren": "wide-games",
    "big-games": "wide-games",
    # Orienteering.
    "orienteering": "orientare",
    "navigatie": "orientare",
    # First aid.
    "first-aid": "prim-ajutor",
    "primul-ajutor": "prim-ajutor",
    # Escape room / puzzle.
    "escape-room": "escape-room-puzzle",
    "escaperoom": "escape-room-puzzle",
    "puzzle": "escape-room-puzzle",
    "puzzles": "escape-room-puzzle",
    "ghicitori": "escape-room-puzzle",
    # Creative / STEM.
    "creative": "creative-stem",
    "creativitate": "creative-stem",
    "stem": "creative-stem",
    "arts-and-crafts": "creative-stem",
    "craft": "creative-stem",
    "crafts": "creative-stem",
    "stiinta": "creative-stem",
    # Sports.
    "sport": "sports-active",
    "sports": "sports-active",
    "sportive": "sports-active",
    "active": "sports-active",
    "miscare": "sports-active",
    "physical": "sports-active",
    # Songs / ceremonies.
    "cantece": "cantece-ceremonii",
    "cantec": "cantece-ceremonii",
    "songs": "cantece-ceremonii",
    "ceremonii": "cantece-ceremonii",
    "ceremonie": "cantece-ceremonii",
    "ceremony": "cantece-ceremonii",
    # Recipes.
    "reteta": "retete",
    "recipe": "retete",
    "recipes": "retete",
    "cooking": "retete",
    "gatit": "retete",
    # Survival.
    "survival": "supravietuire",
    "supravietuire": "supravietuire",
    # Inclusion.
    "integrare": "integrare-incluziune",
    "incluziune": "integrare-incluziune",
    "inclusion": "integrare-incluziune",
    # Conflict / empathy.
    "conflict": "conflict-empatie",
    "empatie": "conflict-empatie",
    "empathy": "conflict-empatie",
    "rezolvarea-conflictelor": "conflict-empatie",
    # Fallback.
    "altele": "altele",
    "other": "altele",
    "others": "altele",
    "misc": "altele",
}
|
|
|
|
|
|
def _slugify(value: str) -> str:
    """Turn free text into a lowercase, ASCII, hyphen-separated slug.

    Accents are removed by NFKD-decomposing the text and dropping the
    combining marks (ă -> a, ș -> s, ț -> t, ...). Every run of characters
    other than `a-z` / `0-9` then becomes a single hyphen, and hyphens at
    either end are trimmed. A falsy `value` yields the empty string.
    """
    if not value:
        return ""
    # Keep only base characters; combining marks have a non-zero class.
    base_chars = [
        ch
        for ch in unicodedata.normalize("NFKD", value)
        if unicodedata.combining(ch) == 0
    ]
    lowered = "".join(base_chars).lower().strip()
    hyphenated = re.sub(r"[^a-z0-9]+", "-", lowered)
    return hyphenated.strip("-")
|
|
|
|
|
|
# Slugified Romanian display label -> category slug, so that a label shown by
# the UI (e.g. "Icebreakers / spargerea gheții") maps back to its own slug.
_CATEGORY_LABEL_SLUGS: Dict[str, str] = {
    _slugify(label): category_slug for category_slug, label in CATEGORIES.items()
}


def normalize_category(value: Optional[str]) -> str:
    """Map an arbitrary string to a valid category slug.

    Lookup order, applied to the slugified input:
      1. exact match on a key of CATEGORIES;
      2. match in _CATEGORY_ALIASES;
      3. match on the slugified Romanian display label of a category.

    Args:
        value: Free-form category text; may be None or empty.

    Returns:
        One of CATEGORY_SLUGS. Anything empty or unrecognised yields
        FALLBACK_CATEGORY (`altele`).
    """
    if not value:
        return FALLBACK_CATEGORY
    slug = _slugify(str(value))
    if not slug:
        # The input held only punctuation / whitespace.
        return FALLBACK_CATEGORY
    # Exact slug match.
    if slug in CATEGORIES:
        return slug
    # Alias match.
    alias_target = _CATEGORY_ALIASES.get(slug)
    if alias_target is not None:
        return alias_target
    # Display-label match: without this, a category's own Romanian label
    # would silently collapse into the fallback.
    return _CATEGORY_LABEL_SLUGS.get(slug, FALLBACK_CATEGORY)
|
|
|
|
|
|
# Plural / English spellings accepted for content types. Keys are in
# `_slugify` output form. Kept at module level, like the other alias tables in
# this file, so the mapping is built once instead of on every call.
_CONTENT_TYPE_ALIASES: Dict[str, str] = {
    "jocuri": "joc",
    "game": "joc",
    "games": "joc",
    "activitati": "activitate",
    "activity": "activitate",
    "activities": "activitate",
    "retete": "reteta",
    "recipe": "reteta",
    "recipes": "reteta",
    "cantece": "cantec",
    "song": "cantec",
    "songs": "cantec",
    "ceremonii": "ceremonie",
    "ceremony": "ceremonie",
    "ceremonies": "ceremonie",
}


def normalize_content_type(value: Optional[str]) -> str:
    """Map an arbitrary string to a valid content_type slug.

    The slugified input is matched first against the keys of CONTENT_TYPES,
    then against the plural / English aliases.

    Args:
        value: Free-form content-type text; may be None or empty.

    Returns:
        One of CONTENT_TYPE_SLUGS. Anything empty or unrecognised yields
        DEFAULT_CONTENT_TYPE (`activitate`).
    """
    if not value:
        return DEFAULT_CONTENT_TYPE
    slug = _slugify(str(value))
    if slug in CONTENT_TYPES:
        return slug
    # English plurals are covered for every type, not only "games", so that
    # e.g. "recipes" is not misfiled under the default.
    return _CONTENT_TYPE_ALIASES.get(slug, DEFAULT_CONTENT_TYPE)
|
|
|
|
|
|
# --- Indoor / outdoor (enrichment axis) --------------------------------------
# Slug -> Romanian label for where the activity is run. When the source does
# not say, enrichment infers the value and flags the inference in
# `estimated_fields`.
INDOOR_OUTDOOR: Dict[str, str] = {
    "indoor": "Interior",
    "outdoor": "Exterior",
    "either": "Interior sau exterior",
}
|
|
|
|
# --- Space needed (enrichment axis) ------------------------------------------
# Slug -> Romanian label for the rough footprint the activity requires.
SPACE_NEEDED: Dict[str, str] = {
    "mic": "Spațiu mic",
    "mediu": "Spațiu mediu",
    "mare": "Spațiu mare",
}
|
|
|
|
# Aliases for robustness against LLM output variation. Keys are _slugify'd.
# Includes the slugified form of the "Interior sau exterior" display label so
# that all three labels of the axis map back to a slug.
_INDOOR_OUTDOOR_ALIASES: Dict[str, str] = {
    # indoor
    "interior": "indoor",
    "inside": "indoor",
    "in": "indoor",
    # outdoor
    "exterior": "outdoor",
    "outside": "outdoor",
    "out": "outdoor",
    "aer-liber": "outdoor",
    # either
    "both": "either",
    "any": "either",
    "ambele": "either",
    "interior-exterior": "either",
    "indoor-outdoor": "either",
    "interior-sau-exterior": "either",
    "indoor-or-outdoor": "either",
}
|
|
|
|
# Alternative spellings for the space_needed axis -> canonical slug. Keys are
# in `_slugify` output form.
_SPACE_NEEDED_ALIASES: Dict[str, str] = {
    # small
    "small": "mic",
    "redus": "mic",
    "putin": "mic",
    # medium
    "medium": "mediu",
    "moderat": "mediu",
    # large
    "large": "mare",
    "big": "mare",
    "mult": "mare",
    # slugified display labels
    "spatiu-mic": "mic",
    "spatiu-mediu": "mediu",
    "spatiu-mare": "mare",
}
|
|
|
|
|
|
def normalize_indoor_outdoor(value: str) -> Optional[str]:
    """Resolve free-form text to an indoor_outdoor slug, or None.

    There is deliberately no fallback here, unlike for categories: an empty
    or unrecognised value gives None (the field stays absent), so a location
    the enrichment never asserted is not invented.
    """
    if value:
        candidate = _slugify(str(value))
        if candidate in INDOOR_OUTDOOR:
            return candidate
        # `.get` returns None for an unknown alias.
        return _INDOOR_OUTDOOR_ALIASES.get(candidate)
    return None
|
|
|
|
|
|
def normalize_space_needed(value: str) -> Optional[str]:
    """Resolve free-form text to a space_needed slug, or None (no fallback)."""
    if value:
        candidate = _slugify(str(value))
        if candidate in SPACE_NEEDED:
            return candidate
        # `.get` returns None for an unknown alias.
        return _SPACE_NEEDED_ALIASES.get(candidate)
    return None
|
|
|
|
|
|
def indoor_outdoor_display_name(slug: str) -> str:
    """Return the Romanian label of an indoor_outdoor slug.

    A slug that is not a key of INDOOR_OUTDOOR is returned unchanged.
    """
    try:
        return INDOOR_OUTDOOR[slug]
    except KeyError:
        return slug
|
|
|
|
|
|
def space_needed_display_name(slug: str) -> str:
    """Return the Romanian label of a space_needed slug.

    A slug that is not a key of SPACE_NEEDED is returned unchanged.
    """
    try:
        return SPACE_NEEDED[slug]
    except KeyError:
        return slug
|
|
|
|
|
|
def is_valid_category(slug: str) -> bool:
    """Report whether `slug` is one of the keys of CATEGORIES."""
    is_known = slug in CATEGORIES
    return is_known
|
|
|
|
|
|
def category_display_name(slug: str) -> str:
    """Return the Romanian label of a category slug.

    A slug that is not a key of CATEGORIES is returned unchanged.
    """
    try:
        return CATEGORIES[slug]
    except KeyError:
        return slug
|
|
|
|
|
|
def content_type_display_name(slug: str) -> str:
    """Return the Romanian label of a content_type slug.

    A slug that is not a key of CONTENT_TYPES is returned unchanged.
    """
    try:
        return CONTENT_TYPES[slug]
    except KeyError:
        return slug
|