Files
game-library/app/config_taxonomy.py
Claude Agent bcfb6841eb Faza 1 complete: bilingual+enrichment plumbing, UI/filters, frozen DB
Extraction finished (575/588 chunks; 6 content-filter-blocked, 7 await
re-extraction). DB rebuilt and frozen at 9418 activities — content_keys
are now stable for the enrichment overlay.

Part A (plumbing + UI):
- database.py: name_ro/description_ro/rules_ro/variations_ro, indoor_outdoor,
  space_needed, estimated_fields, source_id/source_ids/chunk_key columns;
  FTS5 indexes the 4 *_ro columns across CREATE + all 3 triggers; new equality
  filters + category counts for both axes.
- activity.py: new fields + bilingual display helpers (get_display_*,
  is_estimated, axis displays).
- config_taxonomy.py: INDOOR_OUTDOOR/SPACE_NEEDED enums + normalizers
  (None on unrecognised, no fabrication).
- search.py / routes.py / config.py / templates / css: new dropdowns,
  RO-primary rendering with "(estimat)" markers and collapsible original
  text, and a /source/<id> download route shipped DARK behind
  SOURCE_DOWNLOAD_ENABLED (copyright opt-in).
- build_database.py: source_id/chunk_key in dict_to_activity; merge_cluster
  unions source_ids without touching enrichment fields.

Part B (enrichment pipeline, built but not yet run):
- build_database.py: load_enrichment + apply_enrichment (post-dedup, keyed on
  content_key) + --enrichment CLI + stated-vs-estimated QA.
- run_enrichment.py (resumable, --source/--limit pilot scoping, --collect),
  ENRICHMENT_PROMPT.md.

Repair: scripts/repair_extractions.py fixes the subagents' systematic
unescaped-ASCII-quote bug with a faithful char-scanner (escapes, never
truncates) + schema validation + a strictly-more-text guard. json_repair was
tried first, truncated silently, and is NOT used. build_database has no repair
dependency.

Tests: tests/test_enrichment.py added; 99 pass.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-05-29 18:10:13 +00:00

314 lines
9.6 KiB
Python

"""
Controlled category taxonomy for game-library.
Single source of truth for activity categories. The DB stores the *slug*;
the UI displays the Romanian name. `category` (thematic domain) and
`content_type` (form of the content) are INDEPENDENT axes — see plan §2.
"""
import unicodedata
import re
from typing import Dict, List, Optional
# --- Categories (thematic domain) --------------------------------------------
# Category slug -> Romanian display name. The slug is what gets stored; the
# Romanian name is what the UI shows. The set is fixed at 16 slugs, and
# `altele` is the catch-all entry that MUST always be present.
CATEGORIES: Dict[str, str] = {
    "jocuri-cercetasesti": "Jocuri cercetășești",
    "team-building": "Team-building",
    "icebreakers": "Icebreakers / spargerea gheții",
    "camp-outdoor": "Tabără și activități în aer liber",
    "wide-games": "Wide games / jocuri de teren",
    "orientare": "Orientare",
    "prim-ajutor": "Prim ajutor",
    "escape-room-puzzle": "Escape room și puzzle",
    "creative-stem": "Creativitate și STEM",
    "sports-active": "Sport și activități fizice",
    "cantece-ceremonii": "Cântece și ceremonii",
    "retete": "Rețete",
    "supravietuire": "Supraviețuire",
    "integrare-incluziune": "Integrare și incluziune",
    "conflict-empatie": "Conflict și empatie",
    "altele": "Altele",
}

# Slug returned whenever a value cannot be mapped to a known category.
FALLBACK_CATEGORY = "altele"

# Every valid slug, in the declaration order of CATEGORIES.
CATEGORY_SLUGS: List[str] = list(CATEGORIES)
# --- Content type (form of the content) --------------------------------------
# Content-type slug -> Romanian display name. This axis is independent of
# `category`. The default UI search leaves out the non-game content types
# (see plan §6).
CONTENT_TYPES: Dict[str, str] = {
    "joc": "Joc",
    "activitate": "Activitate",
    "reteta": "Rețetă",
    "cantec": "Cântec",
    "ceremonie": "Ceremonie",
}

# Every valid content-type slug, in the declaration order of CONTENT_TYPES.
CONTENT_TYPE_SLUGS: List[str] = list(CONTENT_TYPES)

# Slugs treated as "non-game"; these are the ones the default UI search
# excludes.
NON_GAME_CONTENT_TYPES: List[str] = ["reteta", "cantec", "ceremonie"]

# Slug used when a content type is missing or not recognised.
DEFAULT_CONTENT_TYPE = "activitate"
# --- Aliases -----------------------------------------------------------------
# Lookup table from a normalized free-form string to its canonical category
# slug. Every key is written in the form _slugify produces (diacritics
# stripped, lowercased, hyphen-separated), so an input must be passed through
# _slugify before it is looked up here. Covers legacy / messy values from the
# old DB as well as common English and Romanian variants.
_CATEGORY_ALIASES: Dict[str, str] = {
    # Legacy junk values (mostly routed to the fallback category).
    "general-activity": "altele",
    "general": "altele",
    "educational": "creative-stem",
    "d": "altele",
    "a": "altele",
    "b": "altele",
    "c": "altele",
    # Scouting.
    "cercetasie": "jocuri-cercetasesti",
    "cercetasesti": "jocuri-cercetasesti",
    "scout": "jocuri-cercetasesti",
    "scouting": "jocuri-cercetasesti",
    "scout-games": "jocuri-cercetasesti",
    "jocuri-cercetasesti": "jocuri-cercetasesti",
    # Team building.
    "teambuilding": "team-building",
    "team": "team-building",
    "cooperare": "team-building",
    # Icebreakers.
    "icebreaker": "icebreakers",
    "spargerea-ghetii": "icebreakers",
    "cunoastere": "icebreakers",
    "energizers": "icebreakers",
    "energizer": "icebreakers",
    # Camp / outdoor.
    "camp": "camp-outdoor",
    "tabara": "camp-outdoor",
    "outdoor": "camp-outdoor",
    "aer-liber": "camp-outdoor",
    # Wide games.
    "wide-game": "wide-games",
    "jocuri-de-teren": "wide-games",
    "joc-de-teren": "wide-games",
    "big-games": "wide-games",
    # Orienteering.
    "orienteering": "orientare",
    "navigatie": "orientare",
    # First aid.
    "first-aid": "prim-ajutor",
    "primul-ajutor": "prim-ajutor",
    # Escape room / puzzle.
    "escape-room": "escape-room-puzzle",
    "escaperoom": "escape-room-puzzle",
    "puzzle": "escape-room-puzzle",
    "puzzles": "escape-room-puzzle",
    "ghicitori": "escape-room-puzzle",
    # Creative / STEM.
    "creative": "creative-stem",
    "creativitate": "creative-stem",
    "stem": "creative-stem",
    "arts-and-crafts": "creative-stem",
    "craft": "creative-stem",
    "crafts": "creative-stem",
    "stiinta": "creative-stem",
    # Sports.
    "sport": "sports-active",
    "sports": "sports-active",
    "sportive": "sports-active",
    "active": "sports-active",
    "miscare": "sports-active",
    "physical": "sports-active",
    # Songs / ceremonies.
    "cantece": "cantece-ceremonii",
    "cantec": "cantece-ceremonii",
    "songs": "cantece-ceremonii",
    "ceremonii": "cantece-ceremonii",
    "ceremonie": "cantece-ceremonii",
    "ceremony": "cantece-ceremonii",
    # Recipes.
    "reteta": "retete",
    "recipe": "retete",
    "recipes": "retete",
    "cooking": "retete",
    "gatit": "retete",
    # Survival.
    "survival": "supravietuire",
    "supravietuire": "supravietuire",
    # Inclusion.
    "integrare": "integrare-incluziune",
    "incluziune": "integrare-incluziune",
    "inclusion": "integrare-incluziune",
    # Conflict / empathy.
    "conflict": "conflict-empatie",
    "empatie": "conflict-empatie",
    "empathy": "conflict-empatie",
    "rezolvarea-conflictelor": "conflict-empatie",
    # Explicit spellings of the fallback category.
    "altele": "altele",
    "other": "altele",
    "others": "altele",
    "misc": "altele",
}
def _slugify(value: str) -> str:
    """Turn free-form text into a lowercase, ASCII, hyphen-separated slug.

    Accented letters lose their diacritics, the text is lowercased, and every
    run of characters outside ``a-z0-9`` becomes a single hyphen. Leading and
    trailing hyphens are removed. A falsy ``value`` yields ``""``.
    """
    if not value:
        return ""
    # NFKD splits an accented letter into its base letter plus combining
    # marks (ă -> a, ș -> s, ț -> t, ...); keeping only the non-combining
    # characters drops the marks.
    base_chars = [
        ch
        for ch in unicodedata.normalize("NFKD", value)
        if not unicodedata.combining(ch)
    ]
    lowered = "".join(base_chars).lower().strip()
    return re.sub(r"[^a-z0-9]+", "-", lowered).strip("-")
def normalize_category(value: str) -> str:
    """Resolve an arbitrary string to a valid category slug.

    The value is slugified, then matched first against the keys of
    CATEGORIES and then against _CATEGORY_ALIASES.

    Returns:
        One of CATEGORY_SLUGS. FALLBACK_CATEGORY (`altele`) is returned when
        the value is empty, slugifies to nothing, or is not recognised.
    """
    candidate = _slugify(str(value)) if value else ""
    if not candidate:
        return FALLBACK_CATEGORY
    # A direct slug hit takes precedence over the alias table.
    if candidate in CATEGORIES:
        return candidate
    return _CATEGORY_ALIASES.get(candidate, FALLBACK_CATEGORY)
# Plural and English spellings accepted for content types. Keys are in the
# form _slugify produces; values are keys of CONTENT_TYPES.
_CONTENT_TYPE_ALIASES: Dict[str, str] = {
    "jocuri": "joc",
    "game": "joc",
    "games": "joc",
    "activitati": "activitate",
    "activity": "activitate",
    "retete": "reteta",
    "recipe": "reteta",
    "cantece": "cantec",
    "song": "cantec",
    "ceremonii": "ceremonie",
    "ceremony": "ceremonie",
}


def normalize_content_type(value: str) -> str:
    """Resolve an arbitrary string to a valid content_type slug.

    The value is slugified, then matched first against the keys of
    CONTENT_TYPES and then against a small table of plural / English aliases.

    Returns:
        One of CONTENT_TYPE_SLUGS. DEFAULT_CONTENT_TYPE (`activitate`) is
        returned when the value is empty or not recognised.
    """
    if not value:
        return DEFAULT_CONTENT_TYPE
    candidate = _slugify(str(value))
    if candidate in CONTENT_TYPES:
        return candidate
    return _CONTENT_TYPE_ALIASES.get(candidate, DEFAULT_CONTENT_TYPE)
# --- Indoor / outdoor (enrichment axis) --------------------------------------
# Where the activity is run: slug -> Romanian label. When the source is silent
# the value is inferred during enrichment, and such inferences are flagged in
# `estimated_fields`.
INDOOR_OUTDOOR: Dict[str, str] = {
    "indoor": "Interior",
    "outdoor": "Exterior",
    "either": "Interior sau exterior",
}
# --- Space needed (enrichment axis) ------------------------------------------
# Rough footprint the activity requires: slug -> Romanian label.
SPACE_NEEDED: Dict[str, str] = {
    "mic": "Spațiu mic",
    "mediu": "Spațiu mediu",
    "mare": "Spațiu mare",
}
# Alias tables for the two enrichment axes, for robustness against LLM output
# variation. Keys are in the form _slugify produces, so an input must be
# slugified before lookup.

# Alternative spellings -> key of INDOOR_OUTDOOR.
_INDOOR_OUTDOOR_ALIASES: Dict[str, str] = {
    "interior": "indoor",
    "inside": "indoor",
    "in": "indoor",
    "exterior": "outdoor",
    "outside": "outdoor",
    "out": "outdoor",
    "aer-liber": "outdoor",
    "both": "either",
    "any": "either",
    "ambele": "either",
    "interior-exterior": "either",
    "indoor-outdoor": "either",
}
# Alternative spellings -> key of SPACE_NEEDED.
_SPACE_NEEDED_ALIASES: Dict[str, str] = {
    "small": "mic",
    "redus": "mic",
    "putin": "mic",
    "medium": "mediu",
    "moderat": "mediu",
    "large": "mare",
    "big": "mare",
    "mult": "mare",
    # Slugified forms of the Romanian display labels themselves.
    "spatiu-mic": "mic",
    "spatiu-mediu": "mediu",
    "spatiu-mare": "mare",
}
def normalize_indoor_outdoor(value: str) -> Optional[str]:
    """Resolve an arbitrary string to an indoor_outdoor slug, or None.

    This axis deliberately has NO fallback value, unlike categories: an empty
    or unrecognised input produces None (the field is simply absent), so a
    location that the enrichment never asserted is not fabricated.

    Returns:
        A key of INDOOR_OUTDOOR, or None.
    """
    if not value:
        return None
    candidate = _slugify(str(value))
    # A direct slug hit wins; otherwise consult the alias table, which
    # yields None for anything it does not know.
    return (
        candidate
        if candidate in INDOOR_OUTDOOR
        else _INDOOR_OUTDOOR_ALIASES.get(candidate)
    )
def normalize_space_needed(value: str) -> Optional[str]:
    """Resolve an arbitrary string to a space_needed slug, or None.

    There is no fallback: an empty or unrecognised input produces None.

    Returns:
        A key of SPACE_NEEDED, or None.
    """
    if not value:
        return None
    candidate = _slugify(str(value))
    if candidate not in SPACE_NEEDED:
        # Unknown aliases resolve to None via dict.get.
        return _SPACE_NEEDED_ALIASES.get(candidate)
    return candidate
def indoor_outdoor_display_name(slug: str) -> str:
    """Return the Romanian label for an indoor_outdoor slug.

    A slug that is not a key of INDOOR_OUTDOOR is returned unchanged.
    """
    try:
        return INDOOR_OUTDOOR[slug]
    except KeyError:
        return slug
def space_needed_display_name(slug: str) -> str:
    """Return the Romanian label for a space_needed slug.

    A slug that is not a key of SPACE_NEEDED is returned unchanged.
    """
    return SPACE_NEEDED[slug] if slug in SPACE_NEEDED else slug
def is_valid_category(slug: str) -> bool:
    """Report whether `slug` is one of the keys of CATEGORIES.

    The check is an exact membership test: the argument is not slugified or
    resolved through the alias table first.
    """
    return slug in CATEGORIES
def category_display_name(slug: str) -> str:
    """Return the Romanian display name for a category slug.

    A slug that is not a key of CATEGORIES is returned unchanged.
    """
    if slug in CATEGORIES:
        return CATEGORIES[slug]
    return slug
def content_type_display_name(slug: str) -> str:
    """Return the Romanian display name for a content_type slug.

    A slug that is not a key of CONTENT_TYPES is returned unchanged.
    """
    try:
        return CONTENT_TYPES[slug]
    except KeyError:
        return slug