Rebuild extraction pipeline infrastructure (Faza 0 prep)
Implements the approved plan to replace the broken regex/index-master extraction with an LLM-subagent pipeline. Four parallel lanes: Lane A — scripts/extract_common.py (PDF/docx/doc/pptx/html/zip, no max_pages truncation), normalize_sources.py, chunk_sources.py (~20pg chunks + overlap, manifest registry), activity_schema.json. Lane B — app/config_taxonomy.py (16 fixed category slugs), schema rebuilt from scratch in app/models/ with content_type, language, source_files, source_excerpt, normalized_name, extraction_confidence, needs_review; FTS5 + 3 triggers extended with materials_list and skills_developed. Lane C — build_database.py (--rebuild, atomic swap, schema + fuzzy source_excerpt validation, dedup with needs_review band), validate_extractions.py, review_queue.py, new run_extraction.py orchestrator, SUBAGENT_PROMPT.md. Lane D — search.py content_type/language filters (default search excludes non-game content), E7 schema-compat audit; fixed a NULL keywords AttributeError in _boost_search_relevance. Removes 8 orphaned/dead scripts and app/services/parser.py + indexer.py. Adds tests/ (70 passing, 1 skipped — libreoffice absent). Note: Lane D made one additive edit to app/models/database.py (_update_category_counts) to surface content_type/language in get_filter_options, outside its nominal lane boundary but after Lane B completed. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
230
app/config_taxonomy.py
Normal file
230
app/config_taxonomy.py
Normal file
@@ -0,0 +1,230 @@
|
||||
"""
|
||||
Controlled category taxonomy for game-library.
|
||||
|
||||
Single source of truth for activity categories. The DB stores the *slug*;
|
||||
the UI displays the Romanian name. `category` (thematic domain) and
|
||||
`content_type` (form of the content) are INDEPENDENT axes — see plan §2.
|
||||
"""
|
||||
|
||||
import unicodedata
|
||||
import re
|
||||
from typing import Dict, List
|
||||
|
||||
# --- Categories (thematic domain) --------------------------------------------
|
||||
# slug -> Romanian display name. Exactly 16 fixed slugs; `altele` is the
# mandatory fallback and MUST always be present.
|
||||
# Canonical category table: slug (stored in the DB) -> Romanian label.
# Insertion order is significant: CATEGORY_SLUGS below is derived from it.
CATEGORIES: Dict[str, str] = {
    "jocuri-cercetasesti": "Jocuri cercetășești",
    "team-building": "Team-building",
    "icebreakers": "Icebreakers / spargerea gheții",
    "camp-outdoor": "Tabără și activități în aer liber",
    "wide-games": "Wide games / jocuri de teren",
    "orientare": "Orientare",
    "prim-ajutor": "Prim ajutor",
    "escape-room-puzzle": "Escape room și puzzle",
    "creative-stem": "Creativitate și STEM",
    "sports-active": "Sport și activități fizice",
    "cantece-ceremonii": "Cântece și ceremonii",
    "retete": "Rețete",
    "supravietuire": "Supraviețuire",
    "integrare-incluziune": "Integrare și incluziune",
    "conflict-empatie": "Conflict și empatie",
    "altele": "Altele",
}

# Slug used whenever a value cannot be mapped to any other category.
FALLBACK_CATEGORY = "altele"

# Valid slugs, in the declaration order of CATEGORIES.
CATEGORY_SLUGS: List[str] = list(CATEGORIES)
|
||||
|
||||
# --- Content type (form of the content) --------------------------------------
|
||||
# Independent axis from `category`. The UI default search excludes the
|
||||
# non-game content types (see plan §6).
|
||||
# Content-type table: slug -> Romanian label.
CONTENT_TYPES: Dict[str, str] = {
    "joc": "Joc",
    "activitate": "Activitate",
    "reteta": "Rețetă",
    "cantec": "Cântec",
    "ceremonie": "Ceremonie",
}

# Valid content-type slugs, in the declaration order of CONTENT_TYPES.
CONTENT_TYPE_SLUGS: List[str] = list(CONTENT_TYPES)

# "Non-game" content types: the ones the default UI search leaves out.
NON_GAME_CONTENT_TYPES: List[str] = ["reteta", "cantec", "ceremonie"]

# Content type assigned when a value is empty or unrecognised.
DEFAULT_CONTENT_TYPE = "activitate"
|
||||
|
||||
# --- Aliases -----------------------------------------------------------------
|
||||
# Map of normalized arbitrary strings -> canonical slug. Keys are already
|
||||
# diacritic-stripped, lowercased and hyphenated (see _slugify). This catches
|
||||
# legacy / messy values from the old DB and common English/Romanian variants.
|
||||
# Alias table: already-slugified input -> canonical category slug.
# Entries are grouped by target category; every value is a key of CATEGORIES.
_CATEGORY_ALIASES: Dict[str, str] = {
    # -- legacy / junk values ------------------------------------------------
    "general-activity": "altele",
    "general": "altele",
    "educational": "creative-stem",
    "d": "altele",
    "a": "altele",
    "b": "altele",
    "c": "altele",
    # -- jocuri-cercetasesti (scouting) --------------------------------------
    "cercetasie": "jocuri-cercetasesti",
    "cercetasesti": "jocuri-cercetasesti",
    "scout": "jocuri-cercetasesti",
    "scouting": "jocuri-cercetasesti",
    "scout-games": "jocuri-cercetasesti",
    "jocuri-cercetasesti": "jocuri-cercetasesti",
    # -- team-building -------------------------------------------------------
    "teambuilding": "team-building",
    "team": "team-building",
    "cooperare": "team-building",
    # -- icebreakers ---------------------------------------------------------
    "icebreaker": "icebreakers",
    "spargerea-ghetii": "icebreakers",
    "cunoastere": "icebreakers",
    "energizers": "icebreakers",
    "energizer": "icebreakers",
    # -- camp-outdoor --------------------------------------------------------
    "camp": "camp-outdoor",
    "tabara": "camp-outdoor",
    "outdoor": "camp-outdoor",
    "aer-liber": "camp-outdoor",
    # -- wide-games ----------------------------------------------------------
    "wide-game": "wide-games",
    "jocuri-de-teren": "wide-games",
    "joc-de-teren": "wide-games",
    "big-games": "wide-games",
    # -- orientare -----------------------------------------------------------
    "orienteering": "orientare",
    "navigatie": "orientare",
    # -- prim-ajutor ---------------------------------------------------------
    "first-aid": "prim-ajutor",
    "primul-ajutor": "prim-ajutor",
    # -- escape-room-puzzle --------------------------------------------------
    "escape-room": "escape-room-puzzle",
    "escaperoom": "escape-room-puzzle",
    "puzzle": "escape-room-puzzle",
    "puzzles": "escape-room-puzzle",
    "ghicitori": "escape-room-puzzle",
    # -- creative-stem -------------------------------------------------------
    "creative": "creative-stem",
    "creativitate": "creative-stem",
    "stem": "creative-stem",
    "arts-and-crafts": "creative-stem",
    "craft": "creative-stem",
    "crafts": "creative-stem",
    "stiinta": "creative-stem",
    # -- sports-active -------------------------------------------------------
    "sport": "sports-active",
    "sports": "sports-active",
    "sportive": "sports-active",
    "active": "sports-active",
    "miscare": "sports-active",
    "physical": "sports-active",
    # -- cantece-ceremonii ---------------------------------------------------
    "cantece": "cantece-ceremonii",
    "cantec": "cantece-ceremonii",
    "songs": "cantece-ceremonii",
    "ceremonii": "cantece-ceremonii",
    "ceremonie": "cantece-ceremonii",
    "ceremony": "cantece-ceremonii",
    # -- retete --------------------------------------------------------------
    "reteta": "retete",
    "recipe": "retete",
    "recipes": "retete",
    "cooking": "retete",
    "gatit": "retete",
    # -- supravietuire -------------------------------------------------------
    "survival": "supravietuire",
    "supravietuire": "supravietuire",
    # -- integrare-incluziune ------------------------------------------------
    "integrare": "integrare-incluziune",
    "incluziune": "integrare-incluziune",
    "inclusion": "integrare-incluziune",
    # -- conflict-empatie ----------------------------------------------------
    "conflict": "conflict-empatie",
    "empatie": "conflict-empatie",
    "empathy": "conflict-empatie",
    "rezolvarea-conflictelor": "conflict-empatie",
    # -- altele (explicit fallback spellings) --------------------------------
    "altele": "altele",
    "other": "altele",
    "others": "altele",
    "misc": "altele",
}
|
||||
|
||||
|
||||
def _slugify(value: str) -> str:
    """Turn free text into a lowercase, ASCII, hyphen-separated slug.

    Diacritics are removed, the text is lowercased, and every run of
    characters outside ``a-z0-9`` becomes a single hyphen. Leading and
    trailing hyphens are dropped. A falsy ``value`` yields ``""``.
    """
    if not value:
        return ""
    # NFKD splits an accented letter into its base letter plus combining
    # marks (ă -> a + breve); dropping the marks leaves the plain letter.
    base_chars = [
        ch
        for ch in unicodedata.normalize("NFKD", value)
        if unicodedata.combining(ch) == 0
    ]
    lowered = "".join(base_chars).lower().strip()
    return re.sub(r"[^a-z0-9]+", "-", lowered).strip("-")
|
||||
|
||||
|
||||
def normalize_category(value: str) -> str:
    """Map an arbitrary string to a valid category slug.

    The input is slugified (see ``_slugify``) and then resolved in order:

    1. an exact canonical slug from ``CATEGORIES``;
    2. a known alias from ``_CATEGORY_ALIASES``;
    3. the slugified Romanian display name of a category, so that a label
       such as "Cântece și ceremonii" maps back to its own slug instead
       of silently degrading to the fallback;
    4. ``FALLBACK_CATEGORY``.

    Args:
        value: Raw category text. Non-string values are passed through
            ``str()`` first.

    Returns:
        One of ``CATEGORY_SLUGS``; ``FALLBACK_CATEGORY`` for anything
        empty or unrecognised.
    """
    if not value:
        return FALLBACK_CATEGORY

    slug = _slugify(str(value))
    if not slug:
        # Input consisted only of punctuation / whitespace.
        return FALLBACK_CATEGORY

    if slug in CATEGORIES:
        return slug

    alias_target = _CATEGORY_ALIASES.get(slug)
    if alias_target is not None:
        return alias_target

    # Display names contain spaces, slashes and diacritics, so they only
    # match after being slugified the same way as the input.
    for canonical, display_name in CATEGORIES.items():
        if _slugify(display_name) == slug:
            return canonical

    return FALLBACK_CATEGORY
|
||||
|
||||
|
||||
def normalize_content_type(value: str) -> str:
    """Map an arbitrary string to a valid content_type slug.

    The input is slugified (see ``_slugify``), matched against
    ``CONTENT_TYPES``, then against a small alias table covering
    Romanian plurals and English singular/plural forms.

    Args:
        value: Raw content-type text. Non-string values are passed
            through ``str()`` first.

    Returns:
        One of ``CONTENT_TYPE_SLUGS``; ``DEFAULT_CONTENT_TYPE`` for
        anything empty or unrecognised.
    """
    if not value:
        return DEFAULT_CONTENT_TYPE

    slug = _slugify(str(value))
    if slug in CONTENT_TYPES:
        return slug

    # Plural / English spellings. English plurals are listed for every
    # type (not only "games") so that e.g. "songs" and "recipes" are not
    # misfiled under the default content type.
    aliases = {
        "jocuri": "joc",
        "game": "joc",
        "games": "joc",
        "activitati": "activitate",
        "activity": "activitate",
        "activities": "activitate",
        "retete": "reteta",
        "recipe": "reteta",
        "recipes": "reteta",
        "cantece": "cantec",
        "song": "cantec",
        "songs": "cantec",
        "ceremonii": "ceremonie",
        "ceremony": "ceremonie",
        "ceremonies": "ceremonie",
    }
    return aliases.get(slug, DEFAULT_CONTENT_TYPE)
|
||||
|
||||
|
||||
def is_valid_category(slug: str) -> bool:
    """Report whether ``slug`` is one of the canonical category slugs.

    The check is an exact key lookup in ``CATEGORIES``; no slugifying or
    alias resolution is applied.
    """
    known_slugs = CATEGORIES.keys()
    return slug in known_slugs
|
||||
|
||||
|
||||
def category_display_name(slug: str) -> str:
    """Return the Romanian label for a category slug.

    An unknown ``slug`` is returned unchanged.
    """
    try:
        return CATEGORIES[slug]
    except KeyError:
        return slug
|
||||
|
||||
|
||||
def content_type_display_name(slug: str) -> str:
    """Return the Romanian label for a content_type slug.

    An unknown ``slug`` is returned unchanged.
    """
    if slug in CONTENT_TYPES:
        return CONTENT_TYPES[slug]
    return slug
|
||||
Reference in New Issue
Block a user