Rebuild extraction pipeline infrastructure (Faza 0 prep)

Implements the approved plan to replace the broken regex/index-master
extraction with an LLM-subagent pipeline. Four parallel lanes:

Lane A — scripts/extract_common.py (PDF/docx/doc/pptx/html/zip, no
  max_pages truncation), normalize_sources.py, chunk_sources.py
  (~20pg chunks + overlap, manifest registry), activity_schema.json.
Lane B — app/config_taxonomy.py (16 fixed category slugs), schema
  rebuilt from scratch in app/models/ with content_type, language,
  source_files, source_excerpt, normalized_name, extraction_confidence,
  needs_review; FTS5 + 3 triggers extended with materials_list and
  skills_developed.
Lane C — build_database.py (--rebuild, atomic swap, schema + fuzzy
  source_excerpt validation, dedup with needs_review band),
  validate_extractions.py, review_queue.py, new run_extraction.py
  orchestrator, SUBAGENT_PROMPT.md.
Lane D — search.py content_type/language filters (default search
  excludes non-game content), E7 schema-compat audit; fixed a NULL
  keywords AttributeError in _boost_search_relevance.

Removes 8 orphaned/dead scripts and app/services/parser.py +
indexer.py. Adds tests/ (70 passing, 1 skipped — libreoffice absent).

Note: Lane D made one additive edit to app/models/database.py
(_update_category_counts) to surface content_type/language in
get_filter_options, outside its nominal lane boundary but after
Lane B completed.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Claude Agent
2026-05-19 17:43:38 +00:00
parent e0080edf85
commit 66ae831c36
37 changed files with 4101 additions and 1881 deletions

230
app/config_taxonomy.py Normal file
View File

@@ -0,0 +1,230 @@
"""
Controlled category taxonomy for game-library.
Single source of truth for activity categories. The DB stores the *slug*;
the UI displays the Romanian name. `category` (thematic domain) and
`content_type` (form of the content) are INDEPENDENT axes — see plan §2.
"""
import unicodedata
import re
from typing import Dict, List
# --- Categories (thematic domain) --------------------------------------------
# Maps every category slug to the Romanian label shown to users. The set is
# fixed (~16 entries); `altele` is the catch-all entry and must never be
# removed from this table.
CATEGORIES: Dict[str, str] = {
    "jocuri-cercetasesti": "Jocuri cercetășești",
    "team-building": "Team-building",
    "icebreakers": "Icebreakers / spargerea gheții",
    "camp-outdoor": "Tabără și activități în aer liber",
    "wide-games": "Wide games / jocuri de teren",
    "orientare": "Orientare",
    "prim-ajutor": "Prim ajutor",
    "escape-room-puzzle": "Escape room și puzzle",
    "creative-stem": "Creativitate și STEM",
    "sports-active": "Sport și activități fizice",
    "cantece-ceremonii": "Cântece și ceremonii",
    "retete": "Rețete",
    "supravietuire": "Supraviețuire",
    "integrare-incluziune": "Integrare și incluziune",
    "conflict-empatie": "Conflict și empatie",
    "altele": "Altele",
}
# Slug handed back whenever a category cannot be recognised.
FALLBACK_CATEGORY = "altele"
# All valid slugs, in the order they are declared above.
CATEGORY_SLUGS: List[str] = list(CATEGORIES)
# --- Content type (form of the content) --------------------------------------
# Second classification axis, independent of `category`. The non-game types
# named below are left out of the UI's default search (see plan §6).
CONTENT_TYPES: Dict[str, str] = {
    "joc": "Joc",
    "activitate": "Activitate",
    "reteta": "Rețetă",
    "cantec": "Cântec",
    "ceremonie": "Ceremonie",
}
# All valid content_type slugs, in declaration order.
CONTENT_TYPE_SLUGS: List[str] = list(CONTENT_TYPES)
# Content type assumed when none is given or the value is not recognised.
DEFAULT_CONTENT_TYPE = "activitate"
# Slugs that do not describe a game; the default UI search filters them out.
NON_GAME_CONTENT_TYPES: List[str] = ["reteta", "cantec", "ceremonie"]
# --- Aliases -----------------------------------------------------------------
# Lookup table from an already-normalised free-form string (lowercase, no
# diacritics, hyphen-separated -- see _slugify) to a canonical category slug.
# It absorbs junk values left over from the old DB as well as common English
# and Romanian spellings. Each group is spliced in with dict.fromkeys so all
# aliases sharing a target sit in one tuple; insertion order is unchanged.
_CATEGORY_ALIASES: Dict[str, str] = {
    # legacy junk
    **dict.fromkeys(("general-activity", "general"), "altele"),
    "educational": "creative-stem",
    **dict.fromkeys(("d", "a", "b", "c"), "altele"),
    # scouting
    **dict.fromkeys(
        (
            "cercetasie",
            "cercetasesti",
            "scout",
            "scouting",
            "scout-games",
            "jocuri-cercetasesti",
        ),
        "jocuri-cercetasesti",
    ),
    # team building
    **dict.fromkeys(("teambuilding", "team", "cooperare"), "team-building"),
    # icebreakers
    **dict.fromkeys(
        (
            "icebreaker",
            "spargerea-ghetii",
            "cunoastere",
            "energizers",
            "energizer",
        ),
        "icebreakers",
    ),
    # camp / outdoor
    **dict.fromkeys(("camp", "tabara", "outdoor", "aer-liber"), "camp-outdoor"),
    # wide games
    **dict.fromkeys(
        ("wide-game", "jocuri-de-teren", "joc-de-teren", "big-games"),
        "wide-games",
    ),
    # orientation
    **dict.fromkeys(("orienteering", "navigatie"), "orientare"),
    # first aid
    **dict.fromkeys(("first-aid", "primul-ajutor"), "prim-ajutor"),
    # escape room / puzzle
    **dict.fromkeys(
        ("escape-room", "escaperoom", "puzzle", "puzzles", "ghicitori"),
        "escape-room-puzzle",
    ),
    # creative / stem
    **dict.fromkeys(
        (
            "creative",
            "creativitate",
            "stem",
            "arts-and-crafts",
            "craft",
            "crafts",
            "stiinta",
        ),
        "creative-stem",
    ),
    # sports
    **dict.fromkeys(
        ("sport", "sports", "sportive", "active", "miscare", "physical"),
        "sports-active",
    ),
    # songs / ceremonies
    **dict.fromkeys(
        ("cantece", "cantec", "songs", "ceremonii", "ceremonie", "ceremony"),
        "cantece-ceremonii",
    ),
    # recipes
    **dict.fromkeys(
        ("reteta", "recipe", "recipes", "cooking", "gatit"),
        "retete",
    ),
    # survival
    **dict.fromkeys(("survival", "supravietuire"), "supravietuire"),
    # inclusion
    **dict.fromkeys(
        ("integrare", "incluziune", "inclusion"),
        "integrare-incluziune",
    ),
    # conflict / empathy
    **dict.fromkeys(
        ("conflict", "empatie", "empathy", "rezolvarea-conflictelor"),
        "conflict-empatie",
    ),
    # fallback
    **dict.fromkeys(("altele", "other", "others", "misc"), "altele"),
}
def _slugify(value: str) -> str:
    """Turn free-form text into a lowercase ASCII slug.

    Accents are removed (ă -> a, ș -> s, ț -> t, ...), the text is
    lowercased, and every run of characters outside ``a-z0-9`` becomes a
    single hyphen. Leading and trailing hyphens are dropped. Empty input
    yields an empty string.
    """
    if not value:
        return ""
    # NFKD splits an accented letter into its base letter followed by
    # combining marks; discarding the marks leaves the plain letter.
    base_chars = [
        ch
        for ch in unicodedata.normalize("NFKD", value)
        if not unicodedata.combining(ch)
    ]
    lowered = "".join(base_chars).lower().strip()
    return re.sub(r"[^a-z0-9]+", "-", lowered).strip("-")
def normalize_category(value: str) -> str:
    """Resolve an arbitrary string to a valid category slug.

    The input is slugified, then looked up first among the canonical slugs
    and then among the known aliases. Empty input, input that slugifies to
    nothing, and unrecognised input all resolve to the `altele` fallback,
    so the result is always one of CATEGORY_SLUGS.
    """
    slug = _slugify(str(value)) if value else ""
    if not slug:
        return FALLBACK_CATEGORY
    # A canonical slug wins over the alias table.
    if slug in CATEGORIES:
        return slug
    return _CATEGORY_ALIASES.get(slug, FALLBACK_CATEGORY)
# Plural and English spellings accepted for content types, keyed by the
# slugified input. Kept at module level so the table is built once at import
# time rather than on every call, mirroring how category aliases are stored.
_CONTENT_TYPE_ALIASES: Dict[str, str] = {
    "jocuri": "joc",
    "game": "joc",
    "games": "joc",
    "activitati": "activitate",
    "activity": "activitate",
    "activities": "activitate",
    "retete": "reteta",
    "recipe": "reteta",
    "recipes": "reteta",
    "cantece": "cantec",
    "song": "cantec",
    "songs": "cantec",
    "ceremonii": "ceremonie",
    "ceremony": "ceremonie",
    "ceremonies": "ceremonie",
}


def normalize_content_type(value: str) -> str:
    """Resolve an arbitrary string to a valid content_type slug.

    The input is slugified, then looked up first among the canonical
    content types and then among the plural / English aliases. English
    plurals are covered for every type (not only "games"), so values such
    as "recipes" or "songs" no longer collapse silently into the default.

    Returns one of CONTENT_TYPE_SLUGS; empty or unrecognised input yields
    the `activitate` default.
    """
    if not value:
        return DEFAULT_CONTENT_TYPE
    slug = _slugify(str(value))
    # A canonical slug wins over the alias table.
    if slug in CONTENT_TYPES:
        return slug
    return _CONTENT_TYPE_ALIASES.get(slug, DEFAULT_CONTENT_TYPE)
def is_valid_category(slug: str) -> bool:
    """Report whether `slug` is one of the canonical category slugs.

    The check is an exact key lookup in CATEGORIES; aliases and
    un-normalised spellings are not accepted.
    """
    return slug in CATEGORIES.keys()
def category_display_name(slug: str) -> str:
    """Return the Romanian label for a category slug.

    An unknown slug is returned unchanged.
    """
    try:
        return CATEGORIES[slug]
    except KeyError:
        return slug
def content_type_display_name(slug: str) -> str:
    """Return the Romanian label for a content_type slug.

    An unknown slug is returned unchanged.
    """
    return CONTENT_TYPES[slug] if slug in CONTENT_TYPES else slug