Implements the approved plan to replace the broken regex/index-master extraction with an LLM-subagent pipeline.

Four parallel lanes:
- Lane A — scripts/extract_common.py (PDF/docx/doc/pptx/html/zip, no max_pages truncation), normalize_sources.py, chunk_sources.py (~20pg chunks + overlap, manifest registry), activity_schema.json.
- Lane B — app/config_taxonomy.py (16 fixed category slugs), schema rebuilt from scratch in app/models/ with content_type, language, source_files, source_excerpt, normalized_name, extraction_confidence, needs_review; FTS5 + 3 triggers extended with materials_list and skills_developed.
- Lane C — build_database.py (--rebuild, atomic swap, schema + fuzzy source_excerpt validation, dedup with needs_review band), validate_extractions.py, review_queue.py, new run_extraction.py orchestrator, SUBAGENT_PROMPT.md.
- Lane D — search.py content_type/language filters (default search excludes non-game content), E7 schema-compat audit; fixed a NULL keywords AttributeError in _boost_search_relevance.

Removes 8 orphaned/dead scripts and app/services/parser.py + indexer.py. Adds tests/ (70 passing, 1 skipped — libreoffice absent).

Note: Lane D made one additive edit to app/models/database.py (_update_category_counts) to surface content_type/language in get_filter_options, outside its nominal lane boundary but after Lane B completed.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
111 lines
3.6 KiB
JSON
111 lines
3.6 KiB
JSON
{
|
|
"$schema": "http://json-schema.org/draft-07/schema#",
|
|
"title": "Game-library extraction output",
|
|
"description": "One subagent output file: a header carrying provenance/version metadata plus the list of activities extracted from a single chunk.",
|
|
"type": "object",
|
|
"required": ["header", "activities"],
|
|
"additionalProperties": false,
|
|
"properties": {
|
|
"header": {
|
|
"type": "object",
|
|
"required": ["source_hash", "schema_version", "prompt_version", "chunk_range"],
|
|
"additionalProperties": true,
|
|
"properties": {
|
|
"source_hash": {"type": "string", "minLength": 8},
|
|
"schema_version": {"type": "string"},
|
|
"prompt_version": {"type": "string"},
|
|
"chunk_range": {"type": "string"},
|
|
"source_id": {"type": ["string", "null"]},
|
|
"chunk_key": {"type": ["string", "null"]}
|
|
}
|
|
},
|
|
"activities": {
|
|
"type": "array",
|
|
"items": {"$ref": "#/definitions/activity"}
|
|
}
|
|
},
|
|
"definitions": {
|
|
"activity": {
|
|
"type": "object",
|
|
"required": [
|
|
"name",
|
|
"description",
|
|
"category",
|
|
"content_type",
|
|
"language",
|
|
"extraction_confidence",
|
|
"source_excerpt",
|
|
"page_reference"
|
|
],
|
|
"additionalProperties": false,
|
|
"properties": {
|
|
"name": {"type": "string", "minLength": 3},
|
|
"description": {"type": "string", "minLength": 1},
|
|
"rules": {"type": ["string", "null"]},
|
|
"variations": {"type": ["string", "null"]},
|
|
"category": {
|
|
"type": "string",
|
|
"enum": [
|
|
"jocuri-cercetasesti",
|
|
"team-building",
|
|
"icebreakers",
|
|
"camp-outdoor",
|
|
"wide-games",
|
|
"orientare",
|
|
"prim-ajutor",
|
|
"escape-room-puzzle",
|
|
"creative-stem",
|
|
"sports-active",
|
|
"cantece-ceremonii",
|
|
"retete",
|
|
"supravietuire",
|
|
"integrare-incluziune",
|
|
"conflict-empatie",
|
|
"altele"
|
|
]
|
|
},
|
|
"subcategory": {"type": ["string", "null"]},
|
|
"content_type": {
|
|
"type": "string",
|
|
"enum": ["joc", "activitate", "reteta", "cantec", "ceremonie"]
|
|
},
|
|
"language": {"type": "string", "enum": ["ro", "en"]},
|
|
"extraction_confidence": {
|
|
"type": "string",
|
|
"enum": ["high", "med", "low"]
|
|
},
|
|
"source_excerpt": {"type": "string", "minLength": 1},
|
|
"page_reference": {"type": "string", "minLength": 1},
|
|
"source_file": {"type": ["string", "null"]},
|
|
"age_group_min": {"type": ["integer", "null"], "minimum": 0},
|
|
"age_group_max": {"type": ["integer", "null"], "minimum": 0},
|
|
"participants_min": {"type": ["integer", "null"], "minimum": 0},
|
|
"participants_max": {"type": ["integer", "null"], "minimum": 0},
|
|
"duration_min": {"type": ["integer", "null"], "minimum": 0},
|
|
"duration_max": {"type": ["integer", "null"], "minimum": 0},
|
|
"materials_category": {"type": ["string", "null"]},
|
|
"materials_list": {
|
|
"type": ["array", "null"],
|
|
"items": {"type": "string"}
|
|
},
|
|
"skills_developed": {
|
|
"type": ["array", "null"],
|
|
"items": {"type": "string"}
|
|
},
|
|
"difficulty_level": {
|
|
"type": ["string", "null"],
|
|
"enum": ["usor", "mediu", "dificil", null]
|
|
},
|
|
"keywords": {
|
|
"type": ["array", "null"],
|
|
"items": {"type": "string"}
|
|
},
|
|
"tags": {
|
|
"type": ["array", "null"],
|
|
"items": {"type": "string"}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|