Rebuild extraction pipeline infrastructure (Faza 0 prep)

Implements the approved plan to replace the broken regex/index-master
extraction with an LLM-subagent pipeline. Four parallel lanes:

Lane A — scripts/extract_common.py (PDF/docx/doc/pptx/html/zip, no
  max_pages truncation), normalize_sources.py, chunk_sources.py
  (~20pg chunks + overlap, manifest registry), activity_schema.json.
Lane B — app/config_taxonomy.py (16 fixed category slugs), schema
  rebuilt from scratch in app/models/ with content_type, language,
  source_files, source_excerpt, normalized_name, extraction_confidence,
  needs_review; FTS5 + 3 triggers extended with materials_list and
  skills_developed.
Lane C — build_database.py (--rebuild, atomic swap, schema + fuzzy
  source_excerpt validation, dedup with needs_review band),
  validate_extractions.py, review_queue.py, new run_extraction.py
  orchestrator, SUBAGENT_PROMPT.md.
Lane D — search.py content_type/language filters (default search
  excludes non-game content), E7 schema-compat audit; fixed a NULL
  keywords AttributeError in _boost_search_relevance.

Removes 8 orphaned/dead scripts and app/services/parser.py +
indexer.py. Adds tests/ (70 passing, 1 skipped — libreoffice absent).

Note: Lane D made one additive edit to app/models/database.py
(_update_category_counts) so that get_filter_options surfaces
content_type/language. This edit falls outside Lane D's nominal
boundary, but it was made only after Lane B had completed.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Claude Agent
2026-05-19 17:43:38 +00:00
parent e0080edf85
commit 66ae831c36
37 changed files with 4101 additions and 1881 deletions

139
tests/test_fts.py Normal file
View File

@@ -0,0 +1,139 @@
"""
Integration tests for the FTS5 search index.
Confirms that materials_list and skills_developed are indexed by FTS5 and kept
in sync by the insert / update / delete triggers (plan §6, §7).
"""
import os
import sys
import json
import pytest
# Put the directory above this file's directory on sys.path (once), so the
# `app` package imports below resolve no matter where pytest is launched from.
PROJECT_ROOT = os.path.abspath(
    os.path.join(os.path.dirname(__file__), os.pardir)
)
if PROJECT_ROOT not in sys.path:
    sys.path.insert(0, PROJECT_ROOT)
from app.models.activity import Activity # noqa: E402
from app.models.database import DatabaseManager # noqa: E402
@pytest.fixture
def db(tmp_path):
    """Provide a DatabaseManager backed by a SQLite file under ``tmp_path``.

    The database file is named ``test_activities.db`` and lives in the
    temporary directory pytest supplies to the requesting test.
    """
    database_file = tmp_path / "test_activities.db"
    return DatabaseManager(str(database_file))
def _make_activity(**overrides):
    """Build an Activity from default test values, applying keyword overrides.

    Args:
        **overrides: Keyword arguments for ``Activity``. Each one replaces
            the default of the same name, or is passed through as an
            additional field when no default exists.

    Returns:
        The ``Activity`` constructed from the merged keyword arguments.
    """
    defaults = {
        "name": "Vânătoarea de comori",
        "description": "O activitate de echipă în aer liber.",
        "category": "camp-outdoor",
        "content_type": "joc",
        "source_file": "test.txt",
        "language": "ro",
    }
    # Overrides are unpacked last so they win over the defaults.
    return Activity(**{**defaults, **overrides})
def test_search_by_materials_list(db):
    """Searching for a word found only in materials_list finds the activity."""
    db.insert_activity(
        _make_activity(materials_list="frânghie, eșarfă, busolă")
    )

    hits = db.search_activities(search_text="busolă")

    # Exactly one match, and it is the activity inserted above.
    assert len(hits) == 1
    assert hits[0]["name"] == "Vânătoarea de comori"
def test_search_by_skills_developed(db):
    """Searching for a word found only in skills_developed finds the activity."""
    db.insert_activity(
        _make_activity(skills_developed="comunicare, leadership, rabdare")
    )

    hits = db.search_activities(search_text="leadership")

    # Exactly one match, and it is the activity inserted above.
    assert len(hits) == 1
    assert hits[0]["name"] == "Vânătoarea de comori"
def test_term_absent_from_indexed_columns_no_hit(db):
    """Control case: a term stored in none of the fields returns no results."""
    stored = _make_activity(materials_list="frânghie")
    db.insert_activity(stored)

    hits = db.search_activities(search_text="zzzunlikelyterm")
    assert hits == []
def test_delete_trigger_removes_from_fts(db):
    """A row deleted from ``activities`` stops being searchable (delete trigger)."""
    row_id = db.insert_activity(_make_activity(materials_list="catalige"))

    # Precondition: the term is searchable before the delete.
    hits_before = db.search_activities(search_text="catalige")
    assert len(hits_before) == 1

    # Delete with raw SQL on the manager's own connection, so the only thing
    # that can remove the search entry is the table's delete trigger.
    delete_sql = "DELETE FROM activities WHERE id = ?"
    with db._get_connection() as connection:
        connection.execute(delete_sql, (row_id,))
        connection.commit()

    hits_after = db.search_activities(search_text="catalige")
    assert hits_after == []
def test_update_trigger_resyncs_fts(db):
    """Changing materials_list via SQL updates what search finds (update trigger)."""
    row_id = db.insert_activity(_make_activity(materials_list="creioane"))

    # Precondition: the original term is searchable.
    hits_before = db.search_activities(search_text="creioane")
    assert len(hits_before) == 1

    # Update with raw SQL on the manager's own connection, so the search
    # index can only follow the change through the table's update trigger.
    update_sql = "UPDATE activities SET materials_list = ? WHERE id = ?"
    with db._get_connection() as connection:
        connection.execute(update_sql, ("acuarele", row_id))
        connection.commit()

    # The replaced term no longer matches; the new term does.
    stale_hits = db.search_activities(search_text="creioane")
    assert stale_hits == []
    fresh_hits = db.search_activities(search_text="acuarele")
    assert len(fresh_hits) == 1
def test_rebuild_fts_index(db):
    """After rebuild_fts_index, a skills_developed term is still searchable."""
    stored = _make_activity(skills_developed="orientare")
    db.insert_activity(stored)

    db.rebuild_fts_index()

    hits = db.search_activities(search_text="orientare")
    assert len(hits) == 1
def test_new_schema_columns_round_trip(db):
    """The new schema columns are stored, read back, and reloaded by from_dict."""
    stored = _make_activity(
        source_files=["a.txt", "b.txt"],
        source_excerpt="Citat scurt din sursă.",
        extraction_confidence="high",
        needs_review=1,
        normalized_name="vanatoarea de comori",
    )
    row = db.get_activity_by_id(db.insert_activity(stored))

    # Scalar columns come back from the fetched row unchanged.
    expected_scalars = {
        "content_type": "joc",
        "language": "ro",
        "extraction_confidence": "high",
        "needs_review": 1,
        "normalized_name": "vanatoarea de comori",
    }
    for column, expected in expected_scalars.items():
        assert row[column] == expected

    # source_files is read back from the row as JSON text.
    assert json.loads(row["source_files"]) == ["a.txt", "b.txt"]
    assert row["source_excerpt"] == "Citat scurt din sursă."

    # Rebuilding the model from the row yields source_files as a list again.
    reloaded = Activity.from_dict(row)
    assert reloaded.source_files == ["a.txt", "b.txt"]
    assert reloaded.content_type == "joc"
def test_normalized_name_auto_derived(db):
    """An Activity built without normalized_name derives it from ``name``.

    The expected value is the name lower-cased with diacritics removed.
    The ``db`` fixture is requested but the assertion only inspects the
    in-memory Activity.
    """
    fields = {
        "name": "Ștafetă cu Obstacole",
        "description": "desc",
        "category": "sports-active",
        "source_file": "t.txt",
    }
    built = Activity(**fields)
    assert built.normalized_name == "stafeta cu obstacole"