Implements the approved plan to replace the broken regex/index-master extraction with an LLM-subagent pipeline. Four parallel lanes: Lane A — scripts/extract_common.py (PDF/docx/doc/pptx/html/zip, no max_pages truncation), normalize_sources.py, chunk_sources.py (~20pg chunks + overlap, manifest registry), activity_schema.json. Lane B — app/config_taxonomy.py (16 fixed category slugs), schema rebuilt from scratch in app/models/ with content_type, language, source_files, source_excerpt, normalized_name, extraction_confidence, needs_review; FTS5 + 3 triggers extended with materials_list and skills_developed. Lane C — build_database.py (--rebuild, atomic swap, schema + fuzzy source_excerpt validation, dedup with needs_review band), validate_extractions.py, review_queue.py, new run_extraction.py orchestrator, SUBAGENT_PROMPT.md. Lane D — search.py content_type/language filters (default search excludes non-game content), E7 schema-compat audit; fixed a NULL keywords AttributeError in _boost_search_relevance. Removes 8 orphaned/dead scripts and app/services/parser.py + indexer.py. Adds tests/ (70 passing, 1 skipped — libreoffice absent). Note: Lane D made one additive edit to app/models/database.py (_update_category_counts) to surface content_type/language in get_filter_options, outside its nominal lane boundary but after Lane B completed. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
140 lines
4.7 KiB
Python
140 lines
4.7 KiB
Python
"""
Integration tests for the FTS5 search index.

Confirms that materials_list and skills_developed are indexed by FTS5 and kept
in sync by the insert / update / delete triggers (plan §6, §7).
"""
|
|
|
|
import os
|
|
import sys
|
|
import json
|
|
|
|
import pytest
|
|
|
|
# Put the parent of this file's directory on sys.path so the `app` package
# can be imported no matter which working directory pytest is launched from.
_THIS_DIR = os.path.dirname(__file__)
PROJECT_ROOT = os.path.abspath(os.path.join(_THIS_DIR, ".."))
if PROJECT_ROOT not in sys.path:
    # Prepend so this checkout is found before other entries on the path.
    sys.path.insert(0, PROJECT_ROOT)
|
|
|
|
from app.models.activity import Activity # noqa: E402
|
|
from app.models.database import DatabaseManager # noqa: E402
|
|
|
|
|
|
@pytest.fixture
def db(tmp_path):
    """Provide a DatabaseManager whose SQLite file lives under pytest's tmp_path."""
    db_file = tmp_path / "test_activities.db"
    # DatabaseManager is handed the location as a plain string path.
    return DatabaseManager(str(db_file))
|
|
|
|
|
|
def _make_activity(**overrides):
    """Build an Activity from a set of default fields.

    Any keyword argument replaces the default of the same name or adds a
    field that the defaults do not set.
    """
    defaults = {
        "name": "Vânătoarea de comori",
        "description": "O activitate de echipă în aer liber.",
        "category": "camp-outdoor",
        "content_type": "joc",
        "source_file": "test.txt",
        "language": "ro",
    }
    # Later keys win, so caller-supplied values take precedence over defaults.
    return Activity(**{**defaults, **overrides})
|
|
|
|
|
|
def test_search_by_materials_list(db):
    """Searching for a word present only in materials_list finds the activity."""
    db.insert_activity(_make_activity(materials_list="frânghie, eșarfă, busolă"))

    hits = db.search_activities(search_text="busolă")

    # Exactly one match, and it is the activity inserted above.
    assert len(hits) == 1
    assert hits[0]["name"] == "Vânătoarea de comori"
|
|
|
|
|
|
def test_search_by_skills_developed(db):
    """Searching for a word present only in skills_developed finds the activity."""
    db.insert_activity(
        _make_activity(skills_developed="comunicare, leadership, rabdare")
    )

    hits = db.search_activities(search_text="leadership")

    # Exactly one match, and it is the activity inserted above.
    assert len(hits) == 1
    assert hits[0]["name"] == "Vânătoarea de comori"
|
|
|
|
|
|
def test_term_absent_from_indexed_columns_no_hit(db):
    """Control case: a search term found in none of the activity's fields returns no rows."""
    control_activity = _make_activity(materials_list="frânghie")
    db.insert_activity(control_activity)

    hits = db.search_activities(search_text="zzzunlikelyterm")

    assert hits == []
|
|
|
|
|
|
def test_delete_trigger_removes_from_fts(db):
    """A row deleted with raw SQL disappears from search results (delete trigger)."""
    term = "catalige"
    row_id = db.insert_activity(_make_activity(materials_list=term))

    # Sanity check: the term is searchable before the row is removed.
    assert len(db.search_activities(search_text=term)) == 1

    # Delete through the raw connection rather than a DatabaseManager helper,
    # so keeping the index in sync is left to the database itself.
    with db._get_connection() as conn:
        conn.execute("DELETE FROM activities WHERE id = ?", (row_id,))
        conn.commit()

    assert db.search_activities(search_text=term) == []
|
|
|
|
|
|
def test_update_trigger_resyncs_fts(db):
    """Changing materials_list with raw SQL is reflected in search (update trigger)."""
    old_term = "creioane"
    new_term = "acuarele"
    row_id = db.insert_activity(_make_activity(materials_list=old_term))

    # Sanity check: the original term is searchable before the update.
    assert len(db.search_activities(search_text=old_term)) == 1

    # Update through the raw connection so that re-indexing is left to the
    # database itself.
    update_sql = "UPDATE activities SET materials_list = ? WHERE id = ?"
    with db._get_connection() as conn:
        conn.execute(update_sql, (new_term, row_id))
        conn.commit()

    # The old term no longer matches; the new term matches the same single row.
    assert db.search_activities(search_text=old_term) == []
    assert len(db.search_activities(search_text=new_term)) == 1
|
|
|
|
|
|
def test_rebuild_fts_index(db):
    """rebuild_fts_index keeps materials_list / skills_developed searchable.

    A single activity carries one distinct term in each of the two columns.
    After the index is rebuilt, each term must still return exactly that one
    activity, so the test covers both columns named above rather than only
    skills_developed.
    """
    db.insert_activity(
        _make_activity(materials_list="catalige", skills_developed="orientare")
    )

    db.rebuild_fts_index()

    # One assertion per column so a failure points at the column that was lost.
    assert len(db.search_activities(search_text="catalige")) == 1
    assert len(db.search_activities(search_text="orientare")) == 1
|
|
|
|
|
|
def test_new_schema_columns_round_trip(db):
    """The newer activity fields are stored, read back, and rebuilt by from_dict."""
    source_files = ["a.txt", "b.txt"]
    activity_id = db.insert_activity(
        _make_activity(
            source_files=source_files,
            source_excerpt="Citat scurt din sursă.",
            extraction_confidence="high",
            needs_review=1,
            normalized_name="vanatoarea de comori",
        )
    )

    row = db.get_activity_by_id(activity_id)

    # Scalar columns are compared directly against the values that went in.
    expected_scalars = {
        "content_type": "joc",
        "language": "ro",
        "extraction_confidence": "high",
        "needs_review": 1,
        "normalized_name": "vanatoarea de comori",
        "source_excerpt": "Citat scurt din sursă.",
    }
    for column, expected in expected_scalars.items():
        assert row[column] == expected

    # source_files is decoded with json.loads before comparison, i.e. the row
    # is expected to hold it as JSON text.
    assert json.loads(row["source_files"]) == source_files

    # Rebuilding the model from the row yields the list form again.
    restored = Activity.from_dict(row)
    assert restored.source_files == source_files
    assert restored.content_type == "joc"
|
|
|
|
|
|
def test_normalized_name_auto_derived(db):
    """An Activity built without normalized_name derives it from name."""
    # NOTE(review): the db fixture is requested but never used in this test.
    constructor_args = {
        "name": "Ștafetă cu Obstacole",
        "description": "desc",
        "category": "sports-active",
        "source_file": "t.txt",
    }
    relay = Activity(**constructor_args)

    # Expected form is lower-case with the Romanian diacritics removed.
    assert relay.normalized_name == "stafeta cu obstacole"
|