Faza 1 complete: bilingual+enrichment plumbing, UI/filters, frozen DB
Extraction finished (575/588 chunks; 6 content-filter-blocked, 7 awaiting re-extraction). DB rebuilt and frozen at 9418 activities — content_keys are now stable for the enrichment overlay.

Part A (plumbing + UI):
- database.py: name_ro/description_ro/rules_ro/variations_ro, indoor_outdoor, space_needed, estimated_fields, source_id/source_ids/chunk_key columns; FTS5 indexes the 4 *_ro columns across CREATE + all 3 triggers; new equality filters + category counts for both axes.
- activity.py: new fields + bilingual display helpers (get_display_*, is_estimated, axis displays).
- config_taxonomy.py: INDOOR_OUTDOOR/SPACE_NEEDED enums + normalizers (None on unrecognised, no fabrication).
- search.py / routes.py / config.py / templates / css: new dropdowns, RO-primary rendering with "(estimat)" markers and collapsible original text, and a /source/<id> download route shipped DARK behind SOURCE_DOWNLOAD_ENABLED (copyright opt-in).
- build_database.py: source_id/chunk_key in dict_to_activity; merge_cluster unions source_ids without touching enrichment fields.

Part B (enrichment pipeline, built not yet run):
- build_database.py: load_enrichment + apply_enrichment (post-dedup, keyed on content_key) + --enrichment CLI + stated-vs-estimated QA.
- run_enrichment.py (resumable, --source/--limit pilot scoping, --collect), ENRICHMENT_PROMPT.md.

Repair: scripts/repair_extractions.py fixes the subagents' systematic unescaped-ASCII-quote bug with a faithful char-scanner (escapes, never truncates) + schema validation + a strictly-more-text guard. json_repair was tried first, truncated silently, and is NOT used. build_database has no repair dependency.

Tests: tests/test_enrichment.py added; 99 pass.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
231
tests/test_enrichment.py
Normal file
231
tests/test_enrichment.py
Normal file
@@ -0,0 +1,231 @@
|
||||
"""
|
||||
Tests for the enrichment overlay (plan Part B) and the new filter axes /
|
||||
bilingual display helpers (plan Part A).
|
||||
|
||||
Covers:
|
||||
* config_taxonomy.normalize_indoor_outdoor / normalize_space_needed
|
||||
* build_database.apply_enrichment keying, field application, estimated tally
|
||||
* DatabaseManager indoor_outdoor / space_needed equality filters
|
||||
* FTS5 indexing of the *_ro columns
|
||||
* Activity bilingual display helpers
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
|
||||
import pytest
|
||||
|
||||
# Put the project root and its scripts/ directory on sys.path so the
# project-local imports that follow can be resolved.
PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
SCRIPTS = os.path.join(PROJECT_ROOT, "scripts")
for _import_root in (PROJECT_ROOT, SCRIPTS):
    # Front-insert in this order, so scripts/ ends up ahead of the project root.
    if _import_root not in sys.path:
        sys.path.insert(0, _import_root)
|
||||
|
||||
from app.models.activity import Activity # noqa: E402
|
||||
from app.models.database import DatabaseManager # noqa: E402
|
||||
from app.config_taxonomy import ( # noqa: E402
|
||||
normalize_indoor_outdoor,
|
||||
normalize_space_needed,
|
||||
)
|
||||
from import_common import content_key, normalize_name # noqa: E402
|
||||
from build_database import apply_enrichment # noqa: E402
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# taxonomy normalizers
|
||||
# --------------------------------------------------------------------------
|
||||
@pytest.mark.parametrize(
    ("raw", "expected"),
    [
        # canonical values, matched case-insensitively
        ("indoor", "indoor"),
        ("Outdoor", "outdoor"),
        ("either", "either"),
        # alternative spellings mapped onto the canonical values
        ("interior", "indoor"),
        ("aer liber", "outdoor"),
        ("both", "either"),
        # empty, unknown and missing inputs yield None
        ("", None),
        ("nonsense", None),
        (None, None),
    ],
)
def test_normalize_indoor_outdoor(raw, expected):
    """normalize_indoor_outdoor returns the expected canonical value (or None) for each input."""
    normalized = normalize_indoor_outdoor(raw)
    assert normalized == expected
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    ("raw", "expected"),
    [
        # canonical values, matched case-insensitively
        ("mic", "mic"),
        ("MEDIU", "mediu"),
        ("mare", "mare"),
        # English spellings mapped onto the canonical values
        ("small", "mic"),
        ("large", "mare"),
        # empty, unknown and missing inputs yield None
        ("", None),
        ("huge", None),
        (None, None),
    ],
)
def test_normalize_space_needed(raw, expected):
    """normalize_space_needed returns the expected canonical value (or None) for each input."""
    normalized = normalize_space_needed(raw)
    assert normalized == expected
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# apply_enrichment
|
||||
# --------------------------------------------------------------------------
|
||||
def _activity(name="Joc de test", description="O descriere de test.", language="ro"):
    """Build an Activity for tests; only name, description and language are adjustable."""
    fixed_fields = {
        "category": "team-building",
        "content_type": "joc",
        "source_file": "t.txt",
    }
    return Activity(
        name=name,
        description=description,
        language=language,
        **fixed_fields,
    )
|
||||
|
||||
|
||||
def _key_for(act: Activity) -> str:
    """Return the content_key computed from the activity's name, language and description."""
    # Use the stored normalized name when it is set; otherwise normalize the raw name.
    name_part = act.normalized_name
    if not name_part:
        name_part = normalize_name(act.name)
    # A missing or empty description is passed as an empty string.
    description_part = act.description or ""
    return content_key(name_part, act.language, description_part)
|
||||
|
||||
|
||||
def test_apply_enrichment_matches_and_applies_fields():
    """An entry keyed on the activity's content_key is applied and reflected in the stats."""
    activity = _activity()
    entry = {
        "name_ro": "Joc de test (RO)",
        "description_ro": "Descriere îmbogățită în română.",
        "indoor_outdoor": "outdoor",
        "space_needed": "mediu",
        "participants_min": 4,
        "participants_max": 12,
        "estimated_fields": ["space_needed", "participants_min", "participants_max"],
    }

    stats = apply_enrichment([activity], {_key_for(activity): entry})

    # every field of the entry landed on the activity
    assert activity.name_ro == "Joc de test (RO)"
    assert activity.description_ro == "Descriere îmbogățită în română."
    assert activity.indoor_outdoor == "outdoor"
    assert activity.space_needed == "mediu"
    assert activity.participants_min == 4
    assert activity.participants_max == 12
    assert set(activity.estimated_fields) == {
        "space_needed",
        "participants_min",
        "participants_max",
    }

    # one entry, matched, nothing orphaned
    assert stats["entries"] == 1
    assert stats["matched"] == 1
    assert stats["orphaned"] == 0

    # indoor_outdoor is tallied as stated, space_needed as estimated
    assert stats["fields_stated"].get("indoor_outdoor") == 1
    assert stats["fields_estimated"].get("space_needed") == 1
|
||||
|
||||
|
||||
def test_apply_enrichment_orphan_entry_counted():
    """An entry whose key matches no activity is counted as orphaned and changes nothing."""
    activity = _activity()
    unmatched_key = "deadbeef" * 5
    orphan_entry = {"name_ro": "nu se potrivește"}

    stats = apply_enrichment([activity], {unmatched_key: orphan_entry})

    assert stats["matched"] == 0
    assert stats["orphaned"] == 1
    # the activity was left untouched
    assert activity.name_ro is None
|
||||
|
||||
|
||||
def test_apply_enrichment_absent_fields_leave_value_untouched():
    """Fields missing from the enrichment entry keep the value already on the activity."""
    activity = _activity()
    activity.participants_min = 5

    # The entry carries only a translated name, so participants_min must survive.
    name_only_entry = {"name_ro": "Tradus"}
    apply_enrichment([activity], {_key_for(activity): name_only_entry})

    assert activity.participants_min == 5
    assert activity.name_ro == "Tradus"
|
||||
|
||||
|
||||
def test_apply_enrichment_drops_unrecognised_enum():
    """An unrecognised indoor_outdoor value is dropped while a valid space_needed is kept."""
    activity = _activity()
    entry = {"indoor_outdoor": "spaceship", "space_needed": "mic"}

    apply_enrichment([activity], {_key_for(activity): entry})

    # "spaceship" is not a recognised value, so nothing is stored for it
    assert activity.indoor_outdoor is None
    assert activity.space_needed == "mic"
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# DB equality filters + FTS on *_ro
|
||||
# --------------------------------------------------------------------------
|
||||
@pytest.fixture
def db(tmp_path):
    """Provide a DatabaseManager whose database file lives under pytest's tmp_path."""
    db_file = tmp_path / "enrich.db"
    return DatabaseManager(str(db_file))
|
||||
|
||||
|
||||
def _insert(db, **overrides):
    """Insert an Activity built from default fields overlaid with *overrides*.

    Returns whatever ``db.insert_activity`` returns.
    """
    defaults = {
        "name": "Activitate",
        "description": "desc",
        "category": "camp-outdoor",
        "content_type": "joc",
        "source_file": "t.txt",
        "language": "ro",
    }
    # Caller-supplied values win over the defaults.
    fields = {**defaults, **overrides}
    return db.insert_activity(Activity(**fields))
|
||||
|
||||
|
||||
def test_indoor_outdoor_equality_filter(db):
    """Filtering on indoor_outdoor returns only the row stored with that value."""
    for row_name, axis_value in (("In casa", "indoor"), ("Afara", "outdoor")):
        _insert(db, name=row_name, indoor_outdoor=axis_value)

    matches = db.search_activities(indoor_outdoor="outdoor")

    assert len(matches) == 1
    assert matches[0]["name"] == "Afara"
|
||||
|
||||
|
||||
def test_space_needed_equality_filter(db):
    """Filtering on space_needed returns only the row stored with that value."""
    for row_name, axis_value in (("Mic", "mic"), ("Mare", "mare")):
        _insert(db, name=row_name, space_needed=axis_value)

    matches = db.search_activities(space_needed="mare")

    assert len(matches) == 1
    assert matches[0]["name"] == "Mare"
|
||||
|
||||
|
||||
def test_fts_indexes_name_ro(db):
    """A term that appears only in name_ro finds its activity through text search.

    A second activity without the term is inserted as a decoy: with a single
    row in the database the test would also pass if search_text were ignored
    and every row returned.
    """
    _insert(db, name="Treasure Hunt", name_ro="Vânătoarea de comori")
    # decoy: neither its name, its name_ro nor its description contains "comori"
    _insert(db, name="Hide and Seek", name_ro="Ascunselea")

    # term only present in the Romanian twin of the first activity
    res = db.search_activities(search_text="comori")

    assert len(res) == 1
    assert res[0]["name"] == "Treasure Hunt"
|
||||
|
||||
|
||||
def test_fts_indexes_description_ro(db):
    """A term that appears only in description_ro finds its activity through text search.

    A second activity without the term is inserted as a decoy, and the name of
    the hit is checked: with a single row in the database the test would also
    pass if search_text were ignored and every row returned.
    """
    _insert(db, name="Game", description="english desc",
            description_ro="o activitate de cooperare")
    # decoy: none of its text fields contains "cooperare"
    _insert(db, name="Solo", description="another english desc",
            description_ro="o activitate individuală")

    res = db.search_activities(search_text="cooperare")

    assert len(res) == 1
    assert res[0]["name"] == "Game"
|
||||
|
||||
|
||||
def test_ro_columns_round_trip(db):
    """Every new column written by insert_activity is read back unchanged.

    The activity is inserted, fetched by id and rebuilt with
    Activity.from_dict; each value supplied at insert time is then compared,
    so that a column dropped on the write or the read path fails the test.
    """
    aid = _insert(
        db, name="X", name_ro="X-ro", description_ro="d-ro",
        rules_ro="r-ro", variations_ro="v-ro",
        indoor_outdoor="either", space_needed="mediu",
        estimated_fields=["duration_min"], source_id="src1",
        source_ids=["src1", "src2"], chunk_key="src1.part01",
    )
    row = db.get_activity_by_id(aid)
    loaded = Activity.from_dict(row)

    # bilingual text columns
    assert loaded.name_ro == "X-ro"
    assert loaded.description_ro == "d-ro"
    assert loaded.rules_ro == "r-ro"
    assert loaded.variations_ro == "v-ro"

    # filter axes and the estimated-field markers
    assert loaded.indoor_outdoor == "either"
    assert loaded.space_needed == "mediu"
    assert loaded.estimated_fields == ["duration_min"]

    # provenance columns
    assert loaded.source_id == "src1"
    assert loaded.source_ids == ["src1", "src2"]
    assert loaded.chunk_key == "src1.part01"
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# display helpers
|
||||
# --------------------------------------------------------------------------
|
||||
def test_display_helpers_prefer_ro_with_fallback():
    """Display helpers return the original text until a Romanian twin is set, then prefer it."""
    activity = _activity(name="Original", description="Original desc")

    # no translation yet: the helpers fall back to the original text
    assert activity.get_display_name() == "Original"
    assert activity.get_display_description() == "Original desc"

    activity.name_ro = "Tradus"
    activity.description_ro = "Descriere tradusă"

    # with the *_ro fields set, the helpers return the Romanian text
    assert activity.get_display_name() == "Tradus"
    assert activity.get_display_description() == "Descriere tradusă"
    assert activity.has_translation() is True
|
||||
|
||||
|
||||
def test_is_estimated_and_axis_displays():
    """Axis display helpers return their labels and is_estimated follows estimated_fields."""
    activity = _activity()
    activity.indoor_outdoor = "outdoor"
    activity.space_needed = "mare"
    activity.estimated_fields = ["space_needed"]

    # display labels for the two axes
    assert activity.get_indoor_outdoor_display() == "Exterior"
    assert activity.get_space_needed_display() == "Spațiu mare"

    # only the field listed in estimated_fields is reported as estimated
    assert activity.is_estimated("space_needed") is True
    assert activity.is_estimated("indoor_outdoor") is False
|
||||
Reference in New Issue
Block a user