Faza 1 complete: bilingual+enrichment plumbing, UI/filters, frozen DB

Extraction finished (575/588 chunks extracted; 6 blocked by the content
filter, 7 awaiting re-extraction). DB rebuilt and frozen at 9418 activities,
so content_keys are now stable for the enrichment overlay.

Part A (plumbing + UI):
- database.py: name_ro/description_ro/rules_ro/variations_ro, indoor_outdoor,
  space_needed, estimated_fields, source_id/source_ids/chunk_key columns;
  FTS5 indexes the 4 *_ro columns across CREATE + all 3 triggers; new equality
  filters + category counts for both axes.
- activity.py: new fields + bilingual display helpers (get_display_*,
  is_estimated, axis displays).
- config_taxonomy.py: INDOOR_OUTDOOR/SPACE_NEEDED enums + normalizers
  (None on unrecognised, no fabrication).
- search.py / routes.py / config.py / templates / css: new dropdowns,
  RO-primary rendering with "(estimat)" markers and collapsible original
  text, and a /source/<id> download route shipped DARK behind
  SOURCE_DOWNLOAD_ENABLED (copyright opt-in).
- build_database.py: source_id/chunk_key in dict_to_activity; merge_cluster
  unions source_ids without touching enrichment fields.

Part B (enrichment pipeline, built not yet run):
- build_database.py: load_enrichment + apply_enrichment (post-dedup, keyed on
  content_key) + --enrichment CLI + stated-vs-estimated QA.
- run_enrichment.py (resumable, --source/--limit pilot scoping, --collect),
  ENRICHMENT_PROMPT.md.

Repair: scripts/repair_extractions.py fixes the subagents' systematic
unescaped-ASCII-quote bug with a faithful char-scanner (escapes, never
truncates) + schema validation + a strictly-more-text guard. json_repair was
tried first, truncated silently, and is NOT used. build_database has no repair
dependency.

Tests: tests/test_enrichment.py added; 99 pass.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
Claude Agent
2026-05-29 18:10:13 +00:00
parent 46d9592a55
commit bcfb6841eb
18 changed files with 1579 additions and 167 deletions

231
tests/test_enrichment.py Normal file
View File

@@ -0,0 +1,231 @@
"""
Tests for the enrichment overlay (plan Part B) and the new filter axes /
bilingual display helpers (plan Part A).
Covers:
* config_taxonomy.normalize_indoor_outdoor / normalize_space_needed
* build_database.apply_enrichment keying, field application, estimated tally
* DatabaseManager indoor_outdoor / space_needed equality filters
* FTS5 indexing of the *_ro columns
* Activity bilingual display helpers
"""
import os
import sys
import pytest
PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
if PROJECT_ROOT not in sys.path:
sys.path.insert(0, PROJECT_ROOT)
SCRIPTS = os.path.join(PROJECT_ROOT, "scripts")
if SCRIPTS not in sys.path:
sys.path.insert(0, SCRIPTS)
from app.models.activity import Activity # noqa: E402
from app.models.database import DatabaseManager # noqa: E402
from app.config_taxonomy import ( # noqa: E402
normalize_indoor_outdoor,
normalize_space_needed,
)
from import_common import content_key, normalize_name # noqa: E402
from build_database import apply_enrichment # noqa: E402
# --------------------------------------------------------------------------
# taxonomy normalizers
# --------------------------------------------------------------------------
@pytest.mark.parametrize(
    "raw,expected",
    [
        ("indoor", "indoor"),
        ("Outdoor", "outdoor"),
        ("either", "either"),
        ("interior", "indoor"),
        ("aer liber", "outdoor"),
        ("both", "either"),
        ("", None),
        ("nonsense", None),
        (None, None),
    ],
)
def test_normalize_indoor_outdoor(raw, expected):
    """Check that each raw input normalizes to the expected value (or None)."""
    normalized = normalize_indoor_outdoor(raw)
    assert normalized == expected
@pytest.mark.parametrize(
    "raw,expected",
    [
        ("mic", "mic"),
        ("MEDIU", "mediu"),
        ("mare", "mare"),
        ("small", "mic"),
        ("large", "mare"),
        ("", None),
        ("huge", None),
        (None, None),
    ],
)
def test_normalize_space_needed(raw, expected):
    """Check that each raw input normalizes to the expected value (or None)."""
    normalized = normalize_space_needed(raw)
    assert normalized == expected
# --------------------------------------------------------------------------
# apply_enrichment
# --------------------------------------------------------------------------
def _activity(name="Joc de test", description="O descriere de test.", language="ro"):
    """Build a minimal Activity for tests; only name/description/language vary."""
    fields = {
        "name": name,
        "description": description,
        "category": "team-building",
        "content_type": "joc",
        "source_file": "t.txt",
        "language": language,
    }
    return Activity(**fields)
def _key_for(act: Activity) -> str:
    """Return the content_key for *act*, as used to key enrichment entries here."""
    # Use the stored normalized name when it is truthy; otherwise derive it.
    name_part = act.normalized_name or normalize_name(act.name)
    description_part = act.description or ""
    return content_key(name_part, act.language, description_part)
def test_apply_enrichment_matches_and_applies_fields():
    """An entry keyed by the activity's content key is applied and tallied."""
    activity = _activity()
    matching_key = _key_for(activity)
    payload = {
        "name_ro": "Joc de test (RO)",
        "description_ro": "Descriere îmbogățită în română.",
        "indoor_outdoor": "outdoor",
        "space_needed": "mediu",
        "participants_min": 4,
        "participants_max": 12,
        "estimated_fields": ["space_needed", "participants_min", "participants_max"],
    }

    stats = apply_enrichment([activity], {matching_key: payload})

    # Every field in the entry lands on the activity.
    assert activity.name_ro == "Joc de test (RO)"
    assert activity.description_ro == "Descriere îmbogățită în română."
    assert activity.indoor_outdoor == "outdoor"
    assert activity.space_needed == "mediu"
    assert activity.participants_min == 4
    assert activity.participants_max == 12
    assert set(activity.estimated_fields) == {
        "space_needed",
        "participants_min",
        "participants_max",
    }

    # Bookkeeping: one entry, matched, nothing orphaned.
    assert stats["entries"] == 1
    assert stats["matched"] == 1
    assert stats["orphaned"] == 0

    # indoor_outdoor is not listed as estimated (stated); space_needed is.
    assert stats["fields_stated"].get("indoor_outdoor") == 1
    assert stats["fields_estimated"].get("space_needed") == 1
def test_apply_enrichment_orphan_entry_counted():
    """An entry whose key matches no activity is counted as orphaned."""
    activity = _activity()
    unmatched_key = "deadbeef" * 5
    overlay = {unmatched_key: {"name_ro": "nu se potrivește"}}

    stats = apply_enrichment([activity], overlay)

    assert stats["matched"] == 0
    assert stats["orphaned"] == 1
    # The activity must not have been modified.
    assert activity.name_ro is None
def test_apply_enrichment_absent_fields_leave_value_untouched():
    """Fields missing from an entry keep their pre-existing value."""
    activity = _activity()
    activity.participants_min = 5
    matching_key = _key_for(activity)

    # The entry only supplies a translated name; participants_min is absent.
    name_only = {"name_ro": "Tradus"}
    apply_enrichment([activity], {matching_key: name_only})

    assert activity.participants_min == 5
    assert activity.name_ro == "Tradus"
def test_apply_enrichment_drops_unrecognised_enum():
    """An unrecognised axis value is dropped while a valid sibling is applied."""
    activity = _activity()
    matching_key = _key_for(activity)
    payload = {"indoor_outdoor": "spaceship", "space_needed": "mic"}

    apply_enrichment([activity], {matching_key: payload})

    # "spaceship" is not a recognised indoor_outdoor value, so nothing is set.
    assert activity.indoor_outdoor is None
    assert activity.space_needed == "mic"
# --------------------------------------------------------------------------
# DB equality filters + FTS on *_ro
# --------------------------------------------------------------------------
@pytest.fixture
def db(tmp_path):
    """Provide a DatabaseManager whose file lives under pytest's tmp_path."""
    db_file = tmp_path / "enrich.db"
    return DatabaseManager(str(db_file))
def _insert(db, **overrides):
    """Insert an Activity built from default fields updated with *overrides*.

    Returns whatever ``db.insert_activity`` returns.
    """
    defaults = {
        "name": "Activitate",
        "description": "desc",
        "category": "camp-outdoor",
        "content_type": "joc",
        "source_file": "t.txt",
        "language": "ro",
    }
    # Overrides win over defaults on key collisions.
    fields = {**defaults, **overrides}
    return db.insert_activity(Activity(**fields))
def test_indoor_outdoor_equality_filter(db):
    """Filtering on indoor_outdoor returns only the row with that exact value."""
    for title, axis in (("In casa", "indoor"), ("Afara", "outdoor")):
        _insert(db, name=title, indoor_outdoor=axis)

    matches = db.search_activities(indoor_outdoor="outdoor")

    assert len(matches) == 1
    assert matches[0]["name"] == "Afara"
def test_space_needed_equality_filter(db):
    """Filtering on space_needed returns only the row with that exact value."""
    for title, size in (("Mic", "mic"), ("Mare", "mare")):
        _insert(db, name=title, space_needed=size)

    matches = db.search_activities(space_needed="mare")

    assert len(matches) == 1
    assert matches[0]["name"] == "Mare"
def test_fts_indexes_name_ro(db):
    """A text search finds a row via a term that appears only in name_ro."""
    _insert(db, name="Treasure Hunt", name_ro="Vânătoarea de comori")

    # "comori" occurs in the Romanian name only, not in the original name.
    hits = db.search_activities(search_text="comori")

    assert len(hits) == 1
    assert hits[0]["name"] == "Treasure Hunt"
def test_fts_indexes_description_ro(db):
    """A text search finds a row via a term that appears only in description_ro."""
    _insert(
        db,
        name="Game",
        description="english desc",
        description_ro="o activitate de cooperare",
    )

    hits = db.search_activities(search_text="cooperare")

    assert len(hits) == 1
def test_ro_columns_round_trip(db):
    """Insert an activity with the new fields set and check they survive a reload.

    The row is read back with ``get_activity_by_id`` and rebuilt through
    ``Activity.from_dict``. Every field passed to the insert is asserted, so
    losing any one of them on the way in or out fails the test.
    """
    aid = _insert(
        db, name="X", name_ro="X-ro", description_ro="d-ro",
        rules_ro="r-ro", variations_ro="v-ro",
        indoor_outdoor="either", space_needed="mediu",
        estimated_fields=["duration_min"], source_id="src1",
        source_ids=["src1", "src2"], chunk_key="src1.part01",
    )
    row = db.get_activity_by_id(aid)
    loaded = Activity.from_dict(row)

    # Romanian text twins.
    assert loaded.name_ro == "X-ro"
    assert loaded.description_ro == "d-ro"
    assert loaded.rules_ro == "r-ro"
    assert loaded.variations_ro == "v-ro"

    # Filter axes.
    assert loaded.indoor_outdoor == "either"
    assert loaded.space_needed == "mediu"

    # List-valued and provenance fields.
    assert loaded.estimated_fields == ["duration_min"]
    assert loaded.source_id == "src1"
    assert loaded.source_ids == ["src1", "src2"]
    assert loaded.chunk_key == "src1.part01"
# --------------------------------------------------------------------------
# display helpers
# --------------------------------------------------------------------------
def test_display_helpers_prefer_ro_with_fallback():
    """Display helpers return the original text until a *_ro value is set."""
    activity = _activity(name="Original", description="Original desc")

    # Nothing translated yet: the helpers fall back to the original fields.
    assert activity.get_display_name() == "Original"
    assert activity.get_display_description() == "Original desc"

    activity.name_ro = "Tradus"
    activity.description_ro = "Descriere tradusă"

    # With translations set, the helpers return the Romanian values.
    assert activity.get_display_name() == "Tradus"
    assert activity.get_display_description() == "Descriere tradusă"
    assert activity.has_translation() is True
def test_is_estimated_and_axis_displays():
    """Axis display labels and is_estimated reflect the fields set on the activity."""
    activity = _activity()
    activity.indoor_outdoor = "outdoor"
    activity.space_needed = "mare"
    activity.estimated_fields = ["space_needed"]

    assert activity.get_indoor_outdoor_display() == "Exterior"
    assert activity.get_space_needed_display() == "Spațiu mare"

    # Only space_needed is listed in estimated_fields.
    assert activity.is_estimated("space_needed") is True
    assert activity.is_estimated("indoor_outdoor") is False