# -*- coding: utf-8 -*-
"""
Tests for scripts/build_database.py — the import / dedup / swap side.

Covers: category -> slug + `altele` fallback; dedup across all three threshold
bands; EN != RO never merged; field combination on merge; atomic swap with a
simulated mid-build crash; the source_excerpt substring check.
"""

import json
import os
import sys
from pathlib import Path

import pytest

REPO_ROOT = Path(__file__).resolve().parent.parent
SCRIPTS_DIR = REPO_ROOT / "scripts"
# Make both the repo root (for `app.*`) and the scripts directory (for
# `build_database` / `import_common`) importable before the imports below.
for _p in (str(REPO_ROOT), str(SCRIPTS_DIR)):
    if _p not in sys.path:
        sys.path.insert(0, _p)

import build_database as bd  # noqa: E402
from app.models.activity import Activity  # noqa: E402
from app.models.database import DatabaseManager  # noqa: E402


# --------------------------------------------------------------------------
# helpers
# --------------------------------------------------------------------------

def _activity(**over):
    """Build an ``Activity`` from default field values.

    Any keyword argument overrides (or adds to) the defaults before the
    model is constructed.
    """
    base = dict(
        name="Jocul testului",
        description="O activitate de echipa in aer liber.",
        category="team-building",
        content_type="joc",
        language="ro",
        extraction_confidence="high",
    )
    base.update(over)
    return Activity(**base)


def _ext_activity(**over) -> dict:
    """A schema-valid extraction-JSON activity object.

    Returns a plain dict; keyword arguments override the default fields.
    """
    base = dict(
        name="Jocul testului",
        description="O activitate de echipa in aer liber.",
        category="team-building",
        content_type="joc",
        language="ro",
        extraction_confidence="high",
        source_excerpt="ANCHOR-EXCERPT despre jocul testului",
        page_reference="page 1",
    )
    base.update(over)
    return base


def _write_extraction(
    extracted_dir: Path,
    chunk_key: str,
    activities: list,
    source_id: str = "src01",
) -> None:
    """Write an extraction file to ``<extracted_dir>/<chunk_key>.json``.

    The payload has a fixed header (carrying ``source_id`` and ``chunk_key``)
    plus the given ``activities`` list. The directory is created if missing.
    """
    extracted_dir.mkdir(parents=True, exist_ok=True)
    payload = {
        "header": {
            "source_hash": "hash1234deadbeef",
            "schema_version": "1.0",
            "prompt_version": "1.0",
            "chunk_range": "pages 1-20",
            "source_id": source_id,
            "chunk_key": chunk_key,
        },
        "activities": activities,
    }
    (extracted_dir / f"{chunk_key}.json").write_text(
        json.dumps(payload, ensure_ascii=False), encoding="utf-8"
    )


def _write_chunk(chunks_dir: Path, source_id: str, chunk_key: str, text: str) -> None:
    """Write ``text`` to ``<chunks_dir>/<source_id>/<chunk_key>.txt``.

    The per-source directory is created if missing.
    """
    d = chunks_dir / source_id
    d.mkdir(parents=True, exist_ok=True)
    (d / f"{chunk_key}.txt").write_text(text, encoding="utf-8")


# --------------------------------------------------------------------------
# step 3 — category normalization
# --------------------------------------------------------------------------

def test_category_alias_mapped_to_slug():
    """The alias ``teambuilding`` is normalized to the ``team-building`` slug."""
    act = bd.dict_to_activity(_ext_activity(category="teambuilding"), "s.txt")
    assert act.category == "team-building"


def test_unknown_category_falls_back_to_altele():
    """An unrecognized category string becomes ``altele``."""
    act = bd.dict_to_activity(_ext_activity(category="zzz-not-a-category"), "s.txt")
    assert act.category == "altele"


def test_content_type_normalized():
    """The content type ``games`` is normalized to ``joc``."""
    act = bd.dict_to_activity(_ext_activity(content_type="games"), "s.txt")
    assert act.content_type == "joc"


# --------------------------------------------------------------------------
# step 4 — dedup, three bands
# --------------------------------------------------------------------------

def test_dedup_auto_merge_identical_descriptions():
    """>= 85 similar -> a single merged row."""
    a = _activity(description="copiii formeaza echipe si traverseaza terenul")
    b = _activity(description="copiii formeaza echipe si traverseaza terenul")
    out, stats = bd.dedup_activities([a, b])
    assert len(out) == 1
    assert stats["auto_merged"] == 1
    assert out[0].needs_review == 0


def test_dedup_borderline_keeps_both_and_flags_needs_review():
    """60-85 similar -> both kept, both flagged needs_review."""
    from rapidfuzz import fuzz

    d1 = "alpha beta gamma delta epsilon"
    d2 = "alpha beta gamma delta epsilon zeta eta theta iota"
    # Guard the fixture itself: if the scorer changes, fail loudly here rather
    # than with a confusing assertion about the dedup output.
    score = fuzz.token_sort_ratio(d1, d2)
    assert 60.0 <= score < 85.0, f"precondition: score={score} not borderline"

    a = _activity(description=d1)
    b = _activity(description=d2)
    out, stats = bd.dedup_activities([a, b])
    assert len(out) == 2
    assert stats["borderline"] == 2
    assert all(act.needs_review == 1 for act in out)


def test_dedup_low_similarity_kept_as_separate_variants():
    """< 60 similar -> separate variants, no needs_review."""
    from rapidfuzz import fuzz

    d1 = "alpha beta gamma delta epsilon"
    d2 = "quebec romeo sierra tango uniform victor whiskey"
    # Fixture precondition: the two descriptions must score below the band.
    assert fuzz.token_sort_ratio(d1, d2) < 60.0

    a = _activity(description=d1)
    b = _activity(description=d2)
    out, stats = bd.dedup_activities([a, b])
    assert len(out) == 2
    assert stats["auto_merged"] == 0
    assert all(act.needs_review == 0 for act in out)


def test_dedup_never_merges_across_languages():
    """Same name + same description but EN vs RO -> two distinct rows."""
    desc = "children form teams and cross the field"
    ro = _activity(name="Cursa", description=desc, language="ro")
    en = _activity(name="Cursa", description=desc, language="en")
    out, stats = bd.dedup_activities([ro, en])
    assert len(out) == 2
    assert stats["auto_merged"] == 0
    langs = {a.language for a in out}
    assert langs == {"ro", "en"}


def test_merge_combines_fields():
    """On merge: longest description/rules, union materials, accumulated sources."""
    desc = "copiii formeaza echipe si traverseaza terenul cu obstacole"
    a = _activity(
        description=desc,
        rules="regula scurta",
        materials_list="franghie, esarfa",
        source_file="a.txt",
        keywords="echipa",
    )
    b = _activity(
        description=desc,
        rules="o regula mult mai lunga si mai detaliata pentru joc",
        materials_list="busola, esarfa",
        source_file="b.txt",
        keywords="cooperare",
    )
    out, _ = bd.dedup_activities([a, b])
    assert len(out) == 1

    merged = out[0]
    assert merged.rules == "o regula mult mai lunga si mai detaliata pentru joc"
    # Compare as sets: only membership of the comma-separated lists is checked,
    # not their order.
    mats = {m.strip() for m in merged.materials_list.split(",")}
    assert mats == {"franghie", "esarfa", "busola"}
    assert set(merged.source_files) == {"a.txt", "b.txt"}
    assert merged.popularity_score == 1
    assert {k.strip() for k in merged.keywords.split(",")} == {"echipa", "cooperare"}


# --------------------------------------------------------------------------
# step 5 — review decisions
# --------------------------------------------------------------------------

def test_review_decision_drop_removes_row():
    """A ``drop`` decision keyed by the activity's content key removes it."""
    from import_common import content_key, normalize_name

    a = _activity(description="o descriere de test")
    key = content_key(normalize_name(a.name), a.language, a.description)
    kept, stats = bd.apply_review_decisions([a], {key: {"decision": "drop"}})
    assert kept == []
    assert stats["dropped"] == 1


def test_review_decision_keep_separate_clears_needs_review():
    """A ``keep-separate`` decision keeps the row and resets ``needs_review``."""
    from import_common import content_key, normalize_name

    a = _activity(description="o descriere de test")
    a.needs_review = 1
    key = content_key(normalize_name(a.name), a.language, a.description)
    kept, stats = bd.apply_review_decisions([a], {key: {"decision": "keep-separate"}})
    assert len(kept) == 1 and kept[0].needs_review == 0
    assert stats["resolved"] == 1


# --------------------------------------------------------------------------
# step 2b — source_excerpt hallucination check
# --------------------------------------------------------------------------

def test_hallucinated_excerpt_activity_dropped(tmp_path):
    """An activity whose excerpt is absent from the chunk text is dropped.

    Two activities are written for one chunk; only the one whose
    ``source_excerpt`` occurs in the chunk file is expected to survive.
    """
    extracted = tmp_path / "extracted"
    chunks = tmp_path / "chunks"
    sources = tmp_path / "sources"

    good = _ext_activity(
        name="Joc real", source_excerpt="textul real apare in bucata sursa"
    )
    bad = _ext_activity(
        name="Joc inventat",
        source_excerpt="acest citat nu exista nicaieri in sursa originala xyzzy",
    )
    _write_extraction(extracted, "src01.part01", [good, bad])
    _write_chunk(
        chunks,
        "src01",
        "src01.part01",
        "--- PAGE 1 ---\ntextul real apare in bucata sursa pentru jocul real.\n",
    )

    from import_common import load_schema

    schema = load_schema()
    res = bd.collect_activities(extracted, chunks, sources, schema)
    names = {a.name for a in res["activities"]}
    assert names == {"Joc real"}
    assert res["activities_hallucinated"] == 1
    assert (extracted / "_rejected").exists()


def test_schema_invalid_file_moved_to_rejected(tmp_path):
    """A schema-invalid extraction file is moved under ``_rejected``.

    Also expects a sibling ``bad.errors.txt`` file next to the moved JSON.
    """
    extracted = tmp_path / "extracted"
    chunks = tmp_path / "chunks"
    sources = tmp_path / "sources"
    extracted.mkdir(parents=True)
    # missing required header keys + bad activity
    (extracted / "bad.json").write_text(
        json.dumps({"header": {}, "activities": [{"name": "x"}]}),
        encoding="utf-8",
    )

    from import_common import load_schema

    res = bd.collect_activities(extracted, chunks, sources, load_schema())
    assert res["files_rejected_schema"] == 1
    assert not (extracted / "bad.json").exists()
    assert (extracted / "_rejected" / "bad.json").exists()
    assert (extracted / "_rejected" / "bad.errors.txt").exists()


# --------------------------------------------------------------------------
# end-to-end rebuild + atomic swap
# --------------------------------------------------------------------------

def _setup_corpus(tmp_path: Path):
    """Create a one-activity corpus under ``tmp_path``.

    Writes one extraction file and one chunk file whose text contains the
    activity's ``source_excerpt``. Returns ``(extracted, chunks, sources)``;
    the ``sources`` path is returned but not created here.
    """
    extracted = tmp_path / "extracted"
    chunks = tmp_path / "chunks"
    sources = tmp_path / "sources"
    excerpt = "jocul testului este o activitate de echipa"
    _write_extraction(
        extracted,
        "src01.part01",
        [_ext_activity(source_excerpt=excerpt)],
    )
    _write_chunk(chunks, "src01", "src01.part01", f"--- PAGE 1 ---\n{excerpt} in aer liber.\n")
    return extracted, chunks, sources


def test_rebuild_creates_database(tmp_path):
    """A rebuild from the one-activity corpus produces a DB with one row."""
    extracted, chunks, sources = _setup_corpus(tmp_path)
    db_path = tmp_path / "activities.db"
    report = bd.rebuild(
        extracted_dir=extracted,
        chunks_dir=chunks,
        sources_dir=sources,
        db_path=db_path,
    )
    assert db_path.exists()
    assert report["final_count"] == 1

    db = DatabaseManager(str(db_path))
    rows = db.search_activities()
    assert len(rows) == 1
    assert rows[0]["category"] == "team-building"


def test_atomic_swap_keeps_live_db_intact_on_crash(tmp_path, monkeypatch):
    """A mid-build crash must leave the live DB byte-identical."""
    extracted, chunks, sources = _setup_corpus(tmp_path)
    db_path = tmp_path / "activities.db"

    # a pre-existing live DB with sentinel content
    live = DatabaseManager(str(db_path))
    live.insert_activity(_activity(name="Sentinel viu"))
    before = db_path.read_bytes()

    def boom(self, *args, **kwargs):
        raise RuntimeError("simulated mid-build crash")

    # Force the bulk insert used during the rebuild to fail.
    monkeypatch.setattr(DatabaseManager, "bulk_insert_activities", boom)

    with pytest.raises(RuntimeError, match="simulated mid-build crash"):
        bd.rebuild(
            extracted_dir=extracted,
            chunks_dir=chunks,
            sources_dir=sources,
            db_path=db_path,
        )

    # live DB untouched, tmp cleaned up
    assert db_path.read_bytes() == before
    assert not (tmp_path / "activities.db.tmp").exists()


def test_rebuild_backs_up_live_db(tmp_path):
    """Rebuilding over an existing DB reports an ``activities.db.bak`` backup."""
    extracted, chunks, sources = _setup_corpus(tmp_path)
    db_path = tmp_path / "activities.db"
    DatabaseManager(str(db_path)).insert_activity(_activity(name="Vechi"))

    report = bd.rebuild(
        extracted_dir=extracted,
        chunks_dir=chunks,
        sources_dir=sources,
        db_path=db_path,
    )
    assert report["backup"] is not None
    assert Path(report["backup"]).exists()
    assert os.path.basename(report["backup"]) == "activities.db.bak"