Files
game-library/tests/test_build_database.py
Claude Agent 66ae831c36 Rebuild extraction pipeline infrastructure (Faza 0 prep)
Implements the approved plan to replace the broken regex/index-master
extraction with an LLM-subagent pipeline. Four parallel lanes:

Lane A — scripts/extract_common.py (PDF/docx/doc/pptx/html/zip, no
  max_pages truncation), normalize_sources.py, chunk_sources.py
  (~20pg chunks + overlap, manifest registry), activity_schema.json.
Lane B — app/config_taxonomy.py (16 fixed category slugs), schema
  rebuilt from scratch in app/models/ with content_type, language,
  source_files, source_excerpt, normalized_name, extraction_confidence,
  needs_review; FTS5 + 3 triggers extended with materials_list and
  skills_developed.
Lane C — build_database.py (--rebuild, atomic swap, schema + fuzzy
  source_excerpt validation, dedup with needs_review band),
  validate_extractions.py, review_queue.py, new run_extraction.py
  orchestrator, SUBAGENT_PROMPT.md.
Lane D — search.py content_type/language filters (default search
  excludes non-game content), E7 schema-compat audit; fixed a NULL
  keywords AttributeError in _boost_search_relevance.

Removes 8 orphaned/dead scripts and app/services/parser.py +
indexer.py. Adds tests/ (70 passing, 1 skipped — libreoffice absent).

Note: Lane D made one additive edit to app/models/database.py
(_update_category_counts) to surface content_type/language in
get_filter_options, outside its nominal lane boundary but after
Lane B completed.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-19 17:43:38 +00:00

335 lines
12 KiB
Python

# -*- coding: utf-8 -*-
"""
Tests for scripts/build_database.py — the import / dedup / swap side.
Covers: category -> slug + `altele` fallback; dedup across all three threshold
bands; EN != RO never merged; field combination on merge; atomic swap with a
simulated mid-build crash; the source_excerpt substring check.
"""
import json
import os
import sys
from pathlib import Path
import pytest
# Put the repository root and its scripts/ folder at the front of sys.path so
# the imports that follow in this module can be resolved.
REPO_ROOT = Path(__file__).resolve().parent.parent
SCRIPTS_DIR = REPO_ROOT.joinpath("scripts")
for _p in map(str, (REPO_ROOT, SCRIPTS_DIR)):
    if _p in sys.path:
        # Already importable; leave the existing position untouched.
        continue
    sys.path.insert(0, _p)
import build_database as bd # noqa: E402
from app.models.activity import Activity # noqa: E402
from app.models.database import DatabaseManager # noqa: E402
# --------------------------------------------------------------------------
# helpers
# --------------------------------------------------------------------------
def _activity(**over):
    """Build an ``Activity`` from default test values.

    Keyword arguments in ``over`` replace or extend the defaults before the
    model is constructed.
    """
    defaults = {
        "name": "Jocul testului",
        "description": "O activitate de echipa in aer liber.",
        "category": "team-building",
        "content_type": "joc",
        "language": "ro",
        "extraction_confidence": "high",
    }
    return Activity(**{**defaults, **over})
def _ext_activity(**over):
    """Return an extraction-JSON activity dict built from test defaults.

    Keyword arguments in ``over`` replace or extend the default fields; a new
    dict is returned on every call.
    """
    defaults = {
        "name": "Jocul testului",
        "description": "O activitate de echipa in aer liber.",
        "category": "team-building",
        "content_type": "joc",
        "language": "ro",
        "extraction_confidence": "high",
        "source_excerpt": "ANCHOR-EXCERPT despre jocul testului",
        "page_reference": "page 1",
    }
    return {**defaults, **over}
def _write_extraction(extracted_dir, chunk_key, activities, source_id="src01"):
    """Write one extraction file ``<chunk_key>.json`` into ``extracted_dir``.

    The directory is created if missing. The file holds a fixed test header
    (carrying ``source_id`` and ``chunk_key``) plus the given ``activities``
    list, serialized as UTF-8 JSON without ASCII escaping.
    """
    extracted_dir.mkdir(parents=True, exist_ok=True)
    header = {
        "source_hash": "hash1234deadbeef",
        "schema_version": "1.0",
        "prompt_version": "1.0",
        "chunk_range": "pages 1-20",
        "source_id": source_id,
        "chunk_key": chunk_key,
    }
    document = json.dumps(
        {"header": header, "activities": activities}, ensure_ascii=False
    )
    target = extracted_dir / f"{chunk_key}.json"
    target.write_text(document, encoding="utf-8")
def _write_chunk(chunks_dir, source_id, chunk_key, text):
    """Write ``text`` as UTF-8 to ``<chunks_dir>/<source_id>/<chunk_key>.txt``.

    The per-source directory is created when it does not exist yet.
    """
    source_dir = chunks_dir.joinpath(source_id)
    source_dir.mkdir(parents=True, exist_ok=True)
    source_dir.joinpath(f"{chunk_key}.txt").write_text(text, encoding="utf-8")
# --------------------------------------------------------------------------
# step 3 — category normalization
# --------------------------------------------------------------------------
def test_category_alias_mapped_to_slug():
    """The category alias ``teambuilding`` comes back as ``team-building``."""
    raw = _ext_activity(category="teambuilding")
    converted = bd.dict_to_activity(raw, "s.txt")
    assert converted.category == "team-building"
def test_unknown_category_falls_back_to_altele():
    """An unrecognized category string is replaced by the ``altele`` slug."""
    raw = _ext_activity(category="zzz-not-a-category")
    converted = bd.dict_to_activity(raw, "s.txt")
    assert converted.category == "altele"
def test_content_type_normalized():
    """The content type ``games`` is converted to ``joc``."""
    raw = _ext_activity(content_type="games")
    converted = bd.dict_to_activity(raw, "s.txt")
    assert converted.content_type == "joc"
# --------------------------------------------------------------------------
# step 4 — dedup, three bands
# --------------------------------------------------------------------------
def test_dedup_auto_merge_identical_descriptions():
    """Identical descriptions (>= 85 similar) collapse into one merged row."""
    shared = "copiii formeaza echipe si traverseaza terenul"
    pair = [_activity(description=shared), _activity(description=shared)]

    merged, stats = bd.dedup_activities(pair)

    assert len(merged) == 1
    assert stats["auto_merged"] == 1
    # A clean auto-merge must not be queued for manual review.
    assert merged[0].needs_review == 0
def test_dedup_borderline_keeps_both_and_flags_needs_review():
    """Descriptions scoring in [60, 85) are both kept and both flagged."""
    from rapidfuzz import fuzz

    shorter = "alpha beta gamma delta epsilon"
    longer = "alpha beta gamma delta epsilon zeta eta theta iota"
    # Guard the fixture itself: the pair must really score in the middle band.
    score = fuzz.token_sort_ratio(shorter, longer)
    assert 60.0 <= score < 85.0, f"precondition: score={score} not borderline"

    candidates = [_activity(description=shorter), _activity(description=longer)]
    kept, stats = bd.dedup_activities(candidates)

    assert len(kept) == 2
    assert stats["borderline"] == 2
    for item in kept:
        assert item.needs_review == 1
def test_dedup_low_similarity_kept_as_separate_variants():
    """Descriptions scoring below 60 stay separate and are not flagged."""
    from rapidfuzz import fuzz

    first_text = "alpha beta gamma delta epsilon"
    second_text = "quebec romeo sierra tango uniform victor whiskey"
    # Guard the fixture itself: the pair must really be dissimilar.
    assert fuzz.token_sort_ratio(first_text, second_text) < 60.0

    candidates = [
        _activity(description=first_text),
        _activity(description=second_text),
    ]
    kept, stats = bd.dedup_activities(candidates)

    assert len(kept) == 2
    assert stats["auto_merged"] == 0
    for item in kept:
        assert item.needs_review == 0
def test_dedup_never_merges_across_languages():
    """Same name and description in RO and EN remain two distinct rows."""
    desc = "children form teams and cross the field"
    ro, en = (
        _activity(name="Cursa", description=desc, language=code)
        for code in ("ro", "en")
    )

    kept, stats = bd.dedup_activities([ro, en])

    assert len(kept) == 2
    assert stats["auto_merged"] == 0
    assert {row.language for row in kept} == {"ro", "en"}
def test_merge_combines_fields():
    """A merge keeps the longer rules, unions list fields and collects sources."""

    def _csv_set(value):
        # Split a comma-separated field into a set of trimmed entries.
        return {part.strip() for part in value.split(",")}

    desc = "copiii formeaza echipe si traverseaza terenul cu obstacole"
    long_rules = "o regula mult mai lunga si mai detaliata pentru joc"
    first = _activity(
        description=desc,
        rules="regula scurta",
        materials_list="franghie, esarfa",
        source_file="a.txt",
        keywords="echipa",
    )
    second = _activity(
        description=desc,
        rules=long_rules,
        materials_list="busola, esarfa",
        source_file="b.txt",
        keywords="cooperare",
    )

    rows, _ = bd.dedup_activities([first, second])

    assert len(rows) == 1
    merged = rows[0]
    assert merged.rules == long_rules
    assert _csv_set(merged.materials_list) == {"franghie", "esarfa", "busola"}
    assert set(merged.source_files) == {"a.txt", "b.txt"}
    assert merged.popularity_score == 1
    assert _csv_set(merged.keywords) == {"echipa", "cooperare"}
# --------------------------------------------------------------------------
# step 5 — review decisions
# --------------------------------------------------------------------------
def test_review_decision_drop_removes_row():
    """A ``drop`` decision keyed on the activity removes it from the result."""
    from import_common import content_key, normalize_name

    target = _activity(description="o descriere de test")
    key = content_key(
        normalize_name(target.name), target.language, target.description
    )
    decisions = {key: {"decision": "drop"}}

    kept, stats = bd.apply_review_decisions([target], decisions)

    assert kept == []
    assert stats["dropped"] == 1
def test_review_decision_keep_separate_clears_needs_review():
    """A ``keep-separate`` decision keeps the row and resets ``needs_review``."""
    from import_common import content_key, normalize_name

    target = _activity(description="o descriere de test")
    target.needs_review = 1
    key = content_key(
        normalize_name(target.name), target.language, target.description
    )
    decisions = {key: {"decision": "keep-separate"}}

    kept, stats = bd.apply_review_decisions([target], decisions)

    assert len(kept) == 1
    assert kept[0].needs_review == 0
    assert stats["resolved"] == 1
# --------------------------------------------------------------------------
# step 2b — source_excerpt hallucination check
# --------------------------------------------------------------------------
def test_hallucinated_excerpt_activity_dropped(tmp_path):
    """An activity whose excerpt is absent from its chunk text is discarded.

    Two activities are written for the same chunk; only the one whose
    ``source_excerpt`` occurs in the chunk file is expected to be collected.
    """
    extracted, chunks, sources = (
        tmp_path / sub for sub in ("extracted", "chunks", "sources")
    )
    real = _ext_activity(
        name="Joc real",
        source_excerpt="textul real apare in bucata sursa",
    )
    invented = _ext_activity(
        name="Joc inventat",
        source_excerpt="acest citat nu exista nicaieri in sursa originala xyzzy",
    )
    _write_extraction(extracted, "src01.part01", [real, invented])
    chunk_text = (
        "--- PAGE 1 ---\ntextul real apare in bucata sursa pentru jocul real.\n"
    )
    _write_chunk(chunks, "src01", "src01.part01", chunk_text)

    from import_common import load_schema

    schema = load_schema()
    result = bd.collect_activities(extracted, chunks, sources, schema)

    assert {item.name for item in result["activities"]} == {"Joc real"}
    assert result["activities_hallucinated"] == 1
    assert (extracted / "_rejected").exists()
def test_schema_invalid_file_moved_to_rejected(tmp_path):
    """A schema-invalid file ends up in ``_rejected`` with an errors file."""
    extracted, chunks, sources = (
        tmp_path / sub for sub in ("extracted", "chunks", "sources")
    )
    extracted.mkdir(parents=True)
    # Empty header (required keys missing) together with a bare-bones activity.
    broken = {"header": {}, "activities": [{"name": "x"}]}
    (extracted / "bad.json").write_text(json.dumps(broken), encoding="utf-8")

    from import_common import load_schema

    result = bd.collect_activities(extracted, chunks, sources, load_schema())

    rejected_dir = extracted / "_rejected"
    assert result["files_rejected_schema"] == 1
    assert not (extracted / "bad.json").exists()
    assert (rejected_dir / "bad.json").exists()
    assert (rejected_dir / "bad.errors.txt").exists()
# --------------------------------------------------------------------------
# end-to-end rebuild + atomic swap
# --------------------------------------------------------------------------
def _setup_corpus(tmp_path):
    """Create a one-activity corpus under ``tmp_path``.

    Writes a single extraction file and the matching chunk text (which
    contains the activity's excerpt), then returns the
    ``(extracted, chunks, sources)`` directory paths. The ``sources`` path is
    returned without being created.
    """
    extracted, chunks, sources = (
        tmp_path / sub for sub in ("extracted", "chunks", "sources")
    )
    excerpt = "jocul testului este o activitate de echipa"
    activity = _ext_activity(source_excerpt=excerpt)
    _write_extraction(extracted, "src01.part01", [activity])
    chunk_text = f"--- PAGE 1 ---\n{excerpt} in aer liber.\n"
    _write_chunk(chunks, "src01", "src01.part01", chunk_text)
    return extracted, chunks, sources
def test_rebuild_creates_database(tmp_path):
    """Rebuilding from the minimal corpus yields a DB file with one activity."""
    extracted, chunks, sources = _setup_corpus(tmp_path)
    target = tmp_path / "activities.db"

    report = bd.rebuild(
        extracted_dir=extracted,
        chunks_dir=chunks,
        sources_dir=sources,
        db_path=target,
    )

    assert target.exists()
    assert report["final_count"] == 1
    # Re-open the produced file and read the activity back through a search.
    manager = DatabaseManager(str(target))
    found = manager.search_activities()
    assert len(found) == 1
    assert found[0]["category"] == "team-building"
def test_atomic_swap_keeps_live_db_intact_on_crash(tmp_path, monkeypatch):
    """A failure during the build leaves the existing DB file byte-identical."""
    extracted, chunks, sources = _setup_corpus(tmp_path)
    db_path = tmp_path / "activities.db"

    # Seed a live database with a sentinel row and snapshot its bytes.
    live = DatabaseManager(str(db_path))
    live.insert_activity(_activity(name="Sentinel viu"))
    snapshot = db_path.read_bytes()

    def _explode(self, *args, **kwargs):
        raise RuntimeError("simulated mid-build crash")

    # Make the bulk insert fail so the rebuild aborts part-way through.
    monkeypatch.setattr(DatabaseManager, "bulk_insert_activities", _explode)

    with pytest.raises(RuntimeError, match="simulated mid-build crash"):
        bd.rebuild(
            extracted_dir=extracted,
            chunks_dir=chunks,
            sources_dir=sources,
            db_path=db_path,
        )

    # The live file is unchanged and no temporary build file is left behind.
    assert db_path.read_bytes() == snapshot
    assert not (tmp_path / "activities.db.tmp").exists()
def test_rebuild_backs_up_live_db(tmp_path):
    """Rebuilding over an existing DB reports an ``activities.db.bak`` backup."""
    extracted, chunks, sources = _setup_corpus(tmp_path)
    db_path = tmp_path / "activities.db"
    # Create a pre-existing database so the rebuild has something to back up.
    DatabaseManager(str(db_path)).insert_activity(_activity(name="Vechi"))

    report = bd.rebuild(
        extracted_dir=extracted,
        chunks_dir=chunks,
        sources_dir=sources,
        db_path=db_path,
    )

    assert report["backup"] is not None
    assert Path(report["backup"]).exists()
    _, backup_name = os.path.split(report["backup"])
    assert backup_name == "activities.db.bak"