Implements the approved plan to replace the broken regex/index-master extraction with an LLM-subagent pipeline. Four parallel lanes: Lane A — scripts/extract_common.py (PDF/docx/doc/pptx/html/zip, no max_pages truncation), normalize_sources.py, chunk_sources.py (~20pg chunks + overlap, manifest registry), activity_schema.json. Lane B — app/config_taxonomy.py (16 fixed category slugs), schema rebuilt from scratch in app/models/ with content_type, language, source_files, source_excerpt, normalized_name, extraction_confidence, needs_review; FTS5 + 3 triggers extended with materials_list and skills_developed. Lane C — build_database.py (--rebuild, atomic swap, schema + fuzzy source_excerpt validation, dedup with needs_review band), validate_extractions.py, review_queue.py, new run_extraction.py orchestrator, SUBAGENT_PROMPT.md. Lane D — search.py content_type/language filters (default search excludes non-game content), E7 schema-compat audit; fixed a NULL keywords AttributeError in _boost_search_relevance. Removes 8 orphaned/dead scripts and app/services/parser.py + indexer.py. Adds tests/ (70 passing, 1 skipped — libreoffice absent). Note: Lane D made one additive edit to app/models/database.py (_update_category_counts) to surface content_type/language in get_filter_options, outside its nominal lane boundary but after Lane B completed. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
335 lines
12 KiB
Python
335 lines
12 KiB
Python
# -*- coding: utf-8 -*-
|
|
"""
|
|
Tests for scripts/build_database.py — the import / dedup / swap side.
|
|
|
|
Covers: category -> slug + `altele` fallback; dedup across all three threshold
|
|
bands; EN != RO never merged; field combination on merge; atomic swap with a
|
|
simulated mid-build crash; the source_excerpt substring check.
|
|
"""
|
|
|
|
import json
|
|
import os
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
import pytest
|
|
|
|
# Resolve the repository root (two levels above this file) and its
# ``scripts`` directory, then put both at the front of ``sys.path`` so the
# imports that follow can find ``build_database`` and the ``app`` package.
REPO_ROOT = Path(__file__).resolve().parent.parent
SCRIPTS_DIR = REPO_ROOT.joinpath("scripts")
for _p in map(str, (REPO_ROOT, SCRIPTS_DIR)):
    if _p in sys.path:
        # Already importable from here; do not add a duplicate entry.
        continue
    sys.path.insert(0, _p)
|
|
|
|
import build_database as bd # noqa: E402
|
|
from app.models.activity import Activity # noqa: E402
|
|
from app.models.database import DatabaseManager # noqa: E402
|
|
|
|
|
|
# --------------------------------------------------------------------------
|
|
# helpers
|
|
# --------------------------------------------------------------------------
|
|
def _activity(**over):
    """Build an ``Activity`` from default test fields.

    Any keyword argument in ``over`` replaces the default with the same name
    or is passed through as an additional field.
    """
    fields = {
        "name": "Jocul testului",
        "description": "O activitate de echipa in aer liber.",
        "category": "team-building",
        "content_type": "joc",
        "language": "ro",
        "extraction_confidence": "high",
        **over,
    }
    return Activity(**fields)
|
|
|
|
|
|
def _ext_activity(**over):
    """Return a schema-valid extraction-JSON activity object as a dict.

    Keyword arguments in ``over`` override the defaults or add extra keys.
    """
    return {
        "name": "Jocul testului",
        "description": "O activitate de echipa in aer liber.",
        "category": "team-building",
        "content_type": "joc",
        "language": "ro",
        "extraction_confidence": "high",
        "source_excerpt": "ANCHOR-EXCERPT despre jocul testului",
        "page_reference": "page 1",
        **over,
    }
|
|
|
|
|
|
def _write_extraction(extracted_dir, chunk_key, activities, source_id="src01"):
    """Write an extraction JSON file named ``<chunk_key>.json``.

    ``extracted_dir`` is created if it does not exist. The file holds a
    fixed header (with ``source_id`` and ``chunk_key`` filled in) followed by
    the given ``activities`` list, serialized as UTF-8 without ASCII escaping.
    """
    header = {
        "source_hash": "hash1234deadbeef",
        "schema_version": "1.0",
        "prompt_version": "1.0",
        "chunk_range": "pages 1-20",
        "source_id": source_id,
        "chunk_key": chunk_key,
    }
    document = {"header": header, "activities": activities}

    extracted_dir.mkdir(parents=True, exist_ok=True)
    target = extracted_dir / f"{chunk_key}.json"
    serialized = json.dumps(document, ensure_ascii=False)
    target.write_text(serialized, encoding="utf-8")
|
|
|
|
|
|
def _write_chunk(chunks_dir, source_id, chunk_key, text):
    """Write ``text`` to ``<chunks_dir>/<source_id>/<chunk_key>.txt`` as UTF-8.

    The per-source directory is created if it does not exist.
    """
    source_dir = chunks_dir.joinpath(source_id)
    source_dir.mkdir(parents=True, exist_ok=True)
    source_dir.joinpath(f"{chunk_key}.txt").write_text(text, encoding="utf-8")
|
|
|
|
|
|
# --------------------------------------------------------------------------
|
|
# step 3 — category normalization
|
|
# --------------------------------------------------------------------------
|
|
def test_category_alias_mapped_to_slug():
    """The ``teambuilding`` alias must come out as the ``team-building`` slug."""
    raw = _ext_activity(category="teambuilding")
    converted = bd.dict_to_activity(raw, "s.txt")
    assert converted.category == "team-building"
|
|
|
|
|
|
def test_unknown_category_falls_back_to_altele():
    """An unrecognized category string must be stored as ``altele``."""
    raw = _ext_activity(category="zzz-not-a-category")
    converted = bd.dict_to_activity(raw, "s.txt")
    assert converted.category == "altele"
|
|
|
|
|
|
def test_content_type_normalized():
    """The ``games`` content type must be normalized to ``joc``."""
    raw = _ext_activity(content_type="games")
    converted = bd.dict_to_activity(raw, "s.txt")
    assert converted.content_type == "joc"
|
|
|
|
|
|
# --------------------------------------------------------------------------
|
|
# step 4 — dedup, three bands
|
|
# --------------------------------------------------------------------------
|
|
def test_dedup_auto_merge_identical_descriptions():
    """>= 85 similar -> a single merged row."""
    shared = "copiii formeaza echipe si traverseaza terenul"
    duplicates = [
        _activity(description=shared),
        _activity(description=shared),
    ]
    out, stats = bd.dedup_activities(duplicates)
    assert len(out) == 1
    assert stats["auto_merged"] == 1
    # An automatic merge must not flag the surviving row for review.
    assert out[0].needs_review == 0
|
|
|
|
|
|
def test_dedup_borderline_keeps_both_and_flags_needs_review():
    """60-85 similar -> both kept, both flagged needs_review."""
    from rapidfuzz import fuzz

    shorter = "alpha beta gamma delta epsilon"
    longer = "alpha beta gamma delta epsilon zeta eta theta iota"

    # Check the fixture itself first: if the two descriptions do not score
    # in the borderline band, the assertions below would test nothing.
    score = fuzz.token_sort_ratio(shorter, longer)
    assert 60.0 <= score < 85.0, f"precondition: score={score} not borderline"

    candidates = [
        _activity(description=shorter),
        _activity(description=longer),
    ]
    out, stats = bd.dedup_activities(candidates)
    assert len(out) == 2
    assert stats["borderline"] == 2
    assert all(act.needs_review == 1 for act in out)
|
|
|
|
|
|
def test_dedup_low_similarity_kept_as_separate_variants():
    """< 60 similar -> separate variants, no needs_review."""
    from rapidfuzz import fuzz

    first = "alpha beta gamma delta epsilon"
    second = "quebec romeo sierra tango uniform victor whiskey"
    # Precondition on the fixture: the pair must score below the low band.
    assert fuzz.token_sort_ratio(first, second) < 60.0

    candidates = [
        _activity(description=first),
        _activity(description=second),
    ]
    out, stats = bd.dedup_activities(candidates)
    assert len(out) == 2
    assert stats["auto_merged"] == 0
    assert all(act.needs_review == 0 for act in out)
|
|
|
|
|
|
def test_dedup_never_merges_across_languages():
    """Same name + same description but EN vs RO -> two distinct rows."""
    shared = {
        "name": "Cursa",
        "description": "children form teams and cross the field",
    }
    # Identical in everything except the language code.
    variants = [_activity(language=code, **shared) for code in ("ro", "en")]
    out, stats = bd.dedup_activities(variants)
    assert len(out) == 2
    assert stats["auto_merged"] == 0
    assert {a.language for a in out} == {"ro", "en"}
|
|
|
|
|
|
def test_merge_combines_fields():
    """On merge: longest description/rules, union materials, accumulated sources."""
    desc = "copiii formeaza echipe si traverseaza terenul cu obstacole"
    short_rules = "regula scurta"
    long_rules = "o regula mult mai lunga si mai detaliata pentru joc"

    first = _activity(
        description=desc,
        rules=short_rules,
        materials_list="franghie, esarfa",
        source_file="a.txt",
        keywords="echipa",
    )
    second = _activity(
        description=desc,
        rules=long_rules,
        materials_list="busola, esarfa",
        source_file="b.txt",
        keywords="cooperare",
    )

    out, _ = bd.dedup_activities([first, second])
    assert len(out) == 1
    merged = out[0]

    assert merged.rules == long_rules
    # Comma-separated fields are compared as sets, so item order and
    # surrounding whitespace do not matter.
    materials = {item.strip() for item in merged.materials_list.split(",")}
    assert materials == {"franghie", "esarfa", "busola"}
    assert set(merged.source_files) == {"a.txt", "b.txt"}
    assert merged.popularity_score == 1
    keywords = {item.strip() for item in merged.keywords.split(",")}
    assert keywords == {"echipa", "cooperare"}
|
|
|
|
|
|
# --------------------------------------------------------------------------
|
|
# step 5 — review decisions
|
|
# --------------------------------------------------------------------------
|
|
def test_review_decision_drop_removes_row():
    """A ``drop`` decision keyed to the activity removes it and counts it."""
    from import_common import content_key, normalize_name

    activity = _activity(description="o descriere de test")
    key = content_key(
        normalize_name(activity.name), activity.language, activity.description
    )
    decisions = {key: {"decision": "drop"}}
    kept, stats = bd.apply_review_decisions([activity], decisions)
    assert kept == []
    assert stats["dropped"] == 1
|
|
|
|
|
|
def test_review_decision_keep_separate_clears_needs_review():
    """A ``keep-separate`` decision keeps the row and resets ``needs_review``."""
    from import_common import content_key, normalize_name

    activity = _activity(description="o descriere de test")
    activity.needs_review = 1
    key = content_key(
        normalize_name(activity.name), activity.language, activity.description
    )
    decisions = {key: {"decision": "keep-separate"}}
    kept, stats = bd.apply_review_decisions([activity], decisions)
    assert len(kept) == 1
    assert kept[0].needs_review == 0
    assert stats["resolved"] == 1
|
|
|
|
|
|
# --------------------------------------------------------------------------
|
|
# step 2b — source_excerpt hallucination check
|
|
# --------------------------------------------------------------------------
|
|
def test_hallucinated_excerpt_activity_dropped(tmp_path):
    """Only the activity whose excerpt appears in the chunk text is collected."""
    extracted, chunks, sources = (
        tmp_path / sub for sub in ("extracted", "chunks", "sources")
    )

    real_quote = "textul real apare in bucata sursa"
    grounded = _ext_activity(name="Joc real", source_excerpt=real_quote)
    invented = _ext_activity(
        name="Joc inventat",
        source_excerpt="acest citat nu exista nicaieri in sursa originala xyzzy",
    )
    _write_extraction(extracted, "src01.part01", [grounded, invented])
    # The chunk text contains the first excerpt verbatim and nothing that
    # resembles the second one.
    chunk_text = f"--- PAGE 1 ---\n{real_quote} pentru jocul real.\n"
    _write_chunk(chunks, "src01", "src01.part01", chunk_text)

    from import_common import load_schema

    res = bd.collect_activities(extracted, chunks, sources, load_schema())
    assert {a.name for a in res["activities"]} == {"Joc real"}
    assert res["activities_hallucinated"] == 1
    assert (extracted / "_rejected").exists()
|
|
|
|
|
|
def test_schema_invalid_file_moved_to_rejected(tmp_path):
    """A schema-invalid file is moved to ``_rejected`` with an errors file."""
    extracted, chunks, sources = (
        tmp_path / sub for sub in ("extracted", "chunks", "sources")
    )
    extracted.mkdir(parents=True)

    # missing required header keys + bad activity
    invalid_payload = {"header": {}, "activities": [{"name": "x"}]}
    bad_file = extracted / "bad.json"
    bad_file.write_text(json.dumps(invalid_payload), encoding="utf-8")

    from import_common import load_schema

    res = bd.collect_activities(extracted, chunks, sources, load_schema())
    assert res["files_rejected_schema"] == 1
    assert not bad_file.exists()

    rejected_dir = extracted / "_rejected"
    assert (rejected_dir / "bad.json").exists()
    assert (rejected_dir / "bad.errors.txt").exists()
|
|
|
|
|
|
# --------------------------------------------------------------------------
|
|
# end-to-end rebuild + atomic swap
|
|
# --------------------------------------------------------------------------
|
|
def _setup_corpus(tmp_path):
    """Create a one-activity corpus under ``tmp_path``.

    Writes one extraction file and one chunk file whose text contains the
    activity's ``source_excerpt``. Returns the ``(extracted, chunks,
    sources)`` directory paths; ``sources`` is returned but not created.
    """
    extracted, chunks, sources = (
        tmp_path / sub for sub in ("extracted", "chunks", "sources")
    )
    excerpt = "jocul testului este o activitate de echipa"
    activity = _ext_activity(source_excerpt=excerpt)
    _write_extraction(extracted, "src01.part01", [activity])
    chunk_text = f"--- PAGE 1 ---\n{excerpt} in aer liber.\n"
    _write_chunk(chunks, "src01", "src01.part01", chunk_text)
    return extracted, chunks, sources
|
|
|
|
|
|
def test_rebuild_creates_database(tmp_path):
    """A rebuild from the one-activity corpus yields a searchable one-row DB."""
    extracted, chunks, sources = _setup_corpus(tmp_path)
    db_path = tmp_path / "activities.db"

    report = bd.rebuild(
        extracted_dir=extracted,
        chunks_dir=chunks,
        sources_dir=sources,
        db_path=db_path,
    )
    assert db_path.exists()
    assert report["final_count"] == 1

    # Re-open the freshly built file and check the stored row.
    rows = DatabaseManager(str(db_path)).search_activities()
    assert len(rows) == 1
    assert rows[0]["category"] == "team-building"
|
|
|
|
|
|
def test_atomic_swap_keeps_live_db_intact_on_crash(tmp_path, monkeypatch):
    """A mid-build crash must leave the live DB byte-identical."""
    extracted, chunks, sources = _setup_corpus(tmp_path)
    db_path = tmp_path / "activities.db"

    # a pre-existing live DB with sentinel content
    live = DatabaseManager(str(db_path))
    live.insert_activity(_activity(name="Sentinel viu"))
    before = db_path.read_bytes()

    def boom(self, *a, **k):
        raise RuntimeError("simulated mid-build crash")

    # Make the bulk insert fail so the rebuild aborts part-way through.
    monkeypatch.setattr(DatabaseManager, "bulk_insert_activities", boom)

    rebuild_args = {
        "extracted_dir": extracted,
        "chunks_dir": chunks,
        "sources_dir": sources,
        "db_path": db_path,
    }
    with pytest.raises(RuntimeError, match="simulated mid-build crash"):
        bd.rebuild(**rebuild_args)

    # live DB untouched, tmp cleaned up
    assert db_path.read_bytes() == before
    assert not (tmp_path / "activities.db.tmp").exists()
|
|
|
|
|
|
def test_rebuild_backs_up_live_db(tmp_path):
    """Rebuilding over an existing DB reports an ``activities.db.bak`` backup."""
    extracted, chunks, sources = _setup_corpus(tmp_path)
    db_path = tmp_path / "activities.db"

    # Seed a live database so the rebuild has something to back up.
    existing = DatabaseManager(str(db_path))
    existing.insert_activity(_activity(name="Vechi"))

    report = bd.rebuild(
        extracted_dir=extracted,
        chunks_dir=chunks,
        sources_dir=sources,
        db_path=db_path,
    )
    backup = report["backup"]
    assert backup is not None
    assert Path(backup).exists()
    assert os.path.basename(backup) == "activities.db.bak"
|