Rebuild extraction pipeline infrastructure (Faza 0 prep)

Implements the approved plan to replace the broken regex/index-master extraction with an LLM-subagent pipeline. Four parallel lanes: Lane A — scripts/extract_common.py (PDF/docx/doc/pptx/html/zip, no max_pages truncation), normalize_sources.py, chunk_sources.py (~20pg chunks + overlap, manifest registry), activity_schema.json. Lane B — app/config_taxonomy.py (16 fixed category slugs), schema rebuilt from scratch in app/models/ with content_type, language, source_files, source_excerpt, normalized_name, extraction_confidence, needs_review; FTS5 + 3 triggers extended with materials_list and skills_developed. Lane C — build_database.py (--rebuild, atomic swap, schema + fuzzy source_excerpt validation, dedup with needs_review band), validate_extractions.py, review_queue.py, new run_extraction.py orchestrator, SUBAGENT_PROMPT.md. Lane D — search.py content_type/language filters (default search excludes non-game content), E7 schema-compat audit; fixed a NULL keywords AttributeError in _boost_search_relevance. Removes 8 orphaned/dead scripts and app/services/parser.py + indexer.py. Adds tests/ (70 passing, 1 skipped — libreoffice absent). Note: Lane D made one additive edit to app/models/database.py (_update_category_counts) to surface content_type/language in get_filter_options, outside its nominal lane boundary but after Lane B completed. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-19 17:43:38 +00:00
parent e0080edf85
commit 66ae831c36
37 changed files with 4101 additions and 1881 deletions
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -0,0 +1,114 @@
+# -*- coding: utf-8 -*-
+"""
+Shared pytest fixtures for the extraction-pipeline tests.
+
+scripts/ is not a package, so it is added to sys.path here. Synthetic fixtures
+(PDF, docx, zip, HTML) are generated at runtime — no binary blobs in the repo.
+"""
+
+import sys
+import zipfile
+from pathlib import Path
+
+import pytest
+
+# Resolve the repository root from this file's location (two levels up) and
+# put scripts/ at the front of sys.path so the pipeline scripts can be
+# imported by bare module name.
+REPO_ROOT = Path(__file__).resolve().parent.parent
+SCRIPTS_DIR = REPO_ROOT / "scripts"
+if str(SCRIPTS_DIR) not in sys.path:
+    sys.path.insert(0, str(SCRIPTS_DIR))
+
+
+# --------------------------------------------------------------------------
+# synthetic PDF — deliberately large to pin the "no max_pages" regression
+# --------------------------------------------------------------------------
+@pytest.fixture
+def big_pdf(tmp_path):
+    """Build a 60-page PDF in ``tmp_path`` and return its path.
+
+    Every page carries a unique 'PDFMARK-<n>' token plus one constant line.
+    """
+    from reportlab.pdfgen import canvas
+    from reportlab.lib.pagesizes import letter
+
+    pdf_path = tmp_path / "big.pdf"
+    pdf = canvas.Canvas(str(pdf_path), pagesize=letter)
+    for page_no in range(1, 61):
+        marker = f"PDFMARK-{page_no} synthetic activity page number {page_no}"
+        pdf.drawString(72, 720, marker)
+        pdf.drawString(72, 700, "Acest joc educativ se joaca in echipa.")
+        # showPage() closes the current page so the next draw starts a new one.
+        pdf.showPage()
+    pdf.save()
+    return pdf_path
+
+
+# --------------------------------------------------------------------------
+# synthetic docx — 100 paragraphs => 3 synthetic pages at 40 paras/page
+# --------------------------------------------------------------------------
+@pytest.fixture
+def sample_docx(tmp_path):
+    """Write a 100-paragraph .docx into ``tmp_path`` and return its path."""
+    import docx
+
+    target = tmp_path / "sample.docx"
+    doc = docx.Document()
+    paragraphs = (
+        f"Paragraf {i}: continut joc team-building." for i in range(100)
+    )
+    for paragraph in paragraphs:
+        doc.add_paragraph(paragraph)
+    doc.save(str(target))
+    return target
+
+
+# --------------------------------------------------------------------------
+# synthetic HTML mirror page — with nav/script/footer chrome to strip
+# --------------------------------------------------------------------------
+# Page layout: <style>/<script> in the head, <nav>, <header> and <footer>
+# around a <main> that holds the only real content (one <h1>, two <p>).
+HTML_WITH_NAV = """<!doctype html>
+<html><head><title>Joc</title>
+<style>.x{color:red}</style>
+<script>var tracking = 1;</script>
+</head><body>
+<nav><a href="/">Home</a><a href="/games">Games</a></nav>
+<header>Site Banner Junk</header>
+<main>
+<h1>Vanatoarea de comori</h1>
+<p>Acesta este un joc real de orientare pentru cercetasi.</p>
+<p>Jucatorii cauta indicii ascunse in tabara.</p>
+</main>
+<footer>Copyright 2024 - toate drepturile rezervate</footer>
+</body></html>
+"""
+
+
+@pytest.fixture
+def html_with_nav(tmp_path):
+    """Write ``HTML_WITH_NAV`` to ``page.html`` under ``tmp_path``; return the path."""
+    html_file = tmp_path / "page.html"
+    with open(html_file, "w", encoding="utf-8") as handle:
+        handle.write(HTML_WITH_NAV)
+    return html_file
+
+
+# --------------------------------------------------------------------------
+# synthetic zip — contains a docx and a stray junk file
+# --------------------------------------------------------------------------
+@pytest.fixture
+def sample_zip(tmp_path, sample_docx):
+    """Create ``archive.zip`` holding the sample docx and a ``desktop.ini`` entry."""
+    archive_path = tmp_path / "archive.zip"
+    with zipfile.ZipFile(archive_path, "w") as archive:
+        # The docx is stored under a sub-folder name inside the archive.
+        archive.write(sample_docx, arcname="inner/sample.docx")
+        archive.writestr("desktop.ini", "junk")
+    return archive_path
+
+
+# --------------------------------------------------------------------------
+# synthetic normalized source — paginated, with an activity straddling a
+# page boundary so the chunker overlap can be verified.
+# --------------------------------------------------------------------------
+@pytest.fixture
+def paginated_source(tmp_path):
+    """Write a 50-page normalized source file and return its path.
+
+    Pages 20 and 21 hold the two halves of one activity (ACTIVITY-START /
+    ACTIVITY-END); every other page holds a single filler line.
+    """
+    def _page_text(n):
+        if n == 20:
+            return "ACTIVITY-START jocul podului care traverseaza pagina"
+        if n == 21:
+            return "continuare a jocului podului ACTIVITY-END"
+        return f"continut obisnuit pe pagina {n}"
+
+    out = ["SOURCE: synthetic/test.pdf", "CONVERTED: 2026-05-19",
+           "FORMAT: pdf", "=" * 50, ""]
+    for n in range(1, 51):
+        # marker line, page text, then a blank separator line
+        out.extend((f"--- PAGE {n} ---", _page_text(n), ""))
+    dest = tmp_path / "src_paginated.txt"
+    dest.write_text("\n".join(out), encoding="utf-8")
+    return dest
--- a/tests/fixtures/.gitkeep
+++ b/tests/fixtures/.gitkeep
@@ -0,0 +1,3 @@
+# Test fixtures (synthetic PDF/docx/zip/HTML) are generated at runtime by
+# tests/conftest.py — no binary blobs are committed. This file only preserves
+# the directory in git.
--- a/tests/test_build_database.py
+++ b/tests/test_build_database.py
@@ -0,0 +1,334 @@
+# -*- coding: utf-8 -*-
+"""
+Tests for scripts/build_database.py — the import / dedup / swap side.
+
+Covers: category -> slug + `altele` fallback; dedup across all three threshold
+bands; EN != RO never merged; field combination on merge; atomic swap with a
+simulated mid-build crash; the source_excerpt substring check.
+"""
+
+import json
+import os
+import sys
+from pathlib import Path
+
+import pytest
+
+# Put both the repo root and scripts/ at the front of sys.path (skipping any
+# entry already present) so the imports that follow can be resolved.
+REPO_ROOT = Path(__file__).resolve().parent.parent
+SCRIPTS_DIR = REPO_ROOT / "scripts"
+for _p in (str(REPO_ROOT), str(SCRIPTS_DIR)):
+    if _p not in sys.path:
+        sys.path.insert(0, _p)
+
+import build_database as bd  # noqa: E402
+from app.models.activity import Activity  # noqa: E402
+from app.models.database import DatabaseManager  # noqa: E402
+
+
+# --------------------------------------------------------------------------
+# helpers
+# --------------------------------------------------------------------------
+def _activity(**over):
+    """Build an ``Activity`` from default field values, overridden by ``over``."""
+    fields = {
+        "name": "Jocul testului",
+        "description": "O activitate de echipa in aer liber.",
+        "category": "team-building",
+        "content_type": "joc",
+        "language": "ro",
+        "extraction_confidence": "high",
+    }
+    fields.update(over)
+    return Activity(**fields)
+
+
+def _ext_activity(**over):
+    """Return a plain dict shaped like one extraction-JSON activity.
+
+    Keyword arguments replace (or add to) the default fields.
+    """
+    fields = {
+        "name": "Jocul testului",
+        "description": "O activitate de echipa in aer liber.",
+        "category": "team-building",
+        "content_type": "joc",
+        "language": "ro",
+        "extraction_confidence": "high",
+        "source_excerpt": "ANCHOR-EXCERPT despre jocul testului",
+        "page_reference": "page 1",
+    }
+    fields.update(over)
+    return fields
+
+
+def _write_extraction(extracted_dir, chunk_key, activities, source_id="src01"):
+    """Write ``<chunk_key>.json`` (fixed header + ``activities``) into ``extracted_dir``.
+
+    The directory is created if it does not exist yet.
+    """
+    extracted_dir.mkdir(parents=True, exist_ok=True)
+    header = {
+        "source_hash": "hash1234deadbeef",
+        "schema_version": "1.0",
+        "prompt_version": "1.0",
+        "chunk_range": "pages 1-20",
+        "source_id": source_id,
+        "chunk_key": chunk_key,
+    }
+    document = {"header": header, "activities": activities}
+    target = extracted_dir / f"{chunk_key}.json"
+    target.write_text(json.dumps(document, ensure_ascii=False), encoding="utf-8")
+
+
+def _write_chunk(chunks_dir, source_id, chunk_key, text):
+    """Write ``text`` to ``<chunks_dir>/<source_id>/<chunk_key>.txt``, creating the folder."""
+    source_dir = chunks_dir / source_id
+    source_dir.mkdir(parents=True, exist_ok=True)
+    chunk_file = source_dir / f"{chunk_key}.txt"
+    chunk_file.write_text(text, encoding="utf-8")
+
+
+# --------------------------------------------------------------------------
+# step 3 — category normalization
+# --------------------------------------------------------------------------
+def test_category_alias_mapped_to_slug():
+    """The category alias 'teambuilding' is imported as 'team-building'."""
+    raw = _ext_activity(category="teambuilding")
+    imported = bd.dict_to_activity(raw, "s.txt")
+    assert imported.category == "team-building"
+
+
+def test_unknown_category_falls_back_to_altele():
+    """An unrecognized category value is imported as 'altele'."""
+    raw = _ext_activity(category="zzz-not-a-category")
+    imported = bd.dict_to_activity(raw, "s.txt")
+    assert imported.category == "altele"
+
+
+def test_content_type_normalized():
+    """The content_type value 'games' is imported as 'joc'."""
+    raw = _ext_activity(content_type="games")
+    imported = bd.dict_to_activity(raw, "s.txt")
+    assert imported.content_type == "joc"
+
+
+# --------------------------------------------------------------------------
+# step 4 — dedup, three bands
+# --------------------------------------------------------------------------
+def test_dedup_auto_merge_identical_descriptions():
+    """>= 85 similar -> a single merged row."""
+    same_text = "copiii formeaza echipe si traverseaza terenul"
+    first = _activity(description=same_text)
+    second = _activity(description=same_text)
+    merged, stats = bd.dedup_activities([first, second])
+    assert len(merged) == 1
+    assert stats["auto_merged"] == 1
+    # an automatic merge must not leave the row flagged for review
+    assert merged[0].needs_review == 0
+
+
+def test_dedup_borderline_keeps_both_and_flags_needs_review():
+    """60-85 similar -> both kept, both flagged needs_review."""
+    from rapidfuzz import fuzz
+
+    short_desc = "alpha beta gamma delta epsilon"
+    long_desc = "alpha beta gamma delta epsilon zeta eta theta iota"
+    # Guard: fail loudly if the sample strings stop landing in the middle band.
+    score = fuzz.token_sort_ratio(short_desc, long_desc)
+    assert 60.0 <= score < 85.0, f"precondition: score={score} not borderline"
+
+    first = _activity(description=short_desc)
+    second = _activity(description=long_desc)
+    kept, stats = bd.dedup_activities([first, second])
+    assert len(kept) == 2
+    assert stats["borderline"] == 2
+    assert all(act.needs_review == 1 for act in kept)
+
+
+def test_dedup_low_similarity_kept_as_separate_variants():
+    """< 60 similar -> separate variants, no needs_review."""
+    from rapidfuzz import fuzz
+
+    greek = "alpha beta gamma delta epsilon"
+    nato = "quebec romeo sierra tango uniform victor whiskey"
+    # Guard: the two samples must really fall below the lower threshold.
+    assert fuzz.token_sort_ratio(greek, nato) < 60.0
+
+    first = _activity(description=greek)
+    second = _activity(description=nato)
+    kept, stats = bd.dedup_activities([first, second])
+    assert len(kept) == 2
+    assert stats["auto_merged"] == 0
+    assert all(act.needs_review == 0 for act in kept)
+
+
+def test_dedup_never_merges_across_languages():
+    """Same name + same description but EN vs RO -> two distinct rows."""
+    shared_desc = "children form teams and cross the field"
+    romanian = _activity(name="Cursa", description=shared_desc, language="ro")
+    english = _activity(name="Cursa", description=shared_desc, language="en")
+    kept, stats = bd.dedup_activities([romanian, english])
+    assert len(kept) == 2
+    assert stats["auto_merged"] == 0
+    seen_languages = {act.language for act in kept}
+    assert seen_languages == {"ro", "en"}
+
+
+def test_merge_combines_fields():
+    """On merge: longest description/rules, union materials, accumulated sources."""
+    shared_desc = "copiii formeaza echipe si traverseaza terenul cu obstacole"
+    long_rules = "o regula mult mai lunga si mai detaliata pentru joc"
+    first = _activity(
+        description=shared_desc,
+        rules="regula scurta",
+        materials_list="franghie, esarfa",
+        source_file="a.txt",
+        keywords="echipa",
+    )
+    second = _activity(
+        description=shared_desc,
+        rules=long_rules,
+        materials_list="busola, esarfa",
+        source_file="b.txt",
+        keywords="cooperare",
+    )
+    rows, _ = bd.dedup_activities([first, second])
+    assert len(rows) == 1
+    merged = rows[0]
+    assert merged.rules == long_rules
+    # comma-separated fields are compared as sets: order is not asserted
+    materials = {m.strip() for m in merged.materials_list.split(",")}
+    assert materials == {"franghie", "esarfa", "busola"}
+    assert set(merged.source_files) == {"a.txt", "b.txt"}
+    assert merged.popularity_score == 1
+    keywords = {k.strip() for k in merged.keywords.split(",")}
+    assert keywords == {"echipa", "cooperare"}
+
+
+# --------------------------------------------------------------------------
+# step 5 — review decisions
+# --------------------------------------------------------------------------
+def test_review_decision_drop_removes_row():
+    """A 'drop' decision stored under the activity's content key removes it."""
+    from import_common import content_key, normalize_name
+
+    act = _activity(description="o descriere de test")
+    normalized = normalize_name(act.name)
+    key = content_key(normalized, act.language, act.description)
+    decisions = {key: {"decision": "drop"}}
+    kept, stats = bd.apply_review_decisions([act], decisions)
+    assert kept == []
+    assert stats["dropped"] == 1
+
+
+def test_review_decision_keep_separate_clears_needs_review():
+    """A 'keep-separate' decision keeps the row and resets needs_review to 0."""
+    from import_common import content_key, normalize_name
+
+    act = _activity(description="o descriere de test")
+    act.needs_review = 1
+    normalized = normalize_name(act.name)
+    key = content_key(normalized, act.language, act.description)
+    decisions = {key: {"decision": "keep-separate"}}
+    kept, stats = bd.apply_review_decisions([act], decisions)
+    assert len(kept) == 1 and kept[0].needs_review == 0
+    assert stats["resolved"] == 1
+
+
+# --------------------------------------------------------------------------
+# step 2b — source_excerpt hallucination check
+# --------------------------------------------------------------------------
+def test_hallucinated_excerpt_activity_dropped(tmp_path):
+    """Only the activity whose source_excerpt occurs in the chunk text survives."""
+    extracted_dir = tmp_path / "extracted"
+    chunks_dir = tmp_path / "chunks"
+    sources_dir = tmp_path / "sources"
+
+    real = _ext_activity(
+        name="Joc real", source_excerpt="textul real apare in bucata sursa"
+    )
+    invented = _ext_activity(
+        name="Joc inventat",
+        source_excerpt="acest citat nu exista nicaieri in sursa originala xyzzy",
+    )
+    _write_extraction(extracted_dir, "src01.part01", [real, invented])
+    # The chunk text contains the first excerpt verbatim and not the second.
+    chunk_text = (
+        "--- PAGE 1 ---\ntextul real apare in bucata sursa pentru jocul real.\n"
+    )
+    _write_chunk(chunks_dir, "src01", "src01.part01", chunk_text)
+
+    from import_common import load_schema
+
+    schema = load_schema()
+    result = bd.collect_activities(extracted_dir, chunks_dir, sources_dir, schema)
+    surviving = {act.name for act in result["activities"]}
+    assert surviving == {"Joc real"}
+    assert result["activities_hallucinated"] == 1
+    assert (extracted_dir / "_rejected").exists()
+
+
+def test_schema_invalid_file_moved_to_rejected(tmp_path):
+    """A schema-invalid file is moved into ``_rejected`` next to an errors file."""
+    extracted_dir = tmp_path / "extracted"
+    chunks_dir = tmp_path / "chunks"
+    sources_dir = tmp_path / "sources"
+    extracted_dir.mkdir(parents=True)
+
+    # missing required header keys + bad activity
+    invalid_doc = {"header": {}, "activities": [{"name": "x"}]}
+    bad_file = extracted_dir / "bad.json"
+    bad_file.write_text(json.dumps(invalid_doc), encoding="utf-8")
+    from import_common import load_schema
+
+    result = bd.collect_activities(
+        extracted_dir, chunks_dir, sources_dir, load_schema()
+    )
+    assert result["files_rejected_schema"] == 1
+    assert not bad_file.exists()
+    rejected_dir = extracted_dir / "_rejected"
+    assert (rejected_dir / "bad.json").exists()
+    assert (rejected_dir / "bad.errors.txt").exists()
+
+
+# --------------------------------------------------------------------------
+# end-to-end rebuild + atomic swap
+# --------------------------------------------------------------------------
+def _setup_corpus(tmp_path):
+    """Write one extraction file and its matching chunk file under ``tmp_path``.
+
+    Returns ``(extracted, chunks, sources)``; the sources directory is only a
+    path and is not created here.
+    """
+    extracted_dir = tmp_path / "extracted"
+    chunks_dir = tmp_path / "chunks"
+    sources_dir = tmp_path / "sources"
+    excerpt = "jocul testului este o activitate de echipa"
+    activity = _ext_activity(source_excerpt=excerpt)
+    _write_extraction(extracted_dir, "src01.part01", [activity])
+    # The chunk text embeds the excerpt verbatim.
+    chunk_text = f"--- PAGE 1 ---\n{excerpt} in aer liber.\n"
+    _write_chunk(chunks_dir, "src01", "src01.part01", chunk_text)
+    return extracted_dir, chunks_dir, sources_dir
+
+
+def test_rebuild_creates_database(tmp_path):
+    """rebuild() writes a database file holding the one activity of the corpus."""
+    extracted, chunks, sources = _setup_corpus(tmp_path)
+    db_file = tmp_path / "activities.db"
+
+    report = bd.rebuild(
+        extracted_dir=extracted,
+        chunks_dir=chunks,
+        sources_dir=sources,
+        db_path=db_file,
+    )
+    assert db_file.exists()
+    assert report["final_count"] == 1
+
+    # Read the result back through the application's own database layer.
+    manager = DatabaseManager(str(db_file))
+    found = manager.search_activities()
+    assert len(found) == 1
+    assert found[0]["category"] == "team-building"
+
+
+def test_atomic_swap_keeps_live_db_intact_on_crash(tmp_path, monkeypatch):
+    """A mid-build crash must leave the live DB byte-identical."""
+    extracted, chunks, sources = _setup_corpus(tmp_path)
+    db_file = tmp_path / "activities.db"
+
+    # a pre-existing live DB with sentinel content
+    live_db = DatabaseManager(str(db_file))
+    live_db.insert_activity(_activity(name="Sentinel viu"))
+    snapshot = db_file.read_bytes()
+
+    def _crash(self, *args, **kwargs):
+        raise RuntimeError("simulated mid-build crash")
+
+    # Every bulk insert now raises, whichever DatabaseManager instance calls it.
+    monkeypatch.setattr(DatabaseManager, "bulk_insert_activities", _crash)
+
+    with pytest.raises(RuntimeError, match="simulated mid-build crash"):
+        bd.rebuild(
+            extracted_dir=extracted,
+            chunks_dir=chunks,
+            sources_dir=sources,
+            db_path=db_file,
+        )
+
+    # live DB untouched, tmp cleaned up
+    assert db_file.read_bytes() == snapshot
+    leftover = tmp_path / "activities.db.tmp"
+    assert not leftover.exists()
+
+
+def test_rebuild_backs_up_live_db(tmp_path):
+    """Rebuilding over an existing DB reports an existing 'activities.db.bak' backup."""
+    extracted, chunks, sources = _setup_corpus(tmp_path)
+    db_file = tmp_path / "activities.db"
+    existing = DatabaseManager(str(db_file))
+    existing.insert_activity(_activity(name="Vechi"))
+
+    report = bd.rebuild(
+        extracted_dir=extracted,
+        chunks_dir=chunks,
+        sources_dir=sources,
+        db_path=db_file,
+    )
+    assert report["backup"] is not None
+    assert Path(report["backup"]).exists()
+    backup_name = os.path.basename(report["backup"])
+    assert backup_name == "activities.db.bak"
--- a/tests/test_chunk_sources.py
+++ b/tests/test_chunk_sources.py
@@ -0,0 +1,183 @@
+# -*- coding: utf-8 -*-
+"""Tests for scripts/chunk_sources.py."""
+
+import json
+
+import chunk_sources as cs
+import normalize_sources as ns
+
+
+def _pages(n):
+    """Return ``n`` synthetic ``(page_number, text)`` pairs, numbered from 1."""
+    numbers = range(1, n + 1)
+    return [(num, f"text-{num}") for num in numbers]
+
+
+# --------------------------------------------------------------------------
+# header parsing
+# --------------------------------------------------------------------------
+def test_parse_source_splits_header_and_body(paginated_source):
+    """parse_source returns the header fields and a body that opens on page 1."""
+    raw = paginated_source.read_text(encoding="utf-8")
+    header, body = cs.parse_source(raw)
+    assert header["FORMAT"] == "pdf"
+    stripped_body = body.lstrip()
+    assert stripped_body.startswith("--- PAGE 1 ---")
+
+
+# --------------------------------------------------------------------------
+# page chunking
+# --------------------------------------------------------------------------
+def test_chunk_pages_basic_split():
+    """50 pages at 20 per chunk with overlap 4: chunk starts advance by 16."""
+    result = cs.chunk_pages(_pages(50), pages_per_chunk=20, overlap=4)
+    # stride 16: starts at pages 1, 17, 33, ...
+    head = result[0]
+    assert head["page_start"] == 1 and head["page_end"] == 20
+    assert result[1]["page_start"] == 17
+    assert result[-1]["page_end"] == 50
+
+
+def test_chunk_pages_have_overlap():
+    """The first two chunks share exactly 4 pages."""
+    result = cs.chunk_pages(_pages(50), pages_per_chunk=20, overlap=4)
+    # inclusive page range, hence the +1
+    shared_pages = result[0]["page_end"] - result[1]["page_start"] + 1
+    assert shared_pages == 4
+
+
+def test_chunk_pages_short_document_single_chunk():
+    """A document shorter than one chunk yields a single chunk covering it all."""
+    result = cs.chunk_pages(_pages(8), pages_per_chunk=20, overlap=4)
+    assert len(result) == 1
+    only = result[0]
+    assert only["page_start"] == 1 and only["page_end"] == 8
+
+
+def test_chunk_pages_empty():
+    """No pages in, no chunks out."""
+    result = cs.chunk_pages([])
+    assert result == []
+
+
+def test_activity_at_page_boundary_intact_in_one_chunk(paginated_source):
+    """An activity straddling the page 20/21 boundary must appear whole in >=1 chunk."""
+    def _holds_whole_activity(chunk):
+        return "ACTIVITY-START" in chunk["text"] and "ACTIVITY-END" in chunk["text"]
+
+    raw = paginated_source.read_text(encoding="utf-8")
+    chunks = cs.make_chunks(raw)
+    whole = list(filter(_holds_whole_activity, chunks))
+    assert whole, "activity spanning a page boundary was split across all chunks"
+
+
+# --------------------------------------------------------------------------
+# word-window chunking for unpaginated text
+# --------------------------------------------------------------------------
+def test_chunk_words_window_and_overlap():
+    """25000 words, window 10000, overlap 2000: three chunks sharing 2000 words."""
+    words = [f"w{i}" for i in range(25_000)]
+    chunks = cs.chunk_words(" ".join(words), window=10_000, overlap=2_000)
+    assert len(chunks) == 3  # stride 8000 over 25000 words
+    head_words = chunks[0]["text"].split()
+    next_words = chunks[1]["text"].split()
+    # the tail of chunk 0 must equal the head of chunk 1 (2000-word overlap)
+    assert head_words[8_000:10_000] == next_words[0:2_000]
+
+
+def test_make_chunks_unpaginated_uses_word_windows():
+    """A body without page markers is chunked by words, not by pages."""
+    header = "SOURCE: x\nFORMAT: txt\n" + "=" * 50 + "\n\n"
+    body = "cuvant " * 15_000
+    chunks = cs.make_chunks(header + body)
+    assert len(chunks) >= 2
+    first_range = chunks[0]["chunk_range"]
+    assert first_range.startswith("words")
+
+
+# --------------------------------------------------------------------------
+# stable source ids — anti-collision
+# --------------------------------------------------------------------------
+def test_stable_id_same_stem_different_path_no_collision():
+    """Two files named scout.pdf in different folders get different ids."""
+    camp_id = ns.stable_id("camp/games/scout.pdf")
+    school_id = ns.stable_id("school/lessons/scout.pdf")
+    assert camp_id != school_id
+    # both ids still end with the shared file stem
+    assert camp_id.endswith("_scout") and school_id.endswith("_scout")
+
+
+def test_stable_id_deterministic():
+    """Calling stable_id twice on the same path gives the same id."""
+    first_call = ns.stable_id("a/b/c.pdf")
+    second_call = ns.stable_id("a/b/c.pdf")
+    assert first_call == second_call
+
+
+# --------------------------------------------------------------------------
+# manifest registry + idempotency
+# --------------------------------------------------------------------------
+def test_run_writes_chunks_and_manifest(paginated_source, tmp_path):
+    """cs.run reports its counts and writes a manifest whose entries are all 'pending'."""
+    sources_dir = tmp_path / "sources"
+    sources_dir.mkdir()
+    (sources_dir / paginated_source.name).write_text(
+        paginated_source.read_text(encoding="utf-8"), encoding="utf-8"
+    )
+    chunks_dir = tmp_path / "chunks"
+
+    summary = cs.run(sources_dir, chunks_dir)
+    assert summary["sources"] == 1
+    assert summary["chunks"] >= 2
+
+    # Read with an explicit encoding, like every other file access in this
+    # module, so the test does not depend on the platform's locale default.
+    manifest_path = chunks_dir / "manifest.json"
+    manifest = json.loads(manifest_path.read_text(encoding="utf-8"))
+    assert manifest["chunks"]
+    for key, meta in manifest["chunks"].items():
+        assert meta["state"] == "pending"
+        assert meta["expected_json"] == f"{key}.json"
+        # chunk_file is resolved against the parent of chunks_dir
+        assert (chunks_dir.parent / meta["chunk_file"]).exists()
+
+
+def test_manifest_idempotent_preserves_state(paginated_source, tmp_path):
+    """Re-running cs.run on unchanged sources keeps chunk states and chunk count."""
+    sources_dir = tmp_path / "sources"
+    sources_dir.mkdir()
+    (sources_dir / paginated_source.name).write_text(
+        paginated_source.read_text(encoding="utf-8"), encoding="utf-8"
+    )
+    chunks_dir = tmp_path / "chunks"
+    manifest_path = chunks_dir / "manifest.json"
+
+    cs.run(sources_dir, chunks_dir)
+
+    # orchestrator marks one chunk done
+    # (the manifest is read as UTF-8, the same encoding it is written back with)
+    manifest = json.loads(manifest_path.read_text(encoding="utf-8"))
+    first_key = next(iter(manifest["chunks"]))
+    n_before = len(manifest["chunks"])
+    manifest["chunks"][first_key]["state"] = "done"
+    manifest_path.write_text(json.dumps(manifest), encoding="utf-8")
+
+    # re-run: 'done' must survive, no chunk added or lost
+    cs.run(sources_dir, chunks_dir)
+    manifest2 = json.loads(manifest_path.read_text(encoding="utf-8"))
+    assert len(manifest2["chunks"]) == n_before
+    assert manifest2["chunks"][first_key]["state"] == "done"
+    assert all(
+        m["state"] in ("pending", "done") for m in manifest2["chunks"].values()
+    )
+
+
+def test_manifest_resets_state_when_source_changes(paginated_source, tmp_path):
+    """Changing a source's content puts a chunk marked 'done' back to 'pending'."""
+    sources_dir = tmp_path / "sources"
+    sources_dir.mkdir()
+    src = sources_dir / paginated_source.name
+    src.write_text(paginated_source.read_text(encoding="utf-8"), encoding="utf-8")
+    chunks_dir = tmp_path / "chunks"
+    manifest_path = chunks_dir / "manifest.json"
+
+    cs.run(sources_dir, chunks_dir)
+    # The manifest is read as UTF-8, the same encoding it is written back with.
+    manifest = json.loads(manifest_path.read_text(encoding="utf-8"))
+    first_key = next(iter(manifest["chunks"]))
+    manifest["chunks"][first_key]["state"] = "done"
+    manifest_path.write_text(json.dumps(manifest), encoding="utf-8")
+
+    # mutate the source content -> hash changes -> state resets
+    src.write_text(src.read_text(encoding="utf-8") + "\n--- PAGE 51 ---\nextra\n",
+                   encoding="utf-8")
+    cs.run(sources_dir, chunks_dir)
+    manifest2 = json.loads(manifest_path.read_text(encoding="utf-8"))
+    assert manifest2["chunks"][first_key]["state"] == "pending"
+
+
+def test_prune_stale_removes_orphan_entries(paginated_source, tmp_path):
+    """After the only source is deleted, a re-run prunes every manifest entry."""
+    sources_dir = tmp_path / "sources"
+    sources_dir.mkdir()
+    src = sources_dir / paginated_source.name
+    src.write_text(paginated_source.read_text(encoding="utf-8"), encoding="utf-8")
+    chunks_dir = tmp_path / "chunks"
+
+    cs.run(sources_dir, chunks_dir)
+    # delete the source -> its chunks become stale
+    src.unlink()
+    summary = cs.run(sources_dir, chunks_dir)
+    assert summary["chunks"] == 0
+    assert summary["pruned"] >= 1
+    # Explicit encoding keeps the read independent of the locale default.
+    manifest_path = chunks_dir / "manifest.json"
+    manifest = json.loads(manifest_path.read_text(encoding="utf-8"))
+    assert manifest["chunks"] == {}
--- a/tests/test_extract_common.py
+++ b/tests/test_extract_common.py
@@ -0,0 +1,177 @@
+# -*- coding: utf-8 -*-
+"""Tests for scripts/extract_common.py."""
+
+import shutil
+import zipfile
+
+import pytest
+
+import extract_common as ec
+
+
+# --------------------------------------------------------------------------
+# format detection
+# --------------------------------------------------------------------------
+def test_detect_format():
+    """Each known extension maps to its format name; anything else to 'unknown'."""
+    expectations = [
+        ("a/b/file.PDF", "pdf"),  # upper-case extension
+        ("x.docx", "docx"),
+        ("x.doc", "doc"),
+        ("x.pptx", "pptx"),
+        ("x.html", "html"),
+        ("x.zip", "zip"),
+        ("x.epub", "epub"),
+        ("x.xyz", "unknown"),
+    ]
+    for path, expected in expectations:
+        assert ec.detect_format(path) == expected
+
+
+def test_is_junk():
+    """desktop.ini, .bak and README.md count as junk; a real PDF does not."""
+    for junk_path in ("some/desktop.ini", "notes.bak", "README.md"):
+        assert ec.is_junk(junk_path)
+    assert not ec.is_junk("1000 Scout Games.pdf")
+
+
+# --------------------------------------------------------------------------
+# PDF — the critical "no max_pages" regression
+# --------------------------------------------------------------------------
+def test_pdf_extracts_all_60_pages(big_pdf):
+    """Every one of the 60 pages is extracted, including the last."""
+    extracted = ec.extract_pdf(big_pdf)
+    # the old converter capped at 50 pages — page 60 must be present now
+    assert "--- PAGE 60 ---" in extracted
+    assert "PDFMARK-60" in extracted
+    marker_count = ec.count_page_markers(extracted)
+    assert marker_count == 60
+
+
+def test_pdf_does_not_truncate_mid_document(big_pdf):
+    """The last page split out of the extracted body is numbered 60."""
+    extracted = ec.extract_pdf(big_pdf)
+    pages = ec.split_pages(extracted)
+    last_page = pages[-1]
+    assert last_page[0] == 60  # last marker is the real last page
+
+
+# --------------------------------------------------------------------------
+# page join / split round-trip
+# --------------------------------------------------------------------------
+def test_join_split_round_trip():
+    """split_pages(join_pages(texts)) returns the texts numbered from 1."""
+    joined = ec.join_pages(["alpha", "beta", "gamma"])
+    pages = ec.split_pages(joined)
+    numbers = [number for number, _ in pages]
+    assert numbers == [1, 2, 3]
+    texts = [text for _, text in pages]
+    assert texts == ["alpha", "beta", "gamma"]
+
+
+def test_split_pages_no_markers_returns_empty():
+    """Text without any page marker splits into an empty list."""
+    pages = ec.split_pages("plain text with no markers")
+    assert pages == []
+
+
+# --------------------------------------------------------------------------
+# docx — synthetic page markers
+# --------------------------------------------------------------------------
+def test_docx_synthetic_page_markers(sample_docx):
+    """The 100-paragraph docx gets 3 page markers and keeps its last paragraph."""
+    extracted = ec.extract_docx(sample_docx)
+    # 100 paragraphs / 40 per page => 3 pages
+    marker_count = ec.count_page_markers(extracted)
+    assert marker_count == 3
+    assert "Paragraf 99" in extracted
+
+
+# --------------------------------------------------------------------------
+# HTML mirror — nav/script/footer stripped
+# --------------------------------------------------------------------------
+def test_html_strips_chrome(html_with_nav):
+    """Main content is kept; script, header, footer and nav text are removed."""
+    extracted = ec.extract_html(html_with_nav)
+    for kept in ("Vanatoarea de comori", "joc real de orientare"):
+        assert kept in extracted
+    # chrome must be gone
+    removed_snippets = (
+        "tracking",
+        "Site Banner Junk",
+        "toate drepturile rezervate",
+        "Games",
+    )
+    for removed in removed_snippets:
+        assert removed not in extracted
+
+
+# --------------------------------------------------------------------------
+# content hash + near-duplicate elimination
+# --------------------------------------------------------------------------
+def test_content_hash_ignores_whitespace():
+    """Whitespace differences do not change the hash; different words do."""
+    double_space = ec.content_hash("hello  world")
+    trailing_newline = ec.content_hash("hello world\n")
+    assert double_space == trailing_newline
+    assert ec.content_hash("hello world") != ec.content_hash("goodbye world")
+
+
+def test_dedupe_exact_duplicates():
+    """Of two entries with identical text only the first one is kept."""
+    entries = [("a", "joc identic"), ("b", "joc identic"), ("c", "alt joc")]
+    survivors = ec.dedupe_texts(entries)
+    surviving_keys = [key for key, _ in survivors]
+    assert surviving_keys == ["a", "c"]
+
+
+def test_dedupe_near_duplicates():
+    """A text that only adds a short suffix to another is dropped as a near-duplicate."""
+    base = "Vanatoarea de comori este un joc de orientare pentru cercetasi in tabara."
+    near = base + " Pagina printata."  # base plus a 17-character suffix
+    items = [("orig", base), ("print", near), ("other", "Cu totul alt continut diferit aici.")]
+    kept = ec.dedupe_texts(items, threshold=85.0)
+    keys = [k for k, _ in kept]
+    assert "orig" in keys
+    assert "print" not in keys
+    assert "other" in keys
+
+
+# --------------------------------------------------------------------------
+# zip recursion
+# --------------------------------------------------------------------------
+def test_zip_recurses_into_inner_files(sample_zip):
+    """Text of the docx stored inside the zip shows up in the extracted body."""
+    extracted = ec.extract_zip(sample_zip)
+    assert "Paragraf 0" in extracted
+    marker_count = ec.count_page_markers(extracted)
+    assert marker_count > 0
+
+
+def test_zip_bad_archive_returns_empty(tmp_path):
+    """A .zip file that is not a real archive extracts to an empty string."""
+    broken = tmp_path / "broken.zip"
+    broken.write_text("not a zip", encoding="utf-8")
+    extracted = ec.extract_zip(broken)
+    assert extracted == ""
+
+
+def test_nested_zip(tmp_path, sample_zip):
+    """Content inside a zip that is itself inside a zip is still extracted."""
+    outer_path = tmp_path / "outer.zip"
+    with zipfile.ZipFile(outer_path, "w") as outer:
+        outer.write(sample_zip, arcname="nested/archive.zip")
+    extracted = ec.extract_zip(outer_path)
+    assert "Paragraf 0" in extracted
+
+
+# --------------------------------------------------------------------------
+# preflight
+# --------------------------------------------------------------------------
+def test_preflight_python_packages_present():
+    """preflight() reports no missing Python packages."""
+    missing = ec.preflight()["missing_python"]
+    # all required packages are installed in the test environment
+    assert missing == []
+
+
+def test_preflight_reports_libreoffice_state():
+    """A libreoffice warning is present exactly when no libreoffice binary is found."""
+    warnings = ec.preflight()["warnings"]
+    mentions = ["libreoffice" in warning for warning in warnings]
+    if shutil.which("libreoffice") or shutil.which("soffice"):
+        assert not any(mentions)
+    else:
+        assert any(mentions)
+
+
+def test_preflight_ocr_flag():
+    """With check_ocr=True, a missing tesseract binary is listed in missing_system.
+
+    When tesseract is installed this test asserts nothing.
+    """
+    report = ec.preflight(check_ocr=True)
+    tesseract_found = shutil.which("tesseract")
+    if not tesseract_found:
+        assert any("tesseract" in entry for entry in report["missing_system"])
+
+
+# --------------------------------------------------------------------------
+# legacy .doc — skipped unless libreoffice is installed
+# --------------------------------------------------------------------------
+@pytest.mark.skipif(
+    not (shutil.which("libreoffice") or shutil.which("soffice")),
+    reason="libreoffice not installed",
+)
+def test_doc_conversion(tmp_path, sample_docx):
+    """Smoke test: extract_doc on a .doc-named copy of the docx yields >= 1 page marker."""
+    legacy_path = tmp_path / "legacy.doc"
+    shutil.copy(sample_docx, legacy_path)  # smoke test of the docx path
+    extracted = ec.extract_doc(legacy_path)
+    marker_count = ec.count_page_markers(extracted)
+    assert marker_count >= 1
+
+
+def test_doc_without_libreoffice_raises(tmp_path, monkeypatch):
+    """extract_doc raises RuntimeError when no executable can be located."""
+    # Make every executable lookup through ec.shutil.which come back empty.
+    monkeypatch.setattr(ec.shutil, "which", lambda _: None)
+    missing_doc = tmp_path / "whatever.doc"
+    with pytest.raises(RuntimeError):
+        ec.extract_doc(missing_doc)
--- a/tests/test_fts.py
+++ b/tests/test_fts.py
@@ -0,0 +1,139 @@
+"""
+Integration tests for the FTS5 search index.
+
+Confirms that materials_list and skills_developed are indexed by FTS5 and kept
+in sync by the insert / update / delete triggers (plan §6, §7).
+"""
+
+import os
+import sys
+import json
+
+import pytest
+
+# Make the project root importable when pytest is run from anywhere.
+PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
+if PROJECT_ROOT not in sys.path:
+    sys.path.insert(0, PROJECT_ROOT)
+
+from app.models.activity import Activity  # noqa: E402
+from app.models.database import DatabaseManager  # noqa: E402
+
+
+@pytest.fixture
+def db(tmp_path):
+    """Provide a DatabaseManager pointed at a file inside pytest's tmp_path."""
+    return DatabaseManager(str(tmp_path.joinpath("test_activities.db")))
+
+
+def _make_activity(**overrides):
+    """Build an Activity from fixed defaults; keyword overrides win."""
+    defaults = {
+        "name": "Vânătoarea de comori",
+        "description": "O activitate de echipă în aer liber.",
+        "category": "camp-outdoor",
+        "content_type": "joc",
+        "source_file": "test.txt",
+        "language": "ro",
+    }
+    return Activity(**{**defaults, **overrides})
+
+
+def test_search_by_materials_list(db):
+    """Searching a word found only in materials_list hits the one row."""
+    db.insert_activity(
+        _make_activity(materials_list="frânghie, eșarfă, busolă")
+    )
+    hits = db.search_activities(search_text="busolă")
+    assert len(hits) == 1
+    assert hits[0]["name"] == "Vânătoarea de comori"
+
+
+def test_search_by_skills_developed(db):
+    """Searching a word found only in skills_developed hits the one row."""
+    db.insert_activity(
+        _make_activity(skills_developed="comunicare, leadership, rabdare")
+    )
+    hits = db.search_activities(search_text="leadership")
+    assert len(hits) == 1
+    assert hits[0]["name"] == "Vânătoarea de comori"
+
+
+def test_term_absent_from_indexed_columns_no_hit(db):
+    """Control case: a query matching nothing stored returns an empty list."""
+    db.insert_activity(_make_activity(materials_list="frânghie"))
+    assert db.search_activities(search_text="zzzunlikelyterm") == []
+
+
+def test_delete_trigger_removes_from_fts(db):
+    """A row deleted with raw SQL stops being returned by text search."""
+    row_id = db.insert_activity(_make_activity(materials_list="catalige"))
+    assert len(db.search_activities(search_text="catalige")) == 1
+
+    # Raw SQL rather than a manager method: FTS sync is left to the trigger.
+    delete_sql = "DELETE FROM activities WHERE id = ?"
+    with db._get_connection() as connection:
+        connection.execute(delete_sql, (row_id,))
+        connection.commit()
+    assert db.search_activities(search_text="catalige") == []
+
+
+def test_update_trigger_resyncs_fts(db):
+    """After a raw-SQL UPDATE of materials_list, search follows the new text."""
+    row_id = db.insert_activity(_make_activity(materials_list="creioane"))
+    assert len(db.search_activities(search_text="creioane")) == 1
+
+    # Raw SQL rather than a manager method: FTS sync is left to the trigger.
+    update_sql = "UPDATE activities SET materials_list = ? WHERE id = ?"
+    with db._get_connection() as connection:
+        connection.execute(update_sql, ("acuarele", row_id))
+        connection.commit()
+
+    # The replaced word must vanish from the index ...
+    assert db.search_activities(search_text="creioane") == []
+    # ... and the replacement must be findable, exactly once.
+    by_new_term = db.search_activities(search_text="acuarele")
+    assert len(by_new_term) == 1
+
+
+def test_rebuild_fts_index(db):
+    """A skills_developed term is still found after rebuild_fts_index()."""
+    db.insert_activity(_make_activity(skills_developed="orientare"))
+    db.rebuild_fts_index()
+    assert len(db.search_activities(search_text="orientare")) == 1
+
+
+def test_new_schema_columns_round_trip(db):
+    """Columns added by the new schema survive insert, fetch and from_dict."""
+    expected_scalars = {
+        "content_type": "joc",
+        "language": "ro",
+        "extraction_confidence": "high",
+        "needs_review": 1,
+        "normalized_name": "vanatoarea de comori",
+        "source_excerpt": "Citat scurt din sursă.",
+    }
+    overrides = dict(expected_scalars, source_files=["a.txt", "b.txt"])
+    activity_id = db.insert_activity(_make_activity(**overrides))
+    row = db.get_activity_by_id(activity_id)
+
+    for column, value in expected_scalars.items():
+        assert row[column] == value
+    # source_files comes back from the row as JSON text, hence the decode.
+    assert json.loads(row["source_files"]) == ["a.txt", "b.txt"]
+
+    # from_dict must turn the fetched row back into an Activity.
+    loaded = Activity.from_dict(row)
+    assert loaded.source_files == ["a.txt", "b.txt"]
+    assert loaded.content_type == "joc"
+
+
+def test_normalized_name_auto_derived(db):
+    """Without an explicit normalized_name, one is derived from name."""
+    fields = {
+        "name": "Ștafetă cu  Obstacole",
+        "description": "desc",
+        "category": "sports-active",
+        "source_file": "t.txt",
+    }
+    assert Activity(**fields).normalized_name == "stafeta cu obstacole"
--- a/tests/test_search.py
+++ b/tests/test_search.py
@@ -0,0 +1,140 @@
+# -*- coding: utf-8 -*-
+"""
+CRITICAL REGRESSION TEST (plan §6, §7).
+
+`search.py` changed the result sets of /search and /api/search: the default
+search now EXCLUDES the non-game content types (rețetă / cântec / ceremonie),
+which surface only when the user explicitly filters that content_type or picks
+a non-game category. This test guards that behaviour.
+"""
+
+import pytest
+
+from app.models.activity import Activity
+from app.models.database import DatabaseManager
+from app.services.search import SearchService
+from app.config_taxonomy import NON_GAME_CONTENT_TYPES
+
+
+# --------------------------------------------------------------------------
+# fixtures
+# --------------------------------------------------------------------------
+def _activity(name, content_type, category="altele", language="ro"):
+    """Create a fixture Activity whose description embeds name and type."""
+    fields = dict(name=name, category=category, content_type=content_type)
+    fields["description"] = (
+        f"Descriere pentru {name}, un conținut de tip {content_type}."
+    )
+    fields["language"] = language
+    fields["source_file"] = "test/fixture.txt"
+    return Activity(**fields)
+
+
+@pytest.fixture
+def search_service(tmp_path):
+    """SearchService over a temp DB: five content types plus one 'en' game."""
+    store = DatabaseManager(str(tmp_path.joinpath("activities.db")))
+    store.clear_database()
+    store.bulk_insert_activities([
+        _activity("Vanatoarea de comori", "joc", category="wide-games"),
+        _activity("Cercul de cunoastere", "activitate", category="icebreakers"),
+        _activity("Reteta de paine la ceaun", "reteta", category="retete"),
+        _activity("Cantecul de tabara", "cantec", category="cantece-ceremonii"),
+        _activity("Ceremonia de inchidere", "ceremonie", category="cantece-ceremonii"),
+        _activity("Game in English", "joc", category="wide-games", language="en"),
+    ])
+    return SearchService(store)
+
+
+def _content_types(results):  # distinct content_type values across rows
+    return set(row.get("content_type") for row in results)
+
+
+# --------------------------------------------------------------------------
+# the regression: default search excludes non-game content types
+# --------------------------------------------------------------------------
+def test_default_search_excludes_non_game_content(search_service):
+    """Unfiltered search returns game rows and none of the non-game types."""
+    results = search_service.search_activities()
+    seen = _content_types(results)
+
+    assert seen, "default search returned nothing"
+    for excluded in NON_GAME_CONTENT_TYPES:
+        leak = f"default search leaked non-game content_type '{excluded}'"
+        assert excluded not in seen, leak
+
+    # both game-like content types from the seed data remain visible
+    for expected in ("joc", "activitate"):
+        assert expected in seen
+
+
+def test_default_search_with_text_excludes_non_game(search_service):
+    """A text query must not return ANY non-game content type by default."""
+    results = search_service.search_activities(search_text="conținut")
+    assert not set(NON_GAME_CONTENT_TYPES) & _content_types(results)
+
+
+# --------------------------------------------------------------------------
+# explicit content_type filter INCLUDES the non-game rows
+# --------------------------------------------------------------------------
+def test_explicit_content_type_filter_includes_non_game(search_service):
+    """content_type=reteta yields the single seeded recipe and nothing else."""
+    only_recipes = {"content_type": "reteta"}
+    hits = search_service.search_activities(filters=only_recipes)
+    found = _content_types(hits)
+    assert found == {"reteta"}, f"expected only rețete, got {found}"
+    assert len(hits) == 1
+
+
+def test_explicit_content_type_filter_for_cantec(search_service):
+    hits = search_service.search_activities(filters=dict(content_type="cantec"))
+    assert {"cantec"} == _content_types(hits)  # songs only, nothing else
+
+
+# --------------------------------------------------------------------------
+# a non-game CATEGORY filter also lifts the exclusion
+# --------------------------------------------------------------------------
+def test_non_game_category_filter_includes_non_game(search_service):
+    """category=cantece-ceremonii returns both songs and ceremonies."""
+    by_category = {"category": "cantece-ceremonii"}
+    hits = search_service.search_activities(filters=by_category)
+    found = _content_types(hits)
+
+    # the default non-game exclusion must not apply to this category
+    assert {"cantec", "ceremonie"} <= found
+
+
+def test_game_category_filter_still_excludes_non_game(search_service):
+    """Filtering by wide-games must not surface any non-game content type."""
+    by_category = {"category": "wide-games"}
+    hits = search_service.search_activities(filters=by_category)
+    leaked = [t for t in NON_GAME_CONTENT_TYPES if t in _content_types(hits)]
+    assert leaked == []
+
+
+# --------------------------------------------------------------------------
+# language filter
+# --------------------------------------------------------------------------
+def test_language_filter_ro(search_service):
+    """language=ro returns a non-empty result set of 'ro' rows only."""
+    hits = search_service.search_activities(filters={"language": "ro"})
+    assert hits and {row.get("language") for row in hits} == {"ro"}
+
+
+def test_language_filter_en(search_service):
+    """language=en returns only the English seed row, 'Game in English'."""
+    hits = search_service.search_activities(filters={"language": "en"})
+    assert hits and {row.get("language") for row in hits} == {"en"}
+    assert {row.get("name") for row in hits} == {"Game in English"}
+
+
+# --------------------------------------------------------------------------
+# get_filter_options surfaces the new axes
+# --------------------------------------------------------------------------
+def test_filter_options_include_content_type_and_language(search_service):
+    """get_filter_options() reports content_type and language values."""
+    options = search_service.db.get_filter_options()
+    for axis in ("content_type", "language"):
+        assert axis in options
+    assert "joc" in options["content_type"]
+    assert set(options["language"]) == {"ro", "en"}
--- a/tests/test_validate_extractions.py
+++ b/tests/test_validate_extractions.py
@@ -0,0 +1,156 @@
+# -*- coding: utf-8 -*-
+"""
+Tests for scripts/validate_extractions.py.
+
+Covers: schema rejection, the source_excerpt hallucination check, the content
+of the generated re-extraction prompt, and the manifest `rejected` marking.
+"""
+
+import json
+import sys
+from pathlib import Path
+
+REPO_ROOT = Path(__file__).resolve().parent.parent
+SCRIPTS_DIR = REPO_ROOT / "scripts"
+for _p in (str(REPO_ROOT), str(SCRIPTS_DIR)):
+    if _p not in sys.path:
+        sys.path.insert(0, _p)
+
+import validate_extractions as ve  # noqa: E402
+
+
+# --------------------------------------------------------------------------
+# helpers
+# --------------------------------------------------------------------------
+def _ext_activity(**over):
+    """Return a baseline extracted-activity dict with `over` applied on top."""
+    return {
+        "name": "Jocul testului",
+        "description": "O activitate de echipa in aer liber.",
+        "category": "team-building",
+        "content_type": "joc",
+        "language": "ro",
+        "extraction_confidence": "high",
+        "source_excerpt": "ancora din bucata sursa",
+        "page_reference": "page 1",
+        **over,
+    }
+
+
+def _write_extraction(extracted_dir, chunk_key, activities, header_extra=None):
+    """Write <chunk_key>.json (header + activities) into extracted_dir."""
+    extracted_dir.mkdir(parents=True, exist_ok=True)
+    header = {
+        "source_hash": "hash1234deadbeef",
+        "schema_version": "1.0",
+        "prompt_version": "1.0",
+        "chunk_range": "pages 1-20",
+        "source_id": "src01",
+        "chunk_key": chunk_key,
+        **(header_extra or {}),
+    }
+    document = json.dumps(
+        {"header": header, "activities": activities}, ensure_ascii=False
+    )
+    (extracted_dir / f"{chunk_key}.json").write_text(document, encoding="utf-8")
+
+
+def _write_chunk(chunks_dir, source_id, chunk_key, text):  # chunk text fixture
+    source_dir = chunks_dir / source_id
+    source_dir.mkdir(parents=True, exist_ok=True)
+    source_dir.joinpath(f"{chunk_key}.txt").write_text(text, encoding="utf-8")
+
+
+# --------------------------------------------------------------------------
+# tests
+# --------------------------------------------------------------------------
+def test_valid_file_passes(tmp_path):
+    """An excerpt that appears verbatim in its chunk validates cleanly."""
+    quote = "ancora din bucata sursa apare aici"
+    extracted_dir, chunks_dir = tmp_path / "extracted", tmp_path / "chunks"
+    _write_extraction(
+        extracted_dir, "src01.part01", [_ext_activity(source_excerpt=quote)]
+    )
+    _write_chunk(chunks_dir, "src01", "src01.part01", f"--- PAGE 1 ---\n{quote}\n")
+    report = ve.run(extracted_dir, chunks_dir, tmp_path / "manifest.json")
+    assert (report["valid"], report["rejected"]) == (1, 0)
+
+
+def test_schema_invalid_file_rejected(tmp_path):
+    """A file with an empty header and a name-only activity is rejected."""
+    extracted_dir = tmp_path / "extracted"
+    extracted_dir.mkdir(parents=True)
+    malformed = {"header": {}, "activities": [{"name": "x"}]}
+    target = extracted_dir / "src01.part01.json"
+    target.write_text(json.dumps(malformed), encoding="utf-8")
+
+    report = ve.run(extracted_dir, tmp_path / "chunks", tmp_path / "manifest.json")
+    assert report["rejected"] == 1
+    # a rejection must also leave a re-extraction prompt behind
+    assert (extracted_dir / "_reextract" / "src01.part01.prompt.md").exists()
+
+
+def test_hallucinated_excerpt_rejected(tmp_path):
+    """An excerpt absent from the chunk text is reported as a hallucination."""
+    extracted_dir, chunks_dir = tmp_path / "extracted", tmp_path / "chunks"
+    invented = _ext_activity(
+        source_excerpt="citat complet inventat care nu exista qqqq"
+    )
+    chunk_text = "--- PAGE 1 ---\ntext complet diferit despre altceva.\n"
+    _write_extraction(extracted_dir, "src01.part01", [invented])
+    _write_chunk(chunks_dir, "src01", "src01.part01", chunk_text)
+
+    report = ve.run(extracted_dir, chunks_dir, tmp_path / "manifest.json")
+    assert report["rejected"] == 1
+    first_errors = report["rejected_chunks"][0]["errors"]
+    assert any("hallucination" in message for message in first_errors)
+
+
+def test_reextraction_prompt_content(tmp_path):
+    """The generated re-extraction prompt names the chunk and its output file."""
+    extracted_dir, chunks_dir = tmp_path / "extracted", tmp_path / "chunks"
+    invented = _ext_activity(source_excerpt="citat inventat care nu exista zzzz")
+    chunk_text = "--- PAGE 1 ---\ntext despre cu totul altceva aici.\n"
+    _write_extraction(extracted_dir, "src01.part01", [invented])
+    _write_chunk(chunks_dir, "src01", "src01.part01", chunk_text)
+
+    ve.run(extracted_dir, chunks_dir, tmp_path / "manifest.json")
+    prompt_file = extracted_dir / "_reextract" / "src01.part01.prompt.md"
+    prompt = prompt_file.read_text(encoding="utf-8")
+    for fragment in (
+        "src01.part01",
+        "REJECTED",
+        "verbatim",
+        "data/extracted/src01.part01.json",
+    ):
+        assert fragment in prompt
+
+
+def test_manifest_marks_chunk_rejected(tmp_path):
+    """A chunk recorded as 'done' is flipped to 'rejected' in the manifest."""
+    extracted_dir, chunks_dir = tmp_path / "extracted", tmp_path / "chunks"
+    manifest_path = tmp_path / "manifest.json"
+    initial_entry = {
+        "state": "done",
+        "chunk_file": "chunks/src01/src01.part01.txt",
+    }
+    manifest_path.write_text(
+        json.dumps({"chunks": {"src01.part01": initial_entry}}),
+        encoding="utf-8",
+    )
+    fabricated = _ext_activity(source_excerpt="citat fabricat absent vvvv")
+    _write_extraction(extracted_dir, "src01.part01", [fabricated])
+    chunk_text = "--- PAGE 1 ---\nun continut neinrudit.\n"
+    _write_chunk(chunks_dir, "src01", "src01.part01", chunk_text)
+    ve.run(extracted_dir, chunks_dir, manifest_path)
+    reloaded = json.loads(manifest_path.read_text(encoding="utf-8"))
+    assert reloaded["chunks"]["src01.part01"]["state"] == "rejected"
+
+
+def test_build_reextraction_prompt_lists_errors():
+    """The prompt text contains the chunk key and the supplied error's field."""
+    failures = ["header: 'source_hash' is a required property"]
+    chunk_file = "data/chunks/abc/abc.part03.txt"
+    prompt = ve.build_reextraction_prompt("abc.part03", chunk_file, failures)
+    assert "abc.part03" in prompt
+    assert "source_hash" in prompt