Rebuild extraction pipeline infrastructure (Faza 0 prep)
Implements the approved plan to replace the broken regex/index-master extraction with an LLM-subagent pipeline. Four parallel lanes: Lane A — scripts/extract_common.py (PDF/docx/doc/pptx/html/zip, no max_pages truncation), normalize_sources.py, chunk_sources.py (~20pg chunks + overlap, manifest registry), activity_schema.json. Lane B — app/config_taxonomy.py (16 fixed category slugs), schema rebuilt from scratch in app/models/ with content_type, language, source_files, source_excerpt, normalized_name, extraction_confidence, needs_review; FTS5 + 3 triggers extended with materials_list and skills_developed. Lane C — build_database.py (--rebuild, atomic swap, schema + fuzzy source_excerpt validation, dedup with needs_review band), validate_extractions.py, review_queue.py, new run_extraction.py orchestrator, SUBAGENT_PROMPT.md. Lane D — search.py content_type/language filters (default search excludes non-game content), E7 schema-compat audit; fixed a NULL keywords AttributeError in _boost_search_relevance. Removes 8 orphaned/dead scripts and app/services/parser.py + indexer.py. Adds tests/ (70 passing, 1 skipped — libreoffice absent). Note: Lane D made one additive edit to app/models/database.py (_update_category_counts) to surface content_type/language in get_filter_options, outside its nominal lane boundary but after Lane B completed. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
114
tests/conftest.py
Normal file
114
tests/conftest.py
Normal file
@@ -0,0 +1,114 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Shared pytest fixtures for the extraction-pipeline tests.
|
||||
|
||||
scripts/ is not a package, so it is added to sys.path here. Synthetic fixtures
|
||||
(PDF, docx, zip, HTML) are generated at runtime — no binary blobs in the repo.
|
||||
"""
|
||||
|
||||
import sys
|
||||
import zipfile
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
# The repository root is two levels above this file (tests/conftest.py).
REPO_ROOT = Path(__file__).resolve().parents[1]
SCRIPTS_DIR = REPO_ROOT / "scripts"

# scripts/ is not a package: put it at the front of sys.path exactly once.
if sys.path.count(str(SCRIPTS_DIR)) == 0:
    sys.path.insert(0, str(SCRIPTS_DIR))
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# synthetic PDF — deliberately large to pin the "no max_pages" regression
|
||||
# --------------------------------------------------------------------------
|
||||
@pytest.fixture
def big_pdf(tmp_path):
    """Write a 60-page PDF to tmp_path and return its path.

    Page n carries the unique token 'PDFMARK-<n>' plus one fixed line of
    Romanian text.
    """
    from reportlab.lib.pagesizes import letter
    from reportlab.pdfgen import canvas

    total_pages = 60
    pdf_path = tmp_path / "big.pdf"
    pdf = canvas.Canvas(str(pdf_path), pagesize=letter)
    for page_no in range(1, total_pages + 1):
        marker_line = f"PDFMARK-{page_no} synthetic activity page number {page_no}"
        pdf.drawString(72, 720, marker_line)
        pdf.drawString(72, 700, "Acest joc educativ se joaca in echipa.")
        # showPage() closes the current page; the next draw starts a new one.
        pdf.showPage()
    pdf.save()
    return pdf_path
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# synthetic docx — 100 paragraphs => 3 synthetic pages at 40 paras/page
|
||||
# --------------------------------------------------------------------------
|
||||
@pytest.fixture
def sample_docx(tmp_path):
    """Write a 100-paragraph .docx to tmp_path and return its path."""
    import docx

    target = tmp_path / "sample.docx"
    doc = docx.Document()
    paragraph_texts = (
        f"Paragraf {idx}: continut joc team-building." for idx in range(100)
    )
    for text in paragraph_texts:
        doc.add_paragraph(text)
    doc.save(str(target))
    return target
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# synthetic HTML mirror page — with nav/script/footer chrome to strip
|
||||
# --------------------------------------------------------------------------
|
||||
# Synthetic page: the game text lives in <main>; the <style>, <script>, <nav>,
# <header> and <footer> elements are the chrome the HTML test expects stripped.
HTML_WITH_NAV = """<!doctype html>
<html><head><title>Joc</title>
<style>.x{color:red}</style>
<script>var tracking = 1;</script>
</head><body>
<nav><a href="/">Home</a><a href="/games">Games</a></nav>
<header>Site Banner Junk</header>
<main>
<h1>Vanatoarea de comori</h1>
<p>Acesta este un joc real de orientare pentru cercetasi.</p>
<p>Jucatorii cauta indicii ascunse in tabara.</p>
</main>
<footer>Copyright 2024 - toate drepturile rezervate</footer>
</body></html>
"""
|
||||
|
||||
|
||||
@pytest.fixture
def html_with_nav(tmp_path):
    """Write HTML_WITH_NAV to tmp_path/page.html as UTF-8 and return the path."""
    page = tmp_path / "page.html"
    with open(page, "w", encoding="utf-8") as handle:
        handle.write(HTML_WITH_NAV)
    return page
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# synthetic zip — contains a docx and a stray junk file
|
||||
# --------------------------------------------------------------------------
|
||||
@pytest.fixture
def sample_zip(tmp_path, sample_docx):
    """Create archive.zip holding the sample docx and a stray desktop.ini.

    The docx is stored under 'inner/sample.docx'; desktop.ini contains the
    literal text 'junk'. Returns the archive path.
    """
    archive_path = tmp_path / "archive.zip"
    with zipfile.ZipFile(archive_path, "w") as archive:
        archive.write(sample_docx, arcname="inner/sample.docx")
        archive.writestr("desktop.ini", "junk")
    return archive_path
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# synthetic normalized source — paginated, with an activity straddling a
|
||||
# page boundary so the chunker overlap can be verified.
|
||||
# --------------------------------------------------------------------------
|
||||
@pytest.fixture
def paginated_source(tmp_path):
    """Write a 50-page normalized source file and return its path.

    Page 20 holds 'ACTIVITY-START ...' and page 21 holds '... ACTIVITY-END',
    so one activity straddles the 20/21 page boundary.
    """
    # Pages whose content differs from the generic filler line.
    special_pages = {
        20: "ACTIVITY-START jocul podului care traverseaza pagina",
        21: "continuare a jocului podului ACTIVITY-END",
    }
    rows = [
        "SOURCE: synthetic/test.pdf",
        "CONVERTED: 2026-05-19",
        "FORMAT: pdf",
        "=" * 50,
        "",
    ]
    for page_no in range(1, 51):
        content = special_pages.get(page_no, f"continut obisnuit pe pagina {page_no}")
        rows.extend((f"--- PAGE {page_no} ---", content, ""))

    out_path = tmp_path / "src_paginated.txt"
    out_path.write_text("\n".join(rows), encoding="utf-8")
    return out_path
|
||||
3
tests/fixtures/.gitkeep
vendored
Normal file
3
tests/fixtures/.gitkeep
vendored
Normal file
@@ -0,0 +1,3 @@
|
||||
# Test fixtures (synthetic PDF/docx/zip/HTML) are generated at runtime by
|
||||
# tests/conftest.py — no binary blobs are committed. This file only preserves
|
||||
# the directory in git.
|
||||
334
tests/test_build_database.py
Normal file
334
tests/test_build_database.py
Normal file
@@ -0,0 +1,334 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Tests for scripts/build_database.py — the import / dedup / swap side.
|
||||
|
||||
Covers: category -> slug + `altele` fallback; dedup across all three threshold
|
||||
bands; EN != RO never merged; field combination on merge; atomic swap with a
|
||||
simulated mid-build crash; the source_excerpt substring check.
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
REPO_ROOT = Path(__file__).resolve().parents[1]
SCRIPTS_DIR = REPO_ROOT / "scripts"

# Both directories go on sys.path: the repo root for the `app.*` imports and
# scripts/ for `build_database`, which is imported as a top-level module.
for _p in (str(REPO_ROOT), str(SCRIPTS_DIR)):
    if _p in sys.path:
        continue
    sys.path.insert(0, _p)
|
||||
|
||||
import build_database as bd # noqa: E402
|
||||
from app.models.activity import Activity # noqa: E402
|
||||
from app.models.database import DatabaseManager # noqa: E402
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# helpers
|
||||
# --------------------------------------------------------------------------
|
||||
def _activity(**over):
    """Build an Activity from baseline test values, with keyword overrides.

    Any keyword in ``over`` replaces or extends the baseline fields before
    they are passed to the Activity constructor.
    """
    defaults = {
        "name": "Jocul testului",
        "description": "O activitate de echipa in aer liber.",
        "category": "team-building",
        "content_type": "joc",
        "language": "ro",
        "extraction_confidence": "high",
    }
    return Activity(**{**defaults, **over})
|
||||
|
||||
|
||||
def _ext_activity(**over):
|
||||
"""A schema-valid extraction-JSON activity object."""
|
||||
base = dict(
|
||||
name="Jocul testului",
|
||||
description="O activitate de echipa in aer liber.",
|
||||
category="team-building",
|
||||
content_type="joc",
|
||||
language="ro",
|
||||
extraction_confidence="high",
|
||||
source_excerpt="ANCHOR-EXCERPT despre jocul testului",
|
||||
page_reference="page 1",
|
||||
)
|
||||
base.update(over)
|
||||
return base
|
||||
|
||||
|
||||
def _write_extraction(extracted_dir, chunk_key, activities, source_id="src01"):
|
||||
extracted_dir.mkdir(parents=True, exist_ok=True)
|
||||
payload = {
|
||||
"header": {
|
||||
"source_hash": "hash1234deadbeef",
|
||||
"schema_version": "1.0",
|
||||
"prompt_version": "1.0",
|
||||
"chunk_range": "pages 1-20",
|
||||
"source_id": source_id,
|
||||
"chunk_key": chunk_key,
|
||||
},
|
||||
"activities": activities,
|
||||
}
|
||||
(extracted_dir / f"{chunk_key}.json").write_text(
|
||||
json.dumps(payload, ensure_ascii=False), encoding="utf-8"
|
||||
)
|
||||
|
||||
|
||||
def _write_chunk(chunks_dir, source_id, chunk_key, text):
|
||||
d = chunks_dir / source_id
|
||||
d.mkdir(parents=True, exist_ok=True)
|
||||
(d / f"{chunk_key}.txt").write_text(text, encoding="utf-8")
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# step 3 — category normalization
|
||||
# --------------------------------------------------------------------------
|
||||
def test_category_alias_mapped_to_slug():
    """The category alias 'teambuilding' must come out as 'team-building'."""
    raw = _ext_activity(category="teambuilding")
    converted = bd.dict_to_activity(raw, "s.txt")
    assert converted.category == "team-building"
|
||||
|
||||
|
||||
def test_unknown_category_falls_back_to_altele():
    """An unrecognized category string must be mapped to 'altele'."""
    raw = _ext_activity(category="zzz-not-a-category")
    converted = bd.dict_to_activity(raw, "s.txt")
    assert converted.category == "altele"
|
||||
|
||||
|
||||
def test_content_type_normalized():
    """The content_type value 'games' must be normalized to 'joc'."""
    raw = _ext_activity(content_type="games")
    converted = bd.dict_to_activity(raw, "s.txt")
    assert converted.content_type == "joc"
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# step 4 — dedup, three bands
|
||||
# --------------------------------------------------------------------------
|
||||
def test_dedup_auto_merge_identical_descriptions():
    """Two activities with identical descriptions collapse into one row.

    The merged row must not be flagged needs_review, and the stats must
    report exactly one auto-merge.
    """
    shared = "copiii formeaza echipe si traverseaza terenul"
    pair = [_activity(description=shared), _activity(description=shared)]

    merged, stats = bd.dedup_activities(pair)

    assert len(merged) == 1
    assert stats["auto_merged"] == 1
    assert merged[0].needs_review == 0
|
||||
|
||||
|
||||
def test_dedup_borderline_keeps_both_and_flags_needs_review():
    """Descriptions scoring in [60, 85) are both kept and both flagged."""
    from rapidfuzz import fuzz

    shorter = "alpha beta gamma delta epsilon"
    longer = "alpha beta gamma delta epsilon zeta eta theta iota"

    # Guard the fixture itself: the pair must actually land in the borderline band.
    score = fuzz.token_sort_ratio(shorter, longer)
    assert 60.0 <= score < 85.0, f"precondition: score={score} not borderline"

    candidates = [_activity(description=shorter), _activity(description=longer)]
    out, stats = bd.dedup_activities(candidates)

    assert len(out) == 2
    assert stats["borderline"] == 2
    for act in out:
        assert act.needs_review == 1
|
||||
|
||||
|
||||
def test_dedup_low_similarity_kept_as_separate_variants():
    """Descriptions scoring below 60 stay separate and are not flagged."""
    from rapidfuzz import fuzz

    greek = "alpha beta gamma delta epsilon"
    nato = "quebec romeo sierra tango uniform victor whiskey"
    # Precondition on the fixture: the two texts really are dissimilar.
    assert fuzz.token_sort_ratio(greek, nato) < 60.0

    candidates = [_activity(description=greek), _activity(description=nato)]
    out, stats = bd.dedup_activities(candidates)

    assert len(out) == 2
    assert stats["auto_merged"] == 0
    for act in out:
        assert act.needs_review == 0
|
||||
|
||||
|
||||
def test_dedup_never_merges_across_languages():
    """Same name and description, but 'ro' vs 'en', must stay two rows."""
    shared_desc = "children form teams and cross the field"
    variants = [
        _activity(name="Cursa", description=shared_desc, language=lang)
        for lang in ("ro", "en")
    ]

    out, stats = bd.dedup_activities(variants)

    assert len(out) == 2
    assert stats["auto_merged"] == 0
    surviving_languages = {act.language for act in out}
    assert surviving_languages == {"ro", "en"}
|
||||
|
||||
|
||||
def test_merge_combines_fields():
    """A merge keeps the longer rules and unions materials, sources, keywords.

    Also pins popularity_score == 1 on the merged row.
    """
    shared_desc = "copiii formeaza echipe si traverseaza terenul cu obstacole"
    longer_rules = "o regula mult mai lunga si mai detaliata pentru joc"

    first = _activity(
        description=shared_desc,
        rules="regula scurta",
        materials_list="franghie, esarfa",
        source_file="a.txt",
        keywords="echipa",
    )
    second = _activity(
        description=shared_desc,
        rules=longer_rules,
        materials_list="busola, esarfa",
        source_file="b.txt",
        keywords="cooperare",
    )

    result, _stats = bd.dedup_activities([first, second])
    assert len(result) == 1
    merged = result[0]

    assert merged.rules == longer_rules

    # 'esarfa' appears in both inputs and must not be duplicated.
    materials = {item.strip() for item in merged.materials_list.split(",")}
    assert materials == {"franghie", "esarfa", "busola"}

    assert set(merged.source_files) == {"a.txt", "b.txt"}
    assert merged.popularity_score == 1

    keywords = {word.strip() for word in merged.keywords.split(",")}
    assert keywords == {"echipa", "cooperare"}
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# step 5 — review decisions
|
||||
# --------------------------------------------------------------------------
|
||||
def test_review_decision_drop_removes_row():
    """A 'drop' decision keyed on the activity's content key removes it."""
    from import_common import content_key, normalize_name

    act = _activity(description="o descriere de test")
    decision_key = content_key(
        normalize_name(act.name), act.language, act.description
    )
    decisions = {decision_key: {"decision": "drop"}}

    kept, stats = bd.apply_review_decisions([act], decisions)

    assert kept == []
    assert stats["dropped"] == 1
|
||||
|
||||
|
||||
def test_review_decision_keep_separate_clears_needs_review():
    """A 'keep-separate' decision keeps the row and resets needs_review to 0."""
    from import_common import content_key, normalize_name

    act = _activity(description="o descriere de test")
    act.needs_review = 1
    decision_key = content_key(
        normalize_name(act.name), act.language, act.description
    )
    decisions = {decision_key: {"decision": "keep-separate"}}

    kept, stats = bd.apply_review_decisions([act], decisions)

    assert len(kept) == 1 and kept[0].needs_review == 0
    assert stats["resolved"] == 1
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# step 2b — source_excerpt hallucination check
|
||||
# --------------------------------------------------------------------------
|
||||
def test_hallucinated_excerpt_activity_dropped(tmp_path):
    """An activity whose source_excerpt is absent from the chunk is dropped.

    One extraction file holds a 'good' activity (excerpt present in the chunk
    text) and a 'bad' one (excerpt not present). Only the good one may
    survive, and a _rejected directory must exist afterwards.
    """
    from import_common import load_schema

    extracted = tmp_path / "extracted"
    chunks = tmp_path / "chunks"
    sources = tmp_path / "sources"

    real_quote = "textul real apare in bucata sursa"
    grounded = _ext_activity(name="Joc real", source_excerpt=real_quote)
    invented = _ext_activity(
        name="Joc inventat",
        source_excerpt="acest citat nu exista nicaieri in sursa originala xyzzy",
    )
    _write_extraction(extracted, "src01.part01", [grounded, invented])

    chunk_text = (
        "--- PAGE 1 ---\ntextul real apare in bucata sursa pentru jocul real.\n"
    )
    _write_chunk(chunks, "src01", "src01.part01", chunk_text)

    schema = load_schema()
    res = bd.collect_activities(extracted, chunks, sources, schema)

    surviving_names = {act.name for act in res["activities"]}
    assert surviving_names == {"Joc real"}
    assert res["activities_hallucinated"] == 1
    assert (extracted / "_rejected").exists()
|
||||
|
||||
|
||||
def test_schema_invalid_file_moved_to_rejected(tmp_path):
    """A schema-invalid extraction file is moved into _rejected/.

    The original file must be gone, and _rejected/ must hold both the file
    and a companion 'bad.errors.txt'.
    """
    from import_common import load_schema

    extracted = tmp_path / "extracted"
    chunks = tmp_path / "chunks"
    sources = tmp_path / "sources"
    extracted.mkdir(parents=True)

    # Empty header (required keys missing) and an activity with only a name.
    invalid_payload = {"header": {}, "activities": [{"name": "x"}]}
    bad_file = extracted / "bad.json"
    bad_file.write_text(json.dumps(invalid_payload), encoding="utf-8")

    res = bd.collect_activities(extracted, chunks, sources, load_schema())

    rejected_dir = extracted / "_rejected"
    assert res["files_rejected_schema"] == 1
    assert not bad_file.exists()
    assert (rejected_dir / "bad.json").exists()
    assert (rejected_dir / "bad.errors.txt").exists()
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# end-to-end rebuild + atomic swap
|
||||
# --------------------------------------------------------------------------
|
||||
def _setup_corpus(tmp_path):
    """Create a one-activity corpus under tmp_path.

    Writes one extraction file and the matching chunk text containing the
    activity's excerpt. Returns ``(extracted, chunks, sources)``; the sources
    directory path is returned but not created here.
    """
    extracted, chunks, sources = (
        tmp_path / name for name in ("extracted", "chunks", "sources")
    )
    excerpt = "jocul testului este o activitate de echipa"

    activity = _ext_activity(source_excerpt=excerpt)
    _write_extraction(extracted, "src01.part01", [activity])

    chunk_text = f"--- PAGE 1 ---\n{excerpt} in aer liber.\n"
    _write_chunk(chunks, "src01", "src01.part01", chunk_text)

    return extracted, chunks, sources
|
||||
|
||||
|
||||
def test_rebuild_creates_database(tmp_path):
    """A rebuild from the one-activity corpus produces a one-row database."""
    extracted, chunks, sources = _setup_corpus(tmp_path)
    db_path = tmp_path / "activities.db"

    report = bd.rebuild(
        extracted_dir=extracted,
        chunks_dir=chunks,
        sources_dir=sources,
        db_path=db_path,
    )

    assert db_path.exists()
    assert report["final_count"] == 1

    # Read the result back through the application's own database layer.
    rows = DatabaseManager(str(db_path)).search_activities()
    assert len(rows) == 1
    assert rows[0]["category"] == "team-building"
|
||||
|
||||
|
||||
def test_atomic_swap_keeps_live_db_intact_on_crash(tmp_path, monkeypatch):
    """A crash during the build must leave the live DB byte-identical.

    bulk_insert_activities is patched to raise, so rebuild() fails partway.
    The live file's bytes must be unchanged and no 'activities.db.tmp' may
    remain.
    """
    extracted, chunks, sources = _setup_corpus(tmp_path)
    db_path = tmp_path / "activities.db"

    # Seed a live database with a sentinel row and snapshot its bytes.
    existing = DatabaseManager(str(db_path))
    existing.insert_activity(_activity(name="Sentinel viu"))
    snapshot = db_path.read_bytes()

    def _explode(self, *args, **kwargs):
        raise RuntimeError("simulated mid-build crash")

    monkeypatch.setattr(DatabaseManager, "bulk_insert_activities", _explode)

    with pytest.raises(RuntimeError, match="simulated mid-build crash"):
        bd.rebuild(
            extracted_dir=extracted,
            chunks_dir=chunks,
            sources_dir=sources,
            db_path=db_path,
        )

    assert db_path.read_bytes() == snapshot
    leftover_tmp = tmp_path / "activities.db.tmp"
    assert not leftover_tmp.exists()
|
||||
|
||||
|
||||
def test_rebuild_backs_up_live_db(tmp_path):
    """Rebuilding over an existing DB reports an 'activities.db.bak' backup."""
    extracted, chunks, sources = _setup_corpus(tmp_path)
    db_path = tmp_path / "activities.db"

    # A live database must exist before the rebuild for a backup to be made.
    previous = DatabaseManager(str(db_path))
    previous.insert_activity(_activity(name="Vechi"))

    report = bd.rebuild(
        extracted_dir=extracted,
        chunks_dir=chunks,
        sources_dir=sources,
        db_path=db_path,
    )

    backup = report["backup"]
    assert backup is not None
    assert Path(backup).exists()
    assert os.path.basename(backup) == "activities.db.bak"
|
||||
183
tests/test_chunk_sources.py
Normal file
183
tests/test_chunk_sources.py
Normal file
@@ -0,0 +1,183 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""Tests for scripts/chunk_sources.py."""
|
||||
|
||||
import json
|
||||
|
||||
import chunk_sources as cs
|
||||
import normalize_sources as ns
|
||||
|
||||
|
||||
def _pages(n):
|
||||
return [(i, f"text-{i}") for i in range(1, n + 1)]
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# header parsing
|
||||
# --------------------------------------------------------------------------
|
||||
def test_parse_source_splits_header_and_body(paginated_source):
    """parse_source returns header fields and a body starting at page 1."""
    raw = paginated_source.read_text(encoding="utf-8")
    header, body = cs.parse_source(raw)
    assert header["FORMAT"] == "pdf"
    # Leading whitespace before the first page marker is tolerated.
    first_marker = "--- PAGE 1 ---"
    assert body.lstrip().startswith(first_marker)
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# page chunking
|
||||
# --------------------------------------------------------------------------
|
||||
def test_chunk_pages_basic_split():
    """50 pages at 20 per chunk with overlap 4: chunks start at 1, 17, ..."""
    chunks = cs.chunk_pages(_pages(50), pages_per_chunk=20, overlap=4)
    first, second, last = chunks[0], chunks[1], chunks[-1]
    # Stride is 20 - 4 = 16 pages.
    assert first["page_start"] == 1 and first["page_end"] == 20
    assert second["page_start"] == 17
    assert last["page_end"] == 50
|
||||
|
||||
|
||||
def test_chunk_pages_have_overlap():
    """Consecutive chunks share exactly `overlap` pages."""
    chunks = cs.chunk_pages(_pages(50), pages_per_chunk=20, overlap=4)
    first_end = chunks[0]["page_end"]
    second_start = chunks[1]["page_start"]
    # Both bounds are inclusive, hence the +1.
    shared_pages = first_end - second_start + 1
    assert shared_pages == 4
|
||||
|
||||
|
||||
def test_chunk_pages_short_document_single_chunk():
    """A document shorter than one chunk yields a single chunk covering it."""
    chunks = cs.chunk_pages(_pages(8), pages_per_chunk=20, overlap=4)
    assert len(chunks) == 1
    only = chunks[0]
    assert only["page_start"] == 1 and only["page_end"] == 8
|
||||
|
||||
|
||||
def test_chunk_pages_empty():
    """An empty page list produces an empty chunk list."""
    no_pages = []
    assert cs.chunk_pages(no_pages) == []
|
||||
|
||||
|
||||
def test_activity_at_page_boundary_intact_in_one_chunk(paginated_source):
    """The activity straddling pages 20/21 appears whole in some chunk."""
    raw = paginated_source.read_text(encoding="utf-8")
    chunks = cs.make_chunks(raw)

    # A chunk contains the whole activity only if it holds both sentinels.
    sentinels = ("ACTIVITY-START", "ACTIVITY-END")
    full = []
    for chunk in chunks:
        if sentinels[0] in chunk["text"] and sentinels[1] in chunk["text"]:
            full.append(chunk)

    assert full, "activity spanning a page boundary was split across all chunks"
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# word-window chunking for unpaginated text
|
||||
# --------------------------------------------------------------------------
|
||||
def test_chunk_words_window_and_overlap():
    """25,000 words with window 10,000 / overlap 2,000 yield 3 chunks."""
    words = [f"w{i}" for i in range(25_000)]
    chunks = cs.chunk_words(" ".join(words), window=10_000, overlap=2_000)

    # Stride is 10,000 - 2,000 = 8,000 words.
    assert len(chunks) == 3

    first_words = chunks[0]["text"].split()
    second_words = chunks[1]["text"].split()
    # The tail of chunk 1 must reappear as the head of chunk 2.
    assert first_words[8_000:10_000] == second_words[0:2_000]
|
||||
|
||||
|
||||
def test_make_chunks_unpaginated_uses_word_windows():
    """A body without page markers is chunked with 'words...' ranges."""
    header = "SOURCE: x\nFORMAT: txt\n" + "=" * 50 + "\n\n"
    body = "cuvant " * 15_000

    chunks = cs.make_chunks(header + body)

    assert len(chunks) >= 2
    first_range = chunks[0]["chunk_range"]
    assert first_range.startswith("words")
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# stable source ids — anti-collision
|
||||
# --------------------------------------------------------------------------
|
||||
def test_stable_id_same_stem_different_path_no_collision():
    """Files sharing a stem but not a path get distinct ids ending in the stem."""
    camp_id = ns.stable_id("camp/games/scout.pdf")
    school_id = ns.stable_id("school/lessons/scout.pdf")
    assert camp_id != school_id
    assert camp_id.endswith("_scout") and school_id.endswith("_scout")
|
||||
|
||||
|
||||
def test_stable_id_deterministic():
    """Calling stable_id twice on the same path gives the same id."""
    path = "a/b/c.pdf"
    first_call = ns.stable_id(path)
    second_call = ns.stable_id(path)
    assert first_call == second_call
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# manifest registry + idempotency
|
||||
# --------------------------------------------------------------------------
|
||||
def test_run_writes_chunks_and_manifest(paginated_source, tmp_path):
    """cs.run writes chunk files and a manifest with one entry per chunk.

    Every manifest entry must be in state 'pending', name '<key>.json' as its
    expected output, and point at a chunk file that exists relative to the
    parent of chunks_dir.
    """
    sources_dir = tmp_path / "sources"
    sources_dir.mkdir()
    (sources_dir / paginated_source.name).write_text(
        paginated_source.read_text(encoding="utf-8"), encoding="utf-8"
    )
    chunks_dir = tmp_path / "chunks"

    summary = cs.run(sources_dir, chunks_dir)
    assert summary["sources"] == 1
    assert summary["chunks"] >= 2

    # Read with an explicit encoding, like every other read in this file,
    # so the result does not depend on the platform's locale default.
    manifest = json.loads(
        (chunks_dir / "manifest.json").read_text(encoding="utf-8")
    )
    assert manifest["chunks"]
    for key, meta in manifest["chunks"].items():
        assert meta["state"] == "pending"
        assert meta["expected_json"] == f"{key}.json"
        assert (chunks_dir.parent / meta["chunk_file"]).exists()
|
||||
|
||||
|
||||
def test_manifest_idempotent_preserves_state(paginated_source, tmp_path):
    """Re-running cs.run on unchanged sources keeps manifest state.

    After one chunk is marked 'done' by hand, a second run must keep that
    state, keep the same number of chunks, and leave every entry in either
    'pending' or 'done'.
    """
    sources_dir = tmp_path / "sources"
    sources_dir.mkdir()
    (sources_dir / paginated_source.name).write_text(
        paginated_source.read_text(encoding="utf-8"), encoding="utf-8"
    )
    chunks_dir = tmp_path / "chunks"
    manifest_path = chunks_dir / "manifest.json"

    cs.run(sources_dir, chunks_dir)

    # Simulate the orchestrator marking one chunk done. Read and write use the
    # same explicit encoding so the round trip is locale-independent.
    manifest = json.loads(manifest_path.read_text(encoding="utf-8"))
    first_key = next(iter(manifest["chunks"]))
    n_before = len(manifest["chunks"])
    manifest["chunks"][first_key]["state"] = "done"
    manifest_path.write_text(json.dumps(manifest), encoding="utf-8")

    # Re-run: 'done' must survive, and no chunk may be added or lost.
    cs.run(sources_dir, chunks_dir)
    manifest2 = json.loads(manifest_path.read_text(encoding="utf-8"))
    assert len(manifest2["chunks"]) == n_before
    assert manifest2["chunks"][first_key]["state"] == "done"
    assert all(
        m["state"] in ("pending", "done") for m in manifest2["chunks"].values()
    )
|
||||
|
||||
|
||||
def test_manifest_resets_state_when_source_changes(paginated_source, tmp_path):
    """Changing a source's content resets its chunk state to 'pending'.

    One chunk is marked 'done', then the source file gets an extra page
    appended; after re-running cs.run that chunk must be 'pending' again.
    """
    sources_dir = tmp_path / "sources"
    sources_dir.mkdir()
    src = sources_dir / paginated_source.name
    src.write_text(paginated_source.read_text(encoding="utf-8"), encoding="utf-8")
    chunks_dir = tmp_path / "chunks"
    manifest_path = chunks_dir / "manifest.json"

    cs.run(sources_dir, chunks_dir)
    # Explicit encoding on the read matches the explicit encoding on the write.
    manifest = json.loads(manifest_path.read_text(encoding="utf-8"))
    first_key = next(iter(manifest["chunks"]))
    manifest["chunks"][first_key]["state"] = "done"
    manifest_path.write_text(json.dumps(manifest), encoding="utf-8")

    # Mutate the source content, which is expected to reset the chunk state.
    src.write_text(
        src.read_text(encoding="utf-8") + "\n--- PAGE 51 ---\nextra\n",
        encoding="utf-8",
    )
    cs.run(sources_dir, chunks_dir)
    manifest2 = json.loads(manifest_path.read_text(encoding="utf-8"))
    assert manifest2["chunks"][first_key]["state"] == "pending"
|
||||
|
||||
|
||||
def test_prune_stale_removes_orphan_entries(paginated_source, tmp_path):
    """Deleting a source makes the next run prune its manifest entries.

    After the only source is removed, cs.run must report zero chunks and at
    least one pruned entry, and leave an empty 'chunks' map in the manifest.
    """
    sources_dir = tmp_path / "sources"
    sources_dir.mkdir()
    src = sources_dir / paginated_source.name
    src.write_text(paginated_source.read_text(encoding="utf-8"), encoding="utf-8")
    chunks_dir = tmp_path / "chunks"

    cs.run(sources_dir, chunks_dir)

    # Remove the source so that its chunks become stale.
    src.unlink()
    summary = cs.run(sources_dir, chunks_dir)
    assert summary["chunks"] == 0
    assert summary["pruned"] >= 1

    # Explicit encoding keeps the read independent of the locale default.
    manifest = json.loads(
        (chunks_dir / "manifest.json").read_text(encoding="utf-8")
    )
    assert manifest["chunks"] == {}
|
||||
177
tests/test_extract_common.py
Normal file
177
tests/test_extract_common.py
Normal file
@@ -0,0 +1,177 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""Tests for scripts/extract_common.py."""
|
||||
|
||||
import shutil
|
||||
import zipfile
|
||||
|
||||
import pytest
|
||||
|
||||
import extract_common as ec
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# format detection
|
||||
# --------------------------------------------------------------------------
|
||||
def test_detect_format():
    """detect_format maps file extensions to format names.

    The upper-case '.PDF' case pins case-insensitive matching; '.xyz' pins
    the 'unknown' fallback.
    """
    expectations = [
        ("a/b/file.PDF", "pdf"),
        ("x.docx", "docx"),
        ("x.doc", "doc"),
        ("x.pptx", "pptx"),
        ("x.html", "html"),
        ("x.zip", "zip"),
        ("x.epub", "epub"),
        ("x.xyz", "unknown"),
    ]
    for filename, expected in expectations:
        assert ec.detect_format(filename) == expected
|
||||
|
||||
|
||||
def test_is_junk():
    """is_junk flags desktop.ini, .bak and README.md, but not a real PDF."""
    for junk_name in ("some/desktop.ini", "notes.bak", "README.md"):
        assert ec.is_junk(junk_name)
    real_source = "1000 Scout Games.pdf"
    assert not ec.is_junk(real_source)
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# PDF — the critical "no max_pages" regression
|
||||
# --------------------------------------------------------------------------
|
||||
def test_pdf_extracts_all_60_pages(big_pdf):
    """All 60 pages of the synthetic PDF are extracted, including the last."""
    extracted = ec.extract_pdf(big_pdf)
    # Regression guard: the old converter capped at 50 pages, so page 60
    # and its token must be present now.
    assert "--- PAGE 60 ---" in extracted
    assert "PDFMARK-60" in extracted
    marker_count = ec.count_page_markers(extracted)
    assert marker_count == 60
|
||||
|
||||
|
||||
def test_pdf_does_not_truncate_mid_document(big_pdf):
    """The last page returned by split_pages is numbered 60."""
    pages = ec.split_pages(ec.extract_pdf(big_pdf))
    last_page = pages[-1]
    # Element 0 of each entry is the page number.
    assert last_page[0] == 60
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# page join / split round-trip
|
||||
# --------------------------------------------------------------------------
|
||||
def test_join_split_round_trip():
    """split_pages(join_pages(texts)) recovers numbers 1..3 and the texts."""
    originals = ["alpha", "beta", "gamma"]
    pages = ec.split_pages(ec.join_pages(originals))

    numbers = [number for number, _text in pages]
    texts = [text for _number, text in pages]
    assert numbers == [1, 2, 3]
    assert texts == ["alpha", "beta", "gamma"]
|
||||
|
||||
|
||||
def test_split_pages_no_markers_returns_empty():
    """Text without any page marker splits into an empty list."""
    unmarked = "plain text with no markers"
    assert ec.split_pages(unmarked) == []
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# docx — synthetic page markers
|
||||
# --------------------------------------------------------------------------
|
||||
def test_docx_synthetic_page_markers(sample_docx):
    """The 100-paragraph docx yields 3 page markers and keeps its last paragraph."""
    extracted = ec.extract_docx(sample_docx)
    # 100 paragraphs / 40 per page => 3 pages.
    marker_count = ec.count_page_markers(extracted)
    assert marker_count == 3
    assert "Paragraf 99" in extracted
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# HTML mirror — nav/script/footer stripped
|
||||
# --------------------------------------------------------------------------
|
||||
def test_html_strips_chrome(html_with_nav):
    """extract_html keeps the <main> text and drops script/header/footer/nav text."""
    extracted = ec.extract_html(html_with_nav)

    # Real content from <h1> and the first <p> must survive.
    for kept in ("Vanatoarea de comori", "joc real de orientare"):
        assert kept in extracted

    # Text that only appears in <script>, <header>, <footer> and <nav>.
    chrome_fragments = (
        "tracking",
        "Site Banner Junk",
        "toate drepturile rezervate",
        "Games",
    )
    for fragment in chrome_fragments:
        assert fragment not in extracted
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# content hash + near-duplicate elimination
|
||||
# --------------------------------------------------------------------------
|
||||
def test_content_hash_ignores_whitespace():
    """A trailing newline does not change the hash; different words do."""
    reference = ec.content_hash("hello world")
    with_newline = ec.content_hash("hello world\n")
    different = ec.content_hash("goodbye world")
    assert reference == with_newline
    assert reference != different
|
||||
|
||||
|
||||
def test_dedupe_exact_duplicates():
    """Of two identical texts only the first key is kept; order is preserved."""
    entries = [
        ("a", "joc identic"),
        ("b", "joc identic"),
        ("c", "alt joc"),
    ]
    survivors = ec.dedupe_texts(entries)
    surviving_keys = [key for key, _text in survivors]
    assert surviving_keys == ["a", "c"]
|
||||
|
||||
|
||||
def test_dedupe_near_duplicates():
    """At threshold 85.0 the suffixed variant is dropped; unrelated text stays."""
    original_text = "Vanatoarea de comori este un joc de orientare pentru cercetasi in tabara."
    # Same text plus a short suffix; expected to score above the 85.0 threshold.
    printed_text = original_text + " Pagina printata."
    entries = [
        ("orig", original_text),
        ("print", printed_text),
        ("other", "Cu totul alt continut diferit aici."),
    ]

    survivors = ec.dedupe_texts(entries, threshold=85.0)
    surviving_keys = [key for key, _text in survivors]

    assert "orig" in surviving_keys
    assert "print" not in surviving_keys
    assert "other" in surviving_keys
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# zip recursion
|
||||
# --------------------------------------------------------------------------
|
||||
def test_zip_recurses_into_inner_files(sample_zip):
    """extract_zip output contains inner-file text and at least one page marker."""
    extracted = ec.extract_zip(sample_zip)
    assert "Paragraf 0" in extracted
    marker_count = ec.count_page_markers(extracted)
    assert marker_count > 0
|
||||
|
||||
|
||||
def test_zip_bad_archive_returns_empty(tmp_path):
    """A file that is not a valid zip archive yields an empty string."""
    broken_archive = tmp_path / "broken.zip"
    broken_archive.write_text("not a zip", encoding="utf-8")
    extracted = ec.extract_zip(broken_archive)
    assert extracted == ""
|
||||
|
||||
|
||||
def test_nested_zip(tmp_path, sample_zip):
    """Text from a zip stored inside another zip is still extracted."""
    wrapper = tmp_path / "outer.zip"
    with zipfile.ZipFile(wrapper, "w") as archive:
        archive.write(sample_zip, arcname="nested/archive.zip")

    # Extract only after the outer archive has been closed.
    extracted = ec.extract_zip(wrapper)
    assert "Paragraf 0" in extracted
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# preflight
|
||||
# --------------------------------------------------------------------------
|
||||
def test_preflight_python_packages_present():
    """preflight reports no missing Python packages."""
    # all required packages are installed in the test environment
    missing = ec.preflight()["missing_python"]
    assert missing == []
|
||||
|
||||
|
||||
def test_preflight_reports_libreoffice_state():
    """preflight warns about libreoffice exactly when neither binary is on PATH."""
    report = ec.preflight()
    libreoffice_found = bool(
        shutil.which("libreoffice") or shutil.which("soffice")
    )
    warned = any("libreoffice" in warning for warning in report["warnings"])

    if libreoffice_found:
        assert not warned
    else:
        assert warned
|
||||
|
||||
|
||||
def test_preflight_ocr_flag():
    """With check_ocr=True, an absent tesseract binary is listed in missing_system."""
    report = ec.preflight(check_ocr=True)
    if shutil.which("tesseract"):
        # tesseract is available: this test makes no assertion in that case.
        return
    missing_system = report["missing_system"]
    assert any("tesseract" in entry for entry in missing_system)
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# legacy .doc — skipped unless libreoffice is installed
|
||||
# --------------------------------------------------------------------------
|
||||
@pytest.mark.skipif(
    not shutil.which("libreoffice") and not shutil.which("soffice"),
    reason="libreoffice not installed",
)
def test_doc_conversion(tmp_path, sample_docx):
    """extract_doc yields at least one page marker for a file named *.doc."""
    legacy_path = tmp_path / "legacy.doc"
    # smoke test of the docx path: the fixture docx is copied under a .doc name
    shutil.copy(sample_docx, legacy_path)
    extracted = ec.extract_doc(legacy_path)
    assert ec.count_page_markers(extracted) >= 1
|
||||
|
||||
|
||||
def test_doc_without_libreoffice_raises(tmp_path, monkeypatch):
    """extract_doc raises RuntimeError when no converter binary can be located.

    ``which`` is replaced on the ``shutil`` object referenced by ``ec`` so that
    every lookup reports "not found"; monkeypatch restores it after the test.
    """
    # Accept any call signature (e.g. mode= / path= keywords) so the stub can
    # never fail with TypeError before the RuntimeError path is reached.
    monkeypatch.setattr(ec.shutil, "which", lambda *args, **kwargs: None)
    with pytest.raises(RuntimeError):
        ec.extract_doc(tmp_path / "whatever.doc")
|
||||
139
tests/test_fts.py
Normal file
139
tests/test_fts.py
Normal file
@@ -0,0 +1,139 @@
|
||||
"""
|
||||
Integration tests for the FTS5 search index.
|
||||
|
||||
Confirms that materials_list and skills_developed are indexed by FTS5 and kept
|
||||
in sync by the insert / update / delete triggers (plan §6, §7).
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
|
||||
import pytest
|
||||
|
||||
# Put the directory above this file's directory on sys.path, so that the
# imports below resolve no matter where pytest is started from.
PROJECT_ROOT = os.path.abspath(
    os.path.join(os.path.dirname(__file__), "..")
)
if PROJECT_ROOT not in sys.path:
    sys.path.insert(0, PROJECT_ROOT)
|
||||
|
||||
from app.models.activity import Activity # noqa: E402
|
||||
from app.models.database import DatabaseManager # noqa: E402
|
||||
|
||||
|
||||
@pytest.fixture
def db(tmp_path):
    """Provide a DatabaseManager whose SQLite file lives under tmp_path."""
    database_file = tmp_path / "test_activities.db"
    return DatabaseManager(str(database_file))
|
||||
|
||||
|
||||
def _make_activity(**overrides):
    """Build an Activity from default field values, with overrides applied on top."""
    fields = {
        "name": "Vânătoarea de comori",
        "description": "O activitate de echipă în aer liber.",
        "category": "camp-outdoor",
        "content_type": "joc",
        "source_file": "test.txt",
        "language": "ro",
    }
    fields.update(overrides)
    return Activity(**fields)
|
||||
|
||||
|
||||
def test_search_by_materials_list(db):
    """Searching a word present only in materials_list finds the activity."""
    db.insert_activity(
        _make_activity(materials_list="frânghie, eșarfă, busolă")
    )

    hits = db.search_activities(search_text="busolă")
    assert len(hits) == 1
    first_hit = hits[0]
    assert first_hit["name"] == "Vânătoarea de comori"
|
||||
|
||||
|
||||
def test_search_by_skills_developed(db):
    """Searching a word present only in skills_developed finds the activity."""
    db.insert_activity(
        _make_activity(skills_developed="comunicare, leadership, rabdare")
    )

    hits = db.search_activities(search_text="leadership")
    assert len(hits) == 1
    first_hit = hits[0]
    assert first_hit["name"] == "Vânătoarea de comori"
|
||||
|
||||
|
||||
def test_term_absent_from_indexed_columns_no_hit(db):
    """Control case: a word that occurs nowhere returns an empty result list."""
    stored = _make_activity(materials_list="frânghie")
    db.insert_activity(stored)
    hits = db.search_activities(search_text="zzzunlikelyterm")
    assert hits == []
|
||||
|
||||
|
||||
def test_delete_trigger_removes_from_fts(db):
    """After a raw DELETE on activities, the row is no longer returned by search."""
    row_id = db.insert_activity(_make_activity(materials_list="catalige"))
    assert len(db.search_activities(search_text="catalige")) == 1

    delete_sql = "DELETE FROM activities WHERE id = ?"
    with db._get_connection() as conn:
        conn.execute(delete_sql, (row_id,))
        conn.commit()

    remaining = db.search_activities(search_text="catalige")
    assert remaining == []
|
||||
|
||||
|
||||
def test_update_trigger_resyncs_fts(db):
    """After a raw UPDATE of materials_list, search matches the new value only."""
    row_id = db.insert_activity(_make_activity(materials_list="creioane"))
    assert len(db.search_activities(search_text="creioane")) == 1

    update_sql = "UPDATE activities SET materials_list = ? WHERE id = ?"
    with db._get_connection() as conn:
        conn.execute(update_sql, ("acuarele", row_id))
        conn.commit()

    # The previous term no longer matches; the replacement term does.
    stale_hits = db.search_activities(search_text="creioane")
    assert stale_hits == []
    fresh_hits = db.search_activities(search_text="acuarele")
    assert len(fresh_hits) == 1
|
||||
|
||||
|
||||
def test_rebuild_fts_index(db):
    """A skills_developed term is still searchable after rebuild_fts_index()."""
    stored = _make_activity(skills_developed="orientare")
    db.insert_activity(stored)
    db.rebuild_fts_index()
    hits = db.search_activities(search_text="orientare")
    assert len(hits) == 1
|
||||
|
||||
|
||||
def test_new_schema_columns_round_trip(db):
    """The newer activity columns are stored and read back, also via from_dict."""
    source_files = ["a.txt", "b.txt"]
    stored = _make_activity(
        source_files=["a.txt", "b.txt"],
        source_excerpt="Citat scurt din sursă.",
        extraction_confidence="high",
        needs_review=1,
        normalized_name="vanatoarea de comori",
    )
    row = db.get_activity_by_id(db.insert_activity(stored))

    expected_scalars = (
        ("content_type", "joc"),
        ("language", "ro"),
        ("extraction_confidence", "high"),
        ("needs_review", 1),
        ("normalized_name", "vanatoarea de comori"),
    )
    for column, expected in expected_scalars:
        assert row[column] == expected
    # The row holds source_files as a JSON-encoded string.
    assert json.loads(row["source_files"]) == source_files
    assert row["source_excerpt"] == "Citat scurt din sursă."

    reloaded = Activity.from_dict(row)
    assert reloaded.source_files == source_files
    assert reloaded.content_type == "joc"
|
||||
|
||||
|
||||
def test_normalized_name_auto_derived(db):
    """An Activity built without normalized_name gets one derived from its name."""
    fields = {
        "name": "Ștafetă cu Obstacole",
        "description": "desc",
        "category": "sports-active",
        "source_file": "t.txt",
    }
    built = Activity(**fields)
    assert built.normalized_name == "stafeta cu obstacole"
|
||||
140
tests/test_search.py
Normal file
140
tests/test_search.py
Normal file
@@ -0,0 +1,140 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
CRITICAL REGRESSION TEST (plan §6, §7).
|
||||
|
||||
`search.py` changed the result sets of /search and /api/search: the default
|
||||
search now EXCLUDES the non-game content types (rețetă / cântec / ceremonie),
|
||||
which surface only when the user explicitly filters that content_type or picks
|
||||
a non-game category. This test guards that behaviour.
|
||||
"""
|
||||
|
||||
import pytest
|
||||
|
||||
from app.models.activity import Activity
|
||||
from app.models.database import DatabaseManager
|
||||
from app.services.search import SearchService
|
||||
from app.config_taxonomy import NON_GAME_CONTENT_TYPES
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# fixtures
|
||||
# --------------------------------------------------------------------------
|
||||
def _activity(name, content_type, category="altele", language="ro"):
    """Build a fixture Activity whose description embeds its name and content_type."""
    description = f"Descriere pentru {name}, un conținut de tip {content_type}."
    fields = {
        "name": name,
        "description": description,
        "category": category,
        "content_type": content_type,
        "language": language,
        "source_file": "test/fixture.txt",
    }
    return Activity(**fields)
|
||||
|
||||
|
||||
@pytest.fixture
def search_service(tmp_path):
    """Provide a SearchService over a temp DB seeded with six fixture activities."""
    db = DatabaseManager(str(tmp_path / "activities.db"))
    db.clear_database()

    # (name, content_type, category, language)
    seed_rows = [
        ("Vanatoarea de comori", "joc", "wide-games", "ro"),
        ("Cercul de cunoastere", "activitate", "icebreakers", "ro"),
        ("Reteta de paine la ceaun", "reteta", "retete", "ro"),
        ("Cantecul de tabara", "cantec", "cantece-ceremonii", "ro"),
        ("Ceremonia de inchidere", "ceremonie", "cantece-ceremonii", "ro"),
        ("Game in English", "joc", "wide-games", "en"),
    ]
    db.bulk_insert_activities([
        _activity(name, content_type, category=category, language=language)
        for name, content_type, category, language in seed_rows
    ])
    return SearchService(db)
|
||||
|
||||
|
||||
def _content_types(results):
    """Return the set of content_type values found across the result rows."""
    seen = set()
    for row in results:
        seen.add(row.get("content_type"))
    return seen
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# the regression: default search excludes non-game content types
|
||||
# --------------------------------------------------------------------------
|
||||
def test_default_search_excludes_non_game_content(search_service):
    """With no filters, recipes / songs / ceremonies must NOT appear."""
    found_types = _content_types(search_service.search_activities())

    assert found_types, "default search returned nothing"
    for non_game in NON_GAME_CONTENT_TYPES:
        assert non_game not in found_types, (
            f"default search leaked non-game content_type '{non_game}'"
        )
    # game content is still present
    for game_type in ("joc", "activitate"):
        assert game_type in found_types
|
||||
|
||||
|
||||
def test_default_search_with_text_excludes_non_game(search_service):
    """A text query still excludes every non-game content type by default.

    Each entry of NON_GAME_CONTENT_TYPES is checked, not only the first one,
    and the collection is only iterated, never indexed.
    """
    results = search_service.search_activities(search_text="conținut")
    types = _content_types(results)
    for non_game in NON_GAME_CONTENT_TYPES:
        assert non_game not in types, (
            f"text search leaked non-game content_type '{non_game}'"
        )
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# explicit content_type filter INCLUDES the non-game rows
|
||||
# --------------------------------------------------------------------------
|
||||
def test_explicit_content_type_filter_includes_non_game(search_service):
    """Filtering on content_type=reteta returns exactly the one recipe row."""
    recipe_filter = {"content_type": "reteta"}
    results = search_service.search_activities(filters=recipe_filter)
    types = _content_types(results)

    assert types == {"reteta"}, f"expected only rețete, got {types}"
    assert len(results) == 1
|
||||
|
||||
|
||||
def test_explicit_content_type_filter_for_cantec(search_service):
    """Filtering on content_type=cantec returns rows of that type only."""
    song_filter = {"content_type": "cantec"}
    found_types = _content_types(
        search_service.search_activities(filters=song_filter)
    )
    assert found_types == {"cantec"}
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# a non-game CATEGORY filter also lifts the exclusion
|
||||
# --------------------------------------------------------------------------
|
||||
def test_non_game_category_filter_includes_non_game(search_service):
    """Choosing category=cantece-ceremonii surfaces both songs and ceremonies."""
    category_filter = {"category": "cantece-ceremonii"}
    found_types = _content_types(
        search_service.search_activities(filters=category_filter)
    )

    for expected_type in ("cantec", "ceremonie"):
        assert expected_type in found_types
|
||||
|
||||
|
||||
def test_game_category_filter_still_excludes_non_game(search_service):
    """Filtering on the wide-games category returns no non-game content types."""
    game_filter = {"category": "wide-games"}
    found_types = _content_types(
        search_service.search_activities(filters=game_filter)
    )
    for non_game in NON_GAME_CONTENT_TYPES:
        assert non_game not in found_types
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# language filter
|
||||
# --------------------------------------------------------------------------
|
||||
def test_language_filter_ro(search_service):
    """language=ro returns a non-empty result containing only 'ro' rows."""
    results = search_service.search_activities(filters={"language": "ro"})
    assert results
    languages = [row.get("language") for row in results]
    assert all(language == "ro" for language in languages)
|
||||
|
||||
|
||||
def test_language_filter_en(search_service):
    """language=en returns only the English fixture row."""
    results = search_service.search_activities(filters={"language": "en"})
    assert results
    languages = [row.get("language") for row in results]
    assert all(language == "en" for language in languages)
    names = {row.get("name") for row in results}
    assert names == {"Game in English"}
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# get_filter_options surfaces the new axes
|
||||
# --------------------------------------------------------------------------
|
||||
def test_filter_options_include_content_type_and_language(search_service):
    """get_filter_options exposes the content_type and language axes."""
    options = search_service.db.get_filter_options()
    for axis in ("content_type", "language"):
        assert axis in options
    assert "joc" in options["content_type"]
    available_languages = set(options["language"])
    assert available_languages == {"ro", "en"}
|
||||
156
tests/test_validate_extractions.py
Normal file
156
tests/test_validate_extractions.py
Normal file
@@ -0,0 +1,156 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Tests for scripts/validate_extractions.py.
|
||||
|
||||
Covers: schema rejection, the source_excerpt hallucination check, the content
|
||||
of the generated re-extraction prompt, and the manifest `rejected` marking.
|
||||
"""
|
||||
|
||||
import json
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# Two levels up from this file is the repository root; scripts/ sits under it.
# Both directories are put on sys.path so the import below can be resolved.
_THIS_DIR = Path(__file__).resolve().parent
REPO_ROOT = _THIS_DIR.parent
SCRIPTS_DIR = REPO_ROOT / "scripts"
for _p in map(str, (REPO_ROOT, SCRIPTS_DIR)):
    if _p in sys.path:
        continue
    sys.path.insert(0, _p)
|
||||
|
||||
import validate_extractions as ve # noqa: E402
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# helpers
|
||||
# --------------------------------------------------------------------------
|
||||
def _ext_activity(**over):
    """Return an extracted-activity dict of default fields, updated with `over`."""
    defaults = {
        "name": "Jocul testului",
        "description": "O activitate de echipa in aer liber.",
        "category": "team-building",
        "content_type": "joc",
        "language": "ro",
        "extraction_confidence": "high",
        "source_excerpt": "ancora din bucata sursa",
        "page_reference": "page 1",
    }
    return {**defaults, **over}
|
||||
|
||||
|
||||
def _write_extraction(extracted_dir, chunk_key, activities, header_extra=None):
    """Write `<chunk_key>.json` (header + activities) into extracted_dir.

    The directory is created when missing. Entries of `header_extra`, when
    given, are applied on top of the default header fields.
    """
    extracted_dir.mkdir(parents=True, exist_ok=True)

    header = {
        "source_hash": "hash1234deadbeef",
        "schema_version": "1.0",
        "prompt_version": "1.0",
        "chunk_range": "pages 1-20",
        "source_id": "src01",
        "chunk_key": chunk_key,
    }
    header.update(header_extra or {})

    target = extracted_dir / f"{chunk_key}.json"
    serialized = json.dumps(
        {"header": header, "activities": activities}, ensure_ascii=False
    )
    target.write_text(serialized, encoding="utf-8")
|
||||
|
||||
|
||||
def _write_chunk(chunks_dir, source_id, chunk_key, text):
    """Write `text` to `<chunks_dir>/<source_id>/<chunk_key>.txt`, creating dirs."""
    source_dir = chunks_dir / source_id
    source_dir.mkdir(parents=True, exist_ok=True)
    chunk_file = source_dir / f"{chunk_key}.txt"
    chunk_file.write_text(text, encoding="utf-8")
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# tests
|
||||
# --------------------------------------------------------------------------
|
||||
def test_valid_file_passes(tmp_path):
    """An extraction whose excerpt appears in its chunk counts as valid."""
    extracted = tmp_path / "extracted"
    chunks = tmp_path / "chunks"
    excerpt = "ancora din bucata sursa apare aici"
    activity = _ext_activity(source_excerpt=excerpt)
    _write_extraction(extracted, "src01.part01", [activity])
    _write_chunk(chunks, "src01", "src01.part01", f"--- PAGE 1 ---\n{excerpt}\n")

    report = ve.run(extracted, chunks, tmp_path / "manifest.json")

    assert report["valid"] == 1
    assert report["rejected"] == 0
|
||||
|
||||
|
||||
def test_schema_invalid_file_rejected(tmp_path):
    """A file with an empty header is rejected and a re-extraction prompt is written."""
    extracted = tmp_path / "extracted"
    chunks = tmp_path / "chunks"
    extracted.mkdir(parents=True)

    invalid_payload = {"header": {}, "activities": [{"name": "x"}]}
    invalid_file = extracted / "src01.part01.json"
    invalid_file.write_text(json.dumps(invalid_payload), encoding="utf-8")

    report = ve.run(extracted, chunks, tmp_path / "manifest.json")

    assert report["rejected"] == 1
    prompt_file = extracted / "_reextract" / "src01.part01.prompt.md"
    assert prompt_file.exists()
|
||||
|
||||
|
||||
def test_hallucinated_excerpt_rejected(tmp_path):
    """An excerpt absent from the chunk text is rejected with a hallucination error."""
    extracted = tmp_path / "extracted"
    chunks = tmp_path / "chunks"
    invented = _ext_activity(
        source_excerpt="citat complet inventat care nu exista qqqq"
    )
    _write_extraction(extracted, "src01.part01", [invented])
    chunk_text = "--- PAGE 1 ---\ntext complet diferit despre altceva.\n"
    _write_chunk(chunks, "src01", "src01.part01", chunk_text)

    report = ve.run(extracted, chunks, tmp_path / "manifest.json")

    assert report["rejected"] == 1
    first_rejection = report["rejected_chunks"][0]
    assert any("hallucination" in error for error in first_rejection["errors"])
|
||||
|
||||
|
||||
def test_reextraction_prompt_content(tmp_path):
    """The generated re-extraction prompt contains the expected fragments."""
    extracted = tmp_path / "extracted"
    chunks = tmp_path / "chunks"
    invented = _ext_activity(source_excerpt="citat inventat care nu exista zzzz")
    _write_extraction(extracted, "src01.part01", [invented])
    chunk_text = "--- PAGE 1 ---\ntext despre cu totul altceva aici.\n"
    _write_chunk(chunks, "src01", "src01.part01", chunk_text)

    ve.run(extracted, chunks, tmp_path / "manifest.json")

    prompt_file = extracted / "_reextract" / "src01.part01.prompt.md"
    prompt = prompt_file.read_text(encoding="utf-8")
    expected_fragments = (
        "src01.part01",
        "REJECTED",
        "verbatim",
        "data/extracted/src01.part01.json",
    )
    for fragment in expected_fragments:
        assert fragment in prompt
|
||||
|
||||
|
||||
def test_manifest_marks_chunk_rejected(tmp_path):
    """A rejected chunk has its manifest state changed from 'done' to 'rejected'."""
    extracted = tmp_path / "extracted"
    chunks = tmp_path / "chunks"
    manifest_path = tmp_path / "manifest.json"

    initial_manifest = {
        "chunks": {
            "src01.part01": {
                "state": "done",
                "chunk_file": "chunks/src01/src01.part01.txt",
            }
        }
    }
    manifest_path.write_text(json.dumps(initial_manifest), encoding="utf-8")

    fabricated = _ext_activity(source_excerpt="citat fabricat absent vvvv")
    _write_extraction(extracted, "src01.part01", [fabricated])
    _write_chunk(chunks, "src01", "src01.part01",
                 "--- PAGE 1 ---\nun continut neinrudit.\n")

    ve.run(extracted, chunks, manifest_path)

    updated = json.loads(manifest_path.read_text(encoding="utf-8"))
    chunk_entry = updated["chunks"]["src01.part01"]
    assert chunk_entry["state"] == "rejected"
|
||||
|
||||
|
||||
def test_build_reextraction_prompt_lists_errors():
    """The built prompt mentions the chunk key and the text of the given error."""
    errors = ["header: 'source_hash' is a required property"]
    prompt = ve.build_reextraction_prompt(
        "abc.part03", "data/chunks/abc/abc.part03.txt", errors
    )
    for fragment in ("abc.part03", "source_hash"):
        assert fragment in prompt
|
||||
Reference in New Issue
Block a user