Implements the approved plan to replace the broken regex/index-master extraction with an LLM-subagent pipeline. Four parallel lanes: Lane A — scripts/extract_common.py (PDF/docx/doc/pptx/html/zip, no max_pages truncation), normalize_sources.py, chunk_sources.py (~20pg chunks + overlap, manifest registry), activity_schema.json. Lane B — app/config_taxonomy.py (16 fixed category slugs), schema rebuilt from scratch in app/models/ with content_type, language, source_files, source_excerpt, normalized_name, extraction_confidence, needs_review; FTS5 + 3 triggers extended with materials_list and skills_developed. Lane C — build_database.py (--rebuild, atomic swap, schema + fuzzy source_excerpt validation, dedup with needs_review band), validate_extractions.py, review_queue.py, new run_extraction.py orchestrator, SUBAGENT_PROMPT.md. Lane D — search.py content_type/language filters (default search excludes non-game content), E7 schema-compat audit; fixed a NULL keywords AttributeError in _boost_search_relevance. Removes 8 orphaned/dead scripts and app/services/parser.py + indexer.py. Adds tests/ (70 passing, 1 skipped — libreoffice absent). Note: Lane D made one additive edit to app/models/database.py (_update_category_counts) to surface content_type/language in get_filter_options, outside its nominal lane boundary but after Lane B completed. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
184 lines
6.8 KiB
Python
184 lines
6.8 KiB
Python
# -*- coding: utf-8 -*-
|
|
"""Tests for scripts/chunk_sources.py."""
|
|
|
|
import json
|
|
|
|
import chunk_sources as cs
|
|
import normalize_sources as ns
|
|
|
|
|
|
def _pages(n):
|
|
return [(i, f"text-{i}") for i in range(1, n + 1)]
|
|
|
|
|
|
# --------------------------------------------------------------------------
|
|
# header parsing
|
|
# --------------------------------------------------------------------------
|
|
def test_parse_source_splits_header_and_body(paginated_source):
    """parse_source yields a header exposing FORMAT and a body starting at page 1."""
    text = paginated_source.read_text(encoding="utf-8")
    header, body = cs.parse_source(text)
    assert header["FORMAT"] == "pdf"
    # Leading whitespace before the first page marker is tolerated.
    assert body.lstrip().startswith("--- PAGE 1 ---")
|
|
|
|
|
|
# --------------------------------------------------------------------------
|
|
# page chunking
|
|
# --------------------------------------------------------------------------
|
|
def test_chunk_pages_basic_split():
    """50 pages at 20 pages/chunk with overlap 4: check first two starts and final end."""
    chunks = cs.chunk_pages(_pages(50), pages_per_chunk=20, overlap=4)
    # stride 16: starts at pages 1, 17, 33, ...
    assert chunks[0]["page_start"] == 1 and chunks[0]["page_end"] == 20
    assert chunks[1]["page_start"] == 17
    # The last chunk must reach the final page of the document.
    assert chunks[-1]["page_end"] == 50
|
|
|
|
|
|
def test_chunk_pages_have_overlap():
    """The first two chunks share exactly ``overlap`` pages."""
    chunks = cs.chunk_pages(_pages(50), pages_per_chunk=20, overlap=4)
    # Inclusive page ranges, hence the +1 when counting shared pages.
    overlap = chunks[0]["page_end"] - chunks[1]["page_start"] + 1
    assert overlap == 4
|
|
|
|
|
|
def test_chunk_pages_short_document_single_chunk():
    """A document shorter than one chunk yields a single chunk covering every page."""
    chunks = cs.chunk_pages(_pages(8), pages_per_chunk=20, overlap=4)
    assert len(chunks) == 1
    assert chunks[0]["page_start"] == 1 and chunks[0]["page_end"] == 8
|
|
|
|
|
|
def test_chunk_pages_empty():
    """An empty page list produces no chunks."""
    assert cs.chunk_pages([]) == []
|
|
|
|
|
|
def test_activity_at_page_boundary_intact_in_one_chunk(paginated_source):
    """An activity straddling the page 20/21 boundary must appear whole in >=1 chunk."""
    text = paginated_source.read_text(encoding="utf-8")
    chunks = cs.make_chunks(text)
    # A chunk holds the whole activity only if it contains both markers.
    full = [
        c for c in chunks
        if "ACTIVITY-START" in c["text"] and "ACTIVITY-END" in c["text"]
    ]
    assert full, "activity spanning a page boundary was split across all chunks"
|
|
|
|
|
|
# --------------------------------------------------------------------------
|
|
# word-window chunking for unpaginated text
|
|
# --------------------------------------------------------------------------
|
|
def test_chunk_words_window_and_overlap():
    """25,000 words at window 10,000 / overlap 2,000 give 3 chunks sharing 2,000 words."""
    text = " ".join(f"w{i}" for i in range(25_000))
    chunks = cs.chunk_words(text, window=10_000, overlap=2_000)
    assert len(chunks) == 3  # stride 8000 over 25000 words
    first = chunks[0]["text"].split()
    second = chunks[1]["text"].split()
    # The tail of the first window must equal the head of the second.
    assert first[8_000:10_000] == second[0:2_000]  # 2000-word overlap
|
|
|
|
|
|
def test_make_chunks_unpaginated_uses_word_windows():
    """A body without page markers is split into chunks whose range is word-based."""
    body = "cuvant " * 15_000
    # Header lines, a 50-character "=" rule, a blank line, then the body.
    text = "SOURCE: x\nFORMAT: txt\n" + "=" * 50 + "\n\n" + body
    chunks = cs.make_chunks(text)
    assert len(chunks) >= 2
    assert chunks[0]["chunk_range"].startswith("words")
|
|
|
|
|
|
# --------------------------------------------------------------------------
|
|
# stable source ids — anti-collision
|
|
# --------------------------------------------------------------------------
|
|
def test_stable_id_same_stem_different_path_no_collision():
    """Files with the same stem in different directories get distinct ids ending in the stem."""
    a = ns.stable_id("camp/games/scout.pdf")
    b = ns.stable_id("school/lessons/scout.pdf")
    assert a != b
    # Both ids still end with the shared "_scout" suffix.
    assert a.endswith("_scout") and b.endswith("_scout")
|
|
|
|
|
|
def test_stable_id_deterministic():
    """The same path always maps to the same id."""
    assert ns.stable_id("a/b/c.pdf") == ns.stable_id("a/b/c.pdf")
|
|
|
|
|
|
# --------------------------------------------------------------------------
|
|
# manifest registry + idempotency
|
|
# --------------------------------------------------------------------------
|
|
def test_run_writes_chunks_and_manifest(paginated_source, tmp_path):
    """run() reports one source, writes >=2 chunks and registers each in manifest.json."""
    sources_dir = tmp_path / "sources"
    sources_dir.mkdir()
    (sources_dir / paginated_source.name).write_text(
        paginated_source.read_text(encoding="utf-8"), encoding="utf-8"
    )
    chunks_dir = tmp_path / "chunks"

    summary = cs.run(sources_dir, chunks_dir)
    assert summary["sources"] == 1
    assert summary["chunks"] >= 2

    # Read with an explicit encoding, like every other file access in this
    # module, so the result does not depend on the platform locale.
    manifest = json.loads(
        (chunks_dir / "manifest.json").read_text(encoding="utf-8")
    )
    assert manifest["chunks"]
    for key, meta in manifest["chunks"].items():
        assert meta["state"] == "pending"
        assert meta["expected_json"] == f"{key}.json"
        # chunk_file is resolved against the parent of chunks_dir.
        assert (chunks_dir.parent / meta["chunk_file"]).exists()
|
|
|
|
|
|
def test_manifest_idempotent_preserves_state(paginated_source, tmp_path):
    """Re-running run() on unchanged sources keeps chunk count and a 'done' state."""
    sources_dir = tmp_path / "sources"
    sources_dir.mkdir()
    (sources_dir / paginated_source.name).write_text(
        paginated_source.read_text(encoding="utf-8"), encoding="utf-8"
    )
    chunks_dir = tmp_path / "chunks"
    manifest_path = chunks_dir / "manifest.json"

    cs.run(sources_dir, chunks_dir)

    # orchestrator marks one chunk done
    # (read with the same explicit encoding used for the write below)
    manifest = json.loads(manifest_path.read_text(encoding="utf-8"))
    first_key = next(iter(manifest["chunks"]))
    n_before = len(manifest["chunks"])
    manifest["chunks"][first_key]["state"] = "done"
    manifest_path.write_text(json.dumps(manifest), encoding="utf-8")

    # re-run: 'done' must survive, no chunk added or lost
    cs.run(sources_dir, chunks_dir)
    manifest2 = json.loads(manifest_path.read_text(encoding="utf-8"))
    assert len(manifest2["chunks"]) == n_before
    assert manifest2["chunks"][first_key]["state"] == "done"
    assert all(
        m["state"] in ("pending", "done") for m in manifest2["chunks"].values()
    )
|
|
|
|
|
|
def test_manifest_resets_state_when_source_changes(paginated_source, tmp_path):
    """A chunk marked 'done' returns to 'pending' after its source file is modified."""
    sources_dir = tmp_path / "sources"
    sources_dir.mkdir()
    src = sources_dir / paginated_source.name
    src.write_text(paginated_source.read_text(encoding="utf-8"), encoding="utf-8")
    chunks_dir = tmp_path / "chunks"
    manifest_path = chunks_dir / "manifest.json"

    cs.run(sources_dir, chunks_dir)
    # Explicit encoding on reads matches the UTF-8 write below.
    manifest = json.loads(manifest_path.read_text(encoding="utf-8"))
    first_key = next(iter(manifest["chunks"]))
    manifest["chunks"][first_key]["state"] = "done"
    manifest_path.write_text(json.dumps(manifest), encoding="utf-8")

    # mutate the source content -> hash changes -> state resets
    src.write_text(
        src.read_text(encoding="utf-8") + "\n--- PAGE 51 ---\nextra\n",
        encoding="utf-8",
    )
    cs.run(sources_dir, chunks_dir)
    manifest2 = json.loads(manifest_path.read_text(encoding="utf-8"))
    assert manifest2["chunks"][first_key]["state"] == "pending"
|
|
|
|
|
|
def test_prune_stale_removes_orphan_entries(paginated_source, tmp_path):
    """After the only source is deleted, run() prunes its entries and empties the manifest."""
    sources_dir = tmp_path / "sources"
    sources_dir.mkdir()
    src = sources_dir / paginated_source.name
    src.write_text(paginated_source.read_text(encoding="utf-8"), encoding="utf-8")
    chunks_dir = tmp_path / "chunks"

    cs.run(sources_dir, chunks_dir)
    # delete the source -> its chunks become stale
    src.unlink()
    summary = cs.run(sources_dir, chunks_dir)
    assert summary["chunks"] == 0
    assert summary["pruned"] >= 1
    # Explicit encoding keeps the read independent of the platform locale.
    manifest = json.loads(
        (chunks_dir / "manifest.json").read_text(encoding="utf-8")
    )
    assert manifest["chunks"] == {}
|