Files
game-library/tests/test_chunk_sources.py
Claude Agent 66ae831c36 Rebuild extraction pipeline infrastructure (Faza 0 prep)
Implements the approved plan to replace the broken regex/index-master
extraction with an LLM-subagent pipeline. Four parallel lanes:

Lane A — scripts/extract_common.py (PDF/docx/doc/pptx/html/zip, no
  max_pages truncation), normalize_sources.py, chunk_sources.py
  (~20pg chunks + overlap, manifest registry), activity_schema.json.
Lane B — app/config_taxonomy.py (16 fixed category slugs), schema
  rebuilt from scratch in app/models/ with content_type, language,
  source_files, source_excerpt, normalized_name, extraction_confidence,
  needs_review; FTS5 + 3 triggers extended with materials_list and
  skills_developed.
Lane C — build_database.py (--rebuild, atomic swap, schema + fuzzy
  source_excerpt validation, dedup with needs_review band),
  validate_extractions.py, review_queue.py, new run_extraction.py
  orchestrator, SUBAGENT_PROMPT.md.
Lane D — search.py content_type/language filters (default search
  excludes non-game content), E7 schema-compat audit; fixed a NULL
  keywords AttributeError in _boost_search_relevance.

Removes 8 orphaned/dead scripts and app/services/parser.py +
indexer.py. Adds tests/ (70 passing, 1 skipped — libreoffice absent).

Note: Lane D made one additive edit to app/models/database.py
(_update_category_counts) to surface content_type/language in
get_filter_options, outside its nominal lane boundary but after
Lane B completed.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-19 17:43:38 +00:00

184 lines
6.8 KiB
Python

# -*- coding: utf-8 -*-
"""Tests for scripts/chunk_sources.py."""
import json
import chunk_sources as cs
import normalize_sources as ns
def _pages(n):
    """Build ``n`` synthetic ``(page_number, text)`` pairs, numbered from 1.

    The text of page ``k`` is the string ``"text-k"``; ``n <= 0`` yields an
    empty list.
    """
    pages = []
    for i in range(1, n + 1):
        pages.append((i, f"text-{i}"))
    return pages
# --------------------------------------------------------------------------
# header parsing
# --------------------------------------------------------------------------
def test_parse_source_splits_header_and_body(paginated_source):
    """``parse_source`` must return a (header, body) pair for the fixture.

    The header is expected to expose ``FORMAT == "pdf"`` and the body must
    begin (ignoring leading whitespace) with the first page marker.
    """
    raw = paginated_source.read_text(encoding="utf-8")
    parsed = cs.parse_source(raw)
    header, body = parsed

    declared_format = header["FORMAT"]
    assert declared_format == "pdf"

    trimmed_body = body.lstrip()
    assert trimmed_body.startswith("--- PAGE 1 ---")
# --------------------------------------------------------------------------
# page chunking
# --------------------------------------------------------------------------
def test_chunk_pages_basic_split():
    """Check chunk bounds for 50 pages, 20 pages per chunk, overlap of 4."""
    pages = _pages(50)
    chunks = cs.chunk_pages(pages, pages_per_chunk=20, overlap=4)

    # stride 16: starts at pages 1, 17, 33, ...
    first = chunks[0]
    assert first["page_start"] == 1
    assert first["page_end"] == 20

    second = chunks[1]
    assert second["page_start"] == 17

    # The final chunk must reach the last page of the document.
    last = chunks[-1]
    assert last["page_end"] == 50
def test_chunk_pages_have_overlap():
    """Consecutive page chunks must share exactly ``overlap`` (4) pages."""
    chunks = cs.chunk_pages(_pages(50), pages_per_chunk=20, overlap=4)

    previous_end = chunks[0]["page_end"]
    next_start = chunks[1]["page_start"]

    # Inclusive page count shared by the first two chunks.
    shared_pages = previous_end - next_start + 1
    assert shared_pages == 4
def test_chunk_pages_short_document_single_chunk():
    """A document shorter than one chunk (8 pages) must stay in one chunk."""
    pages = _pages(8)
    chunks = cs.chunk_pages(pages, pages_per_chunk=20, overlap=4)

    assert len(chunks) == 1

    only_chunk = chunks[0]
    assert only_chunk["page_start"] == 1
    assert only_chunk["page_end"] == 8
def test_chunk_pages_empty():
    """Chunking an empty page list must produce an empty list."""
    no_pages = []
    result = cs.chunk_pages(no_pages)
    assert result == []
def test_activity_at_page_boundary_intact_in_one_chunk(paginated_source):
    """The fixture's marked activity must survive whole in at least one chunk.

    The activity is intended to straddle the page 20/21 boundary; a chunk
    counts as containing it whole when both the ``ACTIVITY-START`` and
    ``ACTIVITY-END`` markers occur in that chunk's text.
    """

    def _holds_whole_activity(chunk):
        # Both delimiters present in the same chunk text.
        return "ACTIVITY-START" in chunk["text"] and "ACTIVITY-END" in chunk["text"]

    source_text = paginated_source.read_text(encoding="utf-8")
    chunks = cs.make_chunks(source_text)

    full = list(filter(_holds_whole_activity, chunks))
    assert full, "activity spanning a page boundary was split across all chunks"
# --------------------------------------------------------------------------
# word-window chunking for unpaginated text
# --------------------------------------------------------------------------
def test_chunk_words_window_and_overlap():
    """25 000 words, window 10 000, overlap 2 000: 3 chunks with a shared tail/head."""
    words = [f"w{i}" for i in range(25_000)]
    text = " ".join(words)

    chunks = cs.chunk_words(text, window=10_000, overlap=2_000)

    # stride 8000 over 25000 words
    assert len(chunks) == 3

    head_words = chunks[0]["text"].split()
    next_words = chunks[1]["text"].split()

    # 2000-word overlap: the tail of the first window opens the second one.
    tail_of_first = head_words[8_000:10_000]
    start_of_second = next_words[:2_000]
    assert tail_of_first == start_of_second
def test_make_chunks_unpaginated_uses_word_windows():
    """Text without page markers must be split into word-range chunks.

    Builds a header, a 50-character ``=`` separator line and a 15 000-word
    body, then expects at least two chunks whose ``chunk_range`` label
    starts with ``"words"``.
    """
    header = "SOURCE: x\nFORMAT: txt\n"
    separator = "=" * 50
    body = "cuvant " * 15_000
    document = "".join([header, separator, "\n\n", body])

    chunks = cs.make_chunks(document)

    assert len(chunks) >= 2
    first_range = chunks[0]["chunk_range"]
    assert first_range.startswith("words")
# --------------------------------------------------------------------------
# stable source ids — anti-collision
# --------------------------------------------------------------------------
def test_stable_id_same_stem_different_path_no_collision():
    """Two files named ``scout.pdf`` in different folders must get distinct ids.

    Both ids are still expected to end with the ``_scout`` stem suffix.
    """
    camp_id = ns.stable_id("camp/games/scout.pdf")
    school_id = ns.stable_id("school/lessons/scout.pdf")

    assert camp_id != school_id
    for source_id in (camp_id, school_id):
        assert source_id.endswith("_scout")
def test_stable_id_deterministic():
    """Calling ``stable_id`` twice with the same path must give the same id."""
    path = "a/b/c.pdf"
    first_call = ns.stable_id(path)
    second_call = ns.stable_id(path)
    assert first_call == second_call
# --------------------------------------------------------------------------
# manifest registry + idempotency
# --------------------------------------------------------------------------
def test_run_writes_chunks_and_manifest(paginated_source, tmp_path):
    """``run`` must report its work and register every chunk in the manifest.

    Copies the paginated fixture into a fresh sources directory, runs the
    chunker, and checks that:

    * the summary counts one source and at least two chunks;
    * ``manifest.json`` lists a non-empty ``chunks`` mapping;
    * every entry is ``pending``, names ``<key>.json`` as its expected
      output, and points at a chunk file that exists relative to the parent
      of the chunks directory.
    """
    sources_dir = tmp_path / "sources"
    sources_dir.mkdir()
    (sources_dir / paginated_source.name).write_text(
        paginated_source.read_text(encoding="utf-8"), encoding="utf-8"
    )
    chunks_dir = tmp_path / "chunks"

    summary = cs.run(sources_dir, chunks_dir)

    assert summary["sources"] == 1
    assert summary["chunks"] >= 2

    # Decode explicitly as UTF-8 instead of the platform locale default, in
    # line with every other read/write in this module.
    # NOTE(review): assumes chunk_sources writes the manifest as UTF-8 -- confirm.
    manifest_path = chunks_dir / "manifest.json"
    manifest = json.loads(manifest_path.read_text(encoding="utf-8"))

    assert manifest["chunks"]
    for key, meta in manifest["chunks"].items():
        assert meta["state"] == "pending"
        assert meta["expected_json"] == f"{key}.json"
        # chunk_file is resolved against the parent of chunks_dir.
        assert (chunks_dir.parent / meta["chunk_file"]).exists()
def test_manifest_idempotent_preserves_state(paginated_source, tmp_path):
    """Re-running on unchanged sources must keep manifest entries and states.

    After a first run, one chunk is marked ``done`` by hand (standing in for
    the orchestrator). A second run must neither add nor drop chunks, must
    keep that chunk ``done``, and must leave every entry in either the
    ``pending`` or ``done`` state.
    """
    sources_dir = tmp_path / "sources"
    sources_dir.mkdir()
    (sources_dir / paginated_source.name).write_text(
        paginated_source.read_text(encoding="utf-8"), encoding="utf-8"
    )
    chunks_dir = tmp_path / "chunks"
    manifest_path = chunks_dir / "manifest.json"

    cs.run(sources_dir, chunks_dir)

    # orchestrator marks one chunk done
    # The manifest is read with the same explicit UTF-8 encoding used to
    # write it back below, rather than the platform locale default.
    manifest = json.loads(manifest_path.read_text(encoding="utf-8"))
    first_key = next(iter(manifest["chunks"]))
    n_before = len(manifest["chunks"])
    manifest["chunks"][first_key]["state"] = "done"
    manifest_path.write_text(json.dumps(manifest), encoding="utf-8")

    # re-run: 'done' must survive, no chunk added or lost
    cs.run(sources_dir, chunks_dir)

    manifest2 = json.loads(manifest_path.read_text(encoding="utf-8"))
    assert len(manifest2["chunks"]) == n_before
    assert manifest2["chunks"][first_key]["state"] == "done"
    assert all(
        m["state"] in ("pending", "done") for m in manifest2["chunks"].values()
    )
def test_manifest_resets_state_when_source_changes(paginated_source, tmp_path):
    """Changing a source's content must reset its chunk state to ``pending``.

    After a first run, one chunk is marked ``done``. The source file is then
    extended with an extra page and the chunker is re-run; the previously
    ``done`` chunk is expected to be ``pending`` again.
    """
    sources_dir = tmp_path / "sources"
    sources_dir.mkdir()
    src = sources_dir / paginated_source.name
    src.write_text(paginated_source.read_text(encoding="utf-8"), encoding="utf-8")
    chunks_dir = tmp_path / "chunks"
    manifest_path = chunks_dir / "manifest.json"

    cs.run(sources_dir, chunks_dir)

    # The manifest is read with the same explicit UTF-8 encoding used to
    # write it back below, rather than the platform locale default.
    manifest = json.loads(manifest_path.read_text(encoding="utf-8"))
    first_key = next(iter(manifest["chunks"]))
    manifest["chunks"][first_key]["state"] = "done"
    manifest_path.write_text(json.dumps(manifest), encoding="utf-8")

    # mutate the source content -> hash changes -> state resets
    src.write_text(
        src.read_text(encoding="utf-8") + "\n--- PAGE 51 ---\nextra\n",
        encoding="utf-8",
    )
    cs.run(sources_dir, chunks_dir)

    manifest2 = json.loads(manifest_path.read_text(encoding="utf-8"))
    assert manifest2["chunks"][first_key]["state"] == "pending"
def test_prune_stale_removes_orphan_entries(paginated_source, tmp_path):
    """Deleting a source must prune its chunk entries on the next run.

    After a first run the only source file is removed. The second run is
    expected to report zero chunks and at least one pruned entry, and the
    manifest's ``chunks`` mapping must be empty.
    """
    sources_dir = tmp_path / "sources"
    sources_dir.mkdir()
    src = sources_dir / paginated_source.name
    src.write_text(paginated_source.read_text(encoding="utf-8"), encoding="utf-8")
    chunks_dir = tmp_path / "chunks"

    cs.run(sources_dir, chunks_dir)

    # delete the source -> its chunks become stale
    src.unlink()
    summary = cs.run(sources_dir, chunks_dir)

    assert summary["chunks"] == 0
    assert summary["pruned"] >= 1

    # Decode explicitly as UTF-8 instead of the platform locale default, in
    # line with every other read/write in this module.
    # NOTE(review): assumes chunk_sources writes the manifest as UTF-8 -- confirm.
    manifest_path = chunks_dir / "manifest.json"
    manifest = json.loads(manifest_path.read_text(encoding="utf-8"))
    assert manifest["chunks"] == {}