# -*- coding: utf-8 -*-
"""Tests for scripts/chunk_sources.py."""
import json

import chunk_sources as cs
import normalize_sources as ns


def _pages(n):
    """Return ``n`` synthetic ``(page_number, text)`` pairs numbered from 1."""
    return [(i, f"text-{i}") for i in range(1, n + 1)]


def _stage_source(paginated_source, tmp_path):
    """Copy the fixture file into a fresh ``sources`` directory under tmp_path.

    Returns a ``(sources_dir, src, chunks_dir)`` tuple, where ``src`` is the
    copied file and ``chunks_dir`` is the (not yet created) output path that
    the tests hand to ``cs.run``.
    """
    sources_dir = tmp_path / "sources"
    sources_dir.mkdir()
    src = sources_dir / paginated_source.name
    src.write_text(paginated_source.read_text(encoding="utf-8"), encoding="utf-8")
    return sources_dir, src, tmp_path / "chunks"


def _load_manifest(chunks_dir):
    """Parse ``manifest.json`` from ``chunks_dir``.

    The file is decoded explicitly as UTF-8 so the result does not depend on
    the platform's default locale encoding.
    """
    manifest_path = chunks_dir / "manifest.json"
    return json.loads(manifest_path.read_text(encoding="utf-8"))


# --------------------------------------------------------------------------
# header parsing
# --------------------------------------------------------------------------
def test_parse_source_splits_header_and_body(paginated_source):
    """parse_source returns the header mapping and the paginated body."""
    text = paginated_source.read_text(encoding="utf-8")
    header, body = cs.parse_source(text)
    assert header["FORMAT"] == "pdf"
    assert body.lstrip().startswith("--- PAGE 1 ---")


# --------------------------------------------------------------------------
# page chunking
# --------------------------------------------------------------------------
def test_chunk_pages_basic_split():
    """50 pages at 20 per chunk with overlap 4 start at page 1 and end at 50."""
    chunks = cs.chunk_pages(_pages(50), pages_per_chunk=20, overlap=4)
    # stride 16: starts at pages 1, 17, 33, ...
    assert chunks[0]["page_start"] == 1 and chunks[0]["page_end"] == 20
    assert chunks[1]["page_start"] == 17
    assert chunks[-1]["page_end"] == 50


def test_chunk_pages_have_overlap():
    """Consecutive chunks share exactly ``overlap`` pages."""
    chunks = cs.chunk_pages(_pages(50), pages_per_chunk=20, overlap=4)
    overlap = chunks[0]["page_end"] - chunks[1]["page_start"] + 1
    assert overlap == 4


def test_chunk_pages_short_document_single_chunk():
    """A document shorter than one chunk yields a single chunk covering it."""
    chunks = cs.chunk_pages(_pages(8), pages_per_chunk=20, overlap=4)
    assert len(chunks) == 1
    assert chunks[0]["page_start"] == 1 and chunks[0]["page_end"] == 8


def test_chunk_pages_empty():
    """An empty page list produces no chunks."""
    assert cs.chunk_pages([]) == []


def test_activity_at_page_boundary_intact_in_one_chunk(paginated_source):
    """An activity straddling the page 20/21 boundary must appear whole in >=1 chunk."""
    text = paginated_source.read_text(encoding="utf-8")
    chunks = cs.make_chunks(text)
    full = [
        c
        for c in chunks
        if "ACTIVITY-START" in c["text"] and "ACTIVITY-END" in c["text"]
    ]
    assert full, "activity spanning a page boundary was split across all chunks"


# --------------------------------------------------------------------------
# word-window chunking for unpaginated text
# --------------------------------------------------------------------------
def test_chunk_words_window_and_overlap():
    """Word windows advance by ``window - overlap`` and repeat the overlap."""
    text = " ".join(f"w{i}" for i in range(25_000))
    chunks = cs.chunk_words(text, window=10_000, overlap=2_000)
    assert len(chunks) == 3  # stride 8000 over 25000 words
    first = chunks[0]["text"].split()
    second = chunks[1]["text"].split()
    assert first[8_000:10_000] == second[0:2_000]  # 2000-word overlap


def test_make_chunks_unpaginated_uses_word_windows():
    """Text without page markers is chunked by word ranges."""
    body = "cuvant " * 15_000
    text = "SOURCE: x\nFORMAT: txt\n" + "=" * 50 + "\n\n" + body
    chunks = cs.make_chunks(text)
    assert len(chunks) >= 2
    assert chunks[0]["chunk_range"].startswith("words")


# --------------------------------------------------------------------------
# stable source ids — anti-collision
# --------------------------------------------------------------------------
def test_stable_id_same_stem_different_path_no_collision():
    """Files sharing a stem but not a path get distinct ids ending in the stem."""
    a = ns.stable_id("camp/games/scout.pdf")
    b = ns.stable_id("school/lessons/scout.pdf")
    assert a != b
    assert a.endswith("_scout") and b.endswith("_scout")


def test_stable_id_deterministic():
    """The same path always maps to the same id."""
    assert ns.stable_id("a/b/c.pdf") == ns.stable_id("a/b/c.pdf")


# --------------------------------------------------------------------------
# manifest registry + idempotency
# --------------------------------------------------------------------------
def test_run_writes_chunks_and_manifest(paginated_source, tmp_path):
    """run() reports its counts and registers every chunk as pending."""
    sources_dir, _src, chunks_dir = _stage_source(paginated_source, tmp_path)

    summary = cs.run(sources_dir, chunks_dir)
    assert summary["sources"] == 1
    assert summary["chunks"] >= 2

    manifest = _load_manifest(chunks_dir)
    assert manifest["chunks"]
    for key, meta in manifest["chunks"].items():
        assert meta["state"] == "pending"
        assert meta["expected_json"] == f"{key}.json"
        # chunk_file is resolved against the parent of chunks_dir.
        assert (chunks_dir.parent / meta["chunk_file"]).exists()


def test_manifest_idempotent_preserves_state(paginated_source, tmp_path):
    """Re-running on unchanged sources keeps chunk states and chunk count."""
    sources_dir, _src, chunks_dir = _stage_source(paginated_source, tmp_path)
    manifest_path = chunks_dir / "manifest.json"

    cs.run(sources_dir, chunks_dir)

    # orchestrator marks one chunk done
    manifest = _load_manifest(chunks_dir)
    first_key = next(iter(manifest["chunks"]))
    n_before = len(manifest["chunks"])
    manifest["chunks"][first_key]["state"] = "done"
    manifest_path.write_text(json.dumps(manifest), encoding="utf-8")

    # re-run: 'done' must survive, no chunk added or lost
    cs.run(sources_dir, chunks_dir)
    manifest2 = _load_manifest(chunks_dir)
    assert len(manifest2["chunks"]) == n_before
    assert manifest2["chunks"][first_key]["state"] == "done"
    assert all(
        m["state"] in ("pending", "done") for m in manifest2["chunks"].values()
    )


def test_manifest_resets_state_when_source_changes(paginated_source, tmp_path):
    """Changing a source's content sends its chunks back to pending."""
    sources_dir, src, chunks_dir = _stage_source(paginated_source, tmp_path)
    manifest_path = chunks_dir / "manifest.json"

    cs.run(sources_dir, chunks_dir)
    manifest = _load_manifest(chunks_dir)
    first_key = next(iter(manifest["chunks"]))
    manifest["chunks"][first_key]["state"] = "done"
    manifest_path.write_text(json.dumps(manifest), encoding="utf-8")

    # mutate the source content -> hash changes -> state resets
    src.write_text(
        src.read_text(encoding="utf-8") + "\n--- PAGE 51 ---\nextra\n",
        encoding="utf-8",
    )
    cs.run(sources_dir, chunks_dir)
    manifest2 = _load_manifest(chunks_dir)
    assert manifest2["chunks"][first_key]["state"] == "pending"


def test_prune_stale_removes_orphan_entries(paginated_source, tmp_path):
    """Chunks whose source file was deleted are pruned from the manifest."""
    sources_dir, src, chunks_dir = _stage_source(paginated_source, tmp_path)
    cs.run(sources_dir, chunks_dir)

    # delete the source -> its chunks become stale
    src.unlink()
    summary = cs.run(sources_dir, chunks_dir)
    assert summary["chunks"] == 0
    assert summary["pruned"] >= 1
    manifest = _load_manifest(chunks_dir)
    assert manifest["chunks"] == {}