# -*- coding: utf-8 -*-
"""
Tests for scripts/validate_extractions.py.

Covers: schema rejection, the source_excerpt hallucination check, the content
of the generated re-extraction prompt, and the manifest `rejected` marking.
"""
import json
import sys
from pathlib import Path

REPO_ROOT = Path(__file__).resolve().parent.parent
SCRIPTS_DIR = REPO_ROOT / "scripts"

# Make both the repository root and the scripts directory importable before
# the module under test is loaded. Each missing entry is pushed to the front
# of sys.path, so the scripts directory (handled last) ends up first.
for _p in map(str, (REPO_ROOT, SCRIPTS_DIR)):
    if _p not in sys.path:
        sys.path.insert(0, _p)

import validate_extractions as ve  # noqa: E402


# --------------------------------------------------------------------------
# helpers
# --------------------------------------------------------------------------
def _ext_activity(**over):
    """Return an activity dict with fixed default fields, overridden by *over*."""
    defaults = {
        "name": "Jocul testului",
        "description": "O activitate de echipa in aer liber.",
        "category": "team-building",
        "content_type": "joc",
        "language": "ro",
        "extraction_confidence": "high",
        "source_excerpt": "ancora din bucata sursa",
        "page_reference": "page 1",
    }
    return {**defaults, **over}


def _write_extraction(extracted_dir, chunk_key, activities, header_extra=None):
    """Write ``<chunk_key>.json`` (header + activities) into *extracted_dir*.

    The directory is created if needed. *header_extra*, when truthy, is merged
    over the default header fields.
    """
    header = {
        "source_hash": "hash1234deadbeef",
        "schema_version": "1.0",
        "prompt_version": "1.0",
        "chunk_range": "pages 1-20",
        "source_id": "src01",
        "chunk_key": chunk_key,
    }
    if header_extra:
        header.update(header_extra)

    serialized = json.dumps(
        {"header": header, "activities": activities}, ensure_ascii=False
    )

    extracted_dir.mkdir(parents=True, exist_ok=True)
    target = extracted_dir / f"{chunk_key}.json"
    target.write_text(serialized, encoding="utf-8")


def _write_chunk(chunks_dir, source_id, chunk_key, text):
    """Write *text* to ``<chunks_dir>/<source_id>/<chunk_key>.txt``."""
    source_dir = chunks_dir / source_id
    source_dir.mkdir(parents=True, exist_ok=True)
    chunk_file = source_dir / f"{chunk_key}.txt"
    chunk_file.write_text(text, encoding="utf-8")


# --------------------------------------------------------------------------
# tests
# --------------------------------------------------------------------------
def test_valid_file_passes(tmp_path):
    """An extraction whose excerpt occurs in the chunk text is reported valid."""
    out_dir = tmp_path / "extracted"
    chunk_root = tmp_path / "chunks"
    excerpt = "ancora din bucata sursa apare aici"

    activity = _ext_activity(source_excerpt=excerpt)
    _write_extraction(out_dir, "src01.part01", [activity])
    _write_chunk(
        chunk_root, "src01", "src01.part01", f"--- PAGE 1 ---\n{excerpt}\n"
    )

    result = ve.run(out_dir, chunk_root, tmp_path / "manifest.json")

    assert result["valid"] == 1
    assert result["rejected"] == 0


def test_schema_invalid_file_rejected(tmp_path):
    """A file with an empty header is rejected and a re-extraction prompt is written."""
    out_dir = tmp_path / "extracted"
    chunk_root = tmp_path / "chunks"
    out_dir.mkdir(parents=True)

    # Written by hand (not via _write_extraction) so the header stays empty.
    broken_payload = {"header": {}, "activities": [{"name": "x"}]}
    broken_file = out_dir / "src01.part01.json"
    broken_file.write_text(json.dumps(broken_payload), encoding="utf-8")

    result = ve.run(out_dir, chunk_root, tmp_path / "manifest.json")

    assert result["rejected"] == 1
    prompt_path = out_dir / "_reextract" / "src01.part01.prompt.md"
    assert prompt_path.exists()


def test_hallucinated_excerpt_rejected(tmp_path):
    """An excerpt absent from the chunk text yields a 'hallucination' error."""
    out_dir = tmp_path / "extracted"
    chunk_root = tmp_path / "chunks"

    activity = _ext_activity(
        source_excerpt="citat complet inventat care nu exista qqqq"
    )
    _write_extraction(out_dir, "src01.part01", [activity])
    _write_chunk(
        chunk_root,
        "src01",
        "src01.part01",
        "--- PAGE 1 ---\ntext complet diferit despre altceva.\n",
    )

    result = ve.run(out_dir, chunk_root, tmp_path / "manifest.json")

    assert result["rejected"] == 1
    first_rejection = result["rejected_chunks"][0]
    assert any(
        "hallucination" in message for message in first_rejection["errors"]
    )


def test_reextraction_prompt_content(tmp_path):
    """The generated prompt names the chunk, the rejection and the output path."""
    out_dir = tmp_path / "extracted"
    chunk_root = tmp_path / "chunks"

    activity = _ext_activity(source_excerpt="citat inventat care nu exista zzzz")
    _write_extraction(out_dir, "src01.part01", [activity])
    _write_chunk(
        chunk_root,
        "src01",
        "src01.part01",
        "--- PAGE 1 ---\ntext despre cu totul altceva aici.\n",
    )

    ve.run(out_dir, chunk_root, tmp_path / "manifest.json")

    prompt_path = out_dir / "_reextract" / "src01.part01.prompt.md"
    prompt_text = prompt_path.read_text(encoding="utf-8")

    expected_fragments = (
        "src01.part01",
        "REJECTED",
        "verbatim",
        "data/extracted/src01.part01.json",
    )
    for fragment in expected_fragments:
        assert fragment in prompt_text


def test_manifest_marks_chunk_rejected(tmp_path):
    """A rejected chunk has its manifest state rewritten from 'done' to 'rejected'."""
    out_dir = tmp_path / "extracted"
    chunk_root = tmp_path / "chunks"
    manifest_path = tmp_path / "manifest.json"

    initial_manifest = {
        "chunks": {
            "src01.part01": {
                "state": "done",
                "chunk_file": "chunks/src01/src01.part01.txt",
            }
        }
    }
    manifest_path.write_text(json.dumps(initial_manifest), encoding="utf-8")

    activity = _ext_activity(source_excerpt="citat fabricat absent vvvv")
    _write_extraction(out_dir, "src01.part01", [activity])
    _write_chunk(
        chunk_root,
        "src01",
        "src01.part01",
        "--- PAGE 1 ---\nun continut neinrudit.\n",
    )

    ve.run(out_dir, chunk_root, manifest_path)

    stored = json.loads(manifest_path.read_text(encoding="utf-8"))
    assert stored["chunks"]["src01.part01"]["state"] == "rejected"


def test_build_reextraction_prompt_lists_errors():
    """build_reextraction_prompt echoes the chunk key and the error text."""
    error_messages = ["header: 'source_hash' is a required property"]

    prompt_text = ve.build_reextraction_prompt(
        "abc.part03",
        "data/chunks/abc/abc.part03.txt",
        error_messages,
    )

    assert "abc.part03" in prompt_text
    assert "source_hash" in prompt_text