#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
validate_extractions.py — validate every data/extracted/*.json (plan §5b).

For each extraction file it runs two checks:
  1. JSON-schema validation against scripts/activity_schema.json,
  2. the source_excerpt anti-hallucination check (each excerpt must be a
     fuzzy substring of the chunk it came from).

For every failing chunk it:
  * writes the exact re-extraction prompt to
    data/extracted/_reextract/<chunk_key>.prompt.md,
  * marks the chunk `rejected` in data/chunks/manifest.json.

The orchestrator then re-launches subagents only on the `rejected` chunks;
the loop repeats until nothing is rejected.

Usage:
    python scripts/validate_extractions.py
"""
from __future__ import annotations

import argparse
import json
import sys
from pathlib import Path
from typing import Optional

SCRIPT_DIR = Path(__file__).resolve().parent
REPO_ROOT = SCRIPT_DIR.parent
# Make both the scripts directory and the repo root importable so that
# `import_common` resolves regardless of the current working directory.
for _p in (str(SCRIPT_DIR), str(REPO_ROOT)):
    if _p not in sys.path:
        sys.path.insert(0, _p)

from import_common import (  # noqa: E402
    DEFAULT_SCHEMA_PATH,
    chunk_key_for,
    excerpt_matches,
    excerpt_score,
    find_chunk_text,
    iter_extraction_files,
    load_schema,
    validate_extraction,
)

SUBAGENT_PROMPT = SCRIPT_DIR / "SUBAGENT_PROMPT.md"


# --------------------------------------------------------------------------
# re-extraction prompt
# --------------------------------------------------------------------------
def build_reextraction_prompt(
    chunk_key: str, chunk_file: Optional[str], errors: list[str]
) -> str:
    """Build the exact prompt to hand a subagent to re-extract a rejected chunk.

    Args:
        chunk_key: Identifier of the rejected chunk; used in the title and in
            the path of the extraction file the subagent must overwrite.
        chunk_file: Path of the chunk text as recorded in the manifest, or
            None/empty when the manifest has no entry for this chunk.
        errors: Rejection reasons, rendered as one bullet each.

    Returns:
        The Markdown prompt, lines joined with "\\n".
    """
    # NOTE(review): the fallback path contains an empty segment
    # ("data/chunks//..."); presumably a per-source directory placeholder was
    # lost at some point — confirm the intended layout before changing it.
    chunk_ref = chunk_file or f"data/chunks//{chunk_key}.txt"
    lines = [
        f"# RE-EXTRACTION — chunk `{chunk_key}`",
        "",
        "The previous extraction for this chunk was **REJECTED**. Reasons:",
        "",
    ]
    lines += [f"- {e}" for e in errors]
    lines += [
        "",
        "## What to do",
        "",
        f"1. Read ONLY this chunk: `{chunk_ref}`",
        f"2. Follow the extraction rules in `{SUBAGENT_PROMPT.relative_to(REPO_ROOT)}`.",
        "3. Fix every problem listed above. In particular:",
        " - every `source_excerpt` must be copied **verbatim** from the chunk",
        " (it is checked as a fuzzy substring — invented quotes are rejected);",
        " - `source_excerpt` and `page_reference` are mandatory on every activity;",
        " - the output must validate against `scripts/activity_schema.json`.",
        f"4. Overwrite the extraction file `data/extracted/{chunk_key}.json`.",
        "",
    ]
    return "\n".join(lines)


# --------------------------------------------------------------------------
# manifest
# --------------------------------------------------------------------------
def load_manifest(manifest_path: Path) -> dict:
    """Load the chunk manifest, falling back to an empty one.

    The fallback is deliberate best-effort: a missing, unreadable or malformed
    manifest yields ``{"chunks": {}}`` so validation can still run. Because the
    caller later saves the manifest back to the same path, a warning is printed
    to stderr whenever existing content is being ignored.

    Args:
        manifest_path: Location of the manifest JSON file.

    Returns:
        A dict that always has a dict under the ``"chunks"`` key.
    """
    if not manifest_path.is_file():
        return {"chunks": {}}
    try:
        data = json.loads(manifest_path.read_text(encoding="utf-8"))
    except (json.JSONDecodeError, UnicodeDecodeError, OSError) as exc:
        print(
            f"warning: could not read manifest {manifest_path}: {exc}; "
            "starting from an empty manifest",
            file=sys.stderr,
        )
        return {"chunks": {}}
    if not isinstance(data, dict):
        # e.g. a top-level list: `.setdefault` would raise AttributeError.
        print(
            f"warning: manifest {manifest_path} is not a JSON object; "
            "starting from an empty manifest",
            file=sys.stderr,
        )
        return {"chunks": {}}
    if not isinstance(data.get("chunks"), dict):
        # Covers both a missing key and a malformed value such as null.
        data["chunks"] = {}
    return data


def save_manifest(manifest: dict, manifest_path: Path) -> None:
    """Write the manifest as indented UTF-8 JSON, creating parent directories."""
    manifest_path.parent.mkdir(parents=True, exist_ok=True)
    manifest_path.write_text(
        json.dumps(manifest, indent=2, ensure_ascii=False), encoding="utf-8"
    )


def mark_rejected(manifest: dict, chunk_key: str) -> None:
    """Flip a chunk to `rejected` in the manifest (creating the entry if new)."""
    entry = manifest["chunks"].get(chunk_key, {})
    if not isinstance(entry, dict):
        # A malformed entry cannot carry a state; replace it.
        entry = {}
    entry["state"] = "rejected"
    manifest["chunks"][chunk_key] = entry


# --------------------------------------------------------------------------
# validation
# --------------------------------------------------------------------------
def _load_extraction(json_path: Path) -> tuple[object, Optional[str]]:
    """Read and parse one extraction file.

    Returns:
        ``(data, None)`` on success, or ``(None, message)`` when the file
        cannot be read, decoded as UTF-8, or parsed as JSON.
    """
    try:
        return json.loads(json_path.read_text(encoding="utf-8")), None
    except json.JSONDecodeError as exc:
        return None, f"invalid JSON: {exc}"
    except (OSError, UnicodeDecodeError) as exc:
        return None, f"unreadable extraction file: {exc}"


def _header_of(data: object) -> dict:
    """Return the extraction's `header` object, or {} when absent or malformed."""
    header = data.get("header") if isinstance(data, dict) else None
    return header if isinstance(header, dict) else {}


def validate_file(json_path: Path, schema: dict, chunks_dir: Path) -> list[str]:
    """Return the list of errors for one extraction file (empty == valid).

    Checks run in order and stop at the first failing stage: the file must
    load as JSON, pass `validate_extraction` against `schema`, have a
    locatable source chunk, and every activity's `source_excerpt` must match
    that chunk's text.

    Args:
        json_path: Extraction file to validate.
        schema: Loaded JSON schema passed to `validate_extraction`.
        chunks_dir: Directory searched by `find_chunk_text` for the chunk.

    Returns:
        Human-readable error strings; an empty list means the file is valid.
    """
    data, load_error = _load_extraction(json_path)
    if load_error is not None:
        return [load_error]

    errors = validate_extraction(data, schema)
    if errors:
        return errors
    if not isinstance(data, dict):
        # Defensive: the excerpt check below needs key access.
        return ["extraction must be a JSON object at the top level"]

    header = _header_of(data)
    chunk_text = find_chunk_text(json_path, header, chunks_dir)
    if chunk_text is None:
        return [f"source chunk not found for {chunk_key_for(json_path, header)}"]

    for adict in data.get("activities") or []:
        excerpt = adict.get("source_excerpt") or ""
        if not excerpt_matches(excerpt, chunk_text):
            score = excerpt_score(excerpt, chunk_text)
            errors.append(
                f"activity {adict.get('name')!r}: source_excerpt not found in "
                f"chunk (best match {score:.0f}/100) — possible hallucination"
            )
    return errors


def run(
    extracted_dir: Path,
    chunks_dir: Path,
    manifest_path: Path,
    schema_path: Path = DEFAULT_SCHEMA_PATH,
) -> dict:
    """Validate every extraction file and record the rejected chunks.

    For each rejected file a re-extraction prompt is written to
    ``<extracted_dir>/_reextract/<chunk_key>.prompt.md`` and the chunk is
    marked `rejected` in the manifest, which is saved once at the end.

    Args:
        extracted_dir: Directory holding the extraction JSON files.
        chunks_dir: Directory holding the source chunks.
        manifest_path: Manifest file to read and rewrite.
        schema_path: JSON schema file to validate against.

    Returns:
        A report dict with keys ``total``, ``valid``, ``rejected`` (counts) and
        ``rejected_chunks`` (list of ``{"chunk": key, "errors": [...]}``).
    """
    schema = load_schema(schema_path)
    manifest = load_manifest(manifest_path)
    reextract_dir = extracted_dir / "_reextract"
    report = {"total": 0, "valid": 0, "rejected": 0, "rejected_chunks": []}

    for json_path in iter_extraction_files(extracted_dir):
        report["total"] += 1
        errors = validate_file(json_path, schema, chunks_dir)
        if not errors:
            report["valid"] += 1
            continue

        report["rejected"] += 1
        # The file was rejected, so it may be unreadable, invalid JSON, or not
        # even a JSON object; fall back to an empty header in all those cases
        # rather than crashing before the manifest is saved.
        data, _ = _load_extraction(json_path)
        header = _header_of(data)
        chunk_key = chunk_key_for(json_path, header)

        meta = manifest["chunks"].get(chunk_key)
        chunk_file = meta.get("chunk_file") if isinstance(meta, dict) else None

        # Created lazily so a fully valid run leaves no empty directory behind.
        reextract_dir.mkdir(parents=True, exist_ok=True)
        prompt = build_reextraction_prompt(chunk_key, chunk_file, errors)
        (reextract_dir / f"{chunk_key}.prompt.md").write_text(prompt, encoding="utf-8")

        mark_rejected(manifest, chunk_key)
        report["rejected_chunks"].append({"chunk": chunk_key, "errors": errors})

    save_manifest(manifest, manifest_path)
    return report


# --------------------------------------------------------------------------
# CLI
# --------------------------------------------------------------------------
def main(argv: Optional[list[str]] = None) -> int:
    """Parse CLI arguments, run the validation and print a summary.

    Args:
        argv: Argument list to parse; None lets argparse use ``sys.argv``.

    Returns:
        Always 0. Rejected chunks are reported through the manifest and the
        printed summary, not through the exit status.
    """
    parser = argparse.ArgumentParser(description="Validate extraction JSON files.")
    parser.add_argument("--extracted", default="data/extracted")
    parser.add_argument("--chunks", default="data/chunks")
    parser.add_argument("--manifest", default="data/chunks/manifest.json")
    parser.add_argument("--schema", default=str(DEFAULT_SCHEMA_PATH))
    args = parser.parse_args(argv)

    report = run(
        Path(args.extracted), Path(args.chunks), Path(args.manifest), Path(args.schema)
    )

    print(f"extraction files : {report['total']}")
    print(f" valid : {report['valid']}")
    print(f" rejected : {report['rejected']}")
    for item in report["rejected_chunks"]:
        print(f" [rejected] {item['chunk']}")
        for err in item["errors"]:
            print(f" - {err}")
    if report["rejected"]:
        print(f"\nRe-extraction prompts written to {args.extracted}/_reextract/")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())