# game-library/scripts/validate_extractions.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
validate_extractions.py — validate every data/extracted/*.json (plan §5b).

For each extraction file it runs two checks:
  1. JSON-schema validation against scripts/activity_schema.json,
  2. the source_excerpt anti-hallucination check (each excerpt must be a fuzzy
     substring of the chunk it came from).

For every failing chunk it:
  * writes the exact re-extraction prompt to data/extracted/_reextract/<chunk>.prompt.md,
  * marks the chunk `rejected` in data/chunks/manifest.json.

The orchestrator then re-launches subagents only on the `rejected` chunks; the
loop repeats until nothing is rejected.

Usage:
    python scripts/validate_extractions.py
"""

from __future__ import annotations

import argparse
import json
import sys
from pathlib import Path
from typing import Optional

# Directory containing this script, and that directory's parent (used as the
# repository root throughout this module).
SCRIPT_DIR = Path(__file__).resolve().parent
REPO_ROOT = SCRIPT_DIR.parent
# Put both directories at the front of sys.path (skipping any that are already
# listed) before the `import_common` import below runs — presumably so that
# module resolves regardless of the current working directory.
for _p in (str(SCRIPT_DIR), str(REPO_ROOT)):
    if _p not in sys.path:
        sys.path.insert(0, _p)

from import_common import (  # noqa: E402
    DEFAULT_SCHEMA_PATH,
    chunk_key_for,
    excerpt_matches,
    excerpt_score,
    find_chunk_text,
    iter_extraction_files,
    load_schema,
    validate_extraction,
)

# Path of the extraction-rules document that re-extraction prompts point to.
SUBAGENT_PROMPT = SCRIPT_DIR / "SUBAGENT_PROMPT.md"


# --------------------------------------------------------------------------
# re-extraction prompt
# --------------------------------------------------------------------------
def build_reextraction_prompt(
    chunk_key: str, chunk_file: Optional[str], errors: list[str]
) -> str:
    """Render the Markdown prompt given to a subagent for a rejected chunk.

    Args:
        chunk_key: Identifier of the rejected chunk; used in the title and in
            the path of the extraction file to overwrite.
        chunk_file: Path of the chunk text file, or a falsy value to fall back
            to a placeholder path built from ``chunk_key``.
        errors: Rejection reasons, rendered as one bullet each.

    Returns:
        The prompt text, lines joined with ``"\\n"`` and ending in a newline.
    """
    # Fall back to a template path when the manifest did not record the file.
    target = chunk_file if chunk_file else f"data/chunks/<source_id>/{chunk_key}.txt"
    rules_path = SUBAGENT_PROMPT.relative_to(REPO_ROOT)

    preamble = (
        f"# RE-EXTRACTION — chunk `{chunk_key}`",
        "",
        "The previous extraction for this chunk was **REJECTED**. Reasons:",
        "",
    )
    bullets = (f"- {reason}" for reason in errors)
    instructions = (
        "",
        "## What to do",
        "",
        f"1. Read ONLY this chunk: `{target}`",
        f"2. Follow the extraction rules in `{rules_path}`.",
        "3. Fix every problem listed above. In particular:",
        "   - every `source_excerpt` must be copied **verbatim** from the chunk",
        "     (it is checked as a fuzzy substring — invented quotes are rejected);",
        "   - `source_excerpt` and `page_reference` are mandatory on every activity;",
        "   - the output must validate against `scripts/activity_schema.json`.",
        f"4. Overwrite the extraction file `data/extracted/{chunk_key}.json`.",
        "",  # trailing empty item => the joined text ends with a newline
    )
    return "\n".join([*preamble, *bullets, *instructions])


# --------------------------------------------------------------------------
# manifest
# --------------------------------------------------------------------------
def load_manifest(manifest_path: Path) -> dict:
    """Load the chunk manifest, always returning a dict with a ``chunks`` dict.

    A missing manifest yields a fresh ``{"chunks": {}}``. A manifest that
    cannot be read, decoded or parsed, or whose top level is not a JSON
    object, is ignored with a warning on stderr and also yields a fresh empty
    manifest. A ``chunks`` value that is missing or not an object is replaced
    by an empty dict so callers can always do ``manifest["chunks"].get(...)``.

    Args:
        manifest_path: Location of the manifest JSON file.

    Returns:
        The manifest dict; ``result["chunks"]`` is guaranteed to be a dict.
    """
    if not manifest_path.is_file():
        return {"chunks": {}}

    try:
        data = json.loads(manifest_path.read_text(encoding="utf-8"))
    except (OSError, ValueError) as exc:
        # ValueError covers both json.JSONDecodeError (malformed JSON) and
        # UnicodeDecodeError (file is not valid UTF-8).
        print(
            f"warning: ignoring unreadable manifest {manifest_path}: {exc}",
            file=sys.stderr,
        )
        return {"chunks": {}}

    if not isinstance(data, dict):
        print(
            f"warning: ignoring manifest {manifest_path}: top-level JSON value "
            f"is {type(data).__name__}, expected an object",
            file=sys.stderr,
        )
        return {"chunks": {}}

    if "chunks" not in data:
        data["chunks"] = {}
    elif not isinstance(data["chunks"], dict):
        # A null/list/scalar here would crash every later `.get()` lookup.
        print(
            f"warning: manifest {manifest_path}: 'chunks' is not an object; "
            "resetting it to empty",
            file=sys.stderr,
        )
        data["chunks"] = {}
    return data


def save_manifest(manifest: dict, manifest_path: Path) -> None:
    """Write *manifest* to *manifest_path* as indented UTF-8 JSON.

    Missing parent directories are created first. Non-ASCII characters are
    written as-is rather than as ``\\uXXXX`` escapes.
    """
    target_dir = manifest_path.parent
    target_dir.mkdir(parents=True, exist_ok=True)
    payload = json.dumps(manifest, ensure_ascii=False, indent=2)
    manifest_path.write_text(payload, encoding="utf-8")


def mark_rejected(manifest: dict, chunk_key: str) -> None:
    """Set the manifest entry for *chunk_key* to state ``rejected``.

    An existing entry is updated in place and keeps its other fields; a chunk
    with no entry yet gets a new one holding only the state.
    """
    record = manifest["chunks"].setdefault(chunk_key, {})
    record["state"] = "rejected"


# --------------------------------------------------------------------------
# validation
# --------------------------------------------------------------------------
def validate_file(json_path: Path, schema: dict, chunks_dir: Path) -> list[str]:
    """Return the list of errors for one extraction file (empty == valid).

    Checks run in order and stop at the first stage that fails:

    1. the file must be readable, valid UTF-8 and parse as JSON;
    2. the parsed data must pass ``validate_extraction`` against *schema*;
    3. the source chunk must be locatable via ``find_chunk_text``;
    4. every activity's ``source_excerpt`` must satisfy ``excerpt_matches``
       against the chunk text (one error is reported per failing activity).

    Args:
        json_path: Extraction file to validate.
        schema: Loaded JSON schema handed to ``validate_extraction``.
        chunks_dir: Directory searched for the source chunk text.

    Returns:
        Human-readable error strings; an empty list means the file is valid.
        Read/decode/parse failures are reported as an error instead of being
        raised, so one bad file cannot abort validation of the others.
    """
    try:
        data = json.loads(json_path.read_text(encoding="utf-8"))
    except json.JSONDecodeError as exc:
        return [f"invalid JSON: {exc}"]
    except (OSError, UnicodeDecodeError) as exc:
        # Not valid UTF-8, or the file vanished / is unreadable: reject this
        # chunk rather than crashing the whole run.
        return [f"unreadable extraction file: {exc}"]

    schema_errors = validate_extraction(data, schema)
    if schema_errors:
        return schema_errors

    header = data.get("header", {})
    chunk_text = find_chunk_text(json_path, header, chunks_dir)
    if chunk_text is None:
        return [f"source chunk not found for {chunk_key_for(json_path, header)}"]

    # Collect excerpt failures in a list of our own rather than appending to
    # whatever object validate_extraction returned.
    excerpt_errors: list[str] = []
    for activity in data.get("activities", []):
        excerpt = activity.get("source_excerpt") or ""
        if excerpt_matches(excerpt, chunk_text):
            continue
        # The score is only needed for the message, so compute it on failure.
        score = excerpt_score(excerpt, chunk_text)
        excerpt_errors.append(
            f"activity {activity.get('name')!r}: source_excerpt not found in "
            f"chunk (best match {score:.0f}/100) — possible hallucination"
        )
    return excerpt_errors


def _read_header(json_path: Path) -> dict:
    """Best-effort read of an extraction file's ``header`` object ({} if unavailable)."""
    try:
        data = json.loads(json_path.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        return {}
    # The file was already rejected, so its shape cannot be trusted: the top
    # level may be a list/scalar and "header" may be missing or not an object.
    header = data.get("header") if isinstance(data, dict) else None
    return header if isinstance(header, dict) else {}


def run(
    extracted_dir: Path,
    chunks_dir: Path,
    manifest_path: Path,
    schema_path: Path = DEFAULT_SCHEMA_PATH,
) -> dict:
    """Validate every extraction file and record the rejected chunks.

    For each file yielded by ``iter_extraction_files(extracted_dir)`` this
    calls ``validate_file``. For every file with errors it writes a
    re-extraction prompt to ``<extracted_dir>/_reextract/<chunk_key>.prompt.md``
    and marks the chunk ``rejected`` in the manifest. The manifest is saved
    once at the end, whether or not anything was rejected.

    Args:
        extracted_dir: Directory holding the extraction JSON files.
        chunks_dir: Directory holding the source chunk texts.
        manifest_path: Manifest JSON file to load and update.
        schema_path: JSON schema used for validation.

    Returns:
        A report dict with integer counters ``total``, ``valid`` and
        ``rejected``, plus ``rejected_chunks``: a list of
        ``{"chunk": <chunk_key>, "errors": [<message>, ...]}`` dicts.
    """
    schema = load_schema(schema_path)
    manifest = load_manifest(manifest_path)
    reextract_dir = extracted_dir / "_reextract"

    report = {"total": 0, "valid": 0, "rejected": 0, "rejected_chunks": []}
    for json_path in iter_extraction_files(extracted_dir):
        report["total"] += 1
        errors = validate_file(json_path, schema, chunks_dir)
        if not errors:
            report["valid"] += 1
            continue

        report["rejected"] += 1
        # The header is re-read defensively: a rejected file may be malformed
        # in any way, and that must not abort processing of the other files.
        chunk_key = chunk_key_for(json_path, _read_header(json_path))
        meta = manifest["chunks"].get(chunk_key)
        chunk_file = meta.get("chunk_file") if meta else None

        # Created lazily so the directory only appears when there are rejections.
        reextract_dir.mkdir(parents=True, exist_ok=True)
        prompt = build_reextraction_prompt(chunk_key, chunk_file, errors)
        (reextract_dir / f"{chunk_key}.prompt.md").write_text(prompt, encoding="utf-8")

        mark_rejected(manifest, chunk_key)
        report["rejected_chunks"].append({"chunk": chunk_key, "errors": errors})

    save_manifest(manifest, manifest_path)
    return report


# --------------------------------------------------------------------------
# CLI
# --------------------------------------------------------------------------
def main(argv: Optional[list[str]] = None) -> int:
    """Command-line entry point: validate extractions and print a summary.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read the
            process arguments.

    Returns:
        Always ``0``, including when chunks were rejected.
    """
    cli = argparse.ArgumentParser(description="Validate extraction JSON files.")
    option_defaults = (
        ("--extracted", "data/extracted"),
        ("--chunks", "data/chunks"),
        ("--manifest", "data/chunks/manifest.json"),
        ("--schema", str(DEFAULT_SCHEMA_PATH)),
    )
    for flag, default in option_defaults:
        cli.add_argument(flag, default=default)
    opts = cli.parse_args(argv)

    report = run(
        Path(opts.extracted),
        Path(opts.chunks),
        Path(opts.manifest),
        Path(opts.schema),
    )

    # Assemble the whole summary first, then emit it with a single print.
    out = [
        f"extraction files : {report['total']}",
        f"  valid          : {report['valid']}",
        f"  rejected       : {report['rejected']}",
    ]
    for rejected in report["rejected_chunks"]:
        out.append(f"  [rejected] {rejected['chunk']}")
        out.extend(f"      - {message}" for message in rejected["errors"])
    if report["rejected"]:
        out.append(f"\nRe-extraction prompts written to {opts.extracted}/_reextract/")
    print("\n".join(out))
    return 0


# Script entry point: exit the process with the status code returned by main().
if __name__ == "__main__":
    raise SystemExit(main())