# game-library/scripts/run_extraction.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
run_extraction.py — extraction orchestrator (plan §3).

The pipeline is script-only up to the LLM step: this script normalizes the
corpus, chunks the normalized sources, and emits one subagent prompt per
`pending` chunk. It does NOT run the extraction itself — that step is the
interactive Claude Code orchestrator launching waves of subagents.

Steps:
  1. normalize  data/carti-camp-jocuri/ -> data/sources/*.txt
  2. chunk      data/sources/*.txt      -> data/chunks/<id>/*.txt + manifest.json
  3. emit       one prompt per `pending` chunk -> data/chunks/_prompts/*.md
  4. report     how many chunks remain `pending`

Usage:
    python scripts/run_extraction.py
    python scripts/run_extraction.py --skip-normalize   # re-chunk only
"""

from __future__ import annotations

import argparse
import sys
from pathlib import Path
from typing import Optional

# Directory holding this script, and its parent directory (used as the repo
# root when building repo-relative paths).
SCRIPT_DIR = Path(__file__).resolve().parent
REPO_ROOT = SCRIPT_DIR.parent
# Prepend both directories to sys.path unless already present. Because each is
# inserted at index 0 in turn, REPO_ROOT ends up ahead of SCRIPT_DIR.
# NOTE(review): presumably this is what lets the two project imports below
# resolve when the script is run directly — confirm where those modules live.
for _p in (str(SCRIPT_DIR), str(REPO_ROOT)):
    if _p not in sys.path:
        sys.path.insert(0, _p)

# These imports must stay after the sys.path setup above (hence the E402
# suppressions).
import chunk_sources  # noqa: E402
import normalize_sources  # noqa: E402

# Rules file that every emitted prompt points the subagent to; located next to
# this script.
SUBAGENT_PROMPT = SCRIPT_DIR / "SUBAGENT_PROMPT.md"


def emit_chunk_prompt(chunk_key: str, meta: dict, prompts_dir: Path) -> Path:
    """Write the subagent prompt for one pending chunk.

    Args:
        chunk_key: Manifest key of the chunk; also used as the stem of the
            prompt file name.
        meta: Manifest entry for the chunk. The keys ``chunk_file``,
            ``expected_json``, ``chunk_range``, ``source_id`` and
            ``source_hash`` are read; each one is optional and falls back to
            a placeholder or an empty value.
        prompts_dir: Directory that receives the prompt. It is created
            (including parents) when it does not exist yet.

    Returns:
        Path of the written ``<chunk_key>.prompt.md`` file.
    """
    chunk_file = meta.get("chunk_file", f"data/chunks/<id>/{chunk_key}.txt")
    expected_json = meta.get("expected_json", f"{chunk_key}.json")
    # as_posix(): every other path in the prompt is spelled with forward
    # slashes, so render this one the same way instead of with the OS-native
    # separator (which would be a backslash on Windows).
    rules_path = SUBAGENT_PROMPT.relative_to(REPO_ROOT).as_posix()
    text = "\n".join([
        f"# EXTRACTION — chunk `{chunk_key}`",
        "",
        f"Read ONLY this chunk: `{chunk_file}`",
        f"Chunk range: {meta.get('chunk_range', '?')}",
        "",
        f"Follow the rules in `{rules_path}`.",
        "Identify every distinct activity, fill the schema "
        "(`scripts/activity_schema.json`), and write the result to:",
        "",
        f"    data/extracted/{expected_json}",
        "",
        "Header fields to set: "
        f'source_id="{meta.get("source_id", "")}", chunk_key="{chunk_key}", '
        f'source_hash="{meta.get("source_hash", "")}".',
        "",  # trailing empty item -> the file ends with a newline
    ])
    prompts_dir.mkdir(parents=True, exist_ok=True)
    out = prompts_dir / f"{chunk_key}.prompt.md"
    out.write_text(text, encoding="utf-8")
    return out


def run(
    *,
    corpus_root: Path,
    sources_dir: Path,
    chunks_dir: Path,
    skip_normalize: bool = False,
) -> dict:
    """Run normalize -> chunk -> emit prompts and return a summary dict.

    Args:
        corpus_root: Raw corpus directory handed to ``normalize_sources.run``.
        sources_dir: Directory of normalized sources; output of the
            normalize step and input of the chunk step.
        chunks_dir: Directory handed to ``chunk_sources.run``. The manifest
            is read from ``chunks_dir / "manifest.json"`` and prompts are
            written to ``chunks_dir / "_prompts"``.
        skip_normalize: When true, the normalize step is skipped and the
            summary has no ``"normalized"`` entry.

    Returns:
        A dict with the keys:

        * ``"normalized"`` (only when normalization ran): the ``ok``,
          ``total`` and ``errors`` values reported by the normalizer.
        * ``"chunks"``: whatever ``chunk_sources.run`` returned.
        * ``"states"``: number of manifest chunks per state (``"?"`` when a
          chunk has no state).
        * ``"pending"``: number of chunks whose state is ``"pending"``.
        * ``"prompts_dir"``: the prompts directory, as a string.
        * ``"stale_prompts_removed"``: number of leftover prompt files
          deleted because their chunk is no longer pending.
    """
    summary: dict = {}

    if not skip_normalize:
        norm = normalize_sources.run(corpus_root, sources_dir)
        summary["normalized"] = {"ok": norm["ok"], "total": norm["total"],
                                 "errors": norm["errors"]}

    summary["chunks"] = chunk_sources.run(sources_dir, chunks_dir)

    manifest = chunk_sources.load_manifest(chunks_dir / "manifest.json")
    prompts_dir = chunks_dir / "_prompts"

    # Single pass over the manifest: tally every state and collect the
    # pending chunks at the same time.
    states: dict[str, int] = {}
    pending: dict[str, dict] = {}
    for key, meta in manifest["chunks"].items():
        state = meta.get("state", "?")
        states[state] = states.get(state, 0) + 1
        if state == "pending":
            pending[key] = meta

    # Drop prompts left over from earlier runs whose chunk is no longer
    # pending. The operator is told to launch subagents on everything in
    # this directory, so a stale prompt would trigger a redundant
    # re-extraction of an already processed chunk.
    prompt_suffix = ".prompt.md"
    stale_removed = 0
    if prompts_dir.is_dir():
        # sorted() materializes the listing before any file is deleted.
        for old_prompt in sorted(prompts_dir.glob(f"*{prompt_suffix}")):
            old_key = old_prompt.name[:-len(prompt_suffix)]
            if old_key not in pending and old_prompt.is_file():
                old_prompt.unlink()
                stale_removed += 1

    for key in sorted(pending):
        emit_chunk_prompt(key, pending[key], prompts_dir)

    summary["states"] = states
    summary["pending"] = len(pending)
    summary["prompts_dir"] = str(prompts_dir)
    summary["stale_prompts_removed"] = stale_removed
    return summary


def main(argv: Optional[list[str]] = None) -> int:
    """Command-line entry point: run the pipeline and print a report.

    Args:
        argv: Argument list to parse; ``None`` lets argparse fall back to
            the process arguments.

    Returns:
        Always ``0``.
    """
    cli = argparse.ArgumentParser(description="Extraction orchestrator.")
    # The three directory options share the same shape: a flag and a default.
    for flag, default in (
        ("--corpus", "data/carti-camp-jocuri"),
        ("--sources", "data/sources"),
        ("--chunks", "data/chunks"),
    ):
        cli.add_argument(flag, default=default)
    cli.add_argument(
        "--skip-normalize",
        action="store_true",
        help="skip normalization, re-chunk existing sources only",
    )
    opts = cli.parse_args(argv)

    summary = run(
        corpus_root=Path(opts.corpus),
        sources_dir=Path(opts.sources),
        chunks_dir=Path(opts.chunks),
        skip_normalize=opts.skip_normalize,
    )

    rule = "=" * 60
    print(rule)
    print("EXTRACTION ORCHESTRATOR")
    print(rule)

    # The normalization line is only present when that step actually ran.
    if "normalized" in summary:
        stats = summary["normalized"]
        print(
            f"normalized : {stats['ok']}/{stats['total']} "
            f"(errors {stats['errors']})"
        )

    print(f"chunks     : {summary['chunks']['chunks']}")
    for state_name, how_many in sorted(summary["states"].items()):
        print(f"  {state_name:<10}: {how_many}")

    remaining = summary["pending"]
    print(f"\npending chunks remaining : {remaining}")
    if not remaining:
        print("All chunks extracted — run build_database.py --rebuild.")
    else:
        print(f"subagent prompts written : {summary['prompts_dir']}/")
        print("Launch waves of ~5-10 subagents on those prompts, then run "
              "validate_extractions.py and build_database.py --rebuild.")
    print(rule)
    return 0


if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    sys.exit(main())