#!/usr/bin/env python3 # -*- coding: utf-8 -*- """ run_extraction.py — extraction orchestrator (plan §3). The pipeline is script-only up to the LLM step: this script normalizes the corpus, chunks the normalized sources, and emits one subagent prompt per `pending` chunk. It does NOT run the extraction itself — that step is the interactive Claude Code orchestrator launching waves of subagents. Steps: 1. normalize data/carti-camp-jocuri/ -> data/sources/*.txt 2. chunk data/sources/*.txt -> data/chunks//*.txt + manifest.json 3. emit one prompt per `pending` chunk -> data/chunks/_prompts/*.md 4. report how many chunks remain `pending` Usage: python scripts/run_extraction.py python scripts/run_extraction.py --skip-normalize # re-chunk only """ from __future__ import annotations import argparse import sys from pathlib import Path from typing import Optional SCRIPT_DIR = Path(__file__).resolve().parent REPO_ROOT = SCRIPT_DIR.parent for _p in (str(SCRIPT_DIR), str(REPO_ROOT)): if _p not in sys.path: sys.path.insert(0, _p) import chunk_sources # noqa: E402 import normalize_sources # noqa: E402 SUBAGENT_PROMPT = SCRIPT_DIR / "SUBAGENT_PROMPT.md" def emit_chunk_prompt(chunk_key: str, meta: dict, prompts_dir: Path) -> Path: """Write the subagent prompt for one pending chunk.""" chunk_file = meta.get("chunk_file", f"data/chunks//{chunk_key}.txt") expected_json = meta.get("expected_json", f"{chunk_key}.json") text = "\n".join([ f"# EXTRACTION — chunk `{chunk_key}`", "", f"Read ONLY this chunk: `{chunk_file}`", f"Chunk range: {meta.get('chunk_range', '?')}", "", f"Follow the rules in `{SUBAGENT_PROMPT.relative_to(REPO_ROOT)}`.", "Identify every distinct activity, fill the schema " "(`scripts/activity_schema.json`), and write the result to:", "", f" data/extracted/{expected_json}", "", "Header fields to set: " f'source_id="{meta.get("source_id", "")}", chunk_key="{chunk_key}", ' f'source_hash="{meta.get("source_hash", "")}".', "", ]) prompts_dir.mkdir(parents=True, exist_ok=True) out = prompts_dir / f"{chunk_key}.prompt.md" out.write_text(text, encoding="utf-8") return out def run( *, corpus_root: Path, sources_dir: Path, chunks_dir: Path, skip_normalize: bool = False, ) -> dict: summary: dict = {} if not skip_normalize: norm = normalize_sources.run(corpus_root, sources_dir) summary["normalized"] = {"ok": norm["ok"], "total": norm["total"], "errors": norm["errors"]} chunk_summary = chunk_sources.run(sources_dir, chunks_dir) summary["chunks"] = chunk_summary manifest_path = chunks_dir / "manifest.json" manifest = chunk_sources.load_manifest(manifest_path) prompts_dir = chunks_dir / "_prompts" pending = {k: m for k, m in manifest["chunks"].items() if m.get("state") == "pending"} for key, meta in sorted(pending.items()): emit_chunk_prompt(key, meta, prompts_dir) states: dict[str, int] = {} for m in manifest["chunks"].values(): states[m.get("state", "?")] = states.get(m.get("state", "?"), 0) + 1 summary["states"] = states summary["pending"] = len(pending) summary["prompts_dir"] = str(prompts_dir) return summary def main(argv: Optional[list[str]] = None) -> int: parser = argparse.ArgumentParser(description="Extraction orchestrator.") parser.add_argument("--corpus", default="data/carti-camp-jocuri") parser.add_argument("--sources", default="data/sources") parser.add_argument("--chunks", default="data/chunks") parser.add_argument("--skip-normalize", action="store_true", help="skip normalization, re-chunk existing sources only") args = parser.parse_args(argv) summary = run( corpus_root=Path(args.corpus), sources_dir=Path(args.sources), chunks_dir=Path(args.chunks), skip_normalize=args.skip_normalize, ) print("=" * 60) print("EXTRACTION ORCHESTRATOR") print("=" * 60) if "normalized" in summary: n = summary["normalized"] print(f"normalized : {n['ok']}/{n['total']} (errors {n['errors']})") print(f"chunks : {summary['chunks']['chunks']}") for state, count in sorted(summary["states"].items()): print(f" {state:<10}: {count}") print(f"\npending chunks remaining : {summary['pending']}") if summary["pending"]: print(f"subagent prompts written : {summary['prompts_dir']}/") print("Launch waves of ~5-10 subagents on those prompts, then run " "validate_extractions.py and build_database.py --rebuild.") else: print("All chunks extracted — run build_database.py --rebuild.") print("=" * 60) return 0 if __name__ == "__main__": raise SystemExit(main())