Files
game-library/scripts/run_extraction.py
Claude Agent 66ae831c36 Rebuild extraction pipeline infrastructure (Faza 0 prep)
Implements the approved plan to replace the broken regex/index-master
extraction with an LLM-subagent pipeline. Four parallel lanes:

Lane A — scripts/extract_common.py (PDF/docx/doc/pptx/html/zip, no
  max_pages truncation), normalize_sources.py, chunk_sources.py
  (~20pg chunks + overlap, manifest registry), activity_schema.json.
Lane B — app/config_taxonomy.py (16 fixed category slugs), schema
  rebuilt from scratch in app/models/ with content_type, language,
  source_files, source_excerpt, normalized_name, extraction_confidence,
  needs_review; FTS5 + 3 triggers extended with materials_list and
  skills_developed.
Lane C — build_database.py (--rebuild, atomic swap, schema + fuzzy
  source_excerpt validation, dedup with needs_review band),
  validate_extractions.py, review_queue.py, new run_extraction.py
  orchestrator, SUBAGENT_PROMPT.md.
Lane D — search.py content_type/language filters (default search
  excludes non-game content), E7 schema-compat audit; fixed a NULL
  keywords AttributeError in _boost_search_relevance.

Removes 8 orphaned/dead scripts and app/services/parser.py +
indexer.py. Adds tests/ (70 passing, 1 skipped — libreoffice absent).

Note: Lane D made one additive edit to app/models/database.py
(_update_category_counts) to surface content_type/language in
get_filter_options, outside its nominal lane boundary but after
Lane B completed.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-19 17:43:38 +00:00

141 lines
4.9 KiB
Python

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
run_extraction.py — extraction orchestrator (plan §3).
The pipeline is script-only up to the LLM step: this script normalizes the
corpus, chunks the normalized sources, and emits one subagent prompt per
`pending` chunk. It does NOT run the extraction itself — that step is the
interactive Claude Code orchestrator launching waves of subagents.
Steps:
1. normalize data/carti-camp-jocuri/ -> data/sources/*.txt
2. chunk data/sources/*.txt -> data/chunks/<id>/*.txt + manifest.json
3. emit one prompt per `pending` chunk -> data/chunks/_prompts/*.md
4. report how many chunks remain `pending`
Usage:
python scripts/run_extraction.py
python scripts/run_extraction.py --skip-normalize # re-chunk only
"""
from __future__ import annotations
import argparse
import sys
from collections import Counter
from pathlib import Path
from typing import Optional
# Absolute locations of this scripts/ directory and of the repository root
# (its parent), resolved once at import time.
SCRIPT_DIR = Path(__file__).resolve().parent
REPO_ROOT = SCRIPT_DIR.parent

# Put both directories at the front of sys.path (unless already listed) so the
# sibling script modules imported below resolve regardless of the caller's cwd.
for _p in map(str, (SCRIPT_DIR, REPO_ROOT)):
    if _p in sys.path:
        continue
    sys.path.insert(0, _p)
import chunk_sources # noqa: E402
import normalize_sources # noqa: E402
# Shared rules file that every emitted chunk prompt tells the subagent to follow.
SUBAGENT_PROMPT = SCRIPT_DIR / "SUBAGENT_PROMPT.md"
def emit_chunk_prompt(chunk_key: str, meta: dict, prompts_dir: Path) -> Path:
    """Write the subagent prompt for one pending chunk.

    Args:
        chunk_key: Manifest key of the chunk. It is also the stem of the
            prompt file and the default stem of the expected JSON output.
        meta: Manifest entry for the chunk. The keys ``chunk_file``,
            ``expected_json``, ``chunk_range``, ``source_id`` and
            ``source_hash`` are read; each one is optional and falls back to
            a placeholder when absent.
        prompts_dir: Directory that receives ``<chunk_key>.prompt.md``. It is
            created (including parents) when it does not exist yet.

    Returns:
        Path of the prompt file that was written (UTF-8 encoded).
    """
    chunk_file = meta.get("chunk_file", f"data/chunks/<id>/{chunk_key}.txt")
    expected_json = meta.get("expected_json", f"{chunk_key}.json")
    # Render the rules path with forward slashes on every platform so it
    # matches the other (hard-coded, POSIX-style) paths in the prompt.
    rules_path = SUBAGENT_PROMPT.relative_to(REPO_ROOT).as_posix()

    text = "\n".join([
        f"# EXTRACTION — chunk `{chunk_key}`",
        "",
        f"Read ONLY this chunk: `{chunk_file}`",
        f"Chunk range: {meta.get('chunk_range', '?')}",
        "",
        f"Follow the rules in `{rules_path}`.",
        "Identify every distinct activity, fill the schema "
        "(`scripts/activity_schema.json`), and write the result to:",
        "",
        f" data/extracted/{expected_json}",
        "",
        "Header fields to set: "
        f'source_id="{meta.get("source_id", "")}", chunk_key="{chunk_key}", '
        f'source_hash="{meta.get("source_hash", "")}".',
        # Trailing empty item makes the file end with a newline.
        "",
    ])

    prompts_dir.mkdir(parents=True, exist_ok=True)
    out = prompts_dir / f"{chunk_key}.prompt.md"
    out.write_text(text, encoding="utf-8")
    return out
def run(
    *,
    corpus_root: Path,
    sources_dir: Path,
    chunks_dir: Path,
    skip_normalize: bool = False,
) -> dict:
    """Run the script-only part of the pipeline and emit pending prompts.

    Optionally normalizes the corpus, chunks the normalized sources, reloads
    the chunk manifest, and writes one prompt file per chunk whose manifest
    ``state`` is ``"pending"``.

    Args:
        corpus_root: Directory passed to ``normalize_sources.run`` as input.
        sources_dir: Normalization output directory and chunking input.
        chunks_dir: Chunking output directory; ``manifest.json`` is read from
            it and prompts are written to its ``_prompts`` subdirectory.
        skip_normalize: When true, the normalization step is skipped and the
            ``"normalized"`` key is omitted from the result.

    Returns:
        A summary dict with the keys ``normalized`` (only when normalization
        ran: its ``ok``/``total``/``errors`` values), ``chunks`` (whatever
        ``chunk_sources.run`` returned), ``states`` (chunk count per manifest
        state, ``"?"`` for entries without one), ``pending`` (number of
        pending chunks) and ``prompts_dir`` (prompt directory as a string).
    """
    summary: dict = {}

    if not skip_normalize:
        norm = normalize_sources.run(corpus_root, sources_dir)
        summary["normalized"] = {
            "ok": norm["ok"],
            "total": norm["total"],
            "errors": norm["errors"],
        }

    summary["chunks"] = chunk_sources.run(sources_dir, chunks_dir)

    manifest = chunk_sources.load_manifest(chunks_dir / "manifest.json")
    chunks = manifest["chunks"]
    prompts_dir = chunks_dir / "_prompts"

    pending = {
        key: meta for key, meta in chunks.items()
        if meta.get("state") == "pending"
    }
    # Emit in key order so repeated runs write the prompts deterministically.
    for key in sorted(pending):
        emit_chunk_prompt(key, pending[key], prompts_dir)

    # Counter keeps first-seen order; convert back to a plain dict so the
    # summary stays a simple, JSON-friendly mapping.
    summary["states"] = dict(
        Counter(meta.get("state", "?") for meta in chunks.values())
    )
    summary["pending"] = len(pending)
    summary["prompts_dir"] = str(prompts_dir)
    return summary
def main(argv: Optional[list[str]] = None) -> int:
    """Command-line entry point: run the orchestrator and print a report.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read
            ``sys.argv``.

    Returns:
        Always ``0``.
    """
    cli = argparse.ArgumentParser(description="Extraction orchestrator.")
    cli.add_argument("--corpus", default="data/carti-camp-jocuri")
    cli.add_argument("--sources", default="data/sources")
    cli.add_argument("--chunks", default="data/chunks")
    cli.add_argument(
        "--skip-normalize",
        action="store_true",
        help="skip normalization, re-chunk existing sources only",
    )
    opts = cli.parse_args(argv)

    summary = run(
        corpus_root=Path(opts.corpus),
        sources_dir=Path(opts.sources),
        chunks_dir=Path(opts.chunks),
        skip_normalize=opts.skip_normalize,
    )

    rule = "=" * 60
    print(rule)
    print("EXTRACTION ORCHESTRATOR")
    print(rule)

    # The "normalized" entry is absent when normalization was skipped.
    if "normalized" in summary:
        norm = summary["normalized"]
        print(
            f"normalized : {norm['ok']}/{norm['total']} "
            f"(errors {norm['errors']})"
        )

    print(f"chunks : {summary['chunks']['chunks']}")
    for state, count in sorted(summary["states"].items()):
        print(f" {state:<10}: {count}")

    remaining = summary["pending"]
    print(f"\npending chunks remaining : {remaining}")
    if not remaining:
        print("All chunks extracted — run build_database.py --rebuild.")
    else:
        print(f"subagent prompts written : {summary['prompts_dir']}/")
        print(
            "Launch waves of ~5-10 subagents on those prompts, then run "
            "validate_extractions.py and build_database.py --rebuild."
        )
    print(rule)
    return 0
if __name__ == "__main__":
raise SystemExit(main())