Rebuild extraction pipeline infrastructure (Faza 0 prep)

Implements the approved plan to replace the broken regex/index-master extraction with an LLM-subagent pipeline. Four parallel lanes: Lane A — scripts/extract_common.py (PDF/docx/doc/pptx/html/zip, no max_pages truncation), normalize_sources.py, chunk_sources.py (~20pg chunks + overlap, manifest registry), activity_schema.json. Lane B — app/config_taxonomy.py (16 fixed category slugs), schema rebuilt from scratch in app/models/ with content_type, language, source_files, source_excerpt, normalized_name, extraction_confidence, needs_review; FTS5 + 3 triggers extended with materials_list and skills_developed. Lane C — build_database.py (--rebuild, atomic swap, schema + fuzzy source_excerpt validation, dedup with needs_review band), validate_extractions.py, review_queue.py, new run_extraction.py orchestrator, SUBAGENT_PROMPT.md. Lane D — search.py content_type/language filters (default search excludes non-game content), E7 schema-compat audit; fixed a NULL keywords AttributeError in _boost_search_relevance. Removes 8 orphaned/dead scripts and app/services/parser.py + indexer.py. Adds tests/ (70 passing, 1 skipped — libreoffice absent). Note: Lane D made one additive edit to app/models/database.py (_update_category_counts) to surface content_type/language in get_filter_options, outside its nominal lane boundary but after Lane B completed. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-19 17:43:38 +00:00
parent e0080edf85
commit 66ae831c36
37 changed files with 4101 additions and 1881 deletions
--- a/scripts/run_extraction.py
+++ b/scripts/run_extraction.py
@@ -1,50 +1,140 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 """
-Main extraction orchestrator
-Ruleaza intregul proces de extractie
+run_extraction.py — extraction orchestrator (plan §3).
+
+The pipeline is script-only up to the LLM step: this script normalizes the
+corpus, chunks the normalized sources, and emits one subagent prompt per
+`pending` chunk. It does NOT run the extraction itself — that step is the
+interactive Claude Code orchestrator launching waves of subagents.
+
+Steps:
+  1. normalize  data/carti-camp-jocuri/ -> data/sources/*.txt
+  2. chunk      data/sources/*.txt      -> data/chunks/<id>/*.txt + manifest.json
+  3. emit       one prompt per `pending` chunk -> data/chunks/_prompts/*.md
+  4. report     how many chunks remain `pending`
+
+Usage:
+    python scripts/run_extraction.py
+    python scripts/run_extraction.py --skip-normalize   # re-chunk only
 """

+from __future__ import annotations
+
+import argparse
 import sys
-import time
 from pathlib import Path
+from typing import Optional

-from unified_processor import UnifiedProcessor
-from import_claude_activities import ClaudeActivityImporter
+SCRIPT_DIR = Path(__file__).resolve().parent  # directory that contains this script
+REPO_ROOT = SCRIPT_DIR.parent  # one level above the script directory
+for _p in (str(SCRIPT_DIR), str(REPO_ROOT)):  # make both directories importable before the imports below
+    if _p not in sys.path:  # skip directories that are already on the path
+        sys.path.insert(0, _p)  # front of the path; when both are new, REPO_ROOT ends up ahead of SCRIPT_DIR
+
+import chunk_sources  # noqa: E402
+import normalize_sources  # noqa: E402
+
+SUBAGENT_PROMPT = SCRIPT_DIR / "SUBAGENT_PROMPT.md"  # rules file that every emitted prompt points to
+
+
+def emit_chunk_prompt(chunk_key: str, meta: dict, prompts_dir: Path) -> Path:
+    """Write the subagent prompt for one pending chunk to prompts_dir/<chunk_key>.prompt.md and return that path."""
+    chunk_file = meta.get("chunk_file", f"data/chunks/<id>/{chunk_key}.txt")  # NOTE(review): fallback keeps a literal "<id>" placeholder, not a real path
+    expected_json = meta.get("expected_json", f"{chunk_key}.json")  # default output name is derived from the chunk key
+    text = "\n".join([  # one list element per prompt line
+        f"# EXTRACTION — chunk `{chunk_key}`",
+        "",
+        f"Read ONLY this chunk: `{chunk_file}`",
+        f"Chunk range: {meta.get('chunk_range', '?')}",  # "?" when the manifest entry has no chunk_range
+        "",
+        f"Follow the rules in `{SUBAGENT_PROMPT.relative_to(REPO_ROOT)}`.",  # rules path is shown relative to the repo root
+        "Identify every distinct activity, fill the schema "
+        "(`scripts/activity_schema.json`), and write the result to:",
+        "",
+        f"    data/extracted/{expected_json}",
+        "",
+        "Header fields to set: "
+        f'source_id="{meta.get("source_id", "")}", chunk_key="{chunk_key}", '  # missing source_id / source_hash render as empty strings
+        f'source_hash="{meta.get("source_hash", "")}".',
+        "",  # trailing empty element makes the text end with a newline
+    ])
+    prompts_dir.mkdir(parents=True, exist_ok=True)  # create the prompts directory (and parents) if it does not exist yet
+    out = prompts_dir / f"{chunk_key}.prompt.md"
+    out.write_text(text, encoding="utf-8")  # replaces any prompt already written for this chunk key
+    return out
+
+
+def run(  # normalize (unless skipped), chunk, emit one prompt per pending chunk, and return a summary dict
+    *,  # every parameter is keyword-only
+    corpus_root: Path,
+    sources_dir: Path,
+    chunks_dir: Path,
+    skip_normalize: bool = False,  # True: leave sources_dir as it is and only re-chunk
+) -> dict:  # keys: "normalized" (only when normalization ran), "chunks", "states", "pending", "prompts_dir"
+    summary: dict = {}
+
+    if not skip_normalize:
+        norm = normalize_sources.run(corpus_root, sources_dir)  # step 1: corpus -> normalized sources
+        summary["normalized"] = {"ok": norm["ok"], "total": norm["total"],  # keep only the three counters that main() prints
+                                 "errors": norm["errors"]}
+
+    chunk_summary = chunk_sources.run(sources_dir, chunks_dir)  # step 2: runs whether or not normalization was skipped
+    summary["chunks"] = chunk_summary  # stored unmodified
+
+    manifest_path = chunks_dir / "manifest.json"
+    manifest = chunk_sources.load_manifest(manifest_path)  # presumably refreshed by chunk_sources.run above — confirm
+    prompts_dir = chunks_dir / "_prompts"
+
+    pending = {k: m for k, m in manifest["chunks"].items()
+               if m.get("state") == "pending"}  # step 3: only entries whose state is exactly "pending"
+    for key, meta in sorted(pending.items()):  # iterate in chunk-key order
+        emit_chunk_prompt(key, meta, prompts_dir)  # NOTE(review): prompt files left by earlier runs are never removed here
+
+    states: dict[str, int] = {}  # step 4: number of chunks per state, over the whole manifest
+    for m in manifest["chunks"].values():
+        states[m.get("state", "?")] = states.get(m.get("state", "?"), 0) + 1  # entries without a state are counted under "?"
+    summary["states"] = states
+    summary["pending"] = len(pending)
+    summary["prompts_dir"] = str(prompts_dir)  # stored as str rather than Path
+    return summary
+
+
+def main(argv: Optional[list[str]] = None) -> int:  # CLI entry point: parse flags, call run(), print the report, return 0
+    parser = argparse.ArgumentParser(description="Extraction orchestrator.")
+    parser.add_argument("--corpus", default="data/carti-camp-jocuri")  # the three path defaults are relative paths
+    parser.add_argument("--sources", default="data/sources")
+    parser.add_argument("--chunks", default="data/chunks")
+    parser.add_argument("--skip-normalize", action="store_true",
+                        help="skip normalization, re-chunk existing sources only")
+    args = parser.parse_args(argv)  # argv=None makes argparse read sys.argv[1:]
+
+    summary = run(
+        corpus_root=Path(args.corpus),
+        sources_dir=Path(args.sources),
+        chunks_dir=Path(args.chunks),
+        skip_normalize=args.skip_normalize,
+    )
+
+    print("=" * 60)
+    print("EXTRACTION ORCHESTRATOR")
+    print("=" * 60)
+    if "normalized" in summary:  # run() omits this key when normalization was skipped
+        n = summary["normalized"]
+        print(f"normalized : {n['ok']}/{n['total']} (errors {n['errors']})")
+    print(f"chunks     : {summary['chunks']['chunks']}")  # "chunks" entry of whatever chunk_sources.run returned
+    for state, count in sorted(summary["states"].items()):  # one line per state, ordered by state name
+        print(f"  {state:<10}: {count}")  # state name left-aligned in a 10-character column
+    print(f"\npending chunks remaining : {summary['pending']}")
+    if summary["pending"]:  # pending work: point at the prompts; otherwise go straight to the rebuild hint
+        print(f"subagent prompts written : {summary['prompts_dir']}/")
+        print("Launch waves of ~5-10 subagents on those prompts, then run "
+              "validate_extractions.py and build_database.py --rebuild.")
+    else:
+        print("All chunks extracted — run build_database.py --rebuild.")
+    print("=" * 60)
+    return 0  # the only return in this function; errors surface as exceptions

-def main():
-    print("="*60)
-    print("ACTIVITY EXTRACTION SYSTEM")
-    print("Strategy S8: Hybrid Claude + Scripts")
-    print("="*60)
-    
-    # Step 1: Run automated extraction
-    print("\nSTEP 1: Automated Extraction")
-    print("-"*40)
-    processor = UnifiedProcessor()
-    processor.process_automated_formats()
-    
-    # Step 2: Wait for Claude processing
-    print("\n" + "="*60)
-    print("STEP 2: Manual Claude Processing Required")
-    print("-"*40)
-    print("Please process PDF/DOC files with Claude using the template.")
-    print("Files are listed in: pdf_doc_for_claude.txt")
-    print("Save extracted activities as JSON in: scripts/extracted_activities/")
-    print("="*60)
-    
-    response = input("\nHave you completed Claude processing? (y/n): ")
-    
-    if response.lower() == 'y':
-        # Step 3: Import Claude-extracted activities
-        print("\nSTEP 3: Importing Claude-extracted activities")
-        print("-"*40)
-        importer = ClaudeActivityImporter()
-        importer.import_all_json_files()
-    
-    print("\n" + "="*60)
-    print("EXTRACTION COMPLETE!")
-    print("="*60)

 if __name__ == "__main__":
-    main()
+    raise SystemExit(main())