Rebuild extraction pipeline infrastructure (Faza 0 prep)
Implements the approved plan to replace the broken regex/index-master extraction with an LLM-subagent pipeline. Four parallel lanes: Lane A — scripts/extract_common.py (PDF/docx/doc/pptx/html/zip, no max_pages truncation), normalize_sources.py, chunk_sources.py (~20pg chunks + overlap, manifest registry), activity_schema.json. Lane B — app/config_taxonomy.py (16 fixed category slugs), schema rebuilt from scratch in app/models/ with content_type, language, source_files, source_excerpt, normalized_name, extraction_confidence, needs_review; FTS5 + 3 triggers extended with materials_list and skills_developed. Lane C — build_database.py (--rebuild, atomic swap, schema + fuzzy source_excerpt validation, dedup with needs_review band), validate_extractions.py, review_queue.py, new run_extraction.py orchestrator, SUBAGENT_PROMPT.md. Lane D — search.py content_type/language filters (default search excludes non-game content), E7 schema-compat audit; fixed a NULL keywords AttributeError in _boost_search_relevance. Removes 8 orphaned/dead scripts and app/services/parser.py + indexer.py. Adds tests/ (70 passing, 1 skipped — libreoffice absent). Note: Lane D made one additive edit to app/models/database.py (_update_category_counts) to surface content_type/language in get_filter_options, outside its nominal lane boundary but after Lane B completed. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -1,50 +1,140 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Main extraction orchestrator
|
||||
Runs the entire extraction process
|
||||
run_extraction.py — extraction orchestrator (plan §3).
|
||||
|
||||
The pipeline is script-only up to the LLM step: this script normalizes the
|
||||
corpus, chunks the normalized sources, and emits one subagent prompt per
|
||||
`pending` chunk. It does NOT run the extraction itself — that step is the
|
||||
interactive Claude Code orchestrator launching waves of subagents.
|
||||
|
||||
Steps:
|
||||
1. normalize data/carti-camp-jocuri/ -> data/sources/*.txt
|
||||
2. chunk data/sources/*.txt -> data/chunks/<id>/*.txt + manifest.json
|
||||
3. emit one prompt per `pending` chunk -> data/chunks/_prompts/*.md
|
||||
4. report how many chunks remain `pending`
|
||||
|
||||
Usage:
|
||||
python scripts/run_extraction.py
|
||||
python scripts/run_extraction.py --skip-normalize # re-chunk only
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
from unified_processor import UnifiedProcessor
|
||||
from import_claude_activities import ClaudeActivityImporter
|
||||
# Directory holding this script, and the directory one level above it.
SCRIPT_DIR = Path(__file__).resolve().parent
REPO_ROOT = SCRIPT_DIR.parent

# Put both directories at the front of sys.path (skipping any that are
# already listed) so the imports that follow can be resolved from them.
for _p in map(str, (SCRIPT_DIR, REPO_ROOT)):
    if _p in sys.path:
        continue
    sys.path.insert(0, _p)
|
||||
|
||||
import chunk_sources # noqa: E402
|
||||
import normalize_sources # noqa: E402
|
||||
|
||||
SUBAGENT_PROMPT = SCRIPT_DIR / "SUBAGENT_PROMPT.md"
|
||||
|
||||
|
||||
def emit_chunk_prompt(chunk_key: str, meta: dict, prompts_dir: Path) -> Path:
    """Render and save the subagent prompt for a single chunk.

    Args:
        chunk_key: Identifier of the chunk; also used to name the prompt file
            and as the fallback stem for the chunk and result file names.
        meta: Manifest entry for the chunk. The keys ``chunk_file``,
            ``expected_json``, ``chunk_range``, ``source_id`` and
            ``source_hash`` are read, each with a fallback when absent.
        prompts_dir: Directory that receives the prompt; created on demand.

    Returns:
        Path of the written ``<chunk_key>.prompt.md`` file.
    """
    source_file = meta.get("chunk_file", f"data/chunks/<id>/{chunk_key}.txt")
    result_name = meta.get("expected_json", f"{chunk_key}.json")
    range_label = meta.get("chunk_range", "?")
    source_id = meta.get("source_id", "")
    source_hash = meta.get("source_hash", "")
    rules_path = SUBAGENT_PROMPT.relative_to(REPO_ROOT)

    instruction = (
        "Identify every distinct activity, fill the schema "
        "(`scripts/activity_schema.json`), and write the result to:"
    )
    header_fields = (
        "Header fields to set: "
        f'source_id="{source_id}", chunk_key="{chunk_key}", '
        f'source_hash="{source_hash}".'
    )

    prompt_lines = [
        f"# EXTRACTION — chunk `{chunk_key}`",
        "",
        f"Read ONLY this chunk: `{source_file}`",
        f"Chunk range: {range_label}",
        "",
        f"Follow the rules in `{rules_path}`.",
        instruction,
        "",
        f" data/extracted/{result_name}",
        "",
        header_fields,
    ]

    prompts_dir.mkdir(parents=True, exist_ok=True)
    destination = prompts_dir / f"{chunk_key}.prompt.md"
    # The prompt ends with a single trailing newline.
    destination.write_text("\n".join(prompt_lines) + "\n", encoding="utf-8")
    return destination
|
||||
|
||||
|
||||
def run(
    *,
    corpus_root: Path,
    sources_dir: Path,
    chunks_dir: Path,
    skip_normalize: bool = False,
) -> dict:
    """Run the script-only part of the pipeline and return a summary.

    Normalizes the corpus (unless skipped), chunks the normalized sources,
    then writes one subagent prompt for every manifest entry whose state is
    ``pending``.

    Args:
        corpus_root: Directory passed to ``normalize_sources.run`` as input.
        sources_dir: Normalization output and chunking input directory.
        chunks_dir: Chunking output directory; ``manifest.json`` is read from
            it and prompts are written to its ``_prompts`` subdirectory.
        skip_normalize: When true, the normalization step is not run and the
            summary has no ``normalized`` entry.

    Returns:
        Dict with ``chunks`` (the chunker's summary), ``states`` (count per
        manifest state, ``"?"`` for entries without one), ``pending`` (number
        of pending chunks), ``prompts_dir`` (as a string) and, unless
        normalization was skipped, ``normalized`` (ok/total/errors).
    """
    report: dict = {}

    if not skip_normalize:
        norm_result = normalize_sources.run(corpus_root, sources_dir)
        report["normalized"] = {
            field: norm_result[field] for field in ("ok", "total", "errors")
        }

    report["chunks"] = chunk_sources.run(sources_dir, chunks_dir)

    manifest = chunk_sources.load_manifest(chunks_dir / "manifest.json")
    entries = manifest["chunks"]
    prompts_dir = chunks_dir / "_prompts"

    # Prompts are emitted in sorted chunk-key order.
    pending_keys = sorted(
        key for key, entry in entries.items() if entry.get("state") == "pending"
    )
    for key in pending_keys:
        emit_chunk_prompt(key, entries[key], prompts_dir)

    # Tally every manifest entry by state.
    state_counts: dict[str, int] = {}
    for entry in entries.values():
        label = entry.get("state", "?")
        state_counts[label] = state_counts.get(label, 0) + 1

    report["states"] = state_counts
    report["pending"] = len(pending_keys)
    report["prompts_dir"] = str(prompts_dir)
    return report
|
||||
|
||||
|
||||
def main(argv: Optional[list[str]] = None) -> int:
    """Parse command-line options, run the orchestrator, print a report.

    Args:
        argv: Argument list handed to ``argparse``; ``None`` lets argparse
            read the process arguments.

    Returns:
        Always ``0``.
    """
    parser = argparse.ArgumentParser(description="Extraction orchestrator.")
    directory_options = (
        ("--corpus", "data/carti-camp-jocuri"),
        ("--sources", "data/sources"),
        ("--chunks", "data/chunks"),
    )
    for flag, default_dir in directory_options:
        parser.add_argument(flag, default=default_dir)
    parser.add_argument(
        "--skip-normalize",
        action="store_true",
        help="skip normalization, re-chunk existing sources only",
    )
    options = parser.parse_args(argv)

    summary = run(
        corpus_root=Path(options.corpus),
        sources_dir=Path(options.sources),
        chunks_dir=Path(options.chunks),
        skip_normalize=options.skip_normalize,
    )

    banner = "=" * 60
    print(banner)
    print("EXTRACTION ORCHESTRATOR")
    print(banner)

    # The normalization line is only present when that step actually ran.
    normalized = summary.get("normalized")
    if "normalized" in summary:
        print(
            f"normalized : {normalized['ok']}/{normalized['total']} "
            f"(errors {normalized['errors']})"
        )
    print(f"chunks : {summary['chunks']['chunks']}")
    for state in sorted(summary["states"]):
        print(f" {state:<10}: {summary['states'][state]}")

    remaining = summary["pending"]
    print(f"\npending chunks remaining : {remaining}")
    if not remaining:
        print("All chunks extracted — run build_database.py --rebuild.")
    else:
        print(f"subagent prompts written : {summary['prompts_dir']}/")
        print(
            "Launch waves of ~5-10 subagents on those prompts, then run "
            "validate_extractions.py and build_database.py --rebuild."
        )
    print(banner)
    return 0
|
||||
|
||||
def main():
    """Drive the legacy three-step interactive extraction flow.

    Runs ``UnifiedProcessor.process_automated_formats()``, tells the operator
    to process the PDF/DOC files manually, then asks for confirmation on
    stdin. When the answer is ``y`` (any case) the JSON results are imported
    through ``ClaudeActivityImporter.import_all_json_files()``.
    """
    heavy_rule = "=" * 60
    light_rule = "-" * 40

    print(heavy_rule)
    print("ACTIVITY EXTRACTION SYSTEM")
    print("Strategy S8: Hybrid Claude + Scripts")
    print(heavy_rule)

    # Step 1: automated extraction.
    print("\nSTEP 1: Automated Extraction")
    print(light_rule)
    UnifiedProcessor().process_automated_formats()

    # Step 2: the operator handles the remaining files by hand.
    print("\n" + heavy_rule)
    print("STEP 2: Manual Claude Processing Required")
    print(light_rule)
    print("Please process PDF/DOC files with Claude using the template.")
    print("Files are listed in: pdf_doc_for_claude.txt")
    print("Save extracted activities as JSON in: scripts/extracted_activities/")
    print(heavy_rule)

    answer = input("\nHave you completed Claude processing? (y/n): ")
    if answer.lower() != 'y':
        return

    # Step 3: import what the manual step produced.
    # NOTE(review): the original indentation was lost in this view; the
    # completion banner is assumed to belong to the confirmed branch — confirm.
    print("\nSTEP 3: Importing Claude-extracted activities")
    print(light_rule)
    ClaudeActivityImporter().import_all_json_files()

    print("\n" + heavy_rule)
    print("EXTRACTION COMPLETE!")
    print(heavy_rule)
|
||||
|
||||
if __name__ == "__main__":
    # Invoke main() exactly once and hand its return value to the interpreter
    # as the process exit status (a None return exits with status 0).
    raise SystemExit(main())
|
||||
|
||||
Reference in New Issue
Block a user