Rebuild extraction pipeline infrastructure (Faza 0 prep)

Implements the approved plan to replace the broken regex/index-master extraction with an LLM-subagent pipeline. Four parallel lanes: Lane A — scripts/extract_common.py (PDF/docx/doc/pptx/html/zip, no max_pages truncation), normalize_sources.py, chunk_sources.py (~20pg chunks + overlap, manifest registry), activity_schema.json. Lane B — app/config_taxonomy.py (16 fixed category slugs), schema rebuilt from scratch in app/models/ with content_type, language, source_files, source_excerpt, normalized_name, extraction_confidence, needs_review; FTS5 + 3 triggers extended with materials_list and skills_developed. Lane C — build_database.py (--rebuild, atomic swap, schema + fuzzy source_excerpt validation, dedup with needs_review band), validate_extractions.py, review_queue.py, new run_extraction.py orchestrator, SUBAGENT_PROMPT.md. Lane D — search.py content_type/language filters (default search excludes non-game content), E7 schema-compat audit; fixed a NULL keywords AttributeError in _boost_search_relevance. Removes 8 orphaned/dead scripts and app/services/parser.py + indexer.py. Adds tests/ (70 passing, 1 skipped — libreoffice absent). Note: Lane D made one additive edit to app/models/database.py (_update_category_counts) to surface content_type/language in get_filter_options, outside its nominal lane boundary but after Lane B completed. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-19 17:43:38 +00:00
parent e0080edf85
commit 66ae831c36
37 changed files with 4101 additions and 1881 deletions
--- a/scripts/import_common.py
+++ b/scripts/import_common.py
@@ -0,0 +1,179 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+import_common.py — shared helpers for the import / validation side of the
+extraction pipeline (Lane C).
+
+Used by build_database.py and validate_extractions.py:
+  * JSON-schema validation of subagent extraction files,
+  * the anti-hallucination source_excerpt substring check (E5),
+  * locating the source chunk that an extraction file came from,
+  * the stable content key used by the needs_review queue.
+"""
+
+from __future__ import annotations
+
+import hashlib
+import json
+import re
+import unicodedata
+from pathlib import Path
+from typing import Any, Optional
+
+# Absolute directory holding this script, and its parent directory (used as
+# the repository root).
+SCRIPT_DIR = Path(__file__).resolve().parent
+REPO_ROOT = SCRIPT_DIR.parent
+
+# Default schema file, expected to sit next to this script.
+DEFAULT_SCHEMA_PATH = SCRIPT_DIR.joinpath("activity_schema.json")
+
+# Minimum rapidfuzz.partial_ratio score (0..100 scale) at which an excerpt is
+# accepted as a genuine quote from the chunk text.
+EXCERPT_MATCH_THRESHOLD = 90.0
+
+
+# --------------------------------------------------------------------------
+# schema validation
+# --------------------------------------------------------------------------
+def load_schema(schema_path: str | Path = DEFAULT_SCHEMA_PATH) -> dict:
+    """
+    Read and parse the activity JSON schema.
+
+    `schema_path` may be a string or a Path and defaults to
+    DEFAULT_SCHEMA_PATH.  The file is decoded as UTF-8; a missing file or
+    malformed JSON propagates the underlying exception to the caller.
+    """
+    with Path(schema_path).open(encoding="utf-8") as handle:
+        return json.load(handle)
+
+
+def validate_extraction(data: Any, schema: dict) -> list[str]:
+    """
+    Check one parsed extraction document against `schema` (Draft 7).
+
+    Every violation is reported as ``<location>: <message>``, where the
+    location is the slash-joined path to the offending value (``<root>`` for
+    the top-level document).  Violations are ordered by that path.  An empty
+    list means the document is valid.
+    """
+    import jsonschema
+
+    def _path_key(error) -> list:
+        # Sort key: the instance path as a plain list.
+        return list(error.path)
+
+    def _describe(error) -> str:
+        where = "/".join(map(str, error.path)) or "<root>"
+        return f"{where}: {error.message}"
+
+    found = jsonschema.Draft7Validator(schema).iter_errors(data)
+    return [_describe(error) for error in sorted(found, key=_path_key)]
+
+
+# --------------------------------------------------------------------------
+# excerpt verification (E5 — anti-hallucination)
+# --------------------------------------------------------------------------
+def _normalize_text(text: str) -> str:
+    """Collapse whitespace runs to one space, trim, and lowercase.
+
+    Falsy input (None, empty string) yields an empty string.
+    """
+    if not text:
+        return ""
+    collapsed = re.sub(r"\s+", " ", text)
+    return collapsed.strip().lower()
+
+
+def excerpt_score(excerpt: str, chunk_text: str) -> float:
+    """
+    Best fuzzy-substring score (0..100) of `excerpt` inside `chunk_text`.
+
+    Both strings are whitespace-collapsed and lowercased before comparison.
+    If either one is empty *after* normalization (None, "", or whitespace
+    only) the score is 0.0, so a blank excerpt can never count as a match.
+    """
+    from rapidfuzz import fuzz
+
+    needle = _normalize_text(excerpt)
+    haystack = _normalize_text(chunk_text)
+    # Guard after normalizing: a whitespace-only string is truthy but carries
+    # no text, and must not be handed to the fuzzy matcher.
+    if not needle or not haystack:
+        return 0.0
+    # NOTE(review): partial_ratio aligns the shorter string inside the longer
+    # one, so an excerpt longer than the chunk is scored with the roles
+    # reversed -- confirm that is acceptable for the E5 check.
+    return float(fuzz.partial_ratio(needle, haystack))
+
+
+def excerpt_matches(
+    excerpt: str, chunk_text: str, threshold: float = EXCERPT_MATCH_THRESHOLD
+) -> bool:
+    """
+    Report whether `excerpt` occurs (fuzzily) as a substring of `chunk_text`.
+
+    The excerpt matches when its excerpt_score() reaches `threshold`
+    (0..100 scale, inclusive).
+    """
+    score = excerpt_score(excerpt, chunk_text)
+    return threshold <= score
+
+
+# --------------------------------------------------------------------------
+# locating the source chunk an extraction file came from
+# --------------------------------------------------------------------------
+def chunk_key_for(json_path: Path, header: Optional[dict]) -> str:
+    """
+    Work out which chunk an extraction file belongs to.
+
+    A non-empty `chunk_key` entry in `header` wins (converted to str).
+    Without one, the stem of `json_path` is used, since extraction files are
+    named `<chunk_key>.json`.
+    """
+    explicit = header.get("chunk_key") if header else None
+    return str(explicit) if explicit else json_path.stem
+
+
+def source_id_for(chunk_key: str, header: Optional[dict]) -> str:
+    """
+    Resolve the source id; `<source_id>.partNN` → `<source_id>`.
+
+    A non-empty `source_id` entry in `header` wins (converted to str).
+    Otherwise the id is derived from `chunk_key` by removing a trailing
+    `.part<digits>` suffix.  A key without that suffix is returned unchanged,
+    so ids that merely contain ".part" (e.g. "report.partial") stay intact.
+    """
+    if header and header.get("source_id"):
+        return str(header["source_id"])
+    # Anchor on the end of the key: only a real ".partNN" suffix is removed.
+    return re.sub(r"\.part[0-9]+\Z", "", chunk_key)
+
+
+def find_chunk_text(
+    json_path: Path, header: Optional[dict], chunks_dir: Path
+) -> Optional[str]:
+    """
+    Return the text of the source chunk for an extraction file, or None.
+
+    Looks for <chunks_dir>/<source_id>/<chunk_key>.txt first, then falls back
+    to a recursive search of `chunks_dir` for a file named <chunk_key>.txt.
+    When the fallback finds several candidates, the first in sorted path
+    order is used so the result is reproducible.  Files are decoded as UTF-8
+    with undecodable bytes replaced.
+    """
+    import glob
+
+    chunk_key = chunk_key_for(json_path, header)
+    source_id = source_id_for(chunk_key, header)
+    filename = f"{chunk_key}.txt"
+
+    candidate = chunks_dir / source_id / filename
+    if candidate.is_file():
+        return candidate.read_text(encoding="utf-8", errors="replace")
+
+    try:
+        # Escape the name so "*", "?" and "[" in a chunk key are matched
+        # literally rather than interpreted as glob syntax.
+        matches = sorted(
+            path
+            for path in chunks_dir.rglob(glob.escape(filename))
+            if path.is_file()
+        )
+    except (ValueError, NotImplementedError):
+        # pathlib rejects some patterns (e.g. absolute ones); a key that
+        # cannot be searched for simply has no chunk.
+        return None
+    if matches:
+        return matches[0].read_text(encoding="utf-8", errors="replace")
+    return None
+
+
+def source_path_for(source_id: str, sources_dir: Path) -> Optional[str]:
+    """
+    Look up the original path recorded in a normalized source file.
+
+    <sources_dir>/<source_id>.txt is expected to open with a
+    `SOURCE: <relative path>` line.  Lines are scanned from the top; the text
+    after the first colon of the first `SOURCE:` line is returned, stripped.
+    Scanning gives up at the first line starting with "=" or "--- PAGE "
+    (presumably the end of the header).  Returns None when the file is
+    missing, unreadable, or has no such line.
+    """
+    header_file = sources_dir / f"{source_id}.txt"
+    if not header_file.is_file():
+        return None
+    try:
+        with header_file.open(encoding="utf-8", errors="replace") as handle:
+            for raw in handle:
+                if raw.startswith("SOURCE:"):
+                    _, _, value = raw.partition(":")
+                    return value.strip()
+                if raw.startswith(("=", "--- PAGE ")):
+                    break
+    except OSError:
+        return None
+    return None
+
+
+# --------------------------------------------------------------------------
+# stable content key for the needs_review queue (plan §5c)
+# --------------------------------------------------------------------------
+def normalize_name(name: str) -> str:
+    """
+    Build the dedup key for a name.
+
+    The name is NFKD-decomposed, combining marks are dropped, and the result
+    is lowercased, trimmed and whitespace-collapsed.  Falsy input yields "".
+    """
+    if not name:
+        return ""
+    # NOTE(review): letters with no NFKD decomposition (e.g. "ł") pass
+    # through unchanged -- confirm that is intended for this key.
+    kept = [
+        ch
+        for ch in unicodedata.normalize("NFKD", name)
+        if not unicodedata.combining(ch)
+    ]
+    lowered = "".join(kept).lower().strip()
+    return re.sub(r"\s+", " ", lowered)
+
+
+def content_key(normalized_name: str, language: Optional[str], description: str) -> str:
+    """
+    Compute the review-queue key for a row.
+
+    The key is the SHA-1 hex digest of the normalized name, the language
+    ("" when falsy) and the whitespace-collapsed, lowercased description,
+    joined with the unit-separator character (0x1F).
+
+    It is meant to stay stable across rebuilds: rows carrying needs_review
+    (borderline-kept-separate rows and legacy `.doc` rows) are never
+    auto-merged, so this triple does not change for them.
+    """
+    lang_part = language if language else ""
+    desc_part = _normalize_text(description)
+    payload = f"{normalized_name}\x1f{lang_part}\x1f{desc_part}"
+    hasher = hashlib.sha1()
+    hasher.update(payload.encode("utf-8"))
+    return hasher.hexdigest()
+
+
+# --------------------------------------------------------------------------
+# iteration
+# --------------------------------------------------------------------------
+def iter_extraction_files(extracted_dir: Path):
+    """
+    Iterate over the extraction files of a directory in sorted path order.
+
+    Only regular files matching *.json directly inside `extracted_dir` are
+    yielded; subdirectories such as _rejected/ are not descended into.  A
+    missing or non-directory path yields nothing.
+    """
+    if extracted_dir.is_dir():
+        yield from (
+            entry
+            for entry in sorted(extracted_dir.glob("*.json"))
+            if entry.is_file()
+        )