Rebuild extraction pipeline infrastructure (Faza 0 prep)

Implements the approved plan to replace the broken regex/index-master extraction with an LLM-subagent pipeline. Four parallel lanes: Lane A — scripts/extract_common.py (PDF/docx/doc/pptx/html/zip, no max_pages truncation), normalize_sources.py, chunk_sources.py (~20pg chunks + overlap, manifest registry), activity_schema.json. Lane B — app/config_taxonomy.py (16 fixed category slugs), schema rebuilt from scratch in app/models/ with content_type, language, source_files, source_excerpt, normalized_name, extraction_confidence, needs_review; FTS5 + 3 triggers extended with materials_list and skills_developed. Lane C — build_database.py (--rebuild, atomic swap, schema + fuzzy source_excerpt validation, dedup with needs_review band), validate_extractions.py, review_queue.py, new run_extraction.py orchestrator, SUBAGENT_PROMPT.md. Lane D — search.py content_type/language filters (default search excludes non-game content), E7 schema-compat audit; fixed a NULL keywords AttributeError in _boost_search_relevance. Removes 8 orphaned/dead scripts and app/services/parser.py + indexer.py. Adds tests/ (70 passing, 1 skipped — libreoffice absent). Note: Lane D made one additive edit to app/models/database.py (_update_category_counts) to surface content_type/language in get_filter_options, outside its nominal lane boundary but after Lane B completed. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-19 17:43:38 +00:00
parent e0080edf85
commit 66ae831c36
37 changed files with 4101 additions and 1881 deletions
--- a/scripts/normalize_sources.py
+++ b/scripts/normalize_sources.py
@@ -0,0 +1,255 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+normalize_sources.py — walk data/carti-camp-jocuri/ and write data/sources/<id>.txt.
+
+Output files keep the existing header format:
+
+    SOURCE: <original relative path>
+    CONVERTED: <iso date>
+    FORMAT: <pdf|docx|doc|pptx|html-mirror|zip>
+    NEEDS_REVIEW: <reason>            (optional — legacy .doc conversions)
+    ==================================================
+
+    --- PAGE 1 ---
+    ...
+
+Each source gets a stable id = <8-hex hash of relative path>_<sanitized stem>,
+so two files with the same name in different folders never collide.
+
+The pipeline is script-only: this normalizes formats, it does NOT run extraction.
+Run `--check-deps` before a long job.
+"""
+
+from __future__ import annotations
+
+import argparse
+import datetime as _dt
+import hashlib
+import re
+import sys
+from pathlib import Path
+
+# Put this script's own directory on sys.path (once) so the sibling module
+# imported just below resolves regardless of the current working directory.
+SCRIPT_DIR = Path(__file__).resolve().parent
+if str(SCRIPT_DIR) not in sys.path:
+    sys.path.insert(0, str(SCRIPT_DIR))
+
+from extract_common import (  # noqa: E402
+    count_page_markers,
+    dedupe_texts,
+    detect_format,
+    extract_file,
+    extract_html,
+    is_junk,
+    join_pages,
+    preflight,
+    split_pages,
+)
+
+# Separator line (50 "=" characters) that build_header() emits as the last
+# line of the metadata header.
+HEADER_RULE = "=" * 50
+
+
+# --------------------------------------------------------------------------
+# stable source id
+# --------------------------------------------------------------------------
+def sanitize_stem(stem: str) -> str:
+    """Turn a file stem into a short, lowercase slug.
+
+    Every run of non-word characters collapses to a single ``_``; leading and
+    trailing underscores are trimmed; the result is lowercased and capped at
+    60 characters. An empty result falls back to ``"source"``.
+    """
+    collapsed = re.sub(r"[^\w]+", "_", stem, flags=re.UNICODE)
+    slug = collapsed.strip("_").lower()[:60]
+    return slug if slug else "source"
+
+
+def stable_id(relative_path: str | Path) -> str:
+    """Build the stable source id for a path relative to the corpus root.
+
+    The id is ``<first 8 hex chars of the path's SHA-1>_<sanitized stem>``.
+    Backslashes are rewritten to ``/`` before hashing, so the same relative
+    path spelled with either separator yields the same id.
+    """
+    posix_rel = str(relative_path).replace("\\", "/")
+    prefix = hashlib.sha1(posix_rel.encode("utf-8")).hexdigest()[:8]
+    return "_".join((prefix, sanitize_stem(Path(posix_rel).stem)))
+
+
+# --------------------------------------------------------------------------
+# header
+# --------------------------------------------------------------------------
+def build_header(
+    source_rel: str, fmt: str, needs_review: str | None = None
+) -> str:
+    """Render the metadata header placed at the top of a normalized file.
+
+    The header holds SOURCE, CONVERTED (today's ISO date) and FORMAT lines,
+    an optional NEEDS_REVIEW line when *needs_review* is truthy, then the
+    ``HEADER_RULE`` separator followed by one blank line.
+    """
+    review_lines = [f"NEEDS_REVIEW: {needs_review}"] if needs_review else []
+    header_lines = [
+        f"SOURCE: {source_rel}",
+        f"CONVERTED: {_dt.date.today().isoformat()}",
+        f"FORMAT: {fmt}",
+        *review_lines,
+        HEADER_RULE,
+    ]
+    return "\n".join(header_lines) + "\n\n"
+
+
+# --------------------------------------------------------------------------
+# mirror-site directories
+# --------------------------------------------------------------------------
+# Lowercase file suffixes that count as HTML pages of a mirrored site.
+MIRROR_PAGE_EXTS = {".html", ".htm"}
+
+
+def is_mirror_dir(path: Path) -> bool:
+    """Report whether *path* is a directory that holds a mirrored site.
+
+    A directory whose name ends in ``_files`` is never a mirror; any other
+    directory qualifies as soon as one HTML file is found anywhere below it.
+    """
+    if not path.is_dir() or path.name.endswith("_files"):
+        return False
+    for candidate in path.rglob("*"):
+        if candidate.is_file() and candidate.suffix.lower() in MIRROR_PAGE_EXTS:
+            return True
+    return False
+
+
+def normalize_mirror(mirror_dir: Path) -> str:
+    """Extract every HTML page in a mirror dir, dedupe near-duplicates, join.
+
+    Pages are visited in sorted path order. Anything stored below a
+    ``*_files`` directory inside the mirror (the asset folder a browser
+    writes next to a saved page) is skipped, using the same ``_files`` suffix
+    rule that ``is_mirror_dir`` applies to directory names. A page whose
+    extraction raises is reported on stderr and skipped, so one broken file
+    cannot abort the whole mirror.
+
+    Returns whatever ``join_pages`` produces for the texts that survive
+    ``dedupe_texts``.
+    """
+    pages: list[tuple[str, str]] = []
+    for html in sorted(mirror_dir.rglob("*")):
+        if not html.is_file() or html.suffix.lower() not in MIRROR_PAGE_EXTS:
+            continue
+        rel = html.relative_to(mirror_dir)
+        # Match on the "_files" suffix, not the exact name: asset folders are
+        # called "<page>_files". Only directory components inside the mirror
+        # are inspected, so an ancestor's name can never hide every page.
+        if any(part.endswith("_files") for part in rel.parts[:-1]):
+            continue
+        try:
+            body = extract_html(html)
+        except Exception as exc:  # noqa: BLE001 - best effort, page by page
+            print(f"WARNING: skipping {html}: {exc}", file=sys.stderr)
+            continue
+        # Drop the page numbers and keep only the text of each split page.
+        text = "\n".join(t for _, t in split_pages(body))
+        if text.strip():
+            pages.append((str(rel), text))
+    pages = dedupe_texts(pages)
+    return join_pages([t for _, t in pages])
+
+
+# --------------------------------------------------------------------------
+# one source
+# --------------------------------------------------------------------------
+def normalize_one(
+    path: Path, corpus_root: Path, out_dir: Path
+) -> dict | None:
+    """
+    Normalize one corpus entry (file or mirror directory) into out_dir/<id>.txt.
+
+    Returns None when the entry is ignored: a directory that is not a mirror,
+    a junk file, or a file whose detected format is unknown/epub/txt.
+    Otherwise returns a dict with "id", "source" and "status":
+      - "error" (plus the "error" text) when extracting a file raised,
+      - "empty" when the extracted body is blank (nothing is written),
+      - "ok" (plus "format", "pages", "needs_review") once the file is written.
+    """
+    rel = path.relative_to(corpus_root)
+    sid = stable_id(rel)
+    source = str(rel)
+    needs_review = None
+
+    if path.is_dir():
+        if not is_mirror_dir(path):
+            return None
+        fmt = "html-mirror"
+        body = normalize_mirror(path)
+    elif is_junk(path):
+        return None
+    else:
+        fmt = detect_format(path)
+        # epub duplicates PDFs; txt is not a source format here
+        if fmt in ("unknown", "epub", "txt"):
+            return None
+        if fmt == "doc":
+            needs_review = "legacy .doc conversion is imperfect"
+        try:
+            body = extract_file(path)
+        except Exception as exc:  # noqa: BLE001
+            return {"id": sid, "source": source, "status": "error", "error": str(exc)}
+
+    if not body.strip():
+        return {"id": sid, "source": source, "status": "empty"}
+
+    header = build_header(source, fmt, needs_review)
+    (out_dir / f"{sid}.txt").write_text(header + body, encoding="utf-8")
+    return {
+        "id": sid,
+        "source": source,
+        "status": "ok",
+        "format": fmt,
+        "pages": count_page_markers(body),
+        "needs_review": bool(needs_review),
+    }
+
+
+# --------------------------------------------------------------------------
+# walk
+# --------------------------------------------------------------------------
+def iter_corpus_entries(corpus_root: Path):
+    """Yield the direct children of the corpus root that should be normalized.
+
+    Children are visited in sorted order. Names starting with "." are
+    ignored, non-directory entries are always yielded, and a directory is
+    yielded only when ``is_mirror_dir`` accepts it.
+    """
+    visible = (
+        child for child in sorted(corpus_root.iterdir())
+        if not child.name.startswith(".")
+    )
+    for child in visible:
+        if not child.is_dir() or is_mirror_dir(child):
+            yield child
+
+
+def run(corpus_root: Path, out_dir: Path) -> dict:
+    """Normalize every corpus entry and return a summary of the outcomes.
+
+    Creates *out_dir* if needed, runs ``normalize_one`` on each entry from
+    ``iter_corpus_entries`` and keeps the non-None results. The summary holds
+    the counts "total", "ok", "errors", "empty" and "needs_review", plus the
+    full list under "results".
+    """
+    out_dir.mkdir(parents=True, exist_ok=True)
+
+    results: list[dict] = []
+    for entry in iter_corpus_entries(corpus_root):
+        outcome = normalize_one(entry, corpus_root, out_dir)
+        if outcome is None:
+            continue
+        results.append(outcome)
+
+    def with_status(status: str) -> int:
+        # Number of collected results carrying the given status.
+        return len([r for r in results if r["status"] == status])
+
+    return {
+        "total": len(results),
+        "ok": with_status("ok"),
+        "errors": with_status("error"),
+        "empty": with_status("empty"),
+        "needs_review": len([r for r in results if r.get("needs_review")]),
+        "results": results,
+    }
+
+
+# --------------------------------------------------------------------------
+# CLI
+# --------------------------------------------------------------------------
+def print_preflight(report: dict) -> int:
+    """Print the dependency preflight report and return a process exit code.
+
+    Reads "missing_python", "missing_system", "warnings" and "ok" from
+    *report*. Returns 0 when ``report["ok"]`` is truthy, 1 otherwise.
+    """
+    print("Dependency preflight")
+    print("--------------------")
+    missing_py = report["missing_python"]
+    py_line = (
+        "  MISSING Python packages: " + ", ".join(missing_py)
+        if missing_py
+        else "  Python packages: OK"
+    )
+    print(py_line)
+    missing_sys = report["missing_system"]
+    if missing_sys:
+        print("  MISSING system tools  : " + ", ".join(missing_sys))
+    for warning in report["warnings"]:
+        print(f"  WARNING: {warning}")
+    ready = bool(report["ok"])
+    verdict = "READY" if ready else "NOT READY — install the above"
+    print("  => " + verdict)
+    return 0 if ready else 1
+
+
+def main(argv: list[str] | None = None) -> int:
+    """Command-line entry point; returns the process exit code.
+
+    With ``--check-deps`` only the dependency preflight runs and its result
+    becomes the exit code. Otherwise:
+
+    * exit 1 if the preflight reports missing Python packages;
+    * exit 2 if the corpus root is not an existing directory;
+    * else normalize every source, print a summary and exit 0. Per-source
+      errors are listed in the summary but do not change the exit code.
+
+    ``argv`` defaults to ``sys.argv[1:]`` (argparse behaviour) when None.
+    """
+    parser = argparse.ArgumentParser(description="Normalize mixed sources to .txt")
+    parser.add_argument("--corpus", default="data/carti-camp-jocuri",
+                        help="corpus root to walk")
+    parser.add_argument("--out", default="data/sources", help="output directory")
+    parser.add_argument("--check-deps", action="store_true",
+                        help="run dependency preflight and exit")
+    parser.add_argument("--ocr", action="store_true",
+                        help="include OCR (tesseract) in the preflight check")
+    args = parser.parse_args(argv)
+
+    if args.check_deps:
+        return print_preflight(preflight(check_ocr=args.ocr))
+
+    report = preflight(check_ocr=args.ocr)
+    # Only missing Python packages block the run; other findings are
+    # surfaced as warnings below.
+    if report["missing_python"]:
+        print_preflight(report)
+        return 1
+    for w in report["warnings"]:
+        print(f"WARNING: {w}")
+
+    corpus_root = Path(args.corpus)
+    if not corpus_root.is_dir():
+        # Fail early with a readable message: otherwise run() would create
+        # the output directory and then die with a traceback in iterdir().
+        print(f"ERROR: corpus root is not a directory: {corpus_root}",
+              file=sys.stderr)
+        return 2
+
+    summary = run(corpus_root, Path(args.out))
+    print(f"normalized : {summary['ok']}/{summary['total']}")
+    print(f"errors     : {summary['errors']}")
+    print(f"empty      : {summary['empty']}")
+    print(f"needs_review: {summary['needs_review']}")
+    # List every source that did not normalize cleanly.
+    for r in summary["results"]:
+        if r["status"] != "ok":
+            print(f"  [{r['status']}] {r['source']}")
+    return 0
+
+
+# Run the CLI when executed as a script; main()'s return value becomes the
+# process exit code.
+if __name__ == "__main__":
+    raise SystemExit(main())