Files
game-library/scripts/normalize_sources.py
Claude Agent 66ae831c36 Rebuild extraction pipeline infrastructure (Faza 0 prep)
Implements the approved plan to replace the broken regex/index-master
extraction with an LLM-subagent pipeline. Four parallel lanes:

Lane A — scripts/extract_common.py (PDF/docx/doc/pptx/html/zip, no
  max_pages truncation), normalize_sources.py, chunk_sources.py
  (~20pg chunks + overlap, manifest registry), activity_schema.json.
Lane B — app/config_taxonomy.py (16 fixed category slugs), schema
  rebuilt from scratch in app/models/ with content_type, language,
  source_files, source_excerpt, normalized_name, extraction_confidence,
  needs_review; FTS5 + 3 triggers extended with materials_list and
  skills_developed.
Lane C — build_database.py (--rebuild, atomic swap, schema + fuzzy
  source_excerpt validation, dedup with needs_review band),
  validate_extractions.py, review_queue.py, new run_extraction.py
  orchestrator, SUBAGENT_PROMPT.md.
Lane D — search.py content_type/language filters (default search
  excludes non-game content), E7 schema-compat audit; fixed a NULL
  keywords AttributeError in _boost_search_relevance.

Removes 8 orphaned/dead scripts and app/services/parser.py +
indexer.py. Adds tests/ (70 passing, 1 skipped — libreoffice absent).

Note: Lane D made one additive edit to app/models/database.py
(_update_category_counts) to surface content_type/language in
get_filter_options, outside its nominal lane boundary but after
Lane B completed.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-19 17:43:38 +00:00

256 lines
8.4 KiB
Python

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
normalize_sources.py — walk data/carti-camp-jocuri/ and write data/sources/<id>.txt.
Output files keep the existing header format:
SOURCE: <original relative path>
CONVERTED: <iso date>
FORMAT: <pdf|docx|doc|pptx|html-mirror|zip>
NEEDS_REVIEW: <reason> (optional — legacy .doc conversions)
==================================================
--- PAGE 1 ---
...
Each source gets a stable id = <8-hex hash of relative path>_<sanitized stem>,
so two files with the same name in different folders never collide.
The pipeline is script-only: this normalizes formats, it does NOT run extraction.
Run `--check-deps` before a long job.
"""
from __future__ import annotations
import argparse
import datetime as _dt
import hashlib
import re
import sys
from pathlib import Path
SCRIPT_DIR = Path(__file__).resolve().parent
if str(SCRIPT_DIR) not in sys.path:
sys.path.insert(0, str(SCRIPT_DIR))
from extract_common import ( # noqa: E402
count_page_markers,
dedupe_texts,
detect_format,
extract_file,
extract_html,
is_junk,
join_pages,
preflight,
split_pages,
)
HEADER_RULE = "=" * 50
# --------------------------------------------------------------------------
# stable source id
# --------------------------------------------------------------------------
def sanitize_stem(stem: str) -> str:
    """Turn a file stem into a lowercase, underscore-separated slug.

    Each run of non-word characters collapses into a single underscore,
    leading and trailing underscores are removed, and the result is capped
    at 60 characters. If nothing is left, ``"source"`` is returned.
    """
    collapsed = re.compile(r"[^\w]+", re.UNICODE).sub("_", stem)
    slug = collapsed.strip("_").lower()[:60]
    if not slug:
        return "source"
    return slug
def stable_id(relative_path: str | Path) -> str:
    """Build the source id ``<8 hex chars>_<sanitized stem>`` for a corpus path.

    The hex prefix is the first 8 characters of the SHA-1 of the path text
    (with backslashes rewritten to ``/``), so files sharing a name but living
    in different folders receive different ids.
    """
    posix_rel = str(relative_path).replace("\\", "/")
    prefix = hashlib.sha1(posix_rel.encode("utf-8")).hexdigest()[:8]
    return "_".join((prefix, sanitize_stem(Path(posix_rel).stem)))
# --------------------------------------------------------------------------
# header
# --------------------------------------------------------------------------
def build_header(
    source_rel: str, fmt: str, needs_review: str | None = None
) -> str:
    """Render the metadata header written at the top of a normalized source.

    The header lists SOURCE, CONVERTED (today's ISO date) and FORMAT, adds a
    NEEDS_REVIEW line only when *needs_review* is truthy, and ends with the
    HEADER_RULE separator followed by one blank line.
    """
    stamp = _dt.date.today().isoformat()
    fields = [
        f"SOURCE: {source_rel}",
        f"CONVERTED: {stamp}",
        f"FORMAT: {fmt}",
    ]
    if needs_review:
        fields += [f"NEEDS_REVIEW: {needs_review}"]
    # Every line, including the rule, is newline-terminated; the final extra
    # newline leaves a blank line between header and body.
    return "".join(f"{line}\n" for line in (*fields, HEADER_RULE)) + "\n"
# --------------------------------------------------------------------------
# mirror-site directories
# --------------------------------------------------------------------------
MIRROR_PAGE_EXTS = {".html", ".htm"}


def is_mirror_dir(path: Path) -> bool:
    """Tell whether *path* should be treated as a site-mirror directory.

    A mirror is a directory, not named ``*_files``, that contains at least
    one regular file with an HTML extension (matched case-insensitively)
    anywhere beneath it.
    """
    if not path.is_dir() or path.name.endswith("_files"):
        return False
    for candidate in path.rglob("*"):
        if candidate.suffix.lower() in MIRROR_PAGE_EXTS and candidate.is_file():
            return True
    return False
def normalize_mirror(mirror_dir: Path) -> str:
    """Extract every HTML page in a mirror dir, dedupe, and join the texts.

    Pages are visited in sorted path order. Any page located below a
    directory whose name ends with ``_files`` (the asset-folder convention
    that ``is_mirror_dir`` also rejects) is ignored. A page whose extraction
    raises is skipped with a warning on stderr so one bad page cannot abort
    the whole mirror.

    Args:
        mirror_dir: root directory of the mirrored site.

    Returns:
        The result of ``join_pages`` over the page texts that survive
        ``dedupe_texts``.
    """
    pages: list[tuple[str, str]] = []
    for html in sorted(mirror_dir.rglob("*")):
        if not html.is_file() or html.suffix.lower() not in MIRROR_PAGE_EXTS:
            continue
        rel = html.relative_to(mirror_dir)
        # Look only at directory components below mirror_dir: an exact
        # "_files" match on the absolute path misses "<page>_files" folders
        # and would drop everything if an ancestor happened to be "_files".
        if any(part.endswith("_files") for part in rel.parts[:-1]):
            continue
        try:
            body = extract_html(html)
        except Exception as exc:  # noqa: BLE001 - best effort per page
            print(f"WARNING: skipped {html}: {exc}", file=sys.stderr)
            continue
        # Flatten the page-marked body into plain text for this HTML page.
        text = "\n".join(t for _, t in split_pages(body))
        if text.strip():
            pages.append((str(rel), text))
    pages = dedupe_texts(pages)
    return join_pages([t for _, t in pages])
# --------------------------------------------------------------------------
# one source
# --------------------------------------------------------------------------
def normalize_one(
    path: Path, corpus_root: Path, out_dir: Path
) -> dict | None:
    """
    Normalize a single file or mirror directory → data/sources/<id>.txt.

    Args:
        path: file or directory inside *corpus_root*.
        corpus_root: corpus root; the source id and the SOURCE header line are
            derived from *path* relative to it.
        out_dir: directory that receives ``<id>.txt``.

    Returns:
        None if the entry was skipped (non-mirror directory, junk file, or an
        unknown/epub/txt format). Otherwise a result dict with ``id``,
        ``source`` and ``status``: ``"error"`` (plus ``error``) when
        extraction raised, ``"empty"`` when no text was produced, or ``"ok"``
        (plus ``format``, ``pages``, ``needs_review``) once the file has been
        written.
    """
    rel = path.relative_to(corpus_root)
    sid = stable_id(rel)
    if path.is_dir():
        if not is_mirror_dir(path):
            return None
        fmt, needs_review = "html-mirror", None
        extract = normalize_mirror
    else:
        if is_junk(path):
            return None
        fmt = detect_format(path)
        if fmt in ("unknown", "epub", "txt"):
            return None  # epub duplicates PDFs; txt is not a source format here
        needs_review = "legacy .doc conversion is imperfect" if fmt == "doc" else None
        extract = extract_file
    # Files and mirror directories share one guarded call so that a failing
    # mirror is reported as an error result instead of aborting the run.
    try:
        body = extract(path)
    except Exception as exc:  # noqa: BLE001
        return {"id": sid, "source": str(rel), "status": "error", "error": str(exc)}
    if not body.strip():
        return {"id": sid, "source": str(rel), "status": "empty"}
    out_path = out_dir / f"{sid}.txt"
    out_path.write_text(build_header(str(rel), fmt, needs_review) + body,
                        encoding="utf-8")
    return {
        "id": sid,
        "source": str(rel),
        "status": "ok",
        "format": fmt,
        "pages": count_page_markers(body),
        "needs_review": bool(needs_review),
    }
# --------------------------------------------------------------------------
# walk
# --------------------------------------------------------------------------
def iter_corpus_entries(corpus_root: Path):
    """Yield the processable direct children of the corpus root, in sorted order.

    Entries whose name starts with "." are skipped. Non-directories are
    yielded as they are; a directory is yielded only when ``is_mirror_dir``
    accepts it. Directories are never descended into here.
    """
    for child in sorted(corpus_root.iterdir()):
        if child.name.startswith("."):
            continue
        if not child.is_dir() or is_mirror_dir(child):
            yield child
def run(corpus_root: Path, out_dir: Path) -> dict:
    """Normalize every corpus entry and return a summary of the outcomes.

    Creates *out_dir* if needed, runs ``normalize_one`` on each entry from
    ``iter_corpus_entries`` and keeps the non-None results. The summary holds
    the counts ``total``, ``ok``, ``errors``, ``empty`` and ``needs_review``
    plus the full ``results`` list.
    """
    out_dir.mkdir(parents=True, exist_ok=True)
    outcomes: list[dict] = [
        outcome
        for outcome in (
            normalize_one(entry, corpus_root, out_dir)
            for entry in iter_corpus_entries(corpus_root)
        )
        if outcome is not None
    ]

    def tally(status: str) -> int:
        # Number of results carrying exactly this status.
        return len([o for o in outcomes if o["status"] == status])

    return {
        "total": len(outcomes),
        "ok": tally("ok"),
        "errors": tally("error"),
        "empty": tally("empty"),
        "needs_review": len([o for o in outcomes if o.get("needs_review")]),
        "results": outcomes,
    }
# --------------------------------------------------------------------------
# CLI
# --------------------------------------------------------------------------
def print_preflight(report: dict) -> int:
    """Print a dependency preflight report and return a process exit code.

    Reads the ``missing_python``, ``missing_system``, ``warnings`` and ``ok``
    keys of *report*. Returns 0 when ``report["ok"]`` is truthy, otherwise 1.
    """
    print("Dependency preflight")
    print("--------------------")
    python_gaps = report["missing_python"]
    print(
        " MISSING Python packages: " + ", ".join(python_gaps)
        if python_gaps
        else " Python packages: OK"
    )
    system_gaps = report["missing_system"]
    if system_gaps:
        print(" MISSING system tools : " + ", ".join(system_gaps))
    for warning in report["warnings"]:
        print(f" WARNING: {warning}")
    ready = report["ok"]
    verdict = "READY" if ready else "NOT READY — install the above"
    print(" => " + verdict)
    return 0 if ready else 1
def main(argv: list[str] | None = None) -> int:
    """Command-line entry point; returns the process exit code.

    With ``--check-deps`` only the dependency preflight is printed and its
    result returned. Otherwise the run is refused (exit code 1) when Python
    packages are missing or when the corpus directory does not exist; else
    the corpus is normalized and a summary is printed.

    Args:
        argv: argument list to parse; None lets argparse read ``sys.argv``.

    Returns:
        0 after a completed run (even if individual sources errored or were
        empty), the preflight result for ``--check-deps``, or 1 when the run
        could not start.
    """
    parser = argparse.ArgumentParser(description="Normalize mixed sources to .txt")
    parser.add_argument("--corpus", default="data/carti-camp-jocuri",
                        help="corpus root to walk")
    parser.add_argument("--out", default="data/sources", help="output directory")
    parser.add_argument("--check-deps", action="store_true",
                        help="run dependency preflight and exit")
    parser.add_argument("--ocr", action="store_true",
                        help="include OCR (tesseract) in the preflight check")
    args = parser.parse_args(argv)

    report = preflight(check_ocr=args.ocr)
    if args.check_deps:
        return print_preflight(report)
    if report["missing_python"]:
        print_preflight(report)
        return 1
    for w in report["warnings"]:
        print(f"WARNING: {w}")

    corpus_root = Path(args.corpus)
    if not corpus_root.is_dir():
        # Fail early with a clear message: run() would otherwise create the
        # output directory and then die with a traceback from iterdir().
        print(f"ERROR: corpus directory not found: {corpus_root}", file=sys.stderr)
        return 1

    summary = run(corpus_root, Path(args.out))
    print(f"normalized : {summary['ok']}/{summary['total']}")
    print(f"errors : {summary['errors']}")
    print(f"empty : {summary['empty']}")
    print(f"needs_review: {summary['needs_review']}")
    # List every source that did not normalize cleanly.
    for r in summary["results"]:
        if r["status"] != "ok":
            print(f" [{r['status']}] {r['source']}")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())