Implements the approved plan to replace the broken regex/index-master extraction with an LLM-subagent pipeline. Four parallel lanes: Lane A — scripts/extract_common.py (PDF/docx/doc/pptx/html/zip, no max_pages truncation), normalize_sources.py, chunk_sources.py (~20pg chunks + overlap, manifest registry), activity_schema.json. Lane B — app/config_taxonomy.py (16 fixed category slugs), schema rebuilt from scratch in app/models/ with content_type, language, source_files, source_excerpt, normalized_name, extraction_confidence, needs_review; FTS5 + 3 triggers extended with materials_list and skills_developed. Lane C — build_database.py (--rebuild, atomic swap, schema + fuzzy source_excerpt validation, dedup with needs_review band), validate_extractions.py, review_queue.py, new run_extraction.py orchestrator, SUBAGENT_PROMPT.md. Lane D — search.py content_type/language filters (default search excludes non-game content), E7 schema-compat audit; fixed a NULL keywords AttributeError in _boost_search_relevance. Removes 8 orphaned/dead scripts and app/services/parser.py + indexer.py. Adds tests/ (70 passing, 1 skipped — libreoffice absent). Note: Lane D made one additive edit to app/models/database.py (_update_category_counts) to surface content_type/language in get_filter_options, outside its nominal lane boundary but after Lane B completed. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
256 lines
8.4 KiB
Python
256 lines
8.4 KiB
Python
#!/usr/bin/env python3
|
|
# -*- coding: utf-8 -*-
|
|
"""
|
|
normalize_sources.py — walk data/carti-camp-jocuri/ and write data/sources/<id>.txt.
|
|
|
|
Output files keep the existing header format:
|
|
|
|
SOURCE: <original relative path>
|
|
CONVERTED: <iso date>
|
|
FORMAT: <pdf|docx|doc|pptx|html-mirror|zip>
|
|
NEEDS_REVIEW: <reason> (optional — legacy .doc conversions)
|
|
==================================================
|
|
|
|
--- PAGE 1 ---
|
|
...
|
|
|
|
Each source gets a stable id = <8-hex hash of relative path>_<sanitized stem>,
|
|
so two files with the same name in different folders never collide.
|
|
|
|
The pipeline is script-only: this normalizes formats, it does NOT run extraction.
|
|
Run `--check-deps` before a long job.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import datetime as _dt
|
|
import hashlib
|
|
import re
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
SCRIPT_DIR = Path(__file__).resolve().parent
|
|
if str(SCRIPT_DIR) not in sys.path:
|
|
sys.path.insert(0, str(SCRIPT_DIR))
|
|
|
|
from extract_common import ( # noqa: E402
|
|
count_page_markers,
|
|
dedupe_texts,
|
|
detect_format,
|
|
extract_file,
|
|
extract_html,
|
|
is_junk,
|
|
join_pages,
|
|
preflight,
|
|
split_pages,
|
|
)
|
|
|
|
# Rule of 50 "=" characters; build_header() emits it as the last header line,
# separating the metadata fields from the extracted body text.
HEADER_RULE = "=" * 50
|
|
|
|
|
|
# --------------------------------------------------------------------------
|
|
# stable source id
|
|
# --------------------------------------------------------------------------
|
|
def sanitize_stem(stem: str) -> str:
    """Turn a file stem into a lower-case, underscore-separated slug.

    Every run of non-word characters becomes a single underscore, leading
    and trailing underscores are stripped, and the result is lower-cased and
    cut to at most 60 characters.  If nothing is left, ``"source"`` is
    returned so the caller always gets a non-empty name.
    """
    collapsed = re.sub(r"[^\w]+", "_", stem, flags=re.UNICODE)
    slug = collapsed.strip("_").lower()
    truncated = slug[:60]
    if truncated:
        return truncated
    return "source"
|
|
|
|
|
|
def stable_id(relative_path: str | Path) -> str:
    """Build the source id ``<8 hex chars>_<sanitized stem>`` for a path.

    The hash is the first 8 hex digits of the SHA-1 of the path text, taken
    after backslashes are replaced by forward slashes.  Because the hash
    covers the whole relative path, files that share a name but live in
    different folders get different ids.
    """
    posix_rel = str(relative_path).replace("\\", "/")
    path_hash = hashlib.sha1(posix_rel.encode("utf-8")).hexdigest()
    name_part = sanitize_stem(Path(posix_rel).stem)
    return "_".join((path_hash[:8], name_part))
|
|
|
|
|
|
# --------------------------------------------------------------------------
|
|
# header
|
|
# --------------------------------------------------------------------------
|
|
def build_header(
    source_rel: str, fmt: str, needs_review: str | None = None
) -> str:
    """Render the metadata header written at the top of an output file.

    The header holds ``SOURCE``, ``CONVERTED`` (today's date in ISO format)
    and ``FORMAT`` lines, an optional ``NEEDS_REVIEW`` line when
    ``needs_review`` is truthy, then ``HEADER_RULE``.  The returned text ends
    with a blank line so the body can be appended directly.
    """
    converted_on = _dt.date.today().isoformat()
    review_lines = [f"NEEDS_REVIEW: {needs_review}"] if needs_review else []
    header_lines = [
        f"SOURCE: {source_rel}",
        f"CONVERTED: {converted_on}",
        f"FORMAT: {fmt}",
        *review_lines,
        HEADER_RULE,
    ]
    return "\n".join(header_lines) + "\n\n"
|
|
|
|
|
|
# --------------------------------------------------------------------------
|
|
# mirror-site directories
|
|
# --------------------------------------------------------------------------
|
|
# Lower-case file suffixes that identify an HTML page inside a site mirror.
MIRROR_PAGE_EXTS = {".html", ".htm"}


def is_mirror_dir(path: Path) -> bool:
    """Tell whether ``path`` should be treated as a mirrored web site.

    Returns True only for a directory whose name does not end in ``_files``
    and that contains, at any depth, at least one regular file with an HTML
    suffix (compared case-insensitively).
    """
    if not path.is_dir() or path.name.endswith("_files"):
        return False
    for candidate in path.rglob("*"):
        if candidate.is_file() and candidate.suffix.lower() in MIRROR_PAGE_EXTS:
            return True
    return False
|
|
|
|
|
|
def normalize_mirror(mirror_dir: Path) -> str:
    """Extract the HTML pages of a mirror directory into one joined body.

    Pages are visited in sorted path order.  Each page is run through
    ``extract_html``; the text parts returned by ``split_pages`` are joined
    with newlines, and pages whose text is blank are dropped.  The remaining
    ``(relative path, text)`` pairs go through ``dedupe_texts`` and the
    surviving texts are combined with ``join_pages``.
    NOTE(review): the exact de-duplication and page-joining rules live in
    extract_common and are not visible here.

    Pages located under a directory whose name ends in ``_files`` (relative
    to ``mirror_dir``) are skipped, matching the suffix rule used by
    ``is_mirror_dir``.  A page whose extraction raises is skipped as well,
    with a warning on stderr so the loss is not silent.

    Args:
        mirror_dir: Root directory of the mirrored site.

    Returns:
        The text produced by ``join_pages`` for the retained pages.
    """
    pages: list[tuple[str, str]] = []
    for html in sorted(mirror_dir.rglob("*")):
        if not html.is_file() or html.suffix.lower() not in MIRROR_PAGE_EXTS:
            continue
        rel = html.relative_to(mirror_dir)
        # Only look at directories *inside* the mirror: an ancestor of the
        # mirror itself must not influence which pages are kept.
        if any(part.endswith("_files") for part in rel.parts[:-1]):
            continue
        try:
            body = extract_html(html)
        except Exception as exc:  # noqa: BLE001 - best effort, one bad page must not abort the mirror
            print(f"WARNING: skipped {html}: {exc}", file=sys.stderr)
            continue
        text = "\n".join(t for _, t in split_pages(body))
        if text.strip():
            pages.append((str(rel), text))
    pages = dedupe_texts(pages)
    return join_pages([t for _, t in pages])
|
|
|
|
|
|
# --------------------------------------------------------------------------
|
|
# one source
|
|
# --------------------------------------------------------------------------
|
|
def normalize_one(
    path: Path, corpus_root: Path, out_dir: Path
) -> dict | None:
    """Normalize one corpus entry and write ``<out_dir>/<id>.txt``.

    ``path`` may be a regular file or a mirror directory.  The entry is
    skipped (``None`` is returned) when it is a non-mirror directory, when
    ``is_junk`` flags it, or when ``detect_format`` reports ``unknown``,
    ``epub`` or ``txt``.

    Otherwise a result dict with ``id``, ``source`` and ``status`` is
    returned:

    * ``status == "error"``: ``extract_file`` raised; ``error`` holds the
      message and nothing is written.
    * ``status == "empty"``: the extracted body is blank; nothing is written.
    * ``status == "ok"``: header plus body were written as UTF-8; the dict
      also carries ``format``, ``pages`` and ``needs_review``.

    Files detected as ``doc`` get a NEEDS_REVIEW header line.
    """
    rel = path.relative_to(corpus_root)
    sid = stable_id(rel)
    source = str(rel)

    if not path.is_dir():
        if is_junk(path):
            return None
        fmt = detect_format(path)
        # epub duplicates PDFs; txt is not a source format here
        if fmt in ("unknown", "epub", "txt"):
            return None
        if fmt == "doc":
            needs_review = "legacy .doc conversion is imperfect"
        else:
            needs_review = None
        try:
            body = extract_file(path)
        except Exception as exc:  # noqa: BLE001
            return {
                "id": sid,
                "source": source,
                "status": "error",
                "error": str(exc),
            }
    elif is_mirror_dir(path):
        fmt = "html-mirror"
        needs_review = None
        body = normalize_mirror(path)
    else:
        return None

    if not body.strip():
        return {"id": sid, "source": source, "status": "empty"}

    document = build_header(source, fmt, needs_review) + body
    target = out_dir / f"{sid}.txt"
    target.write_text(document, encoding="utf-8")

    return {
        "id": sid,
        "source": source,
        "status": "ok",
        "format": fmt,
        "pages": count_page_markers(body),
        "needs_review": bool(needs_review),
    }
|
|
|
|
|
|
# --------------------------------------------------------------------------
|
|
# walk
|
|
# --------------------------------------------------------------------------
|
|
def iter_corpus_entries(corpus_root: Path):
    """Yield the entries of ``corpus_root`` that should be normalized.

    Only the immediate children of ``corpus_root`` are considered, in sorted
    order.  Names starting with ``.`` are ignored.  Files are always yielded;
    a directory is yielded only when ``is_mirror_dir`` accepts it, so other
    directories are skipped without being descended into.
    """
    for entry in sorted(corpus_root.iterdir()):
        hidden = entry.name.startswith(".")
        if hidden:
            continue
        if not entry.is_dir() or is_mirror_dir(entry):
            yield entry
|
|
|
|
|
|
def run(corpus_root: Path, out_dir: Path) -> dict:
    """Normalize every corpus entry and return a summary of the outcomes.

    ``out_dir`` is created if needed (including parents).  Entries for which
    ``normalize_one`` returns ``None`` are left out of the summary.

    The summary has the counters ``total``, ``ok``, ``errors``, ``empty`` and
    ``needs_review``, plus ``results`` holding the per-entry result dicts.
    """
    out_dir.mkdir(parents=True, exist_ok=True)

    attempts = (
        normalize_one(entry, corpus_root, out_dir)
        for entry in iter_corpus_entries(corpus_root)
    )
    results: list[dict] = [outcome for outcome in attempts if outcome is not None]

    def tally(status: str) -> int:
        # Number of results carrying exactly this status.
        return sum(1 for outcome in results if outcome["status"] == status)

    flagged = sum(1 for outcome in results if outcome.get("needs_review"))
    return {
        "total": len(results),
        "ok": tally("ok"),
        "errors": tally("error"),
        "empty": tally("empty"),
        "needs_review": flagged,
        "results": results,
    }
|
|
|
|
|
|
# --------------------------------------------------------------------------
|
|
# CLI
|
|
# --------------------------------------------------------------------------
|
|
def print_preflight(report: dict) -> int:
    """Print a dependency preflight report and return a process exit code.

    ``report`` is read through the keys ``missing_python``,
    ``missing_system``, ``warnings`` and ``ok``.  Missing Python packages are
    always reported (or confirmed OK); missing system tools are listed only
    when there are any.  Returns 0 when ``report["ok"]`` is truthy, else 1.
    """
    print("Dependency preflight")
    print("--------------------")

    missing_python = report["missing_python"]
    print(
        " MISSING Python packages: " + ", ".join(missing_python)
        if missing_python
        else " Python packages: OK"
    )

    missing_system = report["missing_system"]
    if missing_system:
        print(" MISSING system tools : " + ", ".join(missing_system))

    for warning in report["warnings"]:
        print(f" WARNING: {warning}")

    ready = report["ok"]
    verdict = "READY" if ready else "NOT READY — install the above"
    print(" => " + verdict)
    return 0 if ready else 1
|
|
|
|
|
|
def main(argv: list[str] | None = None) -> int:
|
|
parser = argparse.ArgumentParser(description="Normalize mixed sources to .txt")
|
|
parser.add_argument("--corpus", default="data/carti-camp-jocuri",
|
|
help="corpus root to walk")
|
|
parser.add_argument("--out", default="data/sources", help="output directory")
|
|
parser.add_argument("--check-deps", action="store_true",
|
|
help="run dependency preflight and exit")
|
|
parser.add_argument("--ocr", action="store_true",
|
|
help="include OCR (tesseract) in the preflight check")
|
|
args = parser.parse_args(argv)
|
|
|
|
if args.check_deps:
|
|
return print_preflight(preflight(check_ocr=args.ocr))
|
|
|
|
report = preflight(check_ocr=args.ocr)
|
|
if report["missing_python"]:
|
|
print_preflight(report)
|
|
return 1
|
|
for w in report["warnings"]:
|
|
print(f"WARNING: {w}")
|
|
|
|
summary = run(Path(args.corpus), Path(args.out))
|
|
print(f"normalized : {summary['ok']}/{summary['total']}")
|
|
print(f"errors : {summary['errors']}")
|
|
print(f"empty : {summary['empty']}")
|
|
print(f"needs_review: {summary['needs_review']}")
|
|
for r in summary["results"]:
|
|
if r["status"] != "ok":
|
|
print(f" [{r['status']}] {r['source']}")
|
|
return 0
|
|
|
|
|
|
# Script entry point: hand main()'s return value to the interpreter as the
# process exit status.
if __name__ == "__main__":
    sys.exit(main())
|