Rebuild extraction pipeline infrastructure (Faza 0 prep)
Implements the approved plan to replace the broken regex/index-master extraction with an LLM-subagent pipeline. Four parallel lanes: Lane A — scripts/extract_common.py (PDF/docx/doc/pptx/html/zip, no max_pages truncation), normalize_sources.py, chunk_sources.py (~20pg chunks + overlap, manifest registry), activity_schema.json. Lane B — app/config_taxonomy.py (16 fixed category slugs), schema rebuilt from scratch in app/models/ with content_type, language, source_files, source_excerpt, normalized_name, extraction_confidence, needs_review; FTS5 + 3 triggers extended with materials_list and skills_developed. Lane C — build_database.py (--rebuild, atomic swap, schema + fuzzy source_excerpt validation, dedup with needs_review band), validate_extractions.py, review_queue.py, new run_extraction.py orchestrator, SUBAGENT_PROMPT.md. Lane D — search.py content_type/language filters (default search excludes non-game content), E7 schema-compat audit; fixed a NULL keywords AttributeError in _boost_search_relevance. Removes 8 orphaned/dead scripts and app/services/parser.py + indexer.py. Adds tests/ (70 passing, 1 skipped — libreoffice absent). Note: Lane D made one additive edit to app/models/database.py (_update_category_counts) to surface content_type/language in get_filter_options, outside its nominal lane boundary but after Lane B completed. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
255
scripts/normalize_sources.py
Normal file
255
scripts/normalize_sources.py
Normal file
@@ -0,0 +1,255 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
normalize_sources.py — walk data/carti-camp-jocuri/ and write data/sources/<id>.txt.
|
||||
|
||||
Output files keep the existing header format:
|
||||
|
||||
SOURCE: <original relative path>
|
||||
CONVERTED: <iso date>
|
||||
FORMAT: <pdf|docx|doc|pptx|html-mirror|zip>
|
||||
NEEDS_REVIEW: <reason> (optional — legacy .doc conversions)
|
||||
==================================================
|
||||
|
||||
--- PAGE 1 ---
|
||||
...
|
||||
|
||||
Each source gets a stable id = <8-hex hash of relative path>_<sanitized stem>,
|
||||
so two files with the same name in different folders never collide.
|
||||
|
||||
The pipeline is script-only: this normalizes formats, it does NOT run extraction.
|
||||
Run `--check-deps` before a long job.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import datetime as _dt
|
||||
import hashlib
|
||||
import re
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
SCRIPT_DIR = Path(__file__).resolve().parent
|
||||
if str(SCRIPT_DIR) not in sys.path:
|
||||
sys.path.insert(0, str(SCRIPT_DIR))
|
||||
|
||||
from extract_common import ( # noqa: E402
|
||||
count_page_markers,
|
||||
dedupe_texts,
|
||||
detect_format,
|
||||
extract_file,
|
||||
extract_html,
|
||||
is_junk,
|
||||
join_pages,
|
||||
preflight,
|
||||
split_pages,
|
||||
)
|
||||
|
||||
# Separator line (50 "=" characters) that build_header() appends as the last
# header line, right before the blank line that precedes the body text.
HEADER_RULE = "=" * 50
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# stable source id
|
||||
# --------------------------------------------------------------------------
|
||||
def sanitize_stem(stem: str) -> str:
    """Turn a file stem into a short, lowercase, filename-safe slug.

    Each run of non-word characters is collapsed into one underscore,
    leading and trailing underscores are stripped, the text is lowercased
    and then cut to at most 60 characters.  When nothing is left the
    literal ``"source"`` is returned instead.
    """
    collapsed = re.sub(r"[^\w]+", "_", stem, flags=re.UNICODE)
    slug = collapsed.strip("_").lower()[:60]
    if not slug:
        return "source"
    return slug
|
||||
|
||||
|
||||
def stable_id(relative_path: str | Path) -> str:
    """Build the source id ``<8-hex digest>_<sanitized stem>`` for a path.

    Backslashes are converted to forward slashes before hashing, so the
    same relative path spelled with either separator yields the same id.
    The digest is the first 8 hex characters of the SHA-1 of that
    normalized path (UTF-8 encoded); the stem part comes from
    ``sanitize_stem``.
    """
    posix_rel = str(relative_path).replace("\\", "/")
    short_hash = hashlib.sha1(posix_rel.encode("utf-8")).hexdigest()[:8]
    slug = sanitize_stem(Path(posix_rel).stem)
    return f"{short_hash}_{slug}"
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# header
|
||||
# --------------------------------------------------------------------------
|
||||
def build_header(
    source_rel: str, fmt: str, needs_review: str | None = None
) -> str:
    """Render the metadata header placed at the top of a normalized file.

    The header lists ``SOURCE``, ``CONVERTED`` (today's date in ISO format)
    and ``FORMAT``, then an optional ``NEEDS_REVIEW`` line when
    ``needs_review`` is a non-empty string, then ``HEADER_RULE``.  The
    returned text ends with a blank line so the body can be appended
    directly.
    """
    converted_on = _dt.date.today().isoformat()
    # The review line is only present when a reason was supplied.
    review_lines = [f"NEEDS_REVIEW: {needs_review}"] if needs_review else []
    header_lines = [
        f"SOURCE: {source_rel}",
        f"CONVERTED: {converted_on}",
        f"FORMAT: {fmt}",
        *review_lines,
        HEADER_RULE,
    ]
    return "\n".join(header_lines) + "\n\n"
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# mirror-site directories
|
||||
# --------------------------------------------------------------------------
|
||||
# Lowercased file suffixes that count as HTML pages when detecting and
# extracting site-mirror directories.
MIRROR_PAGE_EXTS = {".html", ".htm"}
|
||||
|
||||
|
||||
def is_mirror_dir(path: Path) -> bool:
    """Tell whether ``path`` is a directory that holds at least one HTML page.

    Non-directories and directories whose name ends in ``_files`` are never
    mirrors.  Otherwise the tree is searched recursively and the first
    regular file with a suffix in ``MIRROR_PAGE_EXTS`` (case-insensitive)
    makes the directory a mirror.
    """
    if not path.is_dir() or path.name.endswith("_files"):
        return False
    for candidate in path.rglob("*"):
        if candidate.is_file() and candidate.suffix.lower() in MIRROR_PAGE_EXTS:
            return True
    return False
|
||||
|
||||
|
||||
def normalize_mirror(mirror_dir: Path) -> str:
    """Extract every HTML page in a mirror dir, dedupe, and join the texts.

    Pages are visited in sorted path order.  A page is skipped when any
    directory between ``mirror_dir`` and the page has a name ending in
    ``_files`` — the same suffix rule ``is_mirror_dir`` applies — so HTML
    fragments stored in browser asset folders are not treated as content.
    Extraction is best-effort per page: a page whose extraction raises is
    reported on stderr and skipped, it does not abort the mirror.

    Args:
        mirror_dir: Root directory of the mirrored site.

    Returns:
        The result of ``join_pages`` over the texts kept by
        ``dedupe_texts``.
    """
    pages: list[tuple[str, str]] = []
    for html in sorted(mirror_dir.rglob("*")):
        if not html.is_file() or html.suffix.lower() not in MIRROR_PAGE_EXTS:
            continue
        rel_page = html.relative_to(mirror_dir)
        # Look only at directories *inside* the mirror (not ancestors of the
        # mirror itself, and not the file name) and match by suffix: an
        # exact "_files" membership test would miss "index_files/" etc.
        if any(part.endswith("_files") for part in rel_page.parts[:-1]):
            continue
        try:
            body = extract_html(html)
        except Exception as exc:  # noqa: BLE001 - deliberate best-effort per page
            # Keep going, but leave a trace so dropped pages are not invisible.
            print(f"WARNING: skipped mirror page {html}: {exc}", file=sys.stderr)
            continue
        # split_pages yields 2-tuples; only the second item (the text) is
        # used, so the page is flattened back into one string.
        text = "\n".join(t for _, t in split_pages(body))
        if text.strip():
            pages.append((str(rel_page), text))
    pages = dedupe_texts(pages)
    return join_pages([t for _, t in pages])
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# one source
|
||||
# --------------------------------------------------------------------------
|
||||
def normalize_one(
    path: Path, corpus_root: Path, out_dir: Path
) -> dict | None:
    """Normalize a single file or mirror directory into ``<out_dir>/<id>.txt``.

    Args:
        path: File or directory located under ``corpus_root``.
        corpus_root: Corpus root; the id and the ``SOURCE`` header are
            derived from ``path`` relative to it.
        out_dir: Existing directory that receives the ``.txt`` output.

    Returns:
        ``None`` when the entry is skipped: a directory that is not a
        mirror, a junk file, or a file whose detected format is
        ``unknown``, ``epub`` or ``txt``.  Otherwise a dict with ``id``,
        ``source`` and ``status``:

        * ``"error"`` — extraction raised; ``error`` holds the message.
        * ``"empty"`` — extraction produced only whitespace; nothing is
          written.
        * ``"ok"`` — the file was written; ``format``, ``pages`` (the value
          of ``count_page_markers(body)``) and ``needs_review`` are added.
    """
    rel = path.relative_to(corpus_root)
    sid = stable_id(rel)

    if path.is_dir():
        if not is_mirror_dir(path):
            return None
        fmt, needs_review = "html-mirror", None
        extractor = normalize_mirror
    else:
        if is_junk(path):
            return None
        fmt = detect_format(path)
        if fmt in ("unknown", "epub", "txt"):
            return None  # epub duplicates PDFs; txt is not a source format here
        needs_review = "legacy .doc conversion is imperfect" if fmt == "doc" else None
        extractor = extract_file

    # Both branches go through the same guard so that a failing mirror is
    # reported as one "error" entry, exactly like a failing file, instead of
    # aborting the whole corpus run.
    try:
        body = extractor(path)
    except Exception as exc:  # noqa: BLE001
        return {"id": sid, "source": str(rel), "status": "error", "error": str(exc)}

    if not body.strip():
        return {"id": sid, "source": str(rel), "status": "empty"}

    out_path = out_dir / f"{sid}.txt"
    out_path.write_text(build_header(str(rel), fmt, needs_review) + body,
                        encoding="utf-8")
    return {
        "id": sid,
        "source": str(rel),
        "status": "ok",
        "format": fmt,
        "pages": count_page_markers(body),
        "needs_review": bool(needs_review),
    }
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# walk
|
||||
# --------------------------------------------------------------------------
|
||||
def iter_corpus_entries(corpus_root: Path):
    """Yield the direct children of the corpus root that should be processed.

    Children are visited in sorted order.  Names starting with ``.`` are
    skipped.  Every remaining non-directory entry is yielded; a directory
    is yielded only when ``is_mirror_dir`` accepts it, so other
    sub-directories are not descended into.
    """
    for child in sorted(corpus_root.iterdir()):
        if child.name.startswith("."):
            continue
        # Files always pass; directories only when they are site mirrors.
        if not child.is_dir() or is_mirror_dir(child):
            yield child
|
||||
|
||||
|
||||
def run(corpus_root: Path, out_dir: Path) -> dict:
    """Normalize every corpus entry and return a summary of the outcomes.

    ``out_dir`` is created if needed.  Entries for which ``normalize_one``
    returns ``None`` are left out.  The summary holds the counters
    ``total``, ``ok``, ``errors``, ``empty`` and ``needs_review`` plus the
    full ``results`` list.
    """
    out_dir.mkdir(parents=True, exist_ok=True)

    outcomes = (
        normalize_one(entry, corpus_root, out_dir)
        for entry in iter_corpus_entries(corpus_root)
    )
    results: list[dict] = [outcome for outcome in outcomes if outcome is not None]

    def _count(status: str) -> int:
        """Number of results carrying the given status."""
        return sum(1 for outcome in results if outcome["status"] == status)

    return {
        "total": len(results),
        "ok": _count("ok"),
        "errors": _count("error"),
        "empty": _count("empty"),
        "needs_review": sum(1 for outcome in results if outcome.get("needs_review")),
        "results": results,
    }
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# CLI
|
||||
# --------------------------------------------------------------------------
|
||||
def print_preflight(report: dict) -> int:
    """Print a dependency preflight report and return a process exit code.

    Reads the ``missing_python``, ``missing_system``, ``warnings`` and
    ``ok`` keys of ``report``.  Returns ``0`` when ``report["ok"]`` is
    truthy, ``1`` otherwise.
    """
    print("Dependency preflight")
    print("--------------------")

    missing_py = report["missing_python"]
    if not missing_py:
        print(" Python packages: OK")
    else:
        print(" MISSING Python packages: " + ", ".join(missing_py))

    missing_sys = report["missing_system"]
    if missing_sys:
        print(" MISSING system tools : " + ", ".join(missing_sys))

    for warning in report["warnings"]:
        print(f" WARNING: {warning}")

    ready = report["ok"]
    verdict = "READY" if ready else "NOT READY — install the above"
    print(" => " + verdict)
    if ready:
        return 0
    return 1
|
||||
|
||||
|
||||
def main(argv: list[str] | None = None) -> int:
    """Command-line entry point: preflight, normalize the corpus, report.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read
            ``sys.argv``.

    Returns:
        * with ``--check-deps``: the exit code of ``print_preflight``;
        * ``1`` when required Python packages are missing;
        * ``2`` when ``--corpus`` is not an existing directory;
        * ``0`` after a completed run (per-source errors are listed in the
          output but do not change the exit code).
    """
    parser = argparse.ArgumentParser(description="Normalize mixed sources to .txt")
    parser.add_argument("--corpus", default="data/carti-camp-jocuri",
                        help="corpus root to walk")
    parser.add_argument("--out", default="data/sources", help="output directory")
    parser.add_argument("--check-deps", action="store_true",
                        help="run dependency preflight and exit")
    parser.add_argument("--ocr", action="store_true",
                        help="include OCR (tesseract) in the preflight check")
    args = parser.parse_args(argv)

    report = preflight(check_ocr=args.ocr)
    if args.check_deps:
        return print_preflight(report)

    # Only missing Python packages block a run; other findings are warnings.
    if report["missing_python"]:
        print_preflight(report)
        return 1
    for w in report["warnings"]:
        print(f"WARNING: {w}")

    # Fail with a clear message *before* run() creates the output directory,
    # instead of surfacing a FileNotFoundError traceback from iterdir().
    corpus_root = Path(args.corpus)
    if not corpus_root.is_dir():
        print(f"ERROR: corpus root is not a directory: {corpus_root}",
              file=sys.stderr)
        return 2

    summary = run(corpus_root, Path(args.out))
    print(f"normalized : {summary['ok']}/{summary['total']}")
    print(f"errors : {summary['errors']}")
    print(f"empty : {summary['empty']}")
    print(f"needs_review: {summary['needs_review']}")
    for r in summary["results"]:
        if r["status"] != "ok":
            # Surface the captured exception text so failures are actionable.
            reason = f": {r['error']}" if r.get("error") else ""
            print(f" [{r['status']}] {r['source']}{reason}")
    return 0
|
||||
|
||||
|
||||
# Script entry point: hand main()'s return value to the interpreter as the
# process exit status.
if __name__ == "__main__":
    sys.exit(main())
|
||||
Reference in New Issue
Block a user