Rebuild extraction pipeline infrastructure (Faza 0 prep)

Implements the approved plan to replace the broken regex/index-master extraction with an LLM-subagent pipeline. Four parallel lanes: Lane A — scripts/extract_common.py (PDF/docx/doc/pptx/html/zip, no max_pages truncation), normalize_sources.py, chunk_sources.py (~20pg chunks + overlap, manifest registry), activity_schema.json. Lane B — app/config_taxonomy.py (16 fixed category slugs), schema rebuilt from scratch in app/models/ with content_type, language, source_files, source_excerpt, normalized_name, extraction_confidence, needs_review; FTS5 + 3 triggers extended with materials_list and skills_developed. Lane C — build_database.py (--rebuild, atomic swap, schema + fuzzy source_excerpt validation, dedup with needs_review band), validate_extractions.py, review_queue.py, new run_extraction.py orchestrator, SUBAGENT_PROMPT.md. Lane D — search.py content_type/language filters (default search excludes non-game content), E7 schema-compat audit; fixed a NULL keywords AttributeError in _boost_search_relevance. Removes 8 orphaned/dead scripts and app/services/parser.py + indexer.py. Adds tests/ (70 passing, 1 skipped — libreoffice absent). Note: Lane D made one additive edit to app/models/database.py (_update_category_counts) to surface content_type/language in get_filter_options, outside its nominal lane boundary but after Lane B completed. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-19 17:43:38 +00:00
parent e0080edf85
commit 66ae831c36
37 changed files with 4101 additions and 1881 deletions
--- a/tests/test_extract_common.py
+++ b/tests/test_extract_common.py
@@ -0,0 +1,177 @@
+# -*- coding: utf-8 -*-
+"""Tests for scripts/extract_common.py."""
+
+import shutil
+import zipfile
+
+import pytest
+
+import extract_common as ec
+
+
+# --------------------------------------------------------------------------
+# format detection
+# --------------------------------------------------------------------------
+def test_detect_format():
+    """Each known extension maps to its format tag; others are "unknown"."""
+    expected = {
+        "a/b/file.PDF": "pdf", "x.docx": "docx", "x.doc": "doc",
+        "x.pptx": "pptx", "x.html": "html", "x.zip": "zip",
+        "x.epub": "epub", "x.xyz": "unknown",
+    }
+    for path, fmt in expected.items():
+        assert ec.detect_format(path) == fmt, path
+
+
+def test_is_junk():
+    """desktop.ini, .bak and README.md count as junk; a source PDF does not."""
+    junk_paths = ("some/desktop.ini", "notes.bak", "README.md")
+    assert all(ec.is_junk(path) for path in junk_paths)
+    assert not ec.is_junk("1000 Scout Games.pdf")
+
+
+# --------------------------------------------------------------------------
+# PDF — the critical "no max_pages" regression
+# --------------------------------------------------------------------------
+def test_pdf_extracts_all_60_pages(big_pdf):
+    """Page 60 must be extracted (the old converter capped at 50 pages)."""
+    text = ec.extract_pdf(big_pdf)
+    for needle in ("--- PAGE 60 ---", "PDFMARK-60"):
+        assert needle in text
+    assert ec.count_page_markers(text) == 60
+
+
+def test_pdf_does_not_truncate_mid_document(big_pdf):
+    """The last page marker in the extracted body is the real last page."""
+    last_marker = ec.split_pages(ec.extract_pdf(big_pdf))[-1]
+    assert last_marker[0] == 60
+
+
+# --------------------------------------------------------------------------
+# page join / split round-trip
+# --------------------------------------------------------------------------
+def test_join_split_round_trip():
+    """split_pages() recovers page numbers and texts written by join_pages()."""
+    pages = ec.split_pages(ec.join_pages(["alpha", "beta", "gamma"]))
+    assert [number for number, _ in pages] == [1, 2, 3]
+    assert [text for _, text in pages] == ["alpha", "beta", "gamma"]
+
+
+def test_split_pages_no_markers_returns_empty():
+    assert ec.split_pages("plain text with no markers") == []  # no markers, no pages
+
+
+# --------------------------------------------------------------------------
+# docx — synthetic page markers
+# --------------------------------------------------------------------------
+def test_docx_synthetic_page_markers(sample_docx):
+    """100 paragraphs at 40 per synthetic page yield 3 page markers."""
+    text = ec.extract_docx(sample_docx)
+    assert ec.count_page_markers(text) == 3
+    assert "Paragraf 99" in text  # text from the final page is kept too
+
+
+# --------------------------------------------------------------------------
+# HTML mirror — nav/script/footer stripped
+# --------------------------------------------------------------------------
+def test_html_strips_chrome(html_with_nav):
+    """Article text survives extraction; nav/script/footer chrome does not."""
+    text = ec.extract_html(html_with_nav)
+    for kept in ("Vanatoarea de comori", "joc real de orientare"):
+        assert kept in text
+    # chrome must be gone
+    stripped = ("tracking", "Site Banner Junk", "toate drepturile rezervate", "Games")
+    for chrome in stripped:
+        assert chrome not in text
+
+
+# --------------------------------------------------------------------------
+# content hash + near-duplicate elimination
+# --------------------------------------------------------------------------
+def test_content_hash_ignores_whitespace():
+    assert ec.content_hash("hello  world") == ec.content_hash("hello world\n")  # spacing ignored
+    assert ec.content_hash("hello world") != ec.content_hash("goodbye world")  # words matter
+
+
+def test_dedupe_exact_duplicates():
+    """Of two identical texts only the first key survives, order preserved."""
+    survivors = ec.dedupe_texts([("a", "joc identic"), ("b", "joc identic"), ("c", "alt joc")])
+    assert [key for key, _ in survivors] == ["a", "c"]
+
+
+def test_dedupe_near_duplicates():
+    """A copy with a short appended suffix is dropped in favour of the original."""
+    base = "Vanatoarea de comori este un joc de orientare pentru cercetasi in tabara."
+    near = base + " Pagina printata."  # 17 chars added to 73: roughly 90% similar
+    items = [("orig", base), ("print", near), ("other", "Cu totul alt continut diferit aici.")]
+    keys = [key for key, _ in ec.dedupe_texts(items, threshold=85.0)]
+    assert "orig" in keys
+    assert "print" not in keys
+    assert "other" in keys
+
+
+# --------------------------------------------------------------------------
+# zip recursion
+# --------------------------------------------------------------------------
+def test_zip_recurses_into_inner_files(sample_zip):
+    extracted = ec.extract_zip(sample_zip)
+    assert "Paragraf 0" in extracted  # presumably text of a file inside the fixture zip
+    assert ec.count_page_markers(extracted) > 0  # page markers are present
+
+
+def test_zip_bad_archive_returns_empty(tmp_path):
+    corrupt = tmp_path / "broken.zip"
+    corrupt.write_bytes(b"not a zip")  # plain ASCII bytes, not a zip container
+    assert ec.extract_zip(corrupt) == ""  # empty text is returned, nothing raised
+
+
+def test_nested_zip(tmp_path, sample_zip):
+    """A zip stored inside another zip (under a subfolder) is still extracted."""
+    wrapper = tmp_path / "outer.zip"
+    with zipfile.ZipFile(wrapper, "w") as archive:
+        archive.write(sample_zip, arcname="nested/archive.zip")
+    assert "Paragraf 0" in ec.extract_zip(wrapper)
+
+
+# --------------------------------------------------------------------------
+# preflight
+# --------------------------------------------------------------------------
+def test_preflight_python_packages_present():
+    """Assumes the test environment has every required package installed."""
+    missing = ec.preflight()["missing_python"]
+    assert missing == []
+
+
+def test_preflight_reports_libreoffice_state():
+    """preflight() warns about libreoffice exactly when it is not on PATH."""
+    messages = ec.preflight()["warnings"]
+    mentioned = any("libreoffice" in text for text in messages)
+    installed = bool(shutil.which("libreoffice") or shutil.which("soffice"))
+    # exactly one holds: the binary is found, or a warning mentions it
+    assert mentioned != installed
+
+
+def test_preflight_ocr_flag():
+    missing = ec.preflight(check_ocr=True)["missing_system"]
+    if shutil.which("tesseract") is None:  # nothing is asserted when it is installed
+        assert any("tesseract" in entry for entry in missing)
+
+
+# --------------------------------------------------------------------------
+# legacy .doc — skipped unless libreoffice is installed
+# --------------------------------------------------------------------------
+@pytest.mark.skipif(
+    shutil.which("libreoffice") is None and shutil.which("soffice") is None,
+    reason="libreoffice not installed",
+)
+def test_doc_conversion(tmp_path, sample_docx):
+    """Smoke test: a docx copied under a .doc name is fed to extract_doc()."""
+    legacy = tmp_path / "legacy.doc"
+    shutil.copy(sample_docx, legacy)
+    assert ec.count_page_markers(ec.extract_doc(legacy)) >= 1
+
+
+def test_doc_without_libreoffice_raises(tmp_path, monkeypatch):
+    monkeypatch.setattr(ec.shutil, "which", lambda *_args, **_kwargs: None)  # any call finds nothing
+    with pytest.raises(RuntimeError):  # extract_doc must fail loudly without a converter
+        ec.extract_doc(tmp_path / "whatever.doc")