Rebuild extraction pipeline infrastructure (Faza 0 prep)
Implements the approved plan to replace the broken regex/index-master extraction with an LLM-subagent pipeline. Four parallel lanes: Lane A — scripts/extract_common.py (PDF/docx/doc/pptx/html/zip, no max_pages truncation), normalize_sources.py, chunk_sources.py (~20pg chunks + overlap, manifest registry), activity_schema.json. Lane B — app/config_taxonomy.py (16 fixed category slugs), schema rebuilt from scratch in app/models/ with content_type, language, source_files, source_excerpt, normalized_name, extraction_confidence, needs_review; FTS5 + 3 triggers extended with materials_list and skills_developed. Lane C — build_database.py (--rebuild, atomic swap, schema + fuzzy source_excerpt validation, dedup with needs_review band), validate_extractions.py, review_queue.py, new run_extraction.py orchestrator, SUBAGENT_PROMPT.md. Lane D — search.py content_type/language filters (default search excludes non-game content), E7 schema-compat audit; fixed a NULL keywords AttributeError in _boost_search_relevance. Removes 8 orphaned/dead scripts and app/services/parser.py + indexer.py. Adds tests/ (70 passing, 1 skipped — libreoffice absent). Note: Lane D made one additive edit to app/models/database.py (_update_category_counts) to surface content_type/language in get_filter_options, outside its nominal lane boundary but after Lane B completed. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
177
tests/test_extract_common.py
Normal file
177
tests/test_extract_common.py
Normal file
@@ -0,0 +1,177 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""Tests for scripts/extract_common.py."""
|
||||
|
||||
import shutil
|
||||
import zipfile
|
||||
|
||||
import pytest
|
||||
|
||||
import extract_common as ec
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# format detection
|
||||
# --------------------------------------------------------------------------
|
||||
def test_detect_format():
    """detect_format maps each path to the expected format label."""
    expected_by_path = {
        "a/b/file.PDF": "pdf",  # upper-case extension inside a nested path
        "x.docx": "docx",
        "x.doc": "doc",
        "x.pptx": "pptx",
        "x.html": "html",
        "x.zip": "zip",
        "x.epub": "epub",
        "x.xyz": "unknown",  # unrecognised extension falls back to "unknown"
    }
    for path, label in expected_by_path.items():
        assert ec.detect_format(path) == label
|
||||
|
||||
|
||||
def test_is_junk():
    """is_junk accepts housekeeping files and rejects a real source PDF."""
    for junk_path in ("some/desktop.ini", "notes.bak", "README.md"):
        assert ec.is_junk(junk_path)
    # a genuine source document must not be classified as junk
    assert not ec.is_junk("1000 Scout Games.pdf")
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# PDF — the critical "no max_pages" regression
|
||||
# --------------------------------------------------------------------------
|
||||
def test_pdf_extracts_all_60_pages(big_pdf):
    """extract_pdf output reaches page 60 of the big_pdf fixture.

    Regression guard: the old converter capped extraction at 50 pages, so
    both the page-60 marker and the page-60 payload must be present now.
    """
    text = ec.extract_pdf(big_pdf)
    for needle in ("--- PAGE 60 ---", "PDFMARK-60"):
        assert needle in text
    marker_total = ec.count_page_markers(text)
    assert marker_total == 60
|
||||
|
||||
|
||||
def test_pdf_does_not_truncate_mid_document(big_pdf):
    """The last page split out of the extracted PDF is numbered 60."""
    split = ec.split_pages(ec.extract_pdf(big_pdf))
    final_page = split[-1]
    # last marker is the real last page
    assert final_page[0] == 60
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# page join / split round-trip
|
||||
# --------------------------------------------------------------------------
|
||||
def test_join_split_round_trip():
    """join_pages followed by split_pages yields 1-based (number, text) pairs."""
    texts = ["alpha", "beta", "gamma"]
    round_tripped = ec.split_pages(ec.join_pages(texts))
    # each entry must be exactly a (page_number, page_text) pair, numbered from 1
    assert [tuple(entry) for entry in round_tripped] == list(enumerate(texts, start=1))
|
||||
|
||||
|
||||
def test_split_pages_no_markers_returns_empty():
    """split_pages returns an empty list for text without page markers."""
    unmarked_text = "plain text with no markers"
    assert ec.split_pages(unmarked_text) == []
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# docx — synthetic page markers
|
||||
# --------------------------------------------------------------------------
|
||||
def test_docx_synthetic_page_markers(sample_docx):
    """extract_docx emits three page markers and keeps the last paragraph."""
    text = ec.extract_docx(sample_docx)
    marker_total = ec.count_page_markers(text)
    # 100 paragraphs / 40 per page => 3 pages
    assert marker_total == 3
    # the final paragraph must survive extraction
    assert "Paragraf 99" in text
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# HTML mirror — nav/script/footer stripped
|
||||
# --------------------------------------------------------------------------
|
||||
def test_html_strips_chrome(html_with_nav):
    """extract_html keeps the article text and drops the page chrome."""
    text = ec.extract_html(html_with_nav)

    # real content must survive
    for kept in ("Vanatoarea de comori", "joc real de orientare"):
        assert kept in text

    # chrome must be gone
    for stripped in (
        "tracking",
        "Site Banner Junk",
        "toate drepturile rezervate",
        "Games",
    ):
        assert stripped not in text
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# content hash + near-duplicate elimination
|
||||
# --------------------------------------------------------------------------
|
||||
def test_content_hash_ignores_whitespace():
    """A trailing newline leaves content_hash unchanged; different text does not."""
    reference = ec.content_hash("hello world")
    with_newline = ec.content_hash("hello world\n")
    different = ec.content_hash("goodbye world")
    assert reference == with_newline
    assert reference != different
|
||||
|
||||
|
||||
def test_dedupe_exact_duplicates():
    """dedupe_texts keeps the first of two identical texts, in input order."""
    entries = [
        ("a", "joc identic"),
        ("b", "joc identic"),  # exact duplicate of "a"
        ("c", "alt joc"),
    ]
    surviving_keys = [key for key, _ in ec.dedupe_texts(entries)]
    assert surviving_keys == ["a", "c"]
|
||||
|
||||
|
||||
def test_dedupe_near_duplicates():
    """dedupe_texts drops a near-duplicate at threshold=85.0 and keeps unrelated text."""
    original_text = "Vanatoarea de comori este un joc de orientare pentru cercetasi in tabara."
    # Same sentence plus a short suffix: a near (not exact) duplicate that is
    # expected to score above the 85.0 threshold passed below.
    printed_text = original_text + " Pagina printata."
    unrelated_text = "Cu totul alt continut diferit aici."

    survivors = ec.dedupe_texts(
        [("orig", original_text), ("print", printed_text), ("other", unrelated_text)],
        threshold=85.0,
    )
    surviving_keys = [key for key, _ in survivors]

    assert "orig" in surviving_keys
    assert "print" not in surviving_keys
    assert "other" in surviving_keys
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# zip recursion
|
||||
# --------------------------------------------------------------------------
|
||||
def test_zip_recurses_into_inner_files(sample_zip):
    """extract_zip returns inner-file text that carries page markers."""
    text = ec.extract_zip(sample_zip)
    assert "Paragraf 0" in text
    marker_total = ec.count_page_markers(text)
    assert marker_total > 0
|
||||
|
||||
|
||||
def test_zip_bad_archive_returns_empty(tmp_path):
    """extract_zip returns an empty string for a file that is not a zip archive."""
    not_a_zip = tmp_path / "broken.zip"
    not_a_zip.write_text("not a zip", encoding="utf-8")
    extracted = ec.extract_zip(not_a_zip)
    assert extracted == ""
|
||||
|
||||
|
||||
def test_nested_zip(tmp_path, sample_zip):
    """extract_zip reaches text inside a zip stored within another zip."""
    wrapper_path = tmp_path / "outer.zip"
    with zipfile.ZipFile(wrapper_path, "w") as archive:
        archive.write(sample_zip, arcname="nested/archive.zip")
    # the outer archive is closed (central directory written) before reading
    extracted = ec.extract_zip(wrapper_path)
    assert "Paragraf 0" in extracted
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# preflight
|
||||
# --------------------------------------------------------------------------
|
||||
def test_preflight_python_packages_present():
    """preflight reports no missing Python packages in the test environment."""
    missing_packages = ec.preflight()["missing_python"]
    # all required packages are installed in the test environment
    assert missing_packages == []
|
||||
|
||||
|
||||
def test_preflight_reports_libreoffice_state():
    """preflight warns about libreoffice exactly when no binary is on PATH."""
    warnings = ec.preflight()["warnings"]
    libreoffice_found = bool(shutil.which("libreoffice") or shutil.which("soffice"))
    warned = any("libreoffice" in warning for warning in warnings)
    # a warning must appear if and only if neither binary is available
    assert warned != libreoffice_found
|
||||
|
||||
|
||||
def test_preflight_ocr_flag():
    """preflight(check_ocr=True) lists tesseract as missing only when it is absent.

    The report's "missing_system" key is read unconditionally so the test
    also verifies the key exists on machines where tesseract is installed.
    Both directions are asserted, mirroring the libreoffice preflight test.
    """
    report = ec.preflight(check_ocr=True)
    missing_system = report["missing_system"]
    tesseract_reported = any("tesseract" in entry for entry in missing_system)
    if shutil.which("tesseract"):
        # binary is on PATH, so it must not be reported as missing
        assert not tesseract_reported
    else:
        assert tesseract_reported
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# legacy .doc — skipped unless libreoffice is installed
|
||||
# --------------------------------------------------------------------------
|
||||
@pytest.mark.skipif(
    not shutil.which("libreoffice") and not shutil.which("soffice"),
    reason="libreoffice not installed",
)
def test_doc_conversion(tmp_path, sample_docx):
    """extract_doc yields at least one page marker for a .doc-named file.

    The input is the docx fixture copied under a .doc name, so this is a
    smoke test of the conversion path, not a true legacy-format check.
    """
    legacy_path = tmp_path / "legacy.doc"
    shutil.copy(sample_docx, legacy_path)
    converted = ec.extract_doc(legacy_path)
    marker_total = ec.count_page_markers(converted)
    assert marker_total >= 1
|
||||
|
||||
|
||||
def test_doc_without_libreoffice_raises(tmp_path, monkeypatch):
    """extract_doc raises RuntimeError when no libreoffice binary can be found.

    shutil.which is replaced, as seen through the ec module, by a stub that
    always reports "not found". The stub accepts any positional or keyword
    arguments, so a call such as which(cmd, path=...) cannot fail with a
    TypeError and mask the behaviour under test. monkeypatch restores the
    real function when the test ends.
    """
    monkeypatch.setattr(ec.shutil, "which", lambda *args, **kwargs: None)
    # the target file is never created, so the error must come from the missing binary
    with pytest.raises(RuntimeError):
        ec.extract_doc(tmp_path / "whatever.doc")
|
||||
Reference in New Issue
Block a user