# -*- coding: utf-8 -*-
"""Tests for scripts/extract_common.py."""
import shutil
import zipfile

import pytest

import extract_common as ec


def _has_libreoffice():
    """Return True when a ``libreoffice`` or ``soffice`` binary is on PATH."""
    return bool(shutil.which("libreoffice") or shutil.which("soffice"))


# --------------------------------------------------------------------------
# format detection
# --------------------------------------------------------------------------
def test_detect_format():
    """Extensions map to format names; case is ignored; others are unknown."""
    assert ec.detect_format("a/b/file.PDF") == "pdf"
    assert ec.detect_format("x.docx") == "docx"
    assert ec.detect_format("x.doc") == "doc"
    assert ec.detect_format("x.pptx") == "pptx"
    assert ec.detect_format("x.html") == "html"
    assert ec.detect_format("x.zip") == "zip"
    assert ec.detect_format("x.epub") == "epub"
    assert ec.detect_format("x.xyz") == "unknown"


def test_is_junk():
    """desktop.ini, *.bak and README.md are junk; a real PDF name is not."""
    assert ec.is_junk("some/desktop.ini")
    assert ec.is_junk("notes.bak")
    assert ec.is_junk("README.md")
    assert not ec.is_junk("1000 Scout Games.pdf")


# --------------------------------------------------------------------------
# PDF — the critical "no max_pages" regression
# --------------------------------------------------------------------------
def test_pdf_extracts_all_60_pages(big_pdf):
    """Every page of the ``big_pdf`` fixture must appear in the output."""
    body = ec.extract_pdf(big_pdf)
    # the old converter capped at 50 pages — page 60 must be present now
    assert "--- PAGE 60 ---" in body
    assert "PDFMARK-60" in body
    assert ec.count_page_markers(body) == 60


def test_pdf_does_not_truncate_mid_document(big_pdf):
    """The last page split out of the extracted body is page 60."""
    body = ec.extract_pdf(big_pdf)
    pages = ec.split_pages(body)
    assert pages[-1][0] == 60  # last marker is the real last page


# --------------------------------------------------------------------------
# page join / split round-trip
# --------------------------------------------------------------------------
def test_join_split_round_trip():
    """split_pages(join_pages(x)) yields 1-based page numbers and texts x."""
    body = ec.join_pages(["alpha", "beta", "gamma"])
    pages = ec.split_pages(body)
    assert [n for n, _ in pages] == [1, 2, 3]
    assert [t for _, t in pages] == ["alpha", "beta", "gamma"]


def test_split_pages_no_markers_returns_empty():
    """Text without page markers splits into an empty list."""
    assert ec.split_pages("plain text with no markers") == []


# --------------------------------------------------------------------------
# docx — synthetic page markers
# --------------------------------------------------------------------------
def test_docx_synthetic_page_markers(sample_docx):
    """extract_docx inserts page markers and keeps the final paragraph."""
    body = ec.extract_docx(sample_docx)
    # 100 paragraphs / 40 per page => 3 pages
    assert ec.count_page_markers(body) == 3
    assert "Paragraf 99" in body


# --------------------------------------------------------------------------
# HTML mirror — nav/script/footer stripped
# --------------------------------------------------------------------------
def test_html_strips_chrome(html_with_nav):
    """Article text survives extraction; site chrome strings do not."""
    body = ec.extract_html(html_with_nav)
    assert "Vanatoarea de comori" in body
    assert "joc real de orientare" in body
    # chrome must be gone
    assert "tracking" not in body
    assert "Site Banner Junk" not in body
    assert "toate drepturile rezervate" not in body
    assert "Games" not in body


# --------------------------------------------------------------------------
# content hash + near-duplicate elimination
# --------------------------------------------------------------------------
def test_content_hash_ignores_whitespace():
    """A trailing newline does not change the hash; different text does."""
    assert ec.content_hash("hello world") == ec.content_hash("hello world\n")
    assert ec.content_hash("hello world") != ec.content_hash("goodbye world")


def test_dedupe_exact_duplicates():
    """Of two identical texts only the first key is kept."""
    items = [("a", "joc identic"), ("b", "joc identic"), ("c", "alt joc")]
    kept = ec.dedupe_texts(items)
    assert [k for k, _ in kept] == ["a", "c"]


def test_dedupe_near_duplicates():
    """A text that only adds a short suffix is dropped at threshold 85.0."""
    base = "Vanatoarea de comori este un joc de orientare pentru cercetasi in tabara."
    near = base + " Pagina printata."
    # NOTE(review): this comment used to claim ">95% similar", but `near` is
    # `base` (73 chars) plus a 17-char suffix — roughly 90% by a difflib-style
    # ratio — which is presumably why threshold=85.0 is passed explicitly.
    # Confirm against the metric dedupe_texts actually uses.
    items = [
        ("orig", base),
        ("print", near),
        ("other", "Cu totul alt continut diferit aici."),
    ]
    kept = ec.dedupe_texts(items, threshold=85.0)
    keys = [k for k, _ in kept]
    assert "orig" in keys
    assert "print" not in keys
    assert "other" in keys


# --------------------------------------------------------------------------
# zip recursion
# --------------------------------------------------------------------------
def test_zip_recurses_into_inner_files(sample_zip):
    """extract_zip returns the text of archive members, with page markers."""
    body = ec.extract_zip(sample_zip)
    assert "Paragraf 0" in body
    assert ec.count_page_markers(body) > 0


def test_zip_bad_archive_returns_empty(tmp_path):
    """A file that is not a valid zip yields an empty string, not an error."""
    bad = tmp_path / "broken.zip"
    bad.write_text("not a zip", encoding="utf-8")
    assert ec.extract_zip(bad) == ""


def test_nested_zip(tmp_path, sample_zip):
    """A zip stored inside another zip is extracted as well."""
    outer = tmp_path / "outer.zip"
    with zipfile.ZipFile(outer, "w") as zf:
        zf.write(sample_zip, arcname="nested/archive.zip")
    body = ec.extract_zip(outer)
    assert "Paragraf 0" in body


# --------------------------------------------------------------------------
# preflight
# --------------------------------------------------------------------------
def test_preflight_python_packages_present():
    """preflight reports no missing Python packages in the test environment."""
    report = ec.preflight()
    # all required packages are installed in the test environment
    assert report["missing_python"] == []


def test_preflight_reports_libreoffice_state():
    """preflight warns about libreoffice exactly when it is not on PATH."""
    report = ec.preflight()
    if _has_libreoffice():
        assert all("libreoffice" not in w for w in report["warnings"])
    else:
        assert any("libreoffice" in w for w in report["warnings"])


def test_preflight_ocr_flag():
    """With check_ocr=True a missing tesseract binary is reported."""
    report = ec.preflight(check_ocr=True)
    # Nothing is asserted when tesseract is installed.
    if not shutil.which("tesseract"):
        assert any("tesseract" in m for m in report["missing_system"])


# --------------------------------------------------------------------------
# legacy .doc — skipped unless libreoffice is installed
# --------------------------------------------------------------------------
@pytest.mark.skipif(
    not _has_libreoffice(),
    reason="libreoffice not installed",
)
def test_doc_conversion(tmp_path, sample_docx):
    """extract_doc produces at least one page marker for a ``.doc`` file."""
    doc_path = tmp_path / "legacy.doc"
    shutil.copy(sample_docx, doc_path)  # smoke test of the docx path
    body = ec.extract_doc(doc_path)
    assert ec.count_page_markers(body) >= 1


def test_doc_without_libreoffice_raises(tmp_path, monkeypatch):
    """extract_doc raises RuntimeError when no converter binary is found."""
    # Accept any call signature so the stub keeps working if the code under
    # test passes ``mode=`` or ``path=`` to shutil.which.
    monkeypatch.setattr(ec.shutil, "which", lambda *args, **kwargs: None)
    with pytest.raises(RuntimeError):
        ec.extract_doc(tmp_path / "whatever.doc")