# -*- coding: utf-8 -*-
"""
Shared pytest fixtures for the extraction-pipeline tests.

scripts/ is not a package, so it is added to sys.path here. Synthetic
fixtures (PDF, docx, zip, HTML) are generated at runtime — no binary blobs in
the repo.
"""
import sys
import zipfile
from pathlib import Path

import pytest

# Two directory levels above this file; scripts/ lives directly under it.
REPO_ROOT = Path(__file__).resolve().parent.parent
SCRIPTS_DIR = REPO_ROOT / "scripts"
# Guarded so that importing this module twice does not add a duplicate entry.
if str(SCRIPTS_DIR) not in sys.path:
    sys.path.insert(0, str(SCRIPTS_DIR))


# --------------------------------------------------------------------------
# synthetic PDF — deliberately large to pin the "no max_pages" regression
# --------------------------------------------------------------------------
@pytest.fixture
def big_pdf(tmp_path):
    """Write a 60-page PDF into ``tmp_path`` and return its path.

    Each page carries a unique 'PDFMARK-<n>' token (n runs from 1 to 60)
    followed by one fixed line of filler text.
    """
    # Imported inside the fixture, so reportlab is only needed by tests that
    # actually request this fixture.
    from reportlab.pdfgen import canvas
    from reportlab.lib.pagesizes import letter

    path = tmp_path / "big.pdf"
    c = canvas.Canvas(str(path), pagesize=letter)
    for n in range(1, 61):
        c.drawString(72, 720, f"PDFMARK-{n} synthetic activity page number {n}")
        c.drawString(72, 700, "Acest joc educativ se joaca in echipa.")
        c.showPage()  # finish the current page before drawing the next one
    c.save()
    return path


# --------------------------------------------------------------------------
# synthetic docx — 100 paragraphs => 3 synthetic pages at 40 paras/page
# --------------------------------------------------------------------------
@pytest.fixture
def sample_docx(tmp_path):
    """Write a 100-paragraph .docx into ``tmp_path`` and return its path.

    Paragraph ``i`` (0..99) has the text
    'Paragraf <i>: continut joc team-building.'.
    """
    # Imported inside the fixture, so python-docx is only needed by tests
    # that actually request this fixture (directly or via ``sample_zip``).
    import docx

    path = tmp_path / "sample.docx"
    document = docx.Document()
    for i in range(100):
        document.add_paragraph(f"Paragraf {i}: continut joc team-building.")
    document.save(str(path))
    return path


# --------------------------------------------------------------------------
# synthetic HTML mirror page — with nav/script/footer chrome to strip
# --------------------------------------------------------------------------
# NOTE(review): the literal below contains plain text only (no nav, script or
# footer elements, and no tags at all) although the header above says it has
# chrome to strip — confirm the markup was not lost before relying on it.
HTML_WITH_NAV = """ Joc
Site Banner Junk

Vanatoarea de comori

Acesta este un joc real de orientare pentru cercetasi.

Jucatorii cauta indicii ascunse in tabara.

"""


@pytest.fixture
def html_with_nav(tmp_path):
    """Write ``HTML_WITH_NAV`` to ``tmp_path / 'page.html'`` and return the path."""
    path = tmp_path / "page.html"
    path.write_text(HTML_WITH_NAV, encoding="utf-8")
    return path


# --------------------------------------------------------------------------
# synthetic zip — contains a docx and a stray junk file
# --------------------------------------------------------------------------
@pytest.fixture
def sample_zip(tmp_path, sample_docx):
    """Write a zip archive into ``tmp_path`` and return its path.

    The archive holds the ``sample_docx`` file under 'inner/sample.docx' plus
    a 'desktop.ini' member whose content is the string 'junk'.
    """
    path = tmp_path / "archive.zip"
    with zipfile.ZipFile(path, "w") as zf:
        zf.write(sample_docx, arcname="inner/sample.docx")
        zf.writestr("desktop.ini", "junk")
    return path


# --------------------------------------------------------------------------
# synthetic normalized source — paginated, with an activity straddling a
# page boundary so the chunker overlap can be verified.
# --------------------------------------------------------------------------
@pytest.fixture
def paginated_source(tmp_path):
    """Write a 50-page normalized source file and return its path.

    The file starts with a SOURCE/CONVERTED/FORMAT header, a 50-character
    '=' rule and a blank line. Each page is a '--- PAGE <n> ---' marker, one
    content line and a blank line. An activity spans the page 20/21 boundary:
    page 20 carries the 'ACTIVITY-START' line and page 21 the 'ACTIVITY-END'
    line.
    """
    lines = [
        "SOURCE: synthetic/test.pdf",
        "CONVERTED: 2026-05-19",
        "FORMAT: pdf",
        "=" * 50,
        "",
    ]
    for n in range(1, 51):
        lines.append(f"--- PAGE {n} ---")
        if n == 20:
            lines.append("ACTIVITY-START jocul podului care traverseaza pagina")
        elif n == 21:
            lines.append("continuare a jocului podului ACTIVITY-END")
        else:
            lines.append(f"continut obisnuit pe pagina {n}")
        lines.append("")  # blank separator after every page
    path = tmp_path / "src_paginated.txt"
    path.write_text("\n".join(lines), encoding="utf-8")
    return path