# game-library/scripts/pdf_to_text_converter.py

#!/usr/bin/env python3
"""
PDF Mass Conversion to Text for Activity Extraction
Handles all PDF sizes efficiently with multiple fallback methods
"""

import json
import logging
import os
from datetime import date
from pathlib import Path
from typing import List, Dict

import pdfplumber
import PyPDF2

class PDFConverter:
    """Batch-convert PDF files to plain text.

    Extraction is attempted with pdfplumber first and PyPDF2 second. At most
    ``max_pages`` pages are read from each document, and the outcome of every
    conversion attempt is recorded in ``conversion_stats``, keyed by the PDF
    path that was passed in.
    """

    def __init__(self, max_pages=50):
        """Create a converter.

        Args:
            max_pages: Upper bound on the number of pages read from each PDF.
        """
        self.max_pages = max_pages
        # pdf path -> {'method', 'pages_processed', 'total_pages', 'success',
        # 'text_length'} plus 'errors' when a backend raised and no text was
        # obtained.
        self.conversion_stats = {}

    def convert_pdf_to_text(self, pdf_path: str) -> str:
        """Extract the text of one PDF, falling back between backends.

        pdfplumber is tried first. PyPDF2 is tried when pdfplumber raises or
        yields no text. The ``conversion_stats`` entry for ``pdf_path`` is
        replaced on every call and has ``'success': False`` when no text was
        obtained.

        Args:
            pdf_path: Path of the PDF file to read.

        Returns:
            The extracted text, or ``""`` when neither backend produced any.
        """
        # Forget the result of any earlier attempt on the same path so a stale
        # success cannot survive a failed re-run.
        self.conversion_stats.pop(pdf_path, None)
        errors = {}

        # Best-effort boundary: a single unreadable PDF must not abort a batch,
        # so every exception coming out of a backend is caught and reported.
        try:
            # Method 1: pdfplumber (best for tables and layout)
            text = self._convert_with_pdfplumber(pdf_path)
        except Exception as e:
            errors['pdfplumber'] = str(e)
            print(f"pdfplumber failed for {pdf_path}: {e}")
        else:
            if text.strip():
                return text
            print(f"pdfplumber found no text in {pdf_path}, trying PyPDF2")

        try:
            # Method 2: PyPDF2 (fallback)
            text = self._convert_with_pypdf2(pdf_path)
        except Exception as e2:
            errors['PyPDF2'] = str(e2)
            print(f"PyPDF2 also failed for {pdf_path}: {e2}")
        else:
            if text.strip():
                return text

        # Neither backend produced text: make sure the failure is visible in
        # the statistics instead of being silently absent.
        entry = self.conversion_stats.setdefault(pdf_path, {
            'method': None,
            'pages_processed': 0,
            'total_pages': 0,
            'success': False,
            'text_length': 0,
        })
        if errors:
            entry['errors'] = errors
        return ""

    def _convert_with_pdfplumber(self, pdf_path: str) -> str:
        """Primary conversion method using pdfplumber.

        Returns the concatenated page text (possibly empty) and records the
        attempt in ``conversion_stats``. Exceptions raised while opening the
        document propagate to the caller.
        """
        with pdfplumber.open(pdf_path) as pdf:
            pages = pdf.pages
            total_pages = len(pages)
            # Clamp at zero so a negative max_pages reads nothing.
            pages_to_process = max(0, min(total_pages, self.max_pages))

            print(f"  Converting {pdf_path}: {pages_to_process}/{total_pages} pages")

            text_content = self._extract_pages(
                lambda index: pages[index], pages_to_process
            )

        self._record_stats(
            pdf_path, 'pdfplumber', pages_to_process, total_pages, text_content
        )
        return text_content

    def _convert_with_pypdf2(self, pdf_path: str) -> str:
        """Fallback conversion method using PyPDF2.

        Returns the concatenated page text (possibly empty) and records the
        attempt in ``conversion_stats``. Exceptions raised while opening the
        document propagate to the caller.
        """
        with open(pdf_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            total_pages = len(reader.pages)
            # Clamp at zero so a negative max_pages reads nothing.
            pages_to_process = max(0, min(total_pages, self.max_pages))

            print(f"  Converting {pdf_path} (fallback): {pages_to_process}/{total_pages} pages")

            text_content = self._extract_pages(
                lambda index: reader.pages[index], pages_to_process
            )

        self._record_stats(
            pdf_path, 'PyPDF2', pages_to_process, total_pages, text_content
        )
        return text_content

    @staticmethod
    def _extract_pages(page_at, page_count) -> str:
        """Join the text of pages ``0 .. page_count - 1`` under page headers.

        ``page_at(index)`` must return an object with ``extract_text()``.
        Pages that raise are reported and skipped; pages with no text are
        omitted without a header.
        """
        chunks = []
        for index in range(page_count):
            try:
                page_text = page_at(index).extract_text()
            except Exception as e:
                print(f"    Error on page {index + 1}: {e}")
                continue
            if page_text:
                chunks.append(f"\n--- PAGE {index + 1} ---\n{page_text}\n")
        # Single join instead of repeated string concatenation.
        return "".join(chunks)

    def _record_stats(self, pdf_path, method, pages_processed, total_pages,
                      text_content):
        """Store the outcome of one backend run for ``pdf_path``."""
        self.conversion_stats[pdf_path] = {
            'method': method,
            'pages_processed': pages_processed,
            'total_pages': total_pages,
            # A run that yields no text produces no output file, so it is not
            # counted as a successful conversion.
            'success': bool(text_content.strip()),
            'text_length': len(text_content),
        }

    @staticmethod
    def _unique_output_name(stem, used_names):
        """Return ``<stem>.txt``, or ``<stem>_<n>.txt`` if that name is taken.

        ``used_names`` holds the case-folded names already handed out in this
        run and is updated in place. Case-folding keeps names distinct on
        case-insensitive filesystems too.
        """
        candidate = f"{stem}.txt"
        counter = 1
        while candidate.casefold() in used_names:
            candidate = f"{stem}_{counter}.txt"
            counter += 1
        used_names.add(candidate.casefold())
        return candidate

    def convert_all_pdfs(self, pdf_directory: str, output_directory: str) -> int:
        """Convert all PDFs in directory to text files.

        ``pdf_directory`` is searched recursively for files whose name ends in
        ``.pdf`` (any letter case). Each PDF that yields text is written to
        ``output_directory`` as ``<stem>.txt``; when several PDFs share a stem
        the later ones get a numeric suffix instead of overwriting the first.
        ``conversion_stats.json`` is written to ``output_directory`` at the end.

        Args:
            pdf_directory: Root directory to search for PDF files.
            output_directory: Directory for the text files and statistics;
                created if missing.

        Returns:
            Number of entries in ``conversion_stats`` marked successful.
        """
        # Sorted so processing order (and therefore collision suffixes) is
        # deterministic across runs.
        pdf_files = sorted(
            path for path in Path(pdf_directory).rglob("*")
            if path.name.lower().endswith(".pdf") and path.is_file()
        )
        total = len(pdf_files)

        print(f"🔄 Converting {total} PDF files to text...")

        out_dir = Path(output_directory)
        out_dir.mkdir(parents=True, exist_ok=True)

        converted_on = date.today().isoformat()
        used_names = set()

        for position, pdf_path in enumerate(pdf_files, start=1):
            print(f"\n[{position}/{total}] Processing {pdf_path.name}...")

            # Convert to text
            text_content = self.convert_pdf_to_text(str(pdf_path))

            if not text_content.strip():
                print(f"  ❌ No text extracted from {pdf_path.name}")
                continue

            # Save as text file
            output_file = out_dir / self._unique_output_name(
                pdf_path.stem, used_names
            )
            with open(output_file, 'w', encoding='utf-8') as f:
                f.write(f"SOURCE: {pdf_path}\n")
                f.write(f"CONVERTED: {converted_on}\n")
                f.write("=" * 50 + "\n\n")
                f.write(text_content)

            print(f"  ✅ Saved: {output_file}")

        # Save conversion statistics
        stats_file = out_dir / "conversion_stats.json"
        with open(stats_file, 'w', encoding='utf-8') as f:
            json.dump(self.conversion_stats, f, indent=2, ensure_ascii=False)

        print(f"\n🎉 PDF conversion complete! Check {output_directory}")
        return sum(1 for stats in self.conversion_stats.values() if stats['success'])

# Usage: python pdf_to_text_converter.py [PDF_DIR] [OUTPUT_DIR] [--max-pages N]
# With no arguments the script uses the default locations below.
DEFAULT_PDF_DIR = "/mnt/d/GoogleDrive/Cercetasi/carti-camp-jocuri"
DEFAULT_OUTPUT_DIR = "/mnt/d/GoogleDrive/Cercetasi/carti-camp-jocuri/INDEX-SISTEM-JOCURI/converted_pdfs"


def main(argv=None):
    """Run the batch conversion from the command line.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.

    Returns:
        The count returned by ``PDFConverter.convert_all_pdfs``.
    """
    import argparse  # only needed when the file is executed as a script

    parser = argparse.ArgumentParser(
        description="Convert every PDF under a directory to a text file."
    )
    parser.add_argument(
        "pdf_dir", nargs="?", default=DEFAULT_PDF_DIR,
        help="directory searched recursively for PDFs (default: %(default)s)",
    )
    parser.add_argument(
        "output_dir", nargs="?", default=DEFAULT_OUTPUT_DIR,
        help="directory receiving the .txt files (default: %(default)s)",
    )
    parser.add_argument(
        "--max-pages", type=int, default=50,
        help="maximum pages read per PDF (default: %(default)s)",
    )
    args = parser.parse_args(argv)

    converter = PDFConverter(max_pages=args.max_pages)

    # Convert all PDFs
    converted_count = converter.convert_all_pdfs(args.pdf_dir, args.output_dir)
    print(f"Final result: {converted_count} PDFs successfully converted")
    return converted_count


if __name__ == "__main__":
    main()