#!/usr/bin/env python3
"""
PDF Mass Conversion to Text for Activity Extraction
Handles all PDF sizes efficiently with multiple fallback methods
"""

import argparse
import json
import logging
import os
from datetime import date
from pathlib import Path
from typing import Dict, List

# Each backend is optional: when one is not installed its extractor raises
# and convert_pdf_to_text() falls through to the other one.
try:
    import PyPDF2
except ImportError:
    PyPDF2 = None

try:
    import pdfplumber
except ImportError:
    pdfplumber = None

DEFAULT_PDF_DIR = "/mnt/d/GoogleDrive/Cercetasi/carti-camp-jocuri"
DEFAULT_OUTPUT_DIR = DEFAULT_PDF_DIR + "/INDEX-SISTEM-JOCURI/converted_pdfs"


class PDFConverter:
    """Extract text from PDF files, trying pdfplumber first and PyPDF2 second.

    Attributes:
        max_pages: Maximum number of pages read from each PDF; ``None``
            means no limit.
        conversion_stats: Maps each processed PDF path to a dict with the
            keys ``method``, ``pages_processed``, ``total_pages``,
            ``success`` and ``text_length`` (plus ``error`` when every
            extractor raised).
    """

    def __init__(self, max_pages=50):
        self.max_pages = max_pages
        self.conversion_stats: Dict[str, Dict] = {}

    def convert_pdf_to_text(self, pdf_path: str) -> str:
        """Convert PDF to text using multiple methods with fallbacks.

        pdfplumber is tried first. PyPDF2 is tried when pdfplumber raises
        or extracts no text at all.

        Args:
            pdf_path: Path of the PDF file to read.

        Returns:
            The extracted text, or ``""`` when no extractor produced any.
            In the latter case ``conversion_stats[pdf_path]["success"]`` is
            ``False``.
        """
        # Drop a stale entry so the stats always describe this attempt.
        self.conversion_stats.pop(pdf_path, None)
        last_error = None

        try:
            # Method 1: pdfplumber (primary)
            text = self._convert_with_pdfplumber(pdf_path)
        except Exception as e:
            last_error = e
            print(f"pdfplumber failed for {pdf_path}: {e}")
        else:
            if text.strip():
                return text
            print(f"pdfplumber found no text in {pdf_path}, trying PyPDF2")

        try:
            # Method 2: PyPDF2 (fallback)
            text = self._convert_with_pypdf2(pdf_path)
        except Exception as e2:
            last_error = e2
            print(f"PyPDF2 also failed for {pdf_path}: {e2}")
        else:
            if text.strip():
                return text

        self._record_failure(pdf_path, last_error)
        return ""

    def _page_limit(self, total_pages: int) -> int:
        """Return how many of ``total_pages`` pages should be read."""
        if self.max_pages is None:
            return total_pages
        # Clamp at 0 so a negative limit cannot turn into a negative slice.
        return max(0, min(total_pages, self.max_pages))

    @staticmethod
    def _extract_pages(get_page, page_count: int) -> str:
        """Join the text of pages ``0..page_count-1`` under page markers.

        ``get_page`` maps a zero-based index to an object exposing
        ``extract_text()``. A page that raises is reported and skipped; a
        page with no text is skipped silently.
        """
        chunks: List[str] = []
        for index in range(page_count):
            try:
                page_text = get_page(index).extract_text()
            except Exception as e:
                print(f" Error on page {index + 1}: {e}")
                continue
            if page_text:
                chunks.append(f"\n--- PAGE {index + 1} ---\n{page_text}\n")
        return "".join(chunks)

    def _record_success(self, pdf_path, method, pages_processed,
                        total_pages, text_content):
        """Store the stats entry for an extractor that ran to completion."""
        self.conversion_stats[pdf_path] = {
            'method': method,
            'pages_processed': pages_processed,
            'total_pages': total_pages,
            'success': True,
            'text_length': len(text_content)
        }

    def _record_failure(self, pdf_path, error):
        """Mark ``pdf_path`` as not converted in the stats."""
        entry = self.conversion_stats.get(pdf_path)
        if entry is None:
            # No extractor completed, so there is nothing to report but
            # the last error.
            entry = {
                'method': None,
                'pages_processed': 0,
                'total_pages': 0,
                'text_length': 0,
                'error': str(error)
            }
            self.conversion_stats[pdf_path] = entry
        entry['success'] = False

    def _convert_with_pdfplumber(self, pdf_path: str) -> str:
        """Primary conversion method using pdfplumber.

        Raises:
            RuntimeError: If pdfplumber is not installed.
            Exception: Whatever pdfplumber raises while opening the file.
        """
        if pdfplumber is None:
            raise RuntimeError("pdfplumber is not installed")

        with pdfplumber.open(pdf_path) as pdf:
            pages = pdf.pages
            total_pages = len(pages)
            pages_to_process = self._page_limit(total_pages)

            print(f" Converting {pdf_path}: {pages_to_process}/{total_pages} pages")
            text_content = self._extract_pages(pages.__getitem__, pages_to_process)

        self._record_success(pdf_path, 'pdfplumber', pages_to_process,
                             total_pages, text_content)
        return text_content

    def _convert_with_pypdf2(self, pdf_path: str) -> str:
        """Fallback conversion method using PyPDF2.

        Raises:
            RuntimeError: If PyPDF2 is not installed.
            Exception: Whatever PyPDF2 raises while opening the file.
        """
        if PyPDF2 is None:
            raise RuntimeError("PyPDF2 is not installed")

        with open(pdf_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            total_pages = len(reader.pages)
            pages_to_process = self._page_limit(total_pages)

            print(f" Converting {pdf_path} (fallback): {pages_to_process}/{total_pages} pages")
            # Pages are read while the file is still open.
            text_content = self._extract_pages(
                lambda index: reader.pages[index], pages_to_process
            )

        self._record_success(pdf_path, 'PyPDF2', pages_to_process,
                             total_pages, text_content)
        return text_content

    @staticmethod
    def _format_header(pdf_path, converted_on=None) -> str:
        """Build the header written at the top of every text file.

        ``converted_on`` is a ``datetime.date``; it defaults to today.
        """
        stamp = (converted_on or date.today()).isoformat()
        return f"SOURCE: {pdf_path}\nCONVERTED: {stamp}\n" + "=" * 50 + "\n\n"

    @staticmethod
    def _unique_output_name(pdf_path: Path, root: Path, used_names: set) -> str:
        """Pick a ``.txt`` file name that is not yet in ``used_names``.

        The plain ``<stem>.txt`` is used when free. On a clash the path
        relative to ``root`` is folded into the name, then a counter is
        appended if needed. Names are compared case-insensitively so the
        result is also safe on case-insensitive file systems.
        """
        candidate = f"{pdf_path.stem}.txt"
        if candidate.casefold() in used_names:
            relative = pdf_path.relative_to(root).with_suffix("")
            candidate = "__".join(relative.parts) + ".txt"

        base = candidate[:-len(".txt")]
        counter = 2
        while candidate.casefold() in used_names:
            candidate = f"{base}_{counter}.txt"
            counter += 1

        used_names.add(candidate.casefold())
        return candidate

    def convert_all_pdfs(self, pdf_directory: str, output_directory: str):
        """Convert all PDFs in directory to text files.

        Searches ``pdf_directory`` recursively for ``*.pdf``, writes one
        text file per PDF that yielded text into ``output_directory``
        (created if missing), and dumps ``conversion_stats`` to
        ``conversion_stats.json`` there.

        Returns:
            The number of text files written by this call.
        """
        root = Path(pdf_directory)
        # Sorted so the processing order, and therefore the names chosen
        # for clashing stems, are the same on every run.
        pdf_files = sorted(root.glob("**/*.pdf"))
        print(f"🔄 Converting {len(pdf_files)} PDF files to text...")

        os.makedirs(output_directory, exist_ok=True)
        output_root = Path(output_directory)
        used_names = set()
        saved = 0

        for i, pdf_path in enumerate(pdf_files, start=1):
            print(f"\n[{i}/{len(pdf_files)}] Processing {pdf_path.name}...")

            # Convert to text
            text_content = self.convert_pdf_to_text(str(pdf_path))
            if not text_content.strip():
                print(f" ❌ No text extracted from {pdf_path.name}")
                continue

            # Save as text file
            output_file = output_root / self._unique_output_name(
                pdf_path, root, used_names
            )
            with open(output_file, 'w', encoding='utf-8') as f:
                f.write(self._format_header(pdf_path))
                f.write(text_content)
            saved += 1
            print(f" ✅ Saved: {output_file}")

        # Save conversion statistics
        stats_file = output_root / "conversion_stats.json"
        with open(stats_file, 'w', encoding='utf-8') as f:
            json.dump(self.conversion_stats, f, indent=2, ensure_ascii=False)

        print(f"\n🎉 PDF conversion complete! Check {output_directory}")
        return saved


def main(argv=None):
    """Convert every PDF under a directory; with no arguments use the defaults."""
    parser = argparse.ArgumentParser(
        description="Convert every PDF under a directory to a text file."
    )
    parser.add_argument("pdf_dir", nargs="?", default=DEFAULT_PDF_DIR,
                        help="directory searched recursively for PDFs")
    parser.add_argument("output_dir", nargs="?", default=DEFAULT_OUTPUT_DIR,
                        help="directory receiving the text files")
    parser.add_argument("--max-pages", type=int, default=50,
                        help="maximum pages read per PDF")
    args = parser.parse_args(argv)

    converter = PDFConverter(max_pages=args.max_pages)
    converted_count = converter.convert_all_pdfs(args.pdf_dir, args.output_dir)
    print(f"Final result: {converted_count} PDFs successfully converted")


# Usage
if __name__ == "__main__":
    main()