- Remove obsolete documentation files (DEPLOYMENT.md, PLAN_IMPLEMENTARE_S8_DETALIAT.md, README.md) - Add comprehensive extraction pipeline with multiple format support (PDF, HTML, text) - Implement Claude-based activity extraction with structured templates - Update dependencies and Docker configuration - Reorganize scripts directory with modular extraction components - Move example documentation to appropriate location 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude <noreply@anthropic.com>
143 lines
5.3 KiB
Python
143 lines
5.3 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
PDF Mass Conversion to Text for Activity Extraction
|
|
Handles all PDF sizes efficiently with multiple fallback methods
|
|
"""
|
|
|
|
import os
|
|
import json
|
|
from pathlib import Path
|
|
import PyPDF2
|
|
import pdfplumber
|
|
from typing import List, Dict
|
|
import logging
|
|
|
|
class PDFConverter:
    """Convert PDF files to plain text with pdfplumber, falling back to PyPDF2.

    Attributes:
        max_pages: Upper bound on the number of leading pages read from each
            PDF. ``None`` disables the limit; negative values are treated
            as zero.
        conversion_stats: Maps each processed PDF path (as a string) to a
            dict with the keys 'method', 'pages_processed', 'total_pages',
            'success' and 'text_length'. Entries for PDFs that neither
            backend could open additionally carry an 'error' message.
    """

    def __init__(self, max_pages=50):
        self.max_pages = max_pages
        self.conversion_stats = {}

    def _pages_to_process(self, total_pages: int) -> int:
        """Return how many leading pages to read, clamped to [0, total_pages]."""
        if self.max_pages is None:
            return total_pages
        # Clamp at zero: a negative value used as a slice bound would
        # otherwise mean "everything except the last pages".
        return max(0, min(total_pages, self.max_pages))

    def _record_stats(self, pdf_path: str, method: str, pages_processed: int,
                      total_pages: int, text_content: str) -> None:
        """Store the outcome of a backend run that completed without raising."""
        self.conversion_stats[pdf_path] = {
            'method': method,
            'pages_processed': pages_processed,
            'total_pages': total_pages,
            'success': True,
            'text_length': len(text_content)
        }

    def convert_pdf_to_text(self, pdf_path: str) -> str:
        """Convert one PDF to text, trying pdfplumber first and PyPDF2 second.

        Args:
            pdf_path: Path of the PDF file to read.

        Returns:
            The extracted text, or an empty string when both backends raise.
            In the latter case a ``'success': False`` entry is stored in
            ``conversion_stats`` so the failure shows up in the statistics.
        """
        try:
            # Method 1: pdfplumber (best for tables and layout)
            return self._convert_with_pdfplumber(pdf_path)
        except Exception as e:
            # Broad on purpose: any backend error should trigger the fallback.
            print(f"pdfplumber failed for {pdf_path}: {e}")

        try:
            # Method 2: PyPDF2 (fallback)
            return self._convert_with_pypdf2(pdf_path)
        except Exception as e2:
            print(f"PyPDF2 also failed for {pdf_path}: {e2}")
            # Page counts are unknown here because the file never opened.
            self.conversion_stats[pdf_path] = {
                'method': None,
                'pages_processed': 0,
                'total_pages': 0,
                'success': False,
                'text_length': 0,
                'error': str(e2)
            }
            return ""

    def _convert_with_pdfplumber(self, pdf_path: str) -> str:
        """Primary conversion method using pdfplumber.

        Pages whose extraction raises are reported and skipped; pages that
        yield no text are omitted from the result.
        """
        chunks = []

        with pdfplumber.open(pdf_path) as pdf:
            total_pages = len(pdf.pages)
            pages_to_process = self._pages_to_process(total_pages)

            print(f" Converting {pdf_path}: {pages_to_process}/{total_pages} pages")

            for i, page in enumerate(pdf.pages[:pages_to_process]):
                try:
                    page_text = page.extract_text()
                except Exception as e:
                    print(f" Error on page {i+1}: {e}")
                    continue
                if page_text:
                    chunks.append(f"\n--- PAGE {i+1} ---\n{page_text}\n")

        text_content = "".join(chunks)
        self._record_stats(pdf_path, 'pdfplumber', pages_to_process,
                           total_pages, text_content)
        return text_content

    def _convert_with_pypdf2(self, pdf_path: str) -> str:
        """Fallback conversion method using PyPDF2.

        Pages whose extraction raises are reported and skipped; pages that
        yield no text are omitted from the result.
        """
        chunks = []

        with open(pdf_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            total_pages = len(reader.pages)
            pages_to_process = self._pages_to_process(total_pages)

            print(f" Converting {pdf_path} (fallback): {pages_to_process}/{total_pages} pages")

            for i in range(pages_to_process):
                try:
                    # Page lookup stays inside the try so that a failure on
                    # one page does not abort the remaining pages.
                    page_text = reader.pages[i].extract_text()
                except Exception as e:
                    print(f" Error on page {i+1}: {e}")
                    continue
                if page_text:
                    chunks.append(f"\n--- PAGE {i+1} ---\n{page_text}\n")

        text_content = "".join(chunks)
        self._record_stats(pdf_path, 'PyPDF2', pages_to_process,
                           total_pages, text_content)
        return text_content

    @staticmethod
    def _unique_output_name(stem: str, used_names: set) -> str:
        """Return '<stem>.txt', adding a numeric suffix if that name is taken.

        Names are compared case-insensitively so that two outputs cannot
        clash on a case-insensitive filesystem. The chosen name is added to
        ``used_names``.
        """
        candidate = f"{stem}.txt"
        counter = 2
        while candidate.casefold() in used_names:
            candidate = f"{stem}_{counter}.txt"
            counter += 1
        used_names.add(candidate.casefold())
        return candidate

    @staticmethod
    def _write_text_file(output_file, pdf_path, text_content: str) -> None:
        """Write the extracted text preceded by a source/date header."""
        # Local import keeps this class self-contained without touching the
        # module-level import block.
        from datetime import date

        with open(output_file, 'w', encoding='utf-8') as f:
            f.write(f"SOURCE: {pdf_path}\n")
            # Stamp the real conversion date rather than a fixed literal.
            f.write(f"CONVERTED: {date.today().isoformat()}\n")
            f.write("="*50 + "\n\n")
            f.write(text_content)

    def convert_all_pdfs(self, pdf_directory: str, output_directory: str):
        """Convert all PDFs in directory to text files.

        Searches ``pdf_directory`` recursively for ``*.pdf`` files, writes
        one ``.txt`` file per PDF that produced text into
        ``output_directory`` (created if missing), and dumps
        ``conversion_stats`` to ``conversion_stats.json`` there.

        Args:
            pdf_directory: Root directory to search for PDF files.
            output_directory: Directory receiving the text and stats files.

        Returns:
            The number of text files written by this call.
        """
        # Sorted for a reproducible processing order; is_file() skips
        # directories whose name happens to end in ".pdf".
        pdf_files = sorted(
            p for p in Path(pdf_directory).glob("**/*.pdf") if p.is_file()
        )

        print(f"🔄 Converting {len(pdf_files)} PDF files to text...")

        os.makedirs(output_directory, exist_ok=True)
        output_root = Path(output_directory)

        # The recursive search is flattened into one output folder, so PDFs
        # sharing a stem need distinct names to avoid overwriting each other.
        used_names = set()
        written = 0

        for i, pdf_path in enumerate(pdf_files):
            print(f"\n[{i+1}/{len(pdf_files)}] Processing {pdf_path.name}...")

            # Convert to text
            text_content = self.convert_pdf_to_text(str(pdf_path))

            if not text_content.strip():
                print(f" ❌ No text extracted from {pdf_path.name}")
                continue

            # Save as text file
            output_file = output_root / self._unique_output_name(
                pdf_path.stem, used_names
            )
            self._write_text_file(output_file, pdf_path, text_content)
            written += 1

            print(f" ✅ Saved: {output_file}")

        # Save conversion statistics
        stats_file = output_root / "conversion_stats.json"
        with open(stats_file, 'w', encoding='utf-8') as f:
            json.dump(self.conversion_stats, f, indent=2, ensure_ascii=False)

        print(f"\n🎉 PDF conversion complete! Check {output_directory}")

        # Count files actually written, so PDFs that yielded no text and
        # stats left over from earlier calls are not reported as converted.
        return written
|
|
|
|
# Usage
|
|
if __name__ == "__main__":
    # Script entry point.
    # Usage: python <this script> [PDF_DIR [OUTPUT_DIR]]
    # With no arguments the original locations are used.
    import sys  # local import: only the command-line entry point needs argv

    default_pdf_dir = "/mnt/d/GoogleDrive/Cercetasi/carti-camp-jocuri"

    cli_args = sys.argv[1:]
    pdf_dir = cli_args[0] if len(cli_args) > 0 else default_pdf_dir
    # The output folder defaults to a subfolder of the PDF directory, which
    # for the default PDF directory is the originally configured path.
    if len(cli_args) > 1:
        output_dir = cli_args[1]
    else:
        output_dir = str(Path(pdf_dir) / "INDEX-SISTEM-JOCURI" / "converted_pdfs")

    converter = PDFConverter(max_pages=50)

    # Convert all PDFs
    converted_count = converter.convert_all_pdfs(pdf_dir, output_dir)
    print(f"Final result: {converted_count} PDFs successfully converted")