Refactor extraction system and reorganize project structure
- Remove obsolete documentation files (DEPLOYMENT.md, PLAN_IMPLEMENTARE_S8_DETALIAT.md, README.md)
- Add comprehensive extraction pipeline with multiple format support (PDF, HTML, text)
- Implement Claude-based activity extraction with structured templates
- Update dependencies and Docker configuration
- Reorganize scripts directory with modular extraction components
- Move example documentation to appropriate location

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
143
scripts/pdf_to_text_converter.py
Normal file
143
scripts/pdf_to_text_converter.py
Normal file
@@ -0,0 +1,143 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
PDF Mass Conversion to Text for Activity Extraction
|
||||
Handles all PDF sizes efficiently with multiple fallback methods
|
||||
"""
|
||||
|
||||
import os
|
||||
import json
|
||||
from pathlib import Path
|
||||
import PyPDF2
|
||||
import pdfplumber
|
||||
from typing import List, Dict
|
||||
import logging
|
||||
|
||||
class PDFConverter:
    """Convert PDF files to plain text with pdfplumber, falling back to PyPDF2.

    Per-file results are collected in ``conversion_stats`` and written to
    ``conversion_stats.json`` by :meth:`convert_all_pdfs`.
    """

    def __init__(self, max_pages=50):
        """Create a converter.

        Args:
            max_pages: Upper bound on the number of pages read from each PDF.
                ``None`` removes the limit.
        """
        self.max_pages = max_pages
        # Maps each processed PDF path (str) to a dict with the keys
        # 'method', 'pages_processed', 'total_pages', 'success', 'text_length'.
        self.conversion_stats = {}

    def convert_pdf_to_text(self, pdf_path: str) -> str:
        """Convert one PDF to text, trying pdfplumber first and PyPDF2 second.

        PyPDF2 is tried when pdfplumber raises *or* returns only whitespace.
        If neither backend can open the file, a failure entry is stored in
        ``conversion_stats`` so the problem is visible in the saved stats.

        Args:
            pdf_path: Path of the PDF file to read.

        Returns:
            The extracted text, or an empty string when nothing was extracted.
        """
        primary_text = None
        try:
            # Method 1: pdfplumber (best for tables and layout)
            primary_text = self._convert_with_pdfplumber(pdf_path)
        except Exception as e:  # PDF backends raise many unrelated error types
            print(f"pdfplumber failed for {pdf_path}: {e}")

        if primary_text is not None and primary_text.strip():
            return primary_text

        try:
            # Method 2: PyPDF2 (fallback)
            return self._convert_with_pypdf2(pdf_path)
        except Exception as e2:
            print(f"PyPDF2 also failed for {pdf_path}: {e2}")

        if primary_text is None:
            # Both backends raised: record the failure explicitly.
            self._record_stats(pdf_path, None, 0, 0, "")
            return ""
        # pdfplumber ran but found nothing; its stats entry is already stored.
        return primary_text

    def _page_limit(self, total_pages):
        """Return how many of ``total_pages`` pages should be processed."""
        if self.max_pages is None:
            return total_pages
        return max(0, min(total_pages, self.max_pages))

    @staticmethod
    def _format_page(page_number, page_text):
        """Return ``page_text`` preceded by its ``--- PAGE n ---`` marker."""
        return f"\n--- PAGE {page_number} ---\n{page_text}\n"

    def _record_stats(self, pdf_path, method, pages_processed, total_pages, text):
        """Store the outcome for ``pdf_path``; success means text was extracted."""
        self.conversion_stats[pdf_path] = {
            'method': method,
            'pages_processed': pages_processed,
            'total_pages': total_pages,
            'success': bool(text.strip()),
            'text_length': len(text),
        }

    def _convert_with_pdfplumber(self, pdf_path: str) -> str:
        """Primary conversion method using pdfplumber.

        Pages whose extraction raises are reported and skipped; errors while
        opening the document propagate to the caller.
        """
        chunks = []

        with pdfplumber.open(pdf_path) as pdf:
            total_pages = len(pdf.pages)
            pages_to_process = self._page_limit(total_pages)

            print(f" Converting {pdf_path}: {pages_to_process}/{total_pages} pages")

            for page_number, page in enumerate(pdf.pages[:pages_to_process], start=1):
                try:
                    page_text = page.extract_text()
                except Exception as e:
                    print(f" Error on page {page_number}: {e}")
                    continue
                if page_text:
                    chunks.append(self._format_page(page_number, page_text))

        text_content = "".join(chunks)
        self._record_stats(pdf_path, 'pdfplumber', pages_to_process, total_pages, text_content)
        return text_content

    def _convert_with_pypdf2(self, pdf_path: str) -> str:
        """Fallback conversion method using PyPDF2.

        Pages whose extraction raises are reported and skipped; errors while
        opening the document propagate to the caller.
        """
        chunks = []

        with open(pdf_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            total_pages = len(reader.pages)
            pages_to_process = self._page_limit(total_pages)

            print(f" Converting {pdf_path} (fallback): {pages_to_process}/{total_pages} pages")

            for index in range(pages_to_process):
                try:
                    page_text = reader.pages[index].extract_text()
                except Exception as e:
                    print(f" Error on page {index + 1}: {e}")
                    continue
                if page_text:
                    chunks.append(self._format_page(index + 1, page_text))

        text_content = "".join(chunks)
        self._record_stats(pdf_path, 'PyPDF2', pages_to_process, total_pages, text_content)
        return text_content

    @staticmethod
    def _unique_output_name(pdf_path, source_root, used_names):
        """Return a ``.txt`` file name for ``pdf_path`` not yet in ``used_names``.

        The plain stem is kept when it is free. On a clash (same file name in
        different subfolders) the path relative to ``source_root`` is joined
        with ``__``, and a numeric suffix is added if that is taken as well.
        Names are compared case-insensitively.
        """
        candidate = pdf_path.stem
        if candidate.casefold() in used_names:
            relative = pdf_path.relative_to(source_root).with_suffix("")
            candidate = "__".join(relative.parts)
        base = candidate
        counter = 2
        while candidate.casefold() in used_names:
            candidate = f"{base}_{counter}"
            counter += 1
        used_names.add(candidate.casefold())
        return f"{candidate}.txt"

    def convert_all_pdfs(self, pdf_directory: str, output_directory: str):
        """Convert every PDF under ``pdf_directory`` (recursively) to a text file.

        Each PDF with extractable text becomes one ``.txt`` file in
        ``output_directory`` with a short header (source path, conversion
        date). ``conversion_stats.json`` is written to the same directory.

        Args:
            pdf_directory: Directory searched recursively for ``*.pdf`` files.
            output_directory: Directory for the text files; created if missing.

        Returns:
            The number of text files written by this call.
        """
        from datetime import date

        source_root = Path(pdf_directory)
        output_root = Path(output_directory)
        # Sorted so processing order and collision-renaming are deterministic.
        pdf_files = sorted(p for p in source_root.glob("**/*.pdf") if p.is_file())

        print(f"🔄 Converting {len(pdf_files)} PDF files to text...")

        os.makedirs(output_directory, exist_ok=True)

        converted_on = date.today().isoformat()
        used_names = set()
        saved_count = 0

        for position, pdf_path in enumerate(pdf_files, start=1):
            print(f"\n[{position}/{len(pdf_files)}] Processing {pdf_path.name}...")

            # Convert to text
            text_content = self.convert_pdf_to_text(str(pdf_path))

            if not text_content.strip():
                print(f" ❌ No text extracted from {pdf_path.name}")
                continue

            # Save as text file
            output_name = self._unique_output_name(pdf_path, source_root, used_names)
            output_file = output_root / output_name
            with open(output_file, 'w', encoding='utf-8') as f:
                f.write(f"SOURCE: {pdf_path}\n")
                f.write(f"CONVERTED: {converted_on}\n")
                f.write("=" * 50 + "\n\n")
                f.write(text_content)

            saved_count += 1
            print(f" ✅ Saved: {output_file}")

        # Save conversion statistics
        stats_file = output_root / "conversion_stats.json"
        with open(stats_file, 'w', encoding='utf-8') as f:
            json.dump(self.conversion_stats, f, indent=2, ensure_ascii=False)

        print(f"\n🎉 PDF conversion complete! Check {output_directory}")
        return saved_count
|
||||
|
||||
# Usage
|
||||
if __name__ == "__main__":
    import sys

    # Usage: pdf_to_text_converter.py [PDF_DIR [OUTPUT_DIR]]
    # Without arguments the original hard-coded locations are used.
    cli_args = sys.argv[1:]

    converter = PDFConverter(max_pages=50)

    # Convert all PDFs
    if len(cli_args) >= 1:
        pdf_dir = cli_args[0]
    else:
        pdf_dir = "/mnt/d/GoogleDrive/Cercetasi/carti-camp-jocuri"

    if len(cli_args) >= 2:
        output_dir = cli_args[1]
    else:
        # Default output lives inside the PDF directory, as before.
        output_dir = f"{pdf_dir}/INDEX-SISTEM-JOCURI/converted_pdfs"

    converted_count = converter.convert_all_pdfs(pdf_dir, output_dir)
    print(f"Final result: {converted_count} PDFs successfully converted")
|
||||
Reference in New Issue
Block a user