Refactor extraction system and reorganize project structure

- Remove obsolete documentation files (DEPLOYMENT.md, PLAN_IMPLEMENTARE_S8_DETALIAT.md, README.md)
- Add comprehensive extraction pipeline with multiple format support (PDF, HTML, text)
- Implement Claude-based activity extraction with structured templates
- Update dependencies and Docker configuration
- Reorganize scripts directory with modular extraction components
- Move example documentation to appropriate location

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
2025-09-11 23:32:37 +03:00
parent 1b6b7e06ad
commit a19ddf0b71
119 changed files with 91074 additions and 1859 deletions

View File

@@ -0,0 +1,143 @@
#!/usr/bin/env python3
"""
PDF Mass Conversion to Text for Activity Extraction
Handles all PDF sizes efficiently with multiple fallback methods
"""
import datetime
import json
import logging
import os
from pathlib import Path
from typing import Dict, List

import pdfplumber
import PyPDF2
class PDFConverter:
    """Convert PDF files to plain text, with a fallback extractor.

    Extraction is attempted with pdfplumber first and PyPDF2 second. At most
    ``max_pages`` pages are read from each document. The outcome of every
    conversion attempt, including failures, is stored in ``conversion_stats``
    keyed by the path string that was passed in.
    """

    def __init__(self, max_pages=50):
        """Create a converter.

        Args:
            max_pages: Maximum number of pages to extract from each PDF.
        """
        self.max_pages = max_pages
        # Path string -> dict with 'method', 'pages_processed', 'total_pages',
        # 'success', 'text_length' (and 'error' for failed conversions).
        self.conversion_stats = {}

    def convert_pdf_to_text(self, pdf_path: str) -> str:
        """Convert one PDF to text, falling back to PyPDF2 if pdfplumber fails.

        Args:
            pdf_path: Path of the PDF file to convert.

        Returns:
            The extracted text, or ``""`` when both extractors raised. This
            method never raises for extraction errors; a failure is printed
            and recorded in ``conversion_stats`` with ``'success': False``.
        """
        # Broad catches are deliberate: any error in one extractor should
        # trigger the fallback instead of aborting a batch run.
        try:
            # Method 1: pdfplumber (primary)
            return self._convert_with_pdfplumber(pdf_path)
        except Exception as e:
            print(f"pdfplumber failed for {pdf_path}: {e}")
        try:
            # Method 2: PyPDF2 (fallback)
            return self._convert_with_pypdf2(pdf_path)
        except Exception as e2:
            print(f"PyPDF2 also failed for {pdf_path}: {e2}")
            # Record the failure so the stats file lists every attempted PDF,
            # not only the ones that converted.
            self._record_stats(pdf_path, None, 0, 0, "", success=False, error=str(e2))
            return ""

    def _extract_pages(self, pages, count) -> str:
        """Extract and concatenate the text of the first ``count`` pages.

        Args:
            pages: Indexable collection whose items provide ``extract_text()``.
            count: Number of leading pages to read.

        Returns:
            The text of every page that yielded any, each preceded by a
            ``--- PAGE n ---`` marker. Pages that raise are reported and
            skipped so one bad page does not lose the whole document.
        """
        parts = []
        for i in range(count):
            try:
                page_text = pages[i].extract_text()
            except Exception as e:
                print(f" Error on page {i+1}: {e}")
                continue
            if page_text:
                parts.append(f"\n--- PAGE {i+1} ---\n{page_text}\n")
        # Join once instead of growing a string with += for every page.
        return "".join(parts)

    def _record_stats(self, pdf_path, method, pages_processed, total_pages,
                      text_content, success=True, error=None):
        """Store the outcome of one conversion attempt in ``conversion_stats``."""
        stats = {
            'method': method,
            'pages_processed': pages_processed,
            'total_pages': total_pages,
            'success': success,
            'text_length': len(text_content),
        }
        if error is not None:
            stats['error'] = error
        self.conversion_stats[pdf_path] = stats

    def _convert_with_pdfplumber(self, pdf_path: str) -> str:
        """Primary conversion method using pdfplumber.

        Raises:
            Exception: Whatever pdfplumber raises while opening the file or
                listing its pages; per-page extraction errors are skipped.
        """
        with pdfplumber.open(pdf_path) as pdf:
            total_pages = len(pdf.pages)
            pages_to_process = min(total_pages, self.max_pages)
            print(f" Converting {pdf_path}: {pages_to_process}/{total_pages} pages")
            text_content = self._extract_pages(pdf.pages, pages_to_process)
        self._record_stats(pdf_path, 'pdfplumber', pages_to_process, total_pages, text_content)
        return text_content

    def _convert_with_pypdf2(self, pdf_path: str) -> str:
        """Fallback conversion method using PyPDF2.

        Raises:
            Exception: Whatever ``open`` or PyPDF2 raises while opening the
                file or listing its pages; per-page errors are skipped.
        """
        with open(pdf_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            total_pages = len(reader.pages)
            pages_to_process = min(total_pages, self.max_pages)
            print(f" Converting {pdf_path} (fallback): {pages_to_process}/{total_pages} pages")
            text_content = self._extract_pages(reader.pages, pages_to_process)
        self._record_stats(pdf_path, 'PyPDF2', pages_to_process, total_pages, text_content)
        return text_content

    @staticmethod
    def _unique_output_path(output_dir, stem, used_names):
        """Return ``output_dir/<stem>.txt``, adding ``_2``, ``_3``... on reuse.

        ``used_names`` holds the lower-cased file names already handed out in
        the current run and is updated in place. Comparison is case-insensitive
        so names also stay distinct on case-insensitive filesystems.
        """
        candidate = f"{stem}.txt"
        counter = 2
        while candidate.lower() in used_names:
            candidate = f"{stem}_{counter}.txt"
            counter += 1
        used_names.add(candidate.lower())
        return output_dir / candidate

    def convert_all_pdfs(self, pdf_directory: str, output_directory: str) -> int:
        """Convert every PDF under ``pdf_directory`` to a text file.

        The directory is searched recursively and the ``.pdf`` extension is
        matched case-insensitively. Each PDF that yields non-blank text is
        written to ``output_directory`` as ``<stem>.txt`` with a short header
        (source path and conversion date). PDFs in different subfolders that
        share a stem get ``_2``, ``_3``... suffixes instead of overwriting
        each other. ``conversion_stats.json`` is written alongside them.

        Args:
            pdf_directory: Root directory to search for PDFs.
            output_directory: Directory for the text files; created if missing.

        Returns:
            The number of entries in ``conversion_stats`` marked successful.
            This counts PDFs whose extraction ran without error even if no
            text was produced, and includes entries from earlier calls on the
            same instance.
        """
        # Filter on a lower-cased suffix because glob("**/*.pdf") is
        # case-sensitive on POSIX and would skip "*.PDF" files. Sorting makes
        # processing order (and therefore collision suffixes) deterministic.
        pdf_files = sorted(
            p for p in Path(pdf_directory).rglob("*")
            if p.is_file() and p.suffix.lower() == ".pdf"
        )
        print(f"🔄 Converting {len(pdf_files)} PDF files to text...")

        output_dir = Path(output_directory)
        output_dir.mkdir(parents=True, exist_ok=True)

        converted_on = datetime.date.today().isoformat()
        used_names = set()

        for i, pdf_path in enumerate(pdf_files, start=1):
            print(f"\n[{i}/{len(pdf_files)}] Processing {pdf_path.name}...")
            text_content = self.convert_pdf_to_text(str(pdf_path))
            if not text_content.strip():
                print(f" ❌ No text extracted from {pdf_path.name}")
                continue

            output_file = self._unique_output_path(output_dir, pdf_path.stem, used_names)
            with open(output_file, 'w', encoding='utf-8') as f:
                f.write(f"SOURCE: {pdf_path}\n")
                f.write(f"CONVERTED: {converted_on}\n")
                f.write("=" * 50 + "\n\n")
                f.write(text_content)
            print(f" ✅ Saved: {output_file}")

        # Save conversion statistics
        stats_file = output_dir / "conversion_stats.json"
        with open(stats_file, 'w', encoding='utf-8') as f:
            json.dump(self.conversion_stats, f, indent=2, ensure_ascii=False)

        print(f"\n🎉 PDF conversion complete! Check {output_directory}")
        return sum(1 for stats in self.conversion_stats.values() if stats['success'])
# Usage: python <this script> [pdf_dir] [output_dir] [--max-pages N]
def main(argv=None):
    """Run the PDF-to-text conversion from the command line.

    All arguments are optional; when omitted, the original hard-coded
    directories and the 50-page limit are used.

    Args:
        argv: Argument list to parse; ``None`` makes argparse read
            ``sys.argv[1:]``.

    Returns:
        The count returned by ``PDFConverter.convert_all_pdfs``.
    """
    # Imported locally because only the script entry point needs it.
    import argparse

    parser = argparse.ArgumentParser(
        description="Convert all PDFs under a directory to text files."
    )
    parser.add_argument(
        "pdf_dir",
        nargs="?",
        default="/mnt/d/GoogleDrive/Cercetasi/carti-camp-jocuri",
        help="directory searched recursively for PDF files",
    )
    parser.add_argument(
        "output_dir",
        nargs="?",
        default="/mnt/d/GoogleDrive/Cercetasi/carti-camp-jocuri/INDEX-SISTEM-JOCURI/converted_pdfs",
        help="directory that receives the .txt files and conversion_stats.json",
    )
    parser.add_argument(
        "--max-pages",
        type=int,
        default=50,
        help="maximum number of pages to extract per PDF",
    )
    args = parser.parse_args(argv)

    converter = PDFConverter(max_pages=args.max_pages)
    # Convert all PDFs
    converted_count = converter.convert_all_pdfs(args.pdf_dir, args.output_dir)
    print(f"Final result: {converted_count} PDFs successfully converted")
    return converted_count


if __name__ == "__main__":
    main()