Refactor extraction system and reorganize project structure

- Remove obsolete documentation files (DEPLOYMENT.md, PLAN_IMPLEMENTARE_S8_DETALIAT.md, README.md)
- Add comprehensive extraction pipeline with multiple format support (PDF, HTML, text)
- Implement Claude-based activity extraction with structured templates
- Update dependencies and Docker configuration
- Reorganize scripts directory with modular extraction components
- Move example documentation to appropriate location

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-09-11 23:32:37 +03:00
parent 1b6b7e06ad
commit a19ddf0b71
119 changed files with 91074 additions and 1859 deletions
--- a/scripts/pdf_to_text_converter.py
+++ b/scripts/pdf_to_text_converter.py
@@ -0,0 +1,143 @@
+#!/usr/bin/env python3
+"""
+PDF Mass Conversion to Text for Activity Extraction
+Reads up to max_pages pages per PDF with pdfplumber, falling back to PyPDF2 on error
+"""
+
+import json
+import logging
+import os
+from datetime import date
+from pathlib import Path
+from typing import List, Dict
+
+import pdfplumber
+import PyPDF2
+
+class PDFConverter:
+    """Convert PDF files to plain text with pdfplumber, falling back to PyPDF2.
+
+    Args:
+        max_pages: Maximum number of pages read from each PDF, counted from
+            the first page. ``None`` removes the limit.
+
+    Attributes:
+        conversion_stats: Maps each processed PDF path to a dict describing
+            the outcome (``method``, ``pages_processed``, ``total_pages``,
+            ``success``, ``text_length`` and, for failures, ``error``).
+            Entries accumulate across calls on the same instance.
+    """
+
+    def __init__(self, max_pages=50):
+        self.max_pages = max_pages
+        self.conversion_stats = {}
+
+    def convert_pdf_to_text(self, pdf_path: str) -> str:
+        """Return the text of ``pdf_path``, or ``""`` when both methods fail.
+
+        pdfplumber is tried first; PyPDF2 is used only when pdfplumber
+        raises. A double failure is recorded in ``conversion_stats`` with
+        ``success`` set to ``False`` so it shows up in the statistics file.
+        """
+        # Broad catches are deliberate: this is the boundary around two
+        # third-party parsers, and one bad PDF must not abort a whole batch.
+        try:
+            # Method 1: pdfplumber (best for tables and layout)
+            return self._convert_with_pdfplumber(pdf_path)
+        except Exception as e:
+            print(f"pdfplumber failed for {pdf_path}: {e}")
+
+        try:
+            # Method 2: PyPDF2 (fallback)
+            return self._convert_with_pypdf2(pdf_path)
+        except Exception as e:
+            print(f"PyPDF2 also failed for {pdf_path}: {e}")
+            self.conversion_stats[pdf_path] = {
+                'method': None,
+                # Page counts are 0 because neither reader got far enough
+                # to report them.
+                'pages_processed': 0,
+                'total_pages': 0,
+                'success': False,
+                'text_length': 0,
+                'error': str(e),
+            }
+            return ""
+
+    def _page_limit(self, total_pages: int) -> int:
+        """Return how many of ``total_pages`` pages should be read."""
+        if self.max_pages is None:
+            return total_pages
+        # Clamp at 0 so a negative max_pages cannot turn into a
+        # "drop the last N pages" slice or a negative range.
+        return max(0, min(total_pages, self.max_pages))
+
+    @staticmethod
+    def _extract_pages(pages, page_count: int) -> str:
+        """Join the text of the first ``page_count`` entries of ``pages``.
+
+        Each page with text is preceded by a ``--- PAGE n ---`` marker.
+        Pages that raise during lookup or extraction are reported and
+        skipped; pages without text are skipped silently.
+        """
+        chunks = []
+        for index in range(page_count):
+            try:
+                # Indexing stays inside the try: PyPDF2 resolves pages
+                # lazily, so the lookup itself can fail on a damaged file.
+                page_text = pages[index].extract_text()
+            except Exception as e:
+                print(f"    Error on page {index + 1}: {e}")
+                continue
+            if page_text:
+                chunks.append(f"\n--- PAGE {index + 1} ---\n{page_text}\n")
+        return "".join(chunks)
+
+    def _record_success(self, pdf_path: str, method: str, pages_processed: int,
+                        total_pages: int, text_content: str) -> None:
+        """Store the statistics entry for a conversion that completed."""
+        self.conversion_stats[pdf_path] = {
+            'method': method,
+            'pages_processed': pages_processed,
+            'total_pages': total_pages,
+            'success': True,
+            'text_length': len(text_content),
+        }
+
+    def _convert_with_pdfplumber(self, pdf_path: str) -> str:
+        """Primary conversion method using pdfplumber."""
+        with pdfplumber.open(pdf_path) as pdf:
+            total_pages = len(pdf.pages)
+            pages_to_process = self._page_limit(total_pages)
+
+            print(f"  Converting {pdf_path}: {pages_to_process}/{total_pages} pages")
+
+            text_content = self._extract_pages(pdf.pages, pages_to_process)
+
+        self._record_success(pdf_path, 'pdfplumber', pages_to_process,
+                             total_pages, text_content)
+        return text_content
+
+    def _convert_with_pypdf2(self, pdf_path: str) -> str:
+        """Fallback conversion method using PyPDF2."""
+        with open(pdf_path, 'rb') as file:
+            reader = PyPDF2.PdfReader(file)
+            total_pages = len(reader.pages)
+            pages_to_process = self._page_limit(total_pages)
+
+            print(f"  Converting {pdf_path} (fallback): {pages_to_process}/{total_pages} pages")
+
+            text_content = self._extract_pages(reader.pages, pages_to_process)
+
+        self._record_success(pdf_path, 'PyPDF2', pages_to_process,
+                             total_pages, text_content)
+        return text_content
+
+    @staticmethod
+    def _unique_output_name(stem: str, used_names: set) -> str:
+        """Return ``<stem>.txt``, suffixed with ``_2``, ``_3``... if taken.
+
+        ``used_names`` holds the lower-cased names already handed out in the
+        current run and is updated in place. Comparison is case-insensitive
+        so the result is also safe on case-insensitive filesystems.
+        """
+        candidate = f"{stem}.txt"
+        counter = 2
+        while candidate.lower() in used_names:
+            candidate = f"{stem}_{counter}.txt"
+            counter += 1
+        used_names.add(candidate.lower())
+        return candidate
+
+    def convert_all_pdfs(self, pdf_directory: str, output_directory: str):
+        """Convert every PDF under ``pdf_directory`` to a text file.
+
+        The directory is searched recursively and the ``.pdf`` extension is
+        matched case-insensitively. Each PDF that yields text is written to
+        ``output_directory`` as ``<stem>.txt``; PDFs sharing a stem get a
+        numeric suffix instead of overwriting each other. The accumulated
+        ``conversion_stats`` are saved as ``conversion_stats.json``.
+
+        Returns:
+            The number of text files written by this call.
+        """
+        # Sorted so progress output and collision suffixes are reproducible.
+        pdf_files = sorted(
+            path for path in Path(pdf_directory).rglob("*")
+            if path.is_file() and path.suffix.lower() == ".pdf"
+        )
+
+        print(f"🔄 Converting {len(pdf_files)} PDF files to text...")
+
+        output_root = Path(output_directory)
+        output_root.mkdir(parents=True, exist_ok=True)
+
+        used_names = set()
+        written = 0
+
+        for i, pdf_path in enumerate(pdf_files, start=1):
+            print(f"\n[{i}/{len(pdf_files)}] Processing {pdf_path.name}...")
+
+            text_content = self.convert_pdf_to_text(str(pdf_path))
+
+            if not text_content.strip():
+                print(f"  ❌ No text extracted from {pdf_path.name}")
+                continue
+
+            output_file = output_root / self._unique_output_name(pdf_path.stem, used_names)
+            with open(output_file, 'w', encoding='utf-8') as f:
+                f.write(f"SOURCE: {pdf_path}\n")
+                f.write(f"CONVERTED: {date.today().isoformat()}\n")
+                f.write("="*50 + "\n\n")
+                f.write(text_content)
+
+            written += 1
+            print(f"  ✅ Saved: {output_file}")
+
+        # Save conversion statistics
+        stats_file = output_root / "conversion_stats.json"
+        with open(stats_file, 'w', encoding='utf-8') as f:
+            json.dump(self.conversion_stats, f, indent=2, ensure_ascii=False)
+
+        print(f"\n🎉 PDF conversion complete! Check {output_directory}")
+        return written
+
+# Usage: python pdf_to_text_converter.py [pdf_dir] [output_dir] [--max-pages N]
+def main(argv=None):
+    """Convert all PDFs under a directory from the command line.
+
+    Args:
+        argv: Argument list to parse; ``None`` makes argparse read
+            ``sys.argv``. With no arguments the original hard-coded source
+            directory, output directory and 50-page limit are used.
+    """
+    import argparse  # local import: only the CLI entry point needs it
+
+    parser = argparse.ArgumentParser(
+        description="Convert every PDF under a directory to text files"
+    )
+    parser.add_argument(
+        "pdf_dir",
+        nargs="?",
+        default="/mnt/d/GoogleDrive/Cercetasi/carti-camp-jocuri",
+        help="directory searched recursively for PDF files",
+    )
+    parser.add_argument(
+        "output_dir",
+        nargs="?",
+        default=None,
+        help="where text files are written "
+             "(default: <pdf_dir>/INDEX-SISTEM-JOCURI/converted_pdfs)",
+    )
+    parser.add_argument(
+        "--max-pages",
+        type=int,
+        default=50,
+        help="maximum number of pages read from each PDF",
+    )
+    args = parser.parse_args(argv)
+
+    # Derive the output location from pdf_dir so that pointing the script at
+    # another folder does not silently write into the default one.
+    output_dir = args.output_dir
+    if output_dir is None:
+        output_dir = str(Path(args.pdf_dir) / "INDEX-SISTEM-JOCURI" / "converted_pdfs")
+
+    converter = PDFConverter(max_pages=args.max_pages)
+
+    # Convert all PDFs
+    converted_count = converter.convert_all_pdfs(args.pdf_dir, output_dir)
+    print(f"Final result: {converted_count} PDFs successfully converted")
+
+
+if __name__ == "__main__":
+    main()