#!/usr/bin/env python3
"""
Unified Activity Processor
Orchestrates all extractors for complete processing.
"""

import time
from pathlib import Path
from html_extractor import HTMLActivityExtractor
from text_extractor import TextActivityExtractor
import sqlite3

class UnifiedProcessor:
    """Run the automated extractors and report what is left for manual work.

    The processor drives the HTML and text extractors over a directory tree,
    measures how many rows were added to the ``activities`` table, and writes
    a list of PDF/DOC/DOCX files that still need manual processing.
    """

    # Glob patterns counted for each category reported by
    # count_files_to_process(); insertion order is the display order.
    _COUNT_PATTERNS = {
        'html': ("*.html", "*.htm"),
        'txt': ("*.txt",),
        'md': ("*.md",),
        'pdf': ("*.pdf",),
        'doc': ("*.doc",),
        'docx': ("*.docx",),
    }

    def __init__(self, db_path='data/activities.db'):
        """Create the extractors and reset the run statistics.

        Args:
            db_path: Path of the SQLite database; it is also handed to both
                extractors.
        """
        self.db_path = db_path
        self.html_extractor = HTMLActivityExtractor(db_path)
        self.text_extractor = TextActivityExtractor(db_path)
        self.stats = {
            'html_processed': 0,
            'text_processed': 0,
            'pdf_to_process': 0,
            'doc_to_process': 0,
            'total_activities': 0,
            'start_time': None,
            'end_time': None
        }

    def get_current_activity_count(self):
        """Return the current number of rows in the ``activities`` table.

        Raises:
            sqlite3.Error: If the database cannot be queried (for example the
                table does not exist). The connection is closed either way.
        """
        conn = sqlite3.connect(self.db_path)
        try:
            cursor = conn.execute("SELECT COUNT(*) FROM activities")
            return cursor.fetchone()[0]
        finally:
            # Close even when the query fails so the handle is never leaked.
            conn.close()

    def count_files_to_process(self, base_path):
        """Count the files below ``base_path`` for each handled format.

        Args:
            base_path: Root directory, searched recursively.

        Returns:
            dict mapping 'html', 'txt', 'md', 'pdf', 'doc' and 'docx' to the
            number of matching paths ('html' covers both *.html and *.htm).
        """
        base_path = Path(base_path)

        # Count lazily instead of materializing a list per pattern.
        return {
            kind: sum(
                1
                for pattern in patterns
                for _ in base_path.rglob(pattern)
            )
            for kind, patterns in self._COUNT_PATTERNS.items()
        }

    def process_automated_formats(self, base_path='/mnt/d/GoogleDrive/Cercetasi/carti-camp-jocuri'):
        """Process every format that can be handled automatically.

        Runs the HTML extractor, then the text/markdown extractor, records
        the statistics in ``self.stats``, prints a summary and writes the
        list of PDF/DOC files left for manual processing.

        Args:
            base_path: Root directory containing the source files.
        """
        print("="*60)
        print("UNIFIED ACTIVITY PROCESSOR - AUTOMATED PHASE")
        print("="*60)

        self.stats['start_time'] = time.time()
        initial_count = self.get_current_activity_count()

        # Show the initial statistics.
        file_counts = self.count_files_to_process(base_path)
        print("\nFiles to process:")
        for fmt, count in file_counts.items():
            print(f"  {fmt.upper()}: {count} files")
        print(f"\nCurrent activities in database: {initial_count}")
        print("-"*60)

        # PHASE 1: HTML processing (top priority - large volume).
        print("\n[1/2] Processing HTML files...")
        print("-"*40)
        # The second element of the returned pair is not used here.
        html_processed, _html_errors = self.html_extractor.process_all_html_files(base_path)
        self.stats['html_processed'] = html_processed

        # PHASE 2: Text/Markdown processing.
        print("\n[2/2] Processing Text/Markdown files...")
        print("-"*40)
        # The second element of the returned pair is not used here.
        text_processed, _text_saved = self.text_extractor.process_all_text_files(base_path)
        self.stats['text_processed'] = text_processed

        # Final statistics: new activities = growth of the table during the run.
        self.stats['end_time'] = time.time()
        final_count = self.get_current_activity_count()
        self.stats['total_activities'] = final_count - initial_count

        # Files that require manual processing.
        self.stats['pdf_to_process'] = file_counts['pdf']
        self.stats['doc_to_process'] = file_counts['doc'] + file_counts['docx']

        self.print_summary()
        self.save_pdf_doc_list(base_path)

    def print_summary(self):
        """Print the processing summary stored in ``self.stats``.

        When no run has been timed yet (start or end time is still ``None``)
        the processing time is reported as ``n/a`` instead of raising.
        """
        print("\n" + "="*60)
        print("PROCESSING SUMMARY")
        print("="*60)

        started = self.stats['start_time']
        finished = self.stats['end_time']

        print("\nAutomated Processing Results:")
        print(f"  HTML files processed: {self.stats['html_processed']}")
        print(f"  Text/MD files processed: {self.stats['text_processed']}")
        print(f"  New activities added: {self.stats['total_activities']}")
        if started is None or finished is None:
            print("  Processing time: n/a")
        else:
            print(f"  Processing time: {finished - started:.1f} seconds")

        print("\nFiles requiring Claude processing:")
        print(f"  PDF files: {self.stats['pdf_to_process']}")
        print(f"  DOC/DOCX files: {self.stats['doc_to_process']}")

        print("\n" + "="*60)
        print("NEXT STEPS:")
        print("1. Review the file 'pdf_doc_for_claude.txt' for manual processing")
        print("2. Use Claude to extract activities from PDF/DOC files")
        print("3. Focus on largest PDF files first (highest activity density)")
        print("="*60)

    @staticmethod
    def _file_size(path):
        """Return the size of ``path`` in bytes, or 0 if it cannot be stat'ed."""
        try:
            return path.stat().st_size
        except OSError:
            # Broken symlink or file removed while scanning: keep it listed
            # with size 0 rather than aborting the whole report.
            return 0

    @classmethod
    def _sized_largest_first(cls, paths):
        """Return ``(path, size)`` pairs sorted by size, largest first."""
        sized = [(path, cls._file_size(path)) for path in paths]
        sized.sort(key=lambda item: item[1], reverse=True)
        return sized

    def save_pdf_doc_list(self, base_path, output_path='pdf_doc_for_claude.txt'):
        """Write the list of PDF/DOC/DOCX files for processing with Claude.

        Both sections are sorted by size, largest first. Only the 20 largest
        PDF files are listed individually; the rest are summarized as a count.

        Args:
            base_path: Root directory, searched recursively.
            output_path: Destination of the report. Defaults to
                'pdf_doc_for_claude.txt' in the current working directory.
        """
        base_path = Path(base_path)

        # Each file is stat'ed exactly once; the size is reused for sorting
        # and for display.
        pdf_files = self._sized_largest_first(base_path.rglob("*.pdf"))
        doc_files = self._sized_largest_first(
            list(base_path.rglob("*.doc")) + list(base_path.rglob("*.docx"))
        )

        with open(output_path, 'w', encoding='utf-8') as f:
            f.write("PDF/DOC FILES FOR CLAUDE PROCESSING\n")
            f.write("="*60 + "\n")
            f.write("Files sorted by size (largest first = likely more activities)\n\n")

            f.write("TOP PRIORITY PDF FILES (process these first):\n")
            f.write("-"*40 + "\n")
            for i, (pdf, size) in enumerate(pdf_files[:20], 1):
                size_mb = size / (1024*1024)
                f.write(f"{i}. {pdf.name} ({size_mb:.1f} MB)\n")
                f.write(f"   Path: {pdf}\n\n")

            if len(pdf_files) > 20:
                f.write(f"\n... and {len(pdf_files)-20} more PDF files\n\n")

            f.write("\nDOC/DOCX FILES:\n")
            f.write("-"*40 + "\n")
            for doc, size in doc_files:
                size_kb = size / 1024
                f.write(f"- {doc.name} ({size_kb:.1f} KB)\n")

        print(f"\nPDF/DOC list saved to: {output_path}")

if __name__ == "__main__":
    # Script entry point: run the automated phase with the default database
    # path and the default source directory.
    UnifiedProcessor().process_automated_formats()