#!/usr/bin/env python3
"""
HTML Activity Extractor - Proceseaz 1876 fiiere HTML
Extrage automat activiti folosind pattern recognition
"""

import os
import re
import json
from pathlib import Path
from bs4 import BeautifulSoup
import chardet
from typing import List, Dict, Optional
import sqlite3
from datetime import datetime

class HTMLActivityExtractor:
    """Heuristically extract game/activity records from Romanian HTML pages.

    Each HTML file is scanned with four independent strategies (lists,
    headings, free-text regex patterns and tables).  Every hit becomes a plain
    ``dict`` whose keys are used as column names of the ``activities`` table
    when the record is stored with :meth:`save_to_database`.

    Keyword checks are performed on a *folded* copy of the text (lower-cased,
    Romanian diacritics replaced by their ASCII base letter), so ASCII
    keywords such as ``'exercitiu'`` also match ``'Exercițiu'``.
    """

    # Lower-case Romanian diacritics mapped to their ASCII base letter.  Both
    # the comma-below letters and the legacy cedilla letters are covered
    # because either form can appear in the scanned files.
    _DIACRITICS = str.maketrans({
        '\u0103': 'a',  # a with breve
        '\u00e2': 'a',  # a with circumflex
        '\u00ee': 'i',  # i with circumflex
        '\u0219': 's',  # s with comma below
        '\u015f': 's',  # s with cedilla
        '\u021b': 't',  # t with comma below
        '\u0163': 't',  # t with cedilla
    })

    # ASCII-folded words that mark a list or heading as activity related.
    _ACTIVITY_KEYWORDS = ('joc', 'activitate', 'exercitiu')

    _HEADING_TAGS = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']

    def __init__(self, db_path='data/activities.db'):
        """Create an extractor.

        Args:
            db_path: Path of the SQLite database used by
                :meth:`save_to_database`.
        """
        self.db_path = db_path
        # Patterns used to detect activities in Romanian text.  The character
        # classes accept the letter both with and without its diacritic.  In
        # the range patterns the second number is optional, so a single value
        # ("15 minute") is reported as min == max.
        self.activity_patterns = {
            'title_patterns': [
                r'(?i)(joc|activitate|exerci[tțţ]iu|team[\s-]?building|energizer|ice[\s-]?breaker)[\s:]+([^\.]{5,100})',
                r'(?i)<h[1-6][^>]*>([^<]*(?:joc|activitate|exerci[tțţ]iu)[^<]*)</h[1-6]>',
                r'(?i)<strong>([^<]*(?:joc|activitate|exerci[tțţ]iu)[^<]*)</strong>',
                r'(?i)^[\d]+\.?\s*([A-Z][^\.]{10,100}(?:joc|activitate|exerci[tțţ]iu)[^\.]{0,50})$',
            ],
            # Not referenced by the methods of this class.
            'description_markers': [
                'descriere', 'reguli', 'cum se joaca', 'instructiuni',
                'obiectiv', 'desfasurare', 'explicatie', 'mod de joc'
            ],
            # Compared against folded text, hence plain ASCII.
            'materials_markers': [
                'materiale', 'necesare', 'echipament', 'ce avem nevoie',
                'se folosesc', 'trebuie sa avem', 'dotari'
            ],
            'age_patterns': [
                r'(?i)v[âîa]rst[ăa][\s:]+(\d+)(?:[\s-]+(\d+))?',
                # \b keeps words such as "animale" from being read as "ani".
                r'(?i)(\d+)(?:[\s-]+(\d+))?\s*ani\b',
                r'(?i)pentru\s+(\d+)(?:[\s-]+(\d+))?\s*ani\b',
                r'(?i)categoria?\s*(?:de\s*)?v[âîa]rst[ăa][\s:]+(\d+)(?:[\s-]+(\d+))?',
            ],
            'participants_patterns': [
                r'(?i)(\d+)(?:[\s-]+(\d+))?\s*(?:participan[tțţ]i|juc[aă]tori|persoane|copii)',
                r'(?i)num[aă]r\s*(?:de\s*)?(?:participan[tțţ]i|juc[aă]tori)[\s:]+(\d+)(?:[\s-]+(\d+))?',
                r'(?i)grup\s*de\s*(\d+)(?:[\s-]+(\d+))?',
            ],
            'duration_patterns': [
                r'(?i)durat[ăa][\s:]+(\d+)(?:[\s-]+(\d+))?\s*(?:minute|min)',
                r'(?i)timp[\s:]+(\d+)(?:[\s-]+(\d+))?\s*(?:minute|min)',
                r'(?i)(\d+)(?:[\s-]+(\d+))?\s*minute',
            ]
        }

        # Predefined category codes; the first category (in this order) with a
        # keyword contained in the folded text wins.
        self.categories = {
            '[A]': ['joc', 'joaca', 'distractie', 'amuzament'],
            '[B]': ['aventura', 'explorare', 'descoperire'],
            '[C]': ['camping', 'tabara', 'excursie', 'drumetie'],
            '[D]': ['foc', 'flacara', 'lumina'],
            '[E]': ['noduri', 'franghii', 'sfori', 'legare'],
            '[F]': ['bushcraft', 'supravietuire', 'survival'],
            '[G]': ['educatie', 'educativ', 'invatare', 'scoala'],
            '[H]': ['orientare', 'busola', 'harta', 'navigare']
        }

    @classmethod
    def _fold(cls, text):
        """Return *text* lower-cased with Romanian diacritics replaced by ASCII."""
        return text.lower().translate(cls._DIACRITICS)

    def detect_encoding(self, file_path):
        """Return the encoding chardet guesses for *file_path*.

        The whole file is read in binary mode; ``'utf-8'`` is returned when
        chardet reports no encoding.
        """
        with open(file_path, 'rb') as f:
            result = chardet.detect(f.read())
        return result['encoding'] or 'utf-8'

    def extract_from_html(self, html_path: str) -> List[Dict]:
        """Extract activities from a single HTML file.

        Args:
            html_path: Path of the HTML file.

        Returns:
            The activity dicts found by all four strategies, in the order
            lists, headings, text patterns, tables.  Any exception is printed
            and swallowed, in which case the activities collected so far
            (possibly none) are returned.
        """
        activities = []

        try:
            encoding = self.detect_encoding(html_path)
            try:
                with open(html_path, 'r', encoding=encoding, errors='ignore') as f:
                    content = f.read()
            except LookupError:
                # chardet reported a codec name Python cannot open; fall back
                # to UTF-8 instead of giving up on the whole file.
                with open(html_path, 'r', encoding='utf-8', errors='ignore') as f:
                    content = f.read()

            soup = BeautifulSoup(content, 'lxml')

            strategies = (
                self._extract_from_lists,
                self._extract_from_headings,
                self._extract_from_patterns,
                self._extract_from_tables,
            )
            for strategy in strategies:
                activities.extend(strategy(soup, html_path))

        except Exception as e:
            # Best effort: one unreadable file must not stop a batch run.
            print(f"Error processing {html_path}: {e}")

        return activities

    def _extract_from_lists(self, soup, source_file):
        """Extract activities from ``<ul>``/``<ol>`` lists.

        A list is considered only when its folded text contains one of the
        activity keywords; every ``<li>`` longer than 20 characters is then
        passed to :meth:`_create_activity_from_text`.
        """
        activities = []

        for list_elem in soup.find_all(['ul', 'ol']):
            folded = self._fold(list_elem.get_text())
            if not any(keyword in folded for keyword in self._ACTIVITY_KEYWORDS):
                continue
            for li in list_elem.find_all('li'):
                text = li.get_text(strip=True)
                if len(text) > 20:
                    activity = self._create_activity_from_text(text, source_file)
                    if activity:
                        activities.append(activity)

        return activities

    def _extract_from_headings(self, soup, source_file):
        """Extract activities whose title is an ``<h1>``-``<h6>`` heading.

        The description is the text of the following ``<p>``/``<div>``/``<ul>``
        siblings up to the next heading, collected until it exceeds 500
        characters and stored truncated to 1000 characters.
        """
        activities = []

        for heading in soup.find_all(self._HEADING_TAGS):
            heading_text = heading.get_text(strip=True)
            folded_heading = self._fold(heading_text)
            if not any(keyword in folded_heading for keyword in self._ACTIVITY_KEYWORDS):
                continue

            parts = []
            collected = 0
            sibling = heading.find_next_sibling()
            while sibling is not None and sibling.name not in self._HEADING_TAGS:
                if sibling.name in ('p', 'div', 'ul'):
                    chunk = sibling.get_text(strip=True) + " "
                    parts.append(chunk)
                    collected += len(chunk)
                    if collected > 500:  # enough text for a description
                        break
                sibling = sibling.find_next_sibling()

            description = "".join(parts)
            # Skip headings followed only by empty elements.
            if description.strip():
                activities.append({
                    'name': heading_text[:200],
                    'description': description[:1000],
                    'source_file': str(source_file),
                    'category': self._detect_category(heading_text + " " + description)
                })

        return activities

    def _extract_from_patterns(self, soup, source_file):
        """Extract activities by running the title patterns over the page text.

        The patterns are applied to ``soup.get_text()``, so the two patterns
        that contain markup only match when tag text appears literally in the
        extracted text.  The title is the last capturing group that matched
        (or the whole match for a pattern without groups); up to 200
        characters before and 500 after the match are used as description
        context.
        """
        activities = []
        text = soup.get_text()

        for pattern in self.activity_patterns['title_patterns']:
            for match in re.finditer(pattern, text, re.MULTILINE):
                # lastindex is None (not 0) when the pattern has no groups.
                title = match.group(match.lastindex or 0)
                if len(title) <= 10:
                    continue
                start = max(0, match.start() - 200)
                end = min(len(text), match.end() + 500)
                activity = self._create_activity_from_text(text[start:end], source_file, title)
                if activity:
                    activities.append(activity)

        return activities

    def _extract_from_tables(self, soup, source_file):
        """Extract activities from tables that have a header row.

        The cells of the first row are folded and used as column names; a data
        row is converted only when one of the columns is named ``joc``,
        ``activitate``, ``nume`` or ``titlu``.
        """
        activities = []

        for table in soup.find_all('table'):
            rows = table.find_all('tr')
            if len(rows) < 2:  # need a header row plus at least one data row
                continue

            # Fold the headers so that e.g. a "Durată" column maps to 'durata'.
            headers = [
                self._fold(cell.get_text(strip=True))
                for cell in rows[0].find_all(['th', 'td'])
            ]

            for row in rows[1:]:
                cells = row.find_all(['td'])
                if not cells:
                    continue
                # zip() drops cells that have no matching header.
                activity_data = {
                    header: cell.get_text(strip=True)
                    for header, cell in zip(headers, cells)
                }
                if any(key in activity_data for key in ('joc', 'activitate', 'nume', 'titlu')):
                    activity = self._create_activity_from_table_data(activity_data, source_file)
                    if activity:
                        activities.append(activity)

        return activities

    def _create_activity_from_text(self, text, source_file, title=None):
        """Build an activity dict from free text.

        Args:
            text: Description text; ``None`` is returned when it is empty or
                shorter than 30 characters.
            source_file: Origin of the text, stored as ``str``.
            title: Activity name.  When omitted, the first sentence of the
                first 100 characters of *text* is used.

        Returns:
            The activity dict (including any metadata found by
            :meth:`_extract_metadata`) or ``None``.
        """
        if not text or len(text) < 30:
            return None

        activity = {
            'name': title or text[:100].split('.')[0].strip(),
            'description': text[:1000],
            'source_file': str(source_file),
            'category': self._detect_category(text),
            'keywords': self._extract_keywords(text),
            'created_at': datetime.now().isoformat()
        }
        activity.update(self._extract_metadata(text))

        return activity

    def _create_activity_from_table_data(self, data, source_file):
        """Build an activity dict from one table row.

        Args:
            data: Mapping of folded column header to cell text.
            source_file: Origin of the table, stored as ``str``.

        Returns:
            The activity dict, or ``None`` when no name longer than five
            characters is available.  Cell values are copied as raw strings.
        """
        activity = {
            'source_file': str(source_file),
            'created_at': datetime.now().isoformat()
        }

        # Table column name -> database field.  When several columns map to
        # the same field, the one listed last here wins.
        field_mapping = {
            'nume': 'name', 'titlu': 'name', 'joc': 'name', 'activitate': 'name',
            'descriere': 'description', 'detalii': 'description', 'explicatie': 'description',
            'materiale': 'materials_list', 'echipament': 'materials_list',
            'varsta': 'age_group_min', 'categoria': 'category',
            'participanti': 'participants_min', 'numar': 'participants_min',
            'durata': 'duration_min', 'timp': 'duration_min'
        }

        for table_field, db_field in field_mapping.items():
            if table_field in data:
                activity[db_field] = data[table_field]

        if len(activity.get('name', '')) > 5:
            return activity

        return None

    @staticmethod
    def _find_number_range(patterns, text):
        """Return ``(low, high)`` from the first pattern that matches *text*.

        Group 1 is the lower bound.  Group 2, when present and matched, is the
        upper bound; otherwise the upper bound equals the lower bound.
        ``None`` is returned when no pattern matches.
        """
        for pattern in patterns:
            match = re.search(pattern, text)
            if match is None:
                continue
            groups = match.groups()
            low = int(groups[0])
            has_upper = len(groups) > 1 and groups[1] is not None
            return low, int(groups[1]) if has_upper else low
        return None

    def _extract_metadata(self, text):
        """Extract age, participant, duration and material data from *text*.

        Returns:
            A dict containing only the fields that were found:
            ``age_group_min/max``, ``participants_min/max``,
            ``duration_min/max`` (ints) and ``materials_list`` (at most ten
            comma separated items).
        """
        metadata = {}

        range_fields = (
            ('age_patterns', 'age_group'),
            ('participants_patterns', 'participants'),
            ('duration_patterns', 'duration'),
        )
        for pattern_key, prefix in range_fields:
            found = self._find_number_range(self.activity_patterns[pattern_key], text)
            if found is not None:
                metadata[f'{prefix}_min'], metadata[f'{prefix}_max'] = found

        materials = []
        folded = self._fold(text)
        for marker in self.activity_patterns['materials_markers']:
            idx = folded.find(self._fold(marker))
            if idx == -1:
                continue
            # Look at the 200 characters starting at the marker.
            snippet = text[idx:idx + 200]
            # An item starts after a dash, quote or bullet and runs to the
            # next one or the end of the line.  The hyphen is placed last in
            # the negated class so it is a literal, not a range operator.
            for item in re.findall(r'[-"•]\s*([^\n"•-]+)', snippet):
                item = item.strip()
                # Overlapping markers ("materiale necesare") yield the same
                # items twice; keep the first occurrence only.
                if item and item not in materials:
                    materials.append(item)

        if materials:
            metadata['materials_list'] = ', '.join(materials[:10])

        return metadata

    def _detect_category(self, text):
        """Return the code of the first category with a keyword in *text*.

        Matching is done on folded text; ``'[A]'`` (games) is the default.
        """
        folded = self._fold(text)

        for category, keywords in self.categories.items():
            if any(self._fold(keyword) in folded for keyword in keywords):
                return category

        return '[A]'

    def _extract_keywords(self, text):
        """Return up to five known keywords found in *text*, comma separated."""
        folded = self._fold(text)

        # ASCII-folded keywords, reported in this order.
        keyword_list = [
            'cooperare', 'competitie', 'echipa', 'creativitate', 'miscare',
            'strategie', 'comunicare', 'incredere', 'coordonare', 'atentie',
            'reflexe', 'logica', 'imaginatie', 'muzica', 'dans', 'sport',
            'natura', 'mediu', 'stiinta', 'matematica', 'limba', 'cultura'
        ]

        keywords = [keyword for keyword in keyword_list if keyword in folded]

        return ', '.join(keywords[:5])

    def save_to_database(self, activities):
        """Insert *activities* into the ``activities`` table.

        An activity is skipped as a duplicate when a row with the same
        ``name`` and ``source_file`` already exists.  The dict keys (except
        ``created_at``, for which the table is expected to supply a default)
        are used as column names; they originate from this class, not from
        the scanned documents.

        Returns:
            ``(saved_count, duplicate_count)``.
        """
        saved_count = 0
        duplicate_count = 0

        conn = sqlite3.connect(self.db_path)
        try:
            cursor = conn.cursor()

            for activity in activities:
                try:
                    cursor.execute(
                        "SELECT id FROM activities WHERE name = ? AND source_file = ?",
                        (activity.get('name'), activity.get('source_file'))
                    )
                    if cursor.fetchone():
                        duplicate_count += 1
                        continue

                    fields = {
                        key: value for key, value in activity.items()
                        if key != 'created_at'
                    }
                    columns = ', '.join(fields)
                    placeholders = ', '.join('?' for _ in fields)
                    query = f"INSERT INTO activities ({columns}) VALUES ({placeholders})"
                    cursor.execute(query, list(fields.values()))
                    saved_count += 1

                except Exception as e:
                    # Best effort per row: report and go on with the next one.
                    print(f"Error saving activity: {e}")

            conn.commit()
        finally:
            # Release the connection even if the commit or cursor setup fails.
            conn.close()

        return saved_count, duplicate_count

    def process_all_html_files(self, base_path='/mnt/d/GoogleDrive/Cercetasi/carti-camp-jocuri'):
        """Process every ``*.html``/``*.htm`` file below *base_path*.

        Extracted activities are buffered and written to the database after
        every 100th file and once more at the end.

        Returns:
            ``(processed, errors)``.  ``errors`` only counts exceptions that
            reach this method; failures inside :meth:`extract_from_html` are
            reported there and the file still counts as processed.
        """
        base_path = Path(base_path)
        html_files = list(base_path.rglob("*.html"))
        html_files.extend(base_path.rglob("*.htm"))
        total = len(html_files)

        print(f"Found {total} HTML files to process")

        pending = []
        processed = 0
        errors = 0

        for index, html_file in enumerate(html_files, start=1):
            try:
                pending.extend(self.extract_from_html(str(html_file)))
                processed += 1

                if index % 100 == 0:
                    print(f"Progress: {index}/{total} files processed, {len(pending)} activities found")
                    if pending:
                        saved, dupes = self.save_to_database(pending)
                        print(f"Batch saved: {saved} new activities, {dupes} duplicates skipped")
                        pending = []  # buffer flushed

            except Exception as e:
                print(f"Error processing {html_file}: {e}")
                errors += 1

        # Flush whatever is left after the last full batch.
        if pending:
            saved, dupes = self.save_to_database(pending)
            print(f"Final batch saved: {saved} new activities, {dupes} duplicates skipped")

        print("\nProcessing complete!")
        print(f"Files processed: {processed}")
        print(f"Errors: {errors}")

        return processed, errors

# Funcie main pentru test
if __name__ == "__main__":
    extractor = HTMLActivityExtractor()
    
    # Test pe un fiier sample mai �nt�i
    print("Testing on sample file first...")
    # Gsete un fiier HTML pentru test
    test_files = list(Path('/mnt/d/GoogleDrive/Cercetasi/carti-camp-jocuri').rglob("*.html"))[:3]
    
    for test_file in test_files:
        print(f"\nTesting: {test_file}")
        activities = extractor.extract_from_html(str(test_file))
        print(f"Found {len(activities)} activities")
        if activities:
            print(f"Sample activity: {activities[0]['name'][:50]}...")
    
    # �ntreab dac s continue cu procesarea complet
    response = input("\nContinue with full processing? (y/n): ")
    if response.lower() == 'y':
        extractor.process_all_html_files()