"""
Advanced parser for INDEX_MASTER_JOCURI_ACTIVITATI.md
Extracts 500+ individual activities with full details
"""

import re
from pathlib import Path
from typing import List, Dict, Optional, Tuple
from app.models.activity import Activity

class IndexMasterParser:
    """Advanced parser for extracting real activities from INDEX_MASTER"""
    
    def __init__(self, index_file_path: str):
        """Create an empty parser bound to the given INDEX_MASTER file.

        Nothing is read from disk here; the parser only records the path and
        starts with empty content and an empty activity list.

        Args:
            index_file_path: Location of the INDEX_MASTER markdown file.
        """
        source = Path(index_file_path)

        # Main section code -> section title, in document order. The titles
        # must match the file's headings exactly.
        section_titles = (
            ('[A]', 'JOCURI CERCETĂȘEȘTI ȘI SCOUT'),
            ('[B]', 'TEAM BUILDING ȘI COMUNICARE'),
            ('[C]', 'CAMPING ȘI ACTIVITĂȚI EXTERIOR'),
            ('[D]', 'ESCAPE ROOM ȘI PUZZLE-URI'),
            ('[E]', 'ORIENTARE ȘI BUSOLE'),
            ('[F]', 'PRIMUL AJUTOR ȘI SIGURANȚA'),
            ('[G]', 'ACTIVITĂȚI EDUCAȚIONALE'),
            ('[H]', 'RESURSE SPECIALE'),
        )

        self.index_file_path = source
        self.content = ""
        self.activities = []
        self.category_mapping = dict(section_titles)
    
    def load_content(self) -> bool:
        """Read the INDEX_MASTER file into ``self.content``.

        Problems are reported on stdout instead of being raised.

        Returns:
            True when the file exists, was read as UTF-8 and holds at least
            1000 characters; False when it is missing, too small, or any
            exception occurred while reading it.
        """
        source = self.index_file_path
        try:
            if not source.exists():
                print(f"❌ INDEX_MASTER file not found: {source}")
                return False

            self.content = source.read_text(encoding='utf-8')
            size = len(self.content)

            # Sanity check: a real index is far larger than this.
            if size < 1000:
                print(f"⚠️  INDEX_MASTER file seems too small: {size} chars")
                return False

            print(f"✅ Loaded INDEX_MASTER: {size} characters")
            return True

        except Exception as e:
            print(f"❌ Error loading INDEX_MASTER: {e}")
            return False
    
    def parse_all_categories(self) -> List[Activity]:
        """Parse every configured category and collect its activities.

        The file is (re)loaded through ``load_content``; each entry of
        ``self.category_mapping`` is then passed to
        ``parse_category_section`` in mapping order. Progress is reported on
        stdout.

        Results of an earlier call are discarded first, so calling this
        method repeatedly yields the same list rather than one that keeps
        growing with duplicates.

        Returns:
            The activities of all categories (also stored in
            ``self.activities``), or an empty list when the file could not be
            loaded.
        """
        # Start from a fresh list: extending the previous one would duplicate
        # every activity on a second call. Rebinding (rather than clearing)
        # leaves a list handed out by an earlier call untouched.
        self.activities = []

        if not self.load_content():
            return []

        print("🔍 Starting comprehensive parsing of INDEX_MASTER...")

        # Parse each main category
        for category_code, category_name in self.category_mapping.items():
            print(f"\n📂 Processing category {category_code}: {category_name}")
            category_activities = self.parse_category_section(category_code, category_name)
            self.activities.extend(category_activities)
            print(f"   ✅ Extracted {len(category_activities)} activities")

        print(f"\n🎯 Total activities extracted: {len(self.activities)}")
        return self.activities
    
    def parse_category_section(self, category_code: str, category_name: str) -> List[Activity]:
        """Extract the activities listed under one main category heading.

        The heading is a line of the form ``## <code> <name>`` (matched
        case-insensitively). When it occurs more than once, the last
        occurrence is used, since an earlier one is presumably the table of
        contents. The section runs up to the next ``## [A-H] ...`` heading or
        to the end of the content.

        Args:
            category_code: Bracketed section code, e.g. ``[A]``.
            category_name: Section title as it appears in the heading.

        Returns:
            The activities found in the section; an empty list when the
            heading is not present.
        """
        heading_re = rf"^## {re.escape(category_code)} {re.escape(category_name)}\s*$"
        headings = list(re.finditer(heading_re, self.content, re.MULTILINE | re.IGNORECASE))

        if not headings:
            print(f"   ⚠️  Category section not found: {category_code}")
            return []

        heading = headings[-1]
        print(f"   📍 Found section at position {heading.start()}")

        # Everything after the heading; cut it at the next main category
        # heading when there is one.
        remainder = self.content[heading.end():]
        following = re.search(r"^## \[[A-H]\] [A-ZĂÂÎȘȚ]", remainder, re.MULTILINE)
        body = remainder[:following.start()] if following else remainder

        return list(self._parse_subsections(body, category_name))
    
    def _parse_subsections(self, section_content: str, category_name: str) -> List[Activity]:
        """Split a category section at its ``###`` headings and parse each part.

        Text that precedes the first ``###`` heading is ignored.

        Args:
            section_content: Text of one main category section.
            category_name: Title of the category, passed on to each activity.

        Returns:
            The activities of all subsections, in document order.
        """
        headings = list(re.finditer(r"^### (.+?)$", section_content, re.MULTILINE))

        # A subsection body ends where the next heading starts; the last one
        # runs to the end of the section.
        stops = [later.start() for later in headings[1:]]
        stops.append(len(section_content))

        collected = []
        for heading, stop in zip(headings, stops):
            title = heading.group(1).strip()
            body = section_content[heading.end():stop]
            collected += self._parse_games_in_subsection(body, category_name, title)

        return collected
    
    def _parse_games_in_subsection(self, subsection_text: str, category_name: str, subsection_title: str) -> List[Activity]:
        """Build one activity per game listed in a subsection.

        Games are read from every ``**Exemple de jocuri:**`` block, which is
        expected to hold numbered lines of the form
        ``N. **Name** - description``. Each block ends at the next line that
        starts with ``**`` or at the end of the text. Metadata found in the
        subsection (participants, duration, materials, keywords) is attached
        to every game. Activities derived from ``**Fișier:**`` references are
        appended at the end.

        Args:
            subsection_text: Body of one ``###`` subsection.
            category_name: Title of the enclosing main category.
            subsection_title: Title of the subsection.

        Returns:
            The activities found, numbered games first.
        """
        activities = []

        # The metadata belongs to the subsection, not to a single game, so it
        # is extracted once instead of once per game.
        metadata = self._extract_subsection_metadata(subsection_text)

        # Look for "Exemple de jocuri:" sections
        examples_pattern = r"\*\*Exemple de jocuri:\*\*\s*\n(.*?)(?=\n\*\*|$)"
        # Individual games (numbered list)
        game_pattern = r"^(\d+)\.\s*\*\*(.+?)\*\*\s*-\s*(.+?)$"

        for examples_match in re.finditer(examples_pattern, subsection_text, re.DOTALL):
            examples_text = examples_match.group(1)

            for game_match in re.finditer(game_pattern, examples_text, re.MULTILINE):
                game_number = game_match.group(1)
                game_name = game_match.group(2).strip()
                game_description = game_match.group(3).strip()

                activities.append(
                    Activity(
                        name=game_name,
                        description=game_description,
                        category=category_name,
                        subcategory=subsection_title,
                        source_file="INDEX_MASTER_JOCURI_ACTIVITATI.md",
                        page_reference=f"{category_name} > {subsection_title} > #{game_number}",
                        **metadata
                    )
                )

        # Also extract from direct activity descriptions without "Exemple de jocuri"
        activities.extend(self._parse_direct_activities(subsection_text, category_name, subsection_title))

        return activities
    
    def _extract_subsection_metadata(self, subsection_text: str) -> Dict:
        """Collect the labelled metadata fields of a subsection.

        Each field is written as ``**Label:** value``; the value ends at the
        next newline or the next ``**``. Only the first occurrence of a label
        is used, and a label that is absent contributes no keys.

        Args:
            subsection_text: Body of one ``###`` subsection.

        Returns:
            A dict that may hold the participant and age keys produced by
            ``_parse_participants``, the duration keys produced by
            ``_parse_duration``, ``materials_list``, ``materials_category``
            and ``keywords``.
        """
        def labelled_value(pattern):
            # Stripped value of the first match, or None when the label is
            # missing. An empty string is a valid (present) value.
            found = re.search(pattern, subsection_text)
            return None if found is None else found.group(1).strip()

        metadata = {}

        participants_text = labelled_value(r"\*\*Participanți:\*\*\s*(.+?)(?:\n|\*\*)")
        if participants_text is not None:
            metadata.update(self._parse_participants(participants_text))

        duration_text = labelled_value(r"\*\*Durata:\*\*\s*(.+?)(?:\n|\*\*)")
        if duration_text is not None:
            metadata.update(self._parse_duration(duration_text))

        materials_text = labelled_value(r"\*\*Materiale:\*\*\s*(.+?)(?:\n|\*\*)")
        if materials_text is not None:
            metadata['materials_list'] = materials_text
            metadata['materials_category'] = self._categorize_materials(materials_text)

        keywords_text = labelled_value(r"\*\*Cuvinte cheie:\*\*\s*(.+?)(?:\n|\*\*)")
        if keywords_text is not None:
            metadata['keywords'] = keywords_text

        return metadata
    
    def _parse_participants(self, participants_text: str) -> Dict:
        """Parse group size and age information from a participants string.

        Recognised forms:
            * ``"8-30 copii"``  -> ``participants_min`` / ``participants_max``
            * ``"5+ persoane"`` -> ``participants_min``
            * ``"10-14 ani"``   -> ``age_group_min`` / ``age_group_max``
            * ``"12+ ani"``     -> ``age_group_min``

        Numbers that belong to an age expression (followed by the word
        ``ani``) are never interpreted as a group size, regardless of where
        they appear in the text.

        Args:
            participants_text: Free text taken from a ``**Participanți:**`` field.

        Returns:
            A dict holding only the keys that could be determined; empty when
            nothing was recognised.
        """
        result = {}
        age_fields = {}

        # Age first, so its numbers can be kept out of the group-size search.
        # The word boundary stops "ani" from matching inside longer words
        # such as "animatori".
        age_match = re.search(r"(\d+)-(\d+)\s*ani\b", participants_text)
        if age_match:
            age_fields['age_group_min'] = int(age_match.group(1))
            age_fields['age_group_max'] = int(age_match.group(2))
        else:
            age_match = re.search(r"(\d+)\+\s*ani\b", participants_text)
            if age_match:
                age_fields['age_group_min'] = int(age_match.group(1))

        # Blank out the age expression before looking for the group size.
        counts_text = participants_text
        if age_match:
            counts_text = (
                participants_text[:age_match.start()]
                + " "
                + participants_text[age_match.end():]
            )

        # Number ranges like "8-30 copii" or "5-15 persoane"
        range_match = re.search(r"(\d+)-(\d+)", counts_text)
        if range_match:
            result['participants_min'] = int(range_match.group(1))
            result['participants_max'] = int(range_match.group(2))
        else:
            # Open-ended sizes like "5+ persoane"
            number_match = re.search(r"(\d+)\+", counts_text)
            if number_match:
                result['participants_min'] = int(number_match.group(1))

        result.update(age_fields)
        return result
    
    def _parse_duration(self, duration_text: str) -> Dict:
        """Parse a duration given in minutes.

        A range such as ``"5-20 minute"`` or ``"15-30min"`` yields both
        ``duration_min`` and ``duration_max``; a single value such as
        ``"45 minute"`` or ``"60+ min"`` yields only ``duration_min``.

        Args:
            duration_text: Free text taken from a ``**Durata:**`` field.

        Returns:
            The recognised keys, or an empty dict when no minute value is
            found.
        """
        span = re.search(r"(\d+)-(\d+)\s*(?:minute|min)", duration_text)
        if span:
            shortest, longest = span.groups()
            return {'duration_min': int(shortest), 'duration_max': int(longest)}

        # No range: fall back to a single (possibly open-ended) value.
        single = re.search(r"(\d+)\+?\s*(?:minute|min)", duration_text)
        if single:
            return {'duration_min': int(single.group(1))}

        return {}
    
    def _categorize_materials(self, materials_text: str) -> str:
        """Map a free-text materials description to a coarse category.

        The lower-cased text is checked for cue substrings, one category at a
        time in the order listed below; the first category with a matching
        cue wins.

        Args:
            materials_text: Free text taken from a ``**Materiale:**`` field.

        Returns:
            The matching category label, or ``'Materiale variate'`` when no
            cue is found.
        """
        lowered = materials_text.lower()

        # (category label, cue substrings) — order matters.
        rules = (
            ('Fără materiale', ('fără', 'nu necesare', 'nimic', 'minime')),
            ('Materiale simple', ('hârtie', 'creion', 'marker', 'simple')),
            ('Materiale complexe', ('computer', 'proiector', 'echipament', 'complexe')),
        )

        for label, cues in rules:
            for cue in cues:
                if cue in lowered:
                    return label

        return 'Materiale variate'
    
    def _parse_direct_activities(self, subsection_text: str, category_name: str, subsection_title: str) -> List[Activity]:
        """Create placeholder activities for subsections that only cite files.

        Applies only when the subsection has no ``**Exemple de jocuri:**``
        block. Every ``**Fișier:** `name` `` reference that is followed
        somewhere by another ``**bold**`` span produces one generic activity;
        up to 200 characters of that bold span are appended to the
        description, and the subsection metadata is attached.

        Args:
            subsection_text: Body of one ``###`` subsection.
            category_name: Title of the enclosing main category.
            subsection_title: Title of the subsection.

        Returns:
            One activity per matched file reference, or an empty list.
        """
        # Subsections with an explicit game list are handled elsewhere.
        if "**Exemple de jocuri:**" in subsection_text:
            return []

        file_pattern = r"\*\*Fișier:\*\*\s*`([^`]+)`.*?\*\*(.+?)\*\*"

        return [
            Activity(
                name=f"Activități din {found.group(1)}",
                description=f"Colecție de activități din fișierul {found.group(1)}. {found.group(2)[:200]}...",
                category=category_name,
                subcategory=subsection_title,
                source_file=found.group(1),
                page_reference=f"{category_name} > {subsection_title}",
                **self._extract_subsection_metadata(subsection_text)
            )
            for found in re.finditer(file_pattern, subsection_text, re.DOTALL)
        ]
    
    def validate_activity_completeness(self, activity: Activity) -> bool:
        """Tell whether an activity carries all mandatory information.

        An activity is complete when ``name``, ``description``, ``category``
        and ``source_file`` are all non-empty after stripping whitespace and
        the (unstripped) description is at least 10 characters long.

        Args:
            activity: The activity to check.

        Returns:
            True when the activity is complete, False otherwise.
        """
        for field in ('name', 'description', 'category', 'source_file'):
            value = getattr(activity, field)
            # Rejects None / empty values as well as whitespace-only text.
            if not (value and value.strip()):
                return False

        # Minimum description length
        return len(activity.description) >= 10
    
    def get_parsing_statistics(self) -> Dict:
        """Summarise the activities currently held in ``self.activities``.

        Returns:
            ``{'total_activities': 0}`` when there are no activities.
            Otherwise a dict with ``total_activities``, ``valid_activities``
            (those passing ``validate_activity_completeness``),
            ``completion_rate`` (valid share as a percentage),
            ``category_breakdown`` (activity count per category) and
            ``average_description_length`` (in characters).
        """
        total = len(self.activities)
        if not total:
            return {'total_activities': 0}

        per_category = {}
        valid = 0
        for item in self.activities:
            per_category[item.category] = per_category.get(item.category, 0) + 1
            if self.validate_activity_completeness(item):
                valid += 1

        description_chars = sum(len(item.description) for item in self.activities)

        return {
            'total_activities': total,
            'valid_activities': valid,
            'completion_rate': (valid / total) * 100,
            'category_breakdown': per_category,
            'average_description_length': description_chars / total
        }