""" Advanced parser for INDEX_MASTER_JOCURI_ACTIVITATI.md Extracts 500+ individual activities with full details """ import re from pathlib import Path from typing import List, Dict, Optional, Tuple from app.models.activity import Activity class IndexMasterParser: """Advanced parser for extracting real activities from INDEX_MASTER""" def __init__(self, index_file_path: str): """Initialize parser with INDEX_MASTER file path""" self.index_file_path = Path(index_file_path) self.content = "" self.activities = [] # Category mapping for main sections (exact match from file) self.category_mapping = { '[A]': 'JOCURI CERCETĂȘEȘTI ȘI SCOUT', '[B]': 'TEAM BUILDING ȘI COMUNICARE', '[C]': 'CAMPING ȘI ACTIVITĂȚI EXTERIOR', '[D]': 'ESCAPE ROOM ȘI PUZZLE-URI', '[E]': 'ORIENTARE ȘI BUSOLE', '[F]': 'PRIMUL AJUTOR ȘI SIGURANȚA', '[G]': 'ACTIVITĂȚI EDUCAȚIONALE', '[H]': 'RESURSE SPECIALE' } def load_content(self) -> bool: """Load and validate INDEX_MASTER content""" try: if not self.index_file_path.exists(): print(f"❌ INDEX_MASTER file not found: {self.index_file_path}") return False with open(self.index_file_path, 'r', encoding='utf-8') as f: self.content = f.read() if len(self.content) < 1000: # Sanity check print(f"⚠️ INDEX_MASTER file seems too small: {len(self.content)} chars") return False print(f"✅ Loaded INDEX_MASTER: {len(self.content)} characters") return True except Exception as e: print(f"❌ Error loading INDEX_MASTER: {e}") return False def parse_all_categories(self) -> List[Activity]: """Parse all categories and extract individual activities""" if not self.load_content(): return [] print("🔍 Starting comprehensive parsing of INDEX_MASTER...") # Parse each main category for category_code, category_name in self.category_mapping.items(): print(f"\n📂 Processing category {category_code}: {category_name}") category_activities = self.parse_category_section(category_code, category_name) self.activities.extend(category_activities) print(f" ✅ Extracted {len(category_activities)} activities") print(f"\n🎯 Total activities extracted: {len(self.activities)}") return self.activities def parse_category_section(self, category_code: str, category_name: str) -> List[Activity]: """Parse a specific category section""" activities = [] # Find the category section - exact pattern match # Look for the actual section, not the table of contents pattern = rf"^## {re.escape(category_code)} {re.escape(category_name)}\s*$" matches = list(re.finditer(pattern, self.content, re.MULTILINE | re.IGNORECASE)) if not matches: print(f" ⚠️ Category section not found: {category_code}") return activities # Take the last match (should be the actual section, not TOC) match = matches[-1] print(f" 📍 Found section at position {match.start()}") # Extract content until next main category or end start_pos = match.end() # Find next main category (look for complete header) next_category_pattern = r"^## \[[A-H]\] [A-ZĂÂÎȘȚ]" next_match = re.search(next_category_pattern, self.content[start_pos:], re.MULTILINE) if next_match: end_pos = start_pos + next_match.start() section_content = self.content[start_pos:end_pos] else: section_content = self.content[start_pos:] # Parse subsections within the category activities.extend(self._parse_subsections(section_content, category_name)) return activities def _parse_subsections(self, section_content: str, category_name: str) -> List[Activity]: """Parse subsections within a category""" activities = [] # Find all subsections (### markers) subsection_pattern = r"^### (.+?)$" subsections = re.finditer(subsection_pattern, section_content, re.MULTILINE) subsection_list = list(subsections) for i, subsection in enumerate(subsection_list): subsection_title = subsection.group(1).strip() subsection_start = subsection.end() # Find end of subsection if i + 1 < len(subsection_list): subsection_end = subsection_list[i + 1].start() else: subsection_end = len(section_content) subsection_text = section_content[subsection_start:subsection_end] # Parse individual games in this subsection subsection_activities = self._parse_games_in_subsection( subsection_text, category_name, subsection_title ) activities.extend(subsection_activities) return activities def _parse_games_in_subsection(self, subsection_text: str, category_name: str, subsection_title: str) -> List[Activity]: """Parse individual games within a subsection""" activities = [] # Look for "Exemple de jocuri:" sections examples_pattern = r"\*\*Exemple de jocuri:\*\*\s*\n(.*?)(?=\n\*\*|$)" examples_matches = re.finditer(examples_pattern, subsection_text, re.DOTALL) for examples_match in examples_matches: examples_text = examples_match.group(1) # Extract individual games (numbered list) game_pattern = r"^(\d+)\.\s*\*\*(.+?)\*\*\s*-\s*(.+?)$" games = re.finditer(game_pattern, examples_text, re.MULTILINE) for game_match in games: game_number = game_match.group(1) game_name = game_match.group(2).strip() game_description = game_match.group(3).strip() # Extract metadata from subsection metadata = self._extract_subsection_metadata(subsection_text) # Create activity activity = Activity( name=game_name, description=game_description, category=category_name, subcategory=subsection_title, source_file=f"INDEX_MASTER_JOCURI_ACTIVITATI.md", page_reference=f"{category_name} > {subsection_title} > #{game_number}", **metadata ) activities.append(activity) # Also extract from direct activity descriptions without "Exemple de jocuri" activities.extend(self._parse_direct_activities(subsection_text, category_name, subsection_title)) return activities def _extract_subsection_metadata(self, subsection_text: str) -> Dict: """Extract metadata from subsection text""" metadata = {} # Extract participants info participants_pattern = r"\*\*Participanți:\*\*\s*(.+?)(?:\n|\*\*)" participants_match = re.search(participants_pattern, subsection_text) if participants_match: participants_text = participants_match.group(1).strip() participants = self._parse_participants(participants_text) metadata.update(participants) # Extract duration duration_pattern = r"\*\*Durata:\*\*\s*(.+?)(?:\n|\*\*)" duration_match = re.search(duration_pattern, subsection_text) if duration_match: duration_text = duration_match.group(1).strip() duration = self._parse_duration(duration_text) metadata.update(duration) # Extract materials materials_pattern = r"\*\*Materiale:\*\*\s*(.+?)(?:\n|\*\*)" materials_match = re.search(materials_pattern, subsection_text) if materials_match: materials_text = materials_match.group(1).strip() metadata['materials_list'] = materials_text metadata['materials_category'] = self._categorize_materials(materials_text) # Extract keywords keywords_pattern = r"\*\*Cuvinte cheie:\*\*\s*(.+?)(?:\n|\*\*)" keywords_match = re.search(keywords_pattern, subsection_text) if keywords_match: metadata['keywords'] = keywords_match.group(1).strip() return metadata def _parse_participants(self, participants_text: str) -> Dict: """Parse participants information""" result = {} # Look for number ranges like "8-30 copii" or "5-15 persoane" range_pattern = r"(\d+)-(\d+)" range_match = re.search(range_pattern, participants_text) if range_match: result['participants_min'] = int(range_match.group(1)) result['participants_max'] = int(range_match.group(2)) else: # Look for single numbers number_pattern = r"(\d+)\+" number_match = re.search(number_pattern, participants_text) if number_match: result['participants_min'] = int(number_match.group(1)) # Extract age information age_pattern = r"(\d+)-(\d+)\s*ani" age_match = re.search(age_pattern, participants_text) if age_match: result['age_group_min'] = int(age_match.group(1)) result['age_group_max'] = int(age_match.group(2)) return result def _parse_duration(self, duration_text: str) -> Dict: """Parse duration information""" result = {} # Look for time ranges like "5-20 minute" or "15-30min" range_pattern = r"(\d+)-(\d+)\s*(?:minute|min)" range_match = re.search(range_pattern, duration_text) if range_match: result['duration_min'] = int(range_match.group(1)) result['duration_max'] = int(range_match.group(2)) else: # Look for single duration single_pattern = r"(\d+)\+?\s*(?:minute|min)" single_match = re.search(single_pattern, duration_text) if single_match: result['duration_min'] = int(single_match.group(1)) return result def _categorize_materials(self, materials_text: str) -> str: """Categorize materials into simple categories""" materials_lower = materials_text.lower() if any(word in materials_lower for word in ['fără', 'nu necesare', 'nimic', 'minime']): return 'Fără materiale' elif any(word in materials_lower for word in ['hârtie', 'creion', 'marker', 'simple']): return 'Materiale simple' elif any(word in materials_lower for word in ['computer', 'proiector', 'echipament', 'complexe']): return 'Materiale complexe' else: return 'Materiale variate' def _parse_direct_activities(self, subsection_text: str, category_name: str, subsection_title: str) -> List[Activity]: """Parse activities that are described directly without 'Exemple de jocuri' section""" activities = [] # Look for activity descriptions in sections that don't have "Exemple de jocuri" if "**Exemple de jocuri:**" not in subsection_text: # Try to extract from file descriptions file_pattern = r"\*\*Fișier:\*\*\s*`([^`]+)`.*?\*\*(.+?)\*\*" file_matches = re.finditer(file_pattern, subsection_text, re.DOTALL) for file_match in file_matches: file_name = file_match.group(1) description_part = file_match.group(2) # Create a general activity for this file activity = Activity( name=f"Activități din {file_name}", description=f"Colecție de activități din fișierul {file_name}. {description_part[:200]}...", category=category_name, subcategory=subsection_title, source_file=file_name, page_reference=f"{category_name} > {subsection_title}", **self._extract_subsection_metadata(subsection_text) ) activities.append(activity) return activities def validate_activity_completeness(self, activity: Activity) -> bool: """Validate that an activity has all necessary fields""" required_fields = ['name', 'description', 'category', 'source_file'] for field in required_fields: if not getattr(activity, field) or not getattr(activity, field).strip(): return False # Check minimum description length if len(activity.description) < 10: return False return True def get_parsing_statistics(self) -> Dict: """Get statistics about the parsing process""" if not self.activities: return {'total_activities': 0} category_counts = {} valid_activities = 0 for activity in self.activities: # Count by category if activity.category in category_counts: category_counts[activity.category] += 1 else: category_counts[activity.category] = 1 # Count valid activities if self.validate_activity_completeness(activity): valid_activities += 1 return { 'total_activities': len(self.activities), 'valid_activities': valid_activities, 'completion_rate': (valid_activities / len(self.activities)) * 100 if self.activities else 0, 'category_breakdown': category_counts, 'average_description_length': sum(len(a.description) for a in self.activities) / len(self.activities) if self.activities else 0 }