""" Activity indexer service for INDEX-SISTEM-JOCURI v2.0 Coordinates parsing and database indexing """ from typing import List, Dict, Any from pathlib import Path from app.models.database import DatabaseManager from app.models.activity import Activity from app.services.parser import IndexMasterParser import time class ActivityIndexer: """Service for indexing activities from INDEX_MASTER into database""" def __init__(self, db_manager: DatabaseManager, index_master_path: str): """Initialize indexer with database manager and INDEX_MASTER path""" self.db = db_manager self.parser = IndexMasterParser(index_master_path) self.indexing_stats = {} def index_all_activities(self, clear_existing: bool = False) -> Dict[str, Any]: """Index all activities from INDEX_MASTER into database""" print("🚀 Starting activity indexing process...") start_time = time.time() # Clear existing data if requested if clear_existing: print("🗑️ Clearing existing database...") self.db.clear_database() # Parse activities from INDEX_MASTER print("📖 Parsing INDEX_MASTER file...") activities = self.parser.parse_all_categories() if not activities: print("❌ No activities were parsed!") return {'success': False, 'error': 'No activities parsed'} # Filter valid activities valid_activities = [] for activity in activities: if self.parser.validate_activity_completeness(activity): valid_activities.append(activity) else: print(f"⚠️ Skipping incomplete activity: {activity.name[:50]}...") print(f"✅ Validated {len(valid_activities)} activities out of {len(activities)} parsed") if len(valid_activities) < 100: print(f"⚠️ Warning: Only {len(valid_activities)} valid activities found. Expected 500+") # Bulk insert into database print("💾 Inserting activities into database...") try: inserted_count = self.db.bulk_insert_activities(valid_activities) # Rebuild FTS index for optimal search performance print("🔍 Rebuilding search index...") self.db.rebuild_fts_index() end_time = time.time() indexing_time = end_time - start_time # Generate final statistics (with error handling) try: stats = self._generate_indexing_stats(valid_activities, indexing_time) stats['inserted_count'] = inserted_count stats['success'] = True except Exception as e: print(f"⚠️ Error generating statistics: {e}") stats = { 'success': True, 'inserted_count': inserted_count, 'indexing_time_seconds': indexing_time, 'error': f'Stats generation failed: {str(e)}' } print(f"✅ Indexing complete! 
{inserted_count} activities indexed in {indexing_time:.2f}s") # Verify database state (with error handling) try: db_stats = self.db.get_statistics() print(f"📊 Database now contains {db_stats['total_activities']} activities") except Exception as e: print(f"⚠️ Error getting database statistics: {e}") print(f"📊 Database insertion completed, statistics unavailable") return stats except Exception as e: print(f"❌ Error during database insertion: {e}") return {'success': False, 'error': str(e)} def index_specific_category(self, category_code: str) -> Dict[str, Any]: """Index activities from a specific category only""" print(f"🎯 Indexing specific category: {category_code}") # Load content and parse specific category if not self.parser.load_content(): return {'success': False, 'error': 'Could not load INDEX_MASTER'} category_name = self.parser.category_mapping.get(category_code) if not category_name: return {'success': False, 'error': f'Unknown category code: {category_code}'} activities = self.parser.parse_category_section(category_code, category_name) if not activities: return {'success': False, 'error': f'No activities found in category {category_code}'} # Filter valid activities valid_activities = [a for a in activities if self.parser.validate_activity_completeness(a)] try: inserted_count = self.db.bulk_insert_activities(valid_activities) return { 'success': True, 'category': category_name, 'inserted_count': inserted_count, 'total_parsed': len(activities), 'valid_activities': len(valid_activities) } except Exception as e: return {'success': False, 'error': str(e)} def _generate_indexing_stats(self, activities: List[Activity], indexing_time: float) -> Dict[str, Any]: """Generate comprehensive indexing statistics""" # Get parser statistics parser_stats = self.parser.get_parsing_statistics() # Calculate additional metrics categories = {} age_ranges = {} durations = {} materials = {} for activity in activities: # Category breakdown if activity.category in categories: categories[activity.category] += 1 else: categories[activity.category] = 1 # Age range analysis (with safety check) try: age_key = activity.get_age_range_display() or "nespecificat" age_ranges[age_key] = age_ranges.get(age_key, 0) + 1 except Exception as e: print(f"Warning: Error getting age range for activity {activity.name}: {e}") age_ranges["nespecificat"] = age_ranges.get("nespecificat", 0) + 1 # Duration analysis (with safety check) try: duration_key = activity.get_duration_display() or "nespecificat" durations[duration_key] = durations.get(duration_key, 0) + 1 except Exception as e: print(f"Warning: Error getting duration for activity {activity.name}: {e}") durations["nespecificat"] = durations.get("nespecificat", 0) + 1 # Materials analysis (with safety check) try: materials_key = activity.get_materials_display() or "nespecificat" materials[materials_key] = materials.get(materials_key, 0) + 1 except Exception as e: print(f"Warning: Error getting materials for activity {activity.name}: {e}") materials["nespecificat"] = materials.get("nespecificat", 0) + 1 return { 'indexing_time_seconds': indexing_time, 'parsing_stats': parser_stats, 'distribution': { 'categories': categories, 'age_ranges': age_ranges, 'durations': durations, 'materials': materials }, 'quality_metrics': { 'completion_rate': parser_stats.get('completion_rate', 0), 'average_description_length': parser_stats.get('average_description_length', 0), 'activities_with_metadata': sum(1 for a in activities if a.age_group_min or a.participants_min or a.duration_min) } } def 
verify_indexing_quality(self) -> Dict[str, Any]: """Verify the quality of indexed data""" try: # Get database statistics db_stats = self.db.get_statistics() # Check for minimum activity count total_activities = db_stats['total_activities'] meets_minimum = total_activities >= 500 # Check category distribution categories = db_stats.get('categories', {}) category_coverage = len(categories) # Sample some activities to check quality sample_activities = self.db.search_activities(limit=10) quality_issues = [] for activity in sample_activities: if not activity.get('description') or len(activity['description']) < 10: quality_issues.append(f"Activity {activity.get('name', 'Unknown')} has insufficient description") if not activity.get('category'): quality_issues.append(f"Activity {activity.get('name', 'Unknown')} missing category") return { 'total_activities': total_activities, 'meets_minimum_requirement': meets_minimum, 'minimum_target': 500, 'category_coverage': category_coverage, 'expected_categories': len(self.parser.category_mapping), 'quality_issues': quality_issues, 'quality_score': max(0, 100 - len(quality_issues) * 10), 'database_stats': db_stats } except Exception as e: return {'error': str(e), 'quality_score': 0} def get_indexing_progress(self) -> Dict[str, Any]: """Get current indexing progress and status""" try: db_stats = self.db.get_statistics() # Calculate progress towards 500+ activities goal total_activities = db_stats['total_activities'] target_activities = 500 progress_percentage = min(100, (total_activities / target_activities) * 100) return { 'current_activities': total_activities, 'target_activities': target_activities, 'progress_percentage': progress_percentage, 'status': 'completed' if total_activities >= target_activities else 'in_progress', 'categories_indexed': list(db_stats.get('categories', {}).keys()), 'database_size_mb': db_stats.get('database_size_bytes', 0) / (1024 * 1024) } except Exception as e: return {'error': str(e), 'status': 'error'}
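

# --- Usage sketch (illustrative only) ---
# A minimal example of wiring the indexer together for a one-off run.
# The DatabaseManager constructor arguments and the INDEX_MASTER path used
# here are assumptions, not part of this module; adapt them to the actual
# application configuration.
if __name__ == "__main__":
    db = DatabaseManager("activities.db")              # assumed constructor signature
    indexer = ActivityIndexer(db, "INDEX_MASTER.md")   # assumed INDEX_MASTER location

    result = indexer.index_all_activities(clear_existing=True)
    if result.get('success'):
        progress = indexer.get_indexing_progress()
        print(f"Progress: {progress.get('progress_percentage', 0):.1f}% "
              f"({progress.get('current_activities', 0)} activities)")
        quality = indexer.verify_indexing_quality()
        print(f"Quality score: {quality.get('quality_score', 0)}")
    else:
        print(f"Indexing failed: {result.get('error')}")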