#!/usr/bin/env python3
"""
Data indexing script for INDEX-SISTEM-JOCURI v2.0
Extracts activities from INDEX_MASTER and populates database
"""

import argparse
import os
import sys
import time
from pathlib import Path

# Add app directory to Python path (must happen before the `app.*` imports)
sys.path.insert(0, str(Path(__file__).parent.parent))

from app.models.database import DatabaseManager
from app.services.indexer import ActivityIndexer
from app.config import Config

# URL-style prefix that may precede the database file path in DATABASE_URL.
_SQLITE_URL_PREFIX = 'sqlite:///'


def main() -> int:
    """Parse command-line options and run the requested operation.

    Operations are mutually exclusive and checked in this order:
    ``--stats``, ``--category``, ``--verify``; with none of them a full
    indexing run is performed (optionally clearing the database first
    when ``--clear`` is given).

    Returns:
        Process exit code: 0 on success, 1 on failure.
    """
    parser = argparse.ArgumentParser(description='Index activities from INDEX_MASTER')
    parser.add_argument('--clear', action='store_true',
                        help='Clear existing database before indexing')
    parser.add_argument('--category',
                        help='Index specific category only (e.g., [A], [B], etc.)')
    parser.add_argument('--verify', action='store_true',
                        help='Verify indexing quality after completion')
    parser.add_argument('--stats', action='store_true',
                        help='Show database statistics only')
    args = parser.parse_args()

    # Setup paths
    Config.ensure_directories()

    # Database path: environment override first, otherwise the configured default.
    db_path = os.environ.get('DATABASE_URL', str(Config.DATA_DIR / 'activities.db'))
    if db_path.startswith(_SQLITE_URL_PREFIX):
        db_path = db_path[len(_SQLITE_URL_PREFIX):]  # Remove sqlite:/// prefix

    # INDEX_MASTER path
    index_master_path = os.environ.get('INDEX_MASTER_FILE', str(Config.INDEX_MASTER_FILE))

    print("šŸŽÆ INDEX-SISTEM-JOCURI v2.0 - Data Indexing")
    print("=" * 50)
    print(f"Database: {db_path}")
    print(f"INDEX_MASTER: {index_master_path}")
    print("=" * 50)

    # Statistics only read the database, so they must not depend on the
    # INDEX_MASTER file being present or on the indexer being constructible.
    if args.stats:
        try:
            db_manager = DatabaseManager(db_path)
        except Exception as e:
            print(f"āŒ Error initializing services: {e}")
            return 1
        return show_statistics(db_manager)

    # Verify INDEX_MASTER file exists
    if not Path(index_master_path).exists():
        print(f"āŒ INDEX_MASTER file not found: {index_master_path}")
        print(" Please ensure the file is mounted in the container or available locally")
        return 1

    # Initialize services
    try:
        db_manager = DatabaseManager(db_path)
        indexer = ActivityIndexer(db_manager, index_master_path)
    except Exception as e:
        print(f"āŒ Error initializing services: {e}")
        return 1

    # Handle different operations
    if args.category:
        return index_category(indexer, args.category)

    # NOTE(review): --verify runs verification *instead of* indexing, although
    # its help text reads "after completion" - confirm which is intended.
    if args.verify:
        return verify_indexing(indexer)

    # Default: full indexing
    return full_indexing(indexer, args.clear)


def full_indexing(indexer: ActivityIndexer, clear_existing: bool) -> int:
    """Perform full indexing of all activities and print a summary.

    Args:
        indexer: Indexer whose ``index_all_activities`` is invoked.
        clear_existing: Forwarded to the indexer as ``clear_existing``.

    Returns:
        0 when the indexer reports success, 1 when it reports failure or
        raises.
    """
    print("šŸš€ Starting full indexing process...")

    try:
        # Perform indexing
        result = indexer.index_all_activities(clear_existing=clear_existing)

        if not result.get('success'):
            print(f"āŒ Indexing failed: {result.get('error', 'Unknown error')}")
            return 1

        inserted_count = result.get('inserted_count', 0)

        # Print results
        print("\nšŸ“Š INDEXING RESULTS")
        print("=" * 30)
        print(f"āœ… Activities inserted: {inserted_count}")
        print(f"ā±ļø Indexing time: {result.get('indexing_time_seconds', 0):.2f}s")

        parsing_stats = result.get('parsing_stats', {})
        print(f"šŸ“ˆ Completion rate: {parsing_stats.get('completion_rate', 0):.1f}%")
        print(f"šŸ“ Avg description length: {parsing_stats.get('average_description_length', 0):.0f} chars")

        # Category breakdown
        categories = result.get('distribution', {}).get('categories', {})
        print("\nšŸ“‚ CATEGORY BREAKDOWN:")
        for category, count in categories.items():
            print(f" {category}: {count} activities")

        # Quality check: falling short of the target only warns, it does not
        # change the exit code.
        if inserted_count >= 500:
            print("\nšŸŽÆ SUCCESS: Target of 500+ activities achieved!")
        else:
            print(f"\nāš ļø Warning: Only {inserted_count} activities indexed (target: 500+)")

        return 0

    except Exception as e:
        print(f"āŒ Error during indexing: {e}")
        return 1


def index_category(indexer: ActivityIndexer, category_code: str) -> int:
    """Index a specific category and print the outcome.

    Args:
        indexer: Indexer whose ``index_specific_category`` is invoked.
        category_code: Category identifier passed through unchanged.

    Returns:
        0 when the indexer reports success, 1 when it reports failure or
        raises.
    """
    print(f"šŸŽÆ Indexing category: {category_code}")

    try:
        result = indexer.index_specific_category(category_code)

        if not result.get('success'):
            print(f"āŒ Category indexing failed: {result.get('error', 'Unknown error')}")
            return 1

        print(f"āœ… Category '{result.get('category')}' indexed successfully")
        print(f" Inserted: {result.get('inserted_count')} activities")
        print(f" Parsed: {result.get('total_parsed')} total")
        print(f" Valid: {result.get('valid_activities')} valid")

        return 0

    except Exception as e:
        print(f"āŒ Error during category indexing: {e}")
        return 1


def verify_indexing(indexer: ActivityIndexer) -> int:
    """Verify indexing quality and print a report.

    Args:
        indexer: Indexer whose ``verify_indexing_quality`` is invoked.

    Returns:
        0 when the reported ``quality_score`` is at least 80; 1 otherwise,
        or when the result contains an ``error`` key, or on exception.
    """
    print("šŸ” Verifying indexing quality...")

    try:
        result = indexer.verify_indexing_quality()

        if 'error' in result:
            print(f"āŒ Verification error: {result['error']}")
            return 1

        print("\nšŸ“Š QUALITY VERIFICATION")
        print("=" * 30)
        print(f"Total activities: {result.get('total_activities', 0)}")
        print(f"Meets minimum (500+): {'āœ…' if result.get('meets_minimum_requirement') else 'āŒ'}")
        print(f"Category coverage: {result.get('category_coverage', 0)}/{result.get('expected_categories', 8)}")
        print(f"Quality score: {result.get('quality_score', 0)}/100")

        quality_issues = result.get('quality_issues', [])
        if quality_issues:
            print("\nāš ļø Quality Issues:")
            for issue in quality_issues[:5]:  # Show first 5 issues
                print(f" • {issue}")
            if len(quality_issues) > 5:
                print(f" ... and {len(quality_issues) - 5} more issues")
        else:
            print("\nāœ… No quality issues detected")

        return 0 if result.get('quality_score', 0) >= 80 else 1

    except Exception as e:
        print(f"āŒ Error during verification: {e}")
        return 1


def show_statistics(db_manager: DatabaseManager) -> int:
    """Show database statistics.

    Args:
        db_manager: Database manager whose ``get_statistics`` is invoked.

    Returns:
        0 when the statistics were printed, 1 if retrieving or printing
        them raised.
    """
    print("šŸ“Š Database Statistics")
    print("=" * 25)

    try:
        stats = db_manager.get_statistics()

        print(f"Total activities: {stats.get('total_activities', 0)}")
        print(f"Database size: {stats.get('database_size_bytes', 0) / 1024:.1f} KB")
        print(f"Database path: {stats.get('database_path', 'Unknown')}")

        categories = stats.get('categories', {})
        if categories:
            print("\nCategories:")
            for category, count in categories.items():
                print(f" {category}: {count}")

        return 0

    except Exception as e:
        print(f"āŒ Error getting statistics: {e}")
        return 1


if __name__ == '__main__':
    exit_code = main()
    sys.exit(exit_code)