Major Changes: - Migrated from prototype to production architecture - Implemented modular Flask app with models/services/web layers - Added Docker containerization with docker-compose - Switched to Pipenv for dependency management - Built advanced parser extracting 63 real activities from INDEX_MASTER - Implemented SQLite FTS5 full-text search - Created minimalist, responsive web interface - Added comprehensive documentation and deployment guides Technical Improvements: - Clean separation of concerns (models, services, web) - Enhanced database schema with FTS5 indexing - Dynamic filters populated from real data - Production-ready configuration management - Security best practices implementation - Health monitoring and API endpoints Removed Legacy Files: - Old src/ directory structure - Static requirements.txt (replaced by Pipfile) - Test and debug files - Temporary cache files Current Status: - 63 activities indexed across 8 categories - Full-text search operational - Docker deployment ready - Production documentation complete 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude <noreply@anthropic.com>
200 lines
7.0 KiB
Python
200 lines
7.0 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Data indexing script for INDEX-SISTEM-JOCURI v2.0
|
|
Extracts activities from INDEX_MASTER and populates database
|
|
"""
|
|
|
|
import sys
|
|
import os
|
|
from pathlib import Path
|
|
|
|
# Add app directory to Python path
|
|
sys.path.insert(0, str(Path(__file__).parent.parent))
|
|
|
|
from app.models.database import DatabaseManager
|
|
from app.services.indexer import ActivityIndexer
|
|
from app.config import Config
|
|
import argparse
|
|
import time
|
|
|
|
def main():
    """Parse command-line options and dispatch to the requested operation.

    Options:
        --clear:    forwarded to full indexing as ``clear_existing``.
        --category: index only the given category code.
        --verify:   run the quality verification and return its status
                    (no indexing is performed on this path).
        --stats:    print database statistics only.

    When several options are given, the first match wins in this order:
    ``--stats``, ``--category``, ``--verify``, then full indexing.

    Environment:
        DATABASE_URL:      database location; a leading ``sqlite:///`` is
                           stripped. Defaults to ``Config.DATA_DIR / 'activities.db'``.
        INDEX_MASTER_FILE: path of the INDEX_MASTER file. Defaults to
                           ``Config.INDEX_MASTER_FILE``.

    Returns:
        The integer status of the selected operation, or 1 when the
        INDEX_MASTER file is missing (for operations that need it) or the
        services cannot be initialized.
    """
    parser = argparse.ArgumentParser(description='Index activities from INDEX_MASTER')
    parser.add_argument('--clear', action='store_true', help='Clear existing database before indexing')
    parser.add_argument('--category', help='Index specific category only (e.g., [A], [B], etc.)')
    parser.add_argument('--verify', action='store_true', help='Verify indexing quality after completion')
    parser.add_argument('--stats', action='store_true', help='Show database statistics only')

    args = parser.parse_args()

    # Setup paths
    Config.ensure_directories()

    # Database path: accept either a plain filesystem path or a sqlite:/// URL
    sqlite_prefix = 'sqlite:///'
    db_path = os.environ.get('DATABASE_URL', str(Config.DATA_DIR / 'activities.db'))
    if db_path.startswith(sqlite_prefix):
        db_path = db_path[len(sqlite_prefix):]

    # INDEX_MASTER path
    index_master_path = os.environ.get('INDEX_MASTER_FILE', str(Config.INDEX_MASTER_FILE))

    print("🎯 INDEX-SISTEM-JOCURI v2.0 - Data Indexing")
    print("=" * 50)
    print(f"Database: {db_path}")
    print(f"INDEX_MASTER: {index_master_path}")
    print("=" * 50)

    # Statistics only read the database, so a missing INDEX_MASTER file must
    # not block them; every other operation needs the file.
    stats_only = args.stats

    if not stats_only and not Path(index_master_path).exists():
        print(f"❌ INDEX_MASTER file not found: {index_master_path}")
        print(" Please ensure the file is mounted in the container or available locally")
        return 1

    # Initialize services (the indexer is skipped when only stats are requested)
    try:
        db_manager = DatabaseManager(db_path)
        indexer = None if stats_only else ActivityIndexer(db_manager, index_master_path)
    except Exception as e:
        print(f"❌ Error initializing services: {e}")
        return 1

    # Handle different operations
    if stats_only:
        return show_statistics(db_manager)

    if args.category:
        return index_category(indexer, args.category)

    if args.verify:
        return verify_indexing(indexer)

    # Default: full indexing
    return full_indexing(indexer, args.clear)
|
|
|
|
def full_indexing(indexer: ActivityIndexer, clear_existing: bool) -> int:
    """Run a complete indexing pass and print a summary report.

    Args:
        indexer: Object whose ``index_all_activities`` method does the work
            and returns a result mapping.
        clear_existing: Forwarded to ``index_all_activities`` as
            ``clear_existing``.

    Returns:
        0 when the result reports success; 1 when it reports failure or any
        exception is raised while indexing or reporting.
    """
    print("🚀 Starting full indexing process...")

    try:
        outcome = indexer.index_all_activities(clear_existing=clear_existing)

        # Bail out early when the indexer itself reports a failure.
        if not outcome.get('success'):
            reason = outcome.get('error', 'Unknown error')
            print(f"❌ Indexing failed: {reason}")
            return 1

        inserted = outcome.get('inserted_count', 0)

        # Headline numbers
        print("\n📊 INDEXING RESULTS")
        print("=" * 30)
        print(f"✅ Activities inserted: {inserted}")
        print(f"⏱️ Indexing time: {outcome.get('indexing_time_seconds', 0):.2f}s")

        # Parser-level statistics
        stats = outcome.get('parsing_stats', {})
        print(f"📈 Completion rate: {stats.get('completion_rate', 0):.1f}%")
        print(f"📝 Avg description length: {stats.get('average_description_length', 0):.0f} chars")

        # Per-category counts
        per_category = outcome.get('distribution', {}).get('categories', {})
        print("\n📂 CATEGORY BREAKDOWN:")
        for name, total in per_category.items():
            print(f" {name}: {total} activities")

        # Compare the inserted count against the 500-activity target.
        target_met = inserted >= 500
        if target_met:
            print("\n🎯 SUCCESS: Target of 500+ activities achieved!")
        else:
            print(f"\n⚠️ Warning: Only {inserted} activities indexed (target: 500+)")

        return 0

    except Exception as exc:
        print(f"❌ Error during indexing: {exc}")
        return 1
|
|
|
|
def index_category(indexer: ActivityIndexer, category_code: str) -> int:
    """Index a single category and print the outcome.

    Args:
        indexer: Object whose ``index_specific_category`` method does the
            work and returns a result mapping.
        category_code: Category identifier passed through to the indexer.

    Returns:
        0 when the result reports success; 1 when it reports failure or any
        exception is raised.
    """
    print(f"🎯 Indexing category: {category_code}")

    try:
        outcome = indexer.index_specific_category(category_code)

        if outcome.get('success'):
            print(f"✅ Category '{outcome.get('category')}' indexed successfully")
            print(f" Inserted: {outcome.get('inserted_count')} activities")
            print(f" Parsed: {outcome.get('total_parsed')} total")
            print(f" Valid: {outcome.get('valid_activities')} valid")
            return 0

        # The indexer reported a failure; surface its message.
        reason = outcome.get('error', 'Unknown error')
        print(f"❌ Category indexing failed: {reason}")
        return 1

    except Exception as exc:
        print(f"❌ Error during category indexing: {exc}")
        return 1
|
|
|
|
def verify_indexing(indexer: ActivityIndexer) -> int:
    """Run the indexer's quality verification and print a report.

    Args:
        indexer: Object whose ``verify_indexing_quality`` method returns a
            result mapping.

    Returns:
        0 when the reported ``quality_score`` is at least 80; 1 otherwise,
        and also when the result carries an ``error`` key or any exception
        is raised.
    """
    print("🔍 Verifying indexing quality...")

    try:
        report = indexer.verify_indexing_quality()

        if 'error' in report:
            print(f"❌ Verification error: {report['error']}")
            return 1

        print("\n📊 QUALITY VERIFICATION")
        print("=" * 30)
        print(f"Total activities: {report.get('total_activities', 0)}")

        minimum_mark = '✅' if report.get('meets_minimum_requirement') else '❌'
        print(f"Meets minimum (500+): {minimum_mark}")

        covered = report.get('category_coverage', 0)
        expected = report.get('expected_categories', 8)
        print(f"Category coverage: {covered}/{expected}")

        score = report.get('quality_score', 0)
        print(f"Quality score: {score}/100")

        issues = report.get('quality_issues', [])
        if not issues:
            print("\n✅ No quality issues detected")
        else:
            print("\n⚠️ Quality Issues:")
            # Only the first five issues are listed; the rest are summarized.
            for entry in issues[:5]:
                print(f" • {entry}")
            if len(issues) > 5:
                print(f" ... and {len(issues) - 5} more issues")

        return 0 if score >= 80 else 1

    except Exception as exc:
        print(f"❌ Error during verification: {exc}")
        return 1
|
|
|
|
def show_statistics(db_manager: DatabaseManager) -> int:
    """Print the statistics reported by the database manager.

    Args:
        db_manager: Object whose ``get_statistics`` method returns a
            mapping of statistics.

    Returns:
        0 after the statistics are printed; 1 when any exception is raised.
    """
    print("📊 Database Statistics")
    print("=" * 25)

    try:
        snapshot = db_manager.get_statistics()

        print(f"Total activities: {snapshot.get('total_activities', 0)}")

        # The size is reported in bytes; show it in kilobytes.
        size_kb = snapshot.get('database_size_bytes', 0) / 1024
        print(f"Database size: {size_kb:.1f} KB")
        print(f"Database path: {snapshot.get('database_path', 'Unknown')}")

        per_category = snapshot.get('categories', {})
        if per_category:
            print("\nCategories:")
            for name, total in per_category.items():
                print(f" {name}: {total}")

        return 0

    except Exception as exc:
        print(f"❌ Error getting statistics: {exc}")
        return 1
|
|
|
|
if __name__ == '__main__':
    # Hand main()'s integer status to the shell as the process exit code.
    sys.exit(main())