Refactor extraction system and reorganize project structure

- Remove obsolete documentation files (DEPLOYMENT.md, PLAN_IMPLEMENTARE_S8_DETALIAT.md, README.md)
- Add comprehensive extraction pipeline with multiple format support (PDF, HTML, text)
- Implement Claude-based activity extraction with structured templates
- Update dependencies and Docker configuration
- Reorganize scripts directory with modular extraction components
- Move example documentation to appropriate location

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-09-11 23:32:37 +03:00
parent 1b6b7e06ad
commit a19ddf0b71
119 changed files with 91074 additions and 1859 deletions

scripts/__init__.py (new file, 0 lines added)

@@ -0,0 +1,54 @@
# ACTIVITY EXTRACTION TEMPLATE FOR CLAUDE
## Instructions for Claude Code:
For each PDF/DOC, use the following extraction format:
### 1. Read the file:
```
Claude, please read the file: [CALE_FISIER]
```
### 2. Extract the activities using this JSON template:
```json
{
"source_file": "[NUME_FISIER]",
"activities": [
{
"name": "Activity name",
"description": "Full description of the activity",
"rules": "Rules of the game/activity",
"variations": "Variations or adaptations",
"category": "[A-H] based on type",
"age_group_min": 6,
"age_group_max": 14,
"participants_min": 4,
"participants_max": 20,
"duration_min": 10,
"duration_max": 30,
"materials_list": "List of required materials",
"skills_developed": "Skills developed",
"difficulty_level": "Ușor/Mediu/Dificil",
"keywords": "comma-separated keywords",
"tags": "relevant tags"
}
]
}
```
### 3. Save to a file:
After extraction, save the JSON to: `/scripts/extracted_activities/[NUME_FISIER].json` (a minimal validation sketch follows after this template).
### 4. Processing priorities:
**TOP PRIORITY (process these first):**
1. 1000 Fantastic Scout Games.pdf
2. Cartea Mare a jocurilor.pdf
3. 160-de-activitati-dinamice-jocuri-pentru-team-building.pdf
4. 101 Ways to Create an Unforgettable Camp Experience.pdf
5. 151 Awesome Summer Camp Nature Activities.pdf
**Focus categories:**
- [A] Scout Games
- [C] Camping & Outdoor Activities
- [G] Educational Activities
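Before importing, it can be worth sanity-checking each extracted JSON file against the field list above. Below is a minimal validation sketch; the `REQUIRED_KEYS` subset and the `validate_extraction` helper are illustrative choices, not part of the committed scripts:

```python
import json
from pathlib import Path

# Minimal subset of the template fields treated as mandatory here (an assumption;
# the template above lists many more, mostly optional, fields).
REQUIRED_KEYS = {"name", "description", "category"}

def validate_extraction(json_path: Path) -> list:
    """Return a list of problems found in one Claude-extracted JSON file."""
    problems = []
    data = json.loads(json_path.read_text(encoding="utf-8"))
    if "source_file" not in data:
        problems.append("missing top-level 'source_file'")
    for i, activity in enumerate(data.get("activities", [])):
        missing = REQUIRED_KEYS - set(activity)
        if missing:
            problems.append(f"activity {i}: missing {sorted(missing)}")
    return problems

if __name__ == "__main__":
    for path in Path("scripts/extracted_activities").glob("*.json"):
        issues = validate_extraction(path)
        print(f"{path.name}: {'OK' if not issues else issues}")
```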

scripts/html_extractor.py (new file, 424 lines added)

@@ -0,0 +1,424 @@
#!/usr/bin/env python3
"""
HTML Activity Extractor - processes 1876 HTML files
Automatically extracts activities using pattern recognition
"""
import os
import re
import json
from pathlib import Path
from bs4 import BeautifulSoup
import chardet
from typing import List, Dict, Optional
import sqlite3
from datetime import datetime
class HTMLActivityExtractor:
def __init__(self, db_path='data/activities.db'):
self.db_path = db_path
# Patterns for detecting activities in Romanian text
self.activity_patterns = {
'title_patterns': [
r'(?i)(joc|activitate|exerci[țt]iu|team[\s-]?building|energizer|ice[\s-]?breaker)[\s:]+([^\.]{5,100})',
r'(?i)<h[1-6][^>]*>([^<]*(?:joc|activitate|exerci[țt]iu)[^<]*)</h[1-6]>',
r'(?i)<strong>([^<]*(?:joc|activitate|exerci[țt]iu)[^<]*)</strong>',
r'(?i)^[\d]+\.?\s*([A-Z][^\.]{10,100}(?:joc|activitate|exerci[țt]iu)[^\.]{0,50})$',
],
'description_markers': [
'descriere', 'reguli', 'cum se joaca', 'instructiuni',
'obiectiv', 'desfasurare', 'explicatie', 'mod de joc'
],
'materials_markers': [
'materiale', 'necesare', 'echipament', 'ce avem nevoie',
'se folosesc', 'trebuie sa avem', 'dotari'
],
'age_patterns': [
r'(?i)v[âa]rst[ăa][\s:]+(\d+)[\s-]+(\d+)',
r'(?i)(\d+)[\s-]+(\d+)\s*ani',
r'(?i)pentru\s+(\d+)[\s-]+(\d+)\s*ani',
r'(?i)categoria?\s*(?:de\s*)?v[âa]rst[ăa][\s:]+(\d+)[\s-]+(\d+)',
],
'participants_patterns': [
r'(?i)(\d+)[\s-]+(\d+)\s*(?:participan[țt]i|juc[ăa]tori|persoane|copii)',
r'(?i)num[ăa]r\s*(?:de\s*)?(?:participan[țt]i|juc[ăa]tori)[\s:]+(\d+)[\s-]+(\d+)',
r'(?i)grup\s*de\s*(\d+)[\s-]+(\d+)',
],
'duration_patterns': [
r'(?i)durat[ăa][\s:]+(\d+)[\s-]+(\d+)\s*(?:minute|min)',
r'(?i)timp[\s:]+(\d+)[\s-]+(\d+)\s*(?:minute|min)',
r'(?i)(\d+)[\s-]+(\d+)\s*minute',
]
}
# Predefined categories based on the existing system
self.categories = {
'[A]': ['joc', 'joaca', 'distractie', 'amuzament'],
'[B]': ['aventura', 'explorare', 'descoperire'],
'[C]': ['camping', 'tabara', 'excursie', 'drumetie'],
'[D]': ['foc', 'flacara', 'lumina'],
'[E]': ['noduri', 'frânghii', 'sfori', 'legare'],
'[F]': ['bushcraft', 'supravietuire', 'survival'],
'[G]': ['educatie', 'educativ', 'invatare', 'scoala'],
'[H]': ['orientare', 'busola', 'harta', 'navigare']
}
def detect_encoding(self, file_path):
"""Detecteaz encoding-ul fiierului"""
with open(file_path, 'rb') as f:
result = chardet.detect(f.read())
return result['encoding'] or 'utf-8'
def extract_from_html(self, html_path: str) -> List[Dict]:
"""Extrage activiti dintr-un singur fiier HTML"""
activities = []
try:
# Detect encoding and read the file
encoding = self.detect_encoding(html_path)
with open(html_path, 'r', encoding=encoding, errors='ignore') as f:
content = f.read()
soup = BeautifulSoup(content, 'lxml')
# Method 1: look for activity lists
activities.extend(self._extract_from_lists(soup, html_path))
# Method 2: look for activities in headings
activities.extend(self._extract_from_headings(soup, html_path))
# Method 3: look for text patterns
activities.extend(self._extract_from_patterns(soup, html_path))
# Method 4: look in tables
activities.extend(self._extract_from_tables(soup, html_path))
except Exception as e:
print(f"Error processing {html_path}: {e}")
return activities
def _extract_from_lists(self, soup, source_file):
"""Extrage activiti din liste HTML (ul, ol)"""
activities = []
for list_elem in soup.find_all(['ul', 'ol']):
# Check whether the list appears to contain activities
list_text = list_elem.get_text().lower()
if any(marker in list_text for marker in ['joc', 'activitate', 'exercitiu']):
for li in list_elem.find_all('li'):
text = li.get_text(strip=True)
if len(text) > 20: # at least 20 characters for a valid activity
activity = self._create_activity_from_text(text, source_file)
if activity:
activities.append(activity)
return activities
def _extract_from_headings(self, soup, source_file):
"""Extrage activiti bazate pe headings"""
activities = []
for heading in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6']):
heading_text = heading.get_text(strip=True)
# Check whether the heading contains keywords
if any(keyword in heading_text.lower() for keyword in ['joc', 'activitate', 'exercitiu']):
# Look for the description in the following elements
description = ""
next_elem = heading.find_next_sibling()
while next_elem and next_elem.name not in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
if next_elem.name in ['p', 'div', 'ul']:
description += next_elem.get_text(strip=True) + " "
if len(description) > 500: # cap description length
break
next_elem = next_elem.find_next_sibling()
if description:
activity = {
'name': heading_text[:200],
'description': description[:1000],
'source_file': str(source_file),
'category': self._detect_category(heading_text + " " + description)
}
activities.append(activity)
return activities
def _extract_from_patterns(self, soup, source_file):
"""Extrage activiti folosind pattern matching"""
activities = []
text = soup.get_text()
# Look for activity patterns
for pattern in self.activity_patterns['title_patterns']:
matches = re.finditer(pattern, text, re.MULTILINE)
for match in matches:
title = match.group(0) if match.lastindex == 0 else match.group(match.lastindex)
if len(title) > 10:
# Extract context around the match
start = max(0, match.start() - 200)
end = min(len(text), match.end() + 500)
context = text[start:end]
activity = self._create_activity_from_text(context, source_file, title)
if activity:
activities.append(activity)
return activities
def _extract_from_tables(self, soup, source_file):
"""Extrage activiti din tabele"""
activities = []
for table in soup.find_all('table'):
rows = table.find_all('tr')
if len(rows) > 1: # at least a header row and one data row
# Detect the relevant columns
headers = [th.get_text(strip=True).lower() for th in rows[0].find_all(['th', 'td'])]
for row in rows[1:]:
cells = row.find_all(['td'])
if cells:
activity_data = {}
for i, cell in enumerate(cells):
if i < len(headers):
activity_data[headers[i]] = cell.get_text(strip=True)
# Create an activity from the table data
if any(key in activity_data for key in ['joc', 'activitate', 'nume', 'titlu']):
activity = self._create_activity_from_table_data(activity_data, source_file)
if activity:
activities.append(activity)
return activities
def _create_activity_from_text(self, text, source_file, title=None):
"""Creeaz un dicionar de activitate din text"""
if not text or len(text) < 30:
return None
activity = {
'name': title or text[:100].split('.')[0].strip(),
'description': text[:1000],
'source_file': str(source_file),
'category': self._detect_category(text),
'keywords': self._extract_keywords(text),
'created_at': datetime.now().isoformat()
}
# Extract additional metadata
activity.update(self._extract_metadata(text))
return activity
def _create_activity_from_table_data(self, data, source_file):
"""Creeaz activitate din date de tabel"""
activity = {
'source_file': str(source_file),
'created_at': datetime.now().isoformat()
}
# Map table fields to DB fields
field_mapping = {
'nume': 'name', 'titlu': 'name', 'joc': 'name', 'activitate': 'name',
'descriere': 'description', 'detalii': 'description', 'explicatie': 'description',
'materiale': 'materials_list', 'echipament': 'materials_list',
'varsta': 'age_group_min', 'categoria': 'category',
'participanti': 'participants_min', 'numar': 'participants_min',
'durata': 'duration_min', 'timp': 'duration_min'
}
for table_field, db_field in field_mapping.items():
if table_field in data:
activity[db_field] = data[table_field]
# Minimal validation
if 'name' in activity and len(activity.get('name', '')) > 5:
return activity
return None
def _extract_metadata(self, text):
"""Extrage metadata din text folosind pattern-uri"""
metadata = {}
# Extract age range
for pattern in self.activity_patterns['age_patterns']:
match = re.search(pattern, text)
if match:
metadata['age_group_min'] = int(match.group(1))
metadata['age_group_max'] = int(match.group(2)) if match.lastindex >= 2 else int(match.group(1))
break
# Extract number of participants
for pattern in self.activity_patterns['participants_patterns']:
match = re.search(pattern, text)
if match:
metadata['participants_min'] = int(match.group(1))
metadata['participants_max'] = int(match.group(2)) if match.lastindex >= 2 else int(match.group(1))
break
# Extract duration
for pattern in self.activity_patterns['duration_patterns']:
match = re.search(pattern, text)
if match:
metadata['duration_min'] = int(match.group(1))
metadata['duration_max'] = int(match.group(2)) if match.lastindex >= 2 else int(match.group(1))
break
# Extract materials
materials = []
text_lower = text.lower()
for marker in self.activity_patterns['materials_markers']:
idx = text_lower.find(marker)
if idx != -1:
# Take the next 200 characters after the marker
materials_text = text[idx:idx+200]
# Extract items from the list
items = re.findall(r'[-•]\s*([^\n•-]+)', materials_text)
if items:
materials.extend(items)
if materials:
metadata['materials_list'] = ', '.join(materials[:10]) # max 10 materials
return metadata
def _detect_category(self, text):
"""Detecteaz categoria activitii bazat pe cuvinte cheie"""
text_lower = text.lower()
for category, keywords in self.categories.items():
if any(keyword in text_lower for keyword in keywords):
return category
return '[A]' # default: games category
def _extract_keywords(self, text):
"""Extrage cuvinte cheie din text"""
keywords = []
text_lower = text.lower()
# List of relevant keywords (matched against the Romanian source text)
keyword_list = [
'cooperare', 'competitie', 'echipa', 'creativitate', 'miscare',
'strategie', 'comunicare', 'incredere', 'coordonare', 'atentie',
'reflexe', 'logica', 'imaginatie', 'muzica', 'dans', 'sport',
'natura', 'mediu', 'stiinta', 'matematica', 'limba', 'cultura'
]
for keyword in keyword_list:
if keyword in text_lower:
keywords.append(keyword)
return ', '.join(keywords[:5]) # max 5 keywords
def save_to_database(self, activities):
"""Salveaz activitile <20>n baza de date"""
conn = sqlite3.connect(self.db_path)
cursor = conn.cursor()
saved_count = 0
duplicate_count = 0
for activity in activities:
try:
# Check for duplicates
cursor.execute(
"SELECT id FROM activities WHERE name = ? AND source_file = ?",
(activity.get('name'), activity.get('source_file'))
)
if cursor.fetchone():
duplicate_count += 1
continue
# Prepare values for the insert
columns = []
values = []
placeholders = []
for key, value in activity.items():
if key != 'created_at': # Skip created_at, it has default
columns.append(key)
values.append(value)
placeholders.append('?')
# Insert into the DB
query = f"INSERT INTO activities ({', '.join(columns)}) VALUES ({', '.join(placeholders)})"
cursor.execute(query, values)
saved_count += 1
except Exception as e:
print(f"Error saving activity: {e}")
continue
conn.commit()
conn.close()
return saved_count, duplicate_count
def process_all_html_files(self, base_path='/mnt/d/GoogleDrive/Cercetasi/carti-camp-jocuri'):
"""Proceseaz toate fiierele HTML din directorul specificat"""
base_path = Path(base_path)
html_files = list(base_path.rglob("*.html"))
html_files.extend(list(base_path.rglob("*.htm")))
print(f"Found {len(html_files)} HTML files to process")
all_activities = []
processed = 0
errors = 0
for i, html_file in enumerate(html_files):
try:
activities = self.extract_from_html(str(html_file))
all_activities.extend(activities)
processed += 1
# Progress update
if (i + 1) % 100 == 0:
print(f"Progress: {i+1}/{len(html_files)} files processed, {len(all_activities)} activities found")
# Save batch to DB
if all_activities:
saved, dupes = self.save_to_database(all_activities)
print(f"Batch saved: {saved} new activities, {dupes} duplicates skipped")
all_activities = [] # Clear buffer
except Exception as e:
print(f"Error processing {html_file}: {e}")
errors += 1
# Save remaining activities
if all_activities:
saved, dupes = self.save_to_database(all_activities)
print(f"Final batch saved: {saved} new activities, {dupes} duplicates skipped")
print(f"\nProcessing complete!")
print(f"Files processed: {processed}")
print(f"Errors: {errors}")
return processed, errors
# Main function for testing
if __name__ == "__main__":
extractor = HTMLActivityExtractor()
# Test on a sample file first
print("Testing on sample file first...")
# Find a few HTML files for testing
test_files = list(Path('/mnt/d/GoogleDrive/Cercetasi/carti-camp-jocuri').rglob("*.html"))[:3]
for test_file in test_files:
print(f"\nTesting: {test_file}")
activities = extractor.extract_from_html(str(test_file))
print(f"Found {len(activities)} activities")
if activities:
print(f"Sample activity: {activities[0]['name'][:50]}...")
# Ask whether to continue with full processing
response = input("\nContinue with full processing? (y/n): ")
if response.lower() == 'y':
extractor.process_all_html_files()


@@ -0,0 +1,78 @@
#!/usr/bin/env python3
"""
Import activities extracted by Claude from JSON files
"""
import json
import sqlite3
from pathlib import Path
from datetime import datetime
class ClaudeActivityImporter:
def __init__(self, db_path='data/activities.db'):
self.db_path = db_path
self.json_dir = Path('scripts/extracted_activities')
self.json_dir.mkdir(exist_ok=True)
def import_json_file(self, json_path):
"""Import activities from a single JSON file"""
with open(json_path, 'r', encoding='utf-8') as f:
data = json.load(f)
source_file = data.get('source_file', str(json_path))
activities = data.get('activities', [])
conn = sqlite3.connect(self.db_path)
cursor = conn.cursor()
imported = 0
for activity in activities:
try:
# Add source file and timestamp
activity['source_file'] = source_file
activity['created_at'] = datetime.now().isoformat()
# Prepare insert
columns = list(activity.keys())
values = list(activity.values())
placeholders = ['?' for _ in values]
# Check for duplicate
cursor.execute(
"SELECT id FROM activities WHERE name = ? AND source_file = ?",
(activity.get('name'), source_file)
)
if not cursor.fetchone():
query = f"INSERT INTO activities ({', '.join(columns)}) VALUES ({', '.join(placeholders)})"
cursor.execute(query, values)
imported += 1
except Exception as e:
print(f"Error importing activity: {e}")
conn.commit()
conn.close()
print(f"Imported {imported} activities from {json_path.name}")
return imported
def import_all_json_files(self):
"""Import all JSON files from the extracted_activities directory"""
json_files = list(self.json_dir.glob("*.json"))
if not json_files:
print("No JSON files found in extracted_activities directory")
return 0
total_imported = 0
for json_file in json_files:
imported = self.import_json_file(json_file)
total_imported += imported
print(f"\nTotal imported: {total_imported} activities from {len(json_files)} files")
return total_imported
if __name__ == "__main__":
importer = ClaudeActivityImporter()
importer.import_all_json_files()
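One quick way to smoke-test the importer above is to drop a minimal JSON file, shaped like the extraction template, into `scripts/extracted_activities/` and run the import. A sketch, assuming the `activities` table already exists with at least `name`, `description`, `category`, `source_file` and `created_at` columns (the sample content is purely illustrative):

```python
import json
from pathlib import Path
from import_claude_activities import ClaudeActivityImporter

# Illustrative sample following the extraction template above.
sample = {
    "source_file": "sample.pdf",
    "activities": [
        {"name": "Sample game", "description": "A short test activity", "category": "[A]"}
    ],
}

out_dir = Path("scripts/extracted_activities")
out_dir.mkdir(parents=True, exist_ok=True)
(out_dir / "sample.json").write_text(
    json.dumps(sample, ensure_ascii=False, indent=2), encoding="utf-8"
)

# Import everything found in the directory, including the sample above.
ClaudeActivityImporter().import_all_json_files()
```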


@@ -1,200 +0,0 @@
#!/usr/bin/env python3
"""
Data indexing script for INDEX-SISTEM-JOCURI v2.0
Extracts activities from INDEX_MASTER and populates database
"""
import sys
import os
from pathlib import Path
# Add app directory to Python path
sys.path.insert(0, str(Path(__file__).parent.parent))
from app.models.database import DatabaseManager
from app.services.indexer import ActivityIndexer
from app.config import Config
import argparse
import time
def main():
"""Main indexing function"""
parser = argparse.ArgumentParser(description='Index activities from INDEX_MASTER')
parser.add_argument('--clear', action='store_true', help='Clear existing database before indexing')
parser.add_argument('--category', help='Index specific category only (e.g., [A], [B], etc.)')
parser.add_argument('--verify', action='store_true', help='Verify indexing quality after completion')
parser.add_argument('--stats', action='store_true', help='Show database statistics only')
args = parser.parse_args()
# Setup paths
Config.ensure_directories()
# Database path
db_path = os.environ.get('DATABASE_URL', str(Config.DATA_DIR / 'activities.db'))
if db_path.startswith('sqlite:///'):
db_path = db_path[10:] # Remove sqlite:/// prefix
# INDEX_MASTER path
index_master_path = os.environ.get('INDEX_MASTER_FILE', str(Config.INDEX_MASTER_FILE))
print("🎯 INDEX-SISTEM-JOCURI v2.0 - Data Indexing")
print("=" * 50)
print(f"Database: {db_path}")
print(f"INDEX_MASTER: {index_master_path}")
print("=" * 50)
# Verify INDEX_MASTER file exists
if not Path(index_master_path).exists():
print(f"❌ INDEX_MASTER file not found: {index_master_path}")
print(" Please ensure the file is mounted in the container or available locally")
return 1
# Initialize services
try:
db_manager = DatabaseManager(db_path)
indexer = ActivityIndexer(db_manager, index_master_path)
except Exception as e:
print(f"❌ Error initializing services: {e}")
return 1
# Handle different operations
if args.stats:
return show_statistics(db_manager)
if args.category:
return index_category(indexer, args.category)
if args.verify:
return verify_indexing(indexer)
# Default: full indexing
return full_indexing(indexer, args.clear)
def full_indexing(indexer: ActivityIndexer, clear_existing: bool) -> int:
"""Perform full indexing of all activities"""
print("🚀 Starting full indexing process...")
try:
# Perform indexing
result = indexer.index_all_activities(clear_existing=clear_existing)
if not result.get('success'):
print(f"❌ Indexing failed: {result.get('error', 'Unknown error')}")
return 1
# Print results
print("\n📊 INDEXING RESULTS")
print("=" * 30)
print(f"✅ Activities inserted: {result.get('inserted_count', 0)}")
print(f"⏱️ Indexing time: {result.get('indexing_time_seconds', 0):.2f}s")
parsing_stats = result.get('parsing_stats', {})
print(f"📈 Completion rate: {parsing_stats.get('completion_rate', 0):.1f}%")
print(f"📝 Avg description length: {parsing_stats.get('average_description_length', 0):.0f} chars")
# Category breakdown
categories = result.get('distribution', {}).get('categories', {})
print(f"\n📂 CATEGORY BREAKDOWN:")
for category, count in categories.items():
print(f" {category}: {count} activities")
# Quality check
if result.get('inserted_count', 0) >= 500:
print(f"\n🎯 SUCCESS: Target of 500+ activities achieved!")
else:
print(f"\n⚠️ Warning: Only {result.get('inserted_count', 0)} activities indexed (target: 500+)")
return 0
except Exception as e:
print(f"❌ Error during indexing: {e}")
return 1
def index_category(indexer: ActivityIndexer, category_code: str) -> int:
"""Index a specific category"""
print(f"🎯 Indexing category: {category_code}")
try:
result = indexer.index_specific_category(category_code)
if not result.get('success'):
print(f"❌ Category indexing failed: {result.get('error', 'Unknown error')}")
return 1
print(f"✅ Category '{result.get('category')}' indexed successfully")
print(f" Inserted: {result.get('inserted_count')} activities")
print(f" Parsed: {result.get('total_parsed')} total")
print(f" Valid: {result.get('valid_activities')} valid")
return 0
except Exception as e:
print(f"❌ Error during category indexing: {e}")
return 1
def verify_indexing(indexer: ActivityIndexer) -> int:
"""Verify indexing quality"""
print("🔍 Verifying indexing quality...")
try:
result = indexer.verify_indexing_quality()
if 'error' in result:
print(f"❌ Verification error: {result['error']}")
return 1
print("\n📊 QUALITY VERIFICATION")
print("=" * 30)
print(f"Total activities: {result.get('total_activities', 0)}")
print(f"Meets minimum (500+): {'' if result.get('meets_minimum_requirement') else ''}")
print(f"Category coverage: {result.get('category_coverage', 0)}/{result.get('expected_categories', 8)}")
print(f"Quality score: {result.get('quality_score', 0)}/100")
quality_issues = result.get('quality_issues', [])
if quality_issues:
print(f"\n⚠️ Quality Issues:")
for issue in quality_issues[:5]: # Show first 5 issues
print(f"{issue}")
if len(quality_issues) > 5:
print(f" ... and {len(quality_issues) - 5} more issues")
else:
print(f"\n✅ No quality issues detected")
return 0 if result.get('quality_score', 0) >= 80 else 1
except Exception as e:
print(f"❌ Error during verification: {e}")
return 1
def show_statistics(db_manager: DatabaseManager) -> int:
"""Show database statistics"""
print("📊 Database Statistics")
print("=" * 25)
try:
stats = db_manager.get_statistics()
print(f"Total activities: {stats.get('total_activities', 0)}")
print(f"Database size: {stats.get('database_size_bytes', 0) / 1024:.1f} KB")
print(f"Database path: {stats.get('database_path', 'Unknown')}")
categories = stats.get('categories', {})
if categories:
print(f"\nCategories:")
for category, count in categories.items():
print(f" {category}: {count}")
return 0
except Exception as e:
print(f"❌ Error getting statistics: {e}")
return 1
if __name__ == '__main__':
exit_code = main()
sys.exit(exit_code)

scripts/pdf_extractor.py (new file, 0 lines added)

@@ -0,0 +1,143 @@
#!/usr/bin/env python3
"""
PDF Mass Conversion to Text for Activity Extraction
Handles all PDF sizes efficiently with multiple fallback methods
"""
import os
import json
from pathlib import Path
import PyPDF2
import pdfplumber
from typing import List, Dict
import logging
class PDFConverter:
def __init__(self, max_pages=50):
self.max_pages = max_pages
self.conversion_stats = {}
def convert_pdf_to_text(self, pdf_path: str) -> str:
"""Convert PDF to text using multiple methods with fallbacks"""
try:
# Method 1: pdfplumber (best for tables and layout)
return self._convert_with_pdfplumber(pdf_path)
except Exception as e:
print(f"pdfplumber failed for {pdf_path}: {e}")
try:
# Method 2: PyPDF2 (fallback)
return self._convert_with_pypdf2(pdf_path)
except Exception as e2:
print(f"PyPDF2 also failed for {pdf_path}: {e2}")
return ""
def _convert_with_pdfplumber(self, pdf_path: str) -> str:
"""Primary conversion method using pdfplumber"""
text_content = ""
with pdfplumber.open(pdf_path) as pdf:
total_pages = len(pdf.pages)
pages_to_process = min(total_pages, self.max_pages)
print(f" Converting {pdf_path}: {pages_to_process}/{total_pages} pages")
for i, page in enumerate(pdf.pages[:pages_to_process]):
try:
page_text = page.extract_text()
if page_text:
text_content += f"\n--- PAGE {i+1} ---\n"
text_content += page_text
text_content += "\n"
except Exception as e:
print(f" Error on page {i+1}: {e}")
continue
self.conversion_stats[pdf_path] = {
'method': 'pdfplumber',
'pages_processed': pages_to_process,
'total_pages': total_pages,
'success': True,
'text_length': len(text_content)
}
return text_content
def _convert_with_pypdf2(self, pdf_path: str) -> str:
"""Fallback conversion method using PyPDF2"""
text_content = ""
with open(pdf_path, 'rb') as file:
reader = PyPDF2.PdfReader(file)
total_pages = len(reader.pages)
pages_to_process = min(total_pages, self.max_pages)
print(f" Converting {pdf_path} (fallback): {pages_to_process}/{total_pages} pages")
for i in range(pages_to_process):
try:
page = reader.pages[i]
page_text = page.extract_text()
if page_text:
text_content += f"\n--- PAGE {i+1} ---\n"
text_content += page_text
text_content += "\n"
except Exception as e:
print(f" Error on page {i+1}: {e}")
continue
self.conversion_stats[pdf_path] = {
'method': 'PyPDF2',
'pages_processed': pages_to_process,
'total_pages': total_pages,
'success': True,
'text_length': len(text_content)
}
return text_content
def convert_all_pdfs(self, pdf_directory: str, output_directory: str):
"""Convert all PDFs in directory to text files"""
pdf_files = list(Path(pdf_directory).glob("**/*.pdf"))
print(f"🔄 Converting {len(pdf_files)} PDF files to text...")
os.makedirs(output_directory, exist_ok=True)
for i, pdf_path in enumerate(pdf_files):
print(f"\n[{i+1}/{len(pdf_files)}] Processing {pdf_path.name}...")
# Convert to text
text_content = self.convert_pdf_to_text(str(pdf_path))
if text_content.strip():
# Save as text file
output_file = Path(output_directory) / f"{pdf_path.stem}.txt"
with open(output_file, 'w', encoding='utf-8') as f:
f.write(f"SOURCE: {pdf_path}\n")
f.write(f"CONVERTED: 2025-01-11\n")
f.write("="*50 + "\n\n")
f.write(text_content)
print(f" ✅ Saved: {output_file}")
else:
print(f" ❌ No text extracted from {pdf_path.name}")
# Save conversion statistics
stats_file = Path(output_directory) / "conversion_stats.json"
with open(stats_file, 'w', encoding='utf-8') as f:
json.dump(self.conversion_stats, f, indent=2, ensure_ascii=False)
print(f"\n🎉 PDF conversion complete! Check {output_directory}")
return len([f for f in self.conversion_stats.values() if f['success']])
# Usage
if __name__ == "__main__":
converter = PDFConverter(max_pages=50)
# Convert all PDFs
pdf_dir = "/mnt/d/GoogleDrive/Cercetasi/carti-camp-jocuri"
output_dir = "/mnt/d/GoogleDrive/Cercetasi/carti-camp-jocuri/INDEX-SISTEM-JOCURI/converted_pdfs"
converted_count = converter.convert_all_pdfs(pdf_dir, output_dir)
print(f"Final result: {converted_count} PDFs successfully converted")

scripts/run_extraction.py (new file, 50 lines added)

@@ -0,0 +1,50 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Main extraction orchestrator
Runs the entire extraction process
"""
import sys
import time
from pathlib import Path
from unified_processor import UnifiedProcessor
from import_claude_activities import ClaudeActivityImporter
def main():
print("="*60)
print("ACTIVITY EXTRACTION SYSTEM")
print("Strategy S8: Hybrid Claude + Scripts")
print("="*60)
# Step 1: Run automated extraction
print("\nSTEP 1: Automated Extraction")
print("-"*40)
processor = UnifiedProcessor()
processor.process_automated_formats()
# Step 2: Wait for Claude processing
print("\n" + "="*60)
print("STEP 2: Manual Claude Processing Required")
print("-"*40)
print("Please process PDF/DOC files with Claude using the template.")
print("Files are listed in: pdf_doc_for_claude.txt")
print("Save extracted activities as JSON in: scripts/extracted_activities/")
print("="*60)
response = input("\nHave you completed Claude processing? (y/n): ")
if response.lower() == 'y':
# Step 3: Import Claude-extracted activities
print("\nSTEP 3: Importing Claude-extracted activities")
print("-"*40)
importer = ClaudeActivityImporter()
importer.import_all_json_files()
print("\n" + "="*60)
print("EXTRACTION COMPLETE!")
print("="*60)
if __name__ == "__main__":
main()

scripts/text_extractor.py (new file, 197 lines added)

@@ -0,0 +1,197 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Text/Markdown Activity Extractor
Processes TXT and MD files for activity extraction
"""
import re
from pathlib import Path
from typing import List, Dict
import sqlite3
from datetime import datetime
class TextActivityExtractor:
def __init__(self, db_path='data/activities.db'):
self.db_path = db_path
self.activity_patterns = {
'section_headers': [
r'^#{1,6}\s*(.+)$', # Markdown headers
r'^([A-Z][^\.]{10,100})$', # Plain titles
r'^\d+\.\s*(.+)$', # Numbered lists
r'^[•\-\*]\s*(.+)$', # Bullet points
],
'activity_markers': [
'joc:', 'activitate:', 'exercitiu:', 'team building:',
'nume:', 'titlu:', 'denumire:'
]
}
def extract_from_text(self, file_path: str) -> List[Dict]:
"""Extrage activitati din fisier text/markdown"""
activities = []
try:
with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
content = f.read()
# Method 1: look for markdown sections
if file_path.endswith('.md'):
activities.extend(self._extract_from_markdown(content, file_path))
# Method 2: look for general patterns
activities.extend(self._extract_from_patterns(content, file_path))
# Method 3: look for structured text blocks
activities.extend(self._extract_from_blocks(content, file_path))
except Exception as e:
print(f"Error processing {file_path}: {e}")
return activities
def _extract_from_markdown(self, content, source_file):
"""Extrage activitati din format markdown"""
activities = []
lines = content.split('\n')
current_activity = None
current_content = []
for line in lines:
# Check whether the line is an activity header
if re.match(r'^#{1,3}\s*(.+)', line):
# Save the previous activity if there is one
if current_activity and current_content:
current_activity['description'] = '\n'.join(current_content[:20]) # max 20 lines
activities.append(current_activity)
# Check whether the new header is an activity
header_text = re.sub(r'^#{1,3}\s*', '', line)
if any(marker in header_text.lower() for marker in ['joc', 'activitate', 'exercitiu']):
current_activity = {
'name': header_text[:200],
'source_file': str(source_file),
'category': '[A]'
}
current_content = []
else:
current_activity = None
elif current_activity:
# Add content to the current activity
if line.strip():
current_content.append(line)
# Save the last activity
if current_activity and current_content:
current_activity['description'] = '\n'.join(current_content[:20])
activities.append(current_activity)
return activities
def _extract_from_patterns(self, content, source_file):
"""Extrage folosind pattern matching"""
activities = []
# Look for specific activity markers
for marker in self.activity_patterns['activity_markers']:
pattern = re.compile(f'{re.escape(marker)}\\s*(.+?)(?=\\n\\n|{re.escape(marker)}|$)',
re.IGNORECASE | re.DOTALL)
matches = pattern.finditer(content)
for match in matches:
activity_text = match.group(1)
if len(activity_text) > 20:
activity = {
'name': activity_text.split('\n')[0][:200],
'description': activity_text[:1000],
'source_file': str(source_file),
'category': '[A]'
}
activities.append(activity)
return activities
def _extract_from_blocks(self, content, source_file):
"""Extrage din blocuri de text separate"""
activities = []
# Split into blocks separated by blank lines
blocks = re.split(r'\n\s*\n', content)
for block in blocks:
if len(block) > 50: # at least 50 characters
lines = block.strip().split('\n')
first_line = lines[0].strip()
# Check whether the block looks like an activity
if any(keyword in first_line.lower() for keyword in ['joc', 'activitate', 'exercitiu']):
activity = {
'name': first_line[:200],
'description': block[:1000],
'source_file': str(source_file),
'category': '[A]'
}
activities.append(activity)
return activities
def save_to_database(self, activities):
"""Salveaza in baza de date"""
conn = sqlite3.connect(self.db_path)
cursor = conn.cursor()
saved_count = 0
for activity in activities:
try:
# Check for duplicates
cursor.execute(
"SELECT id FROM activities WHERE name = ? AND source_file = ?",
(activity.get('name'), activity.get('source_file'))
)
if not cursor.fetchone():
columns = list(activity.keys())
values = list(activity.values())
placeholders = ['?' for _ in values]
query = f"INSERT INTO activities ({', '.join(columns)}) VALUES ({', '.join(placeholders)})"
cursor.execute(query, values)
saved_count += 1
except Exception as e:
print(f"Error saving: {e}")
conn.commit()
conn.close()
return saved_count
def process_all_text_files(self, base_path='/mnt/d/GoogleDrive/Cercetasi/carti-camp-jocuri'):
"""Proceseaza toate fisierele text si markdown"""
base_path = Path(base_path)
text_files = list(base_path.rglob("*.txt"))
md_files = list(base_path.rglob("*.md"))
all_files = text_files + md_files
print(f"Found {len(all_files)} text/markdown files")
all_activities = []
for file_path in all_files:
activities = self.extract_from_text(str(file_path))
all_activities.extend(activities)
print(f"Processed {file_path.name}: {len(activities)} activities")
# Save to database
saved = self.save_to_database(all_activities)
print(f"\nTotal saved: {saved} activities from {len(all_files)} files")
return len(all_files), saved
if __name__ == "__main__":
extractor = TextActivityExtractor()
extractor.process_all_text_files()


@@ -0,0 +1,151 @@
#!/usr/bin/env python3
"""
Unified Activity Processor
Orchestrates all extractors for end-to-end processing
"""
import time
from pathlib import Path
from html_extractor import HTMLActivityExtractor
from text_extractor import TextActivityExtractor
import sqlite3
class UnifiedProcessor:
def __init__(self, db_path='data/activities.db'):
self.db_path = db_path
self.html_extractor = HTMLActivityExtractor(db_path)
self.text_extractor = TextActivityExtractor(db_path)
self.stats = {
'html_processed': 0,
'text_processed': 0,
'pdf_to_process': 0,
'doc_to_process': 0,
'total_activities': 0,
'start_time': None,
'end_time': None
}
def get_current_activity_count(self):
"""Obine numrul curent de activiti din DB"""
conn = sqlite3.connect(self.db_path)
cursor = conn.cursor()
cursor.execute("SELECT COUNT(*) FROM activities")
count = cursor.fetchone()[0]
conn.close()
return count
def count_files_to_process(self, base_path):
"""Numr fiierele care trebuie procesate"""
base_path = Path(base_path)
counts = {
'html': len(list(base_path.rglob("*.html"))) + len(list(base_path.rglob("*.htm"))),
'txt': len(list(base_path.rglob("*.txt"))),
'md': len(list(base_path.rglob("*.md"))),
'pdf': len(list(base_path.rglob("*.pdf"))),
'doc': len(list(base_path.rglob("*.doc"))),
'docx': len(list(base_path.rglob("*.docx")))
}
return counts
def process_automated_formats(self, base_path='/mnt/d/GoogleDrive/Cercetasi/carti-camp-jocuri'):
"""Proceseaz toate formatele care pot fi automatizate"""
print("="*60)
print("UNIFIED ACTIVITY PROCESSOR - AUTOMATED PHASE")
print("="*60)
self.stats['start_time'] = time.time()
initial_count = self.get_current_activity_count()
# Show initial statistics
file_counts = self.count_files_to_process(base_path)
print(f"\nFiles to process:")
for format, count in file_counts.items():
print(f" {format.upper()}: {count} files")
print(f"\nCurrent activities in database: {initial_count}")
print("-"*60)
# PHASE 1: HTML processing (top priority, high volume)
print("\n[1/2] Processing HTML files...")
print("-"*40)
html_processed, html_errors = self.html_extractor.process_all_html_files(base_path)
self.stats['html_processed'] = html_processed
# PHASE 2: Text/MD processing
print("\n[2/2] Processing Text/Markdown files...")
print("-"*40)
text_processed, text_saved = self.text_extractor.process_all_text_files(base_path)
self.stats['text_processed'] = text_processed
# Final statistics
self.stats['end_time'] = time.time()
final_count = self.get_current_activity_count()
self.stats['total_activities'] = final_count - initial_count
# Identify the files that require manual processing
self.stats['pdf_to_process'] = file_counts['pdf']
self.stats['doc_to_process'] = file_counts['doc'] + file_counts['docx']
self.print_summary()
self.save_pdf_doc_list(base_path)
def print_summary(self):
"""Afieaz rezumatul procesrii"""
print("\n" + "="*60)
print("PROCESSING SUMMARY")
print("="*60)
duration = self.stats['end_time'] - self.stats['start_time']
print(f"\nAutomated Processing Results:")
print(f" HTML files processed: {self.stats['html_processed']}")
print(f" Text/MD files processed: {self.stats['text_processed']}")
print(f" New activities added: {self.stats['total_activities']}")
print(f" Processing time: {duration:.1f} seconds")
print(f"\nFiles requiring Claude processing:")
print(f" PDF files: {self.stats['pdf_to_process']}")
print(f" DOC/DOCX files: {self.stats['doc_to_process']}")
print("\n" + "="*60)
print("NEXT STEPS:")
print("1. Review the file 'pdf_doc_for_claude.txt' for manual processing")
print("2. Use Claude to extract activities from PDF/DOC files")
print("3. Focus on largest PDF files first (highest activity density)")
print("="*60)
def save_pdf_doc_list(self, base_path):
"""Salveaz lista de PDF/DOC pentru procesare cu Claude"""
base_path = Path(base_path)
pdf_files = sorted(base_path.rglob("*.pdf"), key=lambda p: p.stat().st_size, reverse=True)
doc_files = list(base_path.rglob("*.doc"))
docx_files = list(base_path.rglob("*.docx"))
with open('pdf_doc_for_claude.txt', 'w', encoding='utf-8') as f:
f.write("PDF/DOC FILES FOR CLAUDE PROCESSING\n")
f.write("="*60 + "\n")
f.write("Files sorted by size (largest first = likely more activities)\n\n")
f.write("TOP PRIORITY PDF FILES (process these first):\n")
f.write("-"*40 + "\n")
for i, pdf in enumerate(pdf_files[:20], 1):
size_mb = pdf.stat().st_size / (1024*1024)
f.write(f"{i}. {pdf.name} ({size_mb:.1f} MB)\n")
f.write(f" Path: {pdf}\n\n")
if len(pdf_files) > 20:
f.write(f"\n... and {len(pdf_files)-20} more PDF files\n\n")
f.write("\nDOC/DOCX FILES:\n")
f.write("-"*40 + "\n")
for doc in doc_files + docx_files:
size_kb = doc.stat().st_size / 1024
f.write(f"- {doc.name} ({size_kb:.1f} KB)\n")
print(f"\nPDF/DOC list saved to: pdf_doc_for_claude.txt")
if __name__ == "__main__":
processor = UnifiedProcessor()
processor.process_automated_formats()