Refactor extraction system and reorganize project structure

- Remove obsolete documentation files (DEPLOYMENT.md, PLAN_IMPLEMENTARE_S8_DETALIAT.md, README.md)
- Add comprehensive extraction pipeline with multiple format support (PDF, HTML, text)
- Implement Claude-based activity extraction with structured templates
- Update dependencies and Docker configuration
- Reorganize scripts directory with modular extraction components
- Move example documentation to appropriate location

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-09-11 23:32:37 +03:00
parent 1b6b7e06ad
commit a19ddf0b71
119 changed files with 91074 additions and 1859 deletions
--- a/scripts/text_extractor.py
+++ b/scripts/text_extractor.py
@@ -0,0 +1,197 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Text/Markdown Activity Extractor
+Proceseaza fisiere TXT si MD pentru extractie activitati
+"""
+
+import re
+from pathlib import Path
+from typing import List, Dict
+import sqlite3
+from datetime import datetime
+
+class TextActivityExtractor:
+    """Extract activity records from plain-text and Markdown files.
+
+    Each file is scanned with three heuristics: Markdown headers, inline
+    markers such as ``joc:`` and blank-line-separated blocks.  Every record
+    is a dict with the keys ``name``, ``description``, ``source_file`` and
+    ``category``; records can be stored in the ``activities`` table of a
+    SQLite database.
+    """
+
+    # Substrings that flag a header or a block's first line as an activity.
+    _ACTIVITY_KEYWORDS = ('joc', 'activitate', 'exercitiu')
+    # Category value written on every extracted record.
+    _CATEGORY = '[A]'
+    # Truncation limits applied to the extracted fields.
+    _MAX_NAME_CHARS = 200
+    _MAX_DESCRIPTION_CHARS = 1000
+    _MAX_DESCRIPTION_LINES = 20
+
+    def __init__(self, db_path='data/activities.db'):
+        """Create an extractor.
+
+        Args:
+            db_path: Path of the SQLite database used by
+                ``save_to_database``.
+        """
+        self.db_path = db_path
+        self.activity_patterns = {
+            'section_headers': [
+                r'^#{1,6}\s*(.+)$',  # Markdown headers
+                r'^([A-Z][^\.]{10,100})$',  # Plain title lines
+                r'^\d+\.\s*(.+)$',  # Numbered lists
+                r'^[•\-\*]\s*(.+)$',  # Bullet points
+            ],
+            'activity_markers': [
+                'joc:', 'activitate:', 'exercitiu:', 'team building:',
+                'nume:', 'titlu:', 'denumire:'
+            ]
+        }
+
+    def extract_from_text(self, file_path: str) -> List[Dict]:
+        """Extract activities from one text or Markdown file.
+
+        Args:
+            file_path: Path of the file to read; a ``str`` or a
+                ``pathlib.Path`` is accepted.
+
+        Returns:
+            The records produced by all heuristics, concatenated in the
+            order markdown, markers, blocks.  An empty list is returned
+            when the file cannot be read.
+        """
+        try:
+            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
+                content = f.read()
+        except (OSError, ValueError) as e:
+            # Best effort: one unreadable file must not abort a batch run.
+            print(f"Error processing {file_path}: {e}")
+            return []
+
+        source = str(file_path)
+        activities = []
+
+        # Method 1: Markdown sections.  The suffix is compared on the string
+        # form and case-insensitively so Path objects and "*.MD" both work.
+        if source.lower().endswith('.md'):
+            activities.extend(self._extract_from_markdown(content, source))
+
+        # Method 2: generic "marker: text" patterns.
+        activities.extend(self._extract_from_patterns(content, source))
+
+        # Method 3: structured blocks of text.
+        activities.extend(self._extract_from_blocks(content, source))
+
+        return activities
+
+    def _looks_like_activity(self, text):
+        """Return True when *text* contains one of the activity keywords."""
+        lowered = text.lower()
+        return any(keyword in lowered for keyword in self._ACTIVITY_KEYWORDS)
+
+    def _close_markdown_activity(self, activities, activity, body_lines):
+        """Append *activity* to *activities* if it collected any body lines."""
+        if activity and body_lines:
+            activity['description'] = '\n'.join(
+                body_lines[:self._MAX_DESCRIPTION_LINES]
+            )
+            activities.append(activity)
+
+    def _extract_from_markdown(self, content, source_file):
+        """Extract activities from Markdown headers and the text below them.
+
+        A line starting with one to three ``#`` characters opens a section.
+        The section becomes an activity when its header contains an
+        activity keyword; the non-blank lines that follow (at most
+        ``_MAX_DESCRIPTION_LINES``) form the description.  Activities with
+        no body lines are dropped.
+        """
+        activities = []
+        current = None
+        body_lines = []
+
+        for line in content.split('\n'):
+            if re.match(r'^#{1,3}\s*(.+)', line):
+                # A new header closes whatever activity was being collected.
+                self._close_markdown_activity(activities, current, body_lines)
+
+                header_text = re.sub(r'^#{1,3}\s*', '', line)
+                if self._looks_like_activity(header_text):
+                    current = {
+                        'name': header_text[:self._MAX_NAME_CHARS],
+                        'source_file': str(source_file),
+                        'category': self._CATEGORY
+                    }
+                    body_lines = []
+                else:
+                    current = None
+            elif current and line.strip():
+                body_lines.append(line)
+
+        # The last activity has no following header to close it.
+        self._close_markdown_activity(activities, current, body_lines)
+
+        return activities
+
+    def _extract_from_patterns(self, content, source_file):
+        """Extract activities introduced by an inline marker such as ``joc:``.
+
+        For every marker in ``activity_patterns['activity_markers']`` the
+        text after the marker is captured up to the next blank line, the
+        next occurrence of the same marker, or the end of the content.
+        Captures of 20 characters or fewer are ignored.
+        """
+        activities = []
+
+        for marker in self.activity_patterns['activity_markers']:
+            escaped = re.escape(marker)
+            # (?<!\w) keeps a marker from matching inside a longer word,
+            # e.g. "prenume:" must not count as "nume:".
+            pattern = re.compile(
+                rf'(?<!\w){escaped}\s*(.+?)(?=\n\n|(?<!\w){escaped}|$)',
+                re.IGNORECASE | re.DOTALL,
+            )
+
+            for match in pattern.finditer(content):
+                activity_text = match.group(1)
+                if len(activity_text) > 20:
+                    activities.append({
+                        'name': activity_text.split('\n')[0][:self._MAX_NAME_CHARS],
+                        'description': activity_text[:self._MAX_DESCRIPTION_CHARS],
+                        'source_file': str(source_file),
+                        'category': self._CATEGORY
+                    })
+
+        return activities
+
+    def _extract_from_blocks(self, content, source_file):
+        """Extract activities from blocks separated by blank lines.
+
+        A block longer than 50 characters is kept when its first line
+        contains an activity keyword; that line becomes the name and the
+        start of the block becomes the description.
+        """
+        activities = []
+
+        for block in re.split(r'\n\s*\n', content):
+            if len(block) <= 50:  # Too short to be an activity.
+                continue
+
+            first_line = block.strip().split('\n')[0].strip()
+            if self._looks_like_activity(first_line):
+                activities.append({
+                    'name': first_line[:self._MAX_NAME_CHARS],
+                    'description': block[:self._MAX_DESCRIPTION_CHARS],
+                    'source_file': str(source_file),
+                    'category': self._CATEGORY
+                })
+
+        return activities
+
+    def save_to_database(self, activities):
+        """Insert new activities into the ``activities`` table.
+
+        A record is skipped when a row with the same ``name`` and
+        ``source_file`` already exists.  Rows that fail to save are
+        reported on stdout and do not stop the remaining rows.
+
+        Args:
+            activities: Iterable of record dicts; the dict keys are used as
+                column names.
+
+        Returns:
+            The number of rows inserted.
+        """
+        if not activities:
+            # Nothing to store, so do not open (or create) the database.
+            return 0
+
+        saved_count = 0
+        conn = sqlite3.connect(self.db_path)
+        try:
+            cursor = conn.cursor()
+
+            for activity in activities:
+                try:
+                    # Check for duplicates
+                    cursor.execute(
+                        "SELECT id FROM activities WHERE name = ? AND source_file = ?",
+                        (activity.get('name'), activity.get('source_file'))
+                    )
+                    if cursor.fetchone():
+                        continue
+
+                    columns = list(activity.keys())
+                    # Column names cannot be bound as parameters, so only
+                    # plain identifiers may be interpolated into the SQL.
+                    invalid = [
+                        column for column in columns
+                        if not (isinstance(column, str) and column.isidentifier())
+                    ]
+                    if invalid:
+                        raise ValueError(f"invalid column name(s): {invalid}")
+
+                    placeholders = ', '.join('?' for _ in columns)
+                    query = (
+                        f"INSERT INTO activities ({', '.join(columns)}) "
+                        f"VALUES ({placeholders})"
+                    )
+                    cursor.execute(query, list(activity.values()))
+                    saved_count += 1
+
+                except (sqlite3.Error, ValueError) as e:
+                    print(f"Error saving: {e}")
+
+            conn.commit()
+        finally:
+            # Release the connection even if something above raised.
+            conn.close()
+
+        return saved_count
+
+    def process_all_text_files(self, base_path='/mnt/d/GoogleDrive/Cercetasi/carti-camp-jocuri'):
+        """Extract and store activities from every ``*.txt``/``*.md`` file.
+
+        Args:
+            base_path: Directory searched recursively.  The default is a
+                machine-specific path; pass an explicit directory elsewhere.
+
+        Returns:
+            A ``(files_processed, rows_saved)`` tuple.
+        """
+        base_path = Path(base_path)
+
+        # Sorted so files are processed in a reproducible order.
+        text_files = sorted(base_path.rglob("*.txt"))
+        md_files = sorted(base_path.rglob("*.md"))
+        all_files = text_files + md_files
+
+        print(f"Found {len(all_files)} text/markdown files")
+
+        all_activities = []
+
+        for file_path in all_files:
+            activities = self.extract_from_text(str(file_path))
+            all_activities.extend(activities)
+            print(f"Processed {file_path.name}: {len(activities)} activities")
+
+        # Save to database
+        saved = self.save_to_database(all_activities)
+        print(f"\nTotal saved: {saved} activities from {len(all_files)} files")
+
+        return len(all_files), saved
+
+if __name__ == "__main__":
+    # Script entry point: scan the default source directory with the default
+    # database and store whatever activities are found.
+    TextActivityExtractor().process_all_text_files()