#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Text/Markdown Activity Extractor

Processes TXT and MD files to extract activities.
"""

import re
from contextlib import closing
from pathlib import Path
from typing import List, Dict, Union
import sqlite3
from datetime import datetime


class TextActivityExtractor:
    """Extract activity records from text/markdown files and store them in SQLite.

    Three independent heuristics are applied to every file (markdown headers,
    inline markers such as ``joc:``, and blank-line separated blocks).  They
    are not de-duplicated against each other here; ``save_to_database`` skips
    records whose ``(name, source_file)`` pair is already stored.
    """

    # Keywords (Romanian) that flag a header / first line as an activity.
    _ACTIVITY_KEYWORDS = ('joc', 'activitate', 'exercitiu')

    # Level 1-3 markdown header.  The negative lookahead keeps deeper headers
    # ("####" and below) from being mis-read as a level-3 header whose title
    # starts with '#'.
    _MD_HEADER_RE = re.compile(r'^#{1,3}(?!#)\s*(.+)')

    # Column names are interpolated into the INSERT statement, so only plain
    # identifiers are accepted.
    _SQL_IDENTIFIER_RE = re.compile(r'[A-Za-z_][A-Za-z0-9_]*')

    _MAX_NAME_CHARS = 200
    _MAX_DESCRIPTION_CHARS = 1000
    _MAX_MARKDOWN_LINES = 20
    _MIN_PATTERN_CHARS = 20
    _MIN_BLOCK_CHARS = 50

    def __init__(self, db_path='data/activities.db'):
        """Store the database path and the pattern tables.

        Args:
            db_path: Path of the SQLite database used by ``save_to_database``.
        """
        self.db_path = db_path
        self.activity_patterns = {
            # NOTE: 'section_headers' is not referenced by the extraction
            # methods in this file; it is kept because it is a public attribute.
            'section_headers': [
                r'^#{1,6}\s*(.+)$',  # Markdown headers
                r'^([A-Z][^\.]{10,100})$',  # Plain titles
                r'^\d+\.\s*(.+)$',  # Numbered lists
                r'^[•\-\*]\s*(.+)$',  # Bullet points
            ],
            'activity_markers': [
                'joc:', 'activitate:', 'exercitiu:', 'team building:',
                'nume:', 'titlu:', 'denumire:'
            ]
        }

    def extract_from_text(self, file_path: Union[str, Path]) -> List[Dict]:
        """Extract activities from a text/markdown file.

        Args:
            file_path: File to read, as ``str`` or ``Path``.  Undecodable
                bytes are ignored.

        Returns:
            The activities found by all applicable strategies, or an empty
            list when the file cannot be read (the error is printed).
        """
        try:
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                content = f.read()
        except (OSError, ValueError) as e:
            # Best effort: one unreadable file must not abort a batch run.
            print(f"Error processing {file_path}: {e}")
            return []

        activities = []

        # Method 1: markdown sections (extension compared case-insensitively).
        if str(file_path).lower().endswith('.md'):
            activities.extend(self._extract_from_markdown(content, file_path))

        # Method 2: generic inline markers.
        activities.extend(self._extract_from_patterns(content, file_path))

        # Method 3: structured text blocks.
        activities.extend(self._extract_from_blocks(content, file_path))

        return activities

    def _close_markdown_activity(self, activities, activity, content_lines):
        """Append *activity* to *activities* if it exists and has content."""
        if activity and content_lines:
            # Keep at most the first 20 non-blank lines as the description.
            activity['description'] = '\n'.join(
                content_lines[:self._MAX_MARKDOWN_LINES]
            )
            activities.append(activity)

    def _extract_from_markdown(self, content, source_file):
        """Extract activities from markdown sections.

        A level 1-3 header whose text contains an activity keyword opens an
        activity; the non-blank lines up to the next level 1-3 header form its
        description.  Deeper headers are treated as ordinary content.
        Activities without any content line are dropped.
        """
        activities = []
        current_activity = None
        current_content = []

        for line in content.split('\n'):
            header = self._MD_HEADER_RE.match(line)
            if header:
                # A new section starts: store the previous activity, if any.
                self._close_markdown_activity(
                    activities, current_activity, current_content
                )

                header_text = header.group(1)
                lowered = header_text.lower()
                if any(marker in lowered for marker in self._ACTIVITY_KEYWORDS):
                    current_activity = {
                        'name': header_text[:self._MAX_NAME_CHARS],
                        'source_file': str(source_file),
                        'category': '[A]'
                    }
                    current_content = []
                else:
                    current_activity = None
            elif current_activity and line.strip():
                # Non-blank line inside the current activity.
                current_content.append(line)

        # Store the last activity.
        self._close_markdown_activity(
            activities, current_activity, current_content
        )

        return activities

    def _extract_from_patterns(self, content, source_file):
        """Extract activities introduced by an inline marker such as ``joc:``.

        The captured text runs from the marker to the next blank line, the
        next occurrence of the same marker, or the end of the content.
        Captures of 20 characters or fewer are ignored.
        """
        activities = []

        for marker in self.activity_patterns['activity_markers']:
            # The lookbehind stops a marker from matching inside a longer
            # word (e.g. 'nume:' inside 'prenume:').
            token = rf'(?<!\w){re.escape(marker)}'
            pattern = re.compile(
                rf'{token}\s*(.+?)(?=\n\n|{token}|$)',
                re.IGNORECASE | re.DOTALL
            )

            for match in pattern.finditer(content):
                activity_text = match.group(1)
                if len(activity_text) > self._MIN_PATTERN_CHARS:
                    first_line = activity_text.split('\n')[0]
                    activities.append({
                        'name': first_line[:self._MAX_NAME_CHARS],
                        'description': activity_text[:self._MAX_DESCRIPTION_CHARS],
                        'source_file': str(source_file),
                        'category': '[A]'
                    })

        return activities

    def _extract_from_blocks(self, content, source_file):
        """Extract activities from blank-line separated blocks.

        A block longer than 50 characters whose first line contains an
        activity keyword becomes one activity.
        """
        activities = []

        # Split into blocks separated by empty (or whitespace-only) lines.
        for block in re.split(r'\n\s*\n', content):
            if len(block) <= self._MIN_BLOCK_CHARS:
                continue

            first_line = block.strip().split('\n')[0].strip()
            lowered = first_line.lower()

            # Does the block look like an activity?
            if any(keyword in lowered for keyword in self._ACTIVITY_KEYWORDS):
                activities.append({
                    'name': first_line[:self._MAX_NAME_CHARS],
                    'description': block[:self._MAX_DESCRIPTION_CHARS],
                    'source_file': str(source_file),
                    'category': '[A]'
                })

        return activities

    def save_to_database(self, activities):
        """Insert activities into the ``activities`` table, skipping duplicates.

        A record counts as a duplicate when a row with the same ``name`` and
        ``source_file`` already exists.  The dict keys are used as column
        names.  A record that cannot be saved is reported and skipped.

        Args:
            activities: Iterable of dicts mapping column names to values.

        Returns:
            Number of rows inserted.
        """
        saved_count = 0

        # closing() guarantees the connection is released even if commit()
        # or an unexpected error interrupts the batch.
        with closing(sqlite3.connect(self.db_path)) as conn:
            cursor = conn.cursor()

            for activity in activities:
                try:
                    # Check for duplicates
                    cursor.execute(
                        "SELECT id FROM activities WHERE name = ? AND source_file = ?",
                        (activity.get('name'), activity.get('source_file'))
                    )
                    if cursor.fetchone():
                        continue

                    columns = list(activity.keys())
                    invalid = [
                        col for col in columns
                        if not self._SQL_IDENTIFIER_RE.fullmatch(str(col))
                    ]
                    if invalid:
                        # Column names cannot be bound as parameters, so they
                        # are validated before being placed in the statement.
                        raise ValueError(f"invalid column name(s): {invalid}")

                    values = list(activity.values())
                    placeholders = ', '.join('?' for _ in values)
                    query = (
                        f"INSERT INTO activities ({', '.join(columns)}) "
                        f"VALUES ({placeholders})"
                    )
                    cursor.execute(query, values)
                    saved_count += 1
                except Exception as e:
                    # Deliberate per-record boundary: one bad record must not
                    # abort the rest of the batch.
                    print(f"Error saving: {e}")

            conn.commit()

        return saved_count

    def process_all_text_files(self, base_path='/mnt/d/GoogleDrive/Cercetasi/carti-camp-jocuri'):
        """Process every ``*.txt`` and ``*.md`` file below *base_path*.

        Args:
            base_path: Directory searched recursively.

        Returns:
            Tuple ``(number_of_files, number_of_saved_activities)``.
        """
        base_path = Path(base_path)

        # Sorted so that runs are deterministic.
        text_files = sorted(base_path.rglob("*.txt"))
        md_files = sorted(base_path.rglob("*.md"))
        all_files = text_files + md_files

        print(f"Found {len(all_files)} text/markdown files")

        all_activities = []
        for file_path in all_files:
            activities = self.extract_from_text(str(file_path))
            all_activities.extend(activities)
            print(f"Processed {file_path.name}: {len(activities)} activities")

        # Save to database
        saved = self.save_to_database(all_activities)
        print(f"\nTotal saved: {saved} activities from {len(all_files)} files")

        return len(all_files), saved


if __name__ == "__main__":
    extractor = TextActivityExtractor()
    extractor.process_all_text_files()