- Remove obsolete documentation files (DEPLOYMENT.md, PLAN_IMPLEMENTARE_S8_DETALIAT.md, README.md) - Add comprehensive extraction pipeline with multiple format support (PDF, HTML, text) - Implement Claude-based activity extraction with structured templates - Update dependencies and Docker configuration - Reorganize scripts directory with modular extraction components - Move example documentation to appropriate location 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude <noreply@anthropic.com>
197 lines
7.3 KiB
Python
197 lines
7.3 KiB
Python
#!/usr/bin/env python3
|
|
# -*- coding: utf-8 -*-
|
|
"""
|
|
Text/Markdown Activity Extractor

Processes TXT and MD files to extract activities.
|
|
"""
|
|
|
|
import re
|
|
from pathlib import Path
|
|
from typing import List, Dict
|
|
import sqlite3
|
|
from datetime import datetime
|
|
|
|
class TextActivityExtractor:
    """Extract activity records from plain-text and Markdown files.

    Three heuristics are applied to each file (Markdown headers, explicit
    ``marker:`` prefixes, and blank-line separated blocks).  Every hit is a
    dict with the keys ``name``, ``description``, ``source_file`` and
    ``category``; the dicts can be persisted with :meth:`save_to_database`.
    """

    # Keywords ("game", "activity", "exercise" in Romanian) that flag a
    # header or the first line of a block as the start of an activity.
    _ACTIVITY_KEYWORDS = ('joc', 'activitate', 'exercitiu')

    # Level 1-3 Markdown header.  The negative lookahead keeps "####" and
    # deeper headings from being parsed as a level-3 header named "# ...".
    _MARKDOWN_HEADER = re.compile(r'^#{1,3}(?!#)\s*(.+)')

    # Column names are interpolated into SQL, so only plain identifiers pass.
    _COLUMN_NAME = re.compile(r'[A-Za-z_][A-Za-z0-9_]*')

    # Recognised file suffixes and the order in which they are processed.
    _SUFFIX_ORDER = {'.txt': 0, '.md': 1}

    def __init__(self, db_path='data/activities.db'):
        """Store the database location and the pattern tables.

        Args:
            db_path: Path of the SQLite database used by
                :meth:`save_to_database`.
        """
        self.db_path = db_path
        self.activity_patterns = {
            'section_headers': [
                r'^#{1,6}\s*(.+)$',         # Markdown headers
                r'^([A-Z][^\.]{10,100})$',  # Simple titles
                r'^\d+\.\s*(.+)$',          # Numbered lists
                r'^[•\-\*]\s*(.+)$',        # Bullet points
            ],
            'activity_markers': [
                'joc:', 'activitate:', 'exercitiu:', 'team building:',
                'nume:', 'titlu:', 'denumire:'
            ]
        }

    def extract_from_text(self, file_path: str) -> List[Dict]:
        """Extract activities from one text or Markdown file.

        Args:
            file_path: Location of the file, as a string or a ``Path``.

        Returns:
            The activities found by all heuristics, in heuristic order.  An
            empty list is returned (and the error printed) when the file
            cannot be read.
        """
        try:
            # Normalising through Path makes the suffix test work for Path
            # arguments and for upper-case extensions such as ".MD".
            is_markdown = Path(file_path).suffix.lower() == '.md'
            # Undecodable bytes are dropped rather than aborting the file.
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                content = f.read()
        except (OSError, TypeError, ValueError) as e:
            # Best effort: one unreadable file must not stop a batch run.
            print(f"Error processing {file_path}: {e}")
            return []

        activities = []

        # Method 1: Markdown sections
        if is_markdown:
            activities.extend(self._extract_from_markdown(content, file_path))

        # Method 2: generic "marker:" patterns
        activities.extend(self._extract_from_patterns(content, file_path))

        # Method 3: structured blocks of text
        activities.extend(self._extract_from_blocks(content, file_path))

        return activities

    @staticmethod
    def _close_activity(activities, activity, content_lines):
        """Append ``activity`` to ``activities`` if it collected any content."""
        if activity and content_lines:
            # Descriptions are capped at the first 20 non-blank lines.
            activity['description'] = '\n'.join(content_lines[:20])
            activities.append(activity)

    def _extract_from_markdown(self, content, source_file):
        """Extract activities introduced by level 1-3 Markdown headers.

        A header starts an activity when its text contains one of the
        activity keywords; the non-blank lines up to the next level 1-3
        header become the description.  Activities without any content
        lines are dropped.

        Args:
            content: Full text of the Markdown document.
            source_file: Origin of the text, stored as ``source_file``.

        Returns:
            A list of activity dicts.
        """
        activities = []
        current_activity = None
        current_content = []

        for line in content.split('\n'):
            header = self._MARKDOWN_HEADER.match(line)
            if header:
                # A new section begins: store the previous activity, if any.
                self._close_activity(activities, current_activity, current_content)

                header_text = header.group(1)
                lowered = header_text.lower()
                if any(keyword in lowered for keyword in self._ACTIVITY_KEYWORDS):
                    current_activity = {
                        'name': header_text[:200],
                        'source_file': str(source_file),
                        'category': '[A]'
                    }
                    current_content = []
                else:
                    current_activity = None
            elif current_activity and line.strip():
                # Body line (including deeper sub-headings) of the activity.
                current_content.append(line)

        # Store the last activity of the document.
        self._close_activity(activities, current_activity, current_content)

        return activities

    def _extract_from_patterns(self, content, source_file):
        """Extract activities that follow an explicit ``marker:`` prefix.

        For every marker in ``self.activity_patterns['activity_markers']``
        the text after it is captured up to the next blank line, the next
        occurrence of the same marker, or the end of the text.  Captures of
        20 characters or fewer are ignored.

        Args:
            content: Full text of the document.
            source_file: Origin of the text, stored as ``source_file``.

        Returns:
            A list of activity dicts.
        """
        activities = []

        for marker in self.activity_patterns['activity_markers']:
            escaped = re.escape(marker)
            pattern = re.compile(
                f'{escaped}\\s*(.+?)(?=\\n\\n|{escaped}|$)',
                re.IGNORECASE | re.DOTALL,
            )

            for match in pattern.finditer(content):
                activity_text = match.group(1)
                if len(activity_text) > 20:
                    activities.append({
                        'name': activity_text.split('\n')[0][:200],
                        'description': activity_text[:1000],
                        'source_file': str(source_file),
                        'category': '[A]'
                    })

        return activities

    def _extract_from_blocks(self, content, source_file):
        """Extract activities from blank-line separated blocks of text.

        A block longer than 50 characters counts as an activity when its
        first line contains one of the activity keywords.

        Args:
            content: Full text of the document.
            source_file: Origin of the text, stored as ``source_file``.

        Returns:
            A list of activity dicts.
        """
        activities = []

        # Split into blocks separated by empty (or whitespace-only) lines.
        for block in re.split(r'\n\s*\n', content):
            if len(block) <= 50:  # Minimum 50 characters
                continue

            first_line = block.strip().split('\n')[0].strip()
            lowered = first_line.lower()

            # Does the block look like an activity?
            if any(keyword in lowered for keyword in self._ACTIVITY_KEYWORDS):
                activities.append({
                    'name': first_line[:200],
                    'description': block[:1000],
                    'source_file': str(source_file),
                    'category': '[A]'
                })

        return activities

    def save_to_database(self, activities):
        """Insert activities into the ``activities`` table, skipping duplicates.

        A record is a duplicate when a row with the same ``name`` and
        ``source_file`` already exists, including rows inserted earlier in
        the same call.  Records that fail to save are reported on stdout
        and skipped.

        Args:
            activities: Iterable of dicts mapping column names to values.

        Returns:
            The number of rows inserted.
        """
        saved_count = 0
        conn = sqlite3.connect(self.db_path)
        try:
            cursor = conn.cursor()

            for activity in activities:
                try:
                    # Check for duplicates
                    cursor.execute(
                        "SELECT id FROM activities WHERE name = ? AND source_file = ?",
                        (activity.get('name'), activity.get('source_file'))
                    )
                    if cursor.fetchone():
                        continue

                    columns = list(activity.keys())
                    # Keys become part of the SQL text, so anything that is
                    # not a plain identifier is rejected before it gets there.
                    invalid = [
                        column for column in columns
                        if not isinstance(column, str)
                        or not self._COLUMN_NAME.fullmatch(column)
                    ]
                    if invalid or not columns:
                        raise ValueError(f"invalid column names: {invalid}")

                    placeholders = ', '.join('?' for _ in columns)
                    query = f"INSERT INTO activities ({', '.join(columns)}) VALUES ({placeholders})"
                    cursor.execute(query, list(activity.values()))
                    saved_count += 1
                except Exception as e:
                    # Best effort: one bad record must not abort the batch.
                    print(f"Error saving: {e}")

            conn.commit()
        finally:
            # Release the connection even if the loop or the commit raised.
            conn.close()

        return saved_count

    def process_all_text_files(self, base_path='/mnt/d/GoogleDrive/Cercetasi/carti-camp-jocuri'):
        """Process every ``.txt`` and ``.md`` file below ``base_path``.

        Suffixes are compared case-insensitively and only regular files are
        considered.  Text files are processed before Markdown files, each
        group in path order.

        Args:
            base_path: Directory that is searched recursively.

        Returns:
            A tuple ``(files_found, activities_saved)``.
        """
        base_path = Path(base_path)

        all_files = sorted(
            (
                candidate for candidate in base_path.rglob('*')
                if candidate.suffix.lower() in self._SUFFIX_ORDER
                and candidate.is_file()
            ),
            key=lambda p: (self._SUFFIX_ORDER[p.suffix.lower()], str(p)),
        )

        print(f"Found {len(all_files)} text/markdown files")

        all_activities = []

        for file_path in all_files:
            activities = self.extract_from_text(str(file_path))
            all_activities.extend(activities)
            print(f"Processed {file_path.name}: {len(activities)} activities")

        # Save to database
        saved = self.save_to_database(all_activities)
        print(f"\nTotal saved: {saved} activities from {len(all_files)} files")

        return len(all_files), saved
|
|
|
|
if __name__ == "__main__":
    # Script entry point: scan the default directory tree and store the
    # extracted activities in the default database.
    TextActivityExtractor().process_all_text_files()