Refactor extraction system and reorganize project structure
- Remove obsolete documentation files (DEPLOYMENT.md, PLAN_IMPLEMENTARE_S8_DETALIAT.md, README.md)
- Add comprehensive extraction pipeline with multiple format support (PDF, HTML, text)
- Implement Claude-based activity extraction with structured templates
- Update dependencies and Docker configuration
- Reorganize scripts directory with modular extraction components
- Move example documentation to appropriate location

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
197
scripts/text_extractor.py
Normal file
197
scripts/text_extractor.py
Normal file
@@ -0,0 +1,197 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Text/Markdown Activity Extractor
|
||||
Proceseaza fisiere TXT si MD pentru extractie activitati
|
||||
"""
|
||||
|
||||
import re
|
||||
from pathlib import Path
|
||||
from typing import List, Dict
|
||||
import sqlite3
|
||||
from datetime import datetime
|
||||
|
||||
class TextActivityExtractor:
    """Extract activity records from TXT/Markdown files and store them in SQLite.

    Every extracted activity is a plain dict with the keys ``name``,
    ``description``, ``source_file`` and ``category``.  Three independent
    heuristics are applied to each file (Markdown headings, inline markers
    such as ``joc:``, and blank-line separated blocks); their results are
    concatenated, so the same passage may be reported by more than one of them.
    """

    # Keywords that mark a heading / first line as describing an activity.
    _ACTIVITY_KEYWORDS = ('joc', 'activitate', 'exercitiu')
    # Level 1-3 Markdown heading.  The ``(?!#)`` lookahead stops ``####`` and
    # deeper headings from matching through backtracking.
    _MD_HEADER_RE = re.compile(r'^#{1,3}(?!#)\s*(.+)')
    _MD_HEADER_PREFIX_RE = re.compile(r'^#{1,3}(?!#)\s*')
    # Size limits applied to the stored fields.
    _MAX_MD_DESCRIPTION_LINES = 20
    _MAX_NAME_CHARS = 200
    _MAX_DESCRIPTION_CHARS = 1000

    def __init__(self, db_path='data/activities.db'):
        """Store the database location and the pattern tables.

        Args:
            db_path: Path of the SQLite database used by ``save_to_database``.
        """
        self.db_path = db_path
        self.activity_patterns = {
            'section_headers': [
                r'^#{1,6}\s*(.+)$',  # Markdown headers
                r'^([A-Z][^\.]{10,100})$',  # Plain title lines
                r'^\d+\.\s*(.+)$',  # Numbered lists
                r'^[•\-\*]\s*(.+)$',  # Bullet points
            ],
            'activity_markers': [
                'joc:', 'activitate:', 'exercitiu:', 'team building:',
                'nume:', 'titlu:', 'denumire:'
            ]
        }

    def extract_from_text(self, file_path: str) -> List[Dict]:
        """Extract activities from one text or Markdown file.

        Args:
            file_path: Path of the file to read (decoded as UTF-8, undecodable
                bytes are dropped).

        Returns:
            The activities found by all heuristics, in the order
            Markdown headings, inline markers, text blocks.  On any error the
            problem is printed and whatever was collected so far is returned.
        """
        activities = []
        # Accept path-like objects too; all helpers work on the string form.
        source = str(file_path)

        # Best-effort boundary: one unreadable or odd file must not abort a
        # whole batch run, so every failure is reported and swallowed here.
        try:
            with open(source, 'r', encoding='utf-8', errors='ignore') as f:
                content = f.read()

            # Method 1: Markdown sections (extension compared case-insensitively
            # so that e.g. "NOTES.MD" is handled as Markdown as well).
            if source.lower().endswith('.md'):
                activities.extend(self._extract_from_markdown(content, source))

            # Method 2: generic inline markers.
            activities.extend(self._extract_from_patterns(content, source))

            # Method 3: structured text blocks.
            activities.extend(self._extract_from_blocks(content, source))

        except Exception as e:
            print(f"Error processing {file_path}: {e}")

        return activities

    def _looks_like_activity(self, text):
        """Return True when *text* contains one of the activity keywords."""
        lowered = text.lower()
        return any(keyword in lowered for keyword in self._ACTIVITY_KEYWORDS)

    def _close_markdown_activity(self, activities, activity, content_lines):
        """Append *activity* to *activities* if it has collected any content."""
        if activity and content_lines:
            activity['description'] = '\n'.join(
                content_lines[:self._MAX_MD_DESCRIPTION_LINES]
            )
            activities.append(activity)

    def _extract_from_markdown(self, content, source_file):
        """Extract activities from Markdown sections.

        A level 1-3 heading containing an activity keyword opens an activity;
        the non-blank lines up to the next level 1-3 heading become its
        description (at most 20 lines).  Deeper headings (``####`` and below)
        are treated as ordinary content.  Activities without any content
        lines are dropped.
        """
        activities = []
        current_activity = None
        current_content = []

        for line in content.split('\n'):
            if self._MD_HEADER_RE.match(line):
                # A new section starts: store the previous activity, if any.
                self._close_markdown_activity(
                    activities, current_activity, current_content
                )

                header_text = self._MD_HEADER_PREFIX_RE.sub('', line)
                if self._looks_like_activity(header_text):
                    current_activity = {
                        'name': header_text[:self._MAX_NAME_CHARS],
                        'source_file': str(source_file),
                        'category': '[A]'
                    }
                    current_content = []
                else:
                    # Non-activity section: ignore its lines.
                    current_activity = None

            elif current_activity and line.strip():
                current_content.append(line)

        # Store the activity that was still open at end of file.
        self._close_markdown_activity(
            activities, current_activity, current_content
        )

        return activities

    def _extract_from_patterns(self, content, source_file):
        """Extract activities introduced by an inline marker such as ``joc:``.

        For each marker in ``self.activity_patterns['activity_markers']`` the
        text following it is captured up to the next blank line, the next
        occurrence of the same marker, or the end of the content.  Captures of
        20 characters or fewer are ignored.
        """
        activities = []
        flags = re.IGNORECASE | re.DOTALL

        for marker in self.activity_patterns['activity_markers']:
            escaped = re.escape(marker)
            pattern = re.compile(
                escaped + r'\s*(.+?)(?=\n\n|' + escaped + r'|$)', flags
            )

            for match in pattern.finditer(content):
                activity_text = match.group(1)
                if len(activity_text) <= 20:
                    continue
                activities.append({
                    'name': activity_text.split('\n')[0][:self._MAX_NAME_CHARS],
                    'description': activity_text[:self._MAX_DESCRIPTION_CHARS],
                    'source_file': str(source_file),
                    'category': '[A]'
                })

        return activities

    def _extract_from_blocks(self, content, source_file):
        """Extract activities from blank-line separated text blocks.

        A block longer than 50 characters whose first line contains an
        activity keyword becomes one activity; that first line is the name and
        the first 1000 characters of the block are the description.
        """
        activities = []

        for block in re.split(r'\n\s*\n', content):
            if len(block) <= 50:  # Minimum 50 characters
                continue

            first_line = block.strip().split('\n')[0].strip()
            if self._looks_like_activity(first_line):
                activities.append({
                    'name': first_line[:self._MAX_NAME_CHARS],
                    'description': block[:self._MAX_DESCRIPTION_CHARS],
                    'source_file': str(source_file),
                    'category': '[A]'
                })

        return activities

    def save_to_database(self, activities):
        """Insert activities into the ``activities`` table, skipping duplicates.

        An activity is a duplicate when a row with the same ``name`` and
        ``source_file`` already exists.  The dict keys are used as column
        names, so they must be plain identifiers; rows that violate this, or
        that fail to insert for any other reason, are reported and skipped.

        Args:
            activities: Iterable of activity dicts.

        Returns:
            Number of rows actually inserted.
        """
        if not activities:
            # Nothing to store: do not open (or implicitly create) the database.
            return 0

        saved_count = 0
        conn = sqlite3.connect(self.db_path)
        try:
            cursor = conn.cursor()

            for activity in activities:
                # Per-row best effort: a bad row must not stop the batch.
                try:
                    columns = list(activity)
                    # Column names are spliced into the SQL text, so refuse
                    # anything that is not a simple identifier.
                    if not all(
                        isinstance(col, str) and col.isidentifier()
                        for col in columns
                    ):
                        raise ValueError(f"invalid column name in {columns!r}")

                    # Check for duplicates
                    cursor.execute(
                        "SELECT id FROM activities WHERE name = ? AND source_file = ?",
                        (activity.get('name'), activity.get('source_file'))
                    )
                    if cursor.fetchone():
                        continue

                    placeholders = ', '.join('?' for _ in columns)
                    query = f"INSERT INTO activities ({', '.join(columns)}) VALUES ({placeholders})"
                    cursor.execute(query, [activity[col] for col in columns])
                    saved_count += 1

                except Exception as e:
                    print(f"Error saving: {e}")

            conn.commit()
        finally:
            # Always release the connection, even if commit() fails.
            conn.close()

        return saved_count

    def process_all_text_files(self, base_path='/mnt/d/GoogleDrive/Cercetasi/carti-camp-jocuri'):
        """Extract and store activities from every TXT/MD file under *base_path*.

        Files are found recursively; extensions are matched case-insensitively
        and only regular files are considered.  ``.txt`` files are processed
        first, then ``.md`` files, each group in sorted order.

        Args:
            base_path: Directory to scan.

        Returns:
            Tuple ``(number_of_files_processed, number_of_rows_saved)``.
        """
        root = Path(base_path)

        candidates = [p for p in root.rglob("*") if p.is_file()]
        text_files = sorted(p for p in candidates if p.suffix.lower() == '.txt')
        md_files = sorted(p for p in candidates if p.suffix.lower() == '.md')
        all_files = text_files + md_files

        print(f"Found {len(all_files)} text/markdown files")

        all_activities = []

        for file_path in all_files:
            activities = self.extract_from_text(str(file_path))
            all_activities.extend(activities)
            print(f"Processed {file_path.name}: {len(activities)} activities")

        # Save to database
        saved = self.save_to_database(all_activities)
        print(f"\nTotal saved: {saved} activities from {len(all_files)} files")

        return len(all_files), saved
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: build an extractor with its default database path
    # and process the default base directory.
    TextActivityExtractor().process_all_text_files()
|
||||
Reference in New Issue
Block a user