Refactor extraction system and reorganize project structure

- Remove obsolete documentation files (DEPLOYMENT.md, PLAN_IMPLEMENTARE_S8_DETALIAT.md, README.md)
- Add comprehensive extraction pipeline with multiple format support (PDF, HTML, text)
- Implement Claude-based activity extraction with structured templates
- Update dependencies and Docker configuration
- Reorganize scripts directory with modular extraction components
- Move example documentation to appropriate location

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-09-11 23:32:37 +03:00
parent 1b6b7e06ad
commit a19ddf0b71
119 changed files with 91074 additions and 1859 deletions

scripts/__init__.py (new file, 0 lines added)

@@ -0,0 +1,54 @@
# ACTIVITY EXTRACTION TEMPLATE FOR CLAUDE
## Instructions for Claude Code:
For each PDF/DOC, use the following extraction format:
### 1. Read the file:
```
Claude, please read the file: [CALE_FISIER]
```
### 2. Extract the activities using this JSON template:
```json
{
"source_file": "[NUME_FISIER]",
"activities": [
{
"name": "Activity name",
"description": "Full description of the activity",
"rules": "Rules of the game/activity",
"variations": "Variations or adaptations",
"category": "[A-H] based on type",
"age_group_min": 6,
"age_group_max": 14,
"participants_min": 4,
"participants_max": 20,
"duration_min": 10,
"duration_max": 30,
"materials_list": "List of required materials",
"skills_developed": "Skills developed",
"difficulty_level": "Ușor/Mediu/Dificil",
"keywords": "comma-separated keywords",
"tags": "relevant tags"
}
]
}
```
### 3. Save to a file:
After extraction, save the JSON to: `/scripts/extracted_activities/[NUME_FISIER].json` (a minimal validation sketch follows after this template).
### 4. Processing priorities:
**TOP PRIORITY (process these first):**
1. 1000 Fantastic Scout Games.pdf
2. Cartea Mare a jocurilor.pdf
3. 160-de-activitati-dinamice-jocuri-pentru-team-building.pdf
4. 101 Ways to Create an Unforgettable Camp Experience.pdf
5. 151 Awesome Summer Camp Nature Activities.pdf
**Focus categories:**
- [A] Scout Games
- [C] Camping & Outdoor Activities
- [G] Educational Activities
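Before importing, it can be worth sanity-checking each extracted JSON file against the field list above. Below is a minimal validation sketch; the `REQUIRED_KEYS` subset and the `validate_extraction` helper are illustrative choices, not part of the committed scripts:

```python
import json
from pathlib import Path

# Minimal subset of the template fields treated as mandatory here (an assumption;
# the template above lists many more, mostly optional, fields).
REQUIRED_KEYS = {"name", "description", "category"}

def validate_extraction(json_path: Path) -> list:
    """Return a list of problems found in one Claude-extracted JSON file."""
    problems = []
    data = json.loads(json_path.read_text(encoding="utf-8"))
    if "source_file" not in data:
        problems.append("missing top-level 'source_file'")
    for i, activity in enumerate(data.get("activities", [])):
        missing = REQUIRED_KEYS - set(activity)
        if missing:
            problems.append(f"activity {i}: missing {sorted(missing)}")
    return problems

if __name__ == "__main__":
    for path in Path("scripts/extracted_activities").glob("*.json"):
        issues = validate_extraction(path)
        print(f"{path.name}: {'OK' if not issues else issues}")
```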

scripts/html_extractor.py (new file, 424 lines added)

@@ -0,0 +1,424 @@
#!/usr/bin/env python3
"""
HTML Activity Extractor - processes 1876 HTML files
Automatically extracts activities using pattern recognition
"""
import os
import re
import json
from pathlib import Path
from bs4 import BeautifulSoup
import chardet
from typing import List, Dict, Optional
import sqlite3
from datetime import datetime
class HTMLActivityExtractor:
def __init__(self, db_path='data/activities.db'):
self.db_path = db_path
# Patterns for detecting activities in Romanian text
self.activity_patterns = {
'title_patterns': [
r'(?i)(joc|activitate|exerci[țt]iu|team[\s-]?building|energizer|ice[\s-]?breaker)[\s:]+([^\.]{5,100})',
r'(?i)<h[1-6][^>]*>([^<]*(?:joc|activitate|exerci[țt]iu)[^<]*)</h[1-6]>',
r'(?i)<strong>([^<]*(?:joc|activitate|exerci[țt]iu)[^<]*)</strong>',
r'(?i)^[\d]+\.?\s*([A-Z][^\.]{10,100}(?:joc|activitate|exerci[țt]iu)[^\.]{0,50})$',
],
'description_markers': [
'descriere', 'reguli', 'cum se joaca', 'instructiuni',
'obiectiv', 'desfasurare', 'explicatie', 'mod de joc'
],
'materials_markers': [
'materiale', 'necesare', 'echipament', 'ce avem nevoie',
'se folosesc', 'trebuie sa avem', 'dotari'
],
'age_patterns': [
r'(?i)v[âa]rst[ăa][\s:]+(\d+)[\s-]+(\d+)',
r'(?i)(\d+)[\s-]+(\d+)\s*ani',
r'(?i)pentru\s+(\d+)[\s-]+(\d+)\s*ani',
r'(?i)categoria?\s*(?:de\s*)?v[âa]rst[ăa][\s:]+(\d+)[\s-]+(\d+)',
],
'participants_patterns': [
r'(?i)(\d+)[\s-]+(\d+)\s*(?:participan[țt]i|juc[ăa]tori|persoane|copii)',
r'(?i)num[ăa]r\s*(?:de\s*)?(?:participan[țt]i|juc[ăa]tori)[\s:]+(\d+)[\s-]+(\d+)',
r'(?i)grup\s*de\s*(\d+)[\s-]+(\d+)',
],
'duration_patterns': [
r'(?i)durat[ăa][\s:]+(\d+)[\s-]+(\d+)\s*(?:minute|min)',
r'(?i)timp[\s:]+(\d+)[\s-]+(\d+)\s*(?:minute|min)',
r'(?i)(\d+)[\s-]+(\d+)\s*minute',
]
}
# Predefined categories based on the existing system
self.categories = {
'[A]': ['joc', 'joaca', 'distractie', 'amuzament'],
'[B]': ['aventura', 'explorare', 'descoperire'],
'[C]': ['camping', 'tabara', 'excursie', 'drumetie'],
'[D]': ['foc', 'flacara', 'lumina'],
'[E]': ['noduri', 'frânghii', 'sfori', 'legare'],
'[F]': ['bushcraft', 'supravietuire', 'survival'],
'[G]': ['educatie', 'educativ', 'invatare', 'scoala'],
'[H]': ['orientare', 'busola', 'harta', 'navigare']
}
def detect_encoding(self, file_path):
"""Detecteaz encoding-ul fiierului"""
with open(file_path, 'rb') as f:
result = chardet.detect(f.read())
return result['encoding'] or 'utf-8'
def extract_from_html(self, html_path: str) -> List[Dict]:
"""Extrage activiti dintr-un singur fiier HTML"""
activities = []
try:
# Detect encoding and read the file
encoding = self.detect_encoding(html_path)
with open(html_path, 'r', encoding=encoding, errors='ignore') as f:
content = f.read()
soup = BeautifulSoup(content, 'lxml')
# Method 1: look for activity lists
activities.extend(self._extract_from_lists(soup, html_path))
# Method 2: look for activities in headings
activities.extend(self._extract_from_headings(soup, html_path))
# Method 3: look for text patterns
activities.extend(self._extract_from_patterns(soup, html_path))
# Method 4: look in tables
activities.extend(self._extract_from_tables(soup, html_path))
except Exception as e:
print(f"Error processing {html_path}: {e}")
return activities
def _extract_from_lists(self, soup, source_file):
"""Extrage activiti din liste HTML (ul, ol)"""
activities = []
for list_elem in soup.find_all(['ul', 'ol']):
# Check whether the list appears to contain activities
list_text = list_elem.get_text().lower()
if any(marker in list_text for marker in ['joc', 'activitate', 'exercitiu']):
for li in list_elem.find_all('li'):
text = li.get_text(strip=True)
if len(text) > 20: # at least 20 characters for a valid activity
activity = self._create_activity_from_text(text, source_file)
if activity:
activities.append(activity)
return activities
def _extract_from_headings(self, soup, source_file):
"""Extrage activiti bazate pe headings"""
activities = []
for heading in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6']):
heading_text = heading.get_text(strip=True)
# Check whether the heading contains keywords
if any(keyword in heading_text.lower() for keyword in ['joc', 'activitate', 'exercitiu']):
# Look for the description in the following elements
description = ""
next_elem = heading.find_next_sibling()
while next_elem and next_elem.name not in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
if next_elem.name in ['p', 'div', 'ul']:
description += next_elem.get_text(strip=True) + " "
if len(description) > 500: # cap description length
break
next_elem = next_elem.find_next_sibling()
if description:
activity = {
'name': heading_text[:200],
'description': description[:1000],
'source_file': str(source_file),
'category': self._detect_category(heading_text + " " + description)
}
activities.append(activity)
return activities
def _extract_from_patterns(self, soup, source_file):
"""Extrage activiti folosind pattern matching"""
activities = []
text = soup.get_text()
# Look for activity patterns
for pattern in self.activity_patterns['title_patterns']:
matches = re.finditer(pattern, text, re.MULTILINE)
for match in matches:
title = match.group(0) if match.lastindex == 0 else match.group(match.lastindex)
if len(title) > 10:
# Extract context around the match
start = max(0, match.start() - 200)
end = min(len(text), match.end() + 500)
context = text[start:end]
activity = self._create_activity_from_text(context, source_file, title)
if activity:
activities.append(activity)
return activities
def _extract_from_tables(self, soup, source_file):
"""Extrage activiti din tabele"""
activities = []
for table in soup.find_all('table'):
rows = table.find_all('tr')
if len(rows) > 1: # at least a header row and one data row
# Detect the relevant columns
headers = [th.get_text(strip=True).lower() for th in rows[0].find_all(['th', 'td'])]
for row in rows[1:]:
cells = row.find_all(['td'])
if cells:
activity_data = {}
for i, cell in enumerate(cells):
if i < len(headers):
activity_data[headers[i]] = cell.get_text(strip=True)
# Create an activity from the table data
if any(key in activity_data for key in ['joc', 'activitate', 'nume', 'titlu']):
activity = self._create_activity_from_table_data(activity_data, source_file)
if activity:
activities.append(activity)
return activities
def _create_activity_from_text(self, text, source_file, title=None):
"""Creeaz un dicionar de activitate din text"""
if not text or len(text) < 30:
return None
activity = {
'name': title or text[:100].split('.')[0].strip(),
'description': text[:1000],
'source_file': str(source_file),
'category': self._detect_category(text),
'keywords': self._extract_keywords(text),
'created_at': datetime.now().isoformat()
}
# Extract additional metadata
activity.update(self._extract_metadata(text))
return activity
def _create_activity_from_table_data(self, data, source_file):
"""Creeaz activitate din date de tabel"""
activity = {
'source_file': str(source_file),
'created_at': datetime.now().isoformat()
}
# Map table fields to DB fields
field_mapping = {
'nume': 'name', 'titlu': 'name', 'joc': 'name', 'activitate': 'name',
'descriere': 'description', 'detalii': 'description', 'explicatie': 'description',
'materiale': 'materials_list', 'echipament': 'materials_list',
'varsta': 'age_group_min', 'categoria': 'category',
'participanti': 'participants_min', 'numar': 'participants_min',
'durata': 'duration_min', 'timp': 'duration_min'
}
for table_field, db_field in field_mapping.items():
if table_field in data:
activity[db_field] = data[table_field]
# Minimal validation
if 'name' in activity and len(activity.get('name', '')) > 5:
return activity
return None
def _extract_metadata(self, text):
"""Extrage metadata din text folosind pattern-uri"""
metadata = {}
# Extract age range
for pattern in self.activity_patterns['age_patterns']:
match = re.search(pattern, text)
if match:
metadata['age_group_min'] = int(match.group(1))
metadata['age_group_max'] = int(match.group(2)) if match.lastindex >= 2 else int(match.group(1))
break
# Extract number of participants
for pattern in self.activity_patterns['participants_patterns']:
match = re.search(pattern, text)
if match:
metadata['participants_min'] = int(match.group(1))
metadata['participants_max'] = int(match.group(2)) if match.lastindex >= 2 else int(match.group(1))
break
# Extract duration
for pattern in self.activity_patterns['duration_patterns']:
match = re.search(pattern, text)
if match:
metadata['duration_min'] = int(match.group(1))
metadata['duration_max'] = int(match.group(2)) if match.lastindex >= 2 else int(match.group(1))
break
# Extract materials
materials = []
text_lower = text.lower()
for marker in self.activity_patterns['materials_markers']:
idx = text_lower.find(marker)
if idx != -1:
# Take the next 200 characters after the marker
materials_text = text[idx:idx+200]
# Extract items from the list
items = re.findall(r'[-•]\s*([^\n•-]+)', materials_text)
if items:
materials.extend(items)
if materials:
metadata['materials_list'] = ', '.join(materials[:10]) # max 10 materials
return metadata
def _detect_category(self, text):
"""Detecteaz categoria activitii bazat pe cuvinte cheie"""
text_lower = text.lower()
for category, keywords in self.categories.items():
if any(keyword in text_lower for keyword in keywords):
return category
return '[A]' # default: games category
def _extract_keywords(self, text):
"""Extrage cuvinte cheie din text"""
keywords = []
text_lower = text.lower()
# List of relevant keywords (matched against the Romanian source text)
keyword_list = [
'cooperare', 'competitie', 'echipa', 'creativitate', 'miscare',
'strategie', 'comunicare', 'incredere', 'coordonare', 'atentie',
'reflexe', 'logica', 'imaginatie', 'muzica', 'dans', 'sport',
'natura', 'mediu', 'stiinta', 'matematica', 'limba', 'cultura'
]
for keyword in keyword_list:
if keyword in text_lower:
keywords.append(keyword)
return ', '.join(keywords[:5]) # max 5 keywords
def save_to_database(self, activities):
"""Salveaz activitile <20>n baza de date"""
conn = sqlite3.connect(self.db_path)
cursor = conn.cursor()
saved_count = 0
duplicate_count = 0
for activity in activities:
try:
# Check for duplicates
cursor.execute(
"SELECT id FROM activities WHERE name = ? AND source_file = ?",
(activity.get('name'), activity.get('source_file'))
)
if cursor.fetchone():
duplicate_count += 1
continue
# Prepare values for the insert
columns = []
values = []
placeholders = []
for key, value in activity.items():
if key != 'created_at': # Skip created_at, it has default
columns.append(key)
values.append(value)
placeholders.append('?')
# Insert into the DB
query = f"INSERT INTO activities ({', '.join(columns)}) VALUES ({', '.join(placeholders)})"
cursor.execute(query, values)
saved_count += 1
except Exception as e:
print(f"Error saving activity: {e}")
continue
conn.commit()
conn.close()
return saved_count, duplicate_count
def process_all_html_files(self, base_path='/mnt/d/GoogleDrive/Cercetasi/carti-camp-jocuri'):
"""Proceseaz toate fiierele HTML din directorul specificat"""
base_path = Path(base_path)
html_files = list(base_path.rglob("*.html"))
html_files.extend(list(base_path.rglob("*.htm")))
print(f"Found {len(html_files)} HTML files to process")
all_activities = []
processed = 0
errors = 0
for i, html_file in enumerate(html_files):
try:
activities = self.extract_from_html(str(html_file))
all_activities.extend(activities)
processed += 1
# Progress update
if (i + 1) % 100 == 0:
print(f"Progress: {i+1}/{len(html_files)} files processed, {len(all_activities)} activities found")
# Save batch to DB
if all_activities:
saved, dupes = self.save_to_database(all_activities)
print(f"Batch saved: {saved} new activities, {dupes} duplicates skipped")
all_activities = [] # Clear buffer
except Exception as e:
print(f"Error processing {html_file}: {e}")
errors += 1
# Save remaining activities
if all_activities:
saved, dupes = self.save_to_database(all_activities)
print(f"Final batch saved: {saved} new activities, {dupes} duplicates skipped")
print(f"\nProcessing complete!")
print(f"Files processed: {processed}")
print(f"Errors: {errors}")
return processed, errors
# Main function for testing
if __name__ == "__main__":
extractor = HTMLActivityExtractor()
# Test on a sample file first
print("Testing on sample file first...")
# Find a few HTML files for testing
test_files = list(Path('/mnt/d/GoogleDrive/Cercetasi/carti-camp-jocuri').rglob("*.html"))[:3]
for test_file in test_files:
print(f"\nTesting: {test_file}")
activities = extractor.extract_from_html(str(test_file))
print(f"Found {len(activities)} activities")
if activities:
print(f"Sample activity: {activities[0]['name'][:50]}...")
# Ask whether to continue with full processing
response = input("\nContinue with full processing? (y/n): ")
if response.lower() == 'y':
extractor.process_all_html_files()


@@ -0,0 +1,78 @@
#!/usr/bin/env python3
"""
Import activities extracted by Claude from JSON files
"""
import json
import sqlite3
from pathlib import Path
from datetime import datetime
class ClaudeActivityImporter:
def __init__(self, db_path='data/activities.db'):
self.db_path = db_path
self.json_dir = Path('scripts/extracted_activities')
self.json_dir.mkdir(exist_ok=True)
def import_json_file(self, json_path):
"""Import activities from a single JSON file"""
with open(json_path, 'r', encoding='utf-8') as f:
data = json.load(f)
source_file = data.get('source_file', str(json_path))
activities = data.get('activities', [])
conn = sqlite3.connect(self.db_path)
cursor = conn.cursor()
imported = 0
for activity in activities:
try:
# Add source file and timestamp
activity['source_file'] = source_file
activity['created_at'] = datetime.now().isoformat()
# Prepare insert
columns = list(activity.keys())
values = list(activity.values())
placeholders = ['?' for _ in values]
# Check for duplicate
cursor.execute(
"SELECT id FROM activities WHERE name = ? AND source_file = ?",
(activity.get('name'), source_file)
)
if not cursor.fetchone():
query = f"INSERT INTO activities ({', '.join(columns)}) VALUES ({', '.join(placeholders)})"
cursor.execute(query, values)
imported += 1
except Exception as e:
print(f"Error importing activity: {e}")
conn.commit()
conn.close()
print(f"Imported {imported} activities from {json_path.name}")
return imported
def import_all_json_files(self):
"""Import all JSON files from the extracted_activities directory"""
json_files = list(self.json_dir.glob("*.json"))
if not json_files:
print("No JSON files found in extracted_activities directory")
return 0
total_imported = 0
for json_file in json_files:
imported = self.import_json_file(json_file)
total_imported += imported
print(f"\nTotal imported: {total_imported} activities from {len(json_files)} files")
return total_imported
if __name__ == "__main__":
importer = ClaudeActivityImporter()
importer.import_all_json_files()
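One quick way to smoke-test the importer above is to drop a minimal JSON file, shaped like the extraction template, into `scripts/extracted_activities/` and run the import. A sketch, assuming the `activities` table already exists with at least `name`, `description`, `category`, `source_file` and `created_at` columns (the sample content is purely illustrative):

```python
import json
from pathlib import Path
from import_claude_activities import ClaudeActivityImporter

# Illustrative sample following the extraction template above.
sample = {
    "source_file": "sample.pdf",
    "activities": [
        {"name": "Sample game", "description": "A short test activity", "category": "[A]"}
    ],
}

out_dir = Path("scripts/extracted_activities")
out_dir.mkdir(parents=True, exist_ok=True)
(out_dir / "sample.json").write_text(
    json.dumps(sample, ensure_ascii=False, indent=2), encoding="utf-8"
)

# Import everything found in the directory, including the sample above.
ClaudeActivityImporter().import_all_json_files()
```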


@@ -1,200 +0,0 @@
#!/usr/bin/env python3
"""
Data indexing script for INDEX-SISTEM-JOCURI v2.0
Extracts activities from INDEX_MASTER and populates database
"""
import sys
import os
from pathlib import Path
# Add app directory to Python path
sys.path.insert(0, str(Path(__file__).parent.parent))
from app.models.database import DatabaseManager
from app.services.indexer import ActivityIndexer
from app.config import Config
import argparse
import time
def main():
"""Main indexing function"""
parser = argparse.ArgumentParser(description='Index activities from INDEX_MASTER')
parser.add_argument('--clear', action='store_true', help='Clear existing database before indexing')
parser.add_argument('--category', help='Index specific category only (e.g., [A], [B], etc.)')
parser.add_argument('--verify', action='store_true', help='Verify indexing quality after completion')
parser.add_argument('--stats', action='store_true', help='Show database statistics only')
args = parser.parse_args()
# Setup paths
Config.ensure_directories()
# Database path
db_path = os.environ.get('DATABASE_URL', str(Config.DATA_DIR / 'activities.db'))
if db_path.startswith('sqlite:///'):
db_path = db_path[10:] # Remove sqlite:/// prefix
# INDEX_MASTER path
index_master_path = os.environ.get('INDEX_MASTER_FILE', str(Config.INDEX_MASTER_FILE))
print("🎯 INDEX-SISTEM-JOCURI v2.0 - Data Indexing")
print("=" * 50)
print(f"Database: {db_path}")
print(f"INDEX_MASTER: {index_master_path}")
print("=" * 50)
# Verify INDEX_MASTER file exists
if not Path(index_master_path).exists():
print(f"❌ INDEX_MASTER file not found: {index_master_path}")
print(" Please ensure the file is mounted in the container or available locally")
return 1
# Initialize services
try:
db_manager = DatabaseManager(db_path)
indexer = ActivityIndexer(db_manager, index_master_path)
except Exception as e:
print(f"❌ Error initializing services: {e}")
return 1
# Handle different operations
if args.stats:
return show_statistics(db_manager)
if args.category:
return index_category(indexer, args.category)
if args.verify:
return verify_indexing(indexer)
# Default: full indexing
return full_indexing(indexer, args.clear)
def full_indexing(indexer: ActivityIndexer, clear_existing: bool) -> int:
"""Perform full indexing of all activities"""
print("🚀 Starting full indexing process...")
try:
# Perform indexing
result = indexer.index_all_activities(clear_existing=clear_existing)
if not result.get('success'):
print(f"❌ Indexing failed: {result.get('error', 'Unknown error')}")
return 1
# Print results
print("\n📊 INDEXING RESULTS")
print("=" * 30)
print(f"✅ Activities inserted: {result.get('inserted_count', 0)}")
print(f"⏱️ Indexing time: {result.get('indexing_time_seconds', 0):.2f}s")
parsing_stats = result.get('parsing_stats', {})
print(f"📈 Completion rate: {parsing_stats.get('completion_rate', 0):.1f}%")
print(f"📝 Avg description length: {parsing_stats.get('average_description_length', 0):.0f} chars")
# Category breakdown
categories = result.get('distribution', {}).get('categories', {})
print(f"\n📂 CATEGORY BREAKDOWN:")
for category, count in categories.items():
print(f" {category}: {count} activities")
# Quality check
if result.get('inserted_count', 0) >= 500:
print(f"\n🎯 SUCCESS: Target of 500+ activities achieved!")
else:
print(f"\n⚠️ Warning: Only {result.get('inserted_count', 0)} activities indexed (target: 500+)")
return 0
except Exception as e:
print(f"❌ Error during indexing: {e}")
return 1
def index_category(indexer: ActivityIndexer, category_code: str) -> int:
"""Index a specific category"""
print(f"🎯 Indexing category: {category_code}")
try:
result = indexer.index_specific_category(category_code)
if not result.get('success'):
print(f"❌ Category indexing failed: {result.get('error', 'Unknown error')}")
return 1
print(f"✅ Category '{result.get('category')}' indexed successfully")
print(f" Inserted: {result.get('inserted_count')} activities")
print(f" Parsed: {result.get('total_parsed')} total")
print(f" Valid: {result.get('valid_activities')} valid")
return 0
except Exception as e:
print(f"❌ Error during category indexing: {e}")
return 1
def verify_indexing(indexer: ActivityIndexer) -> int:
"""Verify indexing quality"""
print("🔍 Verifying indexing quality...")
try:
result = indexer.verify_indexing_quality()
if 'error' in result:
print(f"❌ Verification error: {result['error']}")
return 1
print("\n📊 QUALITY VERIFICATION")
print("=" * 30)
print(f"Total activities: {result.get('total_activities', 0)}")
print(f"Meets minimum (500+): {'' if result.get('meets_minimum_requirement') else ''}")
print(f"Category coverage: {result.get('category_coverage', 0)}/{result.get('expected_categories', 8)}")
print(f"Quality score: {result.get('quality_score', 0)}/100")
quality_issues = result.get('quality_issues', [])
if quality_issues:
print(f"\n⚠️ Quality Issues:")
for issue in quality_issues[:5]: # Show first 5 issues
print(f"{issue}")
if len(quality_issues) > 5:
print(f" ... and {len(quality_issues) - 5} more issues")
else:
print(f"\n✅ No quality issues detected")
return 0 if result.get('quality_score', 0) >= 80 else 1
except Exception as e:
print(f"❌ Error during verification: {e}")
return 1
def show_statistics(db_manager: DatabaseManager) -> int:
"""Show database statistics"""
print("📊 Database Statistics")
print("=" * 25)
try:
stats = db_manager.get_statistics()
print(f"Total activities: {stats.get('total_activities', 0)}")
print(f"Database size: {stats.get('database_size_bytes', 0) / 1024:.1f} KB")
print(f"Database path: {stats.get('database_path', 'Unknown')}")
categories = stats.get('categories', {})
if categories:
print(f"\nCategories:")
for category, count in categories.items():
print(f" {category}: {count}")
return 0
except Exception as e:
print(f"❌ Error getting statistics: {e}")
return 1
if __name__ == '__main__':
exit_code = main()
sys.exit(exit_code)

scripts/pdf_extractor.py (new file, 0 lines added)

@@ -0,0 +1,143 @@
#!/usr/bin/env python3
"""
PDF Mass Conversion to Text for Activity Extraction
Handles all PDF sizes efficiently with multiple fallback methods
"""
import os
import json
from pathlib import Path
import PyPDF2
import pdfplumber
from typing import List, Dict
import logging
class PDFConverter:
def __init__(self, max_pages=50):
self.max_pages = max_pages
self.conversion_stats = {}
def convert_pdf_to_text(self, pdf_path: str) -> str:
"""Convert PDF to text using multiple methods with fallbacks"""
try:
# Method 1: pdfplumber (best for tables and layout)
return self._convert_with_pdfplumber(pdf_path)
except Exception as e:
print(f"pdfplumber failed for {pdf_path}: {e}")
try:
# Method 2: PyPDF2 (fallback)
return self._convert_with_pypdf2(pdf_path)
except Exception as e2:
print(f"PyPDF2 also failed for {pdf_path}: {e2}")
return ""
def _convert_with_pdfplumber(self, pdf_path: str) -> str:
"""Primary conversion method using pdfplumber"""
text_content = ""
with pdfplumber.open(pdf_path) as pdf:
total_pages = len(pdf.pages)
pages_to_process = min(total_pages, self.max_pages)
print(f" Converting {pdf_path}: {pages_to_process}/{total_pages} pages")
for i, page in enumerate(pdf.pages[:pages_to_process]):
try:
page_text = page.extract_text()
if page_text:
text_content += f"\n--- PAGE {i+1} ---\n"
text_content += page_text
text_content += "\n"
except Exception as e:
print(f" Error on page {i+1}: {e}")
continue
self.conversion_stats[pdf_path] = {
'method': 'pdfplumber',
'pages_processed': pages_to_process,
'total_pages': total_pages,
'success': True,
'text_length': len(text_content)
}
return text_content
def _convert_with_pypdf2(self, pdf_path: str) -> str:
"""Fallback conversion method using PyPDF2"""
text_content = ""
with open(pdf_path, 'rb') as file:
reader = PyPDF2.PdfReader(file)
total_pages = len(reader.pages)
pages_to_process = min(total_pages, self.max_pages)
print(f" Converting {pdf_path} (fallback): {pages_to_process}/{total_pages} pages")
for i in range(pages_to_process):
try:
page = reader.pages[i]
page_text = page.extract_text()
if page_text:
text_content += f"\n--- PAGE {i+1} ---\n"
text_content += page_text
text_content += "\n"
except Exception as e:
print(f" Error on page {i+1}: {e}")
continue
self.conversion_stats[pdf_path] = {
'method': 'PyPDF2',
'pages_processed': pages_to_process,
'total_pages': total_pages,
'success': True,
'text_length': len(text_content)
}
return text_content
def convert_all_pdfs(self, pdf_directory: str, output_directory: str):
"""Convert all PDFs in directory to text files"""
pdf_files = list(Path(pdf_directory).glob("**/*.pdf"))
print(f"🔄 Converting {len(pdf_files)} PDF files to text...")
os.makedirs(output_directory, exist_ok=True)
for i, pdf_path in enumerate(pdf_files):
print(f"\n[{i+1}/{len(pdf_files)}] Processing {pdf_path.name}...")
# Convert to text
text_content = self.convert_pdf_to_text(str(pdf_path))
if text_content.strip():
# Save as text file
output_file = Path(output_directory) / f"{pdf_path.stem}.txt"
with open(output_file, 'w', encoding='utf-8') as f:
f.write(f"SOURCE: {pdf_path}\n")
f.write(f"CONVERTED: 2025-01-11\n")
f.write("="*50 + "\n\n")
f.write(text_content)
print(f" ✅ Saved: {output_file}")
else:
print(f" ❌ No text extracted from {pdf_path.name}")
# Save conversion statistics
stats_file = Path(output_directory) / "conversion_stats.json"
with open(stats_file, 'w', encoding='utf-8') as f:
json.dump(self.conversion_stats, f, indent=2, ensure_ascii=False)
print(f"\n🎉 PDF conversion complete! Check {output_directory}")
return len([f for f in self.conversion_stats.values() if f['success']])
# Usage
if __name__ == "__main__":
converter = PDFConverter(max_pages=50)
# Convert all PDFs
pdf_dir = "/mnt/d/GoogleDrive/Cercetasi/carti-camp-jocuri"
output_dir = "/mnt/d/GoogleDrive/Cercetasi/carti-camp-jocuri/INDEX-SISTEM-JOCURI/converted_pdfs"
converted_count = converter.convert_all_pdfs(pdf_dir, output_dir)
print(f"Final result: {converted_count} PDFs successfully converted")

scripts/run_extraction.py (new file, 50 lines added)

@@ -0,0 +1,50 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Main extraction orchestrator
Runs the entire extraction process
"""
import sys
import time
from pathlib import Path
from unified_processor import UnifiedProcessor
from import_claude_activities import ClaudeActivityImporter
def main():
print("="*60)
print("ACTIVITY EXTRACTION SYSTEM")
print("Strategy S8: Hybrid Claude + Scripts")
print("="*60)
# Step 1: Run automated extraction
print("\nSTEP 1: Automated Extraction")
print("-"*40)
processor = UnifiedProcessor()
processor.process_automated_formats()
# Step 2: Wait for Claude processing
print("\n" + "="*60)
print("STEP 2: Manual Claude Processing Required")
print("-"*40)
print("Please process PDF/DOC files with Claude using the template.")
print("Files are listed in: pdf_doc_for_claude.txt")
print("Save extracted activities as JSON in: scripts/extracted_activities/")
print("="*60)
response = input("\nHave you completed Claude processing? (y/n): ")
if response.lower() == 'y':
# Step 3: Import Claude-extracted activities
print("\nSTEP 3: Importing Claude-extracted activities")
print("-"*40)
importer = ClaudeActivityImporter()
importer.import_all_json_files()
print("\n" + "="*60)
print("EXTRACTION COMPLETE!")
print("="*60)
if __name__ == "__main__":
main()

scripts/text_extractor.py (new file, 197 lines added)

@@ -0,0 +1,197 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Text/Markdown Activity Extractor
Processes TXT and MD files for activity extraction
"""
import re
from pathlib import Path
from typing import List, Dict
import sqlite3
from datetime import datetime
class TextActivityExtractor:
def __init__(self, db_path='data/activities.db'):
self.db_path = db_path
self.activity_patterns = {
'section_headers': [
r'^#{1,6}\s*(.+)$', # Markdown headers
r'^([A-Z][^\.]{10,100})$', # Plain titles
r'^\d+\.\s*(.+)$', # Numbered lists
r'^[•\-\*]\s*(.+)$', # Bullet points
],
'activity_markers': [
'joc:', 'activitate:', 'exercitiu:', 'team building:',
'nume:', 'titlu:', 'denumire:'
]
}
def extract_from_text(self, file_path: str) -> List[Dict]:
"""Extrage activitati din fisier text/markdown"""
activities = []
try:
with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
content = f.read()
# Method 1: look for markdown sections
if file_path.endswith('.md'):
activities.extend(self._extract_from_markdown(content, file_path))
# Method 2: look for general patterns
activities.extend(self._extract_from_patterns(content, file_path))
# Method 3: look for structured text blocks
activities.extend(self._extract_from_blocks(content, file_path))
except Exception as e:
print(f"Error processing {file_path}: {e}")
return activities
def _extract_from_markdown(self, content, source_file):
"""Extrage activitati din format markdown"""
activities = []
lines = content.split('\n')
current_activity = None
current_content = []
for line in lines:
# Check whether the line is an activity header
if re.match(r'^#{1,3}\s*(.+)', line):
# Save the previous activity if there is one
if current_activity and current_content:
current_activity['description'] = '\n'.join(current_content[:20]) # max 20 lines
activities.append(current_activity)
# Check whether the new header is an activity
header_text = re.sub(r'^#{1,3}\s*', '', line)
if any(marker in header_text.lower() for marker in ['joc', 'activitate', 'exercitiu']):
current_activity = {
'name': header_text[:200],
'source_file': str(source_file),
'category': '[A]'
}
current_content = []
else:
current_activity = None
elif current_activity:
# Add content to the current activity
if line.strip():
current_content.append(line)
# Save the last activity
if current_activity and current_content:
current_activity['description'] = '\n'.join(current_content[:20])
activities.append(current_activity)
return activities
def _extract_from_patterns(self, content, source_file):
"""Extrage folosind pattern matching"""
activities = []
# Look for specific activity markers
for marker in self.activity_patterns['activity_markers']:
pattern = re.compile(f'{re.escape(marker)}\\s*(.+?)(?=\\n\\n|{re.escape(marker)}|$)',
re.IGNORECASE | re.DOTALL)
matches = pattern.finditer(content)
for match in matches:
activity_text = match.group(1)
if len(activity_text) > 20:
activity = {
'name': activity_text.split('\n')[0][:200],
'description': activity_text[:1000],
'source_file': str(source_file),
'category': '[A]'
}
activities.append(activity)
return activities
def _extract_from_blocks(self, content, source_file):
"""Extrage din blocuri de text separate"""
activities = []
# Split into blocks separated by blank lines
blocks = re.split(r'\n\s*\n', content)
for block in blocks:
if len(block) > 50: # at least 50 characters
lines = block.strip().split('\n')
first_line = lines[0].strip()
# Check whether the block looks like an activity
if any(keyword in first_line.lower() for keyword in ['joc', 'activitate', 'exercitiu']):
activity = {
'name': first_line[:200],
'description': block[:1000],
'source_file': str(source_file),
'category': '[A]'
}
activities.append(activity)
return activities
def save_to_database(self, activities):
"""Salveaza in baza de date"""
conn = sqlite3.connect(self.db_path)
cursor = conn.cursor()
saved_count = 0
for activity in activities:
try:
# Check for duplicates
cursor.execute(
"SELECT id FROM activities WHERE name = ? AND source_file = ?",
(activity.get('name'), activity.get('source_file'))
)
if not cursor.fetchone():
columns = list(activity.keys())
values = list(activity.values())
placeholders = ['?' for _ in values]
query = f"INSERT INTO activities ({', '.join(columns)}) VALUES ({', '.join(placeholders)})"
cursor.execute(query, values)
saved_count += 1
except Exception as e:
print(f"Error saving: {e}")
conn.commit()
conn.close()
return saved_count
def process_all_text_files(self, base_path='/mnt/d/GoogleDrive/Cercetasi/carti-camp-jocuri'):
"""Proceseaza toate fisierele text si markdown"""
base_path = Path(base_path)
text_files = list(base_path.rglob("*.txt"))
md_files = list(base_path.rglob("*.md"))
all_files = text_files + md_files
print(f"Found {len(all_files)} text/markdown files")
all_activities = []
for file_path in all_files:
activities = self.extract_from_text(str(file_path))
all_activities.extend(activities)
print(f"Processed {file_path.name}: {len(activities)} activities")
# Save to database
saved = self.save_to_database(all_activities)
print(f"\nTotal saved: {saved} activities from {len(all_files)} files")
return len(all_files), saved
if __name__ == "__main__":
extractor = TextActivityExtractor()
extractor.process_all_text_files()


@@ -0,0 +1,151 @@
#!/usr/bin/env python3
"""
Unified Activity Processor
Orchestrates all extractors for end-to-end processing
"""
import time
from pathlib import Path
from html_extractor import HTMLActivityExtractor
from text_extractor import TextActivityExtractor
import sqlite3
class UnifiedProcessor:
def __init__(self, db_path='data/activities.db'):
self.db_path = db_path
self.html_extractor = HTMLActivityExtractor(db_path)
self.text_extractor = TextActivityExtractor(db_path)
self.stats = {
'html_processed': 0,
'text_processed': 0,
'pdf_to_process': 0,
'doc_to_process': 0,
'total_activities': 0,
'start_time': None,
'end_time': None
}
def get_current_activity_count(self):
"""Obine numrul curent de activiti din DB"""
conn = sqlite3.connect(self.db_path)
cursor = conn.cursor()
cursor.execute("SELECT COUNT(*) FROM activities")
count = cursor.fetchone()[0]
conn.close()
return count
def count_files_to_process(self, base_path):
"""Numr fiierele care trebuie procesate"""
base_path = Path(base_path)
counts = {
'html': len(list(base_path.rglob("*.html"))) + len(list(base_path.rglob("*.htm"))),
'txt': len(list(base_path.rglob("*.txt"))),
'md': len(list(base_path.rglob("*.md"))),
'pdf': len(list(base_path.rglob("*.pdf"))),
'doc': len(list(base_path.rglob("*.doc"))),
'docx': len(list(base_path.rglob("*.docx")))
}
return counts
def process_automated_formats(self, base_path='/mnt/d/GoogleDrive/Cercetasi/carti-camp-jocuri'):
"""Proceseaz toate formatele care pot fi automatizate"""
print("="*60)
print("UNIFIED ACTIVITY PROCESSOR - AUTOMATED PHASE")
print("="*60)
self.stats['start_time'] = time.time()
initial_count = self.get_current_activity_count()
# Show initial statistics
file_counts = self.count_files_to_process(base_path)
print(f"\nFiles to process:")
for format, count in file_counts.items():
print(f" {format.upper()}: {count} files")
print(f"\nCurrent activities in database: {initial_count}")
print("-"*60)
# PHASE 1: HTML processing (top priority, high volume)
print("\n[1/2] Processing HTML files...")
print("-"*40)
html_processed, html_errors = self.html_extractor.process_all_html_files(base_path)
self.stats['html_processed'] = html_processed
# PHASE 2: Text/MD processing
print("\n[2/2] Processing Text/Markdown files...")
print("-"*40)
text_processed, text_saved = self.text_extractor.process_all_text_files(base_path)
self.stats['text_processed'] = text_processed
# Final statistics
self.stats['end_time'] = time.time()
final_count = self.get_current_activity_count()
self.stats['total_activities'] = final_count - initial_count
# Identify the files that require manual processing
self.stats['pdf_to_process'] = file_counts['pdf']
self.stats['doc_to_process'] = file_counts['doc'] + file_counts['docx']
self.print_summary()
self.save_pdf_doc_list(base_path)
def print_summary(self):
"""Afieaz rezumatul procesrii"""
print("\n" + "="*60)
print("PROCESSING SUMMARY")
print("="*60)
duration = self.stats['end_time'] - self.stats['start_time']
print(f"\nAutomated Processing Results:")
print(f" HTML files processed: {self.stats['html_processed']}")
print(f" Text/MD files processed: {self.stats['text_processed']}")
print(f" New activities added: {self.stats['total_activities']}")
print(f" Processing time: {duration:.1f} seconds")
print(f"\nFiles requiring Claude processing:")
print(f" PDF files: {self.stats['pdf_to_process']}")
print(f" DOC/DOCX files: {self.stats['doc_to_process']}")
print("\n" + "="*60)
print("NEXT STEPS:")
print("1. Review the file 'pdf_doc_for_claude.txt' for manual processing")
print("2. Use Claude to extract activities from PDF/DOC files")
print("3. Focus on largest PDF files first (highest activity density)")
print("="*60)
def save_pdf_doc_list(self, base_path):
"""Salveaz lista de PDF/DOC pentru procesare cu Claude"""
base_path = Path(base_path)
pdf_files = sorted(base_path.rglob("*.pdf"), key=lambda p: p.stat().st_size, reverse=True)
doc_files = list(base_path.rglob("*.doc"))
docx_files = list(base_path.rglob("*.docx"))
with open('pdf_doc_for_claude.txt', 'w', encoding='utf-8') as f:
f.write("PDF/DOC FILES FOR CLAUDE PROCESSING\n")
f.write("="*60 + "\n")
f.write("Files sorted by size (largest first = likely more activities)\n\n")
f.write("TOP PRIORITY PDF FILES (process these first):\n")
f.write("-"*40 + "\n")
for i, pdf in enumerate(pdf_files[:20], 1):
size_mb = pdf.stat().st_size / (1024*1024)
f.write(f"{i}. {pdf.name} ({size_mb:.1f} MB)\n")
f.write(f" Path: {pdf}\n\n")
if len(pdf_files) > 20:
f.write(f"\n... and {len(pdf_files)-20} more PDF files\n\n")
f.write("\nDOC/DOCX FILES:\n")
f.write("-"*40 + "\n")
for doc in doc_files + docx_files:
size_kb = doc.stat().st_size / 1024
f.write(f"- {doc.name} ({size_kb:.1f} KB)\n")
print(f"\nPDF/DOC list saved to: pdf_doc_for_claude.txt")
if __name__ == "__main__":
processor = UnifiedProcessor()
processor.process_automated_formats()