Refactor extraction system and reorganize project structure
- Remove obsolete documentation files (DEPLOYMENT.md, PLAN_IMPLEMENTARE_S8_DETALIAT.md, README.md) - Add comprehensive extraction pipeline with multiple format support (PDF, HTML, text) - Implement Claude-based activity extraction with structured templates - Update dependencies and Docker configuration - Reorganize scripts directory with modular extraction components - Move example documentation to appropriate location 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
0
scripts/__init__.py
Normal file
0
scripts/__init__.py
Normal file
54
scripts/claude_extraction_template.md
Normal file
54
scripts/claude_extraction_template.md
Normal file
@@ -0,0 +1,54 @@
|
||||
# TEMPLATE PENTRU EXTRACȚIE ACTIVITĂȚI CU CLAUDE
|
||||
|
||||
## Instrucțiuni pentru Claude Code:
|
||||
|
||||
Pentru fiecare PDF/DOC, folosește următorul format de extracție:
|
||||
|
||||
### 1. Citește fișierul:
|
||||
```
|
||||
Claude, te rog citește fișierul: [CALE_FISIER]
|
||||
```
|
||||
|
||||
### 2. Extrage activitățile folosind acest template JSON:
|
||||
```json
|
||||
{
|
||||
"source_file": "[NUME_FISIER]",
|
||||
"activities": [
|
||||
{
|
||||
"name": "Numele activității",
|
||||
"description": "Descrierea completă a activității",
|
||||
"rules": "Regulile jocului/activității",
|
||||
"variations": "Variante sau adaptări",
|
||||
"category": "[A-H] bazat pe tip",
|
||||
"age_group_min": 6,
|
||||
"age_group_max": 14,
|
||||
"participants_min": 4,
|
||||
"participants_max": 20,
|
||||
"duration_min": 10,
|
||||
"duration_max": 30,
|
||||
"materials_list": "Lista materialelor necesare",
|
||||
"skills_developed": "Competențe dezvoltate",
|
||||
"difficulty_level": "Ușor/Mediu/Dificil",
|
||||
"keywords": "cuvinte cheie separate prin virgulă",
|
||||
"tags": "taguri relevante"
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
### 3. Salvează în fișier:
|
||||
După extracție, salvează JSON-ul în: `/scripts/extracted_activities/[NUME_FISIER].json`
|
||||
|
||||
### 4. Priorități de procesare:
|
||||
|
||||
**TOP PRIORITY (procesează primele):**
|
||||
1. 1000 Fantastic Scout Games.pdf
|
||||
2. Cartea Mare a jocurilor.pdf
|
||||
3. 160-de-activitati-dinamice-jocuri-pentru-team-building.pdf
|
||||
4. 101 Ways to Create an Unforgettable Camp Experience.pdf
|
||||
5. 151 Awesome Summer Camp Nature Activities.pdf
|
||||
|
||||
**Categorii de focus:**
|
||||
- [A] Jocuri Cercetășești
|
||||
- [C] Camping & Activități Exterior
|
||||
- [G] Activități Educaționale
|
||||
424
scripts/html_extractor.py
Normal file
424
scripts/html_extractor.py
Normal file
@@ -0,0 +1,424 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
HTML Activity Extractor - Proceseaz 1876 fiiere HTML
|
||||
Extrage automat activiti folosind pattern recognition
|
||||
"""
|
||||
|
||||
import os
|
||||
import re
|
||||
import json
|
||||
from pathlib import Path
|
||||
from bs4 import BeautifulSoup
|
||||
import chardet
|
||||
from typing import List, Dict, Optional
|
||||
import sqlite3
|
||||
from datetime import datetime
|
||||
|
||||
class HTMLActivityExtractor:
|
||||
def __init__(self, db_path='data/activities.db'):
|
||||
self.db_path = db_path
|
||||
# Pattern-uri pentru detectare activiti <20>n rom<6F>n
|
||||
self.activity_patterns = {
|
||||
'title_patterns': [
|
||||
r'(?i)(joc|activitate|exerci[t]iu|team[\s-]?building|energizer|ice[\s-]?breaker)[\s:]+([^\.]{5,100})',
|
||||
r'(?i)<h[1-6][^>]*>([^<]*(?:joc|activitate|exerci[t]iu)[^<]*)</h[1-6]>',
|
||||
r'(?i)<strong>([^<]*(?:joc|activitate|exerci[t]iu)[^<]*)</strong>',
|
||||
r'(?i)^[\d]+\.?\s*([A-Z][^\.]{10,100}(?:joc|activitate|exerci[t]iu)[^\.]{0,50})$',
|
||||
],
|
||||
'description_markers': [
|
||||
'descriere', 'reguli', 'cum se joac[a]', 'instructiuni',
|
||||
'obiectiv', 'desfasurare', 'explicatie', 'mod de joc'
|
||||
],
|
||||
'materials_markers': [
|
||||
'materiale', 'necesare', 'echipament', 'ce avem nevoie',
|
||||
'se folosesc', 'trebuie sa avem', 'dotari'
|
||||
],
|
||||
'age_patterns': [
|
||||
r'(?i)v[<5B>a]rst[a][\s:]+(\d+)[\s-]+(\d+)',
|
||||
r'(?i)(\d+)[\s-]+(\d+)\s*ani',
|
||||
r'(?i)pentru\s+(\d+)[\s-]+(\d+)\s*ani',
|
||||
r'(?i)categoria?\s*(?:de\s*)?v[<5B>a]rst[a][\s:]+(\d+)[\s-]+(\d+)',
|
||||
],
|
||||
'participants_patterns': [
|
||||
r'(?i)(\d+)[\s-]+(\d+)\s*(?:participan[t]i|juc[a]tori|persoane|copii)',
|
||||
r'(?i)num[a]r\s*(?:de\s*)?(?:participan[t]i|juc[a]tori)[\s:]+(\d+)[\s-]+(\d+)',
|
||||
r'(?i)grup\s*de\s*(\d+)[\s-]+(\d+)',
|
||||
],
|
||||
'duration_patterns': [
|
||||
r'(?i)durat[a][\s:]+(\d+)[\s-]+(\d+)\s*(?:minute|min)',
|
||||
r'(?i)timp[\s:]+(\d+)[\s-]+(\d+)\s*(?:minute|min)',
|
||||
r'(?i)(\d+)[\s-]+(\d+)\s*minute',
|
||||
]
|
||||
}
|
||||
|
||||
# Categorii predefinite bazate pe sistemul existent
|
||||
self.categories = {
|
||||
'[A]': ['joc', 'joaca', 'distractie', 'amuzament'],
|
||||
'[B]': ['aventura', 'explorare', 'descoperire'],
|
||||
'[C]': ['camping', 'tabara', 'excursie', 'drumetie'],
|
||||
'[D]': ['foc', 'flacara', 'lumina'],
|
||||
'[E]': ['noduri', 'fr<EFBFBD>nghii', 'sfori', 'legare'],
|
||||
'[F]': ['bushcraft', 'supravietuire', 'survival'],
|
||||
'[G]': ['educatie', 'educativ', 'invatare', 'scoala'],
|
||||
'[H]': ['orientare', 'busola', 'harta', 'navigare']
|
||||
}
|
||||
|
||||
def detect_encoding(self, file_path):
|
||||
"""Detecteaz encoding-ul fiierului"""
|
||||
with open(file_path, 'rb') as f:
|
||||
result = chardet.detect(f.read())
|
||||
return result['encoding'] or 'utf-8'
|
||||
|
||||
def extract_from_html(self, html_path: str) -> List[Dict]:
|
||||
"""Extrage activiti dintr-un singur fiier HTML"""
|
||||
activities = []
|
||||
|
||||
try:
|
||||
# Detectare encoding i citire
|
||||
encoding = self.detect_encoding(html_path)
|
||||
with open(html_path, 'r', encoding=encoding, errors='ignore') as f:
|
||||
content = f.read()
|
||||
|
||||
soup = BeautifulSoup(content, 'lxml')
|
||||
|
||||
# Metod 1: Caut liste de activiti
|
||||
activities.extend(self._extract_from_lists(soup, html_path))
|
||||
|
||||
# Metod 2: Caut activiti <20>n headings
|
||||
activities.extend(self._extract_from_headings(soup, html_path))
|
||||
|
||||
# Metod 3: Caut pattern-uri <20>n text
|
||||
activities.extend(self._extract_from_patterns(soup, html_path))
|
||||
|
||||
# Metod 4: Caut <20>n tabele
|
||||
activities.extend(self._extract_from_tables(soup, html_path))
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error processing {html_path}: {e}")
|
||||
|
||||
return activities
|
||||
|
||||
def _extract_from_lists(self, soup, source_file):
|
||||
"""Extrage activiti din liste HTML (ul, ol)"""
|
||||
activities = []
|
||||
|
||||
for list_elem in soup.find_all(['ul', 'ol']):
|
||||
# Verific dac lista pare s conin activiti
|
||||
list_text = list_elem.get_text().lower()
|
||||
if any(marker in list_text for marker in ['joc', 'activitate', 'exercitiu']):
|
||||
for li in list_elem.find_all('li'):
|
||||
text = li.get_text(strip=True)
|
||||
if len(text) > 20: # Minim 20 caractere pentru o activitate valid
|
||||
activity = self._create_activity_from_text(text, source_file)
|
||||
if activity:
|
||||
activities.append(activity)
|
||||
|
||||
return activities
|
||||
|
||||
def _extract_from_headings(self, soup, source_file):
|
||||
"""Extrage activiti bazate pe headings"""
|
||||
activities = []
|
||||
|
||||
for heading in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6']):
|
||||
heading_text = heading.get_text(strip=True)
|
||||
|
||||
# Verific dac heading-ul conine cuvinte cheie
|
||||
if any(keyword in heading_text.lower() for keyword in ['joc', 'activitate', 'exercitiu']):
|
||||
# Caut descrierea <20>n elementele urmtoare
|
||||
description = ""
|
||||
next_elem = heading.find_next_sibling()
|
||||
|
||||
while next_elem and next_elem.name not in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
|
||||
if next_elem.name in ['p', 'div', 'ul']:
|
||||
description += next_elem.get_text(strip=True) + " "
|
||||
if len(description) > 500: # Limit descriere
|
||||
break
|
||||
next_elem = next_elem.find_next_sibling()
|
||||
|
||||
if description:
|
||||
activity = {
|
||||
'name': heading_text[:200],
|
||||
'description': description[:1000],
|
||||
'source_file': str(source_file),
|
||||
'category': self._detect_category(heading_text + " " + description)
|
||||
}
|
||||
activities.append(activity)
|
||||
|
||||
return activities
|
||||
|
||||
def _extract_from_patterns(self, soup, source_file):
|
||||
"""Extrage activiti folosind pattern matching"""
|
||||
activities = []
|
||||
text = soup.get_text()
|
||||
|
||||
# Caut pattern-uri de activiti
|
||||
for pattern in self.activity_patterns['title_patterns']:
|
||||
matches = re.finditer(pattern, text, re.MULTILINE)
|
||||
for match in matches:
|
||||
title = match.group(0) if match.lastindex == 0 else match.group(match.lastindex)
|
||||
if len(title) > 10:
|
||||
# Extrage context <20>n jurul match-ului
|
||||
start = max(0, match.start() - 200)
|
||||
end = min(len(text), match.end() + 500)
|
||||
context = text[start:end]
|
||||
|
||||
activity = self._create_activity_from_text(context, source_file, title)
|
||||
if activity:
|
||||
activities.append(activity)
|
||||
|
||||
return activities
|
||||
|
||||
def _extract_from_tables(self, soup, source_file):
|
||||
"""Extrage activiti din tabele"""
|
||||
activities = []
|
||||
|
||||
for table in soup.find_all('table'):
|
||||
rows = table.find_all('tr')
|
||||
if len(rows) > 1: # Cel puin header i o linie de date
|
||||
# Detecteaz coloanele relevante
|
||||
headers = [th.get_text(strip=True).lower() for th in rows[0].find_all(['th', 'td'])]
|
||||
|
||||
for row in rows[1:]:
|
||||
cells = row.find_all(['td'])
|
||||
if cells:
|
||||
activity_data = {}
|
||||
for i, cell in enumerate(cells):
|
||||
if i < len(headers):
|
||||
activity_data[headers[i]] = cell.get_text(strip=True)
|
||||
|
||||
# Creeaz activitate din date tabel
|
||||
if any(key in activity_data for key in ['joc', 'activitate', 'nume', 'titlu']):
|
||||
activity = self._create_activity_from_table_data(activity_data, source_file)
|
||||
if activity:
|
||||
activities.append(activity)
|
||||
|
||||
return activities
|
||||
|
||||
def _create_activity_from_text(self, text, source_file, title=None):
|
||||
"""Creeaz un dicionar de activitate din text"""
|
||||
if not text or len(text) < 30:
|
||||
return None
|
||||
|
||||
activity = {
|
||||
'name': title or text[:100].split('.')[0].strip(),
|
||||
'description': text[:1000],
|
||||
'source_file': str(source_file),
|
||||
'category': self._detect_category(text),
|
||||
'keywords': self._extract_keywords(text),
|
||||
'created_at': datetime.now().isoformat()
|
||||
}
|
||||
|
||||
# Extrage metadata suplimentar
|
||||
activity.update(self._extract_metadata(text))
|
||||
|
||||
return activity
|
||||
|
||||
def _create_activity_from_table_data(self, data, source_file):
|
||||
"""Creeaz activitate din date de tabel"""
|
||||
activity = {
|
||||
'source_file': str(source_file),
|
||||
'created_at': datetime.now().isoformat()
|
||||
}
|
||||
|
||||
# Mapare c<>mpuri tabel la c<>mpuri DB
|
||||
field_mapping = {
|
||||
'nume': 'name', 'titlu': 'name', 'joc': 'name', 'activitate': 'name',
|
||||
'descriere': 'description', 'detalii': 'description', 'explicatie': 'description',
|
||||
'materiale': 'materials_list', 'echipament': 'materials_list',
|
||||
'varsta': 'age_group_min', 'categoria': 'category',
|
||||
'participanti': 'participants_min', 'numar': 'participants_min',
|
||||
'durata': 'duration_min', 'timp': 'duration_min'
|
||||
}
|
||||
|
||||
for table_field, db_field in field_mapping.items():
|
||||
if table_field in data:
|
||||
activity[db_field] = data[table_field]
|
||||
|
||||
# Validare minim
|
||||
if 'name' in activity and len(activity.get('name', '')) > 5:
|
||||
return activity
|
||||
|
||||
return None
|
||||
|
||||
def _extract_metadata(self, text):
|
||||
"""Extrage metadata din text folosind pattern-uri"""
|
||||
metadata = {}
|
||||
|
||||
# Extrage v<>rsta
|
||||
for pattern in self.activity_patterns['age_patterns']:
|
||||
match = re.search(pattern, text)
|
||||
if match:
|
||||
metadata['age_group_min'] = int(match.group(1))
|
||||
metadata['age_group_max'] = int(match.group(2)) if match.lastindex >= 2 else int(match.group(1))
|
||||
break
|
||||
|
||||
# Extrage numr participani
|
||||
for pattern in self.activity_patterns['participants_patterns']:
|
||||
match = re.search(pattern, text)
|
||||
if match:
|
||||
metadata['participants_min'] = int(match.group(1))
|
||||
metadata['participants_max'] = int(match.group(2)) if match.lastindex >= 2 else int(match.group(1))
|
||||
break
|
||||
|
||||
# Extrage durata
|
||||
for pattern in self.activity_patterns['duration_patterns']:
|
||||
match = re.search(pattern, text)
|
||||
if match:
|
||||
metadata['duration_min'] = int(match.group(1))
|
||||
metadata['duration_max'] = int(match.group(2)) if match.lastindex >= 2 else int(match.group(1))
|
||||
break
|
||||
|
||||
# Extrage materiale
|
||||
materials = []
|
||||
text_lower = text.lower()
|
||||
for marker in self.activity_patterns['materials_markers']:
|
||||
idx = text_lower.find(marker)
|
||||
if idx != -1:
|
||||
# Extrage urmtoarele 200 caractere dup marker
|
||||
materials_text = text[idx:idx+200]
|
||||
# Extrage items din list
|
||||
items = re.findall(r'[-"]\s*([^\n-"]+)', materials_text)
|
||||
if items:
|
||||
materials.extend(items)
|
||||
|
||||
if materials:
|
||||
metadata['materials_list'] = ', '.join(materials[:10]) # Maxim 10 materiale
|
||||
|
||||
return metadata
|
||||
|
||||
def _detect_category(self, text):
|
||||
"""Detecteaz categoria activitii bazat pe cuvinte cheie"""
|
||||
text_lower = text.lower()
|
||||
|
||||
for category, keywords in self.categories.items():
|
||||
if any(keyword in text_lower for keyword in keywords):
|
||||
return category
|
||||
|
||||
return '[A]' # Default categoria jocuri
|
||||
|
||||
def _extract_keywords(self, text):
|
||||
"""Extrage cuvinte cheie din text"""
|
||||
keywords = []
|
||||
text_lower = text.lower()
|
||||
|
||||
# Lista de cuvinte cheie relevante
|
||||
keyword_list = [
|
||||
'cooperare', 'competitie', 'echipa', 'creativitate', 'miscare',
|
||||
'strategie', 'comunicare', 'incredere', 'coordonare', 'atentie',
|
||||
'reflexe', 'logica', 'imaginatie', 'muzica', 'dans', 'sport',
|
||||
'natura', 'mediu', 'stiinta', 'matematica', 'limba', 'cultura'
|
||||
]
|
||||
|
||||
for keyword in keyword_list:
|
||||
if keyword in text_lower:
|
||||
keywords.append(keyword)
|
||||
|
||||
return ', '.join(keywords[:5]) # Maxim 5 keywords
|
||||
|
||||
def save_to_database(self, activities):
|
||||
"""Salveaz activitile <20>n baza de date"""
|
||||
conn = sqlite3.connect(self.db_path)
|
||||
cursor = conn.cursor()
|
||||
|
||||
saved_count = 0
|
||||
duplicate_count = 0
|
||||
|
||||
for activity in activities:
|
||||
try:
|
||||
# Verific duplicate
|
||||
cursor.execute(
|
||||
"SELECT id FROM activities WHERE name = ? AND source_file = ?",
|
||||
(activity.get('name'), activity.get('source_file'))
|
||||
)
|
||||
|
||||
if cursor.fetchone():
|
||||
duplicate_count += 1
|
||||
continue
|
||||
|
||||
# Pregtete valorile pentru insert
|
||||
columns = []
|
||||
values = []
|
||||
placeholders = []
|
||||
|
||||
for key, value in activity.items():
|
||||
if key != 'created_at': # Skip created_at, it has default
|
||||
columns.append(key)
|
||||
values.append(value)
|
||||
placeholders.append('?')
|
||||
|
||||
# Insert <20>n DB
|
||||
query = f"INSERT INTO activities ({', '.join(columns)}) VALUES ({', '.join(placeholders)})"
|
||||
cursor.execute(query, values)
|
||||
saved_count += 1
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error saving activity: {e}")
|
||||
continue
|
||||
|
||||
conn.commit()
|
||||
conn.close()
|
||||
|
||||
return saved_count, duplicate_count
|
||||
|
||||
def process_all_html_files(self, base_path='/mnt/d/GoogleDrive/Cercetasi/carti-camp-jocuri'):
|
||||
"""Proceseaz toate fiierele HTML din directorul specificat"""
|
||||
base_path = Path(base_path)
|
||||
html_files = list(base_path.rglob("*.html"))
|
||||
html_files.extend(list(base_path.rglob("*.htm")))
|
||||
|
||||
print(f"Found {len(html_files)} HTML files to process")
|
||||
|
||||
all_activities = []
|
||||
processed = 0
|
||||
errors = 0
|
||||
|
||||
for i, html_file in enumerate(html_files):
|
||||
try:
|
||||
activities = self.extract_from_html(str(html_file))
|
||||
all_activities.extend(activities)
|
||||
processed += 1
|
||||
|
||||
# Progress update
|
||||
if (i + 1) % 100 == 0:
|
||||
print(f"Progress: {i+1}/{len(html_files)} files processed, {len(all_activities)} activities found")
|
||||
# Save batch to DB
|
||||
if all_activities:
|
||||
saved, dupes = self.save_to_database(all_activities)
|
||||
print(f"Batch saved: {saved} new activities, {dupes} duplicates skipped")
|
||||
all_activities = [] # Clear buffer
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error processing {html_file}: {e}")
|
||||
errors += 1
|
||||
|
||||
# Save remaining activities
|
||||
if all_activities:
|
||||
saved, dupes = self.save_to_database(all_activities)
|
||||
print(f"Final batch saved: {saved} new activities, {dupes} duplicates skipped")
|
||||
|
||||
print(f"\nProcessing complete!")
|
||||
print(f"Files processed: {processed}")
|
||||
print(f"Errors: {errors}")
|
||||
|
||||
return processed, errors
|
||||
|
||||
# Funcie main pentru test
|
||||
if __name__ == "__main__":
|
||||
extractor = HTMLActivityExtractor()
|
||||
|
||||
# Test pe un fiier sample mai <20>nt<6E>i
|
||||
print("Testing on sample file first...")
|
||||
# Gsete un fiier HTML pentru test
|
||||
test_files = list(Path('/mnt/d/GoogleDrive/Cercetasi/carti-camp-jocuri').rglob("*.html"))[:3]
|
||||
|
||||
for test_file in test_files:
|
||||
print(f"\nTesting: {test_file}")
|
||||
activities = extractor.extract_from_html(str(test_file))
|
||||
print(f"Found {len(activities)} activities")
|
||||
if activities:
|
||||
print(f"Sample activity: {activities[0]['name'][:50]}...")
|
||||
|
||||
# <20>ntreab dac s continue cu procesarea complet
|
||||
response = input("\nContinue with full processing? (y/n): ")
|
||||
if response.lower() == 'y':
|
||||
extractor.process_all_html_files()
|
||||
78
scripts/import_claude_activities.py
Normal file
78
scripts/import_claude_activities.py
Normal file
@@ -0,0 +1,78 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Import activities extracted by Claude from JSON files
|
||||
"""
|
||||
|
||||
import json
|
||||
import sqlite3
|
||||
from pathlib import Path
|
||||
from datetime import datetime
|
||||
|
||||
class ClaudeActivityImporter:
|
||||
def __init__(self, db_path='data/activities.db'):
|
||||
self.db_path = db_path
|
||||
self.json_dir = Path('scripts/extracted_activities')
|
||||
self.json_dir.mkdir(exist_ok=True)
|
||||
|
||||
def import_json_file(self, json_path):
|
||||
"""Import activities from a single JSON file"""
|
||||
with open(json_path, 'r', encoding='utf-8') as f:
|
||||
data = json.load(f)
|
||||
|
||||
source_file = data.get('source_file', str(json_path))
|
||||
activities = data.get('activities', [])
|
||||
|
||||
conn = sqlite3.connect(self.db_path)
|
||||
cursor = conn.cursor()
|
||||
|
||||
imported = 0
|
||||
for activity in activities:
|
||||
try:
|
||||
# Add source file and timestamp
|
||||
activity['source_file'] = source_file
|
||||
activity['created_at'] = datetime.now().isoformat()
|
||||
|
||||
# Prepare insert
|
||||
columns = list(activity.keys())
|
||||
values = list(activity.values())
|
||||
placeholders = ['?' for _ in values]
|
||||
|
||||
# Check for duplicate
|
||||
cursor.execute(
|
||||
"SELECT id FROM activities WHERE name = ? AND source_file = ?",
|
||||
(activity.get('name'), source_file)
|
||||
)
|
||||
|
||||
if not cursor.fetchone():
|
||||
query = f"INSERT INTO activities ({', '.join(columns)}) VALUES ({', '.join(placeholders)})"
|
||||
cursor.execute(query, values)
|
||||
imported += 1
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error importing activity: {e}")
|
||||
|
||||
conn.commit()
|
||||
conn.close()
|
||||
|
||||
print(f"Imported {imported} activities from {json_path.name}")
|
||||
return imported
|
||||
|
||||
def import_all_json_files(self):
|
||||
"""Import all JSON files from the extracted_activities directory"""
|
||||
json_files = list(self.json_dir.glob("*.json"))
|
||||
|
||||
if not json_files:
|
||||
print("No JSON files found in extracted_activities directory")
|
||||
return 0
|
||||
|
||||
total_imported = 0
|
||||
for json_file in json_files:
|
||||
imported = self.import_json_file(json_file)
|
||||
total_imported += imported
|
||||
|
||||
print(f"\nTotal imported: {total_imported} activities from {len(json_files)} files")
|
||||
return total_imported
|
||||
|
||||
if __name__ == "__main__":
|
||||
importer = ClaudeActivityImporter()
|
||||
importer.import_all_json_files()
|
||||
@@ -1,200 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Data indexing script for INDEX-SISTEM-JOCURI v2.0
|
||||
Extracts activities from INDEX_MASTER and populates database
|
||||
"""
|
||||
|
||||
import sys
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
# Add app directory to Python path
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||
|
||||
from app.models.database import DatabaseManager
|
||||
from app.services.indexer import ActivityIndexer
|
||||
from app.config import Config
|
||||
import argparse
|
||||
import time
|
||||
|
||||
def main():
|
||||
"""Main indexing function"""
|
||||
parser = argparse.ArgumentParser(description='Index activities from INDEX_MASTER')
|
||||
parser.add_argument('--clear', action='store_true', help='Clear existing database before indexing')
|
||||
parser.add_argument('--category', help='Index specific category only (e.g., [A], [B], etc.)')
|
||||
parser.add_argument('--verify', action='store_true', help='Verify indexing quality after completion')
|
||||
parser.add_argument('--stats', action='store_true', help='Show database statistics only')
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
# Setup paths
|
||||
Config.ensure_directories()
|
||||
|
||||
# Database path
|
||||
db_path = os.environ.get('DATABASE_URL', str(Config.DATA_DIR / 'activities.db'))
|
||||
if db_path.startswith('sqlite:///'):
|
||||
db_path = db_path[10:] # Remove sqlite:/// prefix
|
||||
|
||||
# INDEX_MASTER path
|
||||
index_master_path = os.environ.get('INDEX_MASTER_FILE', str(Config.INDEX_MASTER_FILE))
|
||||
|
||||
print("🎯 INDEX-SISTEM-JOCURI v2.0 - Data Indexing")
|
||||
print("=" * 50)
|
||||
print(f"Database: {db_path}")
|
||||
print(f"INDEX_MASTER: {index_master_path}")
|
||||
print("=" * 50)
|
||||
|
||||
# Verify INDEX_MASTER file exists
|
||||
if not Path(index_master_path).exists():
|
||||
print(f"❌ INDEX_MASTER file not found: {index_master_path}")
|
||||
print(" Please ensure the file is mounted in the container or available locally")
|
||||
return 1
|
||||
|
||||
# Initialize services
|
||||
try:
|
||||
db_manager = DatabaseManager(db_path)
|
||||
indexer = ActivityIndexer(db_manager, index_master_path)
|
||||
except Exception as e:
|
||||
print(f"❌ Error initializing services: {e}")
|
||||
return 1
|
||||
|
||||
# Handle different operations
|
||||
if args.stats:
|
||||
return show_statistics(db_manager)
|
||||
|
||||
if args.category:
|
||||
return index_category(indexer, args.category)
|
||||
|
||||
if args.verify:
|
||||
return verify_indexing(indexer)
|
||||
|
||||
# Default: full indexing
|
||||
return full_indexing(indexer, args.clear)
|
||||
|
||||
def full_indexing(indexer: ActivityIndexer, clear_existing: bool) -> int:
|
||||
"""Perform full indexing of all activities"""
|
||||
|
||||
print("🚀 Starting full indexing process...")
|
||||
|
||||
try:
|
||||
# Perform indexing
|
||||
result = indexer.index_all_activities(clear_existing=clear_existing)
|
||||
|
||||
if not result.get('success'):
|
||||
print(f"❌ Indexing failed: {result.get('error', 'Unknown error')}")
|
||||
return 1
|
||||
|
||||
# Print results
|
||||
print("\n📊 INDEXING RESULTS")
|
||||
print("=" * 30)
|
||||
print(f"✅ Activities inserted: {result.get('inserted_count', 0)}")
|
||||
print(f"⏱️ Indexing time: {result.get('indexing_time_seconds', 0):.2f}s")
|
||||
|
||||
parsing_stats = result.get('parsing_stats', {})
|
||||
print(f"📈 Completion rate: {parsing_stats.get('completion_rate', 0):.1f}%")
|
||||
print(f"📝 Avg description length: {parsing_stats.get('average_description_length', 0):.0f} chars")
|
||||
|
||||
# Category breakdown
|
||||
categories = result.get('distribution', {}).get('categories', {})
|
||||
print(f"\n📂 CATEGORY BREAKDOWN:")
|
||||
for category, count in categories.items():
|
||||
print(f" {category}: {count} activities")
|
||||
|
||||
# Quality check
|
||||
if result.get('inserted_count', 0) >= 500:
|
||||
print(f"\n🎯 SUCCESS: Target of 500+ activities achieved!")
|
||||
else:
|
||||
print(f"\n⚠️ Warning: Only {result.get('inserted_count', 0)} activities indexed (target: 500+)")
|
||||
|
||||
return 0
|
||||
|
||||
except Exception as e:
|
||||
print(f"❌ Error during indexing: {e}")
|
||||
return 1
|
||||
|
||||
def index_category(indexer: ActivityIndexer, category_code: str) -> int:
|
||||
"""Index a specific category"""
|
||||
|
||||
print(f"🎯 Indexing category: {category_code}")
|
||||
|
||||
try:
|
||||
result = indexer.index_specific_category(category_code)
|
||||
|
||||
if not result.get('success'):
|
||||
print(f"❌ Category indexing failed: {result.get('error', 'Unknown error')}")
|
||||
return 1
|
||||
|
||||
print(f"✅ Category '{result.get('category')}' indexed successfully")
|
||||
print(f" Inserted: {result.get('inserted_count')} activities")
|
||||
print(f" Parsed: {result.get('total_parsed')} total")
|
||||
print(f" Valid: {result.get('valid_activities')} valid")
|
||||
|
||||
return 0
|
||||
|
||||
except Exception as e:
|
||||
print(f"❌ Error during category indexing: {e}")
|
||||
return 1
|
||||
|
||||
def verify_indexing(indexer: ActivityIndexer) -> int:
|
||||
"""Verify indexing quality"""
|
||||
|
||||
print("🔍 Verifying indexing quality...")
|
||||
|
||||
try:
|
||||
result = indexer.verify_indexing_quality()
|
||||
|
||||
if 'error' in result:
|
||||
print(f"❌ Verification error: {result['error']}")
|
||||
return 1
|
||||
|
||||
print("\n📊 QUALITY VERIFICATION")
|
||||
print("=" * 30)
|
||||
print(f"Total activities: {result.get('total_activities', 0)}")
|
||||
print(f"Meets minimum (500+): {'✅' if result.get('meets_minimum_requirement') else '❌'}")
|
||||
print(f"Category coverage: {result.get('category_coverage', 0)}/{result.get('expected_categories', 8)}")
|
||||
print(f"Quality score: {result.get('quality_score', 0)}/100")
|
||||
|
||||
quality_issues = result.get('quality_issues', [])
|
||||
if quality_issues:
|
||||
print(f"\n⚠️ Quality Issues:")
|
||||
for issue in quality_issues[:5]: # Show first 5 issues
|
||||
print(f" • {issue}")
|
||||
if len(quality_issues) > 5:
|
||||
print(f" ... and {len(quality_issues) - 5} more issues")
|
||||
else:
|
||||
print(f"\n✅ No quality issues detected")
|
||||
|
||||
return 0 if result.get('quality_score', 0) >= 80 else 1
|
||||
|
||||
except Exception as e:
|
||||
print(f"❌ Error during verification: {e}")
|
||||
return 1
|
||||
|
||||
def show_statistics(db_manager: DatabaseManager) -> int:
|
||||
"""Show database statistics"""
|
||||
|
||||
print("📊 Database Statistics")
|
||||
print("=" * 25)
|
||||
|
||||
try:
|
||||
stats = db_manager.get_statistics()
|
||||
|
||||
print(f"Total activities: {stats.get('total_activities', 0)}")
|
||||
print(f"Database size: {stats.get('database_size_bytes', 0) / 1024:.1f} KB")
|
||||
print(f"Database path: {stats.get('database_path', 'Unknown')}")
|
||||
|
||||
categories = stats.get('categories', {})
|
||||
if categories:
|
||||
print(f"\nCategories:")
|
||||
for category, count in categories.items():
|
||||
print(f" {category}: {count}")
|
||||
|
||||
return 0
|
||||
|
||||
except Exception as e:
|
||||
print(f"❌ Error getting statistics: {e}")
|
||||
return 1
|
||||
|
||||
if __name__ == '__main__':
|
||||
exit_code = main()
|
||||
sys.exit(exit_code)
|
||||
0
scripts/pdf_extractor.py
Normal file
0
scripts/pdf_extractor.py
Normal file
143
scripts/pdf_to_text_converter.py
Normal file
143
scripts/pdf_to_text_converter.py
Normal file
@@ -0,0 +1,143 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
PDF Mass Conversion to Text for Activity Extraction
|
||||
Handles all PDF sizes efficiently with multiple fallback methods
|
||||
"""
|
||||
|
||||
import os
|
||||
import json
|
||||
from pathlib import Path
|
||||
import PyPDF2
|
||||
import pdfplumber
|
||||
from typing import List, Dict
|
||||
import logging
|
||||
|
||||
class PDFConverter:
|
||||
def __init__(self, max_pages=50):
|
||||
self.max_pages = max_pages
|
||||
self.conversion_stats = {}
|
||||
|
||||
def convert_pdf_to_text(self, pdf_path: str) -> str:
|
||||
"""Convert PDF to text using multiple methods with fallbacks"""
|
||||
try:
|
||||
# Method 1: pdfplumber (best for tables and layout)
|
||||
return self._convert_with_pdfplumber(pdf_path)
|
||||
except Exception as e:
|
||||
print(f"pdfplumber failed for {pdf_path}: {e}")
|
||||
|
||||
try:
|
||||
# Method 2: PyPDF2 (fallback)
|
||||
return self._convert_with_pypdf2(pdf_path)
|
||||
except Exception as e2:
|
||||
print(f"PyPDF2 also failed for {pdf_path}: {e2}")
|
||||
return ""
|
||||
|
||||
def _convert_with_pdfplumber(self, pdf_path: str) -> str:
|
||||
"""Primary conversion method using pdfplumber"""
|
||||
text_content = ""
|
||||
|
||||
with pdfplumber.open(pdf_path) as pdf:
|
||||
total_pages = len(pdf.pages)
|
||||
pages_to_process = min(total_pages, self.max_pages)
|
||||
|
||||
print(f" Converting {pdf_path}: {pages_to_process}/{total_pages} pages")
|
||||
|
||||
for i, page in enumerate(pdf.pages[:pages_to_process]):
|
||||
try:
|
||||
page_text = page.extract_text()
|
||||
if page_text:
|
||||
text_content += f"\n--- PAGE {i+1} ---\n"
|
||||
text_content += page_text
|
||||
text_content += "\n"
|
||||
except Exception as e:
|
||||
print(f" Error on page {i+1}: {e}")
|
||||
continue
|
||||
|
||||
self.conversion_stats[pdf_path] = {
|
||||
'method': 'pdfplumber',
|
||||
'pages_processed': pages_to_process,
|
||||
'total_pages': total_pages,
|
||||
'success': True,
|
||||
'text_length': len(text_content)
|
||||
}
|
||||
|
||||
return text_content
|
||||
|
||||
def _convert_with_pypdf2(self, pdf_path: str) -> str:
|
||||
"""Fallback conversion method using PyPDF2"""
|
||||
text_content = ""
|
||||
|
||||
with open(pdf_path, 'rb') as file:
|
||||
reader = PyPDF2.PdfReader(file)
|
||||
total_pages = len(reader.pages)
|
||||
pages_to_process = min(total_pages, self.max_pages)
|
||||
|
||||
print(f" Converting {pdf_path} (fallback): {pages_to_process}/{total_pages} pages")
|
||||
|
||||
for i in range(pages_to_process):
|
||||
try:
|
||||
page = reader.pages[i]
|
||||
page_text = page.extract_text()
|
||||
if page_text:
|
||||
text_content += f"\n--- PAGE {i+1} ---\n"
|
||||
text_content += page_text
|
||||
text_content += "\n"
|
||||
except Exception as e:
|
||||
print(f" Error on page {i+1}: {e}")
|
||||
continue
|
||||
|
||||
self.conversion_stats[pdf_path] = {
|
||||
'method': 'PyPDF2',
|
||||
'pages_processed': pages_to_process,
|
||||
'total_pages': total_pages,
|
||||
'success': True,
|
||||
'text_length': len(text_content)
|
||||
}
|
||||
|
||||
return text_content
|
||||
|
||||
def convert_all_pdfs(self, pdf_directory: str, output_directory: str):
    """Convert every PDF under *pdf_directory* into a UTF-8 ``.txt`` file.

    Each output file is written to *output_directory* (created if missing)
    with a small provenance header (source path + conversion date) followed
    by the extracted text.  Per-file statistics collected during conversion
    are dumped to ``conversion_stats.json`` in the same directory.

    Returns the number of PDFs whose conversion succeeded according to
    ``self.conversion_stats``.
    """
    # Local import keeps this fix self-contained; the rest of the module
    # does not need datetime.
    from datetime import date

    pdf_files = list(Path(pdf_directory).glob("**/*.pdf"))

    print(f"🔄 Converting {len(pdf_files)} PDF files to text...")

    os.makedirs(output_directory, exist_ok=True)

    for i, pdf_path in enumerate(pdf_files):
        print(f"\n[{i+1}/{len(pdf_files)}] Processing {pdf_path.name}...")

        # Convert to text (tries pdfplumber first, then the PyPDF2 fallback).
        text_content = self.convert_pdf_to_text(str(pdf_path))

        if text_content.strip():
            # Save as text file
            output_file = Path(output_directory) / f"{pdf_path.stem}.txt"
            with open(output_file, 'w', encoding='utf-8') as f:
                f.write(f"SOURCE: {pdf_path}\n")
                # BUG FIX: the conversion date was hard-coded as "2025-01-11";
                # stamp the actual run date instead.
                f.write(f"CONVERTED: {date.today().isoformat()}\n")
                f.write("="*50 + "\n\n")
                f.write(text_content)

            print(f" ✅ Saved: {output_file}")
        else:
            print(f" ❌ No text extracted from {pdf_path.name}")

    # Save conversion statistics alongside the converted files.
    stats_file = Path(output_directory) / "conversion_stats.json"
    with open(stats_file, 'w', encoding='utf-8') as f:
        json.dump(self.conversion_stats, f, indent=2, ensure_ascii=False)

    print(f"\n🎉 PDF conversion complete! Check {output_directory}")
    return len([f for f in self.conversion_stats.values() if f['success']])
# Script entry point: convert the whole scouting-games PDF library to text.
if __name__ == "__main__":
    SOURCE_DIR = "/mnt/d/GoogleDrive/Cercetasi/carti-camp-jocuri"
    TARGET_DIR = "/mnt/d/GoogleDrive/Cercetasi/carti-camp-jocuri/INDEX-SISTEM-JOCURI/converted_pdfs"

    pdf_converter = PDFConverter(max_pages=50)
    total_converted = pdf_converter.convert_all_pdfs(SOURCE_DIR, TARGET_DIR)
    print(f"Final result: {total_converted} PDFs successfully converted")
50
scripts/run_extraction.py
Normal file
50
scripts/run_extraction.py
Normal file
@@ -0,0 +1,50 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Main extraction orchestrator
|
||||
Ruleaza intregul proces de extractie
|
||||
"""
|
||||
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
from unified_processor import UnifiedProcessor
|
||||
from import_claude_activities import ClaudeActivityImporter
|
||||
|
||||
def main():
    """Drive the S8 hybrid extraction workflow end to end.

    Phase 1 runs the fully automated extractors, phase 2 pauses for
    manual Claude processing of PDF/DOC files, and phase 3 imports the
    JSON files Claude produced — but only after the operator confirms.
    """
    heavy_rule = "=" * 60
    light_rule = "-" * 40

    print(heavy_rule)
    print("ACTIVITY EXTRACTION SYSTEM")
    print("Strategy S8: Hybrid Claude + Scripts")
    print(heavy_rule)

    # Phase 1: automated extraction (HTML + text/markdown).
    print("\nSTEP 1: Automated Extraction")
    print(light_rule)
    UnifiedProcessor().process_automated_formats()

    # Phase 2: hand-off to the operator for manual Claude processing.
    print("\n" + heavy_rule)
    print("STEP 2: Manual Claude Processing Required")
    print(light_rule)
    print("Please process PDF/DOC files with Claude using the template.")
    print("Files are listed in: pdf_doc_for_claude.txt")
    print("Save extracted activities as JSON in: scripts/extracted_activities/")
    print(heavy_rule)

    answer = input("\nHave you completed Claude processing? (y/n): ")
    if answer.lower() != 'y':
        # Operator not ready — stop here; phase 3 can be re-run later.
        return

    # Phase 3: import the Claude-extracted JSON activities.
    print("\nSTEP 3: Importing Claude-extracted activities")
    print(light_rule)
    ClaudeActivityImporter().import_all_json_files()

    print("\n" + heavy_rule)
    print("EXTRACTION COMPLETE!")
    print(heavy_rule)
||||
# Script entry point.
if __name__ == "__main__":
    main()
|
||||
197
scripts/text_extractor.py
Normal file
197
scripts/text_extractor.py
Normal file
@@ -0,0 +1,197 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Text/Markdown Activity Extractor
|
||||
Proceseaza fisiere TXT si MD pentru extractie activitati
|
||||
"""
|
||||
|
||||
import re
|
||||
from pathlib import Path
|
||||
from typing import List, Dict
|
||||
import sqlite3
|
||||
from datetime import datetime
|
||||
|
||||
class TextActivityExtractor:
    """Extract candidate activities from plain-text and Markdown files.

    Three heuristics are applied to each file: Markdown-header scanning
    (``.md`` only), activity-marker pattern matching, and blank-line
    separated block scanning.  Extracted activities are dictionaries
    (``name``, ``description``, ``source_file``, ``category``) that are
    de-duplicated on (name, source_file) when saved to SQLite.
    """

    def __init__(self, db_path: str = 'data/activities.db'):
        # Path to the SQLite database containing the `activities` table.
        self.db_path = db_path
        # Heuristic patterns; `section_headers` is kept for reference,
        # `activity_markers` drives _extract_from_patterns().
        self.activity_patterns = {
            'section_headers': [
                r'^#{1,6}\s*(.+)$',  # Markdown headers
                r'^([A-Z][^\.]{10,100})$',  # plain title lines
                r'^\d+\.\s*(.+)$',  # numbered lists
                r'^[•\-\*]\s*(.+)$',  # bullet points
            ],
            'activity_markers': [
                'joc:', 'activitate:', 'exercitiu:', 'team building:',
                'nume:', 'titlu:', 'denumire:'
            ]
        }

    def extract_from_text(self, file_path: str) -> List[Dict]:
        """Extract activities from a text/markdown file.

        Runs all applicable heuristics and concatenates their results;
        the same activity may therefore appear more than once here —
        duplicates are filtered out later by save_to_database().
        Returns an empty list (after printing the error) on any failure.
        """
        activities = []

        try:
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                content = f.read()

            # Method 1: Markdown section headers (only meaningful for .md).
            if file_path.endswith('.md'):
                activities.extend(self._extract_from_markdown(content, file_path))

            # Method 2: explicit activity markers ("joc:", "titlu:", ...).
            activities.extend(self._extract_from_patterns(content, file_path))

            # Method 3: blank-line separated blocks of structured text.
            activities.extend(self._extract_from_blocks(content, file_path))

        except Exception as e:
            print(f"Error processing {file_path}: {e}")

        return activities

    def _extract_from_markdown(self, content, source_file):
        """Extract activities from Markdown headers (levels 1-3).

        A header whose text mentions joc/activitate/exercitiu starts a new
        activity; subsequent non-empty lines (max 20) become its description.
        """
        activities = []
        lines = content.split('\n')

        current_activity = None
        current_content = []

        for line in lines:
            # Is this line a level 1-3 header?
            if re.match(r'^#{1,3}\s*(.+)', line):
                # Flush the previous activity, if any content was gathered.
                if current_activity and current_content:
                    current_activity['description'] = '\n'.join(current_content[:20])  # cap at 20 lines
                    activities.append(current_activity)

                # Does the new header look like an activity?
                header_text = re.sub(r'^#{1,3}\s*', '', line)
                if any(marker in header_text.lower() for marker in ['joc', 'activitate', 'exercitiu']):
                    current_activity = {
                        'name': header_text[:200],
                        'source_file': str(source_file),
                        'category': '[A]'
                    }
                    current_content = []
                else:
                    current_activity = None

            elif current_activity:
                # Accumulate body text for the activity in progress.
                if line.strip():
                    current_content.append(line)

        # Flush the final activity.
        if current_activity and current_content:
            current_activity['description'] = '\n'.join(current_content[:20])
            activities.append(current_activity)

        return activities

    def _extract_from_patterns(self, content, source_file):
        """Extract activities via explicit marker patterns ("joc:", ...)."""
        activities = []

        # Each marker captures everything up to a blank line, the next
        # occurrence of the same marker, or end-of-text.
        for marker in self.activity_patterns['activity_markers']:
            pattern = re.compile(f'{re.escape(marker)}\\s*(.+?)(?=\\n\\n|{re.escape(marker)}|$)',
                                 re.IGNORECASE | re.DOTALL)
            matches = pattern.finditer(content)

            for match in matches:
                activity_text = match.group(1)
                if len(activity_text) > 20:  # skip trivially short matches
                    activity = {
                        'name': activity_text.split('\n')[0][:200],
                        'description': activity_text[:1000],
                        'source_file': str(source_file),
                        'category': '[A]'
                    }
                    activities.append(activity)

        return activities

    def _extract_from_blocks(self, content, source_file):
        """Extract activities from blank-line separated text blocks."""
        activities = []

        # Split on runs of blank lines.
        blocks = re.split(r'\n\s*\n', content)

        for block in blocks:
            if len(block) > 50:  # require at least 50 characters
                lines = block.strip().split('\n')
                first_line = lines[0].strip()

                # The block looks like an activity if its first line
                # mentions one of the activity keywords.
                if any(keyword in first_line.lower() for keyword in ['joc', 'activitate', 'exercitiu']):
                    activity = {
                        'name': first_line[:200],
                        'description': block[:1000],
                        'source_file': str(source_file),
                        'category': '[A]'
                    }
                    activities.append(activity)

        return activities

    def save_to_database(self, activities):
        """Insert *activities* into the DB, skipping (name, source_file) duplicates.

        Returns the number of rows actually inserted.  Per-row failures are
        printed and skipped so one bad record cannot abort the batch.
        """
        conn = sqlite3.connect(self.db_path)
        saved_count = 0

        # BUG FIX: the connection previously leaked if an execute/commit
        # raised; try/finally guarantees it is closed.
        try:
            cursor = conn.cursor()

            for activity in activities:
                try:
                    # Skip records already present for the same source file.
                    cursor.execute(
                        "SELECT id FROM activities WHERE name = ? AND source_file = ?",
                        (activity.get('name'), activity.get('source_file'))
                    )

                    if not cursor.fetchone():
                        # Build the INSERT from the activity's own keys
                        # (keys are produced internally, not user input).
                        columns = list(activity.keys())
                        values = list(activity.values())
                        placeholders = ['?' for _ in values]

                        query = f"INSERT INTO activities ({', '.join(columns)}) VALUES ({', '.join(placeholders)})"
                        cursor.execute(query, values)
                        saved_count += 1

                except Exception as e:
                    print(f"Error saving: {e}")

            conn.commit()
        finally:
            conn.close()

        return saved_count

    def process_all_text_files(self, base_path='/mnt/d/GoogleDrive/Cercetasi/carti-camp-jocuri'):
        """Process every .txt and .md file under *base_path*.

        Returns a tuple ``(files_processed, activities_saved)``.
        """
        base_path = Path(base_path)

        text_files = list(base_path.rglob("*.txt"))
        md_files = list(base_path.rglob("*.md"))
        all_files = text_files + md_files

        print(f"Found {len(all_files)} text/markdown files")

        all_activities = []

        for file_path in all_files:
            activities = self.extract_from_text(str(file_path))
            all_activities.extend(activities)
            print(f"Processed {file_path.name}: {len(activities)} activities")

        # Persist everything in one batch (duplicates filtered there).
        saved = self.save_to_database(all_activities)
        print(f"\nTotal saved: {saved} activities from {len(all_files)} files")

        return len(all_files), saved
|
||||
# Script entry point: scan the default library path for text/markdown files.
if __name__ == "__main__":
    extractor = TextActivityExtractor()
    extractor.process_all_text_files()
|
||||
151
scripts/unified_processor.py
Normal file
151
scripts/unified_processor.py
Normal file
@@ -0,0 +1,151 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Unified Activity Processor
|
||||
Orchestreaz toate extractoarele pentru procesare complet
|
||||
"""
|
||||
|
||||
import time
|
||||
from pathlib import Path
|
||||
from html_extractor import HTMLActivityExtractor
|
||||
from text_extractor import TextActivityExtractor
|
||||
import sqlite3
|
||||
|
||||
class UnifiedProcessor:
    """Orchestrates the automated extractors (HTML + text/Markdown).

    Runs both extractors over a directory tree, tracks before/after
    activity counts in the shared SQLite database, and writes a
    prioritized list of PDF/DOC files left for manual Claude processing.
    """

    def __init__(self, db_path='data/activities.db'):
        # Shared SQLite database used by every extractor.
        self.db_path = db_path
        self.html_extractor = HTMLActivityExtractor(db_path)
        self.text_extractor = TextActivityExtractor(db_path)
        # Aggregate run statistics, populated by process_automated_formats().
        self.stats = {
            'html_processed': 0,
            'text_processed': 0,
            'pdf_to_process': 0,
            'doc_to_process': 0,
            'total_activities': 0,
            'start_time': None,
            'end_time': None
        }

    def get_current_activity_count(self):
        """Return the current number of rows in the activities table."""
        conn = sqlite3.connect(self.db_path)
        cursor = conn.cursor()
        cursor.execute("SELECT COUNT(*) FROM activities")
        count = cursor.fetchone()[0]
        conn.close()
        return count

    def count_files_to_process(self, base_path):
        """Count the files of each supported format under *base_path*."""
        base_path = Path(base_path)

        counts = {
            'html': len(list(base_path.rglob("*.html"))) + len(list(base_path.rglob("*.htm"))),
            'txt': len(list(base_path.rglob("*.txt"))),
            'md': len(list(base_path.rglob("*.md"))),
            'pdf': len(list(base_path.rglob("*.pdf"))),
            'doc': len(list(base_path.rglob("*.doc"))),
            'docx': len(list(base_path.rglob("*.docx")))
        }

        return counts

    def process_automated_formats(self, base_path='/mnt/d/GoogleDrive/Cercetasi/carti-camp-jocuri'):
        """Process every format that can be handled without Claude.

        Runs the HTML extractor, then the text/Markdown extractor, prints a
        summary, and writes the PDF/DOC worklist for manual processing.
        """
        print("="*60)
        print("UNIFIED ACTIVITY PROCESSOR - AUTOMATED PHASE")
        print("="*60)

        self.stats['start_time'] = time.time()
        initial_count = self.get_current_activity_count()

        # Show the initial statistics before processing starts.
        file_counts = self.count_files_to_process(base_path)
        print(f"\nFiles to process:")
        for format, count in file_counts.items():
            print(f" {format.upper()}: {count} files")
        print(f"\nCurrent activities in database: {initial_count}")
        print("-"*60)

        # PHASE 1: HTML processing (highest priority - largest volume).
        print("\n[1/2] Processing HTML files...")
        print("-"*40)
        html_processed, html_errors = self.html_extractor.process_all_html_files(base_path)
        self.stats['html_processed'] = html_processed

        # PHASE 2: text/Markdown processing.
        print("\n[2/2] Processing Text/Markdown files...")
        print("-"*40)
        text_processed, text_saved = self.text_extractor.process_all_text_files(base_path)
        self.stats['text_processed'] = text_processed

        # Final statistics: new activities = delta of the DB row count.
        self.stats['end_time'] = time.time()
        final_count = self.get_current_activity_count()
        self.stats['total_activities'] = final_count - initial_count

        # Identify the files that still require manual processing.
        self.stats['pdf_to_process'] = file_counts['pdf']
        self.stats['doc_to_process'] = file_counts['doc'] + file_counts['docx']

        self.print_summary()
        self.save_pdf_doc_list(base_path)

    def print_summary(self):
        """Print the processing summary gathered in self.stats."""
        print("\n" + "="*60)
        print("PROCESSING SUMMARY")
        print("="*60)

        duration = self.stats['end_time'] - self.stats['start_time']

        print(f"\nAutomated Processing Results:")
        print(f" HTML files processed: {self.stats['html_processed']}")
        print(f" Text/MD files processed: {self.stats['text_processed']}")
        print(f" New activities added: {self.stats['total_activities']}")
        print(f" Processing time: {duration:.1f} seconds")

        print(f"\nFiles requiring Claude processing:")
        print(f" PDF files: {self.stats['pdf_to_process']}")
        print(f" DOC/DOCX files: {self.stats['doc_to_process']}")

        print("\n" + "="*60)
        print("NEXT STEPS:")
        print("1. Review the file 'pdf_doc_for_claude.txt' for manual processing")
        print("2. Use Claude to extract activities from PDF/DOC files")
        print("3. Focus on largest PDF files first (highest activity density)")
        print("="*60)

    def save_pdf_doc_list(self, base_path):
        """Write the PDF/DOC worklist for Claude processing.

        PDFs are sorted by size descending (larger files are assumed to
        contain more activities); only the top 20 are itemized.  The list
        is written to 'pdf_doc_for_claude.txt' in the current directory.
        """
        base_path = Path(base_path)

        pdf_files = sorted(base_path.rglob("*.pdf"), key=lambda p: p.stat().st_size, reverse=True)
        doc_files = list(base_path.rglob("*.doc"))
        docx_files = list(base_path.rglob("*.docx"))

        with open('pdf_doc_for_claude.txt', 'w', encoding='utf-8') as f:
            f.write("PDF/DOC FILES FOR CLAUDE PROCESSING\n")
            f.write("="*60 + "\n")
            f.write("Files sorted by size (largest first = likely more activities)\n\n")

            f.write("TOP PRIORITY PDF FILES (process these first):\n")
            f.write("-"*40 + "\n")
            for i, pdf in enumerate(pdf_files[:20], 1):
                size_mb = pdf.stat().st_size / (1024*1024)
                f.write(f"{i}. {pdf.name} ({size_mb:.1f} MB)\n")
                f.write(f" Path: {pdf}\n\n")

            if len(pdf_files) > 20:
                f.write(f"\n... and {len(pdf_files)-20} more PDF files\n\n")

            f.write("\nDOC/DOCX FILES:\n")
            f.write("-"*40 + "\n")
            for doc in doc_files + docx_files:
                size_kb = doc.stat().st_size / 1024
                f.write(f"- {doc.name} ({size_kb:.1f} KB)\n")

        print(f"\nPDF/DOC list saved to: pdf_doc_for_claude.txt")
||||
# Script entry point: run the full automated extraction phase.
if __name__ == "__main__":
    processor = UnifiedProcessor()
    processor.process_automated_formats()
|
||||
Reference in New Issue
Block a user