Refactor extraction system and reorganize project structure
- Remove obsolete documentation files (DEPLOYMENT.md, PLAN_IMPLEMENTARE_S8_DETALIAT.md, README.md)
- Add comprehensive extraction pipeline with multiple format support (PDF, HTML, text)
- Implement Claude-based activity extraction with structured templates
- Update dependencies and Docker configuration
- Reorganize scripts directory with modular extraction components
- Move example documentation to appropriate location

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
424
scripts/html_extractor.py
Normal file
424
scripts/html_extractor.py
Normal file
@@ -0,0 +1,424 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
HTML Activity Extractor - Proceseaz 1876 fiiere HTML
|
||||
Extrage automat activiti folosind pattern recognition
|
||||
"""
|
||||
|
||||
import os
|
||||
import re
|
||||
import json
|
||||
from pathlib import Path
|
||||
from bs4 import BeautifulSoup
|
||||
import chardet
|
||||
from typing import List, Dict, Optional
|
||||
import sqlite3
|
||||
from datetime import datetime
|
||||
|
||||
class HTMLActivityExtractor:
    """Extract camp/scouting activities from a corpus of HTML files.

    Activities are recognised with Romanian-language keyword heuristics and
    regex patterns — HTML lists, headings, free text and tables are all
    scanned — and the results are persisted into an SQLite ``activities``
    table, skipping (name, source_file) duplicates.
    """

    def __init__(self, db_path: str = 'data/activities.db'):
        """Initialise detection patterns and the database path.

        Args:
            db_path: Path to the SQLite database containing the
                ``activities`` table (the table itself is created elsewhere).
        """
        self.db_path = db_path

        # Regex patterns for detecting activities in Romanian text.
        # NOTE(review): bracketed single letters such as ``exerci[t]iu`` and
        # ``durat[a]`` appear to be diacritic-folded forms (t ~ ț, a ~ ă);
        # confirm against the corpus before extending them.
        self.activity_patterns = {
            'title_patterns': [
                r'(?i)(joc|activitate|exerci[t]iu|team[\s-]?building|energizer|ice[\s-]?breaker)[\s:]+([^\.]{5,100})',
                r'(?i)<h[1-6][^>]*>([^<]*(?:joc|activitate|exerci[t]iu)[^<]*)</h[1-6]>',
                r'(?i)<strong>([^<]*(?:joc|activitate|exerci[t]iu)[^<]*)</strong>',
                r'(?i)^[\d]+\.?\s*([A-Z][^\.]{10,100}(?:joc|activitate|exerci[t]iu)[^\.]{0,50})$',
            ],
            'description_markers': [
                'descriere', 'reguli', 'cum se joac[a]', 'instructiuni',
                'obiectiv', 'desfasurare', 'explicatie', 'mod de joc'
            ],
            'materials_markers': [
                'materiale', 'necesare', 'echipament', 'ce avem nevoie',
                'se folosesc', 'trebuie sa avem', 'dotari'
            ],
            'age_patterns': [
                # ``v[aâ]rst[a]`` accepts both the ASCII-folded and the
                # accented spelling of "varsta"; the original pattern carried a
                # mis-encoded byte in this character class.
                r'(?i)v[aâ]rst[a][\s:]+(\d+)[\s-]+(\d+)',
                r'(?i)(\d+)[\s-]+(\d+)\s*ani',
                r'(?i)pentru\s+(\d+)[\s-]+(\d+)\s*ani',
                r'(?i)categoria?\s*(?:de\s*)?v[aâ]rst[a][\s:]+(\d+)[\s-]+(\d+)',
            ],
            'participants_patterns': [
                r'(?i)(\d+)[\s-]+(\d+)\s*(?:participan[t]i|juc[a]tori|persoane|copii)',
                r'(?i)num[a]r\s*(?:de\s*)?(?:participan[t]i|juc[a]tori)[\s:]+(\d+)[\s-]+(\d+)',
                r'(?i)grup\s*de\s*(\d+)[\s-]+(\d+)',
            ],
            'duration_patterns': [
                r'(?i)durat[a][\s:]+(\d+)[\s-]+(\d+)\s*(?:minute|min)',
                r'(?i)timp[\s:]+(\d+)[\s-]+(\d+)\s*(?:minute|min)',
                r'(?i)(\d+)[\s-]+(\d+)\s*minute',
            ]
        }

        # Predefined category codes mapped to trigger keywords, mirroring the
        # existing classification system. First matching category wins.
        self.categories = {
            '[A]': ['joc', 'joaca', 'distractie', 'amuzament'],
            '[B]': ['aventura', 'explorare', 'descoperire'],
            '[C]': ['camping', 'tabara', 'excursie', 'drumetie'],
            # 'franghii' (ropes) was mojibake in the original source and could
            # never match; restored in ASCII-folded form like its neighbours.
            '[E]' if False else '[D]': ['foc', 'flacara', 'lumina'],
            '[E]': ['noduri', 'franghii', 'sfori', 'legare'],
            '[F]': ['bushcraft', 'supravietuire', 'survival'],
            '[G]': ['educatie', 'educativ', 'invatare', 'scoala'],
            '[H]': ['orientare', 'busola', 'harta', 'navigare']
        }

    def detect_encoding(self, file_path):
        """Return the detected character encoding of *file_path*.

        Falls back to ``'utf-8'`` when chardet cannot decide.
        """
        with open(file_path, 'rb') as f:
            result = chardet.detect(f.read())
        return result['encoding'] or 'utf-8'

    def extract_from_html(self, html_path: str) -> List[Dict]:
        """Extract all candidate activities from a single HTML file.

        Runs four independent extraction strategies (lists, headings, raw-text
        patterns, tables) and concatenates their results. Any parsing failure
        is reported and yields an empty/partial result rather than raising —
        the corpus contains many malformed files, so this is deliberately
        best-effort.
        """
        activities = []

        try:
            # Detect the encoding first; many corpus files are not UTF-8.
            encoding = self.detect_encoding(html_path)
            with open(html_path, 'r', encoding=encoding, errors='ignore') as f:
                content = f.read()

            soup = BeautifulSoup(content, 'lxml')

            # Strategy 1: activity-bearing HTML lists.
            activities.extend(self._extract_from_lists(soup, html_path))

            # Strategy 2: activities introduced by headings.
            activities.extend(self._extract_from_headings(soup, html_path))

            # Strategy 3: regex patterns over the page's raw text.
            activities.extend(self._extract_from_patterns(soup, html_path))

            # Strategy 4: tabular activity listings.
            activities.extend(self._extract_from_tables(soup, html_path))

        except Exception as e:
            print(f"Error processing {html_path}: {e}")

        return activities

    def _extract_from_lists(self, soup, source_file):
        """Extract activities from HTML lists (``ul``/``ol``).

        A list qualifies when its text mentions an activity keyword; each
        sufficiently long item then becomes one candidate activity.
        """
        activities = []

        for list_elem in soup.find_all(['ul', 'ol']):
            list_text = list_elem.get_text().lower()
            if any(marker in list_text for marker in ['joc', 'activitate', 'exercitiu']):
                for li in list_elem.find_all('li'):
                    text = li.get_text(strip=True)
                    # Require at least 20 characters for a plausible activity.
                    if len(text) > 20:
                        activity = self._create_activity_from_text(text, source_file)
                        if activity:
                            activities.append(activity)

        return activities

    def _extract_from_headings(self, soup, source_file):
        """Extract activities titled by headings (``h1``–``h6``).

        When a heading contains an activity keyword, the description is
        accumulated from following sibling paragraphs/divs/lists until the
        next heading or a 500-character cap.
        """
        activities = []

        for heading in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6']):
            heading_text = heading.get_text(strip=True)

            if any(keyword in heading_text.lower() for keyword in ['joc', 'activitate', 'exercitiu']):
                # Collect description text from the elements after the heading.
                description = ""
                next_elem = heading.find_next_sibling()

                while next_elem and next_elem.name not in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
                    if next_elem.name in ['p', 'div', 'ul']:
                        description += next_elem.get_text(strip=True) + " "
                        if len(description) > 500:  # description cap
                            break
                    next_elem = next_elem.find_next_sibling()

                if description:
                    activity = {
                        'name': heading_text[:200],
                        'description': description[:1000],
                        'source_file': str(source_file),
                        'category': self._detect_category(heading_text + " " + description)
                    }
                    activities.append(activity)

        return activities

    def _extract_from_patterns(self, soup, source_file):
        """Extract activities via regex matching over the page's plain text."""
        activities = []
        text = soup.get_text()

        for pattern in self.activity_patterns['title_patterns']:
            matches = re.finditer(pattern, text, re.MULTILINE)
            for match in matches:
                # ``lastindex`` is None — never 0 — when a pattern has no
                # capturing groups, so the original ``lastindex == 0`` test
                # could never fire and ``group(None)`` would raise. Fall back
                # to the whole match instead.
                if match.lastindex:
                    title = match.group(match.lastindex)
                else:
                    title = match.group(0)
                if len(title) > 10:
                    # Grab surrounding context as the activity description.
                    start = max(0, match.start() - 200)
                    end = min(len(text), match.end() + 500)
                    context = text[start:end]

                    activity = self._create_activity_from_text(context, source_file, title)
                    if activity:
                        activities.append(activity)

        return activities

    def _extract_from_tables(self, soup, source_file):
        """Extract activities from HTML tables.

        The first row supplies (lowercased) column names; each later row is
        mapped column-by-column and converted via
        :meth:`_create_activity_from_table_data` when a name-like column is
        present.
        """
        activities = []

        for table in soup.find_all('table'):
            rows = table.find_all('tr')
            if len(rows) > 1:  # need at least a header plus one data row
                headers = [th.get_text(strip=True).lower() for th in rows[0].find_all(['th', 'td'])]

                for row in rows[1:]:
                    # NOTE(review): only ``td`` cells are read here; data rows
                    # that use ``th`` label cells are skipped — confirm whether
                    # that is intended for this corpus.
                    cells = row.find_all(['td'])
                    if cells:
                        activity_data = {}
                        for i, cell in enumerate(cells):
                            if i < len(headers):
                                activity_data[headers[i]] = cell.get_text(strip=True)

                        if any(key in activity_data for key in ['joc', 'activitate', 'nume', 'titlu']):
                            activity = self._create_activity_from_table_data(activity_data, source_file)
                            if activity:
                                activities.append(activity)

        return activities

    def _create_activity_from_text(self, text, source_file, title=None):
        """Build an activity dict from free text, or return None if too short.

        The name defaults to the first sentence (up to 100 chars) when no
        *title* is supplied; regex-derived metadata (age, participants,
        duration, materials) is merged in.
        """
        if not text or len(text) < 30:
            return None

        activity = {
            'name': title or text[:100].split('.')[0].strip(),
            'description': text[:1000],
            'source_file': str(source_file),
            'category': self._detect_category(text),
            'keywords': self._extract_keywords(text),
            'created_at': datetime.now().isoformat()
        }

        # Merge in any age/participants/duration/materials metadata found.
        activity.update(self._extract_metadata(text))

        return activity

    def _create_activity_from_table_data(self, data, source_file):
        """Build an activity dict from a mapped table row, or return None.

        Requires a name field longer than 5 characters to be considered valid.
        """
        activity = {
            'source_file': str(source_file),
            'created_at': datetime.now().isoformat()
        }

        # Map (Romanian) table column names onto database column names.
        field_mapping = {
            'nume': 'name', 'titlu': 'name', 'joc': 'name', 'activitate': 'name',
            'descriere': 'description', 'detalii': 'description', 'explicatie': 'description',
            'materiale': 'materials_list', 'echipament': 'materials_list',
            'varsta': 'age_group_min', 'categoria': 'category',
            'participanti': 'participants_min', 'numar': 'participants_min',
            'durata': 'duration_min', 'timp': 'duration_min'
        }

        for table_field, db_field in field_mapping.items():
            if table_field in data:
                activity[db_field] = data[table_field]

        # Minimal validation: a usable name must be present.
        if 'name' in activity and len(activity.get('name', '')) > 5:
            return activity

        return None

    def _extract_metadata(self, text):
        """Extract age range, participant count, duration and materials.

        For each numeric attribute the first matching pattern wins; when a
        pattern captures only one number it is used for both min and max.
        """
        metadata = {}

        # Age range.
        for pattern in self.activity_patterns['age_patterns']:
            match = re.search(pattern, text)
            if match:
                metadata['age_group_min'] = int(match.group(1))
                metadata['age_group_max'] = int(match.group(2)) if match.lastindex >= 2 else int(match.group(1))
                break

        # Participant count.
        for pattern in self.activity_patterns['participants_patterns']:
            match = re.search(pattern, text)
            if match:
                metadata['participants_min'] = int(match.group(1))
                metadata['participants_max'] = int(match.group(2)) if match.lastindex >= 2 else int(match.group(1))
                break

        # Duration (minutes).
        for pattern in self.activity_patterns['duration_patterns']:
            match = re.search(pattern, text)
            if match:
                metadata['duration_min'] = int(match.group(1))
                metadata['duration_max'] = int(match.group(2)) if match.lastindex >= 2 else int(match.group(1))
                break

        # Materials: scan 200 chars after each marker for bullet-style items.
        materials = []
        text_lower = text.lower()
        for marker in self.activity_patterns['materials_markers']:
            idx = text_lower.find(marker)
            if idx != -1:
                materials_text = text[idx:idx + 200]
                # The hyphen inside the negated class must be escaped: the
                # original ``[^\n-"]`` formed the accidental character range
                # U+000A–U+0022, so hyphens did not terminate items.
                items = re.findall(r'[-"]\s*([^\n\-"]+)', materials_text)
                if items:
                    materials.extend(items)

        if materials:
            metadata['materials_list'] = ', '.join(materials[:10])  # cap at 10

        return metadata

    def _detect_category(self, text):
        """Return the first category whose keyword appears in *text*.

        Defaults to '[A]' (games) when nothing matches.
        """
        text_lower = text.lower()

        for category, keywords in self.categories.items():
            if any(keyword in text_lower for keyword in keywords):
                return category

        return '[A]'  # default: games category

    def _extract_keywords(self, text):
        """Return up to 5 matching keywords from *text* as a comma list."""
        keywords = []
        text_lower = text.lower()

        # Curated vocabulary of relevant activity keywords.
        keyword_list = [
            'cooperare', 'competitie', 'echipa', 'creativitate', 'miscare',
            'strategie', 'comunicare', 'incredere', 'coordonare', 'atentie',
            'reflexe', 'logica', 'imaginatie', 'muzica', 'dans', 'sport',
            'natura', 'mediu', 'stiinta', 'matematica', 'limba', 'cultura'
        ]

        for keyword in keyword_list:
            if keyword in text_lower:
                keywords.append(keyword)

        return ', '.join(keywords[:5])  # cap at 5 keywords

    def save_to_database(self, activities):
        """Persist *activities*, skipping (name, source_file) duplicates.

        Returns:
            (saved_count, duplicate_count) tuple.
        """
        conn = sqlite3.connect(self.db_path)
        saved_count = 0
        duplicate_count = 0
        try:
            cursor = conn.cursor()

            for activity in activities:
                try:
                    # Duplicate check on (name, source_file).
                    cursor.execute(
                        "SELECT id FROM activities WHERE name = ? AND source_file = ?",
                        (activity.get('name'), activity.get('source_file'))
                    )
                    if cursor.fetchone():
                        duplicate_count += 1
                        continue

                    # Build a parameterised INSERT from the activity's fields.
                    # Column names come from this class's fixed internal
                    # vocabulary (field_mapping / activity builders), not from
                    # user input, so interpolating them is safe here.
                    columns = []
                    values = []
                    placeholders = []
                    for key, value in activity.items():
                        if key != 'created_at':  # DB column has a default
                            columns.append(key)
                            values.append(value)
                            placeholders.append('?')

                    query = f"INSERT INTO activities ({', '.join(columns)}) VALUES ({', '.join(placeholders)})"
                    cursor.execute(query, values)
                    saved_count += 1

                except Exception as e:
                    print(f"Error saving activity: {e}")
                    continue

            conn.commit()
        finally:
            # Always release the connection, even if commit or a query raises
            # (the original leaked it on such failures).
            conn.close()

        return saved_count, duplicate_count

    def process_all_html_files(self, base_path='/mnt/d/GoogleDrive/Cercetasi/carti-camp-jocuri'):
        """Process every ``*.html``/``*.htm`` file under *base_path*.

        Activities are flushed to the database in batches of 100 files to
        bound memory use. Returns (processed_count, error_count).
        """
        base_path = Path(base_path)
        html_files = list(base_path.rglob("*.html"))
        html_files.extend(list(base_path.rglob("*.htm")))

        print(f"Found {len(html_files)} HTML files to process")

        all_activities = []
        processed = 0
        errors = 0

        for i, html_file in enumerate(html_files):
            try:
                activities = self.extract_from_html(str(html_file))
                all_activities.extend(activities)
                processed += 1

                # Progress report + batch flush every 100 files.
                if (i + 1) % 100 == 0:
                    print(f"Progress: {i+1}/{len(html_files)} files processed, {len(all_activities)} activities found")
                    if all_activities:
                        saved, dupes = self.save_to_database(all_activities)
                        print(f"Batch saved: {saved} new activities, {dupes} duplicates skipped")
                        all_activities = []  # clear buffer

            except Exception as e:
                print(f"Error processing {html_file}: {e}")
                errors += 1

        # Flush whatever is left after the final partial batch.
        if all_activities:
            saved, dupes = self.save_to_database(all_activities)
            print(f"Final batch saved: {saved} new activities, {dupes} duplicates skipped")

        print("\nProcessing complete!")
        print(f"Files processed: {processed}")
        print(f"Errors: {errors}")

        return processed, errors
|
||||
|
||||
# Manual entry point: dry-run on a few sample files, then optionally run the
# full corpus extraction.
if __name__ == "__main__":
    extractor = HTMLActivityExtractor()

    # Smoke-test on up to three sample files before the full run.
    print("Testing on sample file first...")
    test_files = list(Path('/mnt/d/GoogleDrive/Cercetasi/carti-camp-jocuri').rglob("*.html"))[:3]

    if not test_files:
        # Guard: the hard-coded corpus path may not exist on this machine;
        # the original silently skipped the test phase in that case.
        print("No HTML files found for testing; check the corpus path.")

    for test_file in test_files:
        print(f"\nTesting: {test_file}")
        activities = extractor.extract_from_html(str(test_file))
        print(f"Found {len(activities)} activities")
        if activities:
            print(f"Sample activity: {activities[0]['name'][:50]}...")

    # Confirm before processing the whole corpus. Accept 'y'/'yes' with
    # surrounding whitespace (the original rejected e.g. "y ").
    response = input("\nContinue with full processing? (y/n): ")
    if response.strip().lower() in ('y', 'yes'):
        extractor.process_all_html_files()
|
||||
Reference in New Issue
Block a user