Refactor extraction system and reorganize project structure

- Remove obsolete documentation files (DEPLOYMENT.md, PLAN_IMPLEMENTARE_S8_DETALIAT.md, README.md)
- Add comprehensive extraction pipeline with multiple format support (PDF, HTML, text)
- Implement Claude-based activity extraction with structured templates
- Update dependencies and Docker configuration
- Reorganize scripts directory with modular extraction components (see the extractor interface sketch below)
- Move example documentation to appropriate location

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
commit a19ddf0b71
parent 1b6b7e06ad
Date: 2025-09-11 23:32:37 +03:00
119 changed files with 91074 additions and 1859 deletions
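
The new file below imports HTMLActivityExtractor and TextActivityExtractor from sibling modules that are not part of this hunk. A minimal, hypothetical sketch of the interface the processor relies on follows; the class names, the db_path constructor argument, and the (processed, errors) / (processed, saved) return pairs come from the call sites in the diff, while the bodies are assumed placeholders:

# Hypothetical stand-ins for the two extractor modules imported below.
# Only the names, signatures, and return shapes come from the diff;
# the bodies are illustrative placeholders.
from pathlib import Path


class HTMLActivityExtractor:
    def __init__(self, db_path):
        self.db_path = db_path

    def process_all_html_files(self, base_path):
        """Walk base_path for *.html/*.htm; return (processed, errors)."""
        processed = errors = 0
        for page in Path(base_path).rglob("*.htm*"):
            try:
                # ... parse the page and save activities to self.db_path ...
                processed += 1
            except Exception:
                errors += 1
        return processed, errors


class TextActivityExtractor:
    def __init__(self, db_path):
        self.db_path = db_path

    def process_all_text_files(self, base_path):
        """Walk base_path for *.txt/*.md; return (processed, saved)."""
        processed = saved = 0
        # ... same pattern: read each file, extract and save activities ...
        return processed, saved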


@@ -0,0 +1,151 @@
#!/usr/bin/env python3
"""
Unified Activity Processor
Orchestrates all extractors for a complete processing run
"""
import sqlite3
import time
from pathlib import Path

from html_extractor import HTMLActivityExtractor
from text_extractor import TextActivityExtractor


class UnifiedProcessor:
    def __init__(self, db_path='data/activities.db'):
        self.db_path = db_path
        self.html_extractor = HTMLActivityExtractor(db_path)
        self.text_extractor = TextActivityExtractor(db_path)
        self.stats = {
            'html_processed': 0,
            'text_processed': 0,
            'pdf_to_process': 0,
            'doc_to_process': 0,
            'total_activities': 0,
            'start_time': None,
            'end_time': None
        }

    def get_current_activity_count(self):
        """Return the current number of activities in the DB."""
        conn = sqlite3.connect(self.db_path)
        cursor = conn.cursor()
        cursor.execute("SELECT COUNT(*) FROM activities")
        count = cursor.fetchone()[0]
        conn.close()
        return count

    def count_files_to_process(self, base_path):
        """Count the files that still need processing, by extension."""
        base_path = Path(base_path)
        counts = {
            'html': len(list(base_path.rglob("*.html"))) + len(list(base_path.rglob("*.htm"))),
            'txt': len(list(base_path.rglob("*.txt"))),
            'md': len(list(base_path.rglob("*.md"))),
            'pdf': len(list(base_path.rglob("*.pdf"))),
            'doc': len(list(base_path.rglob("*.doc"))),
            'docx': len(list(base_path.rglob("*.docx")))
        }
        return counts

    def process_automated_formats(self, base_path='/mnt/d/GoogleDrive/Cercetasi/carti-camp-jocuri'):
        """Process every format that can be handled automatically."""
        print("=" * 60)
        print("UNIFIED ACTIVITY PROCESSOR - AUTOMATED PHASE")
        print("=" * 60)
        self.stats['start_time'] = time.time()
        initial_count = self.get_current_activity_count()

        # Show the initial statistics
        file_counts = self.count_files_to_process(base_path)
        print("\nFiles to process:")
        for fmt, count in file_counts.items():
            print(f"  {fmt.upper()}: {count} files")
        print(f"\nCurrent activities in database: {initial_count}")
        print("-" * 60)

        # PHASE 1: HTML processing (top priority - largest volume)
        print("\n[1/2] Processing HTML files...")
        print("-" * 40)
        html_processed, html_errors = self.html_extractor.process_all_html_files(base_path)
        self.stats['html_processed'] = html_processed

        # PHASE 2: Text/Markdown processing
        print("\n[2/2] Processing Text/Markdown files...")
        print("-" * 40)
        text_processed, text_saved = self.text_extractor.process_all_text_files(base_path)
        self.stats['text_processed'] = text_processed

        # Final statistics
        self.stats['end_time'] = time.time()
        final_count = self.get_current_activity_count()
        self.stats['total_activities'] = final_count - initial_count

        # Identify the files that still require manual processing
        self.stats['pdf_to_process'] = file_counts['pdf']
        self.stats['doc_to_process'] = file_counts['doc'] + file_counts['docx']

        self.print_summary()
        self.save_pdf_doc_list(base_path)

    def print_summary(self):
        """Print the processing summary."""
        print("\n" + "=" * 60)
        print("PROCESSING SUMMARY")
        print("=" * 60)
        duration = self.stats['end_time'] - self.stats['start_time']
        print("\nAutomated Processing Results:")
        print(f"  HTML files processed: {self.stats['html_processed']}")
        print(f"  Text/MD files processed: {self.stats['text_processed']}")
        print(f"  New activities added: {self.stats['total_activities']}")
        print(f"  Processing time: {duration:.1f} seconds")
        print("\nFiles requiring Claude processing:")
        print(f"  PDF files: {self.stats['pdf_to_process']}")
        print(f"  DOC/DOCX files: {self.stats['doc_to_process']}")
        print("\n" + "=" * 60)
        print("NEXT STEPS:")
        print("1. Review the file 'pdf_doc_for_claude.txt' for manual processing")
        print("2. Use Claude to extract activities from PDF/DOC files")
        print("3. Focus on largest PDF files first (highest activity density)")
        print("=" * 60)

    def save_pdf_doc_list(self, base_path):
        """Save the list of PDF/DOC files for processing with Claude."""
        base_path = Path(base_path)
        pdf_files = sorted(base_path.rglob("*.pdf"), key=lambda p: p.stat().st_size, reverse=True)
        doc_files = list(base_path.rglob("*.doc"))
        docx_files = list(base_path.rglob("*.docx"))

        with open('pdf_doc_for_claude.txt', 'w', encoding='utf-8') as f:
            f.write("PDF/DOC FILES FOR CLAUDE PROCESSING\n")
            f.write("=" * 60 + "\n")
            f.write("Files sorted by size (largest first = likely more activities)\n\n")
            f.write("TOP PRIORITY PDF FILES (process these first):\n")
            f.write("-" * 40 + "\n")
            for i, pdf in enumerate(pdf_files[:20], 1):
                size_mb = pdf.stat().st_size / (1024 * 1024)
                f.write(f"{i}. {pdf.name} ({size_mb:.1f} MB)\n")
                f.write(f"   Path: {pdf}\n\n")
            if len(pdf_files) > 20:
                f.write(f"\n... and {len(pdf_files) - 20} more PDF files\n\n")
            f.write("\nDOC/DOCX FILES:\n")
            f.write("-" * 40 + "\n")
            for doc in doc_files + docx_files:
                size_kb = doc.stat().st_size / 1024
                f.write(f"- {doc.name} ({size_kb:.1f} KB)\n")

        print("\nPDF/DOC list saved to: pdf_doc_for_claude.txt")


if __name__ == "__main__":
    processor = UnifiedProcessor()
    processor.process_automated_formats()
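
The COUNT(*) query in get_current_activity_count() assumes an activities table already exists at the default db_path data/activities.db, presumably created by the extractors. Since the schema is not visible in this file, the bootstrap below is an assumption in everything but the table name and path; it only serves to make the script runnable in isolation:

# Assumed bootstrap for the 'activities' table queried above; only the
# table name and the data/activities.db default path come from the code,
# and the columns are illustrative guesses.
import sqlite3
from pathlib import Path

Path('data').mkdir(parents=True, exist_ok=True)
conn = sqlite3.connect('data/activities.db')
conn.execute("""
    CREATE TABLE IF NOT EXISTS activities (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        title TEXT NOT NULL,
        description TEXT,
        source_file TEXT
    )
""")
conn.commit()
conn.close()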
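
The NEXT STEPS printed by print_summary() leave PDF/DOC extraction as a manual pass with Claude. If one wanted to script that hand-off instead, a rough sketch using the anthropic and pypdf packages might look like the following; neither dependency appears in this diff, and the model id and prompt template are purely illustrative:

# Hypothetical automation of the "use Claude" step from print_summary().
# Nothing here is in the commit: the anthropic/pypdf dependencies, the
# model name, and the prompt template are all illustrative assumptions.
import anthropic
from pypdf import PdfReader


def extract_activities_from_pdf(pdf_path):
    # Pull raw text out of the PDF (text layer only; no OCR).
    text = "\n".join(page.extract_text() or "" for page in PdfReader(pdf_path).pages)
    client = anthropic.Anthropic()  # reads ANTHROPIC_API_KEY from the environment
    response = client.messages.create(
        model="claude-3-5-sonnet-20241022",  # placeholder model id
        max_tokens=4096,
        messages=[{
            "role": "user",
            "content": "Extract every scout activity from the text below as a "
                       "JSON list of {title, description, materials, duration}.\n\n"
                       + text[:100_000],  # crude truncation to respect context limits
        }],
    )
    return response.content[0].text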