Refactor extraction system and reorganize project structure
- Remove obsolete documentation files (DEPLOYMENT.md, PLAN_IMPLEMENTARE_S8_DETALIAT.md, README.md)
- Add comprehensive extraction pipeline with multiple format support (PDF, HTML, text)
- Implement Claude-based activity extraction with structured templates
- Update dependencies and Docker configuration
- Reorganize scripts directory with modular extraction components
- Move example documentation to appropriate location

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
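The Claude-based extraction mentioned in the commit body is not part of the file below, which only orchestrates the automated extractors. As a minimal sketch of how a structured template could drive that step, assuming the official anthropic Python SDK — the model name, template text, and extract_activities helper are illustrative assumptions, not code from this commit:

# Hedged sketch only: prompt template, model choice, and helper name are
# illustrative; this commit does not include the actual Claude caller.
import anthropic

EXTRACTION_TEMPLATE = """Extract every activity from the document below.
For each activity return: name, age group, group size, duration, materials,
and step-by-step instructions, as a JSON list.

Document:
{document_text}
"""

def extract_activities(document_text: str) -> str:
    """Send one document through Claude and return its raw JSON answer."""
    client = anthropic.Anthropic()  # reads ANTHROPIC_API_KEY from the environment
    message = client.messages.create(
        model="claude-3-5-sonnet-20241022",  # assumed model choice
        max_tokens=4096,
        messages=[{
            "role": "user",
            "content": EXTRACTION_TEMPLATE.format(document_text=document_text),
        }],
    )
    return message.content[0].text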
scripts/unified_processor.py (new file, 151 lines)
@@ -0,0 +1,151 @@
#!/usr/bin/env python3
"""
Unified Activity Processor
Orchestrates all extractors for a complete processing run.
"""

import sqlite3
import time
from pathlib import Path

from html_extractor import HTMLActivityExtractor
from text_extractor import TextActivityExtractor


class UnifiedProcessor:
    def __init__(self, db_path='data/activities.db'):
        self.db_path = db_path
        self.html_extractor = HTMLActivityExtractor(db_path)
        self.text_extractor = TextActivityExtractor(db_path)
        self.stats = {
            'html_processed': 0,
            'text_processed': 0,
            'pdf_to_process': 0,
            'doc_to_process': 0,
            'total_activities': 0,
            'start_time': None,
            'end_time': None
        }

    def get_current_activity_count(self):
        """Return the current number of activities in the DB."""
        conn = sqlite3.connect(self.db_path)
        cursor = conn.cursor()
        cursor.execute("SELECT COUNT(*) FROM activities")
        count = cursor.fetchone()[0]
        conn.close()
        return count

    def count_files_to_process(self, base_path):
        """Count the files that need processing, grouped by extension."""
        base_path = Path(base_path)

        counts = {
            'html': len(list(base_path.rglob("*.html"))) + len(list(base_path.rglob("*.htm"))),
            'txt': len(list(base_path.rglob("*.txt"))),
            'md': len(list(base_path.rglob("*.md"))),
            'pdf': len(list(base_path.rglob("*.pdf"))),
            'doc': len(list(base_path.rglob("*.doc"))),
            'docx': len(list(base_path.rglob("*.docx")))
        }

        return counts

    def process_automated_formats(self, base_path='/mnt/d/GoogleDrive/Cercetasi/carti-camp-jocuri'):
        """Process every format that can be handled automatically."""
        print("="*60)
        print("UNIFIED ACTIVITY PROCESSOR - AUTOMATED PHASE")
        print("="*60)

        self.stats['start_time'] = time.time()
        initial_count = self.get_current_activity_count()

        # Show the initial statistics
        file_counts = self.count_files_to_process(base_path)
        print("\nFiles to process:")
        for fmt, count in file_counts.items():  # 'fmt' avoids shadowing the builtin 'format'
            print(f"  {fmt.upper()}: {count} files")
        print(f"\nCurrent activities in database: {initial_count}")
        print("-"*60)

        # PHASE 1: HTML processing (top priority - largest volume)
        print("\n[1/2] Processing HTML files...")
        print("-"*40)
        html_processed, html_errors = self.html_extractor.process_all_html_files(base_path)
        self.stats['html_processed'] = html_processed

        # PHASE 2: Text/Markdown processing
        print("\n[2/2] Processing Text/Markdown files...")
        print("-"*40)
        text_processed, text_saved = self.text_extractor.process_all_text_files(base_path)
        self.stats['text_processed'] = text_processed

        # Final statistics
        self.stats['end_time'] = time.time()
        final_count = self.get_current_activity_count()
        self.stats['total_activities'] = final_count - initial_count

        # Identify the files that still require manual processing
        self.stats['pdf_to_process'] = file_counts['pdf']
        self.stats['doc_to_process'] = file_counts['doc'] + file_counts['docx']

        self.print_summary()
        self.save_pdf_doc_list(base_path)

    def print_summary(self):
        """Print a summary of the processing run."""
        print("\n" + "="*60)
        print("PROCESSING SUMMARY")
        print("="*60)

        duration = self.stats['end_time'] - self.stats['start_time']

        print("\nAutomated Processing Results:")
        print(f"  HTML files processed: {self.stats['html_processed']}")
        print(f"  Text/MD files processed: {self.stats['text_processed']}")
        print(f"  New activities added: {self.stats['total_activities']}")
        print(f"  Processing time: {duration:.1f} seconds")

        print("\nFiles requiring Claude processing:")
        print(f"  PDF files: {self.stats['pdf_to_process']}")
        print(f"  DOC/DOCX files: {self.stats['doc_to_process']}")

        print("\n" + "="*60)
        print("NEXT STEPS:")
        print("1. Review the file 'pdf_doc_for_claude.txt' for manual processing")
        print("2. Use Claude to extract activities from PDF/DOC files")
        print("3. Focus on largest PDF files first (highest activity density)")
        print("="*60)

    def save_pdf_doc_list(self, base_path):
        """Save the list of PDF/DOC files queued for Claude processing."""
        base_path = Path(base_path)

        # Largest PDFs first: they are the most likely to hold many activities.
        pdf_files = sorted(base_path.rglob("*.pdf"), key=lambda p: p.stat().st_size, reverse=True)
        doc_files = list(base_path.rglob("*.doc"))
        docx_files = list(base_path.rglob("*.docx"))

        with open('pdf_doc_for_claude.txt', 'w', encoding='utf-8') as f:
            f.write("PDF/DOC FILES FOR CLAUDE PROCESSING\n")
            f.write("="*60 + "\n")
            f.write("Files sorted by size (largest first = likely more activities)\n\n")

            f.write("TOP PRIORITY PDF FILES (process these first):\n")
            f.write("-"*40 + "\n")
            for i, pdf in enumerate(pdf_files[:20], 1):
                size_mb = pdf.stat().st_size / (1024*1024)
                f.write(f"{i}. {pdf.name} ({size_mb:.1f} MB)\n")
                f.write(f"   Path: {pdf}\n\n")

            if len(pdf_files) > 20:
                f.write(f"\n... and {len(pdf_files)-20} more PDF files\n\n")

            f.write("\nDOC/DOCX FILES:\n")
            f.write("-"*40 + "\n")
            for doc in doc_files + docx_files:
                size_kb = doc.stat().st_size / 1024
                f.write(f"- {doc.name} ({size_kb:.1f} KB)\n")

        print("\nPDF/DOC list saved to: pdf_doc_for_claude.txt")


if __name__ == "__main__":
    processor = UnifiedProcessor()
    processor.process_automated_formats()
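For context, unified_processor.py only orchestrates: the two extractor modules it imports are not part of this diff. The interface it relies on, inferred from its call sites above, looks roughly like this — a sketch, not the real implementations:

# Interface sketch inferred from UnifiedProcessor's call sites; the real
# html_extractor.py and text_extractor.py are not shown in this commit.

class HTMLActivityExtractor:
    def __init__(self, db_path: str):
        self.db_path = db_path

    def process_all_html_files(self, base_path) -> tuple[int, int]:
        """Walk base_path, extract activities from *.html/*.htm into the DB;
        return (files_processed, error_count)."""
        raise NotImplementedError

class TextActivityExtractor:
    def __init__(self, db_path: str):
        self.db_path = db_path

    def process_all_text_files(self, base_path) -> tuple[int, int]:
        """Walk base_path, extract activities from *.txt/*.md into the DB;
        return (files_processed, activities_saved)."""
        raise NotImplementedError

Running python scripts/unified_processor.py kicks off the automated phase and writes pdf_doc_for_claude.txt with the queue of PDF/DOC files left for manual Claude processing.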