""" Activity data model for INDEX-SISTEM-JOCURI v2.0 """ from dataclasses import dataclass, field from typing import List, Optional, Dict, Any import json import re import unicodedata def normalize_name(name: str) -> str: """Diacritic-free, lowercased, whitespace-collapsed form of a name. Used as the exact-match key for dedup grouping (see plan §4). """ if not name: return "" decomposed = unicodedata.normalize("NFKD", name) ascii_str = "".join(c for c in decomposed if not unicodedata.combining(c)) ascii_str = ascii_str.lower().strip() ascii_str = re.sub(r"\s+", " ", ascii_str) return ascii_str @dataclass class Activity: """Activity data model with comprehensive fields""" # Basic information name: str description: str rules: Optional[str] = None variations: Optional[str] = None # Categories category: str = "" subcategory: Optional[str] = None # content_type is an axis INDEPENDENT of category: # one of joc/activitate/reteta/cantec/ceremonie (see config_taxonomy). content_type: Optional[str] = None # Source information source_file: str = "" page_reference: Optional[str] = None # source_files: JSON-encoded list of every source the activity was seen in. # `source_file` (singular) stays as the primary/original source; build_database # (Lane C) accumulates the full list here on dedup-merge. source_files: List[str] = field(default_factory=list) # Short verbatim quote from the source — anti-hallucination anchor. source_excerpt: Optional[str] = None # Age and participants age_group_min: Optional[int] = None age_group_max: Optional[int] = None participants_min: Optional[int] = None participants_max: Optional[int] = None # Duration duration_min: Optional[int] = None # minutes duration_max: Optional[int] = None # minutes # Materials and setup materials_category: Optional[str] = None materials_list: Optional[str] = None skills_developed: Optional[str] = None difficulty_level: Optional[str] = None # Search and metadata keywords: Optional[str] = None tags: List[str] = field(default_factory=list) popularity_score: int = 0 # Extraction / language metadata language: Optional[str] = None # 'ro' / 'en' normalized_name: Optional[str] = None # dedup key; auto-derived from name extraction_confidence: Optional[str] = None # 'high' / 'med' / 'low' needs_review: int = 0 # Enrichment overlay (applied at build time from data/enrichment.json; see # plan Part B). Bilingual: the EN/source text stays in name/description/... # and the Romanian rendering lands in the *_ro twins. Absent fields leave # the underlying DB value untouched. name_ro: Optional[str] = None description_ro: Optional[str] = None rules_ro: Optional[str] = None variations_ro: Optional[str] = None indoor_outdoor: Optional[str] = None # slug: indoor / outdoor / either space_needed: Optional[str] = None # slug: mic / mediu / mare # Names of fields whose value was INFERRED by enrichment (source was # silent) rather than stated in the source — surfaced as "(estimat)" in UI. estimated_fields: List[str] = field(default_factory=list) # Source provenance for the download route + enrichment keying. source_id: Optional[str] = None # e.g. "876d1a2d_marcaje_turistice" source_ids: List[str] = field(default_factory=list) # all source_ids merged chunk_key: Optional[str] = None # e.g. ".part01" # Database fields id: Optional[int] = None created_at: Optional[str] = None updated_at: Optional[str] = None def __post_init__(self): """Derive normalized_name from name when not explicitly provided.""" if not self.normalized_name: self.normalized_name = normalize_name(self.name) def to_dict(self) -> Dict[str, Any]: """Convert activity to dictionary for database storage""" return { 'name': self.name, 'description': self.description, 'rules': self.rules, 'variations': self.variations, 'category': self.category, 'subcategory': self.subcategory, 'content_type': self.content_type, 'source_file': self.source_file, 'source_files': json.dumps(self.source_files) if self.source_files else None, 'page_reference': self.page_reference, 'source_excerpt': self.source_excerpt, 'age_group_min': self.age_group_min, 'age_group_max': self.age_group_max, 'participants_min': self.participants_min, 'participants_max': self.participants_max, 'duration_min': self.duration_min, 'duration_max': self.duration_max, 'materials_category': self.materials_category, 'materials_list': self.materials_list, 'skills_developed': self.skills_developed, 'difficulty_level': self.difficulty_level, 'keywords': self.keywords, 'tags': json.dumps(self.tags) if self.tags else None, 'popularity_score': self.popularity_score, 'language': self.language, 'normalized_name': self.normalized_name or normalize_name(self.name), 'extraction_confidence': self.extraction_confidence, 'needs_review': self.needs_review, 'name_ro': self.name_ro, 'description_ro': self.description_ro, 'rules_ro': self.rules_ro, 'variations_ro': self.variations_ro, 'indoor_outdoor': self.indoor_outdoor, 'space_needed': self.space_needed, 'estimated_fields': json.dumps(self.estimated_fields) if self.estimated_fields else None, 'source_id': self.source_id, 'source_ids': json.dumps(self.source_ids) if self.source_ids else None, 'chunk_key': self.chunk_key, } @classmethod def from_dict(cls, data: Dict[str, Any]) -> 'Activity': """Create activity from dictionary""" # Parse tags from JSON if present tags = [] if data.get('tags'): try: tags = json.loads(data['tags']) except (json.JSONDecodeError, TypeError): tags = [] # source_files may arrive as a JSON string (DB) or a list (extraction) source_files = data.get('source_files') if isinstance(source_files, str): try: source_files = json.loads(source_files) except (json.JSONDecodeError, TypeError): source_files = [] elif source_files is None: source_files = [] # estimated_fields / source_ids: JSON string (DB) or list (in-memory) def _json_list(value): if isinstance(value, str): try: parsed = json.loads(value) return parsed if isinstance(parsed, list) else [] except (json.JSONDecodeError, TypeError): return [] return list(value) if value else [] estimated_fields = _json_list(data.get('estimated_fields')) source_ids = _json_list(data.get('source_ids')) return cls( id=data.get('id'), name=data.get('name', ''), description=data.get('description', ''), rules=data.get('rules'), variations=data.get('variations'), category=data.get('category', ''), subcategory=data.get('subcategory'), content_type=data.get('content_type'), source_file=data.get('source_file', ''), source_files=source_files, page_reference=data.get('page_reference'), source_excerpt=data.get('source_excerpt'), age_group_min=data.get('age_group_min'), age_group_max=data.get('age_group_max'), participants_min=data.get('participants_min'), participants_max=data.get('participants_max'), duration_min=data.get('duration_min'), duration_max=data.get('duration_max'), materials_category=data.get('materials_category'), materials_list=data.get('materials_list'), skills_developed=data.get('skills_developed'), difficulty_level=data.get('difficulty_level'), keywords=data.get('keywords'), tags=tags, popularity_score=data.get('popularity_score', 0), language=data.get('language'), normalized_name=data.get('normalized_name'), extraction_confidence=data.get('extraction_confidence'), needs_review=data.get('needs_review', 0) or 0, name_ro=data.get('name_ro'), description_ro=data.get('description_ro'), rules_ro=data.get('rules_ro'), variations_ro=data.get('variations_ro'), indoor_outdoor=data.get('indoor_outdoor'), space_needed=data.get('space_needed'), estimated_fields=estimated_fields, source_id=data.get('source_id'), source_ids=source_ids, chunk_key=data.get('chunk_key'), created_at=data.get('created_at'), updated_at=data.get('updated_at') ) def get_age_range_display(self) -> str: """Get formatted age range for display""" if self.age_group_min and self.age_group_max: return f"{self.age_group_min}-{self.age_group_max} ani" elif self.age_group_min: return f"{self.age_group_min}+ ani" elif self.age_group_max: return f"până la {self.age_group_max} ani" return "toate vârstele" def get_participants_display(self) -> str: """Get formatted participants range for display""" if self.participants_min and self.participants_max: return f"{self.participants_min}-{self.participants_max} persoane" elif self.participants_min: return f"{self.participants_min}+ persoane" elif self.participants_max: return f"până la {self.participants_max} persoane" return "orice număr" def get_duration_display(self) -> str: """Get formatted duration for display""" if self.duration_min and self.duration_max: return f"{self.duration_min}-{self.duration_max} minute" elif self.duration_min: return f"{self.duration_min}+ minute" elif self.duration_max: return f"până la {self.duration_max} minute" return "durată variabilă" def get_materials_display(self) -> str: """Get formatted materials for display""" if self.materials_category: return self.materials_category elif self.materials_list: return self.materials_list[:100] + "..." if len(self.materials_list) > 100 else self.materials_list return "nu specificate" # --- Enrichment / bilingual display helpers ------------------------------ def get_display_name(self) -> str: """Romanian name when enriched, else the original.""" return self.name_ro or self.name def get_display_description(self) -> str: """Romanian description when enriched, else the original.""" return self.description_ro or self.description def get_display_rules(self) -> Optional[str]: """Romanian rules when enriched, else the original.""" return self.rules_ro or self.rules def get_display_variations(self) -> Optional[str]: """Romanian variations when enriched, else the original.""" return self.variations_ro or self.variations def has_translation(self) -> bool: """True if any Romanian enrichment text is present.""" return bool(self.name_ro or self.description_ro or self.rules_ro or self.variations_ro) def is_estimated(self, field_name: str) -> bool: """True if `field_name` was inferred by enrichment (source was silent).""" return field_name in (self.estimated_fields or []) def get_indoor_outdoor_display(self) -> Optional[str]: """RO label for indoor_outdoor, or None when unset.""" if not self.indoor_outdoor: return None from app.config_taxonomy import indoor_outdoor_display_name return indoor_outdoor_display_name(self.indoor_outdoor) def get_space_needed_display(self) -> Optional[str]: """RO label for space_needed, or None when unset.""" if not self.space_needed: return None from app.config_taxonomy import space_needed_display_name return space_needed_display_name(self.space_needed)