From 66ae831c36accf53ee167ab9d66299ec0cf62bdc Mon Sep 17 00:00:00 2001 From: Claude Agent Date: Tue, 19 May 2026 17:43:38 +0000 Subject: [PATCH] Rebuild extraction pipeline infrastructure (Faza 0 prep) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implements the approved plan to replace the broken regex/index-master extraction with an LLM-subagent pipeline. Four parallel lanes: Lane A — scripts/extract_common.py (PDF/docx/doc/pptx/html/zip, no max_pages truncation), normalize_sources.py, chunk_sources.py (~20pg chunks + overlap, manifest registry), activity_schema.json. Lane B — app/config_taxonomy.py (16 fixed category slugs), schema rebuilt from scratch in app/models/ with content_type, language, source_files, source_excerpt, normalized_name, extraction_confidence, needs_review; FTS5 + 3 triggers extended with materials_list and skills_developed. Lane C — build_database.py (--rebuild, atomic swap, schema + fuzzy source_excerpt validation, dedup with needs_review band), validate_extractions.py, review_queue.py, new run_extraction.py orchestrator, SUBAGENT_PROMPT.md. Lane D — search.py content_type/language filters (default search excludes non-game content), E7 schema-compat audit; fixed a NULL keywords AttributeError in _boost_search_relevance. Removes 8 orphaned/dead scripts and app/services/parser.py + indexer.py. Adds tests/ (70 passing, 1 skipped — libreoffice absent). Note: Lane D made one additive edit to app/models/database.py (_update_category_counts) to surface content_type/language in get_filter_options, outside its nominal lane boundary but after Lane B completed. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- app/config_taxonomy.py | 230 +++++++++ app/models/activity.py | 68 ++- app/models/database.py | 55 ++- app/services/__init__.py | 4 +- app/services/indexer.py | 248 ---------- app/services/parser.py | 340 -------------- app/services/search.py | 101 +++- app/templates/activity.html | 8 +- app/templates/index.html | 26 +- app/templates/results.html | 29 +- app/web/routes.py | 19 +- scripts/SUBAGENT_PROMPT.md | 81 ++++ scripts/activity_schema.json | 110 +++++ scripts/build_database.py | 639 ++++++++++++++++++++++++++ scripts/chunk_sources.py | 251 ++++++++++ scripts/claude_extraction_template.md | 54 --- scripts/create_databases.py | 164 ------- scripts/extract_common.py | 361 +++++++++++++++ scripts/html_extractor.py | 424 ----------------- scripts/import_claude_activities.py | 78 ---- scripts/import_common.py | 179 ++++++++ scripts/normalize_sources.py | 255 ++++++++++ scripts/pdf_extractor.py | 0 scripts/pdf_to_text_converter.py | 143 ------ scripts/review_queue.py | 145 ++++++ scripts/run_extraction.py | 168 +++++-- scripts/text_extractor.py | 197 -------- scripts/unified_processor.py | 151 ------ scripts/validate_extractions.py | 208 +++++++++ tests/conftest.py | 114 +++++ tests/fixtures/.gitkeep | 3 + tests/test_build_database.py | 334 ++++++++++++++ tests/test_chunk_sources.py | 183 ++++++++ tests/test_extract_common.py | 177 +++++++ tests/test_fts.py | 139 ++++++ tests/test_search.py | 140 ++++++ tests/test_validate_extractions.py | 156 +++++++ 37 files changed, 4101 insertions(+), 1881 deletions(-) create mode 100644 app/config_taxonomy.py delete mode 100644 app/services/indexer.py delete mode 100644 app/services/parser.py create mode 100644 scripts/SUBAGENT_PROMPT.md create mode 100644 scripts/activity_schema.json create mode 100644 scripts/build_database.py create mode 100644 scripts/chunk_sources.py delete mode 100644 scripts/claude_extraction_template.md delete mode 100644 scripts/create_databases.py create 
mode 100644 scripts/extract_common.py delete mode 100644 scripts/html_extractor.py delete mode 100644 scripts/import_claude_activities.py create mode 100644 scripts/import_common.py create mode 100644 scripts/normalize_sources.py delete mode 100644 scripts/pdf_extractor.py delete mode 100644 scripts/pdf_to_text_converter.py create mode 100644 scripts/review_queue.py delete mode 100644 scripts/text_extractor.py delete mode 100644 scripts/unified_processor.py create mode 100644 scripts/validate_extractions.py create mode 100644 tests/conftest.py create mode 100644 tests/fixtures/.gitkeep create mode 100644 tests/test_build_database.py create mode 100644 tests/test_chunk_sources.py create mode 100644 tests/test_extract_common.py create mode 100644 tests/test_fts.py create mode 100644 tests/test_search.py create mode 100644 tests/test_validate_extractions.py diff --git a/app/config_taxonomy.py b/app/config_taxonomy.py new file mode 100644 index 0000000..2e8db25 --- /dev/null +++ b/app/config_taxonomy.py @@ -0,0 +1,230 @@ +""" +Controlled category taxonomy for game-library. + +Single source of truth for activity categories. The DB stores the *slug*; +the UI displays the Romanian name. `category` (thematic domain) and +`content_type` (form of the content) are INDEPENDENT axes — see plan §2. +""" + +import unicodedata +import re +from typing import Dict, List + +# --- Categories (thematic domain) -------------------------------------------- +# slug -> Romanian display name. ~16 fixed slugs; `altele` is the mandatory +# fallback and MUST always be present. 
+CATEGORIES: Dict[str, str] = { + "jocuri-cercetasesti": "Jocuri cercetășești", + "team-building": "Team-building", + "icebreakers": "Icebreakers / spargerea gheții", + "camp-outdoor": "Tabără și activități în aer liber", + "wide-games": "Wide games / jocuri de teren", + "orientare": "Orientare", + "prim-ajutor": "Prim ajutor", + "escape-room-puzzle": "Escape room și puzzle", + "creative-stem": "Creativitate și STEM", + "sports-active": "Sport și activități fizice", + "cantece-ceremonii": "Cântece și ceremonii", + "retete": "Rețete", + "supravietuire": "Supraviețuire", + "integrare-incluziune": "Integrare și incluziune", + "conflict-empatie": "Conflict și empatie", + "altele": "Altele", +} + +# Mandatory fallback slug. +FALLBACK_CATEGORY = "altele" + +# Ordered list of valid slugs. +CATEGORY_SLUGS: List[str] = list(CATEGORIES.keys()) + +# --- Content type (form of the content) -------------------------------------- +# Independent axis from `category`. The UI default search excludes the +# non-game content types (see plan §6). +CONTENT_TYPES: Dict[str, str] = { + "joc": "Joc", + "activitate": "Activitate", + "reteta": "Rețetă", + "cantec": "Cântec", + "ceremonie": "Ceremonie", +} + +CONTENT_TYPE_SLUGS: List[str] = list(CONTENT_TYPES.keys()) + +# Content types considered "non-game" — excluded from the default UI search. +NON_GAME_CONTENT_TYPES: List[str] = ["reteta", "cantec", "ceremonie"] + +DEFAULT_CONTENT_TYPE = "activitate" + +# --- Aliases ----------------------------------------------------------------- +# Map of normalized arbitrary strings -> canonical slug. Keys are already +# diacritic-stripped, lowercased and hyphenated (see _slugify). This catches +# legacy / messy values from the old DB and common English/Romanian variants. 
+_CATEGORY_ALIASES: Dict[str, str] = { + # legacy junk + "general-activity": "altele", + "general": "altele", + "educational": "creative-stem", + "d": "altele", + "a": "altele", + "b": "altele", + "c": "altele", + # scouting + "cercetasie": "jocuri-cercetasesti", + "cercetasesti": "jocuri-cercetasesti", + "scout": "jocuri-cercetasesti", + "scouting": "jocuri-cercetasesti", + "scout-games": "jocuri-cercetasesti", + "jocuri-cercetasesti": "jocuri-cercetasesti", + # team building + "teambuilding": "team-building", + "team": "team-building", + "cooperare": "team-building", + # icebreakers + "icebreaker": "icebreakers", + "spargerea-ghetii": "icebreakers", + "cunoastere": "icebreakers", + "energizers": "icebreakers", + "energizer": "icebreakers", + # camp / outdoor + "camp": "camp-outdoor", + "tabara": "camp-outdoor", + "outdoor": "camp-outdoor", + "aer-liber": "camp-outdoor", + # wide games + "wide-game": "wide-games", + "jocuri-de-teren": "wide-games", + "joc-de-teren": "wide-games", + "big-games": "wide-games", + # orientare + "orienteering": "orientare", + "navigatie": "orientare", + # prim ajutor + "first-aid": "prim-ajutor", + "primul-ajutor": "prim-ajutor", + # escape room / puzzle + "escape-room": "escape-room-puzzle", + "escaperoom": "escape-room-puzzle", + "puzzle": "escape-room-puzzle", + "puzzles": "escape-room-puzzle", + "ghicitori": "escape-room-puzzle", + # creative / stem + "creative": "creative-stem", + "creativitate": "creative-stem", + "stem": "creative-stem", + "arts-and-crafts": "creative-stem", + "craft": "creative-stem", + "crafts": "creative-stem", + "stiinta": "creative-stem", + # sports + "sport": "sports-active", + "sports": "sports-active", + "sportive": "sports-active", + "active": "sports-active", + "miscare": "sports-active", + "physical": "sports-active", + # songs / ceremonies + "cantece": "cantece-ceremonii", + "cantec": "cantece-ceremonii", + "songs": "cantece-ceremonii", + "ceremonii": "cantece-ceremonii", + "ceremonie": 
"cantece-ceremonii", + "ceremony": "cantece-ceremonii", + # recipes + "reteta": "retete", + "recipe": "retete", + "recipes": "retete", + "cooking": "retete", + "gatit": "retete", + # survival + "survival": "supravietuire", + "supravietuire": "supravietuire", + # inclusion + "integrare": "integrare-incluziune", + "incluziune": "integrare-incluziune", + "inclusion": "integrare-incluziune", + # conflict / empathy + "conflict": "conflict-empatie", + "empatie": "conflict-empatie", + "empathy": "conflict-empatie", + "rezolvarea-conflictelor": "conflict-empatie", + # fallback + "altele": "altele", + "other": "altele", + "others": "altele", + "misc": "altele", +} + + +def _slugify(value: str) -> str: + """Lowercase, strip diacritics, collapse non-alphanumerics to hyphens.""" + if not value: + return "" + # Decompose accents (ă -> a, ș -> s, ț -> t, etc.) + decomposed = unicodedata.normalize("NFKD", value) + ascii_str = "".join(c for c in decomposed if not unicodedata.combining(c)) + ascii_str = ascii_str.lower().strip() + ascii_str = re.sub(r"[^a-z0-9]+", "-", ascii_str) + return ascii_str.strip("-") + + +def normalize_category(value: str) -> str: + """Map an arbitrary string to a valid category slug. + + Returns one of CATEGORY_SLUGS, falling back to `altele` for anything + unrecognised or empty. + """ + if not value: + return FALLBACK_CATEGORY + slug = _slugify(str(value)) + if not slug: + return FALLBACK_CATEGORY + # Exact slug match. + if slug in CATEGORIES: + return slug + # Alias match. + if slug in _CATEGORY_ALIASES: + return _CATEGORY_ALIASES[slug] + return FALLBACK_CATEGORY + + +def normalize_content_type(value: str) -> str: + """Map an arbitrary string to a valid content_type slug. + + Returns one of CONTENT_TYPE_SLUGS, falling back to `activitate`. + """ + if not value: + return DEFAULT_CONTENT_TYPE + slug = _slugify(str(value)) + if slug in CONTENT_TYPES: + return slug + # Light alias handling for plural / English forms. 
+ aliases = { + "jocuri": "joc", + "game": "joc", + "games": "joc", + "activitati": "activitate", + "activity": "activitate", + "retete": "reteta", + "recipe": "reteta", + "cantece": "cantec", + "song": "cantec", + "ceremonii": "ceremonie", + "ceremony": "ceremonie", + } + return aliases.get(slug, DEFAULT_CONTENT_TYPE) + + +def is_valid_category(slug: str) -> bool: + """True if `slug` is a valid category slug.""" + return slug in CATEGORIES + + +def category_display_name(slug: str) -> str: + """Romanian display name for a slug (fallback to the slug itself).""" + return CATEGORIES.get(slug, slug) + + +def content_type_display_name(slug: str) -> str: + """Romanian display name for a content_type slug.""" + return CONTENT_TYPES.get(slug, slug) diff --git a/app/models/activity.py b/app/models/activity.py index d28f76b..b2bbf18 100644 --- a/app/models/activity.py +++ b/app/models/activity.py @@ -5,6 +5,22 @@ Activity data model for INDEX-SISTEM-JOCURI v2.0 from dataclasses import dataclass, field from typing import List, Optional, Dict, Any import json +import re +import unicodedata + + +def normalize_name(name: str) -> str: + """Diacritic-free, lowercased, whitespace-collapsed form of a name. + + Used as the exact-match key for dedup grouping (see plan §4). + """ + if not name: + return "" + decomposed = unicodedata.normalize("NFKD", name) + ascii_str = "".join(c for c in decomposed if not unicodedata.combining(c)) + ascii_str = ascii_str.lower().strip() + ascii_str = re.sub(r"\s+", " ", ascii_str) + return ascii_str @dataclass class Activity: @@ -19,10 +35,19 @@ class Activity: # Categories category: str = "" subcategory: Optional[str] = None - + # content_type is an axis INDEPENDENT of category: + # one of joc/activitate/reteta/cantec/ceremonie (see config_taxonomy). + content_type: Optional[str] = None + # Source information source_file: str = "" page_reference: Optional[str] = None + # source_files: JSON-encoded list of every source the activity was seen in. 
+ # `source_file` (singular) stays as the primary/original source; build_database + # (Lane C) accumulates the full list here on dedup-merge. + source_files: List[str] = field(default_factory=list) + # Short verbatim quote from the source — anti-hallucination anchor. + source_excerpt: Optional[str] = None # Age and participants age_group_min: Optional[int] = None @@ -44,11 +69,22 @@ class Activity: keywords: Optional[str] = None tags: List[str] = field(default_factory=list) popularity_score: int = 0 - + + # Extraction / language metadata + language: Optional[str] = None # 'ro' / 'en' + normalized_name: Optional[str] = None # dedup key; auto-derived from name + extraction_confidence: Optional[str] = None # 'high' / 'med' / 'low' + needs_review: int = 0 + # Database fields id: Optional[int] = None created_at: Optional[str] = None updated_at: Optional[str] = None + + def __post_init__(self): + """Derive normalized_name from name when not explicitly provided.""" + if not self.normalized_name: + self.normalized_name = normalize_name(self.name) def to_dict(self) -> Dict[str, Any]: """Convert activity to dictionary for database storage""" @@ -59,8 +95,11 @@ class Activity: 'variations': self.variations, 'category': self.category, 'subcategory': self.subcategory, + 'content_type': self.content_type, 'source_file': self.source_file, + 'source_files': json.dumps(self.source_files) if self.source_files else None, 'page_reference': self.page_reference, + 'source_excerpt': self.source_excerpt, 'age_group_min': self.age_group_min, 'age_group_max': self.age_group_max, 'participants_min': self.participants_min, @@ -73,7 +112,11 @@ class Activity: 'difficulty_level': self.difficulty_level, 'keywords': self.keywords, 'tags': json.dumps(self.tags) if self.tags else None, - 'popularity_score': self.popularity_score + 'popularity_score': self.popularity_score, + 'language': self.language, + 'normalized_name': self.normalized_name or normalize_name(self.name), + 'extraction_confidence': 
self.extraction_confidence, + 'needs_review': self.needs_review, } @classmethod @@ -86,7 +129,17 @@ class Activity: tags = json.loads(data['tags']) except (json.JSONDecodeError, TypeError): tags = [] - + + # source_files may arrive as a JSON string (DB) or a list (extraction) + source_files = data.get('source_files') + if isinstance(source_files, str): + try: + source_files = json.loads(source_files) + except (json.JSONDecodeError, TypeError): + source_files = [] + elif source_files is None: + source_files = [] + return cls( id=data.get('id'), name=data.get('name', ''), @@ -95,8 +148,11 @@ class Activity: variations=data.get('variations'), category=data.get('category', ''), subcategory=data.get('subcategory'), + content_type=data.get('content_type'), source_file=data.get('source_file', ''), + source_files=source_files, page_reference=data.get('page_reference'), + source_excerpt=data.get('source_excerpt'), age_group_min=data.get('age_group_min'), age_group_max=data.get('age_group_max'), participants_min=data.get('participants_min'), @@ -110,6 +166,10 @@ class Activity: keywords=data.get('keywords'), tags=tags, popularity_score=data.get('popularity_score', 0), + language=data.get('language'), + normalized_name=data.get('normalized_name'), + extraction_confidence=data.get('extraction_confidence'), + needs_review=data.get('needs_review', 0) or 0, created_at=data.get('created_at'), updated_at=data.get('updated_at') ) diff --git a/app/models/database.py b/app/models/database.py index 93524d4..816c403 100644 --- a/app/models/database.py +++ b/app/models/database.py @@ -30,6 +30,8 @@ class DatabaseManager: """Initialize database with v2.0 schema""" with self._get_connection() as conn: # Main activities table + # NOTE: schema is rebuilt from scratch (plan §6) — no in-place + # migration. The old DB is deleted and recreated by build_database. 
conn.execute(""" CREATE TABLE IF NOT EXISTS activities ( id INTEGER PRIMARY KEY AUTOINCREMENT, @@ -39,9 +41,12 @@ class DatabaseManager: variations TEXT, category TEXT NOT NULL, subcategory TEXT, + content_type TEXT, source_file TEXT NOT NULL, + source_files TEXT, page_reference TEXT, - + source_excerpt TEXT, + -- Structured parameters age_group_min INTEGER, age_group_max INTEGER, @@ -49,26 +54,34 @@ class DatabaseManager: participants_max INTEGER, duration_min INTEGER, duration_max INTEGER, - + -- Categories for filtering materials_category TEXT, materials_list TEXT, skills_developed TEXT, difficulty_level TEXT, - + -- Metadata keywords TEXT, tags TEXT, popularity_score INTEGER DEFAULT 0, + + -- Extraction / language metadata + language TEXT, + normalized_name TEXT, + extraction_confidence TEXT, + needs_review INTEGER DEFAULT 0, + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ) """) - + # FTS5 virtual table for search conn.execute(""" CREATE VIRTUAL TABLE IF NOT EXISTS activities_fts USING fts5( name, description, rules, variations, keywords, + materials_list, skills_developed, content='activities', content_rowid='id' ) @@ -92,6 +105,7 @@ class DatabaseManager: "CREATE INDEX IF NOT EXISTS idx_activities_age ON activities(age_group_min, age_group_max)", "CREATE INDEX IF NOT EXISTS idx_activities_participants ON activities(participants_min, participants_max)", "CREATE INDEX IF NOT EXISTS idx_activities_duration ON activities(duration_min, duration_max)", + "CREATE INDEX IF NOT EXISTS idx_activities_normalized_name ON activities(normalized_name)", "CREATE INDEX IF NOT EXISTS idx_categories_type ON categories(type)" ] @@ -102,24 +116,34 @@ class DatabaseManager: conn.execute(""" CREATE TRIGGER IF NOT EXISTS activities_fts_insert AFTER INSERT ON activities BEGIN - INSERT INTO activities_fts(rowid, name, description, rules, variations, keywords) - VALUES (new.id, new.name, new.description, new.rules, new.variations, 
new.keywords); + INSERT INTO activities_fts(rowid, name, description, rules, variations, + keywords, materials_list, skills_developed) + VALUES (new.id, new.name, new.description, new.rules, new.variations, + new.keywords, new.materials_list, new.skills_developed); END """) - + conn.execute(""" CREATE TRIGGER IF NOT EXISTS activities_fts_delete AFTER DELETE ON activities BEGIN - DELETE FROM activities_fts WHERE rowid = old.id; + INSERT INTO activities_fts(activities_fts, rowid, name, description, rules, + variations, keywords, materials_list, skills_developed) + VALUES ('delete', old.id, old.name, old.description, old.rules, + old.variations, old.keywords, old.materials_list, old.skills_developed); END """) - + conn.execute(""" CREATE TRIGGER IF NOT EXISTS activities_fts_update AFTER UPDATE ON activities BEGIN - DELETE FROM activities_fts WHERE rowid = old.id; - INSERT INTO activities_fts(rowid, name, description, rules, variations, keywords) - VALUES (new.id, new.name, new.description, new.rules, new.variations, new.keywords); + INSERT INTO activities_fts(activities_fts, rowid, name, description, rules, + variations, keywords, materials_list, skills_developed) + VALUES ('delete', old.id, old.name, old.description, old.rules, + old.variations, old.keywords, old.materials_list, old.skills_developed); + INSERT INTO activities_fts(rowid, name, description, rules, variations, + keywords, materials_list, skills_developed) + VALUES (new.id, new.name, new.description, new.rules, new.variations, + new.keywords, new.materials_list, new.skills_developed); END """) @@ -179,6 +203,8 @@ class DatabaseManager: """Update category usage counts""" categories_to_update = [ ('category', activity.category), + ('content_type', activity.content_type), + ('language', activity.language), ('age_group', activity.get_age_range_display()), ('participants', activity.get_participants_display()), ('duration', activity.get_duration_display()), @@ -332,8 +358,11 @@ class DatabaseManager: def 
clear_database(self): """Clear all data from database""" with self._get_connection() as conn: + # Deleting from activities fires the delete trigger, which removes + # the matching FTS rows. The explicit 'delete-all' command then + # guarantees the external-content FTS index is fully cleared. conn.execute("DELETE FROM activities") - conn.execute("DELETE FROM activities_fts") + conn.execute("INSERT INTO activities_fts(activities_fts) VALUES('delete-all')") conn.execute("DELETE FROM categories") conn.commit() diff --git a/app/services/__init__.py b/app/services/__init__.py index 38de191..36492a0 100644 --- a/app/services/__init__.py +++ b/app/services/__init__.py @@ -2,8 +2,6 @@ Services for INDEX-SISTEM-JOCURI v2.0 """ -from .parser import IndexMasterParser -from .indexer import ActivityIndexer from .search import SearchService -__all__ = ['IndexMasterParser', 'ActivityIndexer', 'SearchService'] \ No newline at end of file +__all__ = ['SearchService'] diff --git a/app/services/indexer.py b/app/services/indexer.py deleted file mode 100644 index ba9cd96..0000000 --- a/app/services/indexer.py +++ /dev/null @@ -1,248 +0,0 @@ -""" -Activity indexer service for INDEX-SISTEM-JOCURI v2.0 -Coordinates parsing and database indexing -""" - -from typing import List, Dict, Any -from pathlib import Path -from app.models.database import DatabaseManager -from app.models.activity import Activity -from app.services.parser import IndexMasterParser -import time - -class ActivityIndexer: - """Service for indexing activities from INDEX_MASTER into database""" - - def __init__(self, db_manager: DatabaseManager, index_master_path: str): - """Initialize indexer with database manager and INDEX_MASTER path""" - self.db = db_manager - self.parser = IndexMasterParser(index_master_path) - self.indexing_stats = {} - - def index_all_activities(self, clear_existing: bool = False) -> Dict[str, Any]: - """Index all activities from INDEX_MASTER into database""" - - print("🚀 Starting activity indexing 
process...") - start_time = time.time() - - # Clear existing data if requested - if clear_existing: - print("🗑️ Clearing existing database...") - self.db.clear_database() - - # Parse activities from INDEX_MASTER - print("📖 Parsing INDEX_MASTER file...") - activities = self.parser.parse_all_categories() - - if not activities: - print("❌ No activities were parsed!") - return {'success': False, 'error': 'No activities parsed'} - - # Filter valid activities - valid_activities = [] - for activity in activities: - if self.parser.validate_activity_completeness(activity): - valid_activities.append(activity) - else: - print(f"⚠️ Skipping incomplete activity: {activity.name[:50]}...") - - print(f"✅ Validated {len(valid_activities)} activities out of {len(activities)} parsed") - - if len(valid_activities) < 100: - print(f"⚠️ Warning: Only {len(valid_activities)} valid activities found. Expected 500+") - - # Bulk insert into database - print("💾 Inserting activities into database...") - try: - inserted_count = self.db.bulk_insert_activities(valid_activities) - - # Rebuild FTS index for optimal search performance - print("🔍 Rebuilding search index...") - self.db.rebuild_fts_index() - - end_time = time.time() - indexing_time = end_time - start_time - - # Generate final statistics (with error handling) - try: - stats = self._generate_indexing_stats(valid_activities, indexing_time) - stats['inserted_count'] = inserted_count - stats['success'] = True - except Exception as e: - print(f"⚠️ Error generating statistics: {e}") - stats = { - 'success': True, - 'inserted_count': inserted_count, - 'indexing_time_seconds': indexing_time, - 'error': f'Stats generation failed: {str(e)}' - } - - print(f"✅ Indexing complete! 
{inserted_count} activities indexed in {indexing_time:.2f}s") - - # Verify database state (with error handling) - try: - db_stats = self.db.get_statistics() - print(f"📊 Database now contains {db_stats['total_activities']} activities") - except Exception as e: - print(f"⚠️ Error getting database statistics: {e}") - print(f"📊 Database insertion completed, statistics unavailable") - - return stats - - except Exception as e: - print(f"❌ Error during database insertion: {e}") - return {'success': False, 'error': str(e)} - - def index_specific_category(self, category_code: str) -> Dict[str, Any]: - """Index activities from a specific category only""" - - print(f"🎯 Indexing specific category: {category_code}") - - # Load content and parse specific category - if not self.parser.load_content(): - return {'success': False, 'error': 'Could not load INDEX_MASTER'} - - category_name = self.parser.category_mapping.get(category_code) - if not category_name: - return {'success': False, 'error': f'Unknown category code: {category_code}'} - - activities = self.parser.parse_category_section(category_code, category_name) - - if not activities: - return {'success': False, 'error': f'No activities found in category {category_code}'} - - # Filter valid activities - valid_activities = [a for a in activities if self.parser.validate_activity_completeness(a)] - - try: - inserted_count = self.db.bulk_insert_activities(valid_activities) - return { - 'success': True, - 'category': category_name, - 'inserted_count': inserted_count, - 'total_parsed': len(activities), - 'valid_activities': len(valid_activities) - } - except Exception as e: - return {'success': False, 'error': str(e)} - - def _generate_indexing_stats(self, activities: List[Activity], indexing_time: float) -> Dict[str, Any]: - """Generate comprehensive indexing statistics""" - - # Get parser statistics - parser_stats = self.parser.get_parsing_statistics() - - # Calculate additional metrics - categories = {} - age_ranges = {} - 
durations = {} - materials = {} - - for activity in activities: - # Category breakdown - if activity.category in categories: - categories[activity.category] += 1 - else: - categories[activity.category] = 1 - - # Age range analysis (with safety check) - try: - age_key = activity.get_age_range_display() or "nespecificat" - age_ranges[age_key] = age_ranges.get(age_key, 0) + 1 - except Exception as e: - print(f"Warning: Error getting age range for activity {activity.name}: {e}") - age_ranges["nespecificat"] = age_ranges.get("nespecificat", 0) + 1 - - # Duration analysis (with safety check) - try: - duration_key = activity.get_duration_display() or "nespecificat" - durations[duration_key] = durations.get(duration_key, 0) + 1 - except Exception as e: - print(f"Warning: Error getting duration for activity {activity.name}: {e}") - durations["nespecificat"] = durations.get("nespecificat", 0) + 1 - - # Materials analysis (with safety check) - try: - materials_key = activity.get_materials_display() or "nespecificat" - materials[materials_key] = materials.get(materials_key, 0) + 1 - except Exception as e: - print(f"Warning: Error getting materials for activity {activity.name}: {e}") - materials["nespecificat"] = materials.get("nespecificat", 0) + 1 - - return { - 'indexing_time_seconds': indexing_time, - 'parsing_stats': parser_stats, - 'distribution': { - 'categories': categories, - 'age_ranges': age_ranges, - 'durations': durations, - 'materials': materials - }, - 'quality_metrics': { - 'completion_rate': parser_stats.get('completion_rate', 0), - 'average_description_length': parser_stats.get('average_description_length', 0), - 'activities_with_metadata': sum(1 for a in activities if a.age_group_min or a.participants_min or a.duration_min) - } - } - - def verify_indexing_quality(self) -> Dict[str, Any]: - """Verify the quality of indexed data""" - - try: - # Get database statistics - db_stats = self.db.get_statistics() - - # Check for minimum activity count - 
total_activities = db_stats['total_activities'] - meets_minimum = total_activities >= 500 - - # Check category distribution - categories = db_stats.get('categories', {}) - category_coverage = len(categories) - - # Sample some activities to check quality - sample_activities = self.db.search_activities(limit=10) - - quality_issues = [] - for activity in sample_activities: - if not activity.get('description') or len(activity['description']) < 10: - quality_issues.append(f"Activity {activity.get('name', 'Unknown')} has insufficient description") - - if not activity.get('category'): - quality_issues.append(f"Activity {activity.get('name', 'Unknown')} missing category") - - return { - 'total_activities': total_activities, - 'meets_minimum_requirement': meets_minimum, - 'minimum_target': 500, - 'category_coverage': category_coverage, - 'expected_categories': len(self.parser.category_mapping), - 'quality_issues': quality_issues, - 'quality_score': max(0, 100 - len(quality_issues) * 10), - 'database_stats': db_stats - } - - except Exception as e: - return {'error': str(e), 'quality_score': 0} - - def get_indexing_progress(self) -> Dict[str, Any]: - """Get current indexing progress and status""" - try: - db_stats = self.db.get_statistics() - - # Calculate progress towards 500+ activities goal - total_activities = db_stats['total_activities'] - target_activities = 500 - progress_percentage = min(100, (total_activities / target_activities) * 100) - - return { - 'current_activities': total_activities, - 'target_activities': target_activities, - 'progress_percentage': progress_percentage, - 'status': 'completed' if total_activities >= target_activities else 'in_progress', - 'categories_indexed': list(db_stats.get('categories', {}).keys()), - 'database_size_mb': db_stats.get('database_size_bytes', 0) / (1024 * 1024) - } - - except Exception as e: - return {'error': str(e), 'status': 'error'} \ No newline at end of file diff --git a/app/services/parser.py b/app/services/parser.py 
deleted file mode 100644 index e086248..0000000 --- a/app/services/parser.py +++ /dev/null @@ -1,340 +0,0 @@ -""" -Advanced parser for INDEX_MASTER_JOCURI_ACTIVITATI.md -Extracts 500+ individual activities with full details -""" - -import re -from pathlib import Path -from typing import List, Dict, Optional, Tuple -from app.models.activity import Activity - -class IndexMasterParser: - """Advanced parser for extracting real activities from INDEX_MASTER""" - - def __init__(self, index_file_path: str): - """Initialize parser with INDEX_MASTER file path""" - self.index_file_path = Path(index_file_path) - self.content = "" - self.activities = [] - - # Category mapping for main sections (exact match from file) - self.category_mapping = { - '[A]': 'JOCURI CERCETĂȘEȘTI ȘI SCOUT', - '[B]': 'TEAM BUILDING ȘI COMUNICARE', - '[C]': 'CAMPING ȘI ACTIVITĂȚI EXTERIOR', - '[D]': 'ESCAPE ROOM ȘI PUZZLE-URI', - '[E]': 'ORIENTARE ȘI BUSOLE', - '[F]': 'PRIMUL AJUTOR ȘI SIGURANȚA', - '[G]': 'ACTIVITĂȚI EDUCAȚIONALE', - '[H]': 'RESURSE SPECIALE' - } - - def load_content(self) -> bool: - """Load and validate INDEX_MASTER content""" - try: - if not self.index_file_path.exists(): - print(f"❌ INDEX_MASTER file not found: {self.index_file_path}") - return False - - with open(self.index_file_path, 'r', encoding='utf-8') as f: - self.content = f.read() - - if len(self.content) < 1000: # Sanity check - print(f"⚠️ INDEX_MASTER file seems too small: {len(self.content)} chars") - return False - - print(f"✅ Loaded INDEX_MASTER: {len(self.content)} characters") - return True - - except Exception as e: - print(f"❌ Error loading INDEX_MASTER: {e}") - return False - - def parse_all_categories(self) -> List[Activity]: - """Parse all categories and extract individual activities""" - if not self.load_content(): - return [] - - print("🔍 Starting comprehensive parsing of INDEX_MASTER...") - - # Parse each main category - for category_code, category_name in self.category_mapping.items(): - print(f"\n📂 
Processing category {category_code}: {category_name}") - category_activities = self.parse_category_section(category_code, category_name) - self.activities.extend(category_activities) - print(f" ✅ Extracted {len(category_activities)} activities") - - print(f"\n🎯 Total activities extracted: {len(self.activities)}") - return self.activities - - def parse_category_section(self, category_code: str, category_name: str) -> List[Activity]: - """Parse a specific category section""" - activities = [] - - # Find the category section - exact pattern match - # Look for the actual section, not the table of contents - pattern = rf"^## {re.escape(category_code)} {re.escape(category_name)}\s*$" - matches = list(re.finditer(pattern, self.content, re.MULTILINE | re.IGNORECASE)) - - if not matches: - print(f" ⚠️ Category section not found: {category_code}") - return activities - - # Take the last match (should be the actual section, not TOC) - match = matches[-1] - print(f" 📍 Found section at position {match.start()}") - - # Extract content until next main category or end - start_pos = match.end() - - # Find next main category (look for complete header) - next_category_pattern = r"^## \[[A-H]\] [A-ZĂÂÎȘȚ]" - next_match = re.search(next_category_pattern, self.content[start_pos:], re.MULTILINE) - - if next_match: - end_pos = start_pos + next_match.start() - section_content = self.content[start_pos:end_pos] - else: - section_content = self.content[start_pos:] - - # Parse subsections within the category - activities.extend(self._parse_subsections(section_content, category_name)) - - return activities - - def _parse_subsections(self, section_content: str, category_name: str) -> List[Activity]: - """Parse subsections within a category""" - activities = [] - - # Find all subsections (### markers) - subsection_pattern = r"^### (.+?)$" - subsections = re.finditer(subsection_pattern, section_content, re.MULTILINE) - - subsection_list = list(subsections) - - for i, subsection in 
enumerate(subsection_list): - subsection_title = subsection.group(1).strip() - subsection_start = subsection.end() - - # Find end of subsection - if i + 1 < len(subsection_list): - subsection_end = subsection_list[i + 1].start() - else: - subsection_end = len(section_content) - - subsection_text = section_content[subsection_start:subsection_end] - - # Parse individual games in this subsection - subsection_activities = self._parse_games_in_subsection( - subsection_text, category_name, subsection_title - ) - activities.extend(subsection_activities) - - return activities - - def _parse_games_in_subsection(self, subsection_text: str, category_name: str, subsection_title: str) -> List[Activity]: - """Parse individual games within a subsection""" - activities = [] - - # Look for "Exemple de jocuri:" sections - examples_pattern = r"\*\*Exemple de jocuri:\*\*\s*\n(.*?)(?=\n\*\*|$)" - examples_matches = re.finditer(examples_pattern, subsection_text, re.DOTALL) - - for examples_match in examples_matches: - examples_text = examples_match.group(1) - - # Extract individual games (numbered list) - game_pattern = r"^(\d+)\.\s*\*\*(.+?)\*\*\s*-\s*(.+?)$" - games = re.finditer(game_pattern, examples_text, re.MULTILINE) - - for game_match in games: - game_number = game_match.group(1) - game_name = game_match.group(2).strip() - game_description = game_match.group(3).strip() - - # Extract metadata from subsection - metadata = self._extract_subsection_metadata(subsection_text) - - # Create activity - activity = Activity( - name=game_name, - description=game_description, - category=category_name, - subcategory=subsection_title, - source_file=f"INDEX_MASTER_JOCURI_ACTIVITATI.md", - page_reference=f"{category_name} > {subsection_title} > #{game_number}", - **metadata - ) - - activities.append(activity) - - # Also extract from direct activity descriptions without "Exemple de jocuri" - activities.extend(self._parse_direct_activities(subsection_text, category_name, subsection_title)) - - 
return activities - - def _extract_subsection_metadata(self, subsection_text: str) -> Dict: - """Extract metadata from subsection text""" - metadata = {} - - # Extract participants info - participants_pattern = r"\*\*Participanți:\*\*\s*(.+?)(?:\n|\*\*)" - participants_match = re.search(participants_pattern, subsection_text) - if participants_match: - participants_text = participants_match.group(1).strip() - participants = self._parse_participants(participants_text) - metadata.update(participants) - - # Extract duration - duration_pattern = r"\*\*Durata:\*\*\s*(.+?)(?:\n|\*\*)" - duration_match = re.search(duration_pattern, subsection_text) - if duration_match: - duration_text = duration_match.group(1).strip() - duration = self._parse_duration(duration_text) - metadata.update(duration) - - # Extract materials - materials_pattern = r"\*\*Materiale:\*\*\s*(.+?)(?:\n|\*\*)" - materials_match = re.search(materials_pattern, subsection_text) - if materials_match: - materials_text = materials_match.group(1).strip() - metadata['materials_list'] = materials_text - metadata['materials_category'] = self._categorize_materials(materials_text) - - # Extract keywords - keywords_pattern = r"\*\*Cuvinte cheie:\*\*\s*(.+?)(?:\n|\*\*)" - keywords_match = re.search(keywords_pattern, subsection_text) - if keywords_match: - metadata['keywords'] = keywords_match.group(1).strip() - - return metadata - - def _parse_participants(self, participants_text: str) -> Dict: - """Parse participants information""" - result = {} - - # Look for number ranges like "8-30 copii" or "5-15 persoane" - range_pattern = r"(\d+)-(\d+)" - range_match = re.search(range_pattern, participants_text) - - if range_match: - result['participants_min'] = int(range_match.group(1)) - result['participants_max'] = int(range_match.group(2)) - else: - # Look for single numbers - number_pattern = r"(\d+)\+" - number_match = re.search(number_pattern, participants_text) - if number_match: - result['participants_min'] = 
int(number_match.group(1)) - - # Extract age information - age_pattern = r"(\d+)-(\d+)\s*ani" - age_match = re.search(age_pattern, participants_text) - if age_match: - result['age_group_min'] = int(age_match.group(1)) - result['age_group_max'] = int(age_match.group(2)) - - return result - - def _parse_duration(self, duration_text: str) -> Dict: - """Parse duration information""" - result = {} - - # Look for time ranges like "5-20 minute" or "15-30min" - range_pattern = r"(\d+)-(\d+)\s*(?:minute|min)" - range_match = re.search(range_pattern, duration_text) - - if range_match: - result['duration_min'] = int(range_match.group(1)) - result['duration_max'] = int(range_match.group(2)) - else: - # Look for single duration - single_pattern = r"(\d+)\+?\s*(?:minute|min)" - single_match = re.search(single_pattern, duration_text) - if single_match: - result['duration_min'] = int(single_match.group(1)) - - return result - - def _categorize_materials(self, materials_text: str) -> str: - """Categorize materials into simple categories""" - materials_lower = materials_text.lower() - - if any(word in materials_lower for word in ['fără', 'nu necesare', 'nimic', 'minime']): - return 'Fără materiale' - elif any(word in materials_lower for word in ['hârtie', 'creion', 'marker', 'simple']): - return 'Materiale simple' - elif any(word in materials_lower for word in ['computer', 'proiector', 'echipament', 'complexe']): - return 'Materiale complexe' - else: - return 'Materiale variate' - - def _parse_direct_activities(self, subsection_text: str, category_name: str, subsection_title: str) -> List[Activity]: - """Parse activities that are described directly without 'Exemple de jocuri' section""" - activities = [] - - # Look for activity descriptions in sections that don't have "Exemple de jocuri" - if "**Exemple de jocuri:**" not in subsection_text: - # Try to extract from file descriptions - file_pattern = r"\*\*Fișier:\*\*\s*`([^`]+)`.*?\*\*(.+?)\*\*" - file_matches = 
re.finditer(file_pattern, subsection_text, re.DOTALL) - - for file_match in file_matches: - file_name = file_match.group(1) - description_part = file_match.group(2) - - # Create a general activity for this file - activity = Activity( - name=f"Activități din {file_name}", - description=f"Colecție de activități din fișierul {file_name}. {description_part[:200]}...", - category=category_name, - subcategory=subsection_title, - source_file=file_name, - page_reference=f"{category_name} > {subsection_title}", - **self._extract_subsection_metadata(subsection_text) - ) - - activities.append(activity) - - return activities - - def validate_activity_completeness(self, activity: Activity) -> bool: - """Validate that an activity has all necessary fields""" - required_fields = ['name', 'description', 'category', 'source_file'] - - for field in required_fields: - if not getattr(activity, field) or not getattr(activity, field).strip(): - return False - - # Check minimum description length - if len(activity.description) < 10: - return False - - return True - - def get_parsing_statistics(self) -> Dict: - """Get statistics about the parsing process""" - if not self.activities: - return {'total_activities': 0} - - category_counts = {} - valid_activities = 0 - - for activity in self.activities: - # Count by category - if activity.category in category_counts: - category_counts[activity.category] += 1 - else: - category_counts[activity.category] = 1 - - # Count valid activities - if self.validate_activity_completeness(activity): - valid_activities += 1 - - return { - 'total_activities': len(self.activities), - 'valid_activities': valid_activities, - 'completion_rate': (valid_activities / len(self.activities)) * 100 if self.activities else 0, - 'category_breakdown': category_counts, - 'average_description_length': sum(len(a.description) for a in self.activities) / len(self.activities) if self.activities else 0 - } \ No newline at end of file diff --git a/app/services/search.py 
b/app/services/search.py index a41857a..2a64261 100644 --- a/app/services/search.py +++ b/app/services/search.py @@ -5,8 +5,19 @@ Enhanced search with FTS5 and intelligent filtering from typing import List, Dict, Any, Optional from app.models.database import DatabaseManager +from app.config_taxonomy import NON_GAME_CONTENT_TYPES import re +# Category slugs that are themselves "non-game" — selecting one of these as a +# category filter also lifts the default non-game content_type exclusion. +NON_GAME_CATEGORIES = {"retete", "cantece-ceremonii"} + +# When a Python-side post-filter is active the DB LIMIT is applied *before* +# filtering, so we over-fetch to still satisfy the caller's `limit`. +_OVERSCAN_FACTOR = 5 +_OVERSCAN_CAP = 2000 + + class SearchService: """Enhanced search service with intelligent query processing""" @@ -24,22 +35,72 @@ class SearchService: if filters is None: filters = {} - + # Process and normalize search text processed_search = self._process_search_text(search_text) - + # Map web filters to database fields db_filters = self._map_filters_to_db_fields(filters) - + + # content_type and language are filtered in Python: the DB layer does + # not expose them as query parameters. The DEFAULT search excludes the + # non-game content types (rețete / cântece / ceremonii) — they surface + # only when the user explicitly filters that content_type, or picks a + # non-game category. See plan §6. + content_type, exclude_non_game = self._resolve_content_type_filter(filters) + language = (filters.get('language') or '').strip().lower() or None + post_filtering = bool(content_type or exclude_non_game or language) + + # Over-fetch when post-filtering so the final list can still reach `limit`. 
+ fetch_limit = min(limit * _OVERSCAN_FACTOR, _OVERSCAN_CAP) if post_filtering else limit + # Perform database search results = self.db.search_activities( search_text=processed_search, **db_filters, - limit=limit + limit=fetch_limit ) - - # Post-process results for relevance and ranking - return self._post_process_results(results, processed_search, filters) + + # Apply content_type / language post-filters + results = self._apply_content_type_filter(results, content_type, exclude_non_game) + if language: + results = [r for r in results + if (r.get('language') or '').strip().lower() == language] + + # Post-process results for relevance and ranking, then honour `limit` + results = self._post_process_results(results, processed_search, filters) + return results[:limit] + + def _resolve_content_type_filter(self, filters: Dict[str, str]): + """Determine the content_type post-filter. + + Returns (explicit_content_type | None, exclude_non_game: bool): + - an explicit `content_type` filter → that value, no exclusion; + - a `category` filter on a non-game category → no exclusion; + - otherwise → default search, exclude non-game content types. + """ + content_type = (filters.get('content_type') or '').strip() + if content_type: + return content_type, False + category = (filters.get('category') or '').strip() + if category in NON_GAME_CATEGORIES: + return None, False + return None, True + + def _apply_content_type_filter(self, + results: List[Dict[str, Any]], + content_type: Optional[str], + exclude_non_game: bool) -> List[Dict[str, Any]]: + """Filter results by content_type (explicit include vs default exclude).""" + if content_type: + return [r for r in results + if (r.get('content_type') or '') == content_type] + if exclude_non_game: + # Rows with NULL/unknown content_type are kept — only the known + # non-game types are dropped from the default search. 
+ return [r for r in results + if (r.get('content_type') or '') not in NON_GAME_CONTENT_TYPES] + return results def _process_search_text(self, search_text: Optional[str]) -> Optional[str]: """Process and enhance search text for better FTS5 results""" @@ -83,10 +144,16 @@ class SearchService: if not filter_value or not filter_value.strip(): continue + # content_type / language are NOT database query params — they are + # applied as Python post-filters in search_activities(). Skip them + # here so they never reach DatabaseManager.search_activities(). + if filter_key in ('content_type', 'language'): + continue + # Map filter types to database fields if filter_key == 'category': db_filters['category'] = filter_value - + elif filter_key == 'age_group': # Parse age range (e.g., "5-8 ani", "12+ ani") age_match = re.search(r'(\d+)(?:-(\d+))?\s*ani?', filter_value) @@ -177,21 +244,22 @@ class SearchService: boost_score = 0 # Check name matches (highest priority) - name_lower = result.get('name', '').lower() + # NB: use `or ''` — nullable columns come back as None, not ''. + name_lower = (result.get('name') or '').lower() for term in search_terms: if term in name_lower: boost_score += 10 if name_lower.startswith(term): boost_score += 5 # Extra boost for name starts with term - + # Check description matches - desc_lower = result.get('description', '').lower() + desc_lower = (result.get('description') or '').lower() for term in search_terms: if term in desc_lower: boost_score += 3 - + # Check keywords matches - keywords_lower = result.get('keywords', '').lower() + keywords_lower = (result.get('keywords') or '').lower() for term in search_terms: if term in keywords_lower: boost_score += 5 @@ -280,11 +348,14 @@ class SearchService: return [] try: - # Search for activities that match the partial query + # Search for activities that match the partial query. 
+ # Over-fetch then drop non-game content types so autocomplete + # mirrors the default search (no rețete / cântece / ceremonii). results = self.db.search_activities( search_text=f'"{partial_query}"', - limit=limit * 2 + limit=limit * 6 ) + results = self._apply_content_type_filter(results, None, True) suggestions = [] seen = set() diff --git a/app/templates/activity.html b/app/templates/activity.html index 6e25f08..d865f0a 100644 --- a/app/templates/activity.html +++ b/app/templates/activity.html @@ -15,7 +15,13 @@

{{ activity.name }}

- {{ activity.category }} + {{ display_names.get(activity.category, activity.category) }} + {% if activity.content_type %} + {{ display_names.get(activity.content_type, activity.content_type) }} + {% endif %} + {% if activity.needs_review %} + ⚠ De verificat + {% endif %}
{% if activity.subcategory %} diff --git a/app/templates/index.html b/app/templates/index.html index 8809c15..7baffeb 100644 --- a/app/templates/index.html +++ b/app/templates/index.html @@ -36,7 +36,31 @@ + + {% endif %} + + {% if filters.content_type %} +
+ + +
+ {% endif %} + + {% if filters.language %} +
+ +
diff --git a/app/templates/results.html b/app/templates/results.html index fa835cb..f06166d 100644 --- a/app/templates/results.html +++ b/app/templates/results.html @@ -24,7 +24,29 @@ {% for category in filters.category %} + {% endfor %} + + {% endif %} + + {% if filters.content_type %} + + {% endif %} + + {% if filters.language %} + @@ -109,7 +131,10 @@ {{ activity.name }} - {{ activity.category }} + {{ display_names.get(activity.category, activity.category) }} + {% if activity.needs_review %} + ⚠ De verificat + {% endif %}
diff --git a/app/web/routes.py b/app/web/routes.py index 6445e7a..56fd7ca 100644 --- a/app/web/routes.py +++ b/app/web/routes.py @@ -7,11 +7,17 @@ from flask import Blueprint, request, render_template, jsonify, current_app from app.models.database import DatabaseManager from app.models.activity import Activity from app.services.search import SearchService +from app.config_taxonomy import CATEGORIES, CONTENT_TYPES import os from pathlib import Path bp = Blueprint('main', __name__) +# Slug -> Romanian display name. Category and content_type slugs never collide, +# so a single flat map is enough for the UI filter labels. +LANGUAGE_NAMES = {'ro': 'Română', 'en': 'Engleză'} +DISPLAY_NAMES = {**CATEGORIES, **CONTENT_TYPES, **LANGUAGE_NAMES} + # Initialize database manager (will be configured in application factory) def get_db_manager(): """Get database manager instance""" @@ -36,15 +42,17 @@ def index(): # Get database statistics for the interface stats = db.get_statistics() - return render_template('index.html', + return render_template('index.html', filters=filter_options, + display_names=DISPLAY_NAMES, stats=stats) - + except Exception as e: print(f"Error loading main page: {e}") # Fallback with empty filters - return render_template('index.html', + return render_template('index.html', filters={}, + display_names=DISPLAY_NAMES, stats={'total_activities': 0}) @bp.route('/search', methods=['GET', 'POST']) @@ -82,8 +90,9 @@ def search(): search_query=search_query, applied_filters=filters, filters=filter_options, + display_names=DISPLAY_NAMES, results_count=len(activities)) - + except Exception as e: print(f"Search error: {e}") return render_template('results.html', @@ -91,6 +100,7 @@ def search(): search_query='', applied_filters={}, filters={}, + display_names=DISPLAY_NAMES, results_count=0, error=str(e)) @@ -121,6 +131,7 @@ def activity_detail(activity_id): return render_template('activity.html', activity=activity, + display_names=DISPLAY_NAMES, 
similar_activities=similar_activities) except Exception as e: diff --git a/scripts/SUBAGENT_PROMPT.md b/scripts/SUBAGENT_PROMPT.md new file mode 100644 index 0000000..79c3e9c --- /dev/null +++ b/scripts/SUBAGENT_PROMPT.md @@ -0,0 +1,81 @@ +# SUBAGENT — Activity extraction + +You are a subagent in the game-library extraction pipeline. You extract +educational activities (games, team-building, scouting, recipes, songs, +ceremonies) from one chunk of a source document into structured JSON. + +## Your task + +1. **Read ONLY the chunk you were assigned.** Do not read other chunks, other + files, or the original document. The chunk is a `.txt` file with + `--- PAGE N ---` markers. +2. Identify **every distinct activity** in the chunk. +3. For each activity, fill the schema in `scripts/activity_schema.json`. +4. Write the result to `data/extracted/.json`. + +## What counts as "a distinct activity" + +A distinct activity is a self-contained game/activity/recipe/song/ceremony with +its own name and a real description of how to do it. It is NOT: + +- a bare mention or a cross-reference with no description — **skip it**; +- a sub-variant of an activity already extracted — fold it into `variations`; +- a heading, a table of contents entry, or running page chrome. + +If the same activity is split across a page boundary inside your chunk, treat it +as **one** activity and combine the text. + +## Output format + +The file is one JSON object: a `header` plus an `activities` array. + +```json +{ + "header": { + "source_id": "", + "chunk_key": "", + "source_hash": "", + "schema_version": "1.0", + "prompt_version": "1.0", + "chunk_range": "pages 1-20" + }, + "activities": [ ... ] +} +``` + +## Rules for each activity + +- **`name`** — the activity's real name (≥3 characters). +- **`description`** — real prose describing the activity. No hard length limit, + but it must actually describe what happens. +- **`rules`** — how it is played / carried out, if the source gives rules. 
+- **`category`** — exactly one taxonomy slug (see the `enum` in the schema): + `jocuri-cercetasesti`, `team-building`, `icebreakers`, `camp-outdoor`, + `wide-games`, `orientare`, `prim-ajutor`, `escape-room-puzzle`, + `creative-stem`, `sports-active`, `cantece-ceremonii`, `retete`, + `supravietuire`, `integrare-incluziune`, `conflict-empatie`, `altele`. + When unsure, use `altele`. +- **`content_type`** — the FORM of the content, independent of category: + `joc`, `activitate`, `reteta`, `cantec`, or `ceremonie`. +- **`language`** — `ro` or `en` (the language the activity is written in). +- **`source_excerpt`** — **MANDATORY.** A short quote (one or two sentences) + copied **verbatim** from the chunk. This is the anti-hallucination anchor: it + is checked as a fuzzy substring of the chunk, and invented quotes are + rejected. +- **`page_reference`** — **MANDATORY.** The `--- PAGE N ---` marker(s) the + activity came from, e.g. `"page 14"` or `"pages 14-15"`. +- **`extraction_confidence`** — `high`, `med`, or `low`. Use `low` when the + source text for the activity is thin or ambiguous. + +## Never invent data + +- Do **not** invent ages, participant counts, or durations. If the source does + not state them, leave those fields `null`. +- Do **not** paraphrase the `source_excerpt` — copy it character for character. +- Better to extract fewer activities accurately than to pad the output. + +## Before you finish + +- Every activity has a non-empty `source_excerpt` and `page_reference`. +- The file validates against `scripts/activity_schema.json`. +- You only used text from your assigned chunk. 
diff --git a/scripts/activity_schema.json b/scripts/activity_schema.json new file mode 100644 index 0000000..922dc86 --- /dev/null +++ b/scripts/activity_schema.json @@ -0,0 +1,110 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "title": "Game-library extraction output", + "description": "One subagent output file: a header carrying provenance/version metadata plus the list of activities extracted from a single chunk.", + "type": "object", + "required": ["header", "activities"], + "additionalProperties": false, + "properties": { + "header": { + "type": "object", + "required": ["source_hash", "schema_version", "prompt_version", "chunk_range"], + "additionalProperties": true, + "properties": { + "source_hash": {"type": "string", "minLength": 8}, + "schema_version": {"type": "string"}, + "prompt_version": {"type": "string"}, + "chunk_range": {"type": "string"}, + "source_id": {"type": ["string", "null"]}, + "chunk_key": {"type": ["string", "null"]} + } + }, + "activities": { + "type": "array", + "items": {"$ref": "#/definitions/activity"} + } + }, + "definitions": { + "activity": { + "type": "object", + "required": [ + "name", + "description", + "category", + "content_type", + "language", + "extraction_confidence", + "source_excerpt", + "page_reference" + ], + "additionalProperties": false, + "properties": { + "name": {"type": "string", "minLength": 3}, + "description": {"type": "string", "minLength": 1}, + "rules": {"type": ["string", "null"]}, + "variations": {"type": ["string", "null"]}, + "category": { + "type": "string", + "enum": [ + "jocuri-cercetasesti", + "team-building", + "icebreakers", + "camp-outdoor", + "wide-games", + "orientare", + "prim-ajutor", + "escape-room-puzzle", + "creative-stem", + "sports-active", + "cantece-ceremonii", + "retete", + "supravietuire", + "integrare-incluziune", + "conflict-empatie", + "altele" + ] + }, + "subcategory": {"type": ["string", "null"]}, + "content_type": { + "type": "string", + "enum": ["joc", 
"activitate", "reteta", "cantec", "ceremonie"] + }, + "language": {"type": "string", "enum": ["ro", "en"]}, + "extraction_confidence": { + "type": "string", + "enum": ["high", "med", "low"] + }, + "source_excerpt": {"type": "string", "minLength": 1}, + "page_reference": {"type": "string", "minLength": 1}, + "source_file": {"type": ["string", "null"]}, + "age_group_min": {"type": ["integer", "null"], "minimum": 0}, + "age_group_max": {"type": ["integer", "null"], "minimum": 0}, + "participants_min": {"type": ["integer", "null"], "minimum": 0}, + "participants_max": {"type": ["integer", "null"], "minimum": 0}, + "duration_min": {"type": ["integer", "null"], "minimum": 0}, + "duration_max": {"type": ["integer", "null"], "minimum": 0}, + "materials_category": {"type": ["string", "null"]}, + "materials_list": { + "type": ["array", "null"], + "items": {"type": "string"} + }, + "skills_developed": { + "type": ["array", "null"], + "items": {"type": "string"} + }, + "difficulty_level": { + "type": ["string", "null"], + "enum": ["usor", "mediu", "dificil", null] + }, + "keywords": { + "type": ["array", "null"], + "items": {"type": "string"} + }, + "tags": { + "type": ["array", "null"], + "items": {"type": "string"} + } + } + } + } +} diff --git a/scripts/build_database.py b/scripts/build_database.py new file mode 100644 index 0000000..d7276be --- /dev/null +++ b/scripts/build_database.py @@ -0,0 +1,639 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +build_database.py — build data/activities.db from the subagent extraction JSON. + +Replaces the old import_claude_activities.py. Pipeline (plan §4): + + 1. `--rebuild` builds into data/activities.db.tmp; on success the live DB is + backed up to data/activities.db.bak and the tmp file is swapped in with an + atomic os.replace. A mid-build crash leaves the live DB untouched. + 2. 
Every data/extracted/*.json is validated against scripts/activity_schema.json; + invalid files are moved to data/extracted/_rejected/ with an error log. + 2b. Each source_excerpt must appear as a fuzzy substring (rapidfuzz + partial_ratio >= 90) of its source chunk — non-matches are hallucinations + and the activity is dropped (logged to _rejected/). + 3. `category` is normalized to a valid taxonomy slug (fallback `altele`). + 4. Dedup (D5): group by exact normalized_name, never across languages; within a + group rapidfuzz on descriptions — >=85 auto-merge, 60-85 borderline (keep + both, needs_review), <60 separate variants. + 5. data/review_decisions.json is applied before insert. + 6. Bulk insert into the tmp DB, populate the categories table, rebuild FTS. + 7. A QA report is printed. + +Usage: + python scripts/build_database.py --rebuild +""" + +from __future__ import annotations + +import argparse +import json +import os +import shutil +import sys +from collections import defaultdict +from pathlib import Path +from typing import Any, Optional + +SCRIPT_DIR = Path(__file__).resolve().parent +REPO_ROOT = SCRIPT_DIR.parent +for _p in (str(SCRIPT_DIR), str(REPO_ROOT)): + if _p not in sys.path: + sys.path.insert(0, _p) + +from app.config_taxonomy import ( # noqa: E402 + category_display_name, + normalize_category, + normalize_content_type, +) +from app.models.activity import Activity # noqa: E402 +from app.models.database import DatabaseManager # noqa: E402 +from import_common import ( # noqa: E402 + DEFAULT_SCHEMA_PATH, + content_key, + excerpt_matches, + find_chunk_text, + iter_extraction_files, + load_schema, + normalize_name, + source_path_for, +) + +# dedup thresholds (rapidfuzz token_sort_ratio, 0..100 scale) +AUTO_MERGE_THRESHOLD = 85.0 +BORDERLINE_THRESHOLD = 60.0 + + +# -------------------------------------------------------------------------- +# extraction dict -> Activity +# -------------------------------------------------------------------------- +def 
_csv(value: Any) -> Optional[str]: + """Schema arrays -> comma string for the (TEXT) DB columns.""" + if value is None: + return None + if isinstance(value, str): + return value.strip() or None + if isinstance(value, (list, tuple)): + parts = [str(v).strip() for v in value if str(v).strip()] + return ", ".join(parts) or None + return str(value) + + +def _split_csv(value: Optional[str]) -> list[str]: + if not value: + return [] + return [p.strip() for p in str(value).split(",") if p.strip()] + + +def dict_to_activity(adict: dict, source_file: str) -> Activity: + """Build an Activity from one extraction-JSON activity object.""" + tags = adict.get("tags") or [] + if isinstance(tags, str): + tags = _split_csv(tags) + + source_files = adict.get("source_files") or [] + if isinstance(source_files, str): + source_files = _split_csv(source_files) + if source_file and source_file not in source_files: + source_files = [source_file, *source_files] + + return Activity( + name=(adict.get("name") or "").strip(), + description=(adict.get("description") or "").strip(), + rules=adict.get("rules"), + variations=adict.get("variations"), + category=normalize_category(adict.get("category", "")), + subcategory=adict.get("subcategory"), + content_type=normalize_content_type(adict.get("content_type", "")), + source_file=source_file, + source_files=list(source_files), + page_reference=adict.get("page_reference"), + source_excerpt=adict.get("source_excerpt"), + age_group_min=adict.get("age_group_min"), + age_group_max=adict.get("age_group_max"), + participants_min=adict.get("participants_min"), + participants_max=adict.get("participants_max"), + duration_min=adict.get("duration_min"), + duration_max=adict.get("duration_max"), + materials_category=adict.get("materials_category"), + materials_list=_csv(adict.get("materials_list")), + skills_developed=_csv(adict.get("skills_developed")), + difficulty_level=adict.get("difficulty_level"), + keywords=_csv(adict.get("keywords")), + tags=list(tags), 
+ language=adict.get("language"), + extraction_confidence=adict.get("extraction_confidence"), + ) + + +# -------------------------------------------------------------------------- +# step 3 — category normalization is done in dict_to_activity; a non-taxonomy +# value silently falls back to `altele`. This logs the substitutions. +# -------------------------------------------------------------------------- +def log_category_fallbacks(raw_pairs: list[tuple[str, str]]) -> list[str]: + """raw_pairs = (original, slug); return human-readable fallback messages.""" + msgs = [] + for original, slug in raw_pairs: + if slug == "altele" and normalize_name(original or "") not in ("", "altele"): + msgs.append(f"category '{original}' -> altele (not in taxonomy)") + return msgs + + +# -------------------------------------------------------------------------- +# step 4 — dedup +# -------------------------------------------------------------------------- +def _longest(*values: Optional[str]) -> Optional[str]: + best: Optional[str] = None + for v in values: + if v and (best is None or len(v) > len(best)): + best = v + return best + + +def _union_csv(values: list[Optional[str]]) -> Optional[str]: + seen: list[str] = [] + for value in values: + for item in _split_csv(value): + if item not in seen: + seen.append(item) + return ", ".join(seen) or None + + +def merge_cluster(cluster: list[Activity]) -> Activity: + """Collapse a cluster of duplicate activities into one merged Activity.""" + if len(cluster) == 1: + return cluster[0] + + # representative = the one with the longest description + rep = max(cluster, key=lambda a: len(a.description or "")) + merged = Activity( + name=rep.name, + description=_longest(*(a.description for a in cluster)) or rep.description, + rules=_longest(*(a.rules for a in cluster)), + variations=_longest(*(a.variations for a in cluster)), + category=rep.category, + subcategory=rep.subcategory, + content_type=rep.content_type, + source_file=rep.source_file, + 
page_reference=rep.page_reference, + source_excerpt=rep.source_excerpt, + age_group_min=rep.age_group_min, + age_group_max=rep.age_group_max, + participants_min=rep.participants_min, + participants_max=rep.participants_max, + duration_min=rep.duration_min, + duration_max=rep.duration_max, + materials_category=rep.materials_category, + materials_list=_union_csv([a.materials_list for a in cluster]), + skills_developed=_union_csv([a.skills_developed for a in cluster]), + difficulty_level=rep.difficulty_level, + keywords=_union_csv([a.keywords for a in cluster]), + language=rep.language, + extraction_confidence=rep.extraction_confidence, + ) + # union of tags + tags: list[str] = [] + for a in cluster: + for t in a.tags or []: + if t not in tags: + tags.append(t) + merged.tags = tags + # accumulate every source the activity was seen in + sources: list[str] = [] + for a in cluster: + for s in [a.source_file, *(a.source_files or [])]: + if s and s not in sources: + sources.append(s) + merged.source_files = sources + # popularity_score++ per merged duplicate (plan §4) + merged.popularity_score = max(a.popularity_score for a in cluster) + (len(cluster) - 1) + return merged + + +def dedup_activities(activities: list[Activity]) -> tuple[list[Activity], dict]: + """ + Dedup per plan D5. + + Groups by (normalized_name, language) — different languages are NEVER + merged. 
Within a group, descriptions are clustered with rapidfuzz: + >= 85 -> same cluster (auto-merge) + 60-85 -> borderline: kept as separate clusters, both flagged needs_review + < 60 -> separate variants + """ + from rapidfuzz import fuzz + + groups: dict[tuple, list[Activity]] = defaultdict(list) + for act in activities: + key = (act.normalized_name or normalize_name(act.name), act.language) + groups[key].append(act) + + result: list[Activity] = [] + stats = {"input": len(activities), "auto_merged": 0, "borderline": 0, "output": 0} + + for members in groups.values(): + clusters: list[list[Activity]] = [] + borderline_idx: set[int] = set() + + for act in members: + best_idx, best_score = -1, -1.0 + borderline_here: list[int] = [] + for idx, cluster in enumerate(clusters): + score = fuzz.token_sort_ratio( + act.description or "", cluster[0].description or "" + ) + if score >= AUTO_MERGE_THRESHOLD: + if score > best_score: + best_idx, best_score = idx, score + elif score >= BORDERLINE_THRESHOLD: + borderline_here.append(idx) + if best_idx >= 0: + clusters[best_idx].append(act) + else: + clusters.append([act]) + new_idx = len(clusters) - 1 + for bidx in borderline_here: + borderline_idx.add(bidx) + borderline_idx.add(new_idx) + + for idx, cluster in enumerate(clusters): + merged = merge_cluster(cluster) + if len(cluster) > 1: + stats["auto_merged"] += len(cluster) - 1 + if idx in borderline_idx: + merged.needs_review = 1 + stats["borderline"] += 1 + result.append(merged) + + stats["output"] = len(result) + return result, stats + + +# -------------------------------------------------------------------------- +# step 5 — review decisions +# -------------------------------------------------------------------------- +def load_review_decisions(path: Path) -> dict: + if path and path.is_file(): + try: + data = json.loads(path.read_text(encoding="utf-8")) + if isinstance(data, dict): + return data + except (json.JSONDecodeError, OSError): + pass + return {} + + +def 
apply_review_decisions( + activities: list[Activity], decisions: dict +) -> tuple[list[Activity], dict]: + """ + Apply data/review_decisions.json (plan §5c). + + Keyed by the stable content_key. A decision of `drop` removes the row; + `keep-separate` / `merge` clear needs_review (the user has resolved it). + Rows with no decision keep needs_review and resurface in the queue. + """ + kept: list[Activity] = [] + stats = {"dropped": 0, "resolved": 0} + for act in activities: + key = content_key( + act.normalized_name or normalize_name(act.name), + act.language, + act.description or "", + ) + entry = decisions.get(key) + decision = entry.get("decision") if isinstance(entry, dict) else entry + if decision == "drop": + stats["dropped"] += 1 + continue + if decision in ("keep-separate", "merge"): + act.needs_review = 0 + stats["resolved"] += 1 + kept.append(act) + return kept, stats + + +# -------------------------------------------------------------------------- +# golden-set recall (plan §7) +# -------------------------------------------------------------------------- +def _golden_names(data: Any) -> list[str]: + items = data.get("activities", data) if isinstance(data, dict) else data + names: list[str] = [] + for item in items or []: + if isinstance(item, str): + names.append(item) + elif isinstance(item, dict) and item.get("name"): + names.append(item["name"]) + return names + + +def golden_recall(golden_dir: Path, activities: list[Activity]) -> Optional[dict]: + if not golden_dir or not golden_dir.is_dir(): + return None + found = {normalize_name(a.name) for a in activities} + expected, hits = 0, 0 + for gf in sorted(golden_dir.glob("*.json")): + try: + data = json.loads(gf.read_text(encoding="utf-8")) + except (json.JSONDecodeError, OSError): + continue + for name in _golden_names(data): + expected += 1 + if normalize_name(name) in found: + hits += 1 + if expected == 0: + return None + return {"expected": expected, "found": hits, "recall": round(hits / expected, 3)} 
+ + +# -------------------------------------------------------------------------- +# load + validate + excerpt-check the extraction files +# -------------------------------------------------------------------------- +def collect_activities( + extracted_dir: Path, + chunks_dir: Path, + sources_dir: Path, + schema: dict, +) -> dict: + """Validate, excerpt-check and convert every extraction file.""" + rejected_dir = extracted_dir / "_rejected" + activities: list[Activity] = [] + report = { + "files_total": 0, + "files_valid": 0, + "files_rejected_schema": 0, + "activities_raw": 0, + "activities_hallucinated": 0, + "category_fallbacks": [], + } + raw_categories: list[tuple[str, str]] = [] + + from import_common import chunk_key_for # local import to avoid clutter + + for json_path in iter_extraction_files(extracted_dir): + report["files_total"] += 1 + try: + data = json.loads(json_path.read_text(encoding="utf-8")) + except json.JSONDecodeError as exc: + _reject_file(json_path, rejected_dir, [f"invalid JSON: {exc}"]) + report["files_rejected_schema"] += 1 + continue + + from import_common import validate_extraction + + errors = validate_extraction(data, schema) + if errors: + _reject_file(json_path, rejected_dir, errors) + report["files_rejected_schema"] += 1 + continue + report["files_valid"] += 1 + + header = data.get("header", {}) + chunk_text = find_chunk_text(json_path, header, chunks_dir) + source_id = header.get("source_id") or chunk_key_for(json_path, header).rsplit( + ".part", 1 + )[0] + fallback_source = ( + source_path_for(source_id, sources_dir) or source_id or json_path.stem + ) + + hallucinated: list[dict] = [] + for adict in data.get("activities", []): + report["activities_raw"] += 1 + excerpt = adict.get("source_excerpt") or "" + # if the chunk text is unavailable we cannot verify — keep but the + # QA report still counts it under activities_raw. 
+ if chunk_text is not None and not excerpt_matches(excerpt, chunk_text): + hallucinated.append(adict) + report["activities_hallucinated"] += 1 + continue + src = adict.get("source_file") or fallback_source + raw_categories.append((adict.get("category", ""), normalize_category(adict.get("category", "")))) + activities.append(dict_to_activity(adict, src)) + + if hallucinated: + _log_hallucinations(json_path, rejected_dir, hallucinated) + + report["category_fallbacks"] = log_category_fallbacks(raw_categories) + report["activities"] = activities + return report + + +def _reject_file(json_path: Path, rejected_dir: Path, errors: list[str]) -> None: + rejected_dir.mkdir(parents=True, exist_ok=True) + dest = rejected_dir / json_path.name + shutil.move(str(json_path), str(dest)) + log = rejected_dir / f"{json_path.stem}.errors.txt" + log.write_text( + f"REJECTED (schema validation): {json_path.name}\n\n" + + "\n".join(f" - {e}" for e in errors) + + "\n", + encoding="utf-8", + ) + + +def _log_hallucinations( + json_path: Path, rejected_dir: Path, hallucinated: list[dict] +) -> None: + rejected_dir.mkdir(parents=True, exist_ok=True) + log = rejected_dir / f"{json_path.stem}.hallucinations.txt" + lines = [f"DROPPED activities (source_excerpt not found in chunk): {json_path.name}", ""] + for a in hallucinated: + lines.append(f" - {a.get('name')!r}") + lines.append(f" excerpt: {a.get('source_excerpt')!r}") + log.write_text("\n".join(lines) + "\n", encoding="utf-8") + + +# -------------------------------------------------------------------------- +# DB write + atomic swap +# -------------------------------------------------------------------------- +def _enrich_category_display_names(db_path: Path) -> None: + """Give the categories table proper Romanian display names for slugs.""" + import sqlite3 + + conn = sqlite3.connect(db_path) + try: + rows = conn.execute( + "SELECT value FROM categories WHERE type = 'category'" + ).fetchall() + for (slug,) in rows: + conn.execute( + 
"UPDATE categories SET display_name = ? WHERE type='category' AND value = ?", + (category_display_name(slug), slug), + ) + conn.commit() + finally: + conn.close() + + +def write_database(db_tmp_path: Path, activities: list[Activity]) -> None: + """Create a fresh tmp DB, bulk insert, populate categories, rebuild FTS.""" + if db_tmp_path.exists(): + db_tmp_path.unlink() + db = DatabaseManager(str(db_tmp_path)) + db.bulk_insert_activities(activities) + _enrich_category_display_names(db_tmp_path) + db.rebuild_fts_index() + + +def atomic_swap(db_tmp_path: Path, db_path: Path) -> Optional[Path]: + """Back up the live DB then atomically swap the tmp file in.""" + backup: Optional[Path] = None + if db_path.exists(): + backup = db_path.with_suffix(db_path.suffix + ".bak") + shutil.copy2(db_path, backup) + os.replace(db_tmp_path, db_path) + return backup + + +# -------------------------------------------------------------------------- +# orchestration +# -------------------------------------------------------------------------- +def rebuild( + *, + extracted_dir: Path, + chunks_dir: Path, + sources_dir: Path, + db_path: Path, + decisions_path: Optional[Path] = None, + schema_path: Path = DEFAULT_SCHEMA_PATH, + golden_dir: Optional[Path] = None, + do_swap: bool = True, +) -> dict: + """ + Full rebuild. Everything is built into .tmp; the live DB is only + touched by the final atomic swap, so a crash anywhere above leaves it intact. 
+ """ + extracted_dir = Path(extracted_dir) + db_path = Path(db_path) + db_tmp_path = db_path.with_suffix(db_path.suffix + ".tmp") + + schema = load_schema(schema_path) + collected = collect_activities(extracted_dir, Path(chunks_dir), Path(sources_dir), schema) + activities: list[Activity] = collected.pop("activities") + + deduped, dedup_stats = dedup_activities(activities) + + decisions = load_review_decisions(Path(decisions_path)) if decisions_path else {} + final, decision_stats = apply_review_decisions(deduped, decisions) + + try: + write_database(db_tmp_path, final) + backup = atomic_swap(db_tmp_path, db_path) if do_swap else None + except Exception: + if db_tmp_path.exists(): + db_tmp_path.unlink() + raise + + report = { + **collected, + "dedup": dedup_stats, + "decisions": decision_stats, + "final_count": len(final), + "backup": str(backup) if backup else None, + "swapped": do_swap, + "qa": _qa_report(final, collected, golden_dir), + } + return report + + +def _qa_report( + activities: list[Activity], collected: dict, golden_dir: Optional[Path] +) -> dict: + per_category: dict[str, int] = defaultdict(int) + per_content_type: dict[str, int] = defaultdict(int) + confidence: dict[str, int] = defaultdict(int) + with_rules = 0 + for a in activities: + per_category[a.category] += 1 + per_content_type[a.content_type or "?"] += 1 + confidence[a.extraction_confidence or "?"] += 1 + if a.rules and a.rules.strip(): + with_rules += 1 + raw = collected.get("activities_raw", 0) + hallucinated = collected.get("activities_hallucinated", 0) + return { + "total": len(activities), + "per_category": dict(per_category), + "per_content_type": dict(per_content_type), + "extraction_confidence": dict(confidence), + "pct_with_rules": round(100 * with_rules / len(activities), 1) if activities else 0.0, + "needs_review": sum(1 for a in activities if a.needs_review), + "hallucination_rate": round(100 * hallucinated / raw, 2) if raw else 0.0, + "golden_recall": 
golden_recall(Path(golden_dir), activities) if golden_dir else None, + } + + +def print_report(report: dict) -> None: + qa = report["qa"] + print("=" * 60) + print("BUILD DATABASE — QA REPORT") + print("=" * 60) + print(f"extraction files : {report['files_total']} " + f"(valid {report['files_valid']}, schema-rejected {report['files_rejected_schema']})") + print(f"activities raw : {report['activities_raw']}") + print(f" hallucinated drop : {report['activities_hallucinated']} " + f"({qa['hallucination_rate']}%)") + d = report["dedup"] + print(f"dedup : {d['input']} -> {d['output']} " + f"(auto-merged {d['auto_merged']}, borderline {d['borderline']})") + print(f"review decisions : dropped {report['decisions']['dropped']}, " + f"resolved {report['decisions']['resolved']}") + print(f"final inserted : {report['final_count']}") + print(f"% with rules : {qa['pct_with_rules']}") + print(f"needs_review rows : {qa['needs_review']}") + print("per category :") + for slug, n in sorted(qa["per_category"].items(), key=lambda kv: -kv[1]): + print(f" {slug:<24}: {n}") + print("per content_type :") + for ct, n in sorted(qa["per_content_type"].items(), key=lambda kv: -kv[1]): + print(f" {ct:<24}: {n}") + print("extraction_confidence:") + for c, n in sorted(qa["extraction_confidence"].items()): + print(f" {c:<24}: {n}") + if qa["golden_recall"]: + g = qa["golden_recall"] + print(f"golden recall : {g['found']}/{g['expected']} = {g['recall']}") + if report["category_fallbacks"]: + print("category fallbacks :") + for msg in report["category_fallbacks"]: + print(f" {msg}") + if report["backup"]: + print(f"live DB backed up to : {report['backup']}") + print("=" * 60) + + +# -------------------------------------------------------------------------- +# CLI +# -------------------------------------------------------------------------- +def main(argv: Optional[list[str]] = None) -> int: + parser = argparse.ArgumentParser(description="Build activities.db from extraction JSON.") + 
parser.add_argument("--rebuild", action="store_true", + help="rebuild the database from scratch (only mode supported)") + parser.add_argument("--extracted", default="data/extracted") + parser.add_argument("--chunks", default="data/chunks") + parser.add_argument("--sources", default="data/sources") + parser.add_argument("--db", default="data/activities.db") + parser.add_argument("--decisions", default="data/review_decisions.json") + parser.add_argument("--golden", default="data/golden") + parser.add_argument("--schema", default=str(DEFAULT_SCHEMA_PATH)) + args = parser.parse_args(argv) + + if not args.rebuild: + parser.error("only --rebuild is supported (full rebuild, no incremental merge)") + + report = rebuild( + extracted_dir=Path(args.extracted), + chunks_dir=Path(args.chunks), + sources_dir=Path(args.sources), + db_path=Path(args.db), + decisions_path=Path(args.decisions), + schema_path=Path(args.schema), + golden_dir=Path(args.golden), + ) + print_report(report) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/scripts/chunk_sources.py b/scripts/chunk_sources.py new file mode 100644 index 0000000..0844b10 --- /dev/null +++ b/scripts/chunk_sources.py @@ -0,0 +1,251 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +chunk_sources.py — split normalized data/sources/*.txt into ~20-page chunks +for subagent extraction, and maintain data/chunks/manifest.json. + +Paginated text → ~20-page chunks, ~4-page overlap (plan D8). +Unpaginated text → ~10000-word windows, ~2000-word overlap. + +The manifest is a cache derived from the filesystem + per-chunk state. Re-running +this script is idempotent: existing chunk states (pending/assigned/done/rejected) +survive as long as the source content hash is unchanged. 
+""" + +from __future__ import annotations + +import argparse +import json +import sys +from pathlib import Path + +SCRIPT_DIR = Path(__file__).resolve().parent +if str(SCRIPT_DIR) not in sys.path: + sys.path.insert(0, str(SCRIPT_DIR)) + +from extract_common import content_hash, split_pages # noqa: E402 + +SCHEMA_VERSION = "1.0" +PAGES_PER_CHUNK = 20 +PAGE_OVERLAP = 4 +WORD_WINDOW = 10_000 +WORD_OVERLAP = 2_000 + +VALID_STATES = {"pending", "assigned", "done", "rejected"} + + +# -------------------------------------------------------------------------- +# header parsing +# -------------------------------------------------------------------------- +def parse_source(text: str) -> tuple[dict, str]: + """Split a normalized source file into (header_dict, body).""" + lines = text.splitlines() + header: dict = {} + body_start = 0 + in_header = True + for i, line in enumerate(lines): + if line.startswith("--- PAGE "): + body_start = i + break + if not in_header: + continue + if set(line.strip()) == {"="} and line.strip(): + body_start = i + 1 + in_header = False # header ends at the rule line + continue + if ":" in line: + key, _, val = line.partition(":") + header[key.strip()] = val.strip() + body = "\n".join(lines[body_start:]) + return header, body + + +# -------------------------------------------------------------------------- +# chunking — pure functions +# -------------------------------------------------------------------------- +def chunk_pages( + pages: list[tuple[int, str]], + pages_per_chunk: int = PAGES_PER_CHUNK, + overlap: int = PAGE_OVERLAP, +) -> list[dict]: + """ + Split an ordered list of (page_no, text) into overlapping chunks. + + stride = pages_per_chunk - overlap. Because stride < pages_per_chunk - 1, any + activity straddling a page boundary appears whole in at least one chunk. 
+ """ + if not pages: + return [] + stride = max(1, pages_per_chunk - overlap) + chunks: list[dict] = [] + i = 0 + n = len(pages) + while i < n: + window = pages[i : i + pages_per_chunk] + first, last = window[0][0], window[-1][0] + text = "".join( + f"\n--- PAGE {num} ---\n{txt}\n" for num, txt in window + ) + chunks.append( + {"page_start": first, "page_end": last, + "chunk_range": f"pages {first}-{last}", "text": text} + ) + if i + pages_per_chunk >= n: + break + i += stride + return chunks + + +def chunk_words( + text: str, window: int = WORD_WINDOW, overlap: int = WORD_OVERLAP +) -> list[dict]: + """Split unpaginated text into overlapping word windows.""" + words = text.split() + if not words: + return [] + stride = max(1, window - overlap) + chunks: list[dict] = [] + i = 0 + n = len(words) + while i < n: + seg = words[i : i + window] + chunks.append( + {"word_start": i, "word_end": i + len(seg), + "chunk_range": f"words {i}-{i + len(seg)}", "text": " ".join(seg)} + ) + if i + window >= n: + break + i += stride + return chunks + + +def make_chunks(source_text: str) -> list[dict]: + """Chunk one normalized source file. 
Picks page- or word-windowing.""" + _, body = parse_source(source_text) + pages = split_pages(body) + if pages: + return chunk_pages(pages) + return chunk_words(body) + + +# -------------------------------------------------------------------------- +# manifest +# -------------------------------------------------------------------------- +def _empty_manifest() -> dict: + return {"schema_version": SCHEMA_VERSION, "chunks": {}} + + +def load_manifest(manifest_path: Path) -> dict: + if manifest_path.exists(): + try: + data = json.loads(manifest_path.read_text(encoding="utf-8")) + data.setdefault("schema_version", SCHEMA_VERSION) + data.setdefault("chunks", {}) + return data + except (json.JSONDecodeError, OSError): + pass + return _empty_manifest() + + +def save_manifest(manifest: dict, manifest_path: Path) -> None: + manifest_path.parent.mkdir(parents=True, exist_ok=True) + manifest_path.write_text( + json.dumps(manifest, indent=2, ensure_ascii=False), encoding="utf-8" + ) + + +def chunk_source_file( + source_path: Path, chunks_dir: Path, manifest: dict +) -> list[str]: + """ + Chunk one data/sources/.txt → data/chunks//.partNN.txt and + register every chunk in `manifest`. Preserves prior state when the source + content hash is unchanged. Returns the list of chunk keys written. 
+ """ + source_id = source_path.stem + text = source_path.read_text(encoding="utf-8", errors="replace") + src_hash = content_hash(text) + chunks = make_chunks(text) + + out_dir = chunks_dir / source_id + out_dir.mkdir(parents=True, exist_ok=True) + + written: list[str] = [] + for idx, chunk in enumerate(chunks, 1): + key = f"{source_id}.part{idx:02d}" + chunk_file = out_dir / f"{key}.txt" + chunk_file.write_text(chunk["text"], encoding="utf-8") + + prior = manifest["chunks"].get(key) + # preserve state only if the source content is unchanged + if prior and prior.get("source_hash") == src_hash and \ + prior.get("state") in VALID_STATES: + state = prior["state"] + else: + state = "pending" + + manifest["chunks"][key] = { + "source_id": source_id, + "source_hash": src_hash, + "part": idx, + "chunk_range": chunk["chunk_range"], + "chunk_file": str(chunk_file.relative_to(chunks_dir.parent)), + "expected_json": f"{key}.json", + "state": state, + } + written.append(key) + return written + + +def prune_stale(manifest: dict, live_keys: set[str]) -> list[str]: + """Drop manifest entries whose chunk no longer exists on disk.""" + stale = [k for k in manifest["chunks"] if k not in live_keys] + for k in stale: + del manifest["chunks"][k] + return stale + + +# -------------------------------------------------------------------------- +# CLI +# -------------------------------------------------------------------------- +def run(sources_dir: Path, chunks_dir: Path) -> dict: + """Chunk every *.txt in sources_dir. 
Returns a summary dict.""" + manifest_path = chunks_dir / "manifest.json" + manifest = load_manifest(manifest_path) + + live_keys: set[str] = set() + source_files = sorted(sources_dir.glob("*.txt")) + for src in source_files: + live_keys.update(chunk_source_file(src, chunks_dir, manifest)) + + stale = prune_stale(manifest, live_keys) + save_manifest(manifest, manifest_path) + + states: dict[str, int] = {} + for meta in manifest["chunks"].values(): + states[meta["state"]] = states.get(meta["state"], 0) + 1 + return { + "sources": len(source_files), + "chunks": len(live_keys), + "pruned": len(stale), + "states": states, + } + + +def main(argv: list[str] | None = None) -> int: + parser = argparse.ArgumentParser(description="Chunk normalized sources.") + parser.add_argument("--sources", default="data/sources", help="sources dir") + parser.add_argument("--chunks", default="data/chunks", help="chunks output dir") + args = parser.parse_args(argv) + + summary = run(Path(args.sources), Path(args.chunks)) + print(f"sources processed : {summary['sources']}") + print(f"chunks written : {summary['chunks']}") + print(f"stale pruned : {summary['pruned']}") + for state, count in sorted(summary["states"].items()): + print(f" {state:<10}: {count}") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/scripts/claude_extraction_template.md b/scripts/claude_extraction_template.md deleted file mode 100644 index f2137d1..0000000 --- a/scripts/claude_extraction_template.md +++ /dev/null @@ -1,54 +0,0 @@ -# TEMPLATE PENTRU EXTRACȚIE ACTIVITĂȚI CU CLAUDE - -## Instrucțiuni pentru Claude Code: - -Pentru fiecare PDF/DOC, folosește următorul format de extracție: - -### 1. Citește fișierul: -``` -Claude, te rog citește fișierul: [CALE_FISIER] -``` - -### 2. 
Extrage activitățile folosind acest template JSON: -```json -{ - "source_file": "[NUME_FISIER]", - "activities": [ - { - "name": "Numele activității", - "description": "Descrierea completă a activității", - "rules": "Regulile jocului/activității", - "variations": "Variante sau adaptări", - "category": "[A-H] bazat pe tip", - "age_group_min": 6, - "age_group_max": 14, - "participants_min": 4, - "participants_max": 20, - "duration_min": 10, - "duration_max": 30, - "materials_list": "Lista materialelor necesare", - "skills_developed": "Competențe dezvoltate", - "difficulty_level": "Ușor/Mediu/Dificil", - "keywords": "cuvinte cheie separate prin virgulă", - "tags": "taguri relevante" - } - ] -} -``` - -### 3. Salvează în fișier: -După extracție, salvează JSON-ul în: `/scripts/extracted_activities/[NUME_FISIER].json` - -### 4. Priorități de procesare: - -**TOP PRIORITY (procesează primele):** -1. 1000 Fantastic Scout Games.pdf -2. Cartea Mare a jocurilor.pdf -3. 160-de-activitati-dinamice-jocuri-pentru-team-building.pdf -4. 101 Ways to Create an Unforgettable Camp Experience.pdf -5. 
151 Awesome Summer Camp Nature Activities.pdf - -**Categorii de focus:** -- [A] Jocuri Cercetășești -- [C] Camping & Activități Exterior -- [G] Activități Educaționale \ No newline at end of file diff --git a/scripts/create_databases.py b/scripts/create_databases.py deleted file mode 100644 index 515d3a4..0000000 --- a/scripts/create_databases.py +++ /dev/null @@ -1,164 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -""" -DATABASE SETUP SCRIPT - INDEX-SISTEM-JOCURI - -Script pentru recrearea bazelor de date din .gitignore -Folosește clasele DatabaseManager pentru consistență - -Usage: - python scripts/create_databases.py - python scripts/create_databases.py --clear-existing -""" - -import sys -import argparse -from pathlib import Path - -# Add src to path so we can import our modules -sys.path.append(str(Path(__file__).parent.parent / 'src')) - -from database import DatabaseManager -from game_library_manager import GameLibraryManager - -def create_main_database(db_path: str = "data/activities.db", clear: bool = False): - """Create the main activities database""" - db_file = Path(db_path) - - if clear and db_file.exists(): - print(f"🗑️ Removing existing database: {db_path}") - db_file.unlink() - - print(f"📊 Creating main database: {db_path}") - db = DatabaseManager(db_path) - - # Test the database - try: - stats = db.get_statistics() - print(f"✅ Database created successfully: {stats['total_activities']} activities") - return True - except Exception as e: - print(f"❌ Error creating database: {e}") - return False - -def create_game_library_database(db_path: str = "data/game_library.db", clear: bool = False): - """Create the legacy game library database""" - db_file = Path(db_path) - - if clear and db_file.exists(): - print(f"🗑️ Removing existing database: {db_path}") - db_file.unlink() - - print(f"📊 Creating game library database: {db_path}") - manager = GameLibraryManager(db_path) - - print(f"✅ Game library database created successfully") - return True - 
-def create_test_database(db_path: str = "data/test_activities.db", clear: bool = False): - """Create the test database""" - db_file = Path(db_path) - - if clear and db_file.exists(): - print(f"🗑️ Removing existing database: {db_path}") - db_file.unlink() - - print(f"📊 Creating test database: {db_path}") - db = DatabaseManager(db_path) - - # Add some test data - test_activity = { - 'title': 'Test Activity - Setup Script', - 'description': 'This is a test activity created by the setup script', - 'file_path': 'test/sample.txt', - 'file_type': 'TXT', - 'category': 'test', - 'age_group': '8-12 ani', - 'participants': '5-10 persoane', - 'duration': '15-30min', - 'materials': 'Fără materiale', - 'tags': '["test", "setup"]', - 'source_text': 'Sample test content for verification' - } - - try: - db.insert_activity(test_activity) - stats = db.get_statistics() - print(f"✅ Test database created with sample data: {stats['total_activities']} activities") - return True - except Exception as e: - print(f"❌ Error creating test database: {e}") - return False - -def ensure_data_directory(): - """Ensure the data directory exists""" - data_dir = Path("data") - if not data_dir.exists(): - print(f"📁 Creating data directory: {data_dir}") - data_dir.mkdir(parents=True) - else: - print(f"📁 Data directory exists: {data_dir}") - -def main(): - """Main setup function""" - parser = argparse.ArgumentParser(description='Create databases for INDEX-SISTEM-JOCURI') - parser.add_argument('--clear-existing', '-c', action='store_true', - help='Remove existing databases before creating new ones') - parser.add_argument('--main-only', action='store_true', - help='Create only the main activities database') - parser.add_argument('--test-only', action='store_true', - help='Create only the test database') - - args = parser.parse_args() - - print("🚀 DATABASE SETUP - INDEX-SISTEM-JOCURI") - print("=" * 50) - - # Ensure data directory exists - ensure_data_directory() - - success_count = 0 - total_count = 0 - - 
if args.test_only: - total_count = 1 - if create_test_database(clear=args.clear_existing): - success_count += 1 - elif args.main_only: - total_count = 1 - if create_main_database(clear=args.clear_existing): - success_count += 1 - else: - # Create all databases - databases = [ - ("Main activities", lambda: create_main_database(clear=args.clear_existing)), - ("Game library", lambda: create_game_library_database(clear=args.clear_existing)), - ("Test activities", lambda: create_test_database(clear=args.clear_existing)) - ] - - total_count = len(databases) - - for name, create_func in databases: - print(f"\n📂 Creating {name} database...") - try: - if create_func(): - success_count += 1 - except Exception as e: - print(f"❌ Failed to create {name} database: {e}") - - print("\n" + "=" * 50) - print(f"🎯 SUMMARY: {success_count}/{total_count} databases created successfully") - - if success_count == total_count: - print("✅ All databases ready!") - print("\nNext steps:") - print("1. Run indexer: cd src && python indexer.py --clear-db") - print("2. Start web app: cd src && python app.py") - else: - print("⚠️ Some databases failed to create. Check errors above.") - return 1 - - return 0 - -if __name__ == '__main__': - sys.exit(main()) \ No newline at end of file diff --git a/scripts/extract_common.py b/scripts/extract_common.py new file mode 100644 index 0000000..f9f1a37 --- /dev/null +++ b/scripts/extract_common.py @@ -0,0 +1,361 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +extract_common.py — single home for per-format text extraction. + +Every extractor returns a plain text *body* with synthetic page markers +(`--- PAGE N ---`). The file-level header (`SOURCE:` / `CONVERTED:`) is added +by normalize_sources.py, not here. + +Critical fix vs. the old pdf_to_text_converter.py: there is NO `max_pages` cap. +Large books are extracted in full. 
+""" + +from __future__ import annotations + +import hashlib +import importlib +import os +import re +import shutil +import subprocess +import tempfile +import zipfile +from pathlib import Path +from typing import Callable + +PAGE_MARKER_RE = re.compile(r"^--- PAGE (\d+) ---\s*$", re.MULTILINE) + +# paragraphs per synthetic page for paginated-by-flow formats (docx) +DOCX_PARAS_PER_PAGE = 40 + +# formats we deliberately ignore (epub duplicates existing PDFs — plan §1) +IGNORED_EXTENSIONS = {".epub"} + +# obvious junk filenames skipped during a walk +JUNK_NAMES = {"desktop.ini", "linkuri-jocuri.txt"} +JUNK_SUFFIXES = {".bak", ".tmp", ".ini"} + + +# -------------------------------------------------------------------------- +# page assembly helpers +# -------------------------------------------------------------------------- +def join_pages(pages: list[str], start: int = 1) -> str: + """Join a list of page texts into a body string with `--- PAGE N ---`.""" + out: list[str] = [] + for i, text in enumerate(pages, start): + out.append(f"\n--- PAGE {i} ---\n{(text or '').strip()}\n") + return "".join(out) + + +def split_pages(body: str) -> list[tuple[int, str]]: + """Inverse of join_pages: parse a body into [(page_number, text), ...].""" + matches = list(PAGE_MARKER_RE.finditer(body)) + if not matches: + return [] + pages: list[tuple[int, str]] = [] + for idx, m in enumerate(matches): + num = int(m.group(1)) + seg_start = m.end() + seg_end = matches[idx + 1].start() if idx + 1 < len(matches) else len(body) + pages.append((num, body[seg_start:seg_end].strip())) + return pages + + +def count_page_markers(body: str) -> int: + return len(PAGE_MARKER_RE.findall(body)) + + +# -------------------------------------------------------------------------- +# format detection +# -------------------------------------------------------------------------- +FORMAT_BY_EXT = { + ".pdf": "pdf", + ".docx": "docx", + ".doc": "doc", + ".pptx": "pptx", + ".ppt": "pptx", + ".htm": "html", + 
".html": "html", + ".zip": "zip", + ".epub": "epub", + ".txt": "txt", +} + + +def detect_format(path: str | os.PathLike) -> str: + """Return a format key for a path based on its extension.""" + ext = Path(path).suffix.lower() + return FORMAT_BY_EXT.get(ext, "unknown") + + +def is_junk(path: str | os.PathLike) -> bool: + p = Path(path) + name = p.name.lower() + if name in JUNK_NAMES: + return True + if name.startswith("readme") and p.suffix.lower() == ".md": + return True + if p.suffix.lower() in JUNK_SUFFIXES: + return True + return False + + +# -------------------------------------------------------------------------- +# content hashing + near-duplicate elimination +# -------------------------------------------------------------------------- +def _normalize_for_hash(text: str) -> str: + return re.sub(r"\s+", " ", (text or "")).strip().lower() + + +def content_hash(text: str) -> str: + """Stable SHA1 of whitespace-normalized text — used for exact-dup detection.""" + return hashlib.sha1(_normalize_for_hash(text).encode("utf-8")).hexdigest() + + +def near_duplicate_ratio(a: str, b: str) -> float: + """Similarity score in [0, 100] between two texts (rapidfuzz token ratio).""" + from rapidfuzz import fuzz + + return fuzz.token_sort_ratio(_normalize_for_hash(a), _normalize_for_hash(b)) + + +def dedupe_texts( + items: list[tuple[str, str]], threshold: float = 95.0 +) -> list[tuple[str, str]]: + """ + Drop exact and near-duplicate texts from a list of (key, text) pairs. + + Used for HTML mirror pages (print copies, repeated index/footer pages). + Keeps the first occurrence; O(n) on exact hash, O(n*k) fuzzy only against + already-kept items. 
+ """ + kept: list[tuple[str, str]] = [] + seen_hashes: set[str] = set() + for key, text in items: + h = content_hash(text) + if h in seen_hashes: + continue + if any(near_duplicate_ratio(text, kt) >= threshold for _, kt in kept): + continue + seen_hashes.add(h) + kept.append((key, text)) + return kept + + +# -------------------------------------------------------------------------- +# preflight dependency check +# -------------------------------------------------------------------------- +REQUIRED_PYTHON_MODULES = { + "pdfplumber": "pdfplumber", + "PyPDF2": "pypdf2", + "docx": "python-docx", + "pptx": "python-pptx", + "bs4": "beautifulsoup4", + "lxml": "lxml", + "jsonschema": "jsonschema", + "rapidfuzz": "rapidfuzz", + "chardet": "chardet", +} + + +def preflight(check_ocr: bool = False) -> dict: + """ + Check system + Python dependencies before a long normalization run. + + Returns {'ok': bool, 'missing_python': [...], 'missing_system': [...], + 'warnings': [...]}. libreoffice is a *warning* (only .doc needs it), + tesseract only checked when check_ocr=True. 
+ """ + missing_python: list[str] = [] + for module, pip_name in REQUIRED_PYTHON_MODULES.items(): + try: + importlib.import_module(module) + except ImportError: + missing_python.append(pip_name) + + warnings: list[str] = [] + missing_system: list[str] = [] + + if not (shutil.which("libreoffice") or shutil.which("soffice")): + warnings.append("libreoffice not found — legacy .doc files cannot be converted") + + if check_ocr and not shutil.which("tesseract"): + missing_system.append("tesseract (OCR requested but not installed)") + + return { + "ok": not missing_python and not missing_system, + "missing_python": missing_python, + "missing_system": missing_system, + "warnings": warnings, + } + + +# -------------------------------------------------------------------------- +# per-format extractors +# -------------------------------------------------------------------------- +def extract_pdf(path: str | os.PathLike) -> str: + """PDF → body. pdfplumber primary, PyPDF2 fallback. No page cap.""" + path = str(path) + try: + return _extract_pdf_pdfplumber(path) + except Exception: + return _extract_pdf_pypdf2(path) + + +def _extract_pdf_pdfplumber(path: str) -> str: + import pdfplumber + + pages: list[str] = [] + with pdfplumber.open(path) as pdf: + for page in pdf.pages: # ALL pages — no max_pages + try: + pages.append(page.extract_text() or "") + except Exception: + pages.append("") + return join_pages(pages) + + +def _extract_pdf_pypdf2(path: str) -> str: + import PyPDF2 + + pages: list[str] = [] + with open(path, "rb") as fh: + reader = PyPDF2.PdfReader(fh) + for page in reader.pages: # ALL pages — no max_pages + try: + pages.append(page.extract_text() or "") + except Exception: + pages.append("") + return join_pages(pages) + + +def extract_docx(path: str | os.PathLike) -> str: + """docx → body. 
Synthetic page marker every DOCX_PARAS_PER_PAGE paragraphs.""" + import docx + + document = docx.Document(str(path)) + paragraphs = [p.text for p in document.paragraphs] + pages: list[str] = [] + for i in range(0, max(len(paragraphs), 1), DOCX_PARAS_PER_PAGE): + chunk = paragraphs[i : i + DOCX_PARAS_PER_PAGE] + pages.append("\n".join(chunk)) + return join_pages(pages) + + +def extract_doc(path: str | os.PathLike) -> str: + """ + Legacy .doc → body via `libreoffice --headless --convert-to docx`. + + Raises RuntimeError if libreoffice is unavailable — the caller marks the + resulting source `needs_review` regardless (conversion is imperfect). + """ + soffice = shutil.which("libreoffice") or shutil.which("soffice") + if not soffice: + raise RuntimeError("libreoffice/soffice not available — cannot convert .doc") + + src = Path(path).resolve() + with tempfile.TemporaryDirectory() as tmp: + subprocess.run( + [soffice, "--headless", "--convert-to", "docx", "--outdir", tmp, str(src)], + check=True, + capture_output=True, + timeout=300, + ) + converted = Path(tmp) / (src.stem + ".docx") + if not converted.exists(): + raise RuntimeError(f"libreoffice produced no output for {src.name}") + return extract_docx(converted) + + +def extract_pptx(path: str | os.PathLike) -> str: + """pptx → body. One page per slide: title + body text + speaker notes.""" + from pptx import Presentation + + presentation = Presentation(str(path)) + pages: list[str] = [] + for slide in presentation.slides: + parts: list[str] = [] + for shape in slide.shapes: + if shape.has_text_frame and shape.text_frame.text.strip(): + parts.append(shape.text_frame.text.strip()) + if slide.has_notes_slide: + notes = slide.notes_slide.notes_text_frame.text.strip() + if notes: + parts.append(f"[NOTES] {notes}") + pages.append("\n".join(parts)) + return join_pages(pages) + + +def extract_html(path: str | os.PathLike) -> str: + """HTML mirror page → body. 
Strips nav/script/style/footer/header/aside.""" + import chardet + from bs4 import BeautifulSoup + + raw = Path(path).read_bytes() + enc = chardet.detect(raw).get("encoding") or "utf-8" + soup = BeautifulSoup(raw.decode(enc, errors="replace"), "lxml") + + for tag in soup(["script", "style", "nav", "footer", "header", "aside", "noscript"]): + tag.decompose() + # also drop common chrome by role/class + for tag in soup.find_all(attrs={"role": ["navigation", "banner", "contentinfo"]}): + tag.decompose() + + text = soup.get_text(separator="\n") + lines = [ln.strip() for ln in text.splitlines() if ln.strip()] + return join_pages(["\n".join(lines)]) + + +def extract_zip(path: str | os.PathLike) -> str: + """ + zip → body. Unzips into a temp dir and recurses on every extractable inner + file. Inner files are page-renumbered into one continuous body. + """ + path = str(path) + pages: list[str] = [] + with tempfile.TemporaryDirectory() as tmp: + try: + with zipfile.ZipFile(path) as zf: + zf.extractall(tmp) + except zipfile.BadZipFile: + return "" + for inner in sorted(Path(tmp).rglob("*")): + if not inner.is_file() or is_junk(inner): + continue + fmt = detect_format(inner) + if fmt in ("unknown", "epub", "zip"): + # nested zips handled by recursion below + if fmt == "zip": + body = extract_zip(inner) + pages.extend(t for _, t in split_pages(body)) + continue + try: + body = extract_file(inner) + except Exception: + continue + pages.extend(t for _, t in split_pages(body)) + return join_pages(pages) + + +EXTRACTORS: dict[str, Callable[[str | os.PathLike], str]] = { + "pdf": extract_pdf, + "docx": extract_docx, + "doc": extract_doc, + "pptx": extract_pptx, + "html": extract_html, + "zip": extract_zip, +} + + +def extract_file(path: str | os.PathLike) -> str: + """Dispatch a single file to the right extractor. Returns a page-marked body.""" + fmt = detect_format(path) + if fmt == "txt": + body = Path(path).read_text(encoding="utf-8", errors="replace") + # already paginated? 
pass through; else wrap as one page + return body if count_page_markers(body) else join_pages([body]) + extractor = EXTRACTORS.get(fmt) + if extractor is None: + raise ValueError(f"No extractor for format '{fmt}': {path}") + return extractor(path) diff --git a/scripts/html_extractor.py b/scripts/html_extractor.py deleted file mode 100644 index 08f5898..0000000 --- a/scripts/html_extractor.py +++ /dev/null @@ -1,424 +0,0 @@ -#!/usr/bin/env python3 -""" -HTML Activity Extractor - Proceseaz 1876 fiiere HTML -Extrage automat activiti folosind pattern recognition -""" - -import os -import re -import json -from pathlib import Path -from bs4 import BeautifulSoup -import chardet -from typing import List, Dict, Optional -import sqlite3 -from datetime import datetime - -class HTMLActivityExtractor: - def __init__(self, db_path='data/activities.db'): - self.db_path = db_path - # Pattern-uri pentru detectare activiti �n rom�n - self.activity_patterns = { - 'title_patterns': [ - r'(?i)(joc|activitate|exerci[t]iu|team[\s-]?building|energizer|ice[\s-]?breaker)[\s:]+([^\.]{5,100})', - r'(?i)]*>([^<]*(?:joc|activitate|exerci[t]iu)[^<]*)', - r'(?i)([^<]*(?:joc|activitate|exerci[t]iu)[^<]*)', - r'(?i)^[\d]+\.?\s*([A-Z][^\.]{10,100}(?:joc|activitate|exerci[t]iu)[^\.]{0,50})$', - ], - 'description_markers': [ - 'descriere', 'reguli', 'cum se joac[a]', 'instructiuni', - 'obiectiv', 'desfasurare', 'explicatie', 'mod de joc' - ], - 'materials_markers': [ - 'materiale', 'necesare', 'echipament', 'ce avem nevoie', - 'se folosesc', 'trebuie sa avem', 'dotari' - ], - 'age_patterns': [ - r'(?i)v[�a]rst[a][\s:]+(\d+)[\s-]+(\d+)', - r'(?i)(\d+)[\s-]+(\d+)\s*ani', - r'(?i)pentru\s+(\d+)[\s-]+(\d+)\s*ani', - r'(?i)categoria?\s*(?:de\s*)?v[�a]rst[a][\s:]+(\d+)[\s-]+(\d+)', - ], - 'participants_patterns': [ - r'(?i)(\d+)[\s-]+(\d+)\s*(?:participan[t]i|juc[a]tori|persoane|copii)', - r'(?i)num[a]r\s*(?:de\s*)?(?:participan[t]i|juc[a]tori)[\s:]+(\d+)[\s-]+(\d+)', - r'(?i)grup\s*de\s*(\d+)[\s-]+(\d+)', 
- ], - 'duration_patterns': [ - r'(?i)durat[a][\s:]+(\d+)[\s-]+(\d+)\s*(?:minute|min)', - r'(?i)timp[\s:]+(\d+)[\s-]+(\d+)\s*(?:minute|min)', - r'(?i)(\d+)[\s-]+(\d+)\s*minute', - ] - } - - # Categorii predefinite bazate pe sistemul existent - self.categories = { - '[A]': ['joc', 'joaca', 'distractie', 'amuzament'], - '[B]': ['aventura', 'explorare', 'descoperire'], - '[C]': ['camping', 'tabara', 'excursie', 'drumetie'], - '[D]': ['foc', 'flacara', 'lumina'], - '[E]': ['noduri', 'fr�nghii', 'sfori', 'legare'], - '[F]': ['bushcraft', 'supravietuire', 'survival'], - '[G]': ['educatie', 'educativ', 'invatare', 'scoala'], - '[H]': ['orientare', 'busola', 'harta', 'navigare'] - } - - def detect_encoding(self, file_path): - """Detecteaz encoding-ul fiierului""" - with open(file_path, 'rb') as f: - result = chardet.detect(f.read()) - return result['encoding'] or 'utf-8' - - def extract_from_html(self, html_path: str) -> List[Dict]: - """Extrage activiti dintr-un singur fiier HTML""" - activities = [] - - try: - # Detectare encoding i citire - encoding = self.detect_encoding(html_path) - with open(html_path, 'r', encoding=encoding, errors='ignore') as f: - content = f.read() - - soup = BeautifulSoup(content, 'lxml') - - # Metod 1: Caut liste de activiti - activities.extend(self._extract_from_lists(soup, html_path)) - - # Metod 2: Caut activiti �n headings - activities.extend(self._extract_from_headings(soup, html_path)) - - # Metod 3: Caut pattern-uri �n text - activities.extend(self._extract_from_patterns(soup, html_path)) - - # Metod 4: Caut �n tabele - activities.extend(self._extract_from_tables(soup, html_path)) - - except Exception as e: - print(f"Error processing {html_path}: {e}") - - return activities - - def _extract_from_lists(self, soup, source_file): - """Extrage activiti din liste HTML (ul, ol)""" - activities = [] - - for list_elem in soup.find_all(['ul', 'ol']): - # Verific dac lista pare s conin activiti - list_text = list_elem.get_text().lower() - if 
any(marker in list_text for marker in ['joc', 'activitate', 'exercitiu']): - for li in list_elem.find_all('li'): - text = li.get_text(strip=True) - if len(text) > 20: # Minim 20 caractere pentru o activitate valid - activity = self._create_activity_from_text(text, source_file) - if activity: - activities.append(activity) - - return activities - - def _extract_from_headings(self, soup, source_file): - """Extrage activiti bazate pe headings""" - activities = [] - - for heading in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6']): - heading_text = heading.get_text(strip=True) - - # Verific dac heading-ul conine cuvinte cheie - if any(keyword in heading_text.lower() for keyword in ['joc', 'activitate', 'exercitiu']): - # Caut descrierea �n elementele urmtoare - description = "" - next_elem = heading.find_next_sibling() - - while next_elem and next_elem.name not in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']: - if next_elem.name in ['p', 'div', 'ul']: - description += next_elem.get_text(strip=True) + " " - if len(description) > 500: # Limit descriere - break - next_elem = next_elem.find_next_sibling() - - if description: - activity = { - 'name': heading_text[:200], - 'description': description[:1000], - 'source_file': str(source_file), - 'category': self._detect_category(heading_text + " " + description) - } - activities.append(activity) - - return activities - - def _extract_from_patterns(self, soup, source_file): - """Extrage activiti folosind pattern matching""" - activities = [] - text = soup.get_text() - - # Caut pattern-uri de activiti - for pattern in self.activity_patterns['title_patterns']: - matches = re.finditer(pattern, text, re.MULTILINE) - for match in matches: - title = match.group(0) if match.lastindex == 0 else match.group(match.lastindex) - if len(title) > 10: - # Extrage context �n jurul match-ului - start = max(0, match.start() - 200) - end = min(len(text), match.end() + 500) - context = text[start:end] - - activity = 
self._create_activity_from_text(context, source_file, title) - if activity: - activities.append(activity) - - return activities - - def _extract_from_tables(self, soup, source_file): - """Extrage activiti din tabele""" - activities = [] - - for table in soup.find_all('table'): - rows = table.find_all('tr') - if len(rows) > 1: # Cel puin header i o linie de date - # Detecteaz coloanele relevante - headers = [th.get_text(strip=True).lower() for th in rows[0].find_all(['th', 'td'])] - - for row in rows[1:]: - cells = row.find_all(['td']) - if cells: - activity_data = {} - for i, cell in enumerate(cells): - if i < len(headers): - activity_data[headers[i]] = cell.get_text(strip=True) - - # Creeaz activitate din date tabel - if any(key in activity_data for key in ['joc', 'activitate', 'nume', 'titlu']): - activity = self._create_activity_from_table_data(activity_data, source_file) - if activity: - activities.append(activity) - - return activities - - def _create_activity_from_text(self, text, source_file, title=None): - """Creeaz un dicionar de activitate din text""" - if not text or len(text) < 30: - return None - - activity = { - 'name': title or text[:100].split('.')[0].strip(), - 'description': text[:1000], - 'source_file': str(source_file), - 'category': self._detect_category(text), - 'keywords': self._extract_keywords(text), - 'created_at': datetime.now().isoformat() - } - - # Extrage metadata suplimentar - activity.update(self._extract_metadata(text)) - - return activity - - def _create_activity_from_table_data(self, data, source_file): - """Creeaz activitate din date de tabel""" - activity = { - 'source_file': str(source_file), - 'created_at': datetime.now().isoformat() - } - - # Mapare c�mpuri tabel la c�mpuri DB - field_mapping = { - 'nume': 'name', 'titlu': 'name', 'joc': 'name', 'activitate': 'name', - 'descriere': 'description', 'detalii': 'description', 'explicatie': 'description', - 'materiale': 'materials_list', 'echipament': 'materials_list', - 'varsta': 
'age_group_min', 'categoria': 'category', - 'participanti': 'participants_min', 'numar': 'participants_min', - 'durata': 'duration_min', 'timp': 'duration_min' - } - - for table_field, db_field in field_mapping.items(): - if table_field in data: - activity[db_field] = data[table_field] - - # Validare minim - if 'name' in activity and len(activity.get('name', '')) > 5: - return activity - - return None - - def _extract_metadata(self, text): - """Extrage metadata din text folosind pattern-uri""" - metadata = {} - - # Extrage v�rsta - for pattern in self.activity_patterns['age_patterns']: - match = re.search(pattern, text) - if match: - metadata['age_group_min'] = int(match.group(1)) - metadata['age_group_max'] = int(match.group(2)) if match.lastindex >= 2 else int(match.group(1)) - break - - # Extrage numr participani - for pattern in self.activity_patterns['participants_patterns']: - match = re.search(pattern, text) - if match: - metadata['participants_min'] = int(match.group(1)) - metadata['participants_max'] = int(match.group(2)) if match.lastindex >= 2 else int(match.group(1)) - break - - # Extrage durata - for pattern in self.activity_patterns['duration_patterns']: - match = re.search(pattern, text) - if match: - metadata['duration_min'] = int(match.group(1)) - metadata['duration_max'] = int(match.group(2)) if match.lastindex >= 2 else int(match.group(1)) - break - - # Extrage materiale - materials = [] - text_lower = text.lower() - for marker in self.activity_patterns['materials_markers']: - idx = text_lower.find(marker) - if idx != -1: - # Extrage urmtoarele 200 caractere dup marker - materials_text = text[idx:idx+200] - # Extrage items din list - items = re.findall(r'[-"]\s*([^\n-"]+)', materials_text) - if items: - materials.extend(items) - - if materials: - metadata['materials_list'] = ', '.join(materials[:10]) # Maxim 10 materiale - - return metadata - - def _detect_category(self, text): - """Detecteaz categoria activitii bazat pe cuvinte cheie""" - 
text_lower = text.lower() - - for category, keywords in self.categories.items(): - if any(keyword in text_lower for keyword in keywords): - return category - - return '[A]' # Default categoria jocuri - - def _extract_keywords(self, text): - """Extrage cuvinte cheie din text""" - keywords = [] - text_lower = text.lower() - - # Lista de cuvinte cheie relevante - keyword_list = [ - 'cooperare', 'competitie', 'echipa', 'creativitate', 'miscare', - 'strategie', 'comunicare', 'incredere', 'coordonare', 'atentie', - 'reflexe', 'logica', 'imaginatie', 'muzica', 'dans', 'sport', - 'natura', 'mediu', 'stiinta', 'matematica', 'limba', 'cultura' - ] - - for keyword in keyword_list: - if keyword in text_lower: - keywords.append(keyword) - - return ', '.join(keywords[:5]) # Maxim 5 keywords - - def save_to_database(self, activities): - """Salveaz activitile �n baza de date""" - conn = sqlite3.connect(self.db_path) - cursor = conn.cursor() - - saved_count = 0 - duplicate_count = 0 - - for activity in activities: - try: - # Verific duplicate - cursor.execute( - "SELECT id FROM activities WHERE name = ? 
AND source_file = ?", - (activity.get('name'), activity.get('source_file')) - ) - - if cursor.fetchone(): - duplicate_count += 1 - continue - - # Pregtete valorile pentru insert - columns = [] - values = [] - placeholders = [] - - for key, value in activity.items(): - if key != 'created_at': # Skip created_at, it has default - columns.append(key) - values.append(value) - placeholders.append('?') - - # Insert �n DB - query = f"INSERT INTO activities ({', '.join(columns)}) VALUES ({', '.join(placeholders)})" - cursor.execute(query, values) - saved_count += 1 - - except Exception as e: - print(f"Error saving activity: {e}") - continue - - conn.commit() - conn.close() - - return saved_count, duplicate_count - - def process_all_html_files(self, base_path='/mnt/d/GoogleDrive/Cercetasi/carti-camp-jocuri'): - """Proceseaz toate fiierele HTML din directorul specificat""" - base_path = Path(base_path) - html_files = list(base_path.rglob("*.html")) - html_files.extend(list(base_path.rglob("*.htm"))) - - print(f"Found {len(html_files)} HTML files to process") - - all_activities = [] - processed = 0 - errors = 0 - - for i, html_file in enumerate(html_files): - try: - activities = self.extract_from_html(str(html_file)) - all_activities.extend(activities) - processed += 1 - - # Progress update - if (i + 1) % 100 == 0: - print(f"Progress: {i+1}/{len(html_files)} files processed, {len(all_activities)} activities found") - # Save batch to DB - if all_activities: - saved, dupes = self.save_to_database(all_activities) - print(f"Batch saved: {saved} new activities, {dupes} duplicates skipped") - all_activities = [] # Clear buffer - - except Exception as e: - print(f"Error processing {html_file}: {e}") - errors += 1 - - # Save remaining activities - if all_activities: - saved, dupes = self.save_to_database(all_activities) - print(f"Final batch saved: {saved} new activities, {dupes} duplicates skipped") - - print(f"\nProcessing complete!") - print(f"Files processed: {processed}") - 
print(f"Errors: {errors}") - - return processed, errors - -# Funcie main pentru test -if __name__ == "__main__": - extractor = HTMLActivityExtractor() - - # Test pe un fiier sample mai �nt�i - print("Testing on sample file first...") - # Gsete un fiier HTML pentru test - test_files = list(Path('/mnt/d/GoogleDrive/Cercetasi/carti-camp-jocuri').rglob("*.html"))[:3] - - for test_file in test_files: - print(f"\nTesting: {test_file}") - activities = extractor.extract_from_html(str(test_file)) - print(f"Found {len(activities)} activities") - if activities: - print(f"Sample activity: {activities[0]['name'][:50]}...") - - # �ntreab dac s continue cu procesarea complet - response = input("\nContinue with full processing? (y/n): ") - if response.lower() == 'y': - extractor.process_all_html_files() \ No newline at end of file diff --git a/scripts/import_claude_activities.py b/scripts/import_claude_activities.py deleted file mode 100644 index c10141a..0000000 --- a/scripts/import_claude_activities.py +++ /dev/null @@ -1,78 +0,0 @@ -#!/usr/bin/env python3 -""" -Import activities extracted by Claude from JSON files -""" - -import json -import sqlite3 -from pathlib import Path -from datetime import datetime - -class ClaudeActivityImporter: - def __init__(self, db_path='data/activities.db'): - self.db_path = db_path - self.json_dir = Path('scripts/extracted_activities') - self.json_dir.mkdir(exist_ok=True) - - def import_json_file(self, json_path): - """Import activities from a single JSON file""" - with open(json_path, 'r', encoding='utf-8') as f: - data = json.load(f) - - source_file = data.get('source_file', str(json_path)) - activities = data.get('activities', []) - - conn = sqlite3.connect(self.db_path) - cursor = conn.cursor() - - imported = 0 - for activity in activities: - try: - # Add source file and timestamp - activity['source_file'] = source_file - activity['created_at'] = datetime.now().isoformat() - - # Prepare insert - columns = list(activity.keys()) - values = 
list(activity.values()) - placeholders = ['?' for _ in values] - - # Check for duplicate - cursor.execute( - "SELECT id FROM activities WHERE name = ? AND source_file = ?", - (activity.get('name'), source_file) - ) - - if not cursor.fetchone(): - query = f"INSERT INTO activities ({', '.join(columns)}) VALUES ({', '.join(placeholders)})" - cursor.execute(query, values) - imported += 1 - - except Exception as e: - print(f"Error importing activity: {e}") - - conn.commit() - conn.close() - - print(f"Imported {imported} activities from {json_path.name}") - return imported - - def import_all_json_files(self): - """Import all JSON files from the extracted_activities directory""" - json_files = list(self.json_dir.glob("*.json")) - - if not json_files: - print("No JSON files found in extracted_activities directory") - return 0 - - total_imported = 0 - for json_file in json_files: - imported = self.import_json_file(json_file) - total_imported += imported - - print(f"\nTotal imported: {total_imported} activities from {len(json_files)} files") - return total_imported - -if __name__ == "__main__": - importer = ClaudeActivityImporter() - importer.import_all_json_files() \ No newline at end of file diff --git a/scripts/import_common.py b/scripts/import_common.py new file mode 100644 index 0000000..0ec3718 --- /dev/null +++ b/scripts/import_common.py @@ -0,0 +1,179 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +import_common.py — shared helpers for the import / validation side of the +extraction pipeline (Lane C). + +Used by build_database.py and validate_extractions.py: + * JSON-schema validation of subagent extraction files, + * the anti-hallucination source_excerpt substring check (E5), + * locating the source chunk that an extraction file came from, + * the stable content key used by the needs_review queue. 
+""" + +from __future__ import annotations + +import hashlib +import json +import re +import unicodedata +from pathlib import Path +from typing import Any, Optional + +SCRIPT_DIR = Path(__file__).resolve().parent +REPO_ROOT = SCRIPT_DIR.parent + +DEFAULT_SCHEMA_PATH = SCRIPT_DIR / "activity_schema.json" + +# rapidfuzz.partial_ratio is on a 0..100 scale; an excerpt counts as a real +# quote from the source when it scores at least this against the chunk text. +EXCERPT_MATCH_THRESHOLD = 90.0 + + +# -------------------------------------------------------------------------- +# schema validation +# -------------------------------------------------------------------------- +def load_schema(schema_path: str | Path = DEFAULT_SCHEMA_PATH) -> dict: + """Load the activity JSON schema produced by Lane A.""" + return json.loads(Path(schema_path).read_text(encoding="utf-8")) + + +def validate_extraction(data: Any, schema: dict) -> list[str]: + """ + Validate one parsed extraction file against `schema`. + + Returns a list of human-readable error strings; empty list == valid. 
+ """ + import jsonschema + + validator = jsonschema.Draft7Validator(schema) + errors: list[str] = [] + for err in sorted(validator.iter_errors(data), key=lambda e: list(e.path)): + location = "/".join(str(p) for p in err.path) or "" + errors.append(f"{location}: {err.message}") + return errors + + +# -------------------------------------------------------------------------- +# excerpt verification (E5 — anti-hallucination) +# -------------------------------------------------------------------------- +def _normalize_text(text: str) -> str: + return re.sub(r"\s+", " ", (text or "")).strip().lower() + + +def excerpt_score(excerpt: str, chunk_text: str) -> float: + """Best fuzzy-substring score (0..100) of `excerpt` inside `chunk_text`.""" + from rapidfuzz import fuzz + + if not excerpt or not chunk_text: + return 0.0 + return float(fuzz.partial_ratio(_normalize_text(excerpt), _normalize_text(chunk_text))) + + +def excerpt_matches( + excerpt: str, chunk_text: str, threshold: float = EXCERPT_MATCH_THRESHOLD +) -> bool: + """True when `excerpt` appears (fuzzily) as a substring of `chunk_text`.""" + return excerpt_score(excerpt, chunk_text) >= threshold + + +# -------------------------------------------------------------------------- +# locating the source chunk an extraction file came from +# -------------------------------------------------------------------------- +def chunk_key_for(json_path: Path, header: Optional[dict]) -> str: + """ + Resolve the chunk key for an extraction file. + + Prefers the explicit `chunk_key` in the header, otherwise falls back to the + JSON file stem (extraction files are named `.json`). 
+ """ + if header and header.get("chunk_key"): + return str(header["chunk_key"]) + return json_path.stem + + +def source_id_for(chunk_key: str, header: Optional[dict]) -> str: + """Resolve the source id; `.partNN` → ``.""" + if header and header.get("source_id"): + return str(header["source_id"]) + # chunk keys look like ".partNN" + return chunk_key.rsplit(".part", 1)[0] + + +def find_chunk_text( + json_path: Path, header: Optional[dict], chunks_dir: Path +) -> Optional[str]: + """ + Return the text of the source chunk for an extraction file, or None. + + Looks for data/chunks//.txt, then falls back to a + recursive glob on the chunk key. + """ + chunk_key = chunk_key_for(json_path, header) + source_id = source_id_for(chunk_key, header) + + candidate = chunks_dir / source_id / f"{chunk_key}.txt" + if candidate.is_file(): + return candidate.read_text(encoding="utf-8", errors="replace") + + matches = list(chunks_dir.rglob(f"{chunk_key}.txt")) + if matches: + return matches[0].read_text(encoding="utf-8", errors="replace") + return None + + +def source_path_for(source_id: str, sources_dir: Path) -> Optional[str]: + """ + Read the original `SOURCE:` path from a normalized source header. + + data/sources/.txt starts with a `SOURCE: ` line. 
+ """ + src_file = sources_dir / f"{source_id}.txt" + if not src_file.is_file(): + return None + try: + with src_file.open(encoding="utf-8", errors="replace") as fh: + for line in fh: + if line.startswith("SOURCE:"): + return line.split(":", 1)[1].strip() + if line.startswith("=") or line.startswith("--- PAGE "): + break + except OSError: + return None + return None + + +# -------------------------------------------------------------------------- +# stable content key for the needs_review queue (plan §5c) +# -------------------------------------------------------------------------- +def normalize_name(name: str) -> str: + """Diacritic-free, lowercased, whitespace-collapsed name (dedup key).""" + if not name: + return "" + decomposed = unicodedata.normalize("NFKD", name) + ascii_str = "".join(c for c in decomposed if not unicodedata.combining(c)) + return re.sub(r"\s+", " ", ascii_str.lower().strip()) + + +def content_key(normalized_name: str, language: Optional[str], description: str) -> str: + """ + Stable hash identifying a row for the review queue. + + Only borderline-kept-separate rows and legacy `.doc` rows ever carry + needs_review, and neither is auto-merged — so their (normalized_name, + language, description) triple is stable across rebuilds. 
+ """ + payload = f"{normalized_name}\x1f{language or ''}\x1f{_normalize_text(description)}" + return hashlib.sha1(payload.encode("utf-8")).hexdigest() + + +# -------------------------------------------------------------------------- +# iteration +# -------------------------------------------------------------------------- +def iter_extraction_files(extracted_dir: Path): + """Yield every *.json directly under `extracted_dir` (skips _rejected/).""" + if not extracted_dir.is_dir(): + return + for path in sorted(extracted_dir.glob("*.json")): + if path.is_file(): + yield path diff --git a/scripts/normalize_sources.py b/scripts/normalize_sources.py new file mode 100644 index 0000000..2c9c607 --- /dev/null +++ b/scripts/normalize_sources.py @@ -0,0 +1,255 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +normalize_sources.py — walk data/carti-camp-jocuri/ and write data/sources/.txt. + +Output files keep the existing header format: + + SOURCE: + CONVERTED: + FORMAT: + NEEDS_REVIEW: (optional — legacy .doc conversions) + ================================================== + + --- PAGE 1 --- + ... + +Each source gets a stable id = <8-hex hash of relative path>_, +so two files with the same name in different folders never collide. + +The pipeline is script-only: this normalizes formats, it does NOT run extraction. +Run `--check-deps` before a long job. 
+""" + +from __future__ import annotations + +import argparse +import datetime as _dt +import hashlib +import re +import sys +from pathlib import Path + +SCRIPT_DIR = Path(__file__).resolve().parent +if str(SCRIPT_DIR) not in sys.path: + sys.path.insert(0, str(SCRIPT_DIR)) + +from extract_common import ( # noqa: E402 + count_page_markers, + dedupe_texts, + detect_format, + extract_file, + extract_html, + is_junk, + join_pages, + preflight, + split_pages, +) + +HEADER_RULE = "=" * 50 + + +# -------------------------------------------------------------------------- +# stable source id +# -------------------------------------------------------------------------- +def sanitize_stem(stem: str) -> str: + s = re.sub(r"[^\w]+", "_", stem, flags=re.UNICODE).strip("_").lower() + return s[:60] or "source" + + +def stable_id(relative_path: str | Path) -> str: + """Collision-proof id derived from the path relative to the corpus root.""" + rel = str(relative_path).replace("\\", "/") + digest = hashlib.sha1(rel.encode("utf-8")).hexdigest()[:8] + stem = sanitize_stem(Path(rel).stem) + return f"{digest}_{stem}" + + +# -------------------------------------------------------------------------- +# header +# -------------------------------------------------------------------------- +def build_header( + source_rel: str, fmt: str, needs_review: str | None = None +) -> str: + today = _dt.date.today().isoformat() + lines = [ + f"SOURCE: {source_rel}", + f"CONVERTED: {today}", + f"FORMAT: {fmt}", + ] + if needs_review: + lines.append(f"NEEDS_REVIEW: {needs_review}") + lines.append(HEADER_RULE) + return "\n".join(lines) + "\n\n" + + +# -------------------------------------------------------------------------- +# mirror-site directories +# -------------------------------------------------------------------------- +MIRROR_PAGE_EXTS = {".html", ".htm"} + + +def is_mirror_dir(path: Path) -> bool: + """A directory counts as a site mirror if it contains HTML pages.""" + if not path.is_dir(): + 
return False + if path.name.endswith("_files"): + return False + return any( + p.suffix.lower() in MIRROR_PAGE_EXTS + for p in path.rglob("*") + if p.is_file() + ) + + +def normalize_mirror(mirror_dir: Path) -> str: + """Extract every HTML page in a mirror dir, dedupe near-duplicates, join.""" + pages: list[tuple[str, str]] = [] + for html in sorted(mirror_dir.rglob("*")): + if not html.is_file() or html.suffix.lower() not in MIRROR_PAGE_EXTS: + continue + if "_files" in html.parts: + continue + try: + body = extract_html(html) + except Exception: + continue + text = "\n".join(t for _, t in split_pages(body)) + if text.strip(): + pages.append((str(html.relative_to(mirror_dir)), text)) + pages = dedupe_texts(pages) + return join_pages([t for _, t in pages]) + + +# -------------------------------------------------------------------------- +# one source +# -------------------------------------------------------------------------- +def normalize_one( + path: Path, corpus_root: Path, out_dir: Path +) -> dict | None: + """ + Normalize a single file or mirror directory → data/sources/.txt. + + Returns a result dict, or None if the entry was skipped (junk / ignored). 
+ """ + rel = path.relative_to(corpus_root) + sid = stable_id(rel) + + if path.is_dir(): + if not is_mirror_dir(path): + return None + fmt, needs_review = "html-mirror", None + body = normalize_mirror(path) + else: + if is_junk(path): + return None + fmt = detect_format(path) + if fmt in ("unknown", "epub", "txt"): + return None # epub duplicates PDFs; txt is not a source format here + needs_review = "legacy .doc conversion is imperfect" if fmt == "doc" else None + try: + body = extract_file(path) + except Exception as exc: # noqa: BLE001 + return {"id": sid, "source": str(rel), "status": "error", "error": str(exc)} + + if not body.strip(): + return {"id": sid, "source": str(rel), "status": "empty"} + + out_path = out_dir / f"{sid}.txt" + out_path.write_text(build_header(str(rel), fmt, needs_review) + body, + encoding="utf-8") + return { + "id": sid, + "source": str(rel), + "status": "ok", + "format": fmt, + "pages": count_page_markers(body), + "needs_review": bool(needs_review), + } + + +# -------------------------------------------------------------------------- +# walk +# -------------------------------------------------------------------------- +def iter_corpus_entries(corpus_root: Path): + """Yield top-level files and mirror directories under the corpus root.""" + for entry in sorted(corpus_root.iterdir()): + if entry.name.startswith("."): + continue + if entry.is_dir(): + if is_mirror_dir(entry): + yield entry + else: + yield entry + + +def run(corpus_root: Path, out_dir: Path) -> dict: + out_dir.mkdir(parents=True, exist_ok=True) + results: list[dict] = [] + for entry in iter_corpus_entries(corpus_root): + res = normalize_one(entry, corpus_root, out_dir) + if res is not None: + results.append(res) + summary = { + "total": len(results), + "ok": sum(1 for r in results if r["status"] == "ok"), + "errors": sum(1 for r in results if r["status"] == "error"), + "empty": sum(1 for r in results if r["status"] == "empty"), + "needs_review": sum(1 for r in results if 
r.get("needs_review")), + "results": results, + } + return summary + + +# -------------------------------------------------------------------------- +# CLI +# -------------------------------------------------------------------------- +def print_preflight(report: dict) -> int: + print("Dependency preflight") + print("--------------------") + if report["missing_python"]: + print(" MISSING Python packages: " + ", ".join(report["missing_python"])) + else: + print(" Python packages: OK") + if report["missing_system"]: + print(" MISSING system tools : " + ", ".join(report["missing_system"])) + for w in report["warnings"]: + print(f" WARNING: {w}") + print(" => " + ("READY" if report["ok"] else "NOT READY — install the above")) + return 0 if report["ok"] else 1 + + +def main(argv: list[str] | None = None) -> int: + parser = argparse.ArgumentParser(description="Normalize mixed sources to .txt") + parser.add_argument("--corpus", default="data/carti-camp-jocuri", + help="corpus root to walk") + parser.add_argument("--out", default="data/sources", help="output directory") + parser.add_argument("--check-deps", action="store_true", + help="run dependency preflight and exit") + parser.add_argument("--ocr", action="store_true", + help="include OCR (tesseract) in the preflight check") + args = parser.parse_args(argv) + + if args.check_deps: + return print_preflight(preflight(check_ocr=args.ocr)) + + report = preflight(check_ocr=args.ocr) + if report["missing_python"]: + print_preflight(report) + return 1 + for w in report["warnings"]: + print(f"WARNING: {w}") + + summary = run(Path(args.corpus), Path(args.out)) + print(f"normalized : {summary['ok']}/{summary['total']}") + print(f"errors : {summary['errors']}") + print(f"empty : {summary['empty']}") + print(f"needs_review: {summary['needs_review']}") + for r in summary["results"]: + if r["status"] != "ok": + print(f" [{r['status']}] {r['source']}") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git 
a/scripts/pdf_extractor.py b/scripts/pdf_extractor.py deleted file mode 100644 index e69de29..0000000 diff --git a/scripts/pdf_to_text_converter.py b/scripts/pdf_to_text_converter.py deleted file mode 100644 index db03509..0000000 --- a/scripts/pdf_to_text_converter.py +++ /dev/null @@ -1,143 +0,0 @@ -#!/usr/bin/env python3 -""" -PDF Mass Conversion to Text for Activity Extraction -Handles all PDF sizes efficiently with multiple fallback methods -""" - -import os -import json -from pathlib import Path -import PyPDF2 -import pdfplumber -from typing import List, Dict -import logging - -class PDFConverter: - def __init__(self, max_pages=50): - self.max_pages = max_pages - self.conversion_stats = {} - - def convert_pdf_to_text(self, pdf_path: str) -> str: - """Convert PDF to text using multiple methods with fallbacks""" - try: - # Method 1: pdfplumber (best for tables and layout) - return self._convert_with_pdfplumber(pdf_path) - except Exception as e: - print(f"pdfplumber failed for {pdf_path}: {e}") - - try: - # Method 2: PyPDF2 (fallback) - return self._convert_with_pypdf2(pdf_path) - except Exception as e2: - print(f"PyPDF2 also failed for {pdf_path}: {e2}") - return "" - - def _convert_with_pdfplumber(self, pdf_path: str) -> str: - """Primary conversion method using pdfplumber""" - text_content = "" - - with pdfplumber.open(pdf_path) as pdf: - total_pages = len(pdf.pages) - pages_to_process = min(total_pages, self.max_pages) - - print(f" Converting {pdf_path}: {pages_to_process}/{total_pages} pages") - - for i, page in enumerate(pdf.pages[:pages_to_process]): - try: - page_text = page.extract_text() - if page_text: - text_content += f"\n--- PAGE {i+1} ---\n" - text_content += page_text - text_content += "\n" - except Exception as e: - print(f" Error on page {i+1}: {e}") - continue - - self.conversion_stats[pdf_path] = { - 'method': 'pdfplumber', - 'pages_processed': pages_to_process, - 'total_pages': total_pages, - 'success': True, - 'text_length': 
len(text_content) - } - - return text_content - - def _convert_with_pypdf2(self, pdf_path: str) -> str: - """Fallback conversion method using PyPDF2""" - text_content = "" - - with open(pdf_path, 'rb') as file: - reader = PyPDF2.PdfReader(file) - total_pages = len(reader.pages) - pages_to_process = min(total_pages, self.max_pages) - - print(f" Converting {pdf_path} (fallback): {pages_to_process}/{total_pages} pages") - - for i in range(pages_to_process): - try: - page = reader.pages[i] - page_text = page.extract_text() - if page_text: - text_content += f"\n--- PAGE {i+1} ---\n" - text_content += page_text - text_content += "\n" - except Exception as e: - print(f" Error on page {i+1}: {e}") - continue - - self.conversion_stats[pdf_path] = { - 'method': 'PyPDF2', - 'pages_processed': pages_to_process, - 'total_pages': total_pages, - 'success': True, - 'text_length': len(text_content) - } - - return text_content - - def convert_all_pdfs(self, pdf_directory: str, output_directory: str): - """Convert all PDFs in directory to text files""" - pdf_files = list(Path(pdf_directory).glob("**/*.pdf")) - - print(f"🔄 Converting {len(pdf_files)} PDF files to text...") - - os.makedirs(output_directory, exist_ok=True) - - for i, pdf_path in enumerate(pdf_files): - print(f"\n[{i+1}/{len(pdf_files)}] Processing {pdf_path.name}...") - - # Convert to text - text_content = self.convert_pdf_to_text(str(pdf_path)) - - if text_content.strip(): - # Save as text file - output_file = Path(output_directory) / f"{pdf_path.stem}.txt" - with open(output_file, 'w', encoding='utf-8') as f: - f.write(f"SOURCE: {pdf_path}\n") - f.write(f"CONVERTED: 2025-01-11\n") - f.write("="*50 + "\n\n") - f.write(text_content) - - print(f" ✅ Saved: {output_file}") - else: - print(f" ❌ No text extracted from {pdf_path.name}") - - # Save conversion statistics - stats_file = Path(output_directory) / "conversion_stats.json" - with open(stats_file, 'w', encoding='utf-8') as f: - json.dump(self.conversion_stats, f, 
indent=2, ensure_ascii=False) - - print(f"\n🎉 PDF conversion complete! Check {output_directory}") - return len([f for f in self.conversion_stats.values() if f['success']]) - -# Usage -if __name__ == "__main__": - converter = PDFConverter(max_pages=50) - - # Convert all PDFs - pdf_dir = "/mnt/d/GoogleDrive/Cercetasi/carti-camp-jocuri" - output_dir = "/mnt/d/GoogleDrive/Cercetasi/carti-camp-jocuri/INDEX-SISTEM-JOCURI/converted_pdfs" - - converted_count = converter.convert_all_pdfs(pdf_dir, output_dir) - print(f"Final result: {converted_count} PDFs successfully converted") \ No newline at end of file diff --git a/scripts/review_queue.py b/scripts/review_queue.py new file mode 100644 index 0000000..bf75c76 --- /dev/null +++ b/scripts/review_queue.py @@ -0,0 +1,145 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +review_queue.py — CLI for the needs_review lifecycle (plan §5c). + +Rows land in the queue when dedup leaves a borderline pair separate, or when a +legacy `.doc` source was converted imperfectly. Each row has a stable content +key; a decision written here is stored in data/review_decisions.json (git +tracked) and re-applied by build_database.py on every rebuild, so the queue +never resurfaces a resolved row. 
+ +Commands: + python scripts/review_queue.py list + python scripts/review_queue.py resolve +""" + +from __future__ import annotations + +import argparse +import json +import sqlite3 +import sys +from pathlib import Path +from typing import Optional + +SCRIPT_DIR = Path(__file__).resolve().parent +REPO_ROOT = SCRIPT_DIR.parent +for _p in (str(SCRIPT_DIR), str(REPO_ROOT)): + if _p not in sys.path: + sys.path.insert(0, _p) + +from import_common import content_key, normalize_name # noqa: E402 + +VALID_DECISIONS = ("merge", "keep-separate", "drop") + + +# -------------------------------------------------------------------------- +# review_decisions.json +# -------------------------------------------------------------------------- +def load_decisions(path: Path) -> dict: + if path.is_file(): + try: + data = json.loads(path.read_text(encoding="utf-8")) + if isinstance(data, dict): + return data + except (json.JSONDecodeError, OSError): + pass + return {} + + +def save_decisions(decisions: dict, path: Path) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text( + json.dumps(decisions, indent=2, ensure_ascii=False, sort_keys=True), + encoding="utf-8", + ) + + +# -------------------------------------------------------------------------- +# queue +# -------------------------------------------------------------------------- +def list_queue(db_path: Path) -> list[dict]: + """Return every needs_review row in the current DB, with its content key.""" + if not db_path.is_file(): + return [] + conn = sqlite3.connect(db_path) + conn.row_factory = sqlite3.Row + try: + rows = conn.execute( + "SELECT name, normalized_name, language, description " + "FROM activities WHERE needs_review = 1 ORDER BY normalized_name" + ).fetchall() + except sqlite3.OperationalError: + return [] + finally: + conn.close() + + out = [] + for row in rows: + norm = row["normalized_name"] or normalize_name(row["name"]) + key = content_key(norm, row["language"], row["description"] or "") + 
out.append({ + "id": key, + "name": row["name"], + "language": row["language"], + "description": row["description"] or "", + }) + return out + + +def resolve(decisions_path: Path, content_id: str, decision: str) -> dict: + """Record a decision for a content key in review_decisions.json.""" + if decision not in VALID_DECISIONS: + raise ValueError( + f"invalid decision {decision!r}; expected one of {VALID_DECISIONS}" + ) + decisions = load_decisions(decisions_path) + decisions[content_id] = {"decision": decision} + save_decisions(decisions, decisions_path) + return decisions + + +# -------------------------------------------------------------------------- +# CLI +# -------------------------------------------------------------------------- +def main(argv: Optional[list[str]] = None) -> int: + parser = argparse.ArgumentParser(description="needs_review queue CLI") + parser.add_argument("--db", default="data/activities.db") + parser.add_argument("--decisions", default="data/review_decisions.json") + sub = parser.add_subparsers(dest="command", required=True) + + sub.add_parser("list", help="list rows currently flagged needs_review") + + p_resolve = sub.add_parser("resolve", help="record a decision for a row") + p_resolve.add_argument("id", help="content id from `list`") + p_resolve.add_argument("decision", choices=VALID_DECISIONS) + + args = parser.parse_args(argv) + + if args.command == "list": + rows = list_queue(Path(args.db)) + if not rows: + print("review queue is empty.") + return 0 + print(f"{len(rows)} row(s) need review:\n") + for r in rows: + desc = r["description"][:80].replace("\n", " ") + print(f" id : {r['id']}") + print(f" name : {r['name']} [{r['language']}]") + print(f" desc : {desc}") + print(f" -> review_queue.py resolve {r['id']} ") + print() + return 0 + + if args.command == "resolve": + resolve(Path(args.decisions), args.id, args.decision) + print(f"recorded: {args.id} -> {args.decision}") + print(f"written to {args.decisions} (applied on next 
build_database --rebuild)") + return 0 + + return 1 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/scripts/run_extraction.py b/scripts/run_extraction.py index 9304861..c80747a 100644 --- a/scripts/run_extraction.py +++ b/scripts/run_extraction.py @@ -1,50 +1,140 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ -Main extraction orchestrator -Ruleaza intregul proces de extractie +run_extraction.py — extraction orchestrator (plan §3). + +The pipeline is script-only up to the LLM step: this script normalizes the +corpus, chunks the normalized sources, and emits one subagent prompt per +`pending` chunk. It does NOT run the extraction itself — that step is the +interactive Claude Code orchestrator launching waves of subagents. + +Steps: + 1. normalize data/carti-camp-jocuri/ -> data/sources/*.txt + 2. chunk data/sources/*.txt -> data/chunks//*.txt + manifest.json + 3. emit one prompt per `pending` chunk -> data/chunks/_prompts/*.md + 4. report how many chunks remain `pending` + +Usage: + python scripts/run_extraction.py + python scripts/run_extraction.py --skip-normalize # re-chunk only """ +from __future__ import annotations + +import argparse import sys -import time from pathlib import Path +from typing import Optional -from unified_processor import UnifiedProcessor -from import_claude_activities import ClaudeActivityImporter +SCRIPT_DIR = Path(__file__).resolve().parent +REPO_ROOT = SCRIPT_DIR.parent +for _p in (str(SCRIPT_DIR), str(REPO_ROOT)): + if _p not in sys.path: + sys.path.insert(0, _p) + +import chunk_sources # noqa: E402 +import normalize_sources # noqa: E402 + +SUBAGENT_PROMPT = SCRIPT_DIR / "SUBAGENT_PROMPT.md" + + +def emit_chunk_prompt(chunk_key: str, meta: dict, prompts_dir: Path) -> Path: + """Write the subagent prompt for one pending chunk.""" + chunk_file = meta.get("chunk_file", f"data/chunks//{chunk_key}.txt") + expected_json = meta.get("expected_json", f"{chunk_key}.json") + text = "\n".join([ + f"# EXTRACTION — chunk 
`{chunk_key}`", + "", + f"Read ONLY this chunk: `{chunk_file}`", + f"Chunk range: {meta.get('chunk_range', '?')}", + "", + f"Follow the rules in `{SUBAGENT_PROMPT.relative_to(REPO_ROOT)}`.", + "Identify every distinct activity, fill the schema " + "(`scripts/activity_schema.json`), and write the result to:", + "", + f" data/extracted/{expected_json}", + "", + "Header fields to set: " + f'source_id="{meta.get("source_id", "")}", chunk_key="{chunk_key}", ' + f'source_hash="{meta.get("source_hash", "")}".', + "", + ]) + prompts_dir.mkdir(parents=True, exist_ok=True) + out = prompts_dir / f"{chunk_key}.prompt.md" + out.write_text(text, encoding="utf-8") + return out + + +def run( + *, + corpus_root: Path, + sources_dir: Path, + chunks_dir: Path, + skip_normalize: bool = False, +) -> dict: + summary: dict = {} + + if not skip_normalize: + norm = normalize_sources.run(corpus_root, sources_dir) + summary["normalized"] = {"ok": norm["ok"], "total": norm["total"], + "errors": norm["errors"]} + + chunk_summary = chunk_sources.run(sources_dir, chunks_dir) + summary["chunks"] = chunk_summary + + manifest_path = chunks_dir / "manifest.json" + manifest = chunk_sources.load_manifest(manifest_path) + prompts_dir = chunks_dir / "_prompts" + + pending = {k: m for k, m in manifest["chunks"].items() + if m.get("state") == "pending"} + for key, meta in sorted(pending.items()): + emit_chunk_prompt(key, meta, prompts_dir) + + states: dict[str, int] = {} + for m in manifest["chunks"].values(): + states[m.get("state", "?")] = states.get(m.get("state", "?"), 0) + 1 + summary["states"] = states + summary["pending"] = len(pending) + summary["prompts_dir"] = str(prompts_dir) + return summary + + +def main(argv: Optional[list[str]] = None) -> int: + parser = argparse.ArgumentParser(description="Extraction orchestrator.") + parser.add_argument("--corpus", default="data/carti-camp-jocuri") + parser.add_argument("--sources", default="data/sources") + parser.add_argument("--chunks", 
default="data/chunks") + parser.add_argument("--skip-normalize", action="store_true", + help="skip normalization, re-chunk existing sources only") + args = parser.parse_args(argv) + + summary = run( + corpus_root=Path(args.corpus), + sources_dir=Path(args.sources), + chunks_dir=Path(args.chunks), + skip_normalize=args.skip_normalize, + ) + + print("=" * 60) + print("EXTRACTION ORCHESTRATOR") + print("=" * 60) + if "normalized" in summary: + n = summary["normalized"] + print(f"normalized : {n['ok']}/{n['total']} (errors {n['errors']})") + print(f"chunks : {summary['chunks']['chunks']}") + for state, count in sorted(summary["states"].items()): + print(f" {state:<10}: {count}") + print(f"\npending chunks remaining : {summary['pending']}") + if summary["pending"]: + print(f"subagent prompts written : {summary['prompts_dir']}/") + print("Launch waves of ~5-10 subagents on those prompts, then run " + "validate_extractions.py and build_database.py --rebuild.") + else: + print("All chunks extracted — run build_database.py --rebuild.") + print("=" * 60) + return 0 -def main(): - print("="*60) - print("ACTIVITY EXTRACTION SYSTEM") - print("Strategy S8: Hybrid Claude + Scripts") - print("="*60) - - # Step 1: Run automated extraction - print("\nSTEP 1: Automated Extraction") - print("-"*40) - processor = UnifiedProcessor() - processor.process_automated_formats() - - # Step 2: Wait for Claude processing - print("\n" + "="*60) - print("STEP 2: Manual Claude Processing Required") - print("-"*40) - print("Please process PDF/DOC files with Claude using the template.") - print("Files are listed in: pdf_doc_for_claude.txt") - print("Save extracted activities as JSON in: scripts/extracted_activities/") - print("="*60) - - response = input("\nHave you completed Claude processing? 
(y/n): ") - - if response.lower() == 'y': - # Step 3: Import Claude-extracted activities - print("\nSTEP 3: Importing Claude-extracted activities") - print("-"*40) - importer = ClaudeActivityImporter() - importer.import_all_json_files() - - print("\n" + "="*60) - print("EXTRACTION COMPLETE!") - print("="*60) if __name__ == "__main__": - main() \ No newline at end of file + raise SystemExit(main()) diff --git a/scripts/text_extractor.py b/scripts/text_extractor.py deleted file mode 100644 index 47b9b16..0000000 --- a/scripts/text_extractor.py +++ /dev/null @@ -1,197 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -""" -Text/Markdown Activity Extractor -Proceseaza fisiere TXT si MD pentru extractie activitati -""" - -import re -from pathlib import Path -from typing import List, Dict -import sqlite3 -from datetime import datetime - -class TextActivityExtractor: - def __init__(self, db_path='data/activities.db'): - self.db_path = db_path - self.activity_patterns = { - 'section_headers': [ - r'^#{1,6}\s*(.+)$', # Markdown headers - r'^([A-Z][^\.]{10,100})$', # Titluri simple - r'^\d+\.\s*(.+)$', # Numbered lists - r'^[•\-\*]\s*(.+)$', # Bullet points - ], - 'activity_markers': [ - 'joc:', 'activitate:', 'exercitiu:', 'team building:', - 'nume:', 'titlu:', 'denumire:' - ] - } - - def extract_from_text(self, file_path: str) -> List[Dict]: - """Extrage activitati din fisier text/markdown""" - activities = [] - - try: - with open(file_path, 'r', encoding='utf-8', errors='ignore') as f: - content = f.read() - - # Metoda 1: Cauta sectiuni markdown - if file_path.endswith('.md'): - activities.extend(self._extract_from_markdown(content, file_path)) - - # Metoda 2: Cauta pattern-uri generale - activities.extend(self._extract_from_patterns(content, file_path)) - - # Metoda 3: Cauta blocuri de text structurate - activities.extend(self._extract_from_blocks(content, file_path)) - - except Exception as e: - print(f"Error processing {file_path}: {e}") - - return activities - 
- def _extract_from_markdown(self, content, source_file): - """Extrage activitati din format markdown""" - activities = [] - lines = content.split('\n') - - current_activity = None - current_content = [] - - for line in lines: - # Verifica daca e header de activitate - if re.match(r'^#{1,3}\s*(.+)', line): - # Salveaza activitatea anterioara daca exista - if current_activity and current_content: - current_activity['description'] = '\n'.join(current_content[:20]) # Max 20 linii - activities.append(current_activity) - - # Verifica daca noul header e o activitate - header_text = re.sub(r'^#{1,3}\s*', '', line) - if any(marker in header_text.lower() for marker in ['joc', 'activitate', 'exercitiu']): - current_activity = { - 'name': header_text[:200], - 'source_file': str(source_file), - 'category': '[A]' - } - current_content = [] - else: - current_activity = None - - elif current_activity: - # Adauga continut la activitatea curenta - if line.strip(): - current_content.append(line) - - # Salveaza ultima activitate - if current_activity and current_content: - current_activity['description'] = '\n'.join(current_content[:20]) - activities.append(current_activity) - - return activities - - def _extract_from_patterns(self, content, source_file): - """Extrage folosind pattern matching""" - activities = [] - - # Cauta markeri specifici de activitati - for marker in self.activity_patterns['activity_markers']: - pattern = re.compile(f'{re.escape(marker)}\\s*(.+?)(?=\\n\\n|{re.escape(marker)}|$)', - re.IGNORECASE | re.DOTALL) - matches = pattern.finditer(content) - - for match in matches: - activity_text = match.group(1) - if len(activity_text) > 20: - activity = { - 'name': activity_text.split('\n')[0][:200], - 'description': activity_text[:1000], - 'source_file': str(source_file), - 'category': '[A]' - } - activities.append(activity) - - return activities - - def _extract_from_blocks(self, content, source_file): - """Extrage din blocuri de text separate""" - activities = [] - 
- # Imparte in blocuri separate de linii goale - blocks = re.split(r'\n\s*\n', content) - - for block in blocks: - if len(block) > 50: # Minim 50 caractere - lines = block.strip().split('\n') - first_line = lines[0].strip() - - # Verifica daca blocul pare o activitate - if any(keyword in first_line.lower() for keyword in ['joc', 'activitate', 'exercitiu']): - activity = { - 'name': first_line[:200], - 'description': block[:1000], - 'source_file': str(source_file), - 'category': '[A]' - } - activities.append(activity) - - return activities - - def save_to_database(self, activities): - """Salveaza in baza de date""" - conn = sqlite3.connect(self.db_path) - cursor = conn.cursor() - - saved_count = 0 - - for activity in activities: - try: - # Check for duplicates - cursor.execute( - "SELECT id FROM activities WHERE name = ? AND source_file = ?", - (activity.get('name'), activity.get('source_file')) - ) - - if not cursor.fetchone(): - columns = list(activity.keys()) - values = list(activity.values()) - placeholders = ['?' 
for _ in values] - - query = f"INSERT INTO activities ({', '.join(columns)}) VALUES ({', '.join(placeholders)})" - cursor.execute(query, values) - saved_count += 1 - - except Exception as e: - print(f"Error saving: {e}") - - conn.commit() - conn.close() - - return saved_count - - def process_all_text_files(self, base_path='/mnt/d/GoogleDrive/Cercetasi/carti-camp-jocuri'): - """Proceseaza toate fisierele text si markdown""" - base_path = Path(base_path) - - text_files = list(base_path.rglob("*.txt")) - md_files = list(base_path.rglob("*.md")) - all_files = text_files + md_files - - print(f"Found {len(all_files)} text/markdown files") - - all_activities = [] - - for file_path in all_files: - activities = self.extract_from_text(str(file_path)) - all_activities.extend(activities) - print(f"Processed {file_path.name}: {len(activities)} activities") - - # Save to database - saved = self.save_to_database(all_activities) - print(f"\nTotal saved: {saved} activities from {len(all_files)} files") - - return len(all_files), saved - -if __name__ == "__main__": - extractor = TextActivityExtractor() - extractor.process_all_text_files() \ No newline at end of file diff --git a/scripts/unified_processor.py b/scripts/unified_processor.py deleted file mode 100644 index 8a6d2a3..0000000 --- a/scripts/unified_processor.py +++ /dev/null @@ -1,151 +0,0 @@ -#!/usr/bin/env python3 -""" -Unified Activity Processor -Orchestreaz toate extractoarele pentru procesare complet -""" - -import time -from pathlib import Path -from html_extractor import HTMLActivityExtractor -from text_extractor import TextActivityExtractor -import sqlite3 - -class UnifiedProcessor: - def __init__(self, db_path='data/activities.db'): - self.db_path = db_path - self.html_extractor = HTMLActivityExtractor(db_path) - self.text_extractor = TextActivityExtractor(db_path) - self.stats = { - 'html_processed': 0, - 'text_processed': 0, - 'pdf_to_process': 0, - 'doc_to_process': 0, - 'total_activities': 0, - 'start_time': 
None, - 'end_time': None - } - - def get_current_activity_count(self): - """Obine numrul curent de activiti din DB""" - conn = sqlite3.connect(self.db_path) - cursor = conn.cursor() - cursor.execute("SELECT COUNT(*) FROM activities") - count = cursor.fetchone()[0] - conn.close() - return count - - def count_files_to_process(self, base_path): - """Numr fiierele care trebuie procesate""" - base_path = Path(base_path) - - counts = { - 'html': len(list(base_path.rglob("*.html"))) + len(list(base_path.rglob("*.htm"))), - 'txt': len(list(base_path.rglob("*.txt"))), - 'md': len(list(base_path.rglob("*.md"))), - 'pdf': len(list(base_path.rglob("*.pdf"))), - 'doc': len(list(base_path.rglob("*.doc"))), - 'docx': len(list(base_path.rglob("*.docx"))) - } - - return counts - - def process_automated_formats(self, base_path='/mnt/d/GoogleDrive/Cercetasi/carti-camp-jocuri'): - """Proceseaz toate formatele care pot fi automatizate""" - print("="*60) - print("UNIFIED ACTIVITY PROCESSOR - AUTOMATED PHASE") - print("="*60) - - self.stats['start_time'] = time.time() - initial_count = self.get_current_activity_count() - - # Afieaz statistici iniiale - file_counts = self.count_files_to_process(base_path) - print(f"\nFiles to process:") - for format, count in file_counts.items(): - print(f" {format.upper()}: {count} files") - print(f"\nCurrent activities in database: {initial_count}") - print("-"*60) - - # FAZA 1: Procesare HTML (prioritate maxim - volum mare) - print("\n[1/2] Processing HTML files...") - print("-"*40) - html_processed, html_errors = self.html_extractor.process_all_html_files(base_path) - self.stats['html_processed'] = html_processed - - # FAZA 2: Procesare Text/MD - print("\n[2/2] Processing Text/Markdown files...") - print("-"*40) - text_processed, text_saved = self.text_extractor.process_all_text_files(base_path) - self.stats['text_processed'] = text_processed - - # Statistici finale - self.stats['end_time'] = time.time() - final_count = 
self.get_current_activity_count() - self.stats['total_activities'] = final_count - initial_count - - # Identific fiierele care necesit procesare manual - self.stats['pdf_to_process'] = file_counts['pdf'] - self.stats['doc_to_process'] = file_counts['doc'] + file_counts['docx'] - - self.print_summary() - self.save_pdf_doc_list(base_path) - - def print_summary(self): - """Afieaz rezumatul procesrii""" - print("\n" + "="*60) - print("PROCESSING SUMMARY") - print("="*60) - - duration = self.stats['end_time'] - self.stats['start_time'] - - print(f"\nAutomated Processing Results:") - print(f" HTML files processed: {self.stats['html_processed']}") - print(f" Text/MD files processed: {self.stats['text_processed']}") - print(f" New activities added: {self.stats['total_activities']}") - print(f" Processing time: {duration:.1f} seconds") - - print(f"\nFiles requiring Claude processing:") - print(f" PDF files: {self.stats['pdf_to_process']}") - print(f" DOC/DOCX files: {self.stats['doc_to_process']}") - - print("\n" + "="*60) - print("NEXT STEPS:") - print("1. Review the file 'pdf_doc_for_claude.txt' for manual processing") - print("2. Use Claude to extract activities from PDF/DOC files") - print("3. 
Focus on largest PDF files first (highest activity density)") - print("="*60) - - def save_pdf_doc_list(self, base_path): - """Salveaz lista de PDF/DOC pentru procesare cu Claude""" - base_path = Path(base_path) - - pdf_files = sorted(base_path.rglob("*.pdf"), key=lambda p: p.stat().st_size, reverse=True) - doc_files = list(base_path.rglob("*.doc")) - docx_files = list(base_path.rglob("*.docx")) - - with open('pdf_doc_for_claude.txt', 'w', encoding='utf-8') as f: - f.write("PDF/DOC FILES FOR CLAUDE PROCESSING\n") - f.write("="*60 + "\n") - f.write("Files sorted by size (largest first = likely more activities)\n\n") - - f.write("TOP PRIORITY PDF FILES (process these first):\n") - f.write("-"*40 + "\n") - for i, pdf in enumerate(pdf_files[:20], 1): - size_mb = pdf.stat().st_size / (1024*1024) - f.write(f"{i}. {pdf.name} ({size_mb:.1f} MB)\n") - f.write(f" Path: {pdf}\n\n") - - if len(pdf_files) > 20: - f.write(f"\n... and {len(pdf_files)-20} more PDF files\n\n") - - f.write("\nDOC/DOCX FILES:\n") - f.write("-"*40 + "\n") - for doc in doc_files + docx_files: - size_kb = doc.stat().st_size / 1024 - f.write(f"- {doc.name} ({size_kb:.1f} KB)\n") - - print(f"\nPDF/DOC list saved to: pdf_doc_for_claude.txt") - -if __name__ == "__main__": - processor = UnifiedProcessor() - processor.process_automated_formats() \ No newline at end of file diff --git a/scripts/validate_extractions.py b/scripts/validate_extractions.py new file mode 100644 index 0000000..cdb6113 --- /dev/null +++ b/scripts/validate_extractions.py @@ -0,0 +1,208 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +validate_extractions.py — validate every data/extracted/*.json (plan §5b). + +For each extraction file it runs two checks: + 1. JSON-schema validation against scripts/activity_schema.json, + 2. the source_excerpt anti-hallucination check (each excerpt must be a fuzzy + substring of the chunk it came from). 
def build_reextraction_prompt(
    chunk_key: str, chunk_file: Optional[str], errors: list[str]
) -> str:
    """Build the Markdown prompt handed to a subagent to redo a rejected chunk.

    Args:
        chunk_key: Identifier of the rejected chunk; used in the title and in
            the path of the extraction file the subagent must overwrite.
        chunk_file: Path of the chunk text as recorded in the manifest, or
            ``None``/empty when the manifest has no usable entry.
        errors: Validation errors; each one becomes a bullet under "Reasons".

    Returns:
        The whole prompt as one newline-joined string.
    """
    # Without a manifest entry the chunk's directory is unknown. Show the
    # layout with an explicit placeholder instead of an empty path component
    # (the old fallback rendered as ``data/chunks//<key>.txt``).
    chunk_ref = chunk_file or f"data/chunks/<source_id>/{chunk_key}.txt"
    rules_ref = SUBAGENT_PROMPT.relative_to(REPO_ROOT)

    lines = [
        f"# RE-EXTRACTION — chunk `{chunk_key}`",
        "",
        "The previous extraction for this chunk was **REJECTED**. Reasons:",
        "",
    ]
    lines += [f"- {e}" for e in errors]
    lines += [
        "",
        "## What to do",
        "",
        f"1. Read ONLY this chunk: `{chunk_ref}`",
        f"2. Follow the extraction rules in `{rules_ref}`.",
        "3. Fix every problem listed above. In particular:",
        "   - every `source_excerpt` must be copied **verbatim** from the chunk",
        "     (it is checked as a fuzzy substring — invented quotes are rejected);",
        "   - `source_excerpt` and `page_reference` are mandatory on every activity;",
        "   - the output must validate against `scripts/activity_schema.json`.",
        f"4. Overwrite the extraction file `data/extracted/{chunk_key}.json`.",
        "",
    ]
    return "\n".join(lines)


# --------------------------------------------------------------------------
# manifest
# --------------------------------------------------------------------------
def load_manifest(manifest_path: Path) -> dict:
    """Load the chunk manifest, always returning a dict with a ``chunks`` dict.

    A missing manifest yields ``{"chunks": {}}``. An unreadable or malformed
    one (I/O error, invalid UTF-8, invalid JSON, or a top-level value that is
    not an object) also yields the empty manifest, but a warning is printed to
    stderr because the next ``save_manifest`` will overwrite the file.

    Args:
        manifest_path: Location of ``manifest.json``.

    Returns:
        The manifest dict; ``result["chunks"]`` is guaranteed to be a dict.
    """
    if not manifest_path.is_file():
        return {"chunks": {}}

    try:
        data = json.loads(manifest_path.read_text(encoding="utf-8"))
    except (json.JSONDecodeError, UnicodeDecodeError, OSError) as exc:
        print(
            f"warning: manifest {manifest_path} is unreadable ({exc}); "
            "starting from an empty manifest",
            file=sys.stderr,
        )
        return {"chunks": {}}

    if not isinstance(data, dict):
        # Valid JSON but not an object (e.g. a list): nothing usable inside.
        print(
            f"warning: manifest {manifest_path} is not a JSON object; "
            "starting from an empty manifest",
            file=sys.stderr,
        )
        return {"chunks": {}}

    # Covers both a missing key and a wrong-typed value such as null.
    if not isinstance(data.get("chunks"), dict):
        data["chunks"] = {}
    return data


def save_manifest(manifest: dict, manifest_path: Path) -> None:
    """Write the manifest as indented UTF-8 JSON, creating parent directories.

    The JSON is written to a sibling ``<name>.tmp`` file and then renamed over
    the target, so an interrupted write cannot leave a truncated manifest.

    Args:
        manifest: Manifest dict to serialise.
        manifest_path: Destination path.
    """
    manifest_path.parent.mkdir(parents=True, exist_ok=True)
    tmp_path = manifest_path.with_name(manifest_path.name + ".tmp")
    tmp_path.write_text(
        json.dumps(manifest, indent=2, ensure_ascii=False), encoding="utf-8"
    )
    tmp_path.replace(manifest_path)


def mark_rejected(manifest: dict, chunk_key: str) -> None:
    """Flip a chunk to `rejected` in the manifest (creating the entry if new).

    Other fields already stored on the entry are kept.

    Args:
        manifest: Manifest dict as returned by ``load_manifest`` (mutated).
        chunk_key: Key of the chunk to mark.
    """
    manifest["chunks"].setdefault(chunk_key, {})["state"] = "rejected"
# --------------------------------------------------------------------------
# validation
# --------------------------------------------------------------------------
def validate_file(json_path: Path, schema: dict, chunks_dir: Path) -> list[str]:
    """Return the list of errors for one extraction file (empty == valid).

    Checks run in order and stop at the first stage that fails:
      1. the file can be read and parsed as JSON,
      2. ``validate_extraction`` reports no schema errors,
      3. ``find_chunk_text`` locates the chunk the extraction came from,
      4. every activity's ``source_excerpt`` passes ``excerpt_matches``
         against that chunk text (all failing activities are reported).

    Args:
        json_path: Extraction JSON file to check.
        schema: Loaded activity schema.
        chunks_dir: Root directory holding the chunk text files.

    Returns:
        Human-readable error strings; an empty list means the file is valid.
    """
    try:
        data = json.loads(json_path.read_text(encoding="utf-8"))
    except json.JSONDecodeError as exc:
        return [f"invalid JSON: {exc}"]
    except (OSError, UnicodeDecodeError) as exc:
        # An unreadable file is a rejection of this chunk, not a crash of
        # the whole validation run.
        return [f"unreadable file: {exc}"]

    errors = list(validate_extraction(data, schema))
    if errors:
        return errors

    header = data.get("header", {})
    chunk_text = find_chunk_text(json_path, header, chunks_dir)
    if chunk_text is None:
        return [f"source chunk not found for {chunk_key_for(json_path, header)}"]

    for adict in data.get("activities", []):
        excerpt = adict.get("source_excerpt") or ""
        if excerpt_matches(excerpt, chunk_text):
            continue
        # The score is only computed for the report, on the failure path.
        score = excerpt_score(excerpt, chunk_text)
        errors.append(
            f"activity {adict.get('name')!r}: source_excerpt not found in "
            f"chunk (best match {score:.0f}/100) — possible hallucination"
        )
    return errors


def _read_header(json_path: Path) -> dict:
    """Best-effort read of an extraction file's ``header`` object.

    Returns ``{}`` when the file cannot be read or parsed, when the top-level
    JSON value is not an object, or when ``header`` is not an object.
    """
    try:
        data = json.loads(json_path.read_text(encoding="utf-8"))
    except (json.JSONDecodeError, UnicodeDecodeError, OSError):
        return {}
    if not isinstance(data, dict):
        return {}
    header = data.get("header")
    return header if isinstance(header, dict) else {}


def run(
    extracted_dir: Path,
    chunks_dir: Path,
    manifest_path: Path,
    schema_path: Path = DEFAULT_SCHEMA_PATH,
) -> dict:
    """Validate every extraction file and record the rejected chunks.

    For each file that fails ``validate_file`` this writes a re-extraction
    prompt to ``<extracted_dir>/_reextract/<chunk_key>.prompt.md`` and marks
    the chunk ``rejected`` in the manifest. The manifest is saved once, after
    all files have been processed.

    Args:
        extracted_dir: Directory containing the extraction JSON files.
        chunks_dir: Root directory holding the chunk text files.
        manifest_path: Path of the chunk manifest to update.
        schema_path: Path of the JSON schema to validate against.

    Returns:
        A report dict with ``total``, ``valid`` and ``rejected`` counts and
        ``rejected_chunks``, a list of ``{"chunk": key, "errors": [...]}``.
    """
    schema = load_schema(schema_path)
    manifest = load_manifest(manifest_path)
    reextract_dir = extracted_dir / "_reextract"

    report = {"total": 0, "valid": 0, "rejected": 0, "rejected_chunks": []}
    for json_path in iter_extraction_files(extracted_dir):
        report["total"] += 1
        errors = validate_file(json_path, schema, chunks_dir)
        if not errors:
            report["valid"] += 1
            continue

        report["rejected"] += 1
        # A rejected file may be malformed in any way (not JSON, not an
        # object, wrong-typed header), so the header is read defensively.
        chunk_key = chunk_key_for(json_path, _read_header(json_path))

        meta = manifest["chunks"].get(chunk_key)
        chunk_file = meta.get("chunk_file") if isinstance(meta, dict) else None

        reextract_dir.mkdir(parents=True, exist_ok=True)
        prompt = build_reextraction_prompt(chunk_key, chunk_file, errors)
        # NOTE(review): the key may originate from the extraction file's
        # header; assumed not sanitised by chunk_key_for — confirm. Only its
        # final path component is used so the prompt stays in reextract_dir.
        prompt_stem = Path(str(chunk_key)).name or json_path.stem
        (reextract_dir / f"{prompt_stem}.prompt.md").write_text(
            prompt, encoding="utf-8"
        )

        mark_rejected(manifest, chunk_key)
        report["rejected_chunks"].append({"chunk": chunk_key, "errors": errors})

    save_manifest(manifest, manifest_path)
    return report


# --------------------------------------------------------------------------
# CLI
# --------------------------------------------------------------------------
def main(argv: Optional[list[str]] = None) -> int:
    """Command-line entry point: validate the extractions and print a report.

    Args:
        argv: Argument list; ``None`` lets argparse read ``sys.argv``.

    Returns:
        Always ``0``, including when chunks were rejected. Rejections are
        reported through the printed summary and the manifest.
    """
    parser = argparse.ArgumentParser(description="Validate extraction JSON files.")
    parser.add_argument("--extracted", default="data/extracted")
    parser.add_argument("--chunks", default="data/chunks")
    parser.add_argument("--manifest", default="data/chunks/manifest.json")
    parser.add_argument("--schema", default=str(DEFAULT_SCHEMA_PATH))
    args = parser.parse_args(argv)

    report = run(
        Path(args.extracted), Path(args.chunks), Path(args.manifest), Path(args.schema)
    )
    print(f"extraction files : {report['total']}")
    print(f"  valid          : {report['valid']}")
    print(f"  rejected       : {report['rejected']}")
    for item in report["rejected_chunks"]:
        print(f"  [rejected] {item['chunk']}")
        for err in item["errors"]:
            print(f"      - {err}")
    if report["rejected"]:
        print(f"\nRe-extraction prompts written to {args.extracted}/_reextract/")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
+""" + +import sys +import zipfile +from pathlib import Path + +import pytest + +REPO_ROOT = Path(__file__).resolve().parent.parent +SCRIPTS_DIR = REPO_ROOT / "scripts" +if str(SCRIPTS_DIR) not in sys.path: + sys.path.insert(0, str(SCRIPTS_DIR)) + + +# -------------------------------------------------------------------------- +# synthetic PDF — deliberately large to pin the "no max_pages" regression +# -------------------------------------------------------------------------- +@pytest.fixture +def big_pdf(tmp_path): + """A 60-page PDF; each page carries a unique 'PDFMARK-' token.""" + from reportlab.pdfgen import canvas + from reportlab.lib.pagesizes import letter + + path = tmp_path / "big.pdf" + c = canvas.Canvas(str(path), pagesize=letter) + for n in range(1, 61): + c.drawString(72, 720, f"PDFMARK-{n} synthetic activity page number {n}") + c.drawString(72, 700, "Acest joc educativ se joaca in echipa.") + c.showPage() + c.save() + return path + + +# -------------------------------------------------------------------------- +# synthetic docx — 100 paragraphs => 3 synthetic pages at 40 paras/page +# -------------------------------------------------------------------------- +@pytest.fixture +def sample_docx(tmp_path): + import docx + + path = tmp_path / "sample.docx" + document = docx.Document() + for i in range(100): + document.add_paragraph(f"Paragraf {i}: continut joc team-building.") + document.save(str(path)) + return path + + +# -------------------------------------------------------------------------- +# synthetic HTML mirror page — with nav/script/footer chrome to strip +# -------------------------------------------------------------------------- +HTML_WITH_NAV = """ +Joc + + + + +
Site Banner Junk
+
+

Vanatoarea de comori

+

Acesta este un joc real de orientare pentru cercetasi.

+

Jucatorii cauta indicii ascunse in tabara.

+
+ + +""" + + +@pytest.fixture +def html_with_nav(tmp_path): + path = tmp_path / "page.html" + path.write_text(HTML_WITH_NAV, encoding="utf-8") + return path + + +# -------------------------------------------------------------------------- +# synthetic zip — contains a docx and a stray junk file +# -------------------------------------------------------------------------- +@pytest.fixture +def sample_zip(tmp_path, sample_docx): + path = tmp_path / "archive.zip" + with zipfile.ZipFile(path, "w") as zf: + zf.write(sample_docx, arcname="inner/sample.docx") + zf.writestr("desktop.ini", "junk") + return path + + +# -------------------------------------------------------------------------- +# synthetic normalized source — paginated, with an activity straddling a +# page boundary so the chunker overlap can be verified. +# -------------------------------------------------------------------------- +@pytest.fixture +def paginated_source(tmp_path): + """A 50-page normalized source. An activity spans the page 20/21 boundary.""" + lines = ["SOURCE: synthetic/test.pdf", "CONVERTED: 2026-05-19", + "FORMAT: pdf", "=" * 50, ""] + for n in range(1, 51): + lines.append(f"--- PAGE {n} ---") + if n == 20: + lines.append("ACTIVITY-START jocul podului care traverseaza pagina") + elif n == 21: + lines.append("continuare a jocului podului ACTIVITY-END") + else: + lines.append(f"continut obisnuit pe pagina {n}") + lines.append("") + path = tmp_path / "src_paginated.txt" + path.write_text("\n".join(lines), encoding="utf-8") + return path diff --git a/tests/fixtures/.gitkeep b/tests/fixtures/.gitkeep new file mode 100644 index 0000000..f016cdb --- /dev/null +++ b/tests/fixtures/.gitkeep @@ -0,0 +1,3 @@ +# Test fixtures (synthetic PDF/docx/zip/HTML) are generated at runtime by +# tests/conftest.py — no binary blobs are committed. This file only preserves +# the directory in git. 
diff --git a/tests/test_build_database.py b/tests/test_build_database.py new file mode 100644 index 0000000..e4a5e14 --- /dev/null +++ b/tests/test_build_database.py @@ -0,0 +1,334 @@ +# -*- coding: utf-8 -*- +""" +Tests for scripts/build_database.py — the import / dedup / swap side. + +Covers: category -> slug + `altele` fallback; dedup across all three threshold +bands; EN != RO never merged; field combination on merge; atomic swap with a +simulated mid-build crash; the source_excerpt substring check. +""" + +import json +import os +import sys +from pathlib import Path + +import pytest + +REPO_ROOT = Path(__file__).resolve().parent.parent +SCRIPTS_DIR = REPO_ROOT / "scripts" +for _p in (str(REPO_ROOT), str(SCRIPTS_DIR)): + if _p not in sys.path: + sys.path.insert(0, _p) + +import build_database as bd # noqa: E402 +from app.models.activity import Activity # noqa: E402 +from app.models.database import DatabaseManager # noqa: E402 + + +# -------------------------------------------------------------------------- +# helpers +# -------------------------------------------------------------------------- +def _activity(**over): + base = dict( + name="Jocul testului", + description="O activitate de echipa in aer liber.", + category="team-building", + content_type="joc", + language="ro", + extraction_confidence="high", + ) + base.update(over) + return Activity(**base) + + +def _ext_activity(**over): + """A schema-valid extraction-JSON activity object.""" + base = dict( + name="Jocul testului", + description="O activitate de echipa in aer liber.", + category="team-building", + content_type="joc", + language="ro", + extraction_confidence="high", + source_excerpt="ANCHOR-EXCERPT despre jocul testului", + page_reference="page 1", + ) + base.update(over) + return base + + +def _write_extraction(extracted_dir, chunk_key, activities, source_id="src01"): + extracted_dir.mkdir(parents=True, exist_ok=True) + payload = { + "header": { + "source_hash": "hash1234deadbeef", + 
"schema_version": "1.0", + "prompt_version": "1.0", + "chunk_range": "pages 1-20", + "source_id": source_id, + "chunk_key": chunk_key, + }, + "activities": activities, + } + (extracted_dir / f"{chunk_key}.json").write_text( + json.dumps(payload, ensure_ascii=False), encoding="utf-8" + ) + + +def _write_chunk(chunks_dir, source_id, chunk_key, text): + d = chunks_dir / source_id + d.mkdir(parents=True, exist_ok=True) + (d / f"{chunk_key}.txt").write_text(text, encoding="utf-8") + + +# -------------------------------------------------------------------------- +# step 3 — category normalization +# -------------------------------------------------------------------------- +def test_category_alias_mapped_to_slug(): + act = bd.dict_to_activity(_ext_activity(category="teambuilding"), "s.txt") + assert act.category == "team-building" + + +def test_unknown_category_falls_back_to_altele(): + act = bd.dict_to_activity(_ext_activity(category="zzz-not-a-category"), "s.txt") + assert act.category == "altele" + + +def test_content_type_normalized(): + act = bd.dict_to_activity(_ext_activity(content_type="games"), "s.txt") + assert act.content_type == "joc" + + +# -------------------------------------------------------------------------- +# step 4 — dedup, three bands +# -------------------------------------------------------------------------- +def test_dedup_auto_merge_identical_descriptions(): + """>= 85 similar -> a single merged row.""" + a = _activity(description="copiii formeaza echipe si traverseaza terenul") + b = _activity(description="copiii formeaza echipe si traverseaza terenul") + out, stats = bd.dedup_activities([a, b]) + assert len(out) == 1 + assert stats["auto_merged"] == 1 + assert out[0].needs_review == 0 + + +def test_dedup_borderline_keeps_both_and_flags_needs_review(): + """60-85 similar -> both kept, both flagged needs_review.""" + from rapidfuzz import fuzz + + d1 = "alpha beta gamma delta epsilon" + d2 = "alpha beta gamma delta epsilon zeta eta theta iota" 
+ score = fuzz.token_sort_ratio(d1, d2) + assert 60.0 <= score < 85.0, f"precondition: score={score} not borderline" + + a = _activity(description=d1) + b = _activity(description=d2) + out, stats = bd.dedup_activities([a, b]) + assert len(out) == 2 + assert stats["borderline"] == 2 + assert all(act.needs_review == 1 for act in out) + + +def test_dedup_low_similarity_kept_as_separate_variants(): + """< 60 similar -> separate variants, no needs_review.""" + from rapidfuzz import fuzz + + d1 = "alpha beta gamma delta epsilon" + d2 = "quebec romeo sierra tango uniform victor whiskey" + assert fuzz.token_sort_ratio(d1, d2) < 60.0 + + a = _activity(description=d1) + b = _activity(description=d2) + out, stats = bd.dedup_activities([a, b]) + assert len(out) == 2 + assert stats["auto_merged"] == 0 + assert all(act.needs_review == 0 for act in out) + + +def test_dedup_never_merges_across_languages(): + """Same name + same description but EN vs RO -> two distinct rows.""" + desc = "children form teams and cross the field" + ro = _activity(name="Cursa", description=desc, language="ro") + en = _activity(name="Cursa", description=desc, language="en") + out, stats = bd.dedup_activities([ro, en]) + assert len(out) == 2 + assert stats["auto_merged"] == 0 + langs = {a.language for a in out} + assert langs == {"ro", "en"} + + +def test_merge_combines_fields(): + """On merge: longest description/rules, union materials, accumulated sources.""" + desc = "copiii formeaza echipe si traverseaza terenul cu obstacole" + a = _activity( + description=desc, + rules="regula scurta", + materials_list="franghie, esarfa", + source_file="a.txt", + keywords="echipa", + ) + b = _activity( + description=desc, + rules="o regula mult mai lunga si mai detaliata pentru joc", + materials_list="busola, esarfa", + source_file="b.txt", + keywords="cooperare", + ) + out, _ = bd.dedup_activities([a, b]) + assert len(out) == 1 + merged = out[0] + assert merged.rules == "o regula mult mai lunga si mai detaliata 
pentru joc" + mats = set(m.strip() for m in merged.materials_list.split(",")) + assert mats == {"franghie", "esarfa", "busola"} + assert set(merged.source_files) == {"a.txt", "b.txt"} + assert merged.popularity_score == 1 + assert set(k.strip() for k in merged.keywords.split(",")) == {"echipa", "cooperare"} + + +# -------------------------------------------------------------------------- +# step 5 — review decisions +# -------------------------------------------------------------------------- +def test_review_decision_drop_removes_row(): + from import_common import content_key, normalize_name + + a = _activity(description="o descriere de test") + key = content_key(normalize_name(a.name), a.language, a.description) + kept, stats = bd.apply_review_decisions([a], {key: {"decision": "drop"}}) + assert kept == [] + assert stats["dropped"] == 1 + + +def test_review_decision_keep_separate_clears_needs_review(): + from import_common import content_key, normalize_name + + a = _activity(description="o descriere de test") + a.needs_review = 1 + key = content_key(normalize_name(a.name), a.language, a.description) + kept, stats = bd.apply_review_decisions([a], {key: {"decision": "keep-separate"}}) + assert len(kept) == 1 and kept[0].needs_review == 0 + assert stats["resolved"] == 1 + + +# -------------------------------------------------------------------------- +# step 2b — source_excerpt hallucination check +# -------------------------------------------------------------------------- +def test_hallucinated_excerpt_activity_dropped(tmp_path): + extracted = tmp_path / "extracted" + chunks = tmp_path / "chunks" + sources = tmp_path / "sources" + + good = _ext_activity( + name="Joc real", source_excerpt="textul real apare in bucata sursa" + ) + bad = _ext_activity( + name="Joc inventat", + source_excerpt="acest citat nu exista nicaieri in sursa originala xyzzy", + ) + _write_extraction(extracted, "src01.part01", [good, bad]) + _write_chunk( + chunks, "src01", "src01.part01", + 
"--- PAGE 1 ---\ntextul real apare in bucata sursa pentru jocul real.\n", + ) + + from import_common import load_schema + + schema = load_schema() + res = bd.collect_activities(extracted, chunks, sources, schema) + names = {a.name for a in res["activities"]} + assert names == {"Joc real"} + assert res["activities_hallucinated"] == 1 + assert (extracted / "_rejected").exists() + + +def test_schema_invalid_file_moved_to_rejected(tmp_path): + extracted = tmp_path / "extracted" + chunks = tmp_path / "chunks" + sources = tmp_path / "sources" + extracted.mkdir(parents=True) + + # missing required header keys + bad activity + (extracted / "bad.json").write_text( + json.dumps({"header": {}, "activities": [{"name": "x"}]}), + encoding="utf-8", + ) + from import_common import load_schema + + res = bd.collect_activities(extracted, chunks, sources, load_schema()) + assert res["files_rejected_schema"] == 1 + assert not (extracted / "bad.json").exists() + assert (extracted / "_rejected" / "bad.json").exists() + assert (extracted / "_rejected" / "bad.errors.txt").exists() + + +# -------------------------------------------------------------------------- +# end-to-end rebuild + atomic swap +# -------------------------------------------------------------------------- +def _setup_corpus(tmp_path): + extracted = tmp_path / "extracted" + chunks = tmp_path / "chunks" + sources = tmp_path / "sources" + excerpt = "jocul testului este o activitate de echipa" + _write_extraction( + extracted, "src01.part01", + [_ext_activity(source_excerpt=excerpt)], + ) + _write_chunk(chunks, "src01", "src01.part01", + f"--- PAGE 1 ---\n{excerpt} in aer liber.\n") + return extracted, chunks, sources + + +def test_rebuild_creates_database(tmp_path): + extracted, chunks, sources = _setup_corpus(tmp_path) + db_path = tmp_path / "activities.db" + + report = bd.rebuild( + extracted_dir=extracted, chunks_dir=chunks, sources_dir=sources, + db_path=db_path, + ) + assert db_path.exists() + assert 
report["final_count"] == 1 + + db = DatabaseManager(str(db_path)) + rows = db.search_activities() + assert len(rows) == 1 + assert rows[0]["category"] == "team-building" + + +def test_atomic_swap_keeps_live_db_intact_on_crash(tmp_path, monkeypatch): + """A mid-build crash must leave the live DB byte-identical.""" + extracted, chunks, sources = _setup_corpus(tmp_path) + db_path = tmp_path / "activities.db" + + # a pre-existing live DB with sentinel content + live = DatabaseManager(str(db_path)) + live.insert_activity(_activity(name="Sentinel viu")) + before = db_path.read_bytes() + + def boom(self, *a, **k): + raise RuntimeError("simulated mid-build crash") + + monkeypatch.setattr(DatabaseManager, "bulk_insert_activities", boom) + + with pytest.raises(RuntimeError, match="simulated mid-build crash"): + bd.rebuild( + extracted_dir=extracted, chunks_dir=chunks, sources_dir=sources, + db_path=db_path, + ) + + # live DB untouched, tmp cleaned up + assert db_path.read_bytes() == before + assert not (tmp_path / "activities.db.tmp").exists() + + +def test_rebuild_backs_up_live_db(tmp_path): + extracted, chunks, sources = _setup_corpus(tmp_path) + db_path = tmp_path / "activities.db" + DatabaseManager(str(db_path)).insert_activity(_activity(name="Vechi")) + + report = bd.rebuild( + extracted_dir=extracted, chunks_dir=chunks, sources_dir=sources, + db_path=db_path, + ) + assert report["backup"] is not None + assert Path(report["backup"]).exists() + assert os.path.basename(report["backup"]) == "activities.db.bak" diff --git a/tests/test_chunk_sources.py b/tests/test_chunk_sources.py new file mode 100644 index 0000000..1b6b5e5 --- /dev/null +++ b/tests/test_chunk_sources.py @@ -0,0 +1,183 @@ +# -*- coding: utf-8 -*- +"""Tests for scripts/chunk_sources.py.""" + +import json + +import chunk_sources as cs +import normalize_sources as ns + + +def _pages(n): + return [(i, f"text-{i}") for i in range(1, n + 1)] + + +# 
-------------------------------------------------------------------------- +# header parsing +# -------------------------------------------------------------------------- +def test_parse_source_splits_header_and_body(paginated_source): + text = paginated_source.read_text(encoding="utf-8") + header, body = cs.parse_source(text) + assert header["FORMAT"] == "pdf" + assert body.lstrip().startswith("--- PAGE 1 ---") + + +# -------------------------------------------------------------------------- +# page chunking +# -------------------------------------------------------------------------- +def test_chunk_pages_basic_split(): + chunks = cs.chunk_pages(_pages(50), pages_per_chunk=20, overlap=4) + # stride 16: starts at pages 1, 17, 33, ... + assert chunks[0]["page_start"] == 1 and chunks[0]["page_end"] == 20 + assert chunks[1]["page_start"] == 17 + assert chunks[-1]["page_end"] == 50 + + +def test_chunk_pages_have_overlap(): + chunks = cs.chunk_pages(_pages(50), pages_per_chunk=20, overlap=4) + overlap = chunks[0]["page_end"] - chunks[1]["page_start"] + 1 + assert overlap == 4 + + +def test_chunk_pages_short_document_single_chunk(): + chunks = cs.chunk_pages(_pages(8), pages_per_chunk=20, overlap=4) + assert len(chunks) == 1 + assert chunks[0]["page_start"] == 1 and chunks[0]["page_end"] == 8 + + +def test_chunk_pages_empty(): + assert cs.chunk_pages([]) == [] + + +def test_activity_at_page_boundary_intact_in_one_chunk(paginated_source): + """An activity straddling the page 20/21 boundary must appear whole in >=1 chunk.""" + text = paginated_source.read_text(encoding="utf-8") + chunks = cs.make_chunks(text) + full = [ + c for c in chunks + if "ACTIVITY-START" in c["text"] and "ACTIVITY-END" in c["text"] + ] + assert full, "activity spanning a page boundary was split across all chunks" + + +# -------------------------------------------------------------------------- +# word-window chunking for unpaginated text +# 
-------------------------------------------------------------------------- +def test_chunk_words_window_and_overlap(): + text = " ".join(f"w{i}" for i in range(25_000)) + chunks = cs.chunk_words(text, window=10_000, overlap=2_000) + assert len(chunks) == 3 # stride 8000 over 25000 words + first = chunks[0]["text"].split() + second = chunks[1]["text"].split() + assert first[8_000:10_000] == second[0:2_000] # 2000-word overlap + + +def test_make_chunks_unpaginated_uses_word_windows(): + body = "cuvant " * 15_000 + text = "SOURCE: x\nFORMAT: txt\n" + "=" * 50 + "\n\n" + body + chunks = cs.make_chunks(text) + assert len(chunks) >= 2 + assert chunks[0]["chunk_range"].startswith("words") + + +# -------------------------------------------------------------------------- +# stable source ids — anti-collision +# -------------------------------------------------------------------------- +def test_stable_id_same_stem_different_path_no_collision(): + a = ns.stable_id("camp/games/scout.pdf") + b = ns.stable_id("school/lessons/scout.pdf") + assert a != b + assert a.endswith("_scout") and b.endswith("_scout") + + +def test_stable_id_deterministic(): + assert ns.stable_id("a/b/c.pdf") == ns.stable_id("a/b/c.pdf") + + +# -------------------------------------------------------------------------- +# manifest registry + idempotency +# -------------------------------------------------------------------------- +def test_run_writes_chunks_and_manifest(paginated_source, tmp_path): + sources_dir = tmp_path / "sources" + sources_dir.mkdir() + (sources_dir / paginated_source.name).write_text( + paginated_source.read_text(encoding="utf-8"), encoding="utf-8" + ) + chunks_dir = tmp_path / "chunks" + + summary = cs.run(sources_dir, chunks_dir) + assert summary["sources"] == 1 + assert summary["chunks"] >= 2 + + manifest = json.loads((chunks_dir / "manifest.json").read_text()) + assert manifest["chunks"] + for key, meta in manifest["chunks"].items(): + assert meta["state"] == "pending" + assert 
meta["expected_json"] == f"{key}.json" + assert (chunks_dir.parent / meta["chunk_file"]).exists() + + +def test_manifest_idempotent_preserves_state(paginated_source, tmp_path): + sources_dir = tmp_path / "sources" + sources_dir.mkdir() + (sources_dir / paginated_source.name).write_text( + paginated_source.read_text(encoding="utf-8"), encoding="utf-8" + ) + chunks_dir = tmp_path / "chunks" + manifest_path = chunks_dir / "manifest.json" + + cs.run(sources_dir, chunks_dir) + + # orchestrator marks one chunk done + manifest = json.loads(manifest_path.read_text()) + first_key = next(iter(manifest["chunks"])) + n_before = len(manifest["chunks"]) + manifest["chunks"][first_key]["state"] = "done" + manifest_path.write_text(json.dumps(manifest), encoding="utf-8") + + # re-run: 'done' must survive, no chunk added or lost + cs.run(sources_dir, chunks_dir) + manifest2 = json.loads(manifest_path.read_text()) + assert len(manifest2["chunks"]) == n_before + assert manifest2["chunks"][first_key]["state"] == "done" + assert all( + m["state"] in ("pending", "done") for m in manifest2["chunks"].values() + ) + + +def test_manifest_resets_state_when_source_changes(paginated_source, tmp_path): + sources_dir = tmp_path / "sources" + sources_dir.mkdir() + src = sources_dir / paginated_source.name + src.write_text(paginated_source.read_text(encoding="utf-8"), encoding="utf-8") + chunks_dir = tmp_path / "chunks" + manifest_path = chunks_dir / "manifest.json" + + cs.run(sources_dir, chunks_dir) + manifest = json.loads(manifest_path.read_text()) + first_key = next(iter(manifest["chunks"])) + manifest["chunks"][first_key]["state"] = "done" + manifest_path.write_text(json.dumps(manifest), encoding="utf-8") + + # mutate the source content -> hash changes -> state resets + src.write_text(src.read_text(encoding="utf-8") + "\n--- PAGE 51 ---\nextra\n", + encoding="utf-8") + cs.run(sources_dir, chunks_dir) + manifest2 = json.loads(manifest_path.read_text()) + assert 
manifest2["chunks"][first_key]["state"] == "pending" + + +def test_prune_stale_removes_orphan_entries(paginated_source, tmp_path): + sources_dir = tmp_path / "sources" + sources_dir.mkdir() + src = sources_dir / paginated_source.name + src.write_text(paginated_source.read_text(encoding="utf-8"), encoding="utf-8") + chunks_dir = tmp_path / "chunks" + + cs.run(sources_dir, chunks_dir) + # delete the source -> its chunks become stale + src.unlink() + summary = cs.run(sources_dir, chunks_dir) + assert summary["chunks"] == 0 + assert summary["pruned"] >= 1 + manifest = json.loads((chunks_dir / "manifest.json").read_text()) + assert manifest["chunks"] == {} diff --git a/tests/test_extract_common.py b/tests/test_extract_common.py new file mode 100644 index 0000000..17dedee --- /dev/null +++ b/tests/test_extract_common.py @@ -0,0 +1,177 @@ +# -*- coding: utf-8 -*- +"""Tests for scripts/extract_common.py.""" + +import shutil +import zipfile + +import pytest + +import extract_common as ec + + +# -------------------------------------------------------------------------- +# format detection +# -------------------------------------------------------------------------- +def test_detect_format(): + assert ec.detect_format("a/b/file.PDF") == "pdf" + assert ec.detect_format("x.docx") == "docx" + assert ec.detect_format("x.doc") == "doc" + assert ec.detect_format("x.pptx") == "pptx" + assert ec.detect_format("x.html") == "html" + assert ec.detect_format("x.zip") == "zip" + assert ec.detect_format("x.epub") == "epub" + assert ec.detect_format("x.xyz") == "unknown" + + +def test_is_junk(): + assert ec.is_junk("some/desktop.ini") + assert ec.is_junk("notes.bak") + assert ec.is_junk("README.md") + assert not ec.is_junk("1000 Scout Games.pdf") + + +# -------------------------------------------------------------------------- +# PDF — the critical "no max_pages" regression +# -------------------------------------------------------------------------- +def 
test_pdf_extracts_all_60_pages(big_pdf): + body = ec.extract_pdf(big_pdf) + # the old converter capped at 50 pages — page 60 must be present now + assert "--- PAGE 60 ---" in body + assert "PDFMARK-60" in body + assert ec.count_page_markers(body) == 60 + + +def test_pdf_does_not_truncate_mid_document(big_pdf): + body = ec.extract_pdf(big_pdf) + pages = ec.split_pages(body) + assert pages[-1][0] == 60 # last marker is the real last page + + +# -------------------------------------------------------------------------- +# page join / split round-trip +# -------------------------------------------------------------------------- +def test_join_split_round_trip(): + body = ec.join_pages(["alpha", "beta", "gamma"]) + pages = ec.split_pages(body) + assert [n for n, _ in pages] == [1, 2, 3] + assert [t for _, t in pages] == ["alpha", "beta", "gamma"] + + +def test_split_pages_no_markers_returns_empty(): + assert ec.split_pages("plain text with no markers") == [] + + +# -------------------------------------------------------------------------- +# docx — synthetic page markers +# -------------------------------------------------------------------------- +def test_docx_synthetic_page_markers(sample_docx): + body = ec.extract_docx(sample_docx) + # 100 paragraphs / 40 per page => 3 pages + assert ec.count_page_markers(body) == 3 + assert "Paragraf 99" in body + + +# -------------------------------------------------------------------------- +# HTML mirror — nav/script/footer stripped +# -------------------------------------------------------------------------- +def test_html_strips_chrome(html_with_nav): + body = ec.extract_html(html_with_nav) + assert "Vanatoarea de comori" in body + assert "joc real de orientare" in body + # chrome must be gone + assert "tracking" not in body + assert "Site Banner Junk" not in body + assert "toate drepturile rezervate" not in body + assert "Games" not in body + + +# -------------------------------------------------------------------------- +# 
content hash + near-duplicate elimination +# -------------------------------------------------------------------------- +def test_content_hash_ignores_whitespace(): + assert ec.content_hash("hello world") == ec.content_hash("hello world\n") + assert ec.content_hash("hello world") != ec.content_hash("goodbye world") + + +def test_dedupe_exact_duplicates(): + items = [("a", "joc identic"), ("b", "joc identic"), ("c", "alt joc")] + kept = ec.dedupe_texts(items) + assert [k for k, _ in kept] == ["a", "c"] + + +def test_dedupe_near_duplicates(): + base = "Vanatoarea de comori este un joc de orientare pentru cercetasi in tabara." + near = base + " Pagina printata." # >95% similar + items = [("orig", base), ("print", near), ("other", "Cu totul alt continut diferit aici.")] + kept = ec.dedupe_texts(items, threshold=85.0) + keys = [k for k, _ in kept] + assert "orig" in keys + assert "print" not in keys + assert "other" in keys + + +# -------------------------------------------------------------------------- +# zip recursion +# -------------------------------------------------------------------------- +def test_zip_recurses_into_inner_files(sample_zip): + body = ec.extract_zip(sample_zip) + assert "Paragraf 0" in body + assert ec.count_page_markers(body) > 0 + + +def test_zip_bad_archive_returns_empty(tmp_path): + bad = tmp_path / "broken.zip" + bad.write_text("not a zip", encoding="utf-8") + assert ec.extract_zip(bad) == "" + + +def test_nested_zip(tmp_path, sample_zip): + outer = tmp_path / "outer.zip" + with zipfile.ZipFile(outer, "w") as zf: + zf.write(sample_zip, arcname="nested/archive.zip") + body = ec.extract_zip(outer) + assert "Paragraf 0" in body + + +# -------------------------------------------------------------------------- +# preflight +# -------------------------------------------------------------------------- +def test_preflight_python_packages_present(): + report = ec.preflight() + # all required packages are installed in the test environment + assert 
report["missing_python"] == [] + + +def test_preflight_reports_libreoffice_state(): + report = ec.preflight() + has_lo = bool(shutil.which("libreoffice") or shutil.which("soffice")) + if has_lo: + assert all("libreoffice" not in w for w in report["warnings"]) + else: + assert any("libreoffice" in w for w in report["warnings"]) + + +def test_preflight_ocr_flag(): + report = ec.preflight(check_ocr=True) + if not shutil.which("tesseract"): + assert any("tesseract" in m for m in report["missing_system"]) + + +# -------------------------------------------------------------------------- +# legacy .doc — skipped unless libreoffice is installed +# -------------------------------------------------------------------------- +@pytest.mark.skipif( + not (shutil.which("libreoffice") or shutil.which("soffice")), + reason="libreoffice not installed", +) +def test_doc_conversion(tmp_path, sample_docx): + doc_path = tmp_path / "legacy.doc" + shutil.copy(sample_docx, doc_path) # smoke test of the docx path + body = ec.extract_doc(doc_path) + assert ec.count_page_markers(body) >= 1 + + +def test_doc_without_libreoffice_raises(tmp_path, monkeypatch): + monkeypatch.setattr(ec.shutil, "which", lambda _: None) + with pytest.raises(RuntimeError): + ec.extract_doc(tmp_path / "whatever.doc") diff --git a/tests/test_fts.py b/tests/test_fts.py new file mode 100644 index 0000000..14e627f --- /dev/null +++ b/tests/test_fts.py @@ -0,0 +1,139 @@ +""" +Integration tests for the FTS5 search index. + +Confirms that materials_list and skills_developed are indexed by FTS5 and kept +in sync by the insert / update / delete triggers (plan §6, §7). +""" + +import os +import sys +import json + +import pytest + +# Make the project root importable when pytest is run from anywhere. 
+PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) +if PROJECT_ROOT not in sys.path: + sys.path.insert(0, PROJECT_ROOT) + +from app.models.activity import Activity # noqa: E402 +from app.models.database import DatabaseManager # noqa: E402 + + +@pytest.fixture +def db(tmp_path): + """A fresh DatabaseManager backed by a temporary SQLite file.""" + return DatabaseManager(str(tmp_path / "test_activities.db")) + + +def _make_activity(**overrides): + base = dict( + name="Vânătoarea de comori", + description="O activitate de echipă în aer liber.", + category="camp-outdoor", + content_type="joc", + source_file="test.txt", + language="ro", + ) + base.update(overrides) + return Activity(**base) + + +def test_search_by_materials_list(db): + """A term that only appears in materials_list returns the activity.""" + activity = _make_activity(materials_list="frânghie, eșarfă, busolă") + db.insert_activity(activity) + + results = db.search_activities(search_text="busolă") + assert len(results) == 1 + assert results[0]["name"] == "Vânătoarea de comori" + + +def test_search_by_skills_developed(db): + """A term that only appears in skills_developed returns the activity.""" + activity = _make_activity(skills_developed="comunicare, leadership, rabdare") + db.insert_activity(activity) + + results = db.search_activities(search_text="leadership") + assert len(results) == 1 + assert results[0]["name"] == "Vânătoarea de comori" + + +def test_term_absent_from_indexed_columns_no_hit(db): + """A term present in no indexed column yields no hit (control).""" + db.insert_activity(_make_activity(materials_list="frânghie")) + assert db.search_activities(search_text="zzzunlikelyterm") == [] + + +def test_delete_trigger_removes_from_fts(db): + """Deleting an activity removes it from the FTS index (delete trigger).""" + activity = _make_activity(materials_list="catalige") + activity_id = db.insert_activity(activity) + assert len(db.search_activities(search_text="catalige")) 
== 1 + + with db._get_connection() as conn: + conn.execute("DELETE FROM activities WHERE id = ?", (activity_id,)) + conn.commit() + + assert db.search_activities(search_text="catalige") == [] + + +def test_update_trigger_resyncs_fts(db): + """Updating materials_list re-syncs the FTS index (update trigger).""" + activity = _make_activity(materials_list="creioane") + activity_id = db.insert_activity(activity) + assert len(db.search_activities(search_text="creioane")) == 1 + + with db._get_connection() as conn: + conn.execute( + "UPDATE activities SET materials_list = ? WHERE id = ?", + ("acuarele", activity_id), + ) + conn.commit() + + # Old term gone, new term found. + assert db.search_activities(search_text="creioane") == [] + assert len(db.search_activities(search_text="acuarele")) == 1 + + +def test_rebuild_fts_index(db): + """rebuild_fts_index keeps materials_list / skills_developed searchable.""" + db.insert_activity(_make_activity(skills_developed="orientare")) + db.rebuild_fts_index() + assert len(db.search_activities(search_text="orientare")) == 1 + + +def test_new_schema_columns_round_trip(db): + """New activity columns persist and load back via from_dict.""" + activity = _make_activity( + source_files=["a.txt", "b.txt"], + source_excerpt="Citat scurt din sursă.", + extraction_confidence="high", + needs_review=1, + normalized_name="vanatoarea de comori", + ) + activity_id = db.insert_activity(activity) + + row = db.get_activity_by_id(activity_id) + assert row["content_type"] == "joc" + assert row["language"] == "ro" + assert row["extraction_confidence"] == "high" + assert row["needs_review"] == 1 + assert row["normalized_name"] == "vanatoarea de comori" + assert json.loads(row["source_files"]) == ["a.txt", "b.txt"] + assert row["source_excerpt"] == "Citat scurt din sursă." 
+ + loaded = Activity.from_dict(row) + assert loaded.source_files == ["a.txt", "b.txt"] + assert loaded.content_type == "joc" + + +def test_normalized_name_auto_derived(db): + """normalized_name is auto-derived from name when not provided.""" + activity = Activity( + name="Ștafetă cu Obstacole", + description="desc", + category="sports-active", + source_file="t.txt", + ) + assert activity.normalized_name == "stafeta cu obstacole" diff --git a/tests/test_search.py b/tests/test_search.py new file mode 100644 index 0000000..547c9e2 --- /dev/null +++ b/tests/test_search.py @@ -0,0 +1,140 @@ +# -*- coding: utf-8 -*- +""" +CRITICAL REGRESSION TEST (plan §6, §7). + +`search.py` changed the result sets of /search and /api/search: the default +search now EXCLUDES the non-game content types (rețetă / cântec / ceremonie), +which surface only when the user explicitly filters that content_type or picks +a non-game category. This test guards that behaviour. +""" + +import pytest + +from app.models.activity import Activity +from app.models.database import DatabaseManager +from app.services.search import SearchService +from app.config_taxonomy import NON_GAME_CONTENT_TYPES + + +# -------------------------------------------------------------------------- +# fixtures +# -------------------------------------------------------------------------- +def _activity(name, content_type, category="altele", language="ro"): + return Activity( + name=name, + description=f"Descriere pentru {name}, un conținut de tip {content_type}.", + category=category, + content_type=content_type, + language=language, + source_file="test/fixture.txt", + ) + + +@pytest.fixture +def search_service(tmp_path): + """A SearchService over a temp DB seeded with one row per content_type.""" + db = DatabaseManager(str(tmp_path / "activities.db")) + db.clear_database() + db.bulk_insert_activities([ + _activity("Vanatoarea de comori", "joc", category="wide-games"), + _activity("Cercul de cunoastere", "activitate", 
category="icebreakers"), + _activity("Reteta de paine la ceaun", "reteta", category="retete"), + _activity("Cantecul de tabara", "cantec", category="cantece-ceremonii"), + _activity("Ceremonia de inchidere", "ceremonie", category="cantece-ceremonii"), + _activity("Game in English", "joc", category="wide-games", language="en"), + ]) + return SearchService(db) + + +def _content_types(results): + return {r.get("content_type") for r in results} + + +# -------------------------------------------------------------------------- +# the regression: default search excludes non-game content types +# -------------------------------------------------------------------------- +def test_default_search_excludes_non_game_content(search_service): + """No filters → rețete / cântece / ceremonii must NOT appear.""" + results = search_service.search_activities() + types = _content_types(results) + + assert types, "default search returned nothing" + for non_game in NON_GAME_CONTENT_TYPES: + assert non_game not in types, ( + f"default search leaked non-game content_type '{non_game}'" + ) + # game content is still present + assert "joc" in types + assert "activitate" in types + + +def test_default_search_with_text_excludes_non_game(search_service): + """A text query still excludes non-game content by default.""" + results = search_service.search_activities(search_text="conținut") + assert NON_GAME_CONTENT_TYPES[0] not in _content_types(results) + + +# -------------------------------------------------------------------------- +# explicit content_type filter INCLUDES the non-game rows +# -------------------------------------------------------------------------- +def test_explicit_content_type_filter_includes_non_game(search_service): + """Filtering content_type=reteta returns exactly the rețete.""" + results = search_service.search_activities(filters={"content_type": "reteta"}) + types = _content_types(results) + + assert types == {"reteta"}, f"expected only rețete, got {types}" + assert 
len(results) == 1 + + +def test_explicit_content_type_filter_for_cantec(search_service): + results = search_service.search_activities(filters={"content_type": "cantec"}) + assert _content_types(results) == {"cantec"} + + +# -------------------------------------------------------------------------- +# a non-game CATEGORY filter also lifts the exclusion +# -------------------------------------------------------------------------- +def test_non_game_category_filter_includes_non_game(search_service): + """Picking category=cantece-ceremonii surfaces cântece + ceremonii.""" + results = search_service.search_activities( + filters={"category": "cantece-ceremonii"}) + types = _content_types(results) + + assert "cantec" in types + assert "ceremonie" in types + + +def test_game_category_filter_still_excludes_non_game(search_service): + """A normal (game) category filter keeps the non-game exclusion.""" + results = search_service.search_activities(filters={"category": "wide-games"}) + types = _content_types(results) + for non_game in NON_GAME_CONTENT_TYPES: + assert non_game not in types + + +# -------------------------------------------------------------------------- +# language filter +# -------------------------------------------------------------------------- +def test_language_filter_ro(search_service): + results = search_service.search_activities(filters={"language": "ro"}) + assert results + assert all(r.get("language") == "ro" for r in results) + + +def test_language_filter_en(search_service): + results = search_service.search_activities(filters={"language": "en"}) + assert results + assert all(r.get("language") == "en" for r in results) + assert {r.get("name") for r in results} == {"Game in English"} + + +# -------------------------------------------------------------------------- +# get_filter_options surfaces the new axes +# -------------------------------------------------------------------------- +def 
test_filter_options_include_content_type_and_language(search_service): + """The dynamic-filter mechanism now exposes content_type + language.""" + options = search_service.db.get_filter_options() + assert "content_type" in options + assert "language" in options + assert "joc" in options["content_type"] + assert set(options["language"]) == {"ro", "en"} diff --git a/tests/test_validate_extractions.py b/tests/test_validate_extractions.py new file mode 100644 index 0000000..c452f2d --- /dev/null +++ b/tests/test_validate_extractions.py @@ -0,0 +1,156 @@ +# -*- coding: utf-8 -*- +""" +Tests for scripts/validate_extractions.py. + +Covers: schema rejection, the source_excerpt hallucination check, the content +of the generated re-extraction prompt, and the manifest `rejected` marking. +""" + +import json +import sys +from pathlib import Path + +REPO_ROOT = Path(__file__).resolve().parent.parent +SCRIPTS_DIR = REPO_ROOT / "scripts" +for _p in (str(REPO_ROOT), str(SCRIPTS_DIR)): + if _p not in sys.path: + sys.path.insert(0, _p) + +import validate_extractions as ve # noqa: E402 + + +# -------------------------------------------------------------------------- +# helpers +# -------------------------------------------------------------------------- +def _ext_activity(**over): + base = dict( + name="Jocul testului", + description="O activitate de echipa in aer liber.", + category="team-building", + content_type="joc", + language="ro", + extraction_confidence="high", + source_excerpt="ancora din bucata sursa", + page_reference="page 1", + ) + base.update(over) + return base + + +def _write_extraction(extracted_dir, chunk_key, activities, header_extra=None): + extracted_dir.mkdir(parents=True, exist_ok=True) + header = { + "source_hash": "hash1234deadbeef", + "schema_version": "1.0", + "prompt_version": "1.0", + "chunk_range": "pages 1-20", + "source_id": "src01", + "chunk_key": chunk_key, + } + if header_extra: + header.update(header_extra) + payload = {"header": header, 
"activities": activities} + (extracted_dir / f"{chunk_key}.json").write_text( + json.dumps(payload, ensure_ascii=False), encoding="utf-8" + ) + + +def _write_chunk(chunks_dir, source_id, chunk_key, text): + d = chunks_dir / source_id + d.mkdir(parents=True, exist_ok=True) + (d / f"{chunk_key}.txt").write_text(text, encoding="utf-8") + + +# -------------------------------------------------------------------------- +# tests +# -------------------------------------------------------------------------- +def test_valid_file_passes(tmp_path): + extracted = tmp_path / "extracted" + chunks = tmp_path / "chunks" + excerpt = "ancora din bucata sursa apare aici" + _write_extraction(extracted, "src01.part01", [_ext_activity(source_excerpt=excerpt)]) + _write_chunk(chunks, "src01", "src01.part01", f"--- PAGE 1 ---\n{excerpt}\n") + + report = ve.run(extracted, chunks, tmp_path / "manifest.json") + assert report["valid"] == 1 + assert report["rejected"] == 0 + + +def test_schema_invalid_file_rejected(tmp_path): + extracted = tmp_path / "extracted" + chunks = tmp_path / "chunks" + extracted.mkdir(parents=True) + (extracted / "src01.part01.json").write_text( + json.dumps({"header": {}, "activities": [{"name": "x"}]}), encoding="utf-8" + ) + + report = ve.run(extracted, chunks, tmp_path / "manifest.json") + assert report["rejected"] == 1 + prompt = extracted / "_reextract" / "src01.part01.prompt.md" + assert prompt.exists() + + +def test_hallucinated_excerpt_rejected(tmp_path): + extracted = tmp_path / "extracted" + chunks = tmp_path / "chunks" + _write_extraction( + extracted, "src01.part01", + [_ext_activity(source_excerpt="citat complet inventat care nu exista qqqq")], + ) + _write_chunk(chunks, "src01", "src01.part01", + "--- PAGE 1 ---\ntext complet diferit despre altceva.\n") + + report = ve.run(extracted, chunks, tmp_path / "manifest.json") + assert report["rejected"] == 1 + errors = report["rejected_chunks"][0]["errors"] + assert any("hallucination" in e for e in errors) + + 
+def test_reextraction_prompt_content(tmp_path): + extracted = tmp_path / "extracted" + chunks = tmp_path / "chunks" + _write_extraction( + extracted, "src01.part01", + [_ext_activity(source_excerpt="citat inventat care nu exista zzzz")], + ) + _write_chunk(chunks, "src01", "src01.part01", + "--- PAGE 1 ---\ntext despre cu totul altceva aici.\n") + + ve.run(extracted, chunks, tmp_path / "manifest.json") + prompt = (extracted / "_reextract" / "src01.part01.prompt.md").read_text( + encoding="utf-8" + ) + assert "src01.part01" in prompt + assert "REJECTED" in prompt + assert "verbatim" in prompt + assert "data/extracted/src01.part01.json" in prompt + + +def test_manifest_marks_chunk_rejected(tmp_path): + extracted = tmp_path / "extracted" + chunks = tmp_path / "chunks" + manifest_path = tmp_path / "manifest.json" + manifest_path.write_text( + json.dumps({"chunks": {"src01.part01": {"state": "done", + "chunk_file": "chunks/src01/src01.part01.txt"}}}), + encoding="utf-8", + ) + _write_extraction( + extracted, "src01.part01", + [_ext_activity(source_excerpt="citat fabricat absent vvvv")], + ) + _write_chunk(chunks, "src01", "src01.part01", + "--- PAGE 1 ---\nun continut neinrudit.\n") + + ve.run(extracted, chunks, manifest_path) + manifest = json.loads(manifest_path.read_text(encoding="utf-8")) + assert manifest["chunks"]["src01.part01"]["state"] == "rejected" + + +def test_build_reextraction_prompt_lists_errors(): + prompt = ve.build_reextraction_prompt( + "abc.part03", "data/chunks/abc/abc.part03.txt", + ["header: 'source_hash' is a required property"], + ) + assert "abc.part03" in prompt + assert "source_hash" in prompt