diff --git a/app/config_taxonomy.py b/app/config_taxonomy.py
new file mode 100644
index 0000000..2e8db25
--- /dev/null
+++ b/app/config_taxonomy.py
@@ -0,0 +1,230 @@
+"""
+Controlled category taxonomy for game-library.
+
+Single source of truth for activity categories. The DB stores the *slug*;
+the UI displays the Romanian name. `category` (thematic domain) and
+`content_type` (form of the content) are INDEPENDENT axes — see plan §2.
+"""
+
+import unicodedata
+import re
+from typing import Dict, List
+
+# --- Categories (thematic domain) --------------------------------------------
+# slug -> Romanian display name. Exactly 16 fixed slugs; `altele` is the
+# mandatory fallback and MUST always be present.
+CATEGORIES: Dict[str, str] = {
+ "jocuri-cercetasesti": "Jocuri cercetășești",
+ "team-building": "Team-building",
+ "icebreakers": "Icebreakers / spargerea gheții",
+ "camp-outdoor": "Tabără și activități în aer liber",
+ "wide-games": "Wide games / jocuri de teren",
+ "orientare": "Orientare",
+ "prim-ajutor": "Prim ajutor",
+ "escape-room-puzzle": "Escape room și puzzle",
+ "creative-stem": "Creativitate și STEM",
+ "sports-active": "Sport și activități fizice",
+ "cantece-ceremonii": "Cântece și ceremonii",
+ "retete": "Rețete",
+ "supravietuire": "Supraviețuire",
+ "integrare-incluziune": "Integrare și incluziune",
+ "conflict-empatie": "Conflict și empatie",
+ "altele": "Altele",
+}
+
+# Mandatory fallback slug.
+FALLBACK_CATEGORY = "altele"
+
+# Ordered list of valid slugs.
+CATEGORY_SLUGS: List[str] = list(CATEGORIES.keys())
+
+# --- Content type (form of the content) --------------------------------------
+# Independent axis from `category`. The UI default search excludes the
+# non-game content types (see plan §6).
+CONTENT_TYPES: Dict[str, str] = {
+ "joc": "Joc",
+ "activitate": "Activitate",
+ "reteta": "Rețetă",
+ "cantec": "Cântec",
+ "ceremonie": "Ceremonie",
+}
+
+CONTENT_TYPE_SLUGS: List[str] = list(CONTENT_TYPES.keys())
+
+# Content types considered "non-game" — excluded from the default UI search.
+NON_GAME_CONTENT_TYPES: List[str] = ["reteta", "cantec", "ceremonie"]
+
+DEFAULT_CONTENT_TYPE = "activitate"
+
+# --- Aliases -----------------------------------------------------------------
+# Map of normalized arbitrary strings -> canonical slug. Keys are already
+# diacritic-stripped, lowercased and hyphenated (see _slugify). This catches
+# legacy / messy values from the old DB and common English/Romanian variants.
+_CATEGORY_ALIASES: Dict[str, str] = {
+ # legacy junk
+ "general-activity": "altele",
+ "general": "altele",
+ "educational": "creative-stem",
+ "d": "altele",
+ "a": "altele",
+ "b": "altele",
+ "c": "altele",
+ # scouting
+ "cercetasie": "jocuri-cercetasesti",
+ "cercetasesti": "jocuri-cercetasesti",
+ "scout": "jocuri-cercetasesti",
+ "scouting": "jocuri-cercetasesti",
+ "scout-games": "jocuri-cercetasesti",
+ "jocuri-cercetasesti": "jocuri-cercetasesti",
+ # team building
+ "teambuilding": "team-building",
+ "team": "team-building",
+ "cooperare": "team-building",
+ # icebreakers
+ "icebreaker": "icebreakers",
+ "spargerea-ghetii": "icebreakers",
+ "cunoastere": "icebreakers",
+ "energizers": "icebreakers",
+ "energizer": "icebreakers",
+ # camp / outdoor
+ "camp": "camp-outdoor",
+ "tabara": "camp-outdoor",
+ "outdoor": "camp-outdoor",
+ "aer-liber": "camp-outdoor",
+ # wide games
+ "wide-game": "wide-games",
+ "jocuri-de-teren": "wide-games",
+ "joc-de-teren": "wide-games",
+ "big-games": "wide-games",
+ # orientare
+ "orienteering": "orientare",
+ "navigatie": "orientare",
+ # prim ajutor
+ "first-aid": "prim-ajutor",
+ "primul-ajutor": "prim-ajutor",
+ # escape room / puzzle
+ "escape-room": "escape-room-puzzle",
+ "escaperoom": "escape-room-puzzle",
+ "puzzle": "escape-room-puzzle",
+ "puzzles": "escape-room-puzzle",
+ "ghicitori": "escape-room-puzzle",
+ # creative / stem
+ "creative": "creative-stem",
+ "creativitate": "creative-stem",
+ "stem": "creative-stem",
+ "arts-and-crafts": "creative-stem",
+ "craft": "creative-stem",
+ "crafts": "creative-stem",
+ "stiinta": "creative-stem",
+ # sports
+ "sport": "sports-active",
+ "sports": "sports-active",
+ "sportive": "sports-active",
+ "active": "sports-active",
+ "miscare": "sports-active",
+ "physical": "sports-active",
+ # songs / ceremonies
+ "cantece": "cantece-ceremonii",
+ "cantec": "cantece-ceremonii",
+ "songs": "cantece-ceremonii",
+ "ceremonii": "cantece-ceremonii",
+ "ceremonie": "cantece-ceremonii",
+ "ceremony": "cantece-ceremonii",
+ # recipes
+ "reteta": "retete",
+ "recipe": "retete",
+ "recipes": "retete",
+ "cooking": "retete",
+ "gatit": "retete",
+ # survival
+ "survival": "supravietuire",
+ "supravietuire": "supravietuire",
+ # inclusion
+ "integrare": "integrare-incluziune",
+ "incluziune": "integrare-incluziune",
+ "inclusion": "integrare-incluziune",
+ # conflict / empathy
+ "conflict": "conflict-empatie",
+ "empatie": "conflict-empatie",
+ "empathy": "conflict-empatie",
+ "rezolvarea-conflictelor": "conflict-empatie",
+ # fallback
+ "altele": "altele",
+ "other": "altele",
+ "others": "altele",
+ "misc": "altele",
+}
+
+
+def _slugify(value: str) -> str:
+    """Return a lowercase ASCII slug: no diacritics, other runs become '-'."""
+    if not value:
+        return ""
+    # NFKD splits an accented letter into its base letter plus combining
+    # marks; skipping the marks keeps the bare letter (ă -> a, ș -> s, ț -> t).
+    stripped = "".join(
+        ch for ch in unicodedata.normalize("NFKD", value) if not unicodedata.combining(ch)
+    )
+    return re.sub(r"[^a-z0-9]+", "-", stripped.lower().strip()).strip("-")
+
+
+def normalize_category(value: str) -> str:
+    """Resolve free-form category text to a canonical category slug.
+
+    The text is slugified and matched against the canonical slugs first,
+    then against the alias table. Empty input, text that slugifies to
+    nothing, and unrecognised values all resolve to `altele`.
+    """
+    # Falsy input skips slugification entirely; "" matches neither table.
+    candidate = _slugify(str(value)) if value else ""
+
+    # Canonical slugs take precedence over aliases.
+    if candidate in CATEGORIES:
+        return candidate
+
+    # Otherwise translate a known legacy / variant spelling, defaulting
+    # to the fallback slug when the alias table has no entry either.
+    resolved = _CATEGORY_ALIASES.get(candidate, FALLBACK_CATEGORY)
+    return resolved
+
+
+def normalize_content_type(value: str) -> str:
+    """Map an arbitrary string to a valid content_type slug.
+
+    Canonical slugs pass through unchanged; Romanian plurals and English
+    singular/plural names are aliased. Empty or unrecognised input falls
+    back to `activitate`.
+    """
+    if not value:
+        return DEFAULT_CONTENT_TYPE
+    slug = _slugify(str(value))
+    if slug in CONTENT_TYPES:
+        return slug
+    # Each type accepts its English singular AND plural, mirroring the
+    # variants _CATEGORY_ALIASES recognises ("songs", "recipes"); without
+    # the plural, an input like "songs" would silently become the default.
+    aliases = {
+        "jocuri": "joc", "game": "joc", "games": "joc",
+        "activitati": "activitate",
+        "activity": "activitate", "activities": "activitate",
+        "retete": "reteta", "recipe": "reteta", "recipes": "reteta",
+        "cantece": "cantec", "song": "cantec", "songs": "cantec",
+        "ceremonii": "ceremonie",
+        "ceremony": "ceremonie", "ceremonies": "ceremonie",
+    }
+    return aliases.get(slug, DEFAULT_CONTENT_TYPE)
+
+
+def is_valid_category(slug: str) -> bool:
+ """Return True only when `slug` is a key of CATEGORIES, i.e. a canonical slug."""
+ return slug in CATEGORIES
+
+
+def category_display_name(slug: str) -> str:
+    """Return the Romanian label for `slug`, or `slug` itself when it is unknown."""
+    return CATEGORIES[slug] if slug in CATEGORIES else slug
+
+
+def content_type_display_name(slug: str) -> str:
+    """Return the Romanian label for a content_type slug, or the slug if unknown."""
+    return CONTENT_TYPES[slug] if slug in CONTENT_TYPES else slug
diff --git a/app/models/activity.py b/app/models/activity.py
index d28f76b..b2bbf18 100644
--- a/app/models/activity.py
+++ b/app/models/activity.py
@@ -5,6 +5,22 @@ Activity data model for INDEX-SISTEM-JOCURI v2.0
from dataclasses import dataclass, field
from typing import List, Optional, Dict, Any
import json
+import re
+import unicodedata
+
+
+def normalize_name(name: str) -> str:
+    """Build the dedup key for an activity name.
+
+    Accents are stripped, the text is lowercased and trimmed, and each run
+    of whitespace becomes a single space. Empty input yields "" (plan §4).
+    """
+    if not name:
+        return ""
+    unaccented = "".join(
+        ch for ch in unicodedata.normalize("NFKD", name) if not unicodedata.combining(ch)
+    )
+    return re.sub(r"\s+", " ", unaccented.lower().strip())
@dataclass
class Activity:
@@ -19,10 +35,19 @@ class Activity:
# Categories
category: str = ""
subcategory: Optional[str] = None
-
+ # content_type is an axis INDEPENDENT of category:
+ # one of joc/activitate/reteta/cantec/ceremonie (see config_taxonomy).
+ content_type: Optional[str] = None
+
# Source information
source_file: str = ""
page_reference: Optional[str] = None
+    # source_files: every source the activity was seen in (a plain list here;
+    # to_dict() JSON-encodes it for storage). `source_file` (singular) stays the
+    # primary/original source; build_database (Lane C) fills this on dedup-merge.
+ source_files: List[str] = field(default_factory=list)
+ # Short verbatim quote from the source — anti-hallucination anchor.
+ source_excerpt: Optional[str] = None
# Age and participants
age_group_min: Optional[int] = None
@@ -44,11 +69,22 @@ class Activity:
keywords: Optional[str] = None
tags: List[str] = field(default_factory=list)
popularity_score: int = 0
-
+
+ # Extraction / language metadata
+ language: Optional[str] = None # 'ro' / 'en'
+ normalized_name: Optional[str] = None # dedup key; auto-derived from name
+ extraction_confidence: Optional[str] = None # 'high' / 'med' / 'low'
+ needs_review: int = 0
+
# Database fields
id: Optional[int] = None
created_at: Optional[str] = None
updated_at: Optional[str] = None
+
+ def __post_init__(self):
+ """Backfill normalized_name (the dedup key) from name when it is None or empty."""
+ if not self.normalized_name:
+ self.normalized_name = normalize_name(self.name)
def to_dict(self) -> Dict[str, Any]:
"""Convert activity to dictionary for database storage"""
@@ -59,8 +95,11 @@ class Activity:
'variations': self.variations,
'category': self.category,
'subcategory': self.subcategory,
+ 'content_type': self.content_type,
'source_file': self.source_file,
+ 'source_files': json.dumps(self.source_files) if self.source_files else None,
'page_reference': self.page_reference,
+ 'source_excerpt': self.source_excerpt,
'age_group_min': self.age_group_min,
'age_group_max': self.age_group_max,
'participants_min': self.participants_min,
@@ -73,7 +112,11 @@ class Activity:
'difficulty_level': self.difficulty_level,
'keywords': self.keywords,
'tags': json.dumps(self.tags) if self.tags else None,
- 'popularity_score': self.popularity_score
+ 'popularity_score': self.popularity_score,
+ 'language': self.language,
+ 'normalized_name': self.normalized_name or normalize_name(self.name),
+ 'extraction_confidence': self.extraction_confidence,
+ 'needs_review': self.needs_review,
}
@classmethod
@@ -86,7 +129,17 @@ class Activity:
tags = json.loads(data['tags'])
except (json.JSONDecodeError, TypeError):
tags = []
-
+
+ # source_files may arrive as a JSON string (DB) or a list (extraction)
+ source_files = data.get('source_files')
+ if isinstance(source_files, str):
+ try:
+ source_files = json.loads(source_files)
+ except (json.JSONDecodeError, TypeError):
+ source_files = []
+ elif source_files is None:
+ source_files = []
+
return cls(
id=data.get('id'),
name=data.get('name', ''),
@@ -95,8 +148,11 @@ class Activity:
variations=data.get('variations'),
category=data.get('category', ''),
subcategory=data.get('subcategory'),
+ content_type=data.get('content_type'),
source_file=data.get('source_file', ''),
+ source_files=source_files,
page_reference=data.get('page_reference'),
+ source_excerpt=data.get('source_excerpt'),
age_group_min=data.get('age_group_min'),
age_group_max=data.get('age_group_max'),
participants_min=data.get('participants_min'),
@@ -110,6 +166,10 @@ class Activity:
keywords=data.get('keywords'),
tags=tags,
popularity_score=data.get('popularity_score', 0),
+ language=data.get('language'),
+ normalized_name=data.get('normalized_name'),
+ extraction_confidence=data.get('extraction_confidence'),
+ needs_review=data.get('needs_review', 0) or 0,
created_at=data.get('created_at'),
updated_at=data.get('updated_at')
)
diff --git a/app/models/database.py b/app/models/database.py
index 93524d4..816c403 100644
--- a/app/models/database.py
+++ b/app/models/database.py
@@ -30,6 +30,8 @@ class DatabaseManager:
"""Initialize database with v2.0 schema"""
with self._get_connection() as conn:
# Main activities table
+ # NOTE: schema is rebuilt from scratch (plan §6) — no in-place
+ # migration. The old DB is deleted and recreated by build_database.
conn.execute("""
CREATE TABLE IF NOT EXISTS activities (
id INTEGER PRIMARY KEY AUTOINCREMENT,
@@ -39,9 +41,12 @@ class DatabaseManager:
variations TEXT,
category TEXT NOT NULL,
subcategory TEXT,
+ content_type TEXT,
source_file TEXT NOT NULL,
+ source_files TEXT,
page_reference TEXT,
-
+ source_excerpt TEXT,
+
-- Structured parameters
age_group_min INTEGER,
age_group_max INTEGER,
@@ -49,26 +54,34 @@ class DatabaseManager:
participants_max INTEGER,
duration_min INTEGER,
duration_max INTEGER,
-
+
-- Categories for filtering
materials_category TEXT,
materials_list TEXT,
skills_developed TEXT,
difficulty_level TEXT,
-
+
-- Metadata
keywords TEXT,
tags TEXT,
popularity_score INTEGER DEFAULT 0,
+
+ -- Extraction / language metadata
+ language TEXT,
+ normalized_name TEXT,
+ extraction_confidence TEXT,
+ needs_review INTEGER DEFAULT 0,
+
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
)
""")
-
+
# FTS5 virtual table for search
conn.execute("""
CREATE VIRTUAL TABLE IF NOT EXISTS activities_fts USING fts5(
name, description, rules, variations, keywords,
+ materials_list, skills_developed,
content='activities',
content_rowid='id'
)
@@ -92,6 +105,7 @@ class DatabaseManager:
"CREATE INDEX IF NOT EXISTS idx_activities_age ON activities(age_group_min, age_group_max)",
"CREATE INDEX IF NOT EXISTS idx_activities_participants ON activities(participants_min, participants_max)",
"CREATE INDEX IF NOT EXISTS idx_activities_duration ON activities(duration_min, duration_max)",
+ "CREATE INDEX IF NOT EXISTS idx_activities_normalized_name ON activities(normalized_name)",
"CREATE INDEX IF NOT EXISTS idx_categories_type ON categories(type)"
]
@@ -102,24 +116,34 @@ class DatabaseManager:
conn.execute("""
CREATE TRIGGER IF NOT EXISTS activities_fts_insert AFTER INSERT ON activities
BEGIN
- INSERT INTO activities_fts(rowid, name, description, rules, variations, keywords)
- VALUES (new.id, new.name, new.description, new.rules, new.variations, new.keywords);
+ INSERT INTO activities_fts(rowid, name, description, rules, variations,
+ keywords, materials_list, skills_developed)
+ VALUES (new.id, new.name, new.description, new.rules, new.variations,
+ new.keywords, new.materials_list, new.skills_developed);
END
""")
-
+
conn.execute("""
CREATE TRIGGER IF NOT EXISTS activities_fts_delete AFTER DELETE ON activities
BEGIN
- DELETE FROM activities_fts WHERE rowid = old.id;
+ INSERT INTO activities_fts(activities_fts, rowid, name, description, rules,
+ variations, keywords, materials_list, skills_developed)
+ VALUES ('delete', old.id, old.name, old.description, old.rules,
+ old.variations, old.keywords, old.materials_list, old.skills_developed);
END
""")
-
+
conn.execute("""
CREATE TRIGGER IF NOT EXISTS activities_fts_update AFTER UPDATE ON activities
BEGIN
- DELETE FROM activities_fts WHERE rowid = old.id;
- INSERT INTO activities_fts(rowid, name, description, rules, variations, keywords)
- VALUES (new.id, new.name, new.description, new.rules, new.variations, new.keywords);
+ INSERT INTO activities_fts(activities_fts, rowid, name, description, rules,
+ variations, keywords, materials_list, skills_developed)
+ VALUES ('delete', old.id, old.name, old.description, old.rules,
+ old.variations, old.keywords, old.materials_list, old.skills_developed);
+ INSERT INTO activities_fts(rowid, name, description, rules, variations,
+ keywords, materials_list, skills_developed)
+ VALUES (new.id, new.name, new.description, new.rules, new.variations,
+ new.keywords, new.materials_list, new.skills_developed);
END
""")
@@ -179,6 +203,8 @@ class DatabaseManager:
"""Update category usage counts"""
categories_to_update = [
('category', activity.category),
+ ('content_type', activity.content_type),
+ ('language', activity.language),
('age_group', activity.get_age_range_display()),
('participants', activity.get_participants_display()),
('duration', activity.get_duration_display()),
@@ -332,8 +358,11 @@ class DatabaseManager:
def clear_database(self):
"""Clear all data from database"""
with self._get_connection() as conn:
+ # Deleting from activities fires the delete trigger, which removes
+ # the matching FTS rows. The explicit 'delete-all' command then
+ # guarantees the external-content FTS index is fully cleared.
conn.execute("DELETE FROM activities")
- conn.execute("DELETE FROM activities_fts")
+ conn.execute("INSERT INTO activities_fts(activities_fts) VALUES('delete-all')")
conn.execute("DELETE FROM categories")
conn.commit()
diff --git a/app/services/__init__.py b/app/services/__init__.py
index 38de191..36492a0 100644
--- a/app/services/__init__.py
+++ b/app/services/__init__.py
@@ -2,8 +2,6 @@
Services for INDEX-SISTEM-JOCURI v2.0
"""
-from .parser import IndexMasterParser
-from .indexer import ActivityIndexer
from .search import SearchService
-__all__ = ['IndexMasterParser', 'ActivityIndexer', 'SearchService']
\ No newline at end of file
+__all__ = ['SearchService']
diff --git a/app/services/indexer.py b/app/services/indexer.py
deleted file mode 100644
index ba9cd96..0000000
--- a/app/services/indexer.py
+++ /dev/null
@@ -1,248 +0,0 @@
-"""
-Activity indexer service for INDEX-SISTEM-JOCURI v2.0
-Coordinates parsing and database indexing
-"""
-
-from typing import List, Dict, Any
-from pathlib import Path
-from app.models.database import DatabaseManager
-from app.models.activity import Activity
-from app.services.parser import IndexMasterParser
-import time
-
-class ActivityIndexer:
- """Service for indexing activities from INDEX_MASTER into database"""
-
- def __init__(self, db_manager: DatabaseManager, index_master_path: str):
- """Initialize indexer with database manager and INDEX_MASTER path"""
- self.db = db_manager
- self.parser = IndexMasterParser(index_master_path)
- self.indexing_stats = {}
-
- def index_all_activities(self, clear_existing: bool = False) -> Dict[str, Any]:
- """Index all activities from INDEX_MASTER into database"""
-
- print("🚀 Starting activity indexing process...")
- start_time = time.time()
-
- # Clear existing data if requested
- if clear_existing:
- print("🗑️ Clearing existing database...")
- self.db.clear_database()
-
- # Parse activities from INDEX_MASTER
- print("📖 Parsing INDEX_MASTER file...")
- activities = self.parser.parse_all_categories()
-
- if not activities:
- print("❌ No activities were parsed!")
- return {'success': False, 'error': 'No activities parsed'}
-
- # Filter valid activities
- valid_activities = []
- for activity in activities:
- if self.parser.validate_activity_completeness(activity):
- valid_activities.append(activity)
- else:
- print(f"⚠️ Skipping incomplete activity: {activity.name[:50]}...")
-
- print(f"✅ Validated {len(valid_activities)} activities out of {len(activities)} parsed")
-
- if len(valid_activities) < 100:
- print(f"⚠️ Warning: Only {len(valid_activities)} valid activities found. Expected 500+")
-
- # Bulk insert into database
- print("💾 Inserting activities into database...")
- try:
- inserted_count = self.db.bulk_insert_activities(valid_activities)
-
- # Rebuild FTS index for optimal search performance
- print("🔍 Rebuilding search index...")
- self.db.rebuild_fts_index()
-
- end_time = time.time()
- indexing_time = end_time - start_time
-
- # Generate final statistics (with error handling)
- try:
- stats = self._generate_indexing_stats(valid_activities, indexing_time)
- stats['inserted_count'] = inserted_count
- stats['success'] = True
- except Exception as e:
- print(f"⚠️ Error generating statistics: {e}")
- stats = {
- 'success': True,
- 'inserted_count': inserted_count,
- 'indexing_time_seconds': indexing_time,
- 'error': f'Stats generation failed: {str(e)}'
- }
-
- print(f"✅ Indexing complete! {inserted_count} activities indexed in {indexing_time:.2f}s")
-
- # Verify database state (with error handling)
- try:
- db_stats = self.db.get_statistics()
- print(f"📊 Database now contains {db_stats['total_activities']} activities")
- except Exception as e:
- print(f"⚠️ Error getting database statistics: {e}")
- print(f"📊 Database insertion completed, statistics unavailable")
-
- return stats
-
- except Exception as e:
- print(f"❌ Error during database insertion: {e}")
- return {'success': False, 'error': str(e)}
-
- def index_specific_category(self, category_code: str) -> Dict[str, Any]:
- """Index activities from a specific category only"""
-
- print(f"🎯 Indexing specific category: {category_code}")
-
- # Load content and parse specific category
- if not self.parser.load_content():
- return {'success': False, 'error': 'Could not load INDEX_MASTER'}
-
- category_name = self.parser.category_mapping.get(category_code)
- if not category_name:
- return {'success': False, 'error': f'Unknown category code: {category_code}'}
-
- activities = self.parser.parse_category_section(category_code, category_name)
-
- if not activities:
- return {'success': False, 'error': f'No activities found in category {category_code}'}
-
- # Filter valid activities
- valid_activities = [a for a in activities if self.parser.validate_activity_completeness(a)]
-
- try:
- inserted_count = self.db.bulk_insert_activities(valid_activities)
- return {
- 'success': True,
- 'category': category_name,
- 'inserted_count': inserted_count,
- 'total_parsed': len(activities),
- 'valid_activities': len(valid_activities)
- }
- except Exception as e:
- return {'success': False, 'error': str(e)}
-
- def _generate_indexing_stats(self, activities: List[Activity], indexing_time: float) -> Dict[str, Any]:
- """Generate comprehensive indexing statistics"""
-
- # Get parser statistics
- parser_stats = self.parser.get_parsing_statistics()
-
- # Calculate additional metrics
- categories = {}
- age_ranges = {}
- durations = {}
- materials = {}
-
- for activity in activities:
- # Category breakdown
- if activity.category in categories:
- categories[activity.category] += 1
- else:
- categories[activity.category] = 1
-
- # Age range analysis (with safety check)
- try:
- age_key = activity.get_age_range_display() or "nespecificat"
- age_ranges[age_key] = age_ranges.get(age_key, 0) + 1
- except Exception as e:
- print(f"Warning: Error getting age range for activity {activity.name}: {e}")
- age_ranges["nespecificat"] = age_ranges.get("nespecificat", 0) + 1
-
- # Duration analysis (with safety check)
- try:
- duration_key = activity.get_duration_display() or "nespecificat"
- durations[duration_key] = durations.get(duration_key, 0) + 1
- except Exception as e:
- print(f"Warning: Error getting duration for activity {activity.name}: {e}")
- durations["nespecificat"] = durations.get("nespecificat", 0) + 1
-
- # Materials analysis (with safety check)
- try:
- materials_key = activity.get_materials_display() or "nespecificat"
- materials[materials_key] = materials.get(materials_key, 0) + 1
- except Exception as e:
- print(f"Warning: Error getting materials for activity {activity.name}: {e}")
- materials["nespecificat"] = materials.get("nespecificat", 0) + 1
-
- return {
- 'indexing_time_seconds': indexing_time,
- 'parsing_stats': parser_stats,
- 'distribution': {
- 'categories': categories,
- 'age_ranges': age_ranges,
- 'durations': durations,
- 'materials': materials
- },
- 'quality_metrics': {
- 'completion_rate': parser_stats.get('completion_rate', 0),
- 'average_description_length': parser_stats.get('average_description_length', 0),
- 'activities_with_metadata': sum(1 for a in activities if a.age_group_min or a.participants_min or a.duration_min)
- }
- }
-
- def verify_indexing_quality(self) -> Dict[str, Any]:
- """Verify the quality of indexed data"""
-
- try:
- # Get database statistics
- db_stats = self.db.get_statistics()
-
- # Check for minimum activity count
- total_activities = db_stats['total_activities']
- meets_minimum = total_activities >= 500
-
- # Check category distribution
- categories = db_stats.get('categories', {})
- category_coverage = len(categories)
-
- # Sample some activities to check quality
- sample_activities = self.db.search_activities(limit=10)
-
- quality_issues = []
- for activity in sample_activities:
- if not activity.get('description') or len(activity['description']) < 10:
- quality_issues.append(f"Activity {activity.get('name', 'Unknown')} has insufficient description")
-
- if not activity.get('category'):
- quality_issues.append(f"Activity {activity.get('name', 'Unknown')} missing category")
-
- return {
- 'total_activities': total_activities,
- 'meets_minimum_requirement': meets_minimum,
- 'minimum_target': 500,
- 'category_coverage': category_coverage,
- 'expected_categories': len(self.parser.category_mapping),
- 'quality_issues': quality_issues,
- 'quality_score': max(0, 100 - len(quality_issues) * 10),
- 'database_stats': db_stats
- }
-
- except Exception as e:
- return {'error': str(e), 'quality_score': 0}
-
- def get_indexing_progress(self) -> Dict[str, Any]:
- """Get current indexing progress and status"""
- try:
- db_stats = self.db.get_statistics()
-
- # Calculate progress towards 500+ activities goal
- total_activities = db_stats['total_activities']
- target_activities = 500
- progress_percentage = min(100, (total_activities / target_activities) * 100)
-
- return {
- 'current_activities': total_activities,
- 'target_activities': target_activities,
- 'progress_percentage': progress_percentage,
- 'status': 'completed' if total_activities >= target_activities else 'in_progress',
- 'categories_indexed': list(db_stats.get('categories', {}).keys()),
- 'database_size_mb': db_stats.get('database_size_bytes', 0) / (1024 * 1024)
- }
-
- except Exception as e:
- return {'error': str(e), 'status': 'error'}
\ No newline at end of file
diff --git a/app/services/parser.py b/app/services/parser.py
deleted file mode 100644
index e086248..0000000
--- a/app/services/parser.py
+++ /dev/null
@@ -1,340 +0,0 @@
-"""
-Advanced parser for INDEX_MASTER_JOCURI_ACTIVITATI.md
-Extracts 500+ individual activities with full details
-"""
-
-import re
-from pathlib import Path
-from typing import List, Dict, Optional, Tuple
-from app.models.activity import Activity
-
-class IndexMasterParser:
- """Advanced parser for extracting real activities from INDEX_MASTER"""
-
- def __init__(self, index_file_path: str):
- """Initialize parser with INDEX_MASTER file path"""
- self.index_file_path = Path(index_file_path)
- self.content = ""
- self.activities = []
-
- # Category mapping for main sections (exact match from file)
- self.category_mapping = {
- '[A]': 'JOCURI CERCETĂȘEȘTI ȘI SCOUT',
- '[B]': 'TEAM BUILDING ȘI COMUNICARE',
- '[C]': 'CAMPING ȘI ACTIVITĂȚI EXTERIOR',
- '[D]': 'ESCAPE ROOM ȘI PUZZLE-URI',
- '[E]': 'ORIENTARE ȘI BUSOLE',
- '[F]': 'PRIMUL AJUTOR ȘI SIGURANȚA',
- '[G]': 'ACTIVITĂȚI EDUCAȚIONALE',
- '[H]': 'RESURSE SPECIALE'
- }
-
- def load_content(self) -> bool:
- """Load and validate INDEX_MASTER content"""
- try:
- if not self.index_file_path.exists():
- print(f"❌ INDEX_MASTER file not found: {self.index_file_path}")
- return False
-
- with open(self.index_file_path, 'r', encoding='utf-8') as f:
- self.content = f.read()
-
- if len(self.content) < 1000: # Sanity check
- print(f"⚠️ INDEX_MASTER file seems too small: {len(self.content)} chars")
- return False
-
- print(f"✅ Loaded INDEX_MASTER: {len(self.content)} characters")
- return True
-
- except Exception as e:
- print(f"❌ Error loading INDEX_MASTER: {e}")
- return False
-
- def parse_all_categories(self) -> List[Activity]:
- """Parse all categories and extract individual activities"""
- if not self.load_content():
- return []
-
- print("🔍 Starting comprehensive parsing of INDEX_MASTER...")
-
- # Parse each main category
- for category_code, category_name in self.category_mapping.items():
- print(f"\n📂 Processing category {category_code}: {category_name}")
- category_activities = self.parse_category_section(category_code, category_name)
- self.activities.extend(category_activities)
- print(f" ✅ Extracted {len(category_activities)} activities")
-
- print(f"\n🎯 Total activities extracted: {len(self.activities)}")
- return self.activities
-
- def parse_category_section(self, category_code: str, category_name: str) -> List[Activity]:
- """Parse a specific category section"""
- activities = []
-
- # Find the category section - exact pattern match
- # Look for the actual section, not the table of contents
- pattern = rf"^## {re.escape(category_code)} {re.escape(category_name)}\s*$"
- matches = list(re.finditer(pattern, self.content, re.MULTILINE | re.IGNORECASE))
-
- if not matches:
- print(f" ⚠️ Category section not found: {category_code}")
- return activities
-
- # Take the last match (should be the actual section, not TOC)
- match = matches[-1]
- print(f" 📍 Found section at position {match.start()}")
-
- # Extract content until next main category or end
- start_pos = match.end()
-
- # Find next main category (look for complete header)
- next_category_pattern = r"^## \[[A-H]\] [A-ZĂÂÎȘȚ]"
- next_match = re.search(next_category_pattern, self.content[start_pos:], re.MULTILINE)
-
- if next_match:
- end_pos = start_pos + next_match.start()
- section_content = self.content[start_pos:end_pos]
- else:
- section_content = self.content[start_pos:]
-
- # Parse subsections within the category
- activities.extend(self._parse_subsections(section_content, category_name))
-
- return activities
-
- def _parse_subsections(self, section_content: str, category_name: str) -> List[Activity]:
- """Parse subsections within a category"""
- activities = []
-
- # Find all subsections (### markers)
- subsection_pattern = r"^### (.+?)$"
- subsections = re.finditer(subsection_pattern, section_content, re.MULTILINE)
-
- subsection_list = list(subsections)
-
- for i, subsection in enumerate(subsection_list):
- subsection_title = subsection.group(1).strip()
- subsection_start = subsection.end()
-
- # Find end of subsection
- if i + 1 < len(subsection_list):
- subsection_end = subsection_list[i + 1].start()
- else:
- subsection_end = len(section_content)
-
- subsection_text = section_content[subsection_start:subsection_end]
-
- # Parse individual games in this subsection
- subsection_activities = self._parse_games_in_subsection(
- subsection_text, category_name, subsection_title
- )
- activities.extend(subsection_activities)
-
- return activities
-
- def _parse_games_in_subsection(self, subsection_text: str, category_name: str, subsection_title: str) -> List[Activity]:
- """Parse individual games within a subsection"""
- activities = []
-
- # Look for "Exemple de jocuri:" sections
- examples_pattern = r"\*\*Exemple de jocuri:\*\*\s*\n(.*?)(?=\n\*\*|$)"
- examples_matches = re.finditer(examples_pattern, subsection_text, re.DOTALL)
-
- for examples_match in examples_matches:
- examples_text = examples_match.group(1)
-
- # Extract individual games (numbered list)
- game_pattern = r"^(\d+)\.\s*\*\*(.+?)\*\*\s*-\s*(.+?)$"
- games = re.finditer(game_pattern, examples_text, re.MULTILINE)
-
- for game_match in games:
- game_number = game_match.group(1)
- game_name = game_match.group(2).strip()
- game_description = game_match.group(3).strip()
-
- # Extract metadata from subsection
- metadata = self._extract_subsection_metadata(subsection_text)
-
- # Create activity
- activity = Activity(
- name=game_name,
- description=game_description,
- category=category_name,
- subcategory=subsection_title,
- source_file=f"INDEX_MASTER_JOCURI_ACTIVITATI.md",
- page_reference=f"{category_name} > {subsection_title} > #{game_number}",
- **metadata
- )
-
- activities.append(activity)
-
- # Also extract from direct activity descriptions without "Exemple de jocuri"
- activities.extend(self._parse_direct_activities(subsection_text, category_name, subsection_title))
-
- return activities
-
- def _extract_subsection_metadata(self, subsection_text: str) -> Dict:
- """Extract metadata from subsection text"""
- metadata = {}
-
- # Extract participants info
- participants_pattern = r"\*\*Participanți:\*\*\s*(.+?)(?:\n|\*\*)"
- participants_match = re.search(participants_pattern, subsection_text)
- if participants_match:
- participants_text = participants_match.group(1).strip()
- participants = self._parse_participants(participants_text)
- metadata.update(participants)
-
- # Extract duration
- duration_pattern = r"\*\*Durata:\*\*\s*(.+?)(?:\n|\*\*)"
- duration_match = re.search(duration_pattern, subsection_text)
- if duration_match:
- duration_text = duration_match.group(1).strip()
- duration = self._parse_duration(duration_text)
- metadata.update(duration)
-
- # Extract materials
- materials_pattern = r"\*\*Materiale:\*\*\s*(.+?)(?:\n|\*\*)"
- materials_match = re.search(materials_pattern, subsection_text)
- if materials_match:
- materials_text = materials_match.group(1).strip()
- metadata['materials_list'] = materials_text
- metadata['materials_category'] = self._categorize_materials(materials_text)
-
- # Extract keywords
- keywords_pattern = r"\*\*Cuvinte cheie:\*\*\s*(.+?)(?:\n|\*\*)"
- keywords_match = re.search(keywords_pattern, subsection_text)
- if keywords_match:
- metadata['keywords'] = keywords_match.group(1).strip()
-
- return metadata
-
- def _parse_participants(self, participants_text: str) -> Dict:
- """Parse participants information"""
- result = {}
-
- # Look for number ranges like "8-30 copii" or "5-15 persoane"
- range_pattern = r"(\d+)-(\d+)"
- range_match = re.search(range_pattern, participants_text)
-
- if range_match:
- result['participants_min'] = int(range_match.group(1))
- result['participants_max'] = int(range_match.group(2))
- else:
- # Look for single numbers
- number_pattern = r"(\d+)\+"
- number_match = re.search(number_pattern, participants_text)
- if number_match:
- result['participants_min'] = int(number_match.group(1))
-
- # Extract age information
- age_pattern = r"(\d+)-(\d+)\s*ani"
- age_match = re.search(age_pattern, participants_text)
- if age_match:
- result['age_group_min'] = int(age_match.group(1))
- result['age_group_max'] = int(age_match.group(2))
-
- return result
-
- def _parse_duration(self, duration_text: str) -> Dict:
- """Parse duration information"""
- result = {}
-
- # Look for time ranges like "5-20 minute" or "15-30min"
- range_pattern = r"(\d+)-(\d+)\s*(?:minute|min)"
- range_match = re.search(range_pattern, duration_text)
-
- if range_match:
- result['duration_min'] = int(range_match.group(1))
- result['duration_max'] = int(range_match.group(2))
- else:
- # Look for single duration
- single_pattern = r"(\d+)\+?\s*(?:minute|min)"
- single_match = re.search(single_pattern, duration_text)
- if single_match:
- result['duration_min'] = int(single_match.group(1))
-
- return result
-
- def _categorize_materials(self, materials_text: str) -> str:
- """Categorize materials into simple categories"""
- materials_lower = materials_text.lower()
-
- if any(word in materials_lower for word in ['fără', 'nu necesare', 'nimic', 'minime']):
- return 'Fără materiale'
- elif any(word in materials_lower for word in ['hârtie', 'creion', 'marker', 'simple']):
- return 'Materiale simple'
- elif any(word in materials_lower for word in ['computer', 'proiector', 'echipament', 'complexe']):
- return 'Materiale complexe'
- else:
- return 'Materiale variate'
-
- def _parse_direct_activities(self, subsection_text: str, category_name: str, subsection_title: str) -> List[Activity]:
- """Parse activities that are described directly without 'Exemple de jocuri' section"""
- activities = []
-
- # Look for activity descriptions in sections that don't have "Exemple de jocuri"
- if "**Exemple de jocuri:**" not in subsection_text:
- # Try to extract from file descriptions
- file_pattern = r"\*\*Fișier:\*\*\s*`([^`]+)`.*?\*\*(.+?)\*\*"
- file_matches = re.finditer(file_pattern, subsection_text, re.DOTALL)
-
- for file_match in file_matches:
- file_name = file_match.group(1)
- description_part = file_match.group(2)
-
- # Create a general activity for this file
- activity = Activity(
- name=f"Activități din {file_name}",
- description=f"Colecție de activități din fișierul {file_name}. {description_part[:200]}...",
- category=category_name,
- subcategory=subsection_title,
- source_file=file_name,
- page_reference=f"{category_name} > {subsection_title}",
- **self._extract_subsection_metadata(subsection_text)
- )
-
- activities.append(activity)
-
- return activities
-
- def validate_activity_completeness(self, activity: Activity) -> bool:
- """Validate that an activity has all necessary fields"""
- required_fields = ['name', 'description', 'category', 'source_file']
-
- for field in required_fields:
- if not getattr(activity, field) or not getattr(activity, field).strip():
- return False
-
- # Check minimum description length
- if len(activity.description) < 10:
- return False
-
- return True
-
- def get_parsing_statistics(self) -> Dict:
- """Get statistics about the parsing process"""
- if not self.activities:
- return {'total_activities': 0}
-
- category_counts = {}
- valid_activities = 0
-
- for activity in self.activities:
- # Count by category
- if activity.category in category_counts:
- category_counts[activity.category] += 1
- else:
- category_counts[activity.category] = 1
-
- # Count valid activities
- if self.validate_activity_completeness(activity):
- valid_activities += 1
-
- return {
- 'total_activities': len(self.activities),
- 'valid_activities': valid_activities,
- 'completion_rate': (valid_activities / len(self.activities)) * 100 if self.activities else 0,
- 'category_breakdown': category_counts,
- 'average_description_length': sum(len(a.description) for a in self.activities) / len(self.activities) if self.activities else 0
- }
\ No newline at end of file
diff --git a/app/services/search.py b/app/services/search.py
index a41857a..2a64261 100644
--- a/app/services/search.py
+++ b/app/services/search.py
@@ -5,8 +5,19 @@ Enhanced search with FTS5 and intelligent filtering
from typing import List, Dict, Any, Optional
from app.models.database import DatabaseManager
+from app.config_taxonomy import NON_GAME_CONTENT_TYPES
import re
+# Category slugs that are themselves "non-game" — selecting one of these as a
+# category filter also lifts the default non-game content_type exclusion.
+NON_GAME_CATEGORIES = {"retete", "cantece-ceremonii"}
+
+# When a Python-side post-filter is active the DB LIMIT is applied *before*
+# filtering, so we over-fetch to still satisfy the caller's `limit`.
+_OVERSCAN_FACTOR = 5
+_OVERSCAN_CAP = 2000
+
+
class SearchService:
"""Enhanced search service with intelligent query processing"""
@@ -24,22 +35,72 @@ class SearchService:
if filters is None:
filters = {}
-
+
# Process and normalize search text
processed_search = self._process_search_text(search_text)
-
+
# Map web filters to database fields
db_filters = self._map_filters_to_db_fields(filters)
-
+
+ # content_type and language are filtered in Python: the DB layer does
+ # not expose them as query parameters. The DEFAULT search excludes the
+ # non-game content types (rețete / cântece / ceremonii) — they surface
+ # only when the user explicitly filters that content_type, or picks a
+ # non-game category. See plan §6.
+ content_type, exclude_non_game = self._resolve_content_type_filter(filters)
+ language = (filters.get('language') or '').strip().lower() or None
+ post_filtering = bool(content_type or exclude_non_game or language)
+
+ # Over-fetch when post-filtering so the final list can still reach `limit`.
+ fetch_limit = min(limit * _OVERSCAN_FACTOR, _OVERSCAN_CAP) if post_filtering else limit
+
# Perform database search
results = self.db.search_activities(
search_text=processed_search,
**db_filters,
- limit=limit
+ limit=fetch_limit
)
-
- # Post-process results for relevance and ranking
- return self._post_process_results(results, processed_search, filters)
+
+ # Apply content_type / language post-filters
+ results = self._apply_content_type_filter(results, content_type, exclude_non_game)
+ if language:
+ results = [r for r in results
+ if (r.get('language') or '').strip().lower() == language]
+
+ # Post-process results for relevance and ranking, then honour `limit`
+ results = self._post_process_results(results, processed_search, filters)
+ return results[:limit]
+
+ def _resolve_content_type_filter(self, filters: Dict[str, str]):
+ """Determine the content_type post-filter.
+
+ Only the `content_type` and `category` entries of `filters` are read;
+ a missing, None or whitespace-only value counts as "not set".
+
+ Returns (explicit_content_type | None, exclude_non_game: bool):
+ - an explicit `content_type` filter → that value, no exclusion;
+ - a `category` filter on a non-game category → no exclusion;
+ - otherwise → default search, exclude non-game content types.
+ """
+ # NOTE(review): the value is stripped but not lower-cased, and
+ # _apply_content_type_filter compares it with `==`, so the match is
+ # case-sensitive (the `language` filter, by contrast, is lower-cased).
+ content_type = (filters.get('content_type') or '').strip()
+ if content_type:
+ return content_type, False
+ # Exact slug match against NON_GAME_CATEGORIES.
+ category = (filters.get('category') or '').strip()
+ if category in NON_GAME_CATEGORIES:
+ return None, False
+ return None, True
+
+ def _apply_content_type_filter(self,
+ results: List[Dict[str, Any]],
+ content_type: Optional[str],
+ exclude_non_game: bool) -> List[Dict[str, Any]]:
+ """Filter results by content_type (explicit include vs default exclude).
+
+ - `content_type` set → keep only rows whose content_type equals it
+ (this takes precedence over `exclude_non_game`);
+ - else `exclude_non_game` → drop rows whose content_type is in
+ NON_GAME_CONTENT_TYPES;
+ - else → `results` is returned unchanged (the same list object).
+
+ A row whose content_type is missing or None is treated as ''.
+ """
+ if content_type:
+ return [r for r in results
+ if (r.get('content_type') or '') == content_type]
+ if exclude_non_game:
+ # Rows with NULL/unknown content_type are kept — only the known
+ # non-game types are dropped from the default search.
+ return [r for r in results
+ if (r.get('content_type') or '') not in NON_GAME_CONTENT_TYPES]
+ return results
def _process_search_text(self, search_text: Optional[str]) -> Optional[str]:
"""Process and enhance search text for better FTS5 results"""
@@ -83,10 +144,16 @@ class SearchService:
if not filter_value or not filter_value.strip():
continue
+ # content_type / language are NOT database query params — they are
+ # applied as Python post-filters in search_activities(). Skip them
+ # here so they never reach DatabaseManager.search_activities().
+ if filter_key in ('content_type', 'language'):
+ continue
+
# Map filter types to database fields
if filter_key == 'category':
db_filters['category'] = filter_value
-
+
elif filter_key == 'age_group':
# Parse age range (e.g., "5-8 ani", "12+ ani")
age_match = re.search(r'(\d+)(?:-(\d+))?\s*ani?', filter_value)
@@ -177,21 +244,22 @@ class SearchService:
boost_score = 0
# Check name matches (highest priority)
- name_lower = result.get('name', '').lower()
+ # NB: use `or ''` — nullable columns come back as None, not ''.
+ name_lower = (result.get('name') or '').lower()
for term in search_terms:
if term in name_lower:
boost_score += 10
if name_lower.startswith(term):
boost_score += 5 # Extra boost for name starts with term
-
+
# Check description matches
- desc_lower = result.get('description', '').lower()
+ desc_lower = (result.get('description') or '').lower()
for term in search_terms:
if term in desc_lower:
boost_score += 3
-
+
# Check keywords matches
- keywords_lower = result.get('keywords', '').lower()
+ keywords_lower = (result.get('keywords') or '').lower()
for term in search_terms:
if term in keywords_lower:
boost_score += 5
@@ -280,11 +348,14 @@ class SearchService:
return []
try:
- # Search for activities that match the partial query
+ # Search for activities that match the partial query.
+ # Over-fetch then drop non-game content types so autocomplete
+ # mirrors the default search (no rețete / cântece / ceremonii).
results = self.db.search_activities(
search_text=f'"{partial_query}"',
- limit=limit * 2
+ limit=limit * 6
)
+ results = self._apply_content_type_filter(results, None, True)
suggestions = []
seen = set()
diff --git a/app/templates/activity.html b/app/templates/activity.html
index 6e25f08..d865f0a 100644
--- a/app/templates/activity.html
+++ b/app/templates/activity.html
@@ -15,7 +15,13 @@
diff --git a/app/web/routes.py b/app/web/routes.py
index 6445e7a..56fd7ca 100644
--- a/app/web/routes.py
+++ b/app/web/routes.py
@@ -7,11 +7,17 @@ from flask import Blueprint, request, render_template, jsonify, current_app
from app.models.database import DatabaseManager
from app.models.activity import Activity
from app.services.search import SearchService
+from app.config_taxonomy import CATEGORIES, CONTENT_TYPES
import os
from pathlib import Path
bp = Blueprint('main', __name__)
+# Slug -> Romanian display name. Category, content_type and language slugs
+# never collide, so a single flat map is enough for the UI filter labels.
+LANGUAGE_NAMES = {'ro': 'Română', 'en': 'Engleză'}
+DISPLAY_NAMES = {**CATEGORIES, **CONTENT_TYPES, **LANGUAGE_NAMES}
+
# Initialize database manager (will be configured in application factory)
def get_db_manager():
"""Get database manager instance"""
@@ -36,15 +42,17 @@ def index():
# Get database statistics for the interface
stats = db.get_statistics()
- return render_template('index.html',
+ return render_template('index.html',
filters=filter_options,
+ display_names=DISPLAY_NAMES,
stats=stats)
-
+
except Exception as e:
print(f"Error loading main page: {e}")
# Fallback with empty filters
- return render_template('index.html',
+ return render_template('index.html',
filters={},
+ display_names=DISPLAY_NAMES,
stats={'total_activities': 0})
@bp.route('/search', methods=['GET', 'POST'])
@@ -82,8 +90,9 @@ def search():
search_query=search_query,
applied_filters=filters,
filters=filter_options,
+ display_names=DISPLAY_NAMES,
results_count=len(activities))
-
+
except Exception as e:
print(f"Search error: {e}")
return render_template('results.html',
@@ -91,6 +100,7 @@ def search():
search_query='',
applied_filters={},
filters={},
+ display_names=DISPLAY_NAMES,
results_count=0,
error=str(e))
@@ -121,6 +131,7 @@ def activity_detail(activity_id):
return render_template('activity.html',
activity=activity,
+ display_names=DISPLAY_NAMES,
similar_activities=similar_activities)
except Exception as e:
diff --git a/scripts/SUBAGENT_PROMPT.md b/scripts/SUBAGENT_PROMPT.md
new file mode 100644
index 0000000..79c3e9c
--- /dev/null
+++ b/scripts/SUBAGENT_PROMPT.md
@@ -0,0 +1,81 @@
+# SUBAGENT — Activity extraction
+
+You are a subagent in the game-library extraction pipeline. You extract
+educational activities (games, team-building, scouting, recipes, songs,
+ceremonies) from one chunk of a source document into structured JSON.
+
+## Your task
+
+1. **Read ONLY the chunk you were assigned.** Do not read other chunks, other
+ files, or the original document. The chunk is a `.txt` file with
+ `--- PAGE N ---` markers.
+2. Identify **every distinct activity** in the chunk.
+3. For each activity, fill the schema in `scripts/activity_schema.json`.
+4. Write the result to `data/extracted/<chunk_key>.json`.
+
+## What counts as "a distinct activity"
+
+A distinct activity is a self-contained game/activity/recipe/song/ceremony with
+its own name and a real description of how to do it. It is NOT:
+
+- a bare mention or a cross-reference with no description — **skip it**;
+- a sub-variant of an activity already extracted — fold it into `variations`;
+- a heading, a table of contents entry, or running page chrome.
+
+If the same activity is split across a page boundary inside your chunk, treat it
+as **one** activity and combine the text.
+
+## Output format
+
+The file is one JSON object: a `header` plus an `activities` array.
+
+```json
+{
+ "header": {
+ "source_id": "<source_id>",
+ "chunk_key": "<chunk_key>",
+ "source_hash": "<source_hash>",
+ "schema_version": "1.0",
+ "prompt_version": "1.0",
+ "chunk_range": "pages 1-20"
+ },
+ "activities": [ ... ]
+}
+```
+
+## Rules for each activity
+
+- **`name`** — the activity's real name (≥3 characters).
+- **`description`** — real prose describing the activity. No hard length limit,
+ but it must actually describe what happens.
+- **`rules`** — how it is played / carried out, if the source gives rules.
+- **`category`** — exactly one taxonomy slug (see the `enum` in the schema):
+ `jocuri-cercetasesti`, `team-building`, `icebreakers`, `camp-outdoor`,
+ `wide-games`, `orientare`, `prim-ajutor`, `escape-room-puzzle`,
+ `creative-stem`, `sports-active`, `cantece-ceremonii`, `retete`,
+ `supravietuire`, `integrare-incluziune`, `conflict-empatie`, `altele`.
+ When unsure, use `altele`.
+- **`content_type`** — the FORM of the content, independent of category:
+ `joc`, `activitate`, `reteta`, `cantec`, or `ceremonie`.
+- **`language`** — `ro` or `en` (the language the activity is written in).
+- **`source_excerpt`** — **MANDATORY.** A short quote (one or two sentences)
+ copied **verbatim** from the chunk. This is the anti-hallucination anchor: it
+ is checked as a fuzzy substring of the chunk, and invented quotes are
+ rejected.
+- **`page_reference`** — **MANDATORY.** The `--- PAGE N ---` marker(s) the
+ activity came from, e.g. `"page 14"` or `"pages 14-15"`.
+- **`extraction_confidence`** — `high`, `med`, or `low`. Use `low` when the
+ source text for the activity is thin or ambiguous.
+
+## Never invent data
+
+- Do **not** invent ages, participant counts, or durations. If the source does
+ not state them, leave those fields `null`.
+- Do **not** paraphrase the `source_excerpt` — copy it character for character.
+- Better to extract fewer activities accurately than to pad the output.
+
+## Before you finish
+
+- Every activity has a non-empty `source_excerpt` and `page_reference`.
+- The file validates against `scripts/activity_schema.json`.
+- You only used text from your assigned chunk.
diff --git a/scripts/activity_schema.json b/scripts/activity_schema.json
new file mode 100644
index 0000000..922dc86
--- /dev/null
+++ b/scripts/activity_schema.json
@@ -0,0 +1,110 @@
+{
+ "$schema": "http://json-schema.org/draft-07/schema#",
+ "title": "Game-library extraction output",
+ "description": "One subagent output file: a header carrying provenance/version metadata plus the list of activities extracted from a single chunk.",
+ "type": "object",
+ "required": ["header", "activities"],
+ "additionalProperties": false,
+ "properties": {
+ "header": {
+ "type": "object",
+ "required": ["source_hash", "schema_version", "prompt_version", "chunk_range"],
+ "additionalProperties": true,
+ "properties": {
+ "source_hash": {"type": "string", "minLength": 8},
+ "schema_version": {"type": "string"},
+ "prompt_version": {"type": "string"},
+ "chunk_range": {"type": "string"},
+ "source_id": {"type": ["string", "null"]},
+ "chunk_key": {"type": ["string", "null"]}
+ }
+ },
+ "activities": {
+ "type": "array",
+ "items": {"$ref": "#/definitions/activity"}
+ }
+ },
+ "definitions": {
+ "activity": {
+ "type": "object",
+ "required": [
+ "name",
+ "description",
+ "category",
+ "content_type",
+ "language",
+ "extraction_confidence",
+ "source_excerpt",
+ "page_reference"
+ ],
+ "additionalProperties": false,
+ "properties": {
+ "name": {"type": "string", "minLength": 3},
+ "description": {"type": "string", "minLength": 1},
+ "rules": {"type": ["string", "null"]},
+ "variations": {"type": ["string", "null"]},
+ "category": {
+ "type": "string",
+ "enum": [
+ "jocuri-cercetasesti",
+ "team-building",
+ "icebreakers",
+ "camp-outdoor",
+ "wide-games",
+ "orientare",
+ "prim-ajutor",
+ "escape-room-puzzle",
+ "creative-stem",
+ "sports-active",
+ "cantece-ceremonii",
+ "retete",
+ "supravietuire",
+ "integrare-incluziune",
+ "conflict-empatie",
+ "altele"
+ ]
+ },
+ "subcategory": {"type": ["string", "null"]},
+ "content_type": {
+ "type": "string",
+ "enum": ["joc", "activitate", "reteta", "cantec", "ceremonie"]
+ },
+ "language": {"type": "string", "enum": ["ro", "en"]},
+ "extraction_confidence": {
+ "type": "string",
+ "enum": ["high", "med", "low"]
+ },
+ "source_excerpt": {"type": "string", "minLength": 1},
+ "page_reference": {"type": "string", "minLength": 1},
+ "source_file": {"type": ["string", "null"]},
+ "age_group_min": {"type": ["integer", "null"], "minimum": 0},
+ "age_group_max": {"type": ["integer", "null"], "minimum": 0},
+ "participants_min": {"type": ["integer", "null"], "minimum": 0},
+ "participants_max": {"type": ["integer", "null"], "minimum": 0},
+ "duration_min": {"type": ["integer", "null"], "minimum": 0},
+ "duration_max": {"type": ["integer", "null"], "minimum": 0},
+ "materials_category": {"type": ["string", "null"]},
+ "materials_list": {
+ "type": ["array", "null"],
+ "items": {"type": "string"}
+ },
+ "skills_developed": {
+ "type": ["array", "null"],
+ "items": {"type": "string"}
+ },
+ "difficulty_level": {
+ "type": ["string", "null"],
+ "enum": ["usor", "mediu", "dificil", null]
+ },
+ "keywords": {
+ "type": ["array", "null"],
+ "items": {"type": "string"}
+ },
+ "tags": {
+ "type": ["array", "null"],
+ "items": {"type": "string"}
+ }
+ }
+ }
+ }
+}
diff --git a/scripts/build_database.py b/scripts/build_database.py
new file mode 100644
index 0000000..d7276be
--- /dev/null
+++ b/scripts/build_database.py
@@ -0,0 +1,639 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+build_database.py — build data/activities.db from the subagent extraction JSON.
+
+Replaces the old import_claude_activities.py. Pipeline (plan §4):
+
+ 1. `--rebuild` builds into data/activities.db.tmp; on success the live DB is
+ backed up to data/activities.db.bak and the tmp file is swapped in with an
+ atomic os.replace. A mid-build crash leaves the live DB untouched.
+ 2. Every data/extracted/*.json is validated against scripts/activity_schema.json;
+ invalid files are moved to data/extracted/_rejected/ with an error log.
+ 2b. Each source_excerpt must appear as a fuzzy substring (rapidfuzz
+ partial_ratio >= 90) of its source chunk — non-matches are hallucinations
+ and the activity is dropped (logged to _rejected/).
+ 3. `category` is normalized to a valid taxonomy slug (fallback `altele`).
+ 4. Dedup (D5): group by exact normalized_name, never across languages; within a
+ group rapidfuzz on descriptions — >=85 auto-merge, 60-85 borderline (keep
+ both, needs_review), <60 separate variants.
+ 5. data/review_decisions.json is applied before insert.
+ 6. Bulk insert into the tmp DB, populate the categories table, rebuild FTS.
+ 7. A QA report is printed.
+
+Usage:
+ python scripts/build_database.py --rebuild
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import os
+import shutil
+import sys
+from collections import defaultdict
+from pathlib import Path
+from typing import Any, Optional
+
+SCRIPT_DIR = Path(__file__).resolve().parent
+REPO_ROOT = SCRIPT_DIR.parent
+for _p in (str(SCRIPT_DIR), str(REPO_ROOT)):
+ if _p not in sys.path:
+ sys.path.insert(0, _p)
+
+from app.config_taxonomy import ( # noqa: E402
+ category_display_name,
+ normalize_category,
+ normalize_content_type,
+)
+from app.models.activity import Activity # noqa: E402
+from app.models.database import DatabaseManager # noqa: E402
+from import_common import ( # noqa: E402
+ DEFAULT_SCHEMA_PATH,
+ content_key,
+ excerpt_matches,
+ find_chunk_text,
+ iter_extraction_files,
+ load_schema,
+ normalize_name,
+ source_path_for,
+)
+
+# dedup thresholds (rapidfuzz token_sort_ratio, 0..100 scale)
+AUTO_MERGE_THRESHOLD = 85.0
+BORDERLINE_THRESHOLD = 60.0
+
+
+# --------------------------------------------------------------------------
+# extraction dict -> Activity
+# --------------------------------------------------------------------------
+def _csv(value: Any) -> Optional[str]:
+ """Schema arrays -> comma string for the (TEXT) DB columns.
+
+ - None -> None;
+ - str -> the stripped string, or None if nothing is left;
+ - list/tuple -> each item str()-ed and stripped, blank items dropped,
+ the rest joined with ", " (None if nothing is left);
+ - anything else -> str(value), returned as-is.
+ """
+ if value is None:
+ return None
+ if isinstance(value, str):
+ return value.strip() or None
+ if isinstance(value, (list, tuple)):
+ # NOTE(review): an item that itself contains a comma becomes
+ # indistinguishable from two items once _split_csv() reads it back.
+ parts = [str(v).strip() for v in value if str(v).strip()]
+ return ", ".join(parts) or None
+ return str(value)
+
+
+def _split_csv(value: Optional[str]) -> list[str]:
+ """Split a comma-separated string into stripped, non-empty items.
+
+ A falsy value (None or "") yields an empty list.
+ """
+ if not value:
+ return []
+ return [p.strip() for p in str(value).split(",") if p.strip()]
+
+
+def dict_to_activity(adict: dict, source_file: str) -> Activity:
+ """Build an Activity from one extraction-JSON activity object.
+
+ `adict` is one element of the extraction file's `activities` array;
+ `source_file` is the source the chunk came from and is what ends up in
+ Activity.source_file (a `source_file` key inside `adict` is not read).
+
+ name/description are stripped; category and content_type go through
+ normalize_category() / normalize_content_type(); the array-valued fields
+ materials_list, skills_developed and keywords are flattened with _csv();
+ everything else is copied over as-is (missing keys become None).
+ """
+ # tags may arrive as a list or as a comma-separated string.
+ tags = adict.get("tags") or []
+ if isinstance(tags, str):
+ tags = _split_csv(tags)
+
+ # NOTE(review): `source_files` is not a property of the activity object in
+ # scripts/activity_schema.json (which sets additionalProperties: false), so
+ # for schema-validated input this is presumably always empty — confirm.
+ source_files = adict.get("source_files") or []
+ if isinstance(source_files, str):
+ source_files = _split_csv(source_files)
+ # The file this activity was read from always comes first in the list.
+ if source_file and source_file not in source_files:
+ source_files = [source_file, *source_files]
+
+ return Activity(
+ name=(adict.get("name") or "").strip(),
+ description=(adict.get("description") or "").strip(),
+ rules=adict.get("rules"),
+ variations=adict.get("variations"),
+ category=normalize_category(adict.get("category", "")),
+ subcategory=adict.get("subcategory"),
+ content_type=normalize_content_type(adict.get("content_type", "")),
+ source_file=source_file,
+ source_files=list(source_files),
+ page_reference=adict.get("page_reference"),
+ source_excerpt=adict.get("source_excerpt"),
+ age_group_min=adict.get("age_group_min"),
+ age_group_max=adict.get("age_group_max"),
+ participants_min=adict.get("participants_min"),
+ participants_max=adict.get("participants_max"),
+ duration_min=adict.get("duration_min"),
+ duration_max=adict.get("duration_max"),
+ materials_category=adict.get("materials_category"),
+ materials_list=_csv(adict.get("materials_list")),
+ skills_developed=_csv(adict.get("skills_developed")),
+ difficulty_level=adict.get("difficulty_level"),
+ keywords=_csv(adict.get("keywords")),
+ tags=list(tags),
+ language=adict.get("language"),
+ extraction_confidence=adict.get("extraction_confidence"),
+ )
+
+
+# --------------------------------------------------------------------------
+# step 3 — category normalization is done in dict_to_activity; a non-taxonomy
+# value silently falls back to `altele`. This logs the substitutions.
+# --------------------------------------------------------------------------
+def log_category_fallbacks(raw_pairs: list[tuple[str, str]]) -> list[str]:
+ """raw_pairs = (original, slug); return human-readable fallback messages.
+
+ A pair yields a message only when the slug is `altele` while the original
+ value, after normalize_name(), is neither empty nor `altele` itself.
+ Nothing is printed or logged here; the caller decides what to do with the
+ returned strings.
+ """
+ msgs = []
+ for original, slug in raw_pairs:
+ # `original or ""` tolerates a None original.
+ if slug == "altele" and normalize_name(original or "") not in ("", "altele"):
+ msgs.append(f"category '{original}' -> altele (not in taxonomy)")
+ return msgs
+
+
+# --------------------------------------------------------------------------
+# step 4 — dedup
+# --------------------------------------------------------------------------
+def _longest(*values: Optional[str]) -> Optional[str]:
+ """Return the longest truthy string among `values`, or None if there is none.
+
+ None and "" are skipped. On a length tie the earliest argument wins,
+ because a later value only replaces the current best when strictly longer.
+ """
+ best: Optional[str] = None
+ for v in values:
+ if v and (best is None or len(v) > len(best)):
+ best = v
+ return best
+
+
+def _union_csv(values: list[Optional[str]]) -> Optional[str]:
+ """Order-preserving union of the items of several comma-separated strings.
+
+ Each value is split with _split_csv(); items are compared exactly
+ (case-sensitive) and kept in first-seen order. The result is re-joined
+ with ", ", or None when no item was found.
+ """
+ seen: list[str] = []
+ for value in values:
+ for item in _split_csv(value):
+ if item not in seen:
+ seen.append(item)
+ return ", ".join(seen) or None
+
+
+def merge_cluster(cluster: list[Activity]) -> Activity:
+ """Collapse a cluster of duplicate activities into one merged Activity.
+
+ A single-element cluster is returned as-is (same object). Otherwise a new
+ Activity is built:
+ - description / rules / variations: the longest value in the cluster;
+ - materials_list / skills_developed / keywords / tags: ordered union;
+ - source_files: every source_file and source_files entry seen;
+ - all remaining fields: copied from the representative (the member with
+ the longest description).
+ """
+ if len(cluster) == 1:
+ return cluster[0]
+
+ # representative = the one with the longest description
+ rep = max(cluster, key=lambda a: len(a.description or ""))
+ # NOTE(review): scalar fields (ages, participants, duration, difficulty,
+ # confidence, page_reference, source_excerpt) come from `rep` only; values
+ # present on other members but missing on `rep` are not filled in.
+ merged = Activity(
+ name=rep.name,
+ description=_longest(*(a.description for a in cluster)) or rep.description,
+ rules=_longest(*(a.rules for a in cluster)),
+ variations=_longest(*(a.variations for a in cluster)),
+ category=rep.category,
+ subcategory=rep.subcategory,
+ content_type=rep.content_type,
+ source_file=rep.source_file,
+ page_reference=rep.page_reference,
+ source_excerpt=rep.source_excerpt,
+ age_group_min=rep.age_group_min,
+ age_group_max=rep.age_group_max,
+ participants_min=rep.participants_min,
+ participants_max=rep.participants_max,
+ duration_min=rep.duration_min,
+ duration_max=rep.duration_max,
+ materials_category=rep.materials_category,
+ materials_list=_union_csv([a.materials_list for a in cluster]),
+ skills_developed=_union_csv([a.skills_developed for a in cluster]),
+ difficulty_level=rep.difficulty_level,
+ keywords=_union_csv([a.keywords for a in cluster]),
+ language=rep.language,
+ extraction_confidence=rep.extraction_confidence,
+ )
+ # NOTE(review): needs_review is not passed to the constructor, so a flag
+ # set on a cluster member is not carried over to the merged row — confirm
+ # that is intended.
+ # union of tags
+ tags: list[str] = []
+ for a in cluster:
+ for t in a.tags or []:
+ if t not in tags:
+ tags.append(t)
+ merged.tags = tags
+ # accumulate every source the activity was seen in
+ sources: list[str] = []
+ for a in cluster:
+ for s in [a.source_file, *(a.source_files or [])]:
+ if s and s not in sources:
+ sources.append(s)
+ merged.source_files = sources
+ # popularity_score++ per merged duplicate (plan §4)
+ # assumes every member has a numeric (non-None) popularity_score — TODO confirm
+ merged.popularity_score = max(a.popularity_score for a in cluster) + (len(cluster) - 1)
+ return merged
+
+
+def dedup_activities(activities: list[Activity]) -> tuple[list[Activity], dict]:
+ """
+ Dedup per plan D5.
+
+ Groups by (normalized_name, language) — different languages are NEVER
+ merged. Within a group, descriptions are clustered with rapidfuzz:
+ >= 85 -> same cluster (auto-merge)
+ 60-85 -> borderline: kept as separate clusters, both flagged needs_review
+ < 60 -> separate variants
+
+ Returns (deduplicated activities, stats) where stats has the keys
+ `input`, `auto_merged` (members folded into another row), `borderline`
+ (number of output rows flagged needs_review) and `output`.
+ """
+ # Imported here rather than at module level, so rapidfuzz is only needed
+ # when dedup actually runs.
+ from rapidfuzz import fuzz
+
+ groups: dict[tuple, list[Activity]] = defaultdict(list)
+ for act in activities:
+ key = (act.normalized_name or normalize_name(act.name), act.language)
+ groups[key].append(act)
+
+ result: list[Activity] = []
+ stats = {"input": len(activities), "auto_merged": 0, "borderline": 0, "output": 0}
+
+ for members in groups.values():
+ clusters: list[list[Activity]] = []
+ borderline_idx: set[int] = set()
+
+ for act in members:
+ best_idx, best_score = -1, -1.0
+ borderline_here: list[int] = []
+ for idx, cluster in enumerate(clusters):
+ # NOTE(review): similarity is measured against the FIRST member of
+ # each cluster only, so the outcome depends on input order.
+ score = fuzz.token_sort_ratio(
+ act.description or "", cluster[0].description or ""
+ )
+ if score >= AUTO_MERGE_THRESHOLD:
+ if score > best_score:
+ best_idx, best_score = idx, score
+ elif score >= BORDERLINE_THRESHOLD:
+ borderline_here.append(idx)
+ if best_idx >= 0:
+ # An auto-merge wins; borderline hits against other clusters are
+ # discarded for this activity.
+ clusters[best_idx].append(act)
+ else:
+ clusters.append([act])
+ new_idx = len(clusters) - 1
+ # presumably only reached when a new cluster was just created
+ # (new_idx is assigned in the else branch) — confirm
+ for bidx in borderline_here:
+ borderline_idx.add(bidx)
+ borderline_idx.add(new_idx)
+
+ for idx, cluster in enumerate(clusters):
+ merged = merge_cluster(cluster)
+ if len(cluster) > 1:
+ stats["auto_merged"] += len(cluster) - 1
+ if idx in borderline_idx:
+ merged.needs_review = 1
+ stats["borderline"] += 1
+ result.append(merged)
+
+ stats["output"] = len(result)
+ return result, stats
+
+
+# --------------------------------------------------------------------------
+# step 5 — review decisions
+# --------------------------------------------------------------------------
+def load_review_decisions(path: Path) -> dict:
+ """Load the review-decisions JSON file at `path` as a dict.
+
+ Returns {} when `path` is falsy, is not a file, cannot be read, is not
+ valid JSON, or its top-level value is not an object.
+ """
+ if path and path.is_file():
+ try:
+ data = json.loads(path.read_text(encoding="utf-8"))
+ if isinstance(data, dict):
+ return data
+ except (json.JSONDecodeError, OSError):
+ # NOTE(review): a corrupt or unreadable decisions file is silently
+ # treated like a missing one, so earlier decisions are ignored without
+ # any warning. A UnicodeDecodeError from read_text() is not caught here.
+ pass
+ return {}
+
+
+def apply_review_decisions(
+ activities: list[Activity], decisions: dict
+) -> tuple[list[Activity], dict]:
+ """
+ Apply data/review_decisions.json (plan §5c).
+
+ Keyed by the stable content_key. A decision of `drop` removes the row;
+ `keep-separate` / `merge` clear needs_review (the user has resolved it).
+ Rows with no decision keep needs_review and resurface in the queue.
+
+ Returns (kept activities, stats) where stats counts `dropped` and
+ `resolved` rows. Kept activities are modified in place (needs_review).
+ """
+ kept: list[Activity] = []
+ stats = {"dropped": 0, "resolved": 0}
+ for act in activities:
+ key = content_key(
+ act.normalized_name or normalize_name(act.name),
+ act.language,
+ act.description or "",
+ )
+ # An entry is either {"decision": ...} or the bare decision string.
+ entry = decisions.get(key)
+ decision = entry.get("decision") if isinstance(entry, dict) else entry
+ if decision == "drop":
+ stats["dropped"] += 1
+ continue
+ # NOTE(review): `merge` only clears the flag here — no rows are actually
+ # merged by this function.
+ if decision in ("keep-separate", "merge"):
+ act.needs_review = 0
+ stats["resolved"] += 1
+ kept.append(act)
+ return kept, stats
+
+
+# --------------------------------------------------------------------------
+# golden-set recall (plan §7)
+# --------------------------------------------------------------------------
+def _golden_names(data: Any) -> list[str]:
+ """Extract activity names from one parsed golden-set file.
+
+ `data` may be a list, or a dict whose `activities` key holds the list
+ (a dict without that key is iterated itself, i.e. over its keys). Each
+ item is either a bare name string or a dict with a truthy `name`;
+ anything else is ignored.
+ """
+ items = data.get("activities", data) if isinstance(data, dict) else data
+ names: list[str] = []
+ for item in items or []:
+ if isinstance(item, str):
+ names.append(item)
+ elif isinstance(item, dict) and item.get("name"):
+ names.append(item["name"])
+ return names
+
+
def golden_recall(golden_dir: Path, activities: list[Activity]) -> Optional[dict]:
    """
    Measure how many golden-set names are present among `activities`.

    Every ``*.json`` file directly inside `golden_dir` is parsed and its
    names (see `_golden_names`) are compared, after `normalize_name`, with
    the normalized names of `activities`. Files that cannot be read, decoded
    or parsed are skipped.

    Returns ``None`` when `golden_dir` is falsy, is not a directory, or
    yields no expected names; otherwise
    ``{"expected": int, "found": int, "recall": found / expected}`` with the
    recall rounded to 3 decimals.
    """
    if not golden_dir or not golden_dir.is_dir():
        return None
    found = {normalize_name(a.name) for a in activities}
    expected, hits = 0, 0
    for gf in sorted(golden_dir.glob("*.json")):
        try:
            data = json.loads(gf.read_text(encoding="utf-8"))
        except (ValueError, OSError):
            # ValueError = json.JSONDecodeError or UnicodeDecodeError. One
            # unreadable golden file must not abort the whole QA report.
            continue
        for name in _golden_names(data):
            expected += 1
            if normalize_name(name) in found:
                hits += 1
    if expected == 0:
        return None
    return {"expected": expected, "found": hits, "recall": round(hits / expected, 3)}
+
+
+# --------------------------------------------------------------------------
+# load + validate + excerpt-check the extraction files
+# --------------------------------------------------------------------------
def collect_activities(
    extracted_dir: Path,
    chunks_dir: Path,
    sources_dir: Path,
    schema: dict,
) -> dict:
    """
    Validate, excerpt-check and convert every extraction file.

    For each file yielded by `iter_extraction_files(extracted_dir)`:

    * a file that is not decodable/parsable as JSON, or that fails
      `validate_extraction`, is moved to ``<extracted_dir>/_rejected`` with
      an error log and counted under ``files_rejected_schema``;
    * each activity whose ``source_excerpt`` does not match the chunk text
      is dropped, counted under ``activities_hallucinated`` and logged;
      when the chunk text is unavailable (``None``) the excerpt cannot be
      verified and the activity is kept;
    * every surviving activity is converted with `dict_to_activity`, using
      its own ``source_file`` or a fallback derived from the header.

    Returns the report dict (file/activity counters plus
    ``category_fallbacks``) with the converted activities stored under the
    extra key ``"activities"``.
    """
    # Function-scoped imports, resolved once up front rather than once per
    # extraction file.
    from import_common import chunk_key_for, validate_extraction

    rejected_dir = extracted_dir / "_rejected"
    activities: list[Activity] = []
    report = {
        "files_total": 0,
        "files_valid": 0,
        "files_rejected_schema": 0,
        "activities_raw": 0,
        "activities_hallucinated": 0,
        "category_fallbacks": [],
    }
    raw_categories: list[tuple[str, str]] = []

    for json_path in iter_extraction_files(extracted_dir):
        report["files_total"] += 1
        try:
            data = json.loads(json_path.read_text(encoding="utf-8"))
        except ValueError as exc:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError:
            # a file that is not valid UTF-8 is rejected like any other
            # unparseable file instead of aborting the whole run.
            _reject_file(json_path, rejected_dir, [f"invalid JSON: {exc}"])
            report["files_rejected_schema"] += 1
            continue

        errors = validate_extraction(data, schema)
        if errors:
            _reject_file(json_path, rejected_dir, errors)
            report["files_rejected_schema"] += 1
            continue
        report["files_valid"] += 1

        header = data.get("header", {})
        chunk_text = find_chunk_text(json_path, header, chunks_dir)
        # Without an explicit source_id, derive it from the chunk key by
        # cutting off the trailing ".partNN" component.
        source_id = header.get("source_id") or chunk_key_for(
            json_path, header
        ).rsplit(".part", 1)[0]
        fallback_source = (
            source_path_for(source_id, sources_dir) or source_id or json_path.stem
        )

        hallucinated: list[dict] = []
        for adict in data.get("activities", []):
            report["activities_raw"] += 1
            excerpt = adict.get("source_excerpt") or ""
            # if the chunk text is unavailable we cannot verify — keep but the
            # QA report still counts it under activities_raw.
            if chunk_text is not None and not excerpt_matches(excerpt, chunk_text):
                hallucinated.append(adict)
                report["activities_hallucinated"] += 1
                continue
            src = adict.get("source_file") or fallback_source
            raw_category = adict.get("category", "")
            raw_categories.append((raw_category, normalize_category(raw_category)))
            activities.append(dict_to_activity(adict, src))

        if hallucinated:
            _log_hallucinations(json_path, rejected_dir, hallucinated)

    report["category_fallbacks"] = log_category_fallbacks(raw_categories)
    report["activities"] = activities
    return report
+
+
+def _reject_file(json_path: Path, rejected_dir: Path, errors: list[str]) -> None:
+ rejected_dir.mkdir(parents=True, exist_ok=True)
+ dest = rejected_dir / json_path.name
+ shutil.move(str(json_path), str(dest))
+ log = rejected_dir / f"{json_path.stem}.errors.txt"
+ log.write_text(
+ f"REJECTED (schema validation): {json_path.name}\n\n"
+ + "\n".join(f" - {e}" for e in errors)
+ + "\n",
+ encoding="utf-8",
+ )
+
+
+def _log_hallucinations(
+ json_path: Path, rejected_dir: Path, hallucinated: list[dict]
+) -> None:
+ rejected_dir.mkdir(parents=True, exist_ok=True)
+ log = rejected_dir / f"{json_path.stem}.hallucinations.txt"
+ lines = [f"DROPPED activities (source_excerpt not found in chunk): {json_path.name}", ""]
+ for a in hallucinated:
+ lines.append(f" - {a.get('name')!r}")
+ lines.append(f" excerpt: {a.get('source_excerpt')!r}")
+ log.write_text("\n".join(lines) + "\n", encoding="utf-8")
+
+
+# --------------------------------------------------------------------------
+# DB write + atomic swap
+# --------------------------------------------------------------------------
def _enrich_category_display_names(db_path: Path) -> None:
    """
    Fill `display_name` for every category row of the SQLite DB at `db_path`.

    Reads each `value` (slug) from rows of `categories` whose type is
    'category', sets `display_name` to `category_display_name(slug)`, and
    commits. The connection is closed even when an error occurs.
    """
    import sqlite3

    conn = sqlite3.connect(db_path)
    try:
        slugs = [
            row[0]
            for row in conn.execute(
                "SELECT value FROM categories WHERE type = 'category'"
            )
        ]
        conn.executemany(
            "UPDATE categories SET display_name = ? WHERE type='category' AND value = ?",
            [(category_display_name(slug), slug) for slug in slugs],
        )
        conn.commit()
    finally:
        conn.close()
+
+
def write_database(db_tmp_path: Path, activities: list[Activity]) -> None:
    """
    Build the temporary database file from scratch.

    Any existing file at `db_tmp_path` is deleted first; then a
    `DatabaseManager` is opened on that path, `activities` are bulk
    inserted, category display names are filled in, and the FTS index is
    rebuilt.
    """
    leftover_present = db_tmp_path.exists()
    if leftover_present:
        # never append to a half-built file from an earlier run
        db_tmp_path.unlink()

    manager = DatabaseManager(str(db_tmp_path))
    manager.bulk_insert_activities(activities)
    _enrich_category_display_names(db_tmp_path)
    manager.rebuild_fts_index()
+
+
def atomic_swap(db_tmp_path: Path, db_path: Path) -> Optional[Path]:
    """
    Replace the live DB with the freshly built temporary file.

    When `db_path` already exists it is first copied (with metadata) to the
    same path with ``.bak`` appended to its suffix. `db_tmp_path` is then
    moved over `db_path` with `os.replace`.

    Returns the backup path, or ``None`` when there was no live DB to back up.
    """
    if db_path.exists():
        backup: Optional[Path] = db_path.with_suffix(db_path.suffix + ".bak")
        shutil.copy2(db_path, backup)
    else:
        backup = None
    os.replace(db_tmp_path, db_path)
    return backup
+
+
+# --------------------------------------------------------------------------
+# orchestration
+# --------------------------------------------------------------------------
def rebuild(
    *,
    extracted_dir: Path,
    chunks_dir: Path,
    sources_dir: Path,
    db_path: Path,
    decisions_path: Optional[Path] = None,
    schema_path: Path = DEFAULT_SCHEMA_PATH,
    golden_dir: Optional[Path] = None,
    do_swap: bool = True,
) -> dict:
    """
    Run the full rebuild pipeline and return the combined report.

    Steps: load the schema, collect and validate the extraction files,
    deduplicate, apply the review decisions, write everything into
    ``<db_path>.tmp`` and, when `do_swap` is true, swap that file over
    `db_path`. The live DB file is only touched by that final swap. If
    writing or swapping raises, the temporary file is removed and the
    exception is re-raised.

    The returned dict holds the collection counters, the ``dedup`` and
    ``decisions`` stats, ``final_count``, ``backup`` (path string or None),
    ``swapped`` and the ``qa`` report.
    """
    extracted_root = Path(extracted_dir)
    live_db = Path(db_path)
    staging_db = live_db.with_suffix(live_db.suffix + ".tmp")

    schema = load_schema(schema_path)
    collected = collect_activities(
        extracted_root, Path(chunks_dir), Path(sources_dir), schema
    )
    raw_activities: list[Activity] = collected.pop("activities")

    deduped, dedup_stats = dedup_activities(raw_activities)

    if decisions_path:
        decisions = load_review_decisions(Path(decisions_path))
    else:
        decisions = {}
    final, decision_stats = apply_review_decisions(deduped, decisions)

    backup = None
    try:
        write_database(staging_db, final)
        if do_swap:
            backup = atomic_swap(staging_db, live_db)
    except Exception:
        # do not leave a half-written staging file behind
        if staging_db.exists():
            staging_db.unlink()
        raise

    return {
        **collected,
        "dedup": dedup_stats,
        "decisions": decision_stats,
        "final_count": len(final),
        "backup": str(backup) if backup else None,
        "swapped": do_swap,
        "qa": _qa_report(final, collected, golden_dir),
    }
+
+
+def _qa_report(
+ activities: list[Activity], collected: dict, golden_dir: Optional[Path]
+) -> dict:
+ per_category: dict[str, int] = defaultdict(int)
+ per_content_type: dict[str, int] = defaultdict(int)
+ confidence: dict[str, int] = defaultdict(int)
+ with_rules = 0
+ for a in activities:
+ per_category[a.category] += 1
+ per_content_type[a.content_type or "?"] += 1
+ confidence[a.extraction_confidence or "?"] += 1
+ if a.rules and a.rules.strip():
+ with_rules += 1
+ raw = collected.get("activities_raw", 0)
+ hallucinated = collected.get("activities_hallucinated", 0)
+ return {
+ "total": len(activities),
+ "per_category": dict(per_category),
+ "per_content_type": dict(per_content_type),
+ "extraction_confidence": dict(confidence),
+ "pct_with_rules": round(100 * with_rules / len(activities), 1) if activities else 0.0,
+ "needs_review": sum(1 for a in activities if a.needs_review),
+ "hallucination_rate": round(100 * hallucinated / raw, 2) if raw else 0.0,
+ "golden_recall": golden_recall(Path(golden_dir), activities) if golden_dir else None,
+ }
+
+
def print_report(report: dict) -> None:
    """
    Print the rebuild report returned by `rebuild` to stdout.

    Sections, in order: file counters, raw/hallucinated activity counters,
    dedup stats, review decisions, final count, rules percentage,
    needs-review count, the per-category and per-content-type tallies
    (largest first), the extraction-confidence tally (sorted by label), and
    - only when present - golden recall, category fallbacks and the backup
    path.
    """
    qa = report["qa"]
    rule = "=" * 60

    def _tally(title: str, rows) -> None:
        # one heading line followed by left-aligned "label: count" rows
        print(title)
        for label, count in rows:
            print(f" {label:<24}: {count}")

    def _largest_first(counts: dict) -> list:
        return sorted(counts.items(), key=lambda kv: -kv[1])

    print(rule)
    print("BUILD DATABASE — QA REPORT")
    print(rule)
    print(f"extraction files : {report['files_total']} "
          f"(valid {report['files_valid']}, schema-rejected {report['files_rejected_schema']})")
    print(f"activities raw : {report['activities_raw']}")
    print(f" hallucinated drop : {report['activities_hallucinated']} "
          f"({qa['hallucination_rate']}%)")
    dedup = report["dedup"]
    print(f"dedup : {dedup['input']} -> {dedup['output']} "
          f"(auto-merged {dedup['auto_merged']}, borderline {dedup['borderline']})")
    print(f"review decisions : dropped {report['decisions']['dropped']}, "
          f"resolved {report['decisions']['resolved']}")
    print(f"final inserted : {report['final_count']}")
    print(f"% with rules : {qa['pct_with_rules']}")
    print(f"needs_review rows : {qa['needs_review']}")

    _tally("per category :", _largest_first(qa["per_category"]))
    _tally("per content_type :", _largest_first(qa["per_content_type"]))
    _tally("extraction_confidence:", sorted(qa["extraction_confidence"].items()))

    recall = qa["golden_recall"]
    if recall:
        print(f"golden recall : {recall['found']}/{recall['expected']} = {recall['recall']}")
    if report["category_fallbacks"]:
        print("category fallbacks :")
        for msg in report["category_fallbacks"]:
            print(f" {msg}")
    if report["backup"]:
        print(f"live DB backed up to : {report['backup']}")
    print(rule)
+
+
+# --------------------------------------------------------------------------
+# CLI
+# --------------------------------------------------------------------------
def main(argv: Optional[list[str]] = None) -> int:
    """
    Command-line entry point: parse arguments, run `rebuild`, print the report.

    `--rebuild` is mandatory; without it the parser exits with an error.
    All path options default to locations under ``data/``. Returns 0 on
    success.
    """
    parser = argparse.ArgumentParser(description="Build activities.db from extraction JSON.")
    parser.add_argument("--rebuild", action="store_true",
                        help="rebuild the database from scratch (only mode supported)")
    path_options = (
        ("--extracted", "data/extracted"),
        ("--chunks", "data/chunks"),
        ("--sources", "data/sources"),
        ("--db", "data/activities.db"),
        ("--decisions", "data/review_decisions.json"),
        ("--golden", "data/golden"),
        ("--schema", str(DEFAULT_SCHEMA_PATH)),
    )
    for flag, default in path_options:
        parser.add_argument(flag, default=default)
    args = parser.parse_args(argv)

    if not args.rebuild:
        parser.error("only --rebuild is supported (full rebuild, no incremental merge)")

    print_report(
        rebuild(
            extracted_dir=Path(args.extracted),
            chunks_dir=Path(args.chunks),
            sources_dir=Path(args.sources),
            db_path=Path(args.db),
            decisions_path=Path(args.decisions),
            schema_path=Path(args.schema),
            golden_dir=Path(args.golden),
        )
    )
    return 0
+
+
+if __name__ == "__main__":
+ raise SystemExit(main())
diff --git a/scripts/chunk_sources.py b/scripts/chunk_sources.py
new file mode 100644
index 0000000..0844b10
--- /dev/null
+++ b/scripts/chunk_sources.py
@@ -0,0 +1,251 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+chunk_sources.py — split normalized data/sources/*.txt into ~20-page chunks
+for subagent extraction, and maintain data/chunks/manifest.json.
+
+Paginated text → ~20-page chunks, ~4-page overlap (plan D8).
+Unpaginated text → ~10000-word windows, ~2000-word overlap.
+
+The manifest is a cache derived from the filesystem + per-chunk state. Re-running
+this script is idempotent: existing chunk states (pending/assigned/done/rejected)
+survive as long as the source content hash is unchanged.
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import sys
+from pathlib import Path
+
+SCRIPT_DIR = Path(__file__).resolve().parent
+if str(SCRIPT_DIR) not in sys.path:
+ sys.path.insert(0, str(SCRIPT_DIR))
+
+from extract_common import content_hash, split_pages # noqa: E402
+
# Version tag stored in manifest.json under "schema_version".
SCHEMA_VERSION = "1.0"
# Paginated sources: pages per chunk, and pages shared by neighbouring chunks.
PAGES_PER_CHUNK = 20
PAGE_OVERLAP = 4
# Unpaginated sources: words per window, and words shared by neighbouring windows.
WORD_WINDOW = 10_000
WORD_OVERLAP = 2_000

# Chunk states that are carried over on a re-run when the source hash is unchanged.
VALID_STATES = {"pending", "assigned", "done", "rejected"}
+
+
+# --------------------------------------------------------------------------
+# header parsing
+# --------------------------------------------------------------------------
def parse_source(text: str) -> tuple[dict, str]:
    """
    Split a normalized source file into ``(header_dict, body)``.

    Lines are scanned from the top. Until a rule line (a line made only of
    ``=`` characters) is met, every line containing ``:`` contributes a
    ``key: value`` header entry. The body starts right after the rule line,
    or at the first ``--- PAGE `` marker line if one is found (that marker
    line is part of the body). With neither, the body is the whole text.
    """
    lines = text.splitlines()
    header: dict = {}
    body_start = 0
    header_open = True
    for index, raw in enumerate(lines):
        if raw.startswith("--- PAGE "):
            # first page marker: the body begins here, stop scanning
            body_start = index
            break
        if header_open:
            stripped = raw.strip()
            if stripped and not stripped.strip("="):
                # non-empty and nothing but "=" -> the header's closing rule
                body_start = index + 1
                header_open = False
            elif ":" in raw:
                key, _, value = raw.partition(":")
                header[key.strip()] = value.strip()
    return header, "\n".join(lines[body_start:])
+
+
+# --------------------------------------------------------------------------
+# chunking — pure functions
+# --------------------------------------------------------------------------
def chunk_pages(
    pages: list[tuple[int, str]],
    pages_per_chunk: int = PAGES_PER_CHUNK,
    overlap: int = PAGE_OVERLAP,
) -> list[dict]:
    """
    Split an ordered list of ``(page_no, text)`` into overlapping chunks.

    Windows of `pages_per_chunk` pages are taken every
    ``max(1, pages_per_chunk - overlap)`` pages; the last window may be
    shorter. Each chunk dict carries ``page_start``, ``page_end``,
    ``chunk_range`` ("pages A-B") and ``text``, where every page is
    re-emitted under its own ``--- PAGE N ---`` marker. An empty input
    yields ``[]``.
    """
    if not pages:
        return []
    step = max(1, pages_per_chunk - overlap)
    total = len(pages)
    result: list[dict] = []
    for start in range(0, total, step):
        window = pages[start : start + pages_per_chunk]
        first_no = window[0][0]
        last_no = window[-1][0]
        rendered = [f"\n--- PAGE {num} ---\n{txt}\n" for num, txt in window]
        result.append(
            {
                "page_start": first_no,
                "page_end": last_no,
                "chunk_range": f"pages {first_no}-{last_no}",
                "text": "".join(rendered),
            }
        )
        if start + pages_per_chunk >= total:
            # this window already reached the final page
            break
    return result
+
+
def chunk_words(
    text: str, window: int = WORD_WINDOW, overlap: int = WORD_OVERLAP
) -> list[dict]:
    """
    Split unpaginated text into overlapping word windows.

    The text is split on whitespace; windows of `window` words are taken
    every ``max(1, window - overlap)`` words and re-joined with single
    spaces. Each chunk dict carries ``word_start``, ``word_end`` (exclusive
    end index), ``chunk_range`` ("words A-B") and ``text``. Text with no
    words yields ``[]``.
    """
    tokens = text.split()
    if not tokens:
        return []
    step = max(1, window - overlap)
    total = len(tokens)
    result: list[dict] = []
    for start in range(0, total, step):
        segment = tokens[start : start + window]
        stop = start + len(segment)
        result.append(
            {
                "word_start": start,
                "word_end": stop,
                "chunk_range": f"words {start}-{stop}",
                "text": " ".join(segment),
            }
        )
        if start + window >= total:
            # this window already reached the final word
            break
    return result
+
+
def make_chunks(source_text: str) -> list[dict]:
    """
    Chunk one normalized source file.

    The header is discarded; if the body contains page markers it is chunked
    by pages, otherwise by word windows.
    """
    body = parse_source(source_text)[1]
    pages = split_pages(body)
    return chunk_pages(pages) if pages else chunk_words(body)
+
+
+# --------------------------------------------------------------------------
+# manifest
+# --------------------------------------------------------------------------
def _empty_manifest() -> dict:
    """Return a fresh manifest: current schema version and no chunks."""
    return dict(schema_version=SCHEMA_VERSION, chunks={})
+
+
def load_manifest(manifest_path: Path) -> dict:
    """
    Load manifest.json, falling back to an empty manifest.

    Returns the parsed manifest with ``schema_version`` defaulted and
    ``chunks`` guaranteed to be a dict. A missing, unreadable, undecodable or
    non-JSON file - or JSON whose top-level value is not an object - yields
    a fresh empty manifest instead.
    """
    if manifest_path.exists():
        try:
            data = json.loads(manifest_path.read_text(encoding="utf-8"))
        except (ValueError, OSError):
            # ValueError = json.JSONDecodeError or UnicodeDecodeError
            data = None
        if isinstance(data, dict):
            data.setdefault("schema_version", SCHEMA_VERSION)
            if not isinstance(data.get("chunks"), dict):
                # callers index into manifest["chunks"] as a mapping
                data["chunks"] = {}
            return data
    return _empty_manifest()
+
+
def save_manifest(manifest: dict, manifest_path: Path) -> None:
    """
    Write `manifest` to `manifest_path` as indented UTF-8 JSON.

    Parent directories are created as needed; non-ASCII characters are
    written as-is rather than escaped.
    """
    manifest_path.parent.mkdir(parents=True, exist_ok=True)
    payload = json.dumps(manifest, indent=2, ensure_ascii=False)
    manifest_path.write_text(payload, encoding="utf-8")
+
+
def chunk_source_file(
    source_path: Path, chunks_dir: Path, manifest: dict
) -> list[str]:
    """
    Chunk one source file and register its chunks in `manifest`.

    ``<source_id>.txt`` (the stem of `source_path` is the source id) is
    split with `make_chunks`; chunk N is written to
    ``<chunks_dir>/<source_id>/<source_id>.partNN.txt`` and recorded under
    the key ``<source_id>.partNN``. A chunk keeps its previous state only if
    the manifest already holds that key with the same source hash and a
    valid state; otherwise it is reset to "pending".

    Returns the list of chunk keys written, in order.
    """
    source_id = source_path.stem
    text = source_path.read_text(encoding="utf-8", errors="replace")
    src_hash = content_hash(text)
    pieces = make_chunks(text)

    out_dir = chunks_dir / source_id
    out_dir.mkdir(parents=True, exist_ok=True)

    keys: list[str] = []
    for part_no, piece in enumerate(pieces, start=1):
        key = f"{source_id}.part{part_no:02d}"
        chunk_file = out_dir / f"{key}.txt"
        chunk_file.write_text(piece["text"], encoding="utf-8")

        previous = manifest["chunks"].get(key)
        # carry the old state over only when the source content is unchanged
        carry_over = (
            bool(previous)
            and previous.get("source_hash") == src_hash
            and previous.get("state") in VALID_STATES
        )

        manifest["chunks"][key] = {
            "source_id": source_id,
            "source_hash": src_hash,
            "part": part_no,
            "chunk_range": piece["chunk_range"],
            "chunk_file": str(chunk_file.relative_to(chunks_dir.parent)),
            "expected_json": f"{key}.json",
            "state": previous["state"] if carry_over else "pending",
        }
        keys.append(key)
    return keys
+
+
def prune_stale(manifest: dict, live_keys: set[str]) -> list[str]:
    """
    Remove manifest entries whose key is not in `live_keys`.

    Mutates ``manifest["chunks"]`` in place and returns the removed keys.
    """
    registry = manifest["chunks"]
    stale = [key for key in registry if key not in live_keys]
    for key in stale:
        registry.pop(key)
    return stale
+
+
+# --------------------------------------------------------------------------
+# CLI
+# --------------------------------------------------------------------------
def run(sources_dir: Path, chunks_dir: Path) -> dict:
    """
    Chunk every ``*.txt`` directly inside `sources_dir`.

    Loads ``<chunks_dir>/manifest.json``, re-chunks each source in sorted
    order, drops manifest entries that were not produced by this run, and
    saves the manifest. Returns a summary with the number of sources,
    chunks written, entries pruned, and a per-state chunk count.
    """
    manifest_path = chunks_dir / "manifest.json"
    manifest = load_manifest(manifest_path)

    source_files = sorted(sources_dir.glob("*.txt"))
    live_keys: set[str] = set()
    for source_file in source_files:
        for key in chunk_source_file(source_file, chunks_dir, manifest):
            live_keys.add(key)

    pruned = prune_stale(manifest, live_keys)
    save_manifest(manifest, manifest_path)

    tally: dict[str, int] = {}
    for entry in manifest["chunks"].values():
        state = entry["state"]
        tally[state] = tally.get(state, 0) + 1

    return {
        "sources": len(source_files),
        "chunks": len(live_keys),
        "pruned": len(pruned),
        "states": tally,
    }
+
+
def main(argv: list[str] | None = None) -> int:
    """
    Command-line entry point: chunk the sources and print a summary.

    `--sources` and `--chunks` default to ``data/sources`` and
    ``data/chunks``. Returns 0.
    """
    parser = argparse.ArgumentParser(description="Chunk normalized sources.")
    parser.add_argument("--sources", default="data/sources", help="sources dir")
    parser.add_argument("--chunks", default="data/chunks", help="chunks output dir")
    options = parser.parse_args(argv)

    summary = run(Path(options.sources), Path(options.chunks))
    print(f"sources processed : {summary['sources']}")
    print(f"chunks written : {summary['chunks']}")
    print(f"stale pruned : {summary['pruned']}")
    per_state = sorted(summary["states"].items())
    for state, count in per_state:
        print(f" {state:<10}: {count}")
    return 0
+
+
+if __name__ == "__main__":
+ raise SystemExit(main())
diff --git a/scripts/claude_extraction_template.md b/scripts/claude_extraction_template.md
deleted file mode 100644
index f2137d1..0000000
--- a/scripts/claude_extraction_template.md
+++ /dev/null
@@ -1,54 +0,0 @@
-# TEMPLATE PENTRU EXTRACȚIE ACTIVITĂȚI CU CLAUDE
-
-## Instrucțiuni pentru Claude Code:
-
-Pentru fiecare PDF/DOC, folosește următorul format de extracție:
-
-### 1. Citește fișierul:
-```
-Claude, te rog citește fișierul: [CALE_FISIER]
-```
-
-### 2. Extrage activitățile folosind acest template JSON:
-```json
-{
- "source_file": "[NUME_FISIER]",
- "activities": [
- {
- "name": "Numele activității",
- "description": "Descrierea completă a activității",
- "rules": "Regulile jocului/activității",
- "variations": "Variante sau adaptări",
- "category": "[A-H] bazat pe tip",
- "age_group_min": 6,
- "age_group_max": 14,
- "participants_min": 4,
- "participants_max": 20,
- "duration_min": 10,
- "duration_max": 30,
- "materials_list": "Lista materialelor necesare",
- "skills_developed": "Competențe dezvoltate",
- "difficulty_level": "Ușor/Mediu/Dificil",
- "keywords": "cuvinte cheie separate prin virgulă",
- "tags": "taguri relevante"
- }
- ]
-}
-```
-
-### 3. Salvează în fișier:
-După extracție, salvează JSON-ul în: `/scripts/extracted_activities/[NUME_FISIER].json`
-
-### 4. Priorități de procesare:
-
-**TOP PRIORITY (procesează primele):**
-1. 1000 Fantastic Scout Games.pdf
-2. Cartea Mare a jocurilor.pdf
-3. 160-de-activitati-dinamice-jocuri-pentru-team-building.pdf
-4. 101 Ways to Create an Unforgettable Camp Experience.pdf
-5. 151 Awesome Summer Camp Nature Activities.pdf
-
-**Categorii de focus:**
-- [A] Jocuri Cercetășești
-- [C] Camping & Activități Exterior
-- [G] Activități Educaționale
\ No newline at end of file
diff --git a/scripts/create_databases.py b/scripts/create_databases.py
deleted file mode 100644
index 515d3a4..0000000
--- a/scripts/create_databases.py
+++ /dev/null
@@ -1,164 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-DATABASE SETUP SCRIPT - INDEX-SISTEM-JOCURI
-
-Script pentru recrearea bazelor de date din .gitignore
-Folosește clasele DatabaseManager pentru consistență
-
-Usage:
- python scripts/create_databases.py
- python scripts/create_databases.py --clear-existing
-"""
-
-import sys
-import argparse
-from pathlib import Path
-
-# Add src to path so we can import our modules
-sys.path.append(str(Path(__file__).parent.parent / 'src'))
-
-from database import DatabaseManager
-from game_library_manager import GameLibraryManager
-
-def create_main_database(db_path: str = "data/activities.db", clear: bool = False):
- """Create the main activities database"""
- db_file = Path(db_path)
-
- if clear and db_file.exists():
- print(f"🗑️ Removing existing database: {db_path}")
- db_file.unlink()
-
- print(f"📊 Creating main database: {db_path}")
- db = DatabaseManager(db_path)
-
- # Test the database
- try:
- stats = db.get_statistics()
- print(f"✅ Database created successfully: {stats['total_activities']} activities")
- return True
- except Exception as e:
- print(f"❌ Error creating database: {e}")
- return False
-
-def create_game_library_database(db_path: str = "data/game_library.db", clear: bool = False):
- """Create the legacy game library database"""
- db_file = Path(db_path)
-
- if clear and db_file.exists():
- print(f"🗑️ Removing existing database: {db_path}")
- db_file.unlink()
-
- print(f"📊 Creating game library database: {db_path}")
- manager = GameLibraryManager(db_path)
-
- print(f"✅ Game library database created successfully")
- return True
-
-def create_test_database(db_path: str = "data/test_activities.db", clear: bool = False):
- """Create the test database"""
- db_file = Path(db_path)
-
- if clear and db_file.exists():
- print(f"🗑️ Removing existing database: {db_path}")
- db_file.unlink()
-
- print(f"📊 Creating test database: {db_path}")
- db = DatabaseManager(db_path)
-
- # Add some test data
- test_activity = {
- 'title': 'Test Activity - Setup Script',
- 'description': 'This is a test activity created by the setup script',
- 'file_path': 'test/sample.txt',
- 'file_type': 'TXT',
- 'category': 'test',
- 'age_group': '8-12 ani',
- 'participants': '5-10 persoane',
- 'duration': '15-30min',
- 'materials': 'Fără materiale',
- 'tags': '["test", "setup"]',
- 'source_text': 'Sample test content for verification'
- }
-
- try:
- db.insert_activity(test_activity)
- stats = db.get_statistics()
- print(f"✅ Test database created with sample data: {stats['total_activities']} activities")
- return True
- except Exception as e:
- print(f"❌ Error creating test database: {e}")
- return False
-
-def ensure_data_directory():
- """Ensure the data directory exists"""
- data_dir = Path("data")
- if not data_dir.exists():
- print(f"📁 Creating data directory: {data_dir}")
- data_dir.mkdir(parents=True)
- else:
- print(f"📁 Data directory exists: {data_dir}")
-
-def main():
- """Main setup function"""
- parser = argparse.ArgumentParser(description='Create databases for INDEX-SISTEM-JOCURI')
- parser.add_argument('--clear-existing', '-c', action='store_true',
- help='Remove existing databases before creating new ones')
- parser.add_argument('--main-only', action='store_true',
- help='Create only the main activities database')
- parser.add_argument('--test-only', action='store_true',
- help='Create only the test database')
-
- args = parser.parse_args()
-
- print("🚀 DATABASE SETUP - INDEX-SISTEM-JOCURI")
- print("=" * 50)
-
- # Ensure data directory exists
- ensure_data_directory()
-
- success_count = 0
- total_count = 0
-
- if args.test_only:
- total_count = 1
- if create_test_database(clear=args.clear_existing):
- success_count += 1
- elif args.main_only:
- total_count = 1
- if create_main_database(clear=args.clear_existing):
- success_count += 1
- else:
- # Create all databases
- databases = [
- ("Main activities", lambda: create_main_database(clear=args.clear_existing)),
- ("Game library", lambda: create_game_library_database(clear=args.clear_existing)),
- ("Test activities", lambda: create_test_database(clear=args.clear_existing))
- ]
-
- total_count = len(databases)
-
- for name, create_func in databases:
- print(f"\n📂 Creating {name} database...")
- try:
- if create_func():
- success_count += 1
- except Exception as e:
- print(f"❌ Failed to create {name} database: {e}")
-
- print("\n" + "=" * 50)
- print(f"🎯 SUMMARY: {success_count}/{total_count} databases created successfully")
-
- if success_count == total_count:
- print("✅ All databases ready!")
- print("\nNext steps:")
- print("1. Run indexer: cd src && python indexer.py --clear-db")
- print("2. Start web app: cd src && python app.py")
- else:
- print("⚠️ Some databases failed to create. Check errors above.")
- return 1
-
- return 0
-
-if __name__ == '__main__':
- sys.exit(main())
\ No newline at end of file
diff --git a/scripts/extract_common.py b/scripts/extract_common.py
new file mode 100644
index 0000000..f9f1a37
--- /dev/null
+++ b/scripts/extract_common.py
@@ -0,0 +1,361 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+extract_common.py — single home for per-format text extraction.
+
+Every extractor returns a plain text *body* with synthetic page markers
+(`--- PAGE N ---`). The file-level header (`SOURCE:` / `CONVERTED:`) is added
+by normalize_sources.py, not here.
+
+Critical fix vs. the old pdf_to_text_converter.py: there is NO `max_pages` cap.
+Large books are extracted in full.
+"""
+
+from __future__ import annotations
+
+import hashlib
+import importlib
+import os
+import re
+import shutil
+import subprocess
+import tempfile
+import zipfile
+from pathlib import Path
+from typing import Callable
+
# Matches a whole `--- PAGE N ---` marker line (MULTILINE, trailing whitespace
# allowed); group 1 captures the page number N.
PAGE_MARKER_RE = re.compile(r"^--- PAGE (\d+) ---\s*$", re.MULTILINE)

# paragraphs per synthetic page for paginated-by-flow formats (docx)
DOCX_PARAS_PER_PAGE = 40

# formats we deliberately ignore (epub duplicates existing PDFs — plan §1)
IGNORED_EXTENSIONS = {".epub"}

# obvious junk filenames skipped during a walk
# (is_junk lower-cases the name and suffix before comparing against these)
JUNK_NAMES = {"desktop.ini", "linkuri-jocuri.txt"}
JUNK_SUFFIXES = {".bak", ".tmp", ".ini"}
+
+
+# --------------------------------------------------------------------------
+# page assembly helpers
+# --------------------------------------------------------------------------
def join_pages(pages: list[str], start: int = 1) -> str:
    """
    Concatenate page texts into one body with `--- PAGE N ---` markers.

    Pages are numbered from `start`. Each page text is stripped; a falsy
    page (``None`` or ``""``) is emitted as an empty page.
    """
    return "".join(
        f"\n--- PAGE {number} ---\n{(text or '').strip()}\n"
        for number, text in enumerate(pages, start)
    )
+
+
def split_pages(body: str) -> list[tuple[int, str]]:
    """
    Parse a body into ``[(page_number, text), ...]`` - inverse of join_pages.

    Each page's text is everything between its marker line and the next
    marker (or the end of the body), stripped. Text before the first marker
    is not returned. A body without markers yields ``[]``.
    """
    markers = list(PAGE_MARKER_RE.finditer(body))
    if not markers:
        return []
    # every page ends where the next marker starts; the last one at EOF
    stops = [m.start() for m in markers[1:]] + [len(body)]
    return [
        (int(marker.group(1)), body[marker.end() : stop].strip())
        for marker, stop in zip(markers, stops)
    ]
+
+
def count_page_markers(body: str) -> int:
    """Return how many `--- PAGE N ---` marker lines `body` contains."""
    return sum(1 for _ in PAGE_MARKER_RE.finditer(body))
+
+
+# --------------------------------------------------------------------------
+# format detection
+# --------------------------------------------------------------------------
# Lower-case file extension -> format key returned by detect_format().
# NOTE(review): legacy ".ppt" shares the "pptx" key — confirm that the pptx
# extractor can actually read the old binary format.
# ".epub" still gets its own key here although IGNORED_EXTENSIONS lists it.
FORMAT_BY_EXT = {
    ".pdf": "pdf",
    ".docx": "docx",
    ".doc": "doc",
    ".pptx": "pptx",
    ".ppt": "pptx",
    ".htm": "html",
    ".html": "html",
    ".zip": "zip",
    ".epub": "epub",
    ".txt": "txt",
}
+
def detect_format(path: str | os.PathLike) -> str:
    """
    Map a path to its format key using the lower-cased file extension.

    Returns "unknown" for extensions missing from FORMAT_BY_EXT.
    """
    suffix = Path(path).suffix.lower()
    try:
        return FORMAT_BY_EXT[suffix]
    except KeyError:
        return "unknown"
+
+
def is_junk(path: str | os.PathLike) -> bool:
    """
    Tell whether a file should be skipped as obvious junk.

    True when the lower-cased file name is in JUNK_NAMES, when it is a
    ``readme*.md`` file, or when the lower-cased suffix is in JUNK_SUFFIXES.
    """
    candidate = Path(path)
    lowered_name = candidate.name.lower()
    lowered_ext = candidate.suffix.lower()
    return (
        lowered_name in JUNK_NAMES
        or (lowered_name.startswith("readme") and lowered_ext == ".md")
        or lowered_ext in JUNK_SUFFIXES
    )
+
+
+# --------------------------------------------------------------------------
+# content hashing + near-duplicate elimination
+# --------------------------------------------------------------------------
+def _normalize_for_hash(text: str) -> str:
+ return re.sub(r"\s+", " ", (text or "")).strip().lower()
+
+
def content_hash(text: str) -> str:
    """
    Return the SHA1 hex digest of the whitespace-normalized, lower-cased text.

    Texts that differ only in whitespace or letter case hash identically,
    which is what exact-duplicate detection relies on.
    """
    canonical = _normalize_for_hash(text)
    digest = hashlib.sha1(canonical.encode("utf-8"))
    return digest.hexdigest()
+
+
def near_duplicate_ratio(a: str, b: str) -> float:
    """
    Score the similarity of two texts with rapidfuzz's token-sort ratio.

    Both texts are normalized (whitespace collapsed, lower-cased) before
    comparison. rapidfuzz is imported lazily, inside the function.
    """
    from rapidfuzz import fuzz

    left = _normalize_for_hash(a)
    right = _normalize_for_hash(b)
    return fuzz.token_sort_ratio(left, right)
+
+
def dedupe_texts(
    items: list[tuple[str, str]], threshold: float = 95.0
) -> list[tuple[str, str]]:
    """
    Drop exact and near-duplicate texts from a list of ``(key, text)`` pairs.

    The first occurrence wins. A pair is skipped when its content hash was
    already kept, or when its similarity to any already-kept text reaches
    `threshold`. The fuzzy comparison only runs against kept items.
    """
    survivors: list[tuple[str, str]] = []
    fingerprints: set[str] = set()
    for key, text in items:
        fingerprint = content_hash(text)
        if fingerprint in fingerprints:
            continue
        for _, kept_text in survivors:
            if near_duplicate_ratio(text, kept_text) >= threshold:
                break
        else:
            # no kept text was close enough -> this one is new
            fingerprints.add(fingerprint)
            survivors.append((key, text))
    return survivors
+
+
+# --------------------------------------------------------------------------
+# preflight dependency check
+# --------------------------------------------------------------------------
# Importable module name -> package name reported by preflight() when the
# import fails.
REQUIRED_PYTHON_MODULES = {
    "pdfplumber": "pdfplumber",
    "PyPDF2": "pypdf2",
    "docx": "python-docx",
    "pptx": "python-pptx",
    "bs4": "beautifulsoup4",
    "lxml": "lxml",
    "jsonschema": "jsonschema",
    "rapidfuzz": "rapidfuzz",
    "chardet": "chardet",
}
+
+
def preflight(check_ocr: bool = False) -> dict:
    """
    Check Python and system dependencies before a long normalization run.

    Returns ``{"ok", "missing_python", "missing_system", "warnings"}``:

    * ``missing_python`` - package names of the required modules that
      failed to import;
    * ``missing_system`` - "tesseract ..." when `check_ocr` is true and the
      binary is not on PATH;
    * ``warnings``       - a note when neither ``libreoffice`` nor
      ``soffice`` is on PATH (this does not affect ``ok``);
    * ``ok``             - true when both missing lists are empty.
    """

    def _importable(module_name: str) -> bool:
        try:
            importlib.import_module(module_name)
        except ImportError:
            return False
        return True

    missing_python: list[str] = [
        pip_name
        for module_name, pip_name in REQUIRED_PYTHON_MODULES.items()
        if not _importable(module_name)
    ]

    warnings: list[str] = []
    missing_system: list[str] = []

    office_binary = shutil.which("libreoffice") or shutil.which("soffice")
    if not office_binary:
        warnings.append("libreoffice not found — legacy .doc files cannot be converted")

    if check_ocr and not shutil.which("tesseract"):
        missing_system.append("tesseract (OCR requested but not installed)")

    all_present = not missing_python and not missing_system
    return {
        "ok": all_present,
        "missing_python": missing_python,
        "missing_system": missing_system,
        "warnings": warnings,
    }
+
+
+# --------------------------------------------------------------------------
+# per-format extractors
+# --------------------------------------------------------------------------
def extract_pdf(path: str | os.PathLike) -> str:
    """
    Extract a PDF into a page-marked body.

    pdfplumber is tried first; if it raises any exception the file is
    re-read with PyPDF2. Neither path limits the number of pages.
    """
    pdf_path = str(path)
    try:
        body = _extract_pdf_pdfplumber(pdf_path)
    except Exception:
        # deliberate broad catch: any pdfplumber failure triggers the fallback
        body = _extract_pdf_pypdf2(pdf_path)
    return body
+
+
+def _extract_pdf_pdfplumber(path: str) -> str:
+    """Extract the text of every page with pdfplumber and join it into one body."""
+    import pdfplumber
+
+    def _page_text(page) -> str:
+        # A page that fails to extract (or has no text) contributes an empty
+        # string, so later pages keep their position.
+        try:
+            return page.extract_text() or ""
+        except Exception:
+            return ""
+
+    with pdfplumber.open(path) as document:
+        texts: list[str] = [_page_text(page) for page in document.pages]  # ALL pages
+    return join_pages(texts)
+
+
+def _extract_pdf_pypdf2(path: str) -> str:
+    """Extract the text of every page with PyPDF2 and join it into one body."""
+    import PyPDF2
+
+    texts: list[str] = []
+    with open(path, "rb") as stream:
+        # ALL pages are read — there is no max_pages limit.
+        for page in PyPDF2.PdfReader(stream).pages:
+            try:
+                extracted = page.extract_text()
+            except Exception:
+                # An unreadable page still occupies a slot, as an empty page.
+                extracted = ""
+            texts.append(extracted or "")
+    return join_pages(texts)
+
+
+def extract_docx(path: str | os.PathLike) -> str:
+    """
+    docx → body.
+
+    Only `document.paragraphs` is read. Paragraph texts are grouped into
+    synthetic pages of DOCX_PARAS_PER_PAGE paragraphs each; a document with
+    no paragraphs still yields one (empty) page.
+    """
+    import docx
+
+    texts = [para.text for para in docx.Document(str(path)).paragraphs]
+    # max(..., 1) keeps the range non-empty so an empty document gets one page.
+    span = max(len(texts), 1)
+    pages: list[str] = [
+        "\n".join(texts[start : start + DOCX_PARAS_PER_PAGE])
+        for start in range(0, span, DOCX_PARAS_PER_PAGE)
+    ]
+    return join_pages(pages)
+
+
+def extract_doc(path: str | os.PathLike) -> str:
+    """
+    Legacy .doc → body, by converting to docx with headless LibreOffice.
+
+    The conversion runs `<soffice> --headless --convert-to docx` into a
+    temporary directory (5-minute timeout) and the result is handed to
+    extract_docx.
+
+    Raises RuntimeError when neither `libreoffice` nor `soffice` is on PATH,
+    or when the conversion leaves no `<stem>.docx` behind. Failures of the
+    subprocess itself (non-zero exit, timeout) propagate as raised by
+    subprocess.run.
+    """
+    office = shutil.which("libreoffice") or shutil.which("soffice")
+    if not office:
+        raise RuntimeError("libreoffice/soffice not available — cannot convert .doc")
+
+    source = Path(path).resolve()
+    with tempfile.TemporaryDirectory() as workdir:
+        command = [
+            office,
+            "--headless",
+            "--convert-to",
+            "docx",
+            "--outdir",
+            workdir,
+            str(source),
+        ]
+        subprocess.run(command, check=True, capture_output=True, timeout=300)
+        result = Path(workdir) / (source.stem + ".docx")
+        if result.exists():
+            # Read while the temporary directory still exists.
+            return extract_docx(result)
+        raise RuntimeError(f"libreoffice produced no output for {source.name}")
+
+
+def extract_pptx(path: str | os.PathLike) -> str:
+    """
+    pptx → body, one page per slide.
+
+    Each page holds the stripped text of every shape that has a text frame,
+    followed by the speaker notes (prefixed with `[NOTES] `) when the slide
+    has any. Slides without text still produce an (empty) page so slide
+    numbers and page numbers stay aligned.
+    """
+    from pptx import Presentation
+
+    presentation = Presentation(str(path))
+    pages: list[str] = []
+    for slide in presentation.slides:
+        parts: list[str] = []
+        for shape in slide.shapes:
+            if not shape.has_text_frame:
+                continue
+            text = shape.text_frame.text.strip()
+            if text:
+                parts.append(text)
+        if slide.has_notes_slide:
+            # notes_text_frame is None when the notes slide has no notes
+            # placeholder; without this guard one such slide would raise
+            # AttributeError and lose the whole presentation.
+            notes_frame = slide.notes_slide.notes_text_frame
+            notes = notes_frame.text.strip() if notes_frame is not None else ""
+            if notes:
+                parts.append(f"[NOTES] {notes}")
+        pages.append("\n".join(parts))
+    return join_pages(pages)
+
+
+def extract_html(path: str | os.PathLike) -> str:
+    """
+    HTML mirror page → body (a single page).
+
+    The file is decoded with the encoding chardet reports (UTF-8 when it
+    reports none, or when Python has no codec for the reported name), then
+    script/style/nav/footer/header/aside/noscript elements and elements with
+    a navigation/banner/contentinfo ARIA role are removed. The remaining
+    text is returned with blank lines dropped and each line stripped.
+    """
+    import chardet
+    from bs4 import BeautifulSoup
+
+    raw = Path(path).read_bytes()
+    enc = chardet.detect(raw).get("encoding") or "utf-8"
+    try:
+        markup = raw.decode(enc, errors="replace")
+    except LookupError:
+        # chardet can name an encoding Python ships no codec for; fall back
+        # to UTF-8 instead of failing the whole page.
+        markup = raw.decode("utf-8", errors="replace")
+    soup = BeautifulSoup(markup, "lxml")
+
+    for tag in soup(["script", "style", "nav", "footer", "header", "aside", "noscript"]):
+        tag.decompose()
+    # also drop common page chrome identified by ARIA role
+    for tag in soup.find_all(attrs={"role": ["navigation", "banner", "contentinfo"]}):
+        tag.decompose()
+
+    text = soup.get_text(separator="\n")
+    lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
+    return join_pages(["\n".join(lines)])
+
+
+def extract_zip(path: str | os.PathLike) -> str:
+    """
+    zip → body.
+
+    The archive is unpacked into a temporary directory and every extractable
+    inner file (in sorted path order) is converted; nested zips are handled
+    by recursion. The pages of all inner files are renumbered into one
+    continuous body.
+
+    Junk files and inner files of format "unknown" or "epub" are skipped.
+    An inner file (including a nested zip) whose extraction raises is skipped
+    too, so one bad member never discards the rest of the archive. Returns
+    "" when `path` is not a valid zip file.
+    """
+    path = str(path)
+    pages: list[str] = []
+    with tempfile.TemporaryDirectory() as tmp:
+        try:
+            with zipfile.ZipFile(path) as zf:
+                zf.extractall(tmp)
+        except zipfile.BadZipFile:
+            return ""
+        for inner in sorted(Path(tmp).rglob("*")):
+            if not inner.is_file() or is_junk(inner):
+                continue
+            fmt = detect_format(inner)
+            if fmt in ("unknown", "epub"):
+                continue
+            try:
+                # Nested archives recurse; everything else goes through the
+                # regular dispatcher. Both are guarded the same way.
+                body = extract_zip(inner) if fmt == "zip" else extract_file(inner)
+            except Exception:
+                continue
+            pages.extend(t for _, t in split_pages(body))
+    return join_pages(pages)
+
+
+# Dispatch table used by extract_file: format name (as returned by
+# detect_format) → extractor function. "txt" is deliberately absent because
+# extract_file reads plain-text files itself before consulting this table.
+EXTRACTORS: dict[str, Callable[[str | os.PathLike], str]] = {
+ "pdf": extract_pdf,
+ "docx": extract_docx,
+ "doc": extract_doc,
+ "pptx": extract_pptx,
+ "html": extract_html,
+ "zip": extract_zip,
+}
+
+
+def extract_file(path: str | os.PathLike) -> str:
+    """
+    Dispatch a single file to the right extractor and return a page-marked body.
+
+    Plain-text files are read as UTF-8 (undecodable bytes replaced); a text
+    that already contains page markers is returned unchanged, otherwise it is
+    wrapped as a single page. Every other format is looked up in EXTRACTORS.
+
+    Raises ValueError when the detected format has no extractor.
+    """
+    fmt = detect_format(path)
+    if fmt != "txt":
+        handler = EXTRACTORS.get(fmt)
+        if handler is None:
+            raise ValueError(f"No extractor for format '{fmt}': {path}")
+        return handler(path)
+
+    text = Path(path).read_text(encoding="utf-8", errors="replace")
+    if count_page_markers(text):
+        # already paginated — pass through untouched
+        return text
+    return join_pages([text])
diff --git a/scripts/html_extractor.py b/scripts/html_extractor.py
deleted file mode 100644
index 08f5898..0000000
--- a/scripts/html_extractor.py
+++ /dev/null
@@ -1,424 +0,0 @@
-#!/usr/bin/env python3
-"""
-HTML Activity Extractor - Proceseaz 1876 fiiere HTML
-Extrage automat activiti folosind pattern recognition
-"""
-
-import os
-import re
-import json
-from pathlib import Path
-from bs4 import BeautifulSoup
-import chardet
-from typing import List, Dict, Optional
-import sqlite3
-from datetime import datetime
-
-class HTMLActivityExtractor:
- def __init__(self, db_path='data/activities.db'):
- self.db_path = db_path
- # Pattern-uri pentru detectare activiti �n rom�n
- self.activity_patterns = {
- 'title_patterns': [
- r'(?i)(joc|activitate|exerci[t]iu|team[\s-]?building|energizer|ice[\s-]?breaker)[\s:]+([^\.]{5,100})',
- r'(?i)]*>([^<]*(?:joc|activitate|exerci[t]iu)[^<]*)',
- r'(?i)([^<]*(?:joc|activitate|exerci[t]iu)[^<]*)',
- r'(?i)^[\d]+\.?\s*([A-Z][^\.]{10,100}(?:joc|activitate|exerci[t]iu)[^\.]{0,50})$',
- ],
- 'description_markers': [
- 'descriere', 'reguli', 'cum se joac[a]', 'instructiuni',
- 'obiectiv', 'desfasurare', 'explicatie', 'mod de joc'
- ],
- 'materials_markers': [
- 'materiale', 'necesare', 'echipament', 'ce avem nevoie',
- 'se folosesc', 'trebuie sa avem', 'dotari'
- ],
- 'age_patterns': [
- r'(?i)v[�a]rst[a][\s:]+(\d+)[\s-]+(\d+)',
- r'(?i)(\d+)[\s-]+(\d+)\s*ani',
- r'(?i)pentru\s+(\d+)[\s-]+(\d+)\s*ani',
- r'(?i)categoria?\s*(?:de\s*)?v[�a]rst[a][\s:]+(\d+)[\s-]+(\d+)',
- ],
- 'participants_patterns': [
- r'(?i)(\d+)[\s-]+(\d+)\s*(?:participan[t]i|juc[a]tori|persoane|copii)',
- r'(?i)num[a]r\s*(?:de\s*)?(?:participan[t]i|juc[a]tori)[\s:]+(\d+)[\s-]+(\d+)',
- r'(?i)grup\s*de\s*(\d+)[\s-]+(\d+)',
- ],
- 'duration_patterns': [
- r'(?i)durat[a][\s:]+(\d+)[\s-]+(\d+)\s*(?:minute|min)',
- r'(?i)timp[\s:]+(\d+)[\s-]+(\d+)\s*(?:minute|min)',
- r'(?i)(\d+)[\s-]+(\d+)\s*minute',
- ]
- }
-
- # Categorii predefinite bazate pe sistemul existent
- self.categories = {
- '[A]': ['joc', 'joaca', 'distractie', 'amuzament'],
- '[B]': ['aventura', 'explorare', 'descoperire'],
- '[C]': ['camping', 'tabara', 'excursie', 'drumetie'],
- '[D]': ['foc', 'flacara', 'lumina'],
- '[E]': ['noduri', 'fr�nghii', 'sfori', 'legare'],
- '[F]': ['bushcraft', 'supravietuire', 'survival'],
- '[G]': ['educatie', 'educativ', 'invatare', 'scoala'],
- '[H]': ['orientare', 'busola', 'harta', 'navigare']
- }
-
- def detect_encoding(self, file_path):
- """Detecteaz encoding-ul fiierului"""
- with open(file_path, 'rb') as f:
- result = chardet.detect(f.read())
- return result['encoding'] or 'utf-8'
-
- def extract_from_html(self, html_path: str) -> List[Dict]:
- """Extrage activiti dintr-un singur fiier HTML"""
- activities = []
-
- try:
- # Detectare encoding i citire
- encoding = self.detect_encoding(html_path)
- with open(html_path, 'r', encoding=encoding, errors='ignore') as f:
- content = f.read()
-
- soup = BeautifulSoup(content, 'lxml')
-
- # Metod 1: Caut liste de activiti
- activities.extend(self._extract_from_lists(soup, html_path))
-
- # Metod 2: Caut activiti �n headings
- activities.extend(self._extract_from_headings(soup, html_path))
-
- # Metod 3: Caut pattern-uri �n text
- activities.extend(self._extract_from_patterns(soup, html_path))
-
- # Metod 4: Caut �n tabele
- activities.extend(self._extract_from_tables(soup, html_path))
-
- except Exception as e:
- print(f"Error processing {html_path}: {e}")
-
- return activities
-
- def _extract_from_lists(self, soup, source_file):
- """Extrage activiti din liste HTML (ul, ol)"""
- activities = []
-
- for list_elem in soup.find_all(['ul', 'ol']):
- # Verific dac lista pare s conin activiti
- list_text = list_elem.get_text().lower()
- if any(marker in list_text for marker in ['joc', 'activitate', 'exercitiu']):
- for li in list_elem.find_all('li'):
- text = li.get_text(strip=True)
- if len(text) > 20: # Minim 20 caractere pentru o activitate valid
- activity = self._create_activity_from_text(text, source_file)
- if activity:
- activities.append(activity)
-
- return activities
-
- def _extract_from_headings(self, soup, source_file):
- """Extrage activiti bazate pe headings"""
- activities = []
-
- for heading in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6']):
- heading_text = heading.get_text(strip=True)
-
- # Verific dac heading-ul conine cuvinte cheie
- if any(keyword in heading_text.lower() for keyword in ['joc', 'activitate', 'exercitiu']):
- # Caut descrierea �n elementele urmtoare
- description = ""
- next_elem = heading.find_next_sibling()
-
- while next_elem and next_elem.name not in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
- if next_elem.name in ['p', 'div', 'ul']:
- description += next_elem.get_text(strip=True) + " "
- if len(description) > 500: # Limit descriere
- break
- next_elem = next_elem.find_next_sibling()
-
- if description:
- activity = {
- 'name': heading_text[:200],
- 'description': description[:1000],
- 'source_file': str(source_file),
- 'category': self._detect_category(heading_text + " " + description)
- }
- activities.append(activity)
-
- return activities
-
- def _extract_from_patterns(self, soup, source_file):
- """Extrage activiti folosind pattern matching"""
- activities = []
- text = soup.get_text()
-
- # Caut pattern-uri de activiti
- for pattern in self.activity_patterns['title_patterns']:
- matches = re.finditer(pattern, text, re.MULTILINE)
- for match in matches:
- title = match.group(0) if match.lastindex == 0 else match.group(match.lastindex)
- if len(title) > 10:
- # Extrage context �n jurul match-ului
- start = max(0, match.start() - 200)
- end = min(len(text), match.end() + 500)
- context = text[start:end]
-
- activity = self._create_activity_from_text(context, source_file, title)
- if activity:
- activities.append(activity)
-
- return activities
-
- def _extract_from_tables(self, soup, source_file):
- """Extrage activiti din tabele"""
- activities = []
-
- for table in soup.find_all('table'):
- rows = table.find_all('tr')
- if len(rows) > 1: # Cel puin header i o linie de date
- # Detecteaz coloanele relevante
- headers = [th.get_text(strip=True).lower() for th in rows[0].find_all(['th', 'td'])]
-
- for row in rows[1:]:
- cells = row.find_all(['td'])
- if cells:
- activity_data = {}
- for i, cell in enumerate(cells):
- if i < len(headers):
- activity_data[headers[i]] = cell.get_text(strip=True)
-
- # Creeaz activitate din date tabel
- if any(key in activity_data for key in ['joc', 'activitate', 'nume', 'titlu']):
- activity = self._create_activity_from_table_data(activity_data, source_file)
- if activity:
- activities.append(activity)
-
- return activities
-
- def _create_activity_from_text(self, text, source_file, title=None):
- """Creeaz un dicionar de activitate din text"""
- if not text or len(text) < 30:
- return None
-
- activity = {
- 'name': title or text[:100].split('.')[0].strip(),
- 'description': text[:1000],
- 'source_file': str(source_file),
- 'category': self._detect_category(text),
- 'keywords': self._extract_keywords(text),
- 'created_at': datetime.now().isoformat()
- }
-
- # Extrage metadata suplimentar
- activity.update(self._extract_metadata(text))
-
- return activity
-
- def _create_activity_from_table_data(self, data, source_file):
- """Creeaz activitate din date de tabel"""
- activity = {
- 'source_file': str(source_file),
- 'created_at': datetime.now().isoformat()
- }
-
- # Mapare c�mpuri tabel la c�mpuri DB
- field_mapping = {
- 'nume': 'name', 'titlu': 'name', 'joc': 'name', 'activitate': 'name',
- 'descriere': 'description', 'detalii': 'description', 'explicatie': 'description',
- 'materiale': 'materials_list', 'echipament': 'materials_list',
- 'varsta': 'age_group_min', 'categoria': 'category',
- 'participanti': 'participants_min', 'numar': 'participants_min',
- 'durata': 'duration_min', 'timp': 'duration_min'
- }
-
- for table_field, db_field in field_mapping.items():
- if table_field in data:
- activity[db_field] = data[table_field]
-
- # Validare minim
- if 'name' in activity and len(activity.get('name', '')) > 5:
- return activity
-
- return None
-
- def _extract_metadata(self, text):
- """Extrage metadata din text folosind pattern-uri"""
- metadata = {}
-
- # Extrage v�rsta
- for pattern in self.activity_patterns['age_patterns']:
- match = re.search(pattern, text)
- if match:
- metadata['age_group_min'] = int(match.group(1))
- metadata['age_group_max'] = int(match.group(2)) if match.lastindex >= 2 else int(match.group(1))
- break
-
- # Extrage numr participani
- for pattern in self.activity_patterns['participants_patterns']:
- match = re.search(pattern, text)
- if match:
- metadata['participants_min'] = int(match.group(1))
- metadata['participants_max'] = int(match.group(2)) if match.lastindex >= 2 else int(match.group(1))
- break
-
- # Extrage durata
- for pattern in self.activity_patterns['duration_patterns']:
- match = re.search(pattern, text)
- if match:
- metadata['duration_min'] = int(match.group(1))
- metadata['duration_max'] = int(match.group(2)) if match.lastindex >= 2 else int(match.group(1))
- break
-
- # Extrage materiale
- materials = []
- text_lower = text.lower()
- for marker in self.activity_patterns['materials_markers']:
- idx = text_lower.find(marker)
- if idx != -1:
- # Extrage urmtoarele 200 caractere dup marker
- materials_text = text[idx:idx+200]
- # Extrage items din list
- items = re.findall(r'[-"]\s*([^\n-"]+)', materials_text)
- if items:
- materials.extend(items)
-
- if materials:
- metadata['materials_list'] = ', '.join(materials[:10]) # Maxim 10 materiale
-
- return metadata
-
- def _detect_category(self, text):
- """Detecteaz categoria activitii bazat pe cuvinte cheie"""
- text_lower = text.lower()
-
- for category, keywords in self.categories.items():
- if any(keyword in text_lower for keyword in keywords):
- return category
-
- return '[A]' # Default categoria jocuri
-
- def _extract_keywords(self, text):
- """Extrage cuvinte cheie din text"""
- keywords = []
- text_lower = text.lower()
-
- # Lista de cuvinte cheie relevante
- keyword_list = [
- 'cooperare', 'competitie', 'echipa', 'creativitate', 'miscare',
- 'strategie', 'comunicare', 'incredere', 'coordonare', 'atentie',
- 'reflexe', 'logica', 'imaginatie', 'muzica', 'dans', 'sport',
- 'natura', 'mediu', 'stiinta', 'matematica', 'limba', 'cultura'
- ]
-
- for keyword in keyword_list:
- if keyword in text_lower:
- keywords.append(keyword)
-
- return ', '.join(keywords[:5]) # Maxim 5 keywords
-
- def save_to_database(self, activities):
- """Salveaz activitile �n baza de date"""
- conn = sqlite3.connect(self.db_path)
- cursor = conn.cursor()
-
- saved_count = 0
- duplicate_count = 0
-
- for activity in activities:
- try:
- # Verific duplicate
- cursor.execute(
- "SELECT id FROM activities WHERE name = ? AND source_file = ?",
- (activity.get('name'), activity.get('source_file'))
- )
-
- if cursor.fetchone():
- duplicate_count += 1
- continue
-
- # Pregtete valorile pentru insert
- columns = []
- values = []
- placeholders = []
-
- for key, value in activity.items():
- if key != 'created_at': # Skip created_at, it has default
- columns.append(key)
- values.append(value)
- placeholders.append('?')
-
- # Insert �n DB
- query = f"INSERT INTO activities ({', '.join(columns)}) VALUES ({', '.join(placeholders)})"
- cursor.execute(query, values)
- saved_count += 1
-
- except Exception as e:
- print(f"Error saving activity: {e}")
- continue
-
- conn.commit()
- conn.close()
-
- return saved_count, duplicate_count
-
- def process_all_html_files(self, base_path='/mnt/d/GoogleDrive/Cercetasi/carti-camp-jocuri'):
- """Proceseaz toate fiierele HTML din directorul specificat"""
- base_path = Path(base_path)
- html_files = list(base_path.rglob("*.html"))
- html_files.extend(list(base_path.rglob("*.htm")))
-
- print(f"Found {len(html_files)} HTML files to process")
-
- all_activities = []
- processed = 0
- errors = 0
-
- for i, html_file in enumerate(html_files):
- try:
- activities = self.extract_from_html(str(html_file))
- all_activities.extend(activities)
- processed += 1
-
- # Progress update
- if (i + 1) % 100 == 0:
- print(f"Progress: {i+1}/{len(html_files)} files processed, {len(all_activities)} activities found")
- # Save batch to DB
- if all_activities:
- saved, dupes = self.save_to_database(all_activities)
- print(f"Batch saved: {saved} new activities, {dupes} duplicates skipped")
- all_activities = [] # Clear buffer
-
- except Exception as e:
- print(f"Error processing {html_file}: {e}")
- errors += 1
-
- # Save remaining activities
- if all_activities:
- saved, dupes = self.save_to_database(all_activities)
- print(f"Final batch saved: {saved} new activities, {dupes} duplicates skipped")
-
- print(f"\nProcessing complete!")
- print(f"Files processed: {processed}")
- print(f"Errors: {errors}")
-
- return processed, errors
-
-# Funcie main pentru test
-if __name__ == "__main__":
- extractor = HTMLActivityExtractor()
-
- # Test pe un fiier sample mai �nt�i
- print("Testing on sample file first...")
- # Gsete un fiier HTML pentru test
- test_files = list(Path('/mnt/d/GoogleDrive/Cercetasi/carti-camp-jocuri').rglob("*.html"))[:3]
-
- for test_file in test_files:
- print(f"\nTesting: {test_file}")
- activities = extractor.extract_from_html(str(test_file))
- print(f"Found {len(activities)} activities")
- if activities:
- print(f"Sample activity: {activities[0]['name'][:50]}...")
-
- # �ntreab dac s continue cu procesarea complet
- response = input("\nContinue with full processing? (y/n): ")
- if response.lower() == 'y':
- extractor.process_all_html_files()
\ No newline at end of file
diff --git a/scripts/import_claude_activities.py b/scripts/import_claude_activities.py
deleted file mode 100644
index c10141a..0000000
--- a/scripts/import_claude_activities.py
+++ /dev/null
@@ -1,78 +0,0 @@
-#!/usr/bin/env python3
-"""
-Import activities extracted by Claude from JSON files
-"""
-
-import json
-import sqlite3
-from pathlib import Path
-from datetime import datetime
-
-class ClaudeActivityImporter:
- def __init__(self, db_path='data/activities.db'):
- self.db_path = db_path
- self.json_dir = Path('scripts/extracted_activities')
- self.json_dir.mkdir(exist_ok=True)
-
- def import_json_file(self, json_path):
- """Import activities from a single JSON file"""
- with open(json_path, 'r', encoding='utf-8') as f:
- data = json.load(f)
-
- source_file = data.get('source_file', str(json_path))
- activities = data.get('activities', [])
-
- conn = sqlite3.connect(self.db_path)
- cursor = conn.cursor()
-
- imported = 0
- for activity in activities:
- try:
- # Add source file and timestamp
- activity['source_file'] = source_file
- activity['created_at'] = datetime.now().isoformat()
-
- # Prepare insert
- columns = list(activity.keys())
- values = list(activity.values())
- placeholders = ['?' for _ in values]
-
- # Check for duplicate
- cursor.execute(
- "SELECT id FROM activities WHERE name = ? AND source_file = ?",
- (activity.get('name'), source_file)
- )
-
- if not cursor.fetchone():
- query = f"INSERT INTO activities ({', '.join(columns)}) VALUES ({', '.join(placeholders)})"
- cursor.execute(query, values)
- imported += 1
-
- except Exception as e:
- print(f"Error importing activity: {e}")
-
- conn.commit()
- conn.close()
-
- print(f"Imported {imported} activities from {json_path.name}")
- return imported
-
- def import_all_json_files(self):
- """Import all JSON files from the extracted_activities directory"""
- json_files = list(self.json_dir.glob("*.json"))
-
- if not json_files:
- print("No JSON files found in extracted_activities directory")
- return 0
-
- total_imported = 0
- for json_file in json_files:
- imported = self.import_json_file(json_file)
- total_imported += imported
-
- print(f"\nTotal imported: {total_imported} activities from {len(json_files)} files")
- return total_imported
-
-if __name__ == "__main__":
- importer = ClaudeActivityImporter()
- importer.import_all_json_files()
\ No newline at end of file
diff --git a/scripts/import_common.py b/scripts/import_common.py
new file mode 100644
index 0000000..0ec3718
--- /dev/null
+++ b/scripts/import_common.py
@@ -0,0 +1,179 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+import_common.py — shared helpers for the import / validation side of the
+extraction pipeline (Lane C).
+
+Used by build_database.py and validate_extractions.py:
+ * JSON-schema validation of subagent extraction files,
+ * the anti-hallucination source_excerpt substring check (E5),
+ * locating the source chunk that an extraction file came from,
+ * the stable content key used by the needs_review queue.
+"""
+
+from __future__ import annotations
+
+import hashlib
+import json
+import re
+import unicodedata
+from pathlib import Path
+from typing import Any, Optional
+
+SCRIPT_DIR = Path(__file__).resolve().parent
+REPO_ROOT = SCRIPT_DIR.parent
+
+DEFAULT_SCHEMA_PATH = SCRIPT_DIR / "activity_schema.json"
+
+# rapidfuzz.partial_ratio is on a 0..100 scale; an excerpt counts as a real
+# quote from the source when it scores at least this against the chunk text.
+EXCERPT_MATCH_THRESHOLD = 90.0
+
+
+# --------------------------------------------------------------------------
+# schema validation
+# --------------------------------------------------------------------------
+def load_schema(schema_path: str | Path = DEFAULT_SCHEMA_PATH) -> dict:
+    """Read the activity JSON schema (produced by Lane A) from `schema_path` as UTF-8 and parse it."""
+    with Path(schema_path).open(encoding="utf-8") as handle:
+        return json.load(handle)
+
+
+def validate_extraction(data: Any, schema: dict) -> list[str]:
+    """
+    Validate one parsed extraction file against `schema` (JSON Schema draft 7).
+
+    Returns a list of human-readable error strings of the form
+    `<location>: <message>`, ordered by location; an empty list means the
+    data is valid. `<location>` is the `/`-joined path to the offending
+    element, or `<root>` for errors on the document itself.
+    """
+    import jsonschema
+
+    def _sort_key(err) -> list:
+        # Path elements are ints (array indices) or strs (property names).
+        # Tagging each with its type keeps the sort from raising TypeError
+        # when two errors differ in type at the same depth, and leaves the
+        # order unchanged when the types agree.
+        return [(isinstance(p, str), p) for p in err.path]
+
+    validator = jsonschema.Draft7Validator(schema)
+    errors: list[str] = []
+    for err in sorted(validator.iter_errors(data), key=_sort_key):
+        # An empty path means the error is on the top-level document; name it
+        # explicitly rather than emitting a bare ": message".
+        location = "/".join(str(p) for p in err.path) or "<root>"
+        errors.append(f"{location}: {err.message}")
+    return errors
+
+
+# --------------------------------------------------------------------------
+# excerpt verification (E5 — anti-hallucination)
+# --------------------------------------------------------------------------
+def _normalize_text(text: str) -> str:
+    """Collapse every whitespace run to one space, trim the ends, and lowercase; falsy input gives ""."""
+    raw = text if text else ""
+    squeezed = re.sub(r"\s+", " ", raw)
+    return squeezed.strip().lower()
+
+
+def excerpt_score(excerpt: str, chunk_text: str) -> float:
+    """
+    Best fuzzy-substring score (0..100) of `excerpt` inside `chunk_text`.
+
+    Both strings are whitespace-collapsed and lowercased first. Returns 0.0
+    when either is empty after normalization.
+    """
+    from rapidfuzz import fuzz
+
+    needle = _normalize_text(excerpt)
+    haystack = _normalize_text(chunk_text)
+    if not needle or not haystack:
+        return 0.0
+    if len(needle) > len(haystack):
+        # partial_ratio aligns the SHORTER string inside the longer one, so
+        # an excerpt longer than its chunk would be scored as "chunk found
+        # inside excerpt" and could pass the anti-hallucination check with
+        # invented text. Compare the two strings whole instead.
+        return float(fuzz.ratio(needle, haystack))
+    return float(fuzz.partial_ratio(needle, haystack))
+
+
+def excerpt_matches(
+    excerpt: str, chunk_text: str, threshold: float = EXCERPT_MATCH_THRESHOLD
+) -> bool:
+    """Return True when excerpt_score(excerpt, chunk_text) reaches `threshold`."""
+    score = excerpt_score(excerpt, chunk_text)
+    return threshold <= score
+
+
+# --------------------------------------------------------------------------
+# locating the source chunk an extraction file came from
+# --------------------------------------------------------------------------
+def chunk_key_for(json_path: Path, header: Optional[dict]) -> str:
+    """
+    Resolve the chunk key for an extraction file.
+
+    A non-empty `chunk_key` in `header` wins (converted to str); otherwise the
+    JSON file stem is used, on the assumption that extraction files are named
+    `<chunk_key>.json`.
+    """
+    explicit = header.get("chunk_key") if header else None
+    return str(explicit) if explicit else json_path.stem
+
+
+def source_id_for(chunk_key: str, header: Optional[dict]) -> str:
+    """
+    Resolve the source id: `<source_id>.partNN` → `<source_id>`.
+
+    A non-empty `source_id` in `header` wins (converted to str); otherwise
+    everything before the last `.part` in `chunk_key` is returned, or the
+    whole key when it contains no `.part`.
+    """
+    explicit = header.get("source_id") if header else None
+    if explicit:
+        return str(explicit)
+    head, separator, _tail = chunk_key.rpartition(".part")
+    # rpartition puts the whole string in the tail when ".part" is absent.
+    return head if separator else chunk_key
+
+
+def find_chunk_text(
+    json_path: Path, header: Optional[dict], chunks_dir: Path
+) -> Optional[str]:
+    """
+    Return the text of the source chunk for an extraction file, or None.
+
+    Looks for `<chunks_dir>/<source_id>/<chunk_key>.txt` first, then falls
+    back to the first hit of a recursive glob for `<chunk_key>.txt` under
+    `chunks_dir`. The file is read as UTF-8 with undecodable bytes replaced.
+    """
+    chunk_key = chunk_key_for(json_path, header)
+    filename = f"{chunk_key}.txt"
+
+    expected = chunks_dir / source_id_for(chunk_key, header) / filename
+    if expected.is_file():
+        located: Optional[Path] = expected
+    else:
+        # Not where the naming scheme says — search the whole chunks tree.
+        located = next(iter(chunks_dir.rglob(filename)), None)
+
+    if located is None:
+        return None
+    return located.read_text(encoding="utf-8", errors="replace")
+
+
+def source_path_for(source_id: str, sources_dir: Path) -> Optional[str]:
+    """
+    Read the original `SOURCE:` path from a normalized source header.
+
+    `<sources_dir>/<source_id>.txt` is scanned line by line for a line that
+    starts with `SOURCE:`; the text after the first colon is returned,
+    stripped. Scanning stops at the header rule (a line starting with `=`)
+    or at the first `--- PAGE ` marker. Returns None when the file is
+    missing, unreadable, or has no `SOURCE:` line in its header.
+    """
+    header_file = sources_dir / f"{source_id}.txt"
+    if not header_file.is_file():
+        return None
+    try:
+        with header_file.open(encoding="utf-8", errors="replace") as fh:
+            for raw in fh:
+                if raw.startswith("SOURCE:"):
+                    _label, _colon, value = raw.partition(":")
+                    return value.strip()
+                if raw.startswith(("=", "--- PAGE ")):
+                    # End of the header block: the body never holds the path.
+                    return None
+    except OSError:
+        return None
+    return None
+
+
+# --------------------------------------------------------------------------
+# stable content key for the needs_review queue (plan §5c)
+# --------------------------------------------------------------------------
+def normalize_name(name: str) -> str:
+    """
+    Build the dedup form of a name.
+
+    The name is NFKD-decomposed, combining marks (diacritics) are dropped,
+    and the result is lowercased, trimmed, and has whitespace runs collapsed
+    to single spaces. A falsy name gives "".
+    """
+    if not name:
+        return ""
+    base_chars = [
+        ch
+        for ch in unicodedata.normalize("NFKD", name)
+        if unicodedata.combining(ch) == 0
+    ]
+    lowered = "".join(base_chars).lower().strip()
+    return re.sub(r"\s+", " ", lowered)
+
+
+def content_key(normalized_name: str, language: Optional[str], description: str) -> str:
+    """
+    Stable hash identifying a row for the review queue.
+
+    The key is the SHA-1 hex digest of the normalized name, the language
+    ("" when missing) and the whitespace/case-normalized description, joined
+    with the unit separator (0x1F).
+
+    Only borderline-kept-separate rows and legacy `.doc` rows ever carry
+    needs_review, and neither is auto-merged — so their (normalized_name,
+    language, description) triple is stable across rebuilds.
+    """
+    language_part = language or ""
+    description_part = _normalize_text(description)
+    hasher = hashlib.sha1()
+    hasher.update(
+        f"{normalized_name}\x1f{language_part}\x1f{description_part}".encode("utf-8")
+    )
+    return hasher.hexdigest()
+
+
+# --------------------------------------------------------------------------
+# iteration
+# --------------------------------------------------------------------------
+def iter_extraction_files(extracted_dir: Path):
+    """
+    Yield, in sorted order, every regular `*.json` file directly under `extracted_dir`.
+
+    The glob is not recursive, so subdirectories such as `_rejected/` are
+    never entered. Yields nothing when `extracted_dir` is not a directory.
+    """
+    if extracted_dir.is_dir():
+        candidates = sorted(extracted_dir.glob("*.json"))
+        yield from (entry for entry in candidates if entry.is_file())
diff --git a/scripts/normalize_sources.py b/scripts/normalize_sources.py
new file mode 100644
index 0000000..2c9c607
--- /dev/null
+++ b/scripts/normalize_sources.py
@@ -0,0 +1,255 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+normalize_sources.py — walk data/carti-camp-jocuri/ and write data/sources/<id>.txt.
+
+Output files keep the existing header format:
+
+ SOURCE: <path relative to the corpus root>
+ CONVERTED: <ISO date of the conversion>
+ FORMAT: <detected format>
+ NEEDS_REVIEW: <reason> (optional — legacy .doc conversions)
+ ==================================================
+
+ --- PAGE 1 ---
+ ...
+
+Each source gets a stable id = <8-hex hash of relative path>_<sanitized stem>,
+so two files with the same name in different folders never collide.
+
+The pipeline is script-only: this normalizes formats, it does NOT run extraction.
+Run `--check-deps` before a long job.
+"""
+
+from __future__ import annotations
+
+import argparse
+import datetime as _dt
+import hashlib
+import re
+import sys
+from pathlib import Path
+
+SCRIPT_DIR = Path(__file__).resolve().parent
+if str(SCRIPT_DIR) not in sys.path:
+ sys.path.insert(0, str(SCRIPT_DIR))
+
+from extract_common import ( # noqa: E402
+ count_page_markers,
+ dedupe_texts,
+ detect_format,
+ extract_file,
+ extract_html,
+ is_junk,
+ join_pages,
+ preflight,
+ split_pages,
+)
+
+# Rule line (50 "=" characters) that build_header writes as the last header
+# line, separating the header fields from the page-marked body.
+HEADER_RULE = "=" * 50
+
+
+# --------------------------------------------------------------------------
+# stable source id
+# --------------------------------------------------------------------------
+def sanitize_stem(stem: str) -> str:
+    """
+    Turn a file stem into a lowercase, underscore-separated slug of at most 60 characters.
+
+    Runs of non-word characters become a single `_`, leading and trailing
+    underscores are removed, and an empty result is replaced by "source".
+    """
+    underscored = re.sub(r"[^\w]+", "_", stem, flags=re.UNICODE)
+    slug = underscored.strip("_").lower()[:60]
+    return slug if slug else "source"
+
+
+def stable_id(relative_path: str | Path) -> str:
+    """
+    Collision-proof id derived from the path relative to the corpus root.
+
+    The id is `<first 8 hex chars of SHA-1(path)>_<sanitized stem>`.
+    Backslashes are turned into `/` before hashing, so both separator styles
+    of the same path give the same id.
+    """
+    posix_rel = str(relative_path).replace("\\", "/")
+    short_hash = hashlib.sha1(posix_rel.encode("utf-8")).hexdigest()[:8]
+    slug = sanitize_stem(Path(posix_rel).stem)
+    return "_".join((short_hash, slug))
+
+
+# --------------------------------------------------------------------------
+# header
+# --------------------------------------------------------------------------
+def build_header(
+    source_rel: str, fmt: str, needs_review: str | None = None
+) -> str:
+    """
+    Build the text header that precedes the body of a normalized source file.
+
+    Lines, in order: `SOURCE:`, `CONVERTED:` (today's date, ISO format),
+    `FORMAT:`, an optional `NEEDS_REVIEW:` (only when `needs_review` is
+    truthy) and the HEADER_RULE separator, followed by one blank line.
+    """
+    converted_on = _dt.date.today().isoformat()
+    header_lines = [
+        f"SOURCE: {source_rel}",
+        f"CONVERTED: {converted_on}",
+        f"FORMAT: {fmt}",
+    ]
+    if needs_review:
+        header_lines.append(f"NEEDS_REVIEW: {needs_review}")
+    header_lines.append(HEADER_RULE)
+    # Every line is newline-terminated; the extra "\n" is the blank line
+    # between header and body.
+    return "".join(f"{line}\n" for line in header_lines) + "\n"
+
+
+# --------------------------------------------------------------------------
+# mirror-site directories
+# --------------------------------------------------------------------------
+# Lowercased file suffixes that count as an HTML page when detecting and
+# walking a site-mirror directory.
+MIRROR_PAGE_EXTS = {".html", ".htm"}
+
+
+def is_mirror_dir(path: Path) -> bool:
+    """
+    Tell whether `path` is a site-mirror directory.
+
+    True when `path` is a directory whose name does not end in `_files` and
+    that contains, at any depth, at least one file with an HTML suffix.
+    """
+    if not path.is_dir() or path.name.endswith("_files"):
+        return False
+    for child in path.rglob("*"):
+        if child.is_file() and child.suffix.lower() in MIRROR_PAGE_EXTS:
+            return True
+    return False
+
+
+def normalize_mirror(mirror_dir: Path) -> str:
+    """
+    Extract every HTML page in a mirror dir, dedupe near-duplicates, join.
+
+    Pages are visited in sorted path order. Pages that live under a browser
+    asset folder (any directory below `mirror_dir` whose name ends in
+    `_files`) are skipped, as are pages whose extraction raises or yields
+    only whitespace. The surviving page texts go through dedupe_texts and
+    are joined into one page-marked body.
+    """
+    pages: list[tuple[str, str]] = []
+    for html in sorted(mirror_dir.rglob("*")):
+        if not html.is_file() or html.suffix.lower() not in MIRROR_PAGE_EXTS:
+            continue
+        rel = html.relative_to(mirror_dir)
+        # Asset folders are named "<page>_files", so match on the suffix —
+        # the same rule is_mirror_dir applies — and look only at directories
+        # inside the mirror, not at the mirror's own ancestors.
+        if any(part.endswith("_files") for part in rel.parts[:-1]):
+            continue
+        try:
+            body = extract_html(html)
+        except Exception:
+            # Best effort: one unreadable page must not sink the mirror.
+            continue
+        text = "\n".join(t for _, t in split_pages(body))
+        if text.strip():
+            pages.append((str(rel), text))
+    pages = dedupe_texts(pages)
+    return join_pages([t for _, t in pages])
+
+
+# --------------------------------------------------------------------------
+# one source
+# --------------------------------------------------------------------------
+def normalize_one(
+    path: Path, corpus_root: Path, out_dir: Path
+) -> dict | None:
+    """
+    Normalize a single file or mirror directory → `<out_dir>/<id>.txt`.
+
+    Returns None when the entry is skipped: a directory that is not a
+    mirror, a junk file, or a file whose format is "unknown", "epub" or
+    "txt". Otherwise returns a result dict with 'id', 'source' and 'status':
+
+    * status "error" (plus 'error') when extracting a file raised,
+    * status "empty" when the extracted body is blank (nothing is written),
+    * status "ok" (plus 'format', 'pages', 'needs_review') after the output
+      file has been written.
+    """
+    rel = path.relative_to(corpus_root)
+    source_label = str(rel)
+    sid = stable_id(rel)
+    review_note: str | None = None
+
+    if not path.is_dir():
+        if is_junk(path):
+            return None
+        fmt = detect_format(path)
+        if fmt in ("unknown", "epub", "txt"):
+            return None  # epub duplicates PDFs; txt is not a source format here
+        if fmt == "doc":
+            review_note = "legacy .doc conversion is imperfect"
+        try:
+            body = extract_file(path)
+        except Exception as exc:  # noqa: BLE001
+            return {"id": sid, "source": source_label, "status": "error", "error": str(exc)}
+    elif is_mirror_dir(path):
+        fmt = "html-mirror"
+        body = normalize_mirror(path)
+    else:
+        return None
+
+    if not body.strip():
+        return {"id": sid, "source": source_label, "status": "empty"}
+
+    target = out_dir / f"{sid}.txt"
+    target.write_text(
+        build_header(source_label, fmt, review_note) + body, encoding="utf-8"
+    )
+    return {
+        "id": sid,
+        "source": source_label,
+        "status": "ok",
+        "format": fmt,
+        "pages": count_page_markers(body),
+        "needs_review": bool(review_note),
+    }
+
+
+# --------------------------------------------------------------------------
+# walk
+# --------------------------------------------------------------------------
+def iter_corpus_entries(corpus_root: Path):
+    """
+    Yield, in sorted order, the top-level entries of the corpus root to normalize.
+
+    Entries whose name starts with "." are ignored. Directories are yielded
+    only when is_mirror_dir accepts them; every other entry is yielded as is.
+    """
+    for entry in sorted(corpus_root.iterdir()):
+        if entry.name.startswith("."):
+            continue
+        if not entry.is_dir() or is_mirror_dir(entry):
+            yield entry
+
+
+def run(corpus_root: Path, out_dir: Path) -> dict:
+    """
+    Normalize every corpus entry into `out_dir` and summarize the outcome.
+
+    `out_dir` is created if needed. Returns a dict with the counts 'total',
+    'ok', 'errors', 'empty' and 'needs_review', plus 'results' — the list of
+    per-entry result dicts (skipped entries are not included).
+    """
+    out_dir.mkdir(parents=True, exist_ok=True)
+
+    outcomes = (
+        normalize_one(entry, corpus_root, out_dir)
+        for entry in iter_corpus_entries(corpus_root)
+    )
+    results: list[dict] = [res for res in outcomes if res is not None]
+
+    def _tally(status: str) -> int:
+        return sum(1 for res in results if res["status"] == status)
+
+    return {
+        "total": len(results),
+        "ok": _tally("ok"),
+        "errors": _tally("error"),
+        "empty": _tally("empty"),
+        "needs_review": sum(1 for res in results if res.get("needs_review")),
+        "results": results,
+    }
+
+
+# --------------------------------------------------------------------------
+# CLI
+# --------------------------------------------------------------------------
+def print_preflight(report: dict) -> int:
+    """
+    Print a preflight report and return the matching process exit code.
+
+    `report` is the dict produced by preflight(). Returns 0 when
+    `report["ok"]` is truthy, 1 otherwise.
+    """
+    ready = report["ok"]
+    missing_packages = report["missing_python"]
+    missing_tools = report["missing_system"]
+
+    lines = ["Dependency preflight", "--------------------"]
+    if missing_packages:
+        lines.append(" MISSING Python packages: " + ", ".join(missing_packages))
+    else:
+        lines.append(" Python packages: OK")
+    if missing_tools:
+        lines.append(" MISSING system tools : " + ", ".join(missing_tools))
+    lines.extend(f" WARNING: {warning}" for warning in report["warnings"])
+    lines.append(" => " + ("READY" if ready else "NOT READY — install the above"))
+
+    for line in lines:
+        print(line)
+    return 0 if ready else 1
+
+
+def main(argv: list[str] | None = None) -> int:
+    """
+    Command-line entry point.
+
+    With `--check-deps`, only the dependency preflight is printed and its
+    exit code returned. Otherwise the preflight runs first and the job is
+    refused (report printed, exit code 1) unless it is fully OK — that
+    covers missing Python packages and, with `--ocr`, a missing tesseract.
+    After a run the summary and every non-ok entry are printed and 0 is
+    returned, even when individual sources failed.
+    """
+    parser = argparse.ArgumentParser(description="Normalize mixed sources to .txt")
+    parser.add_argument("--corpus", default="data/carti-camp-jocuri",
+                        help="corpus root to walk")
+    parser.add_argument("--out", default="data/sources", help="output directory")
+    parser.add_argument("--check-deps", action="store_true",
+                        help="run dependency preflight and exit")
+    parser.add_argument("--ocr", action="store_true",
+                        help="include OCR (tesseract) in the preflight check")
+    args = parser.parse_args(argv)
+
+    report = preflight(check_ocr=args.ocr)
+    if args.check_deps:
+        return print_preflight(report)
+
+    # Gate on the overall verdict, not just on missing Python packages:
+    # otherwise `--ocr` with tesseract absent would start a long run that
+    # the preflight itself reports as NOT READY.
+    if not report["ok"]:
+        print_preflight(report)
+        return 1
+    for w in report["warnings"]:
+        print(f"WARNING: {w}")
+
+    summary = run(Path(args.corpus), Path(args.out))
+    print(f"normalized : {summary['ok']}/{summary['total']}")
+    print(f"errors : {summary['errors']}")
+    print(f"empty : {summary['empty']}")
+    print(f"needs_review: {summary['needs_review']}")
+    for r in summary["results"]:
+        if r["status"] != "ok":
+            print(f" [{r['status']}] {r['source']}")
+    return 0
+
+
+if __name__ == "__main__":
+ raise SystemExit(main())
diff --git a/scripts/pdf_extractor.py b/scripts/pdf_extractor.py
deleted file mode 100644
index e69de29..0000000
diff --git a/scripts/pdf_to_text_converter.py b/scripts/pdf_to_text_converter.py
deleted file mode 100644
index db03509..0000000
--- a/scripts/pdf_to_text_converter.py
+++ /dev/null
@@ -1,143 +0,0 @@
-#!/usr/bin/env python3
-"""
-PDF Mass Conversion to Text for Activity Extraction
-Handles all PDF sizes efficiently with multiple fallback methods
-"""
-
-import os
-import json
-from pathlib import Path
-import PyPDF2
-import pdfplumber
-from typing import List, Dict
-import logging
-
-class PDFConverter:
- def __init__(self, max_pages=50):
- self.max_pages = max_pages
- self.conversion_stats = {}
-
- def convert_pdf_to_text(self, pdf_path: str) -> str:
- """Convert PDF to text using multiple methods with fallbacks"""
- try:
- # Method 1: pdfplumber (best for tables and layout)
- return self._convert_with_pdfplumber(pdf_path)
- except Exception as e:
- print(f"pdfplumber failed for {pdf_path}: {e}")
-
- try:
- # Method 2: PyPDF2 (fallback)
- return self._convert_with_pypdf2(pdf_path)
- except Exception as e2:
- print(f"PyPDF2 also failed for {pdf_path}: {e2}")
- return ""
-
- def _convert_with_pdfplumber(self, pdf_path: str) -> str:
- """Primary conversion method using pdfplumber"""
- text_content = ""
-
- with pdfplumber.open(pdf_path) as pdf:
- total_pages = len(pdf.pages)
- pages_to_process = min(total_pages, self.max_pages)
-
- print(f" Converting {pdf_path}: {pages_to_process}/{total_pages} pages")
-
- for i, page in enumerate(pdf.pages[:pages_to_process]):
- try:
- page_text = page.extract_text()
- if page_text:
- text_content += f"\n--- PAGE {i+1} ---\n"
- text_content += page_text
- text_content += "\n"
- except Exception as e:
- print(f" Error on page {i+1}: {e}")
- continue
-
- self.conversion_stats[pdf_path] = {
- 'method': 'pdfplumber',
- 'pages_processed': pages_to_process,
- 'total_pages': total_pages,
- 'success': True,
- 'text_length': len(text_content)
- }
-
- return text_content
-
- def _convert_with_pypdf2(self, pdf_path: str) -> str:
- """Fallback conversion method using PyPDF2"""
- text_content = ""
-
- with open(pdf_path, 'rb') as file:
- reader = PyPDF2.PdfReader(file)
- total_pages = len(reader.pages)
- pages_to_process = min(total_pages, self.max_pages)
-
- print(f" Converting {pdf_path} (fallback): {pages_to_process}/{total_pages} pages")
-
- for i in range(pages_to_process):
- try:
- page = reader.pages[i]
- page_text = page.extract_text()
- if page_text:
- text_content += f"\n--- PAGE {i+1} ---\n"
- text_content += page_text
- text_content += "\n"
- except Exception as e:
- print(f" Error on page {i+1}: {e}")
- continue
-
- self.conversion_stats[pdf_path] = {
- 'method': 'PyPDF2',
- 'pages_processed': pages_to_process,
- 'total_pages': total_pages,
- 'success': True,
- 'text_length': len(text_content)
- }
-
- return text_content
-
- def convert_all_pdfs(self, pdf_directory: str, output_directory: str):
- """Convert all PDFs in directory to text files"""
- pdf_files = list(Path(pdf_directory).glob("**/*.pdf"))
-
- print(f"🔄 Converting {len(pdf_files)} PDF files to text...")
-
- os.makedirs(output_directory, exist_ok=True)
-
- for i, pdf_path in enumerate(pdf_files):
- print(f"\n[{i+1}/{len(pdf_files)}] Processing {pdf_path.name}...")
-
- # Convert to text
- text_content = self.convert_pdf_to_text(str(pdf_path))
-
- if text_content.strip():
- # Save as text file
- output_file = Path(output_directory) / f"{pdf_path.stem}.txt"
- with open(output_file, 'w', encoding='utf-8') as f:
- f.write(f"SOURCE: {pdf_path}\n")
- f.write(f"CONVERTED: 2025-01-11\n")
- f.write("="*50 + "\n\n")
- f.write(text_content)
-
- print(f" ✅ Saved: {output_file}")
- else:
- print(f" ❌ No text extracted from {pdf_path.name}")
-
- # Save conversion statistics
- stats_file = Path(output_directory) / "conversion_stats.json"
- with open(stats_file, 'w', encoding='utf-8') as f:
- json.dump(self.conversion_stats, f, indent=2, ensure_ascii=False)
-
- print(f"\n🎉 PDF conversion complete! Check {output_directory}")
- return len([f for f in self.conversion_stats.values() if f['success']])
-
-# Usage
-if __name__ == "__main__":
- converter = PDFConverter(max_pages=50)
-
- # Convert all PDFs
- pdf_dir = "/mnt/d/GoogleDrive/Cercetasi/carti-camp-jocuri"
- output_dir = "/mnt/d/GoogleDrive/Cercetasi/carti-camp-jocuri/INDEX-SISTEM-JOCURI/converted_pdfs"
-
- converted_count = converter.convert_all_pdfs(pdf_dir, output_dir)
- print(f"Final result: {converted_count} PDFs successfully converted")
\ No newline at end of file
diff --git a/scripts/review_queue.py b/scripts/review_queue.py
new file mode 100644
index 0000000..bf75c76
--- /dev/null
+++ b/scripts/review_queue.py
@@ -0,0 +1,145 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+review_queue.py — CLI for the needs_review lifecycle (plan §5c).
+
+Rows land in the queue when dedup leaves a borderline pair separate, or when a
+legacy `.doc` source was converted imperfectly. Each row has a stable content
+key; a decision written here is stored in data/review_decisions.json (git
+tracked) and re-applied by build_database.py on every rebuild, so the queue
+never resurfaces a resolved row.
+
+Commands:
+ python scripts/review_queue.py list
+ python scripts/review_queue.py resolve <id> <merge|keep-separate|drop>
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import sqlite3
+import sys
+from pathlib import Path
+from typing import Optional
+
+SCRIPT_DIR = Path(__file__).resolve().parent
+REPO_ROOT = SCRIPT_DIR.parent
+for _p in (str(SCRIPT_DIR), str(REPO_ROOT)):
+ if _p not in sys.path:
+ sys.path.insert(0, _p)
+
+from import_common import content_key, normalize_name # noqa: E402
+
+VALID_DECISIONS = ("merge", "keep-separate", "drop")
+
+
+# --------------------------------------------------------------------------
+# review_decisions.json
+# --------------------------------------------------------------------------
+def load_decisions(path: Path) -> dict:
+    """Read the decisions mapping stored at *path*.
+
+    Best-effort: a missing file, unparseable JSON, an OS-level read error or
+    a top-level value that is not a JSON object all yield an empty dict.
+    """
+    if not path.is_file():
+        return {}
+    try:
+        parsed = json.loads(path.read_text(encoding="utf-8"))
+    except (json.JSONDecodeError, OSError):
+        return {}
+    return parsed if isinstance(parsed, dict) else {}
+
+
+def save_decisions(decisions: dict, path: Path) -> None:
+    """Write *decisions* to *path* as indented, key-sorted UTF-8 JSON.
+
+    Missing parent directories are created first. Non-ASCII characters are
+    written as-is rather than escaped.
+    """
+    path.parent.mkdir(parents=True, exist_ok=True)
+    serialized = json.dumps(
+        decisions, indent=2, ensure_ascii=False, sort_keys=True
+    )
+    path.write_text(serialized, encoding="utf-8")
+
+
+# --------------------------------------------------------------------------
+# queue
+# --------------------------------------------------------------------------
+def list_queue(db_path: Path) -> list[dict]:
+    """Collect the rows flagged ``needs_review = 1`` in the activities DB.
+
+    Returns one dict per row with the keys ``id`` (the content key computed
+    by ``content_key``), ``name``, ``language`` and ``description``, ordered
+    by normalized name. An empty list is returned when the database file does
+    not exist or the query fails with ``sqlite3.OperationalError``.
+    """
+    if not db_path.is_file():
+        return []
+
+    query = (
+        "SELECT name, normalized_name, language, description "
+        "FROM activities WHERE needs_review = 1 ORDER BY normalized_name"
+    )
+    conn = sqlite3.connect(db_path)
+    conn.row_factory = sqlite3.Row
+    try:
+        flagged = conn.execute(query).fetchall()
+    except sqlite3.OperationalError:
+        return []
+    finally:
+        conn.close()
+
+    def _entry(rec) -> dict:
+        # Use the stored normalized name, else derive it from the raw name.
+        normalized = rec["normalized_name"] or normalize_name(rec["name"])
+        description = rec["description"] or ""
+        return {
+            "id": content_key(normalized, rec["language"], description),
+            "name": rec["name"],
+            "language": rec["language"],
+            "description": description,
+        }
+
+    return [_entry(rec) for rec in flagged]
+
+
+def resolve(decisions_path: Path, content_id: str, decision: str) -> dict:
+    """Record a decision for a content key in review_decisions.json.
+
+    Args:
+        decisions_path: location of the decisions file (created if missing).
+        content_id: content key of the row, as printed by ``list``.
+        decision: one of ``VALID_DECISIONS``.
+
+    Returns:
+        The full decisions mapping after the update.
+
+    Raises:
+        ValueError: if *decision* is not a valid choice, or if the existing
+            decisions file has content that is not a JSON object. In the
+            latter case nothing is written, so previously recorded decisions
+            are not overwritten.
+    """
+    if decision not in VALID_DECISIONS:
+        raise ValueError(
+            f"invalid decision {decision!r}; expected one of {VALID_DECISIONS}"
+        )
+    decisions = load_decisions(decisions_path)
+    if not decisions and decisions_path.is_file():
+        # load_decisions() yields {} for a corrupt file as well as for an
+        # empty one. Saving on top of a corrupt file would wipe every earlier
+        # decision, so tell the two cases apart before writing.
+        raw = decisions_path.read_text(encoding="utf-8").strip()
+        try:
+            existing = json.loads(raw) if raw else {}
+        except json.JSONDecodeError as exc:
+            raise ValueError(
+                f"{decisions_path} is not valid JSON; refusing to overwrite it "
+                f"({exc})"
+            ) from exc
+        if not isinstance(existing, dict):
+            raise ValueError(
+                f"{decisions_path} does not contain a JSON object; "
+                "refusing to overwrite it"
+            )
+    decisions[content_id] = {"decision": decision}
+    save_decisions(decisions, decisions_path)
+    return decisions
+
+
+# --------------------------------------------------------------------------
+# CLI
+# --------------------------------------------------------------------------
+def main(argv: Optional[list[str]] = None) -> int:
+    """Command-line entry point for the needs_review queue.
+
+    ``list`` prints every row currently flagged for review together with the
+    ``resolve`` command to run for it; ``resolve`` records a decision for one
+    content id. Returns 0 after either command; the trailing ``return 1`` is
+    only reached for a command name that matches neither.
+    """
+    parser = argparse.ArgumentParser(description="needs_review queue CLI")
+    parser.add_argument("--db", default="data/activities.db")
+    parser.add_argument("--decisions", default="data/review_decisions.json")
+    sub = parser.add_subparsers(dest="command", required=True)
+
+    sub.add_parser("list", help="list rows currently flagged needs_review")
+
+    p_resolve = sub.add_parser("resolve", help="record a decision for a row")
+    p_resolve.add_argument("id", help="content id from `list`")
+    p_resolve.add_argument("decision", choices=VALID_DECISIONS)
+
+    args = parser.parse_args(argv)
+
+    if args.command == "list":
+        rows = list_queue(Path(args.db))
+        if not rows:
+            print("review queue is empty.")
+            return 0
+        # `resolve` requires a decision argument; show the accepted values so
+        # the printed hint is a complete command line.
+        decision_choices = "|".join(VALID_DECISIONS)
+        print(f"{len(rows)} row(s) need review:\n")
+        for r in rows:
+            # Keep the preview on one line: first 80 characters, no newlines.
+            desc = r["description"][:80].replace("\n", " ")
+            print(f" id : {r['id']}")
+            print(f" name : {r['name']} [{r['language']}]")
+            print(f" desc : {desc}")
+            print(f" -> review_queue.py resolve {r['id']} <{decision_choices}>")
+            print()
+        return 0
+
+    if args.command == "resolve":
+        resolve(Path(args.decisions), args.id, args.decision)
+        print(f"recorded: {args.id} -> {args.decision}")
+        print(f"written to {args.decisions} (applied on next build_database --rebuild)")
+        return 0
+
+    return 1
+
+
+if __name__ == "__main__":
+ raise SystemExit(main())
diff --git a/scripts/run_extraction.py b/scripts/run_extraction.py
index 9304861..c80747a 100644
--- a/scripts/run_extraction.py
+++ b/scripts/run_extraction.py
@@ -1,50 +1,140 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
-Main extraction orchestrator
-Ruleaza intregul proces de extractie
+run_extraction.py — extraction orchestrator (plan §3).
+
+The pipeline is script-only up to the LLM step: this script normalizes the
+corpus, chunks the normalized sources, and emits one subagent prompt per
+`pending` chunk. It does NOT run the extraction itself — that step is the
+interactive Claude Code orchestrator launching waves of subagents.
+
+Steps:
+ 1. normalize data/carti-camp-jocuri/ -> data/sources/*.txt
+ 2. chunk data/sources/*.txt -> data/chunks//*.txt + manifest.json
+ 3. emit one prompt per `pending` chunk -> data/chunks/_prompts/*.md
+ 4. report how many chunks remain `pending`
+
+Usage:
+ python scripts/run_extraction.py
+ python scripts/run_extraction.py --skip-normalize # re-chunk only
"""
+from __future__ import annotations
+
+import argparse
import sys
-import time
from pathlib import Path
+from typing import Optional
-from unified_processor import UnifiedProcessor
-from import_claude_activities import ClaudeActivityImporter
+SCRIPT_DIR = Path(__file__).resolve().parent
+REPO_ROOT = SCRIPT_DIR.parent
+for _p in (str(SCRIPT_DIR), str(REPO_ROOT)):
+ if _p not in sys.path:
+ sys.path.insert(0, _p)
+
+import chunk_sources # noqa: E402
+import normalize_sources # noqa: E402
+
+SUBAGENT_PROMPT = SCRIPT_DIR / "SUBAGENT_PROMPT.md"
+
+
+def emit_chunk_prompt(chunk_key: str, meta: dict, prompts_dir: Path) -> Path:
+    """Write the subagent prompt for one pending chunk and return its path.
+
+    The prompt is saved as ``<chunk_key>.prompt.md`` inside *prompts_dir*,
+    which is created if needed. Chunk file, expected JSON name, chunk range,
+    source id and source hash are taken from *meta*, with fallbacks when a
+    key is absent.
+    """
+    # NOTE(review): the fallback path has an empty segment ("data/chunks//");
+    # presumably a placeholder for the source directory was lost. Confirm.
+    chunk_file = meta.get("chunk_file", f"data/chunks//{chunk_key}.txt")
+    expected_json = meta.get("expected_json", f"{chunk_key}.json")
+    chunk_range = meta.get("chunk_range", "?")
+    source_id = meta.get("source_id", "")
+    source_hash = meta.get("source_hash", "")
+    rules_file = SUBAGENT_PROMPT.relative_to(REPO_ROOT)
+
+    prompt_lines = [
+        f"# EXTRACTION — chunk `{chunk_key}`",
+        "",
+        f"Read ONLY this chunk: `{chunk_file}`",
+        f"Chunk range: {chunk_range}",
+        "",
+        f"Follow the rules in `{rules_file}`.",
+        "Identify every distinct activity, fill the schema "
+        "(`scripts/activity_schema.json`), and write the result to:",
+        "",
+        f" data/extracted/{expected_json}",
+        "",
+        "Header fields to set: "
+        f'source_id="{source_id}", chunk_key="{chunk_key}", '
+        f'source_hash="{source_hash}".',
+        "",
+    ]
+
+    prompts_dir.mkdir(parents=True, exist_ok=True)
+    destination = prompts_dir / f"{chunk_key}.prompt.md"
+    destination.write_text("\n".join(prompt_lines), encoding="utf-8")
+    return destination
+
+
+def run(
+    *,
+    corpus_root: Path,
+    sources_dir: Path,
+    chunks_dir: Path,
+    skip_normalize: bool = False,
+) -> dict:
+    """Normalize, chunk, and emit one prompt per pending chunk.
+
+    Returns a summary dict with the keys ``normalized`` (only when
+    normalization ran), ``chunks``, ``states`` (chunk count per manifest
+    state), ``pending`` and ``prompts_dir``.
+    """
+    summary: dict = {}
+
+    if not skip_normalize:
+        norm = normalize_sources.run(corpus_root, sources_dir)
+        summary["normalized"] = {
+            field: norm[field] for field in ("ok", "total", "errors")
+        }
+
+    summary["chunks"] = chunk_sources.run(sources_dir, chunks_dir)
+
+    manifest = chunk_sources.load_manifest(chunks_dir / "manifest.json")
+    entries = manifest["chunks"]
+    prompts_dir = chunks_dir / "_prompts"
+
+    # Prompts are written in sorted chunk-key order, for pending chunks only.
+    pending_keys = sorted(
+        key for key, meta in entries.items() if meta.get("state") == "pending"
+    )
+    for key in pending_keys:
+        emit_chunk_prompt(key, entries[key], prompts_dir)
+
+    # Tally chunks per state; entries without a state are counted under "?".
+    states: dict[str, int] = {}
+    for meta in entries.values():
+        label = meta.get("state", "?")
+        states[label] = states.get(label, 0) + 1
+
+    summary["states"] = states
+    summary["pending"] = len(pending_keys)
+    summary["prompts_dir"] = str(prompts_dir)
+    return summary
+
+
+def main(argv: Optional[list[str]] = None) -> int:
+    """Command-line entry point: run the orchestration and print a report.
+
+    Always returns 0 once ``run`` has completed.
+    """
+    cli = argparse.ArgumentParser(description="Extraction orchestrator.")
+    cli.add_argument("--corpus", default="data/carti-camp-jocuri")
+    cli.add_argument("--sources", default="data/sources")
+    cli.add_argument("--chunks", default="data/chunks")
+    cli.add_argument("--skip-normalize", action="store_true",
+                     help="skip normalization, re-chunk existing sources only")
+    opts = cli.parse_args(argv)
+
+    summary = run(
+        corpus_root=Path(opts.corpus),
+        sources_dir=Path(opts.sources),
+        chunks_dir=Path(opts.chunks),
+        skip_normalize=opts.skip_normalize,
+    )
+
+    rule = "=" * 60
+    print(rule)
+    print("EXTRACTION ORCHESTRATOR")
+    print(rule)
+    # The "normalized" entry is absent when --skip-normalize was given.
+    if "normalized" in summary:
+        norm = summary["normalized"]
+        print(f"normalized : {norm['ok']}/{norm['total']} (errors {norm['errors']})")
+    print(f"chunks : {summary['chunks']['chunks']}")
+    for state, count in sorted(summary["states"].items()):
+        print(f" {state:<10}: {count}")
+    print(f"\npending chunks remaining : {summary['pending']}")
+    if not summary["pending"]:
+        print("All chunks extracted — run build_database.py --rebuild.")
+    else:
+        print(f"subagent prompts written : {summary['prompts_dir']}/")
+        print("Launch waves of ~5-10 subagents on those prompts, then run "
+              "validate_extractions.py and build_database.py --rebuild.")
+    print(rule)
+    return 0
-def main():
- print("="*60)
- print("ACTIVITY EXTRACTION SYSTEM")
- print("Strategy S8: Hybrid Claude + Scripts")
- print("="*60)
-
- # Step 1: Run automated extraction
- print("\nSTEP 1: Automated Extraction")
- print("-"*40)
- processor = UnifiedProcessor()
- processor.process_automated_formats()
-
- # Step 2: Wait for Claude processing
- print("\n" + "="*60)
- print("STEP 2: Manual Claude Processing Required")
- print("-"*40)
- print("Please process PDF/DOC files with Claude using the template.")
- print("Files are listed in: pdf_doc_for_claude.txt")
- print("Save extracted activities as JSON in: scripts/extracted_activities/")
- print("="*60)
-
- response = input("\nHave you completed Claude processing? (y/n): ")
-
- if response.lower() == 'y':
- # Step 3: Import Claude-extracted activities
- print("\nSTEP 3: Importing Claude-extracted activities")
- print("-"*40)
- importer = ClaudeActivityImporter()
- importer.import_all_json_files()
-
- print("\n" + "="*60)
- print("EXTRACTION COMPLETE!")
- print("="*60)
if __name__ == "__main__":
- main()
\ No newline at end of file
+ raise SystemExit(main())
diff --git a/scripts/text_extractor.py b/scripts/text_extractor.py
deleted file mode 100644
index 47b9b16..0000000
--- a/scripts/text_extractor.py
+++ /dev/null
@@ -1,197 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-Text/Markdown Activity Extractor
-Proceseaza fisiere TXT si MD pentru extractie activitati
-"""
-
-import re
-from pathlib import Path
-from typing import List, Dict
-import sqlite3
-from datetime import datetime
-
-class TextActivityExtractor:
- def __init__(self, db_path='data/activities.db'):
- self.db_path = db_path
- self.activity_patterns = {
- 'section_headers': [
- r'^#{1,6}\s*(.+)$', # Markdown headers
- r'^([A-Z][^\.]{10,100})$', # Titluri simple
- r'^\d+\.\s*(.+)$', # Numbered lists
- r'^[•\-\*]\s*(.+)$', # Bullet points
- ],
- 'activity_markers': [
- 'joc:', 'activitate:', 'exercitiu:', 'team building:',
- 'nume:', 'titlu:', 'denumire:'
- ]
- }
-
- def extract_from_text(self, file_path: str) -> List[Dict]:
- """Extrage activitati din fisier text/markdown"""
- activities = []
-
- try:
- with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
- content = f.read()
-
- # Metoda 1: Cauta sectiuni markdown
- if file_path.endswith('.md'):
- activities.extend(self._extract_from_markdown(content, file_path))
-
- # Metoda 2: Cauta pattern-uri generale
- activities.extend(self._extract_from_patterns(content, file_path))
-
- # Metoda 3: Cauta blocuri de text structurate
- activities.extend(self._extract_from_blocks(content, file_path))
-
- except Exception as e:
- print(f"Error processing {file_path}: {e}")
-
- return activities
-
- def _extract_from_markdown(self, content, source_file):
- """Extrage activitati din format markdown"""
- activities = []
- lines = content.split('\n')
-
- current_activity = None
- current_content = []
-
- for line in lines:
- # Verifica daca e header de activitate
- if re.match(r'^#{1,3}\s*(.+)', line):
- # Salveaza activitatea anterioara daca exista
- if current_activity and current_content:
- current_activity['description'] = '\n'.join(current_content[:20]) # Max 20 linii
- activities.append(current_activity)
-
- # Verifica daca noul header e o activitate
- header_text = re.sub(r'^#{1,3}\s*', '', line)
- if any(marker in header_text.lower() for marker in ['joc', 'activitate', 'exercitiu']):
- current_activity = {
- 'name': header_text[:200],
- 'source_file': str(source_file),
- 'category': '[A]'
- }
- current_content = []
- else:
- current_activity = None
-
- elif current_activity:
- # Adauga continut la activitatea curenta
- if line.strip():
- current_content.append(line)
-
- # Salveaza ultima activitate
- if current_activity and current_content:
- current_activity['description'] = '\n'.join(current_content[:20])
- activities.append(current_activity)
-
- return activities
-
- def _extract_from_patterns(self, content, source_file):
- """Extrage folosind pattern matching"""
- activities = []
-
- # Cauta markeri specifici de activitati
- for marker in self.activity_patterns['activity_markers']:
- pattern = re.compile(f'{re.escape(marker)}\\s*(.+?)(?=\\n\\n|{re.escape(marker)}|$)',
- re.IGNORECASE | re.DOTALL)
- matches = pattern.finditer(content)
-
- for match in matches:
- activity_text = match.group(1)
- if len(activity_text) > 20:
- activity = {
- 'name': activity_text.split('\n')[0][:200],
- 'description': activity_text[:1000],
- 'source_file': str(source_file),
- 'category': '[A]'
- }
- activities.append(activity)
-
- return activities
-
- def _extract_from_blocks(self, content, source_file):
- """Extrage din blocuri de text separate"""
- activities = []
-
- # Imparte in blocuri separate de linii goale
- blocks = re.split(r'\n\s*\n', content)
-
- for block in blocks:
- if len(block) > 50: # Minim 50 caractere
- lines = block.strip().split('\n')
- first_line = lines[0].strip()
-
- # Verifica daca blocul pare o activitate
- if any(keyword in first_line.lower() for keyword in ['joc', 'activitate', 'exercitiu']):
- activity = {
- 'name': first_line[:200],
- 'description': block[:1000],
- 'source_file': str(source_file),
- 'category': '[A]'
- }
- activities.append(activity)
-
- return activities
-
- def save_to_database(self, activities):
- """Salveaza in baza de date"""
- conn = sqlite3.connect(self.db_path)
- cursor = conn.cursor()
-
- saved_count = 0
-
- for activity in activities:
- try:
- # Check for duplicates
- cursor.execute(
- "SELECT id FROM activities WHERE name = ? AND source_file = ?",
- (activity.get('name'), activity.get('source_file'))
- )
-
- if not cursor.fetchone():
- columns = list(activity.keys())
- values = list(activity.values())
- placeholders = ['?' for _ in values]
-
- query = f"INSERT INTO activities ({', '.join(columns)}) VALUES ({', '.join(placeholders)})"
- cursor.execute(query, values)
- saved_count += 1
-
- except Exception as e:
- print(f"Error saving: {e}")
-
- conn.commit()
- conn.close()
-
- return saved_count
-
- def process_all_text_files(self, base_path='/mnt/d/GoogleDrive/Cercetasi/carti-camp-jocuri'):
- """Proceseaza toate fisierele text si markdown"""
- base_path = Path(base_path)
-
- text_files = list(base_path.rglob("*.txt"))
- md_files = list(base_path.rglob("*.md"))
- all_files = text_files + md_files
-
- print(f"Found {len(all_files)} text/markdown files")
-
- all_activities = []
-
- for file_path in all_files:
- activities = self.extract_from_text(str(file_path))
- all_activities.extend(activities)
- print(f"Processed {file_path.name}: {len(activities)} activities")
-
- # Save to database
- saved = self.save_to_database(all_activities)
- print(f"\nTotal saved: {saved} activities from {len(all_files)} files")
-
- return len(all_files), saved
-
-if __name__ == "__main__":
- extractor = TextActivityExtractor()
- extractor.process_all_text_files()
\ No newline at end of file
diff --git a/scripts/unified_processor.py b/scripts/unified_processor.py
deleted file mode 100644
index 8a6d2a3..0000000
--- a/scripts/unified_processor.py
+++ /dev/null
@@ -1,151 +0,0 @@
-#!/usr/bin/env python3
-"""
-Unified Activity Processor
-Orchestreaz toate extractoarele pentru procesare complet
-"""
-
-import time
-from pathlib import Path
-from html_extractor import HTMLActivityExtractor
-from text_extractor import TextActivityExtractor
-import sqlite3
-
-class UnifiedProcessor:
- def __init__(self, db_path='data/activities.db'):
- self.db_path = db_path
- self.html_extractor = HTMLActivityExtractor(db_path)
- self.text_extractor = TextActivityExtractor(db_path)
- self.stats = {
- 'html_processed': 0,
- 'text_processed': 0,
- 'pdf_to_process': 0,
- 'doc_to_process': 0,
- 'total_activities': 0,
- 'start_time': None,
- 'end_time': None
- }
-
- def get_current_activity_count(self):
- """Obine numrul curent de activiti din DB"""
- conn = sqlite3.connect(self.db_path)
- cursor = conn.cursor()
- cursor.execute("SELECT COUNT(*) FROM activities")
- count = cursor.fetchone()[0]
- conn.close()
- return count
-
- def count_files_to_process(self, base_path):
- """Numr fiierele care trebuie procesate"""
- base_path = Path(base_path)
-
- counts = {
- 'html': len(list(base_path.rglob("*.html"))) + len(list(base_path.rglob("*.htm"))),
- 'txt': len(list(base_path.rglob("*.txt"))),
- 'md': len(list(base_path.rglob("*.md"))),
- 'pdf': len(list(base_path.rglob("*.pdf"))),
- 'doc': len(list(base_path.rglob("*.doc"))),
- 'docx': len(list(base_path.rglob("*.docx")))
- }
-
- return counts
-
- def process_automated_formats(self, base_path='/mnt/d/GoogleDrive/Cercetasi/carti-camp-jocuri'):
- """Proceseaz toate formatele care pot fi automatizate"""
- print("="*60)
- print("UNIFIED ACTIVITY PROCESSOR - AUTOMATED PHASE")
- print("="*60)
-
- self.stats['start_time'] = time.time()
- initial_count = self.get_current_activity_count()
-
- # Afieaz statistici iniiale
- file_counts = self.count_files_to_process(base_path)
- print(f"\nFiles to process:")
- for format, count in file_counts.items():
- print(f" {format.upper()}: {count} files")
- print(f"\nCurrent activities in database: {initial_count}")
- print("-"*60)
-
- # FAZA 1: Procesare HTML (prioritate maxim - volum mare)
- print("\n[1/2] Processing HTML files...")
- print("-"*40)
- html_processed, html_errors = self.html_extractor.process_all_html_files(base_path)
- self.stats['html_processed'] = html_processed
-
- # FAZA 2: Procesare Text/MD
- print("\n[2/2] Processing Text/Markdown files...")
- print("-"*40)
- text_processed, text_saved = self.text_extractor.process_all_text_files(base_path)
- self.stats['text_processed'] = text_processed
-
- # Statistici finale
- self.stats['end_time'] = time.time()
- final_count = self.get_current_activity_count()
- self.stats['total_activities'] = final_count - initial_count
-
- # Identific fiierele care necesit procesare manual
- self.stats['pdf_to_process'] = file_counts['pdf']
- self.stats['doc_to_process'] = file_counts['doc'] + file_counts['docx']
-
- self.print_summary()
- self.save_pdf_doc_list(base_path)
-
- def print_summary(self):
- """Afieaz rezumatul procesrii"""
- print("\n" + "="*60)
- print("PROCESSING SUMMARY")
- print("="*60)
-
- duration = self.stats['end_time'] - self.stats['start_time']
-
- print(f"\nAutomated Processing Results:")
- print(f" HTML files processed: {self.stats['html_processed']}")
- print(f" Text/MD files processed: {self.stats['text_processed']}")
- print(f" New activities added: {self.stats['total_activities']}")
- print(f" Processing time: {duration:.1f} seconds")
-
- print(f"\nFiles requiring Claude processing:")
- print(f" PDF files: {self.stats['pdf_to_process']}")
- print(f" DOC/DOCX files: {self.stats['doc_to_process']}")
-
- print("\n" + "="*60)
- print("NEXT STEPS:")
- print("1. Review the file 'pdf_doc_for_claude.txt' for manual processing")
- print("2. Use Claude to extract activities from PDF/DOC files")
- print("3. Focus on largest PDF files first (highest activity density)")
- print("="*60)
-
- def save_pdf_doc_list(self, base_path):
- """Salveaz lista de PDF/DOC pentru procesare cu Claude"""
- base_path = Path(base_path)
-
- pdf_files = sorted(base_path.rglob("*.pdf"), key=lambda p: p.stat().st_size, reverse=True)
- doc_files = list(base_path.rglob("*.doc"))
- docx_files = list(base_path.rglob("*.docx"))
-
- with open('pdf_doc_for_claude.txt', 'w', encoding='utf-8') as f:
- f.write("PDF/DOC FILES FOR CLAUDE PROCESSING\n")
- f.write("="*60 + "\n")
- f.write("Files sorted by size (largest first = likely more activities)\n\n")
-
- f.write("TOP PRIORITY PDF FILES (process these first):\n")
- f.write("-"*40 + "\n")
- for i, pdf in enumerate(pdf_files[:20], 1):
- size_mb = pdf.stat().st_size / (1024*1024)
- f.write(f"{i}. {pdf.name} ({size_mb:.1f} MB)\n")
- f.write(f" Path: {pdf}\n\n")
-
- if len(pdf_files) > 20:
- f.write(f"\n... and {len(pdf_files)-20} more PDF files\n\n")
-
- f.write("\nDOC/DOCX FILES:\n")
- f.write("-"*40 + "\n")
- for doc in doc_files + docx_files:
- size_kb = doc.stat().st_size / 1024
- f.write(f"- {doc.name} ({size_kb:.1f} KB)\n")
-
- print(f"\nPDF/DOC list saved to: pdf_doc_for_claude.txt")
-
-if __name__ == "__main__":
- processor = UnifiedProcessor()
- processor.process_automated_formats()
\ No newline at end of file
diff --git a/scripts/validate_extractions.py b/scripts/validate_extractions.py
new file mode 100644
index 0000000..cdb6113
--- /dev/null
+++ b/scripts/validate_extractions.py
@@ -0,0 +1,208 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+validate_extractions.py — validate every data/extracted/*.json (plan §5b).
+
+For each extraction file it runs two checks:
+ 1. JSON-schema validation against scripts/activity_schema.json,
+ 2. the source_excerpt anti-hallucination check (each excerpt must be a fuzzy
+ substring of the chunk it came from).
+
+For every failing chunk it:
+ * writes the exact re-extraction prompt to data/extracted/_reextract/<chunk_key>.prompt.md,
+ * marks the chunk `rejected` in data/chunks/manifest.json.
+
+The orchestrator then re-launches subagents only on the `rejected` chunks; the
+loop repeats until nothing is rejected.
+
+Usage:
+ python scripts/validate_extractions.py
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import sys
+from pathlib import Path
+from typing import Optional
+
+SCRIPT_DIR = Path(__file__).resolve().parent
+REPO_ROOT = SCRIPT_DIR.parent
+for _p in (str(SCRIPT_DIR), str(REPO_ROOT)):
+ if _p not in sys.path:
+ sys.path.insert(0, _p)
+
+from import_common import ( # noqa: E402
+ DEFAULT_SCHEMA_PATH,
+ chunk_key_for,
+ excerpt_matches,
+ excerpt_score,
+ find_chunk_text,
+ iter_extraction_files,
+ load_schema,
+ validate_extraction,
+)
+
+SUBAGENT_PROMPT = SCRIPT_DIR / "SUBAGENT_PROMPT.md"
+
+
+# --------------------------------------------------------------------------
+# re-extraction prompt
+# --------------------------------------------------------------------------
+def build_reextraction_prompt(
+    chunk_key: str, chunk_file: Optional[str], errors: list[str]
+) -> str:
+    """Build the prompt text used to re-extract a rejected chunk.
+
+    The prompt lists every entry of *errors* as a bullet, points the subagent
+    at the chunk file (a fallback path is used when *chunk_file* is empty)
+    and names the extraction file to overwrite.
+    """
+    # NOTE(review): the fallback path has an empty segment ("data/chunks//");
+    # presumably a placeholder for the source directory was lost. Confirm.
+    chunk_ref = chunk_file if chunk_file else f"data/chunks//{chunk_key}.txt"
+    rules_file = SUBAGENT_PROMPT.relative_to(REPO_ROOT)
+    prompt_lines = [
+        f"# RE-EXTRACTION — chunk `{chunk_key}`",
+        "",
+        "The previous extraction for this chunk was **REJECTED**. Reasons:",
+        "",
+        *(f"- {reason}" for reason in errors),
+        "",
+        "## What to do",
+        "",
+        f"1. Read ONLY this chunk: `{chunk_ref}`",
+        f"2. Follow the extraction rules in `{rules_file}`.",
+        "3. Fix every problem listed above. In particular:",
+        " - every `source_excerpt` must be copied **verbatim** from the chunk",
+        " (it is checked as a fuzzy substring — invented quotes are rejected);",
+        " - `source_excerpt` and `page_reference` are mandatory on every activity;",
+        " - the output must validate against `scripts/activity_schema.json`.",
+        f"4. Overwrite the extraction file `data/extracted/{chunk_key}.json`.",
+        "",
+    ]
+    return "\n".join(prompt_lines)
+
+
+# --------------------------------------------------------------------------
+# manifest
+# --------------------------------------------------------------------------
+def load_manifest(manifest_path: Path) -> dict:
+    """Load the chunk manifest, always returning a dict with a ``chunks`` key.
+
+    Best-effort: a missing file, unparseable JSON, an OS-level read error or
+    a top-level JSON value that is not an object all yield a fresh
+    ``{"chunks": {}}``. A valid manifest without ``chunks`` gets an empty
+    ``chunks`` mapping added.
+    """
+    if manifest_path.is_file():
+        try:
+            data = json.loads(manifest_path.read_text(encoding="utf-8"))
+        except (json.JSONDecodeError, OSError):
+            data = None
+        # Valid JSON can still be a list/string/number; calling setdefault on
+        # it would raise AttributeError, so treat it like a corrupt manifest.
+        if isinstance(data, dict):
+            data.setdefault("chunks", {})
+            return data
+    return {"chunks": {}}
+
+
+def save_manifest(manifest: dict, manifest_path: Path) -> None:
+    """Write *manifest* to *manifest_path* as indented UTF-8 JSON.
+
+    Missing parent directories are created first. Key order is preserved and
+    non-ASCII characters are written unescaped.
+    """
+    manifest_path.parent.mkdir(parents=True, exist_ok=True)
+    serialized = json.dumps(manifest, indent=2, ensure_ascii=False)
+    manifest_path.write_text(serialized, encoding="utf-8")
+
+
+def mark_rejected(manifest: dict, chunk_key: str) -> None:
+    """Set the state of *chunk_key* to ``rejected`` in *manifest*, in place.
+
+    An entry that does not exist yet is created; other fields of an existing
+    entry are left untouched.
+    """
+    manifest["chunks"].setdefault(chunk_key, {})["state"] = "rejected"
+
+
+# --------------------------------------------------------------------------
+# validation
+# --------------------------------------------------------------------------
+def validate_file(json_path: Path, schema: dict, chunks_dir: Path) -> list[str]:
+    """Return the list of errors for one extraction file (empty == valid).
+
+    Checks run in order and stop at the first failing stage:
+      1. the file must decode as UTF-8 and parse as JSON,
+      2. it must pass ``validate_extraction`` against *schema*,
+      3. its source chunk must be found under *chunks_dir*,
+      4. every activity's ``source_excerpt`` must match the chunk text.
+    """
+    try:
+        data = json.loads(json_path.read_text(encoding="utf-8"))
+    except (json.JSONDecodeError, UnicodeDecodeError) as exc:
+        # A file that is not valid UTF-8 is reported like any other malformed
+        # file, so one bad file is rejected without aborting the whole run.
+        return [f"invalid JSON: {exc}"]
+
+    errors = validate_extraction(data, schema)
+    if errors:
+        return errors
+
+    header = data.get("header", {})
+    chunk_text = find_chunk_text(json_path, header, chunks_dir)
+    if chunk_text is None:
+        return [f"source chunk not found for {chunk_key_for(json_path, header)}"]
+
+    for adict in data.get("activities", []):
+        excerpt = adict.get("source_excerpt") or ""
+        if not excerpt_matches(excerpt, chunk_text):
+            # The score is computed only for failures, to show in the message.
+            score = excerpt_score(excerpt, chunk_text)
+            errors.append(
+                f"activity {adict.get('name')!r}: source_excerpt not found in "
+                f"chunk (best match {score:.0f}/100) — possible hallucination"
+            )
+    return errors
+
+
+def run(
+    extracted_dir: Path,
+    chunks_dir: Path,
+    manifest_path: Path,
+    schema_path: Path = DEFAULT_SCHEMA_PATH,
+) -> dict:
+    """Validate every extraction file and record the rejected chunks.
+
+    For each failing file a re-extraction prompt is written to
+    ``<extracted_dir>/_reextract/<chunk_key>.prompt.md`` and the chunk is
+    marked ``rejected`` in the manifest. The manifest is saved once at the
+    end, whether or not anything was rejected.
+
+    Returns:
+        A report dict with the counters ``total``, ``valid`` and ``rejected``
+        and the list ``rejected_chunks`` of ``{"chunk": ..., "errors": ...}``.
+    """
+    schema = load_schema(schema_path)
+    manifest = load_manifest(manifest_path)
+    reextract_dir = extracted_dir / "_reextract"
+
+    report = {"total": 0, "valid": 0, "rejected": 0, "rejected_chunks": []}
+    for json_path in iter_extraction_files(extracted_dir):
+        report["total"] += 1
+        errors = validate_file(json_path, schema, chunks_dir)
+        if not errors:
+            report["valid"] += 1
+            continue
+
+        report["rejected"] += 1
+        # A rejected file may be unparseable, not a JSON object, or carry a
+        # malformed header. Fall back to an empty header in all those cases
+        # so reporting never crashes on the files it is meant to report.
+        header = {}
+        try:
+            data = json.loads(json_path.read_text(encoding="utf-8"))
+        except (json.JSONDecodeError, UnicodeDecodeError):
+            data = None
+        if isinstance(data, dict) and isinstance(data.get("header"), dict):
+            header = data["header"]
+        chunk_key = chunk_key_for(json_path, header)
+        meta = manifest["chunks"].get(chunk_key)
+        chunk_file = meta.get("chunk_file") if meta else None
+
+        reextract_dir.mkdir(parents=True, exist_ok=True)
+        prompt = build_reextraction_prompt(chunk_key, chunk_file, errors)
+        (reextract_dir / f"{chunk_key}.prompt.md").write_text(prompt, encoding="utf-8")
+
+        mark_rejected(manifest, chunk_key)
+        report["rejected_chunks"].append({"chunk": chunk_key, "errors": errors})
+
+    save_manifest(manifest, manifest_path)
+    return report
+
+
+# --------------------------------------------------------------------------
+# CLI
+# --------------------------------------------------------------------------
+def main(argv: Optional[list[str]] = None) -> int:
+    """Command-line entry point: validate extractions and print the report.
+
+    Always returns 0, including when chunks were rejected.
+    """
+    cli = argparse.ArgumentParser(description="Validate extraction JSON files.")
+    cli.add_argument("--extracted", default="data/extracted")
+    cli.add_argument("--chunks", default="data/chunks")
+    cli.add_argument("--manifest", default="data/chunks/manifest.json")
+    cli.add_argument("--schema", default=str(DEFAULT_SCHEMA_PATH))
+    opts = cli.parse_args(argv)
+
+    report = run(
+        Path(opts.extracted),
+        Path(opts.chunks),
+        Path(opts.manifest),
+        Path(opts.schema),
+    )
+
+    print(f"extraction files : {report['total']}")
+    print(f" valid : {report['valid']}")
+    print(f" rejected : {report['rejected']}")
+    # One header line per rejected chunk, followed by its error list.
+    for rejected in report["rejected_chunks"]:
+        print(f" [rejected] {rejected['chunk']}")
+        for problem in rejected["errors"]:
+            print(f" - {problem}")
+    if report["rejected"]:
+        print(f"\nRe-extraction prompts written to {opts.extracted}/_reextract/")
+    return 0
+
+
+if __name__ == "__main__":
+ raise SystemExit(main())
diff --git a/tests/conftest.py b/tests/conftest.py
new file mode 100644
index 0000000..3e59d0e
--- /dev/null
+++ b/tests/conftest.py
@@ -0,0 +1,114 @@
+# -*- coding: utf-8 -*-
+"""
+Shared pytest fixtures for the extraction-pipeline tests.
+
+scripts/ is not a package, so it is added to sys.path here. Synthetic fixtures
+(PDF, docx, zip, HTML) are generated at runtime — no binary blobs in the repo.
+"""
+
+import sys
+import zipfile
+from pathlib import Path
+
+import pytest
+
+# This file lives in <repo>/tests/, so the repository root is two levels up.
+REPO_ROOT = Path(__file__).resolve().parents[1]
+SCRIPTS_DIR = REPO_ROOT / "scripts"
+# Prepend scripts/ (once) so its modules can be imported by bare name.
+if str(SCRIPTS_DIR) not in sys.path:
+    sys.path[:0] = [str(SCRIPTS_DIR)]
+
+
+# --------------------------------------------------------------------------
+# synthetic PDF — deliberately large to pin the "no max_pages" regression
+# --------------------------------------------------------------------------
+@pytest.fixture
+def big_pdf(tmp_path):
+    """Create a 60-page PDF in ``tmp_path`` and return its path.
+
+    Page ``n`` carries the unique token ``PDFMARK-<n>`` so a test can tell
+    exactly which pages made it through extraction.
+    """
+    from reportlab.lib.pagesizes import letter
+    from reportlab.pdfgen import canvas
+
+    pdf_path = tmp_path / "big.pdf"
+    pdf = canvas.Canvas(str(pdf_path), pagesize=letter)
+    for page_no in range(1, 61):
+        pdf.drawString(
+            72, 720, f"PDFMARK-{page_no} synthetic activity page number {page_no}"
+        )
+        pdf.drawString(72, 700, "Acest joc educativ se joaca in echipa.")
+        pdf.showPage()  # finish the current page and start the next one
+    pdf.save()
+    return pdf_path
+
+
+# --------------------------------------------------------------------------
+# synthetic docx — 100 paragraphs => 3 synthetic pages at 40 paras/page
+# --------------------------------------------------------------------------
+@pytest.fixture
+def sample_docx(tmp_path):
+    """Write a .docx holding 100 numbered paragraphs (0..99); return its path."""
+    import docx
+
+    docx_path = tmp_path / "sample.docx"
+    doc = docx.Document()
+    for idx in range(100):
+        doc.add_paragraph(f"Paragraf {idx}: continut joc team-building.")
+    doc.save(str(docx_path))
+    return docx_path
+
+
+# --------------------------------------------------------------------------
+# synthetic HTML mirror page — with nav/script/footer chrome to strip
+# --------------------------------------------------------------------------
+# The page must contain real <script>/<nav>/<footer> elements: the
+# chrome-stripping test asserts that their text ("tracking", "Games",
+# "Site Banner Junk", "toate drepturile rezervate") is absent from the
+# extracted body while the <main> content survives.
+HTML_WITH_NAV = """
+<html><head><title>Joc</title>
+<script>var tracking = "analytics";</script>
+</head>
+<body>
+<nav><a href="/index.html">Games</a> Site Banner Junk</nav>
+<main>
+<h1>Vanatoarea de comori</h1>
+<p>Acesta este un joc real de orientare pentru cercetasi.</p>
+<p>Jucatorii cauta indicii ascunse in tabara.</p>
+</main>
+<footer>toate drepturile rezervate</footer>
+</body></html>
+"""
+
+
+@pytest.fixture
+def html_with_nav(tmp_path):
+    """Write HTML_WITH_NAV to ``page.html`` (UTF-8) and return the file path."""
+    html_file = tmp_path / "page.html"
+    html_file.write_text(HTML_WITH_NAV, encoding="utf-8")
+    return html_file
+
+
+# --------------------------------------------------------------------------
+# synthetic zip — contains a docx and a stray junk file
+# --------------------------------------------------------------------------
+@pytest.fixture
+def sample_zip(tmp_path, sample_docx):
+    """Build ``archive.zip`` holding the sample docx plus a junk ``desktop.ini``.
+
+    The docx is stored under ``inner/`` so extraction has to descend into a
+    sub-directory of the archive.
+    """
+    zip_path = tmp_path / "archive.zip"
+    with zipfile.ZipFile(zip_path, "w") as archive:
+        archive.write(sample_docx, arcname="inner/sample.docx")
+        archive.writestr("desktop.ini", "junk")
+    return zip_path
+
+
+# --------------------------------------------------------------------------
+# synthetic normalized source — paginated, with an activity straddling a
+# page boundary so the chunker overlap can be verified.
+# --------------------------------------------------------------------------
+@pytest.fixture
+def paginated_source(tmp_path):
+    """Write a 50-page normalized source file and return its path.
+
+    The file starts with a SOURCE/CONVERTED/FORMAT header and a ruler line;
+    every page is a ``--- PAGE n ---`` marker, one content line and a blank
+    line. Pages 20 and 21 together hold one activity (ACTIVITY-START on 20,
+    ACTIVITY-END on 21).
+    """
+    # Pages whose content differs from the generic filler line.
+    special_pages = {
+        20: "ACTIVITY-START jocul podului care traverseaza pagina",
+        21: "continuare a jocului podului ACTIVITY-END",
+    }
+    out = [
+        "SOURCE: synthetic/test.pdf",
+        "CONVERTED: 2026-05-19",
+        "FORMAT: pdf",
+        "=" * 50,
+        "",
+    ]
+    for page_no in range(1, 51):
+        content = special_pages.get(page_no, f"continut obisnuit pe pagina {page_no}")
+        out.extend([f"--- PAGE {page_no} ---", content, ""])
+    source_file = tmp_path / "src_paginated.txt"
+    source_file.write_text("\n".join(out), encoding="utf-8")
+    return source_file
diff --git a/tests/fixtures/.gitkeep b/tests/fixtures/.gitkeep
new file mode 100644
index 0000000..f016cdb
--- /dev/null
+++ b/tests/fixtures/.gitkeep
@@ -0,0 +1,3 @@
+# Test fixtures (synthetic PDF/docx/zip/HTML) are generated at runtime by
+# tests/conftest.py — no binary blobs are committed. This file only preserves
+# the directory in git.
diff --git a/tests/test_build_database.py b/tests/test_build_database.py
new file mode 100644
index 0000000..e4a5e14
--- /dev/null
+++ b/tests/test_build_database.py
@@ -0,0 +1,334 @@
+# -*- coding: utf-8 -*-
+"""
+Tests for scripts/build_database.py — the import / dedup / swap side.
+
+Covers: category -> slug + `altele` fallback; dedup across all three threshold
+bands; EN != RO never merged; field combination on merge; atomic swap with a
+simulated mid-build crash; the source_excerpt substring check.
+"""
+
+import json
+import os
+import sys
+from pathlib import Path
+
+import pytest
+
+# This file lives in <repo>/tests/, so the repository root is two levels up.
+REPO_ROOT = Path(__file__).resolve().parents[1]
+SCRIPTS_DIR = REPO_ROOT / "scripts"
+# Both the repo root (for `app.*`) and scripts/ (for `build_database`,
+# `import_common`) must be importable; each is prepended at most once.
+for _p in map(str, (REPO_ROOT, SCRIPTS_DIR)):
+    if _p not in sys.path:
+        sys.path.insert(0, _p)
+
+import build_database as bd # noqa: E402
+from app.models.activity import Activity # noqa: E402
+from app.models.database import DatabaseManager # noqa: E402
+
+
+# --------------------------------------------------------------------------
+# helpers
+# --------------------------------------------------------------------------
+def _activity(**over):
+    """Return an Activity built from baseline test values, overridden by ``over``."""
+    defaults = {
+        "name": "Jocul testului",
+        "description": "O activitate de echipa in aer liber.",
+        "category": "team-building",
+        "content_type": "joc",
+        "language": "ro",
+        "extraction_confidence": "high",
+    }
+    return Activity(**{**defaults, **over})
+
+
+def _ext_activity(**over):
+    """Return an extraction-JSON activity dict; ``over`` replaces or adds keys.
+
+    The baseline is intended to be schema-valid (it is what the tests feed to
+    the importer as a well-formed activity).
+    """
+    defaults = {
+        "name": "Jocul testului",
+        "description": "O activitate de echipa in aer liber.",
+        "category": "team-building",
+        "content_type": "joc",
+        "language": "ro",
+        "extraction_confidence": "high",
+        "source_excerpt": "ANCHOR-EXCERPT despre jocul testului",
+        "page_reference": "page 1",
+    }
+    return {**defaults, **over}
+
+
+def _write_extraction(extracted_dir, chunk_key, activities, source_id="src01"):
+    """Write ``<chunk_key>.json`` (header + activities) into ``extracted_dir``.
+
+    The directory is created if missing. The header uses fixed hash/version
+    values and echoes ``source_id`` and ``chunk_key``.
+    """
+    header = {
+        "source_hash": "hash1234deadbeef",
+        "schema_version": "1.0",
+        "prompt_version": "1.0",
+        "chunk_range": "pages 1-20",
+        "source_id": source_id,
+        "chunk_key": chunk_key,
+    }
+    document = {"header": header, "activities": activities}
+    extracted_dir.mkdir(parents=True, exist_ok=True)
+    target = extracted_dir / f"{chunk_key}.json"
+    target.write_text(json.dumps(document, ensure_ascii=False), encoding="utf-8")
+
+
+def _write_chunk(chunks_dir, source_id, chunk_key, text):
+    """Write ``text`` to ``<chunks_dir>/<source_id>/<chunk_key>.txt`` as UTF-8."""
+    source_dir = chunks_dir / source_id
+    source_dir.mkdir(parents=True, exist_ok=True)
+    chunk_file = source_dir / f"{chunk_key}.txt"
+    chunk_file.write_text(text, encoding="utf-8")
+
+
+# --------------------------------------------------------------------------
+# step 3 — category normalization
+# --------------------------------------------------------------------------
+def test_category_alias_mapped_to_slug():
+    """The alias 'teambuilding' is normalized to the 'team-building' slug."""
+    raw = _ext_activity(category="teambuilding")
+    converted = bd.dict_to_activity(raw, "s.txt")
+    assert converted.category == "team-building"
+
+
+def test_unknown_category_falls_back_to_altele():
+    """A category that matches nothing ends up as the 'altele' fallback slug."""
+    raw = _ext_activity(category="zzz-not-a-category")
+    converted = bd.dict_to_activity(raw, "s.txt")
+    assert converted.category == "altele"
+
+
+def test_content_type_normalized():
+    """The content type 'games' is normalized to the 'joc' slug."""
+    raw = _ext_activity(content_type="games")
+    converted = bd.dict_to_activity(raw, "s.txt")
+    assert converted.content_type == "joc"
+
+
+# --------------------------------------------------------------------------
+# step 4 — dedup, three bands
+# --------------------------------------------------------------------------
+def test_dedup_auto_merge_identical_descriptions():
+    """>= 85 similar -> a single merged row, not flagged for review."""
+    shared_desc = "copiii formeaza echipe si traverseaza terenul"
+    pair = [_activity(description=shared_desc), _activity(description=shared_desc)]
+    merged, stats = bd.dedup_activities(pair)
+    assert len(merged) == 1
+    assert stats["auto_merged"] == 1
+    assert merged[0].needs_review == 0
+
+
+def test_dedup_borderline_keeps_both_and_flags_needs_review():
+    """60-85 similar -> both rows kept, both flagged needs_review."""
+    from rapidfuzz import fuzz
+
+    short_desc = "alpha beta gamma delta epsilon"
+    long_desc = "alpha beta gamma delta epsilon zeta eta theta iota"
+    # Guard the fixture itself: the pair must really score in the middle band.
+    score = fuzz.token_sort_ratio(short_desc, long_desc)
+    assert 60.0 <= score < 85.0, f"precondition: score={score} not borderline"
+
+    candidates = [_activity(description=short_desc), _activity(description=long_desc)]
+    kept, stats = bd.dedup_activities(candidates)
+    assert len(kept) == 2
+    assert stats["borderline"] == 2
+    assert all(act.needs_review == 1 for act in kept)
+
+
+def test_dedup_low_similarity_kept_as_separate_variants():
+    """< 60 similar -> two separate variants, neither flagged needs_review."""
+    from rapidfuzz import fuzz
+
+    first_desc = "alpha beta gamma delta epsilon"
+    second_desc = "quebec romeo sierra tango uniform victor whiskey"
+    # Guard the fixture itself: the pair must really score in the low band.
+    assert fuzz.token_sort_ratio(first_desc, second_desc) < 60.0
+
+    candidates = [_activity(description=first_desc), _activity(description=second_desc)]
+    kept, stats = bd.dedup_activities(candidates)
+    assert len(kept) == 2
+    assert stats["auto_merged"] == 0
+    assert all(act.needs_review == 0 for act in kept)
+
+
+def test_dedup_never_merges_across_languages():
+    """Same name + same description but RO vs EN -> two distinct rows."""
+    shared = {"name": "Cursa", "description": "children form teams and cross the field"}
+    romanian = _activity(language="ro", **shared)
+    english = _activity(language="en", **shared)
+    kept, stats = bd.dedup_activities([romanian, english])
+    assert len(kept) == 2
+    assert stats["auto_merged"] == 0
+    assert {act.language for act in kept} == {"ro", "en"}
+
+
+def test_merge_combines_fields():
+    """On merge: longest rules win; materials, keywords and sources are unioned."""
+
+    def _csv_set(value):
+        # Compare comma-separated fields as sets, ignoring order and padding.
+        return {part.strip() for part in value.split(",")}
+
+    shared_desc = "copiii formeaza echipe si traverseaza terenul cu obstacole"
+    long_rules = "o regula mult mai lunga si mai detaliata pentru joc"
+    first = _activity(
+        description=shared_desc,
+        rules="regula scurta",
+        materials_list="franghie, esarfa",
+        source_file="a.txt",
+        keywords="echipa",
+    )
+    second = _activity(
+        description=shared_desc,
+        rules=long_rules,
+        materials_list="busola, esarfa",
+        source_file="b.txt",
+        keywords="cooperare",
+    )
+    result, _ = bd.dedup_activities([first, second])
+    assert len(result) == 1
+    merged = result[0]
+    assert merged.rules == long_rules
+    assert _csv_set(merged.materials_list) == {"franghie", "esarfa", "busola"}
+    assert set(merged.source_files) == {"a.txt", "b.txt"}
+    assert merged.popularity_score == 1
+    assert _csv_set(merged.keywords) == {"echipa", "cooperare"}
+
+
+# --------------------------------------------------------------------------
+# step 5 — review decisions
+# --------------------------------------------------------------------------
+def test_review_decision_drop_removes_row():
+    """A 'drop' decision keyed by the activity's content key removes the row."""
+    from import_common import content_key, normalize_name
+
+    activity = _activity(description="o descriere de test")
+    decision_key = content_key(
+        normalize_name(activity.name), activity.language, activity.description
+    )
+    decisions = {decision_key: {"decision": "drop"}}
+    kept, stats = bd.apply_review_decisions([activity], decisions)
+    assert kept == []
+    assert stats["dropped"] == 1
+
+
+def test_review_decision_keep_separate_clears_needs_review():
+    """A 'keep-separate' decision keeps the row and resets needs_review to 0."""
+    from import_common import content_key, normalize_name
+
+    activity = _activity(description="o descriere de test")
+    activity.needs_review = 1
+    decision_key = content_key(
+        normalize_name(activity.name), activity.language, activity.description
+    )
+    decisions = {decision_key: {"decision": "keep-separate"}}
+    kept, stats = bd.apply_review_decisions([activity], decisions)
+    assert len(kept) == 1 and kept[0].needs_review == 0
+    assert stats["resolved"] == 1
+
+
+# --------------------------------------------------------------------------
+# step 2b — source_excerpt hallucination check
+# --------------------------------------------------------------------------
+def test_hallucinated_excerpt_activity_dropped(tmp_path):
+    """An activity whose source_excerpt is absent from its chunk is dropped."""
+    from import_common import load_schema
+
+    extracted_dir = tmp_path / "extracted"
+    chunks_dir = tmp_path / "chunks"
+    sources_dir = tmp_path / "sources"
+
+    # `real` quotes the chunk text verbatim; `invented` quotes nothing in it.
+    real = _ext_activity(
+        name="Joc real", source_excerpt="textul real apare in bucata sursa"
+    )
+    invented = _ext_activity(
+        name="Joc inventat",
+        source_excerpt="acest citat nu exista nicaieri in sursa originala xyzzy",
+    )
+    _write_extraction(extracted_dir, "src01.part01", [real, invented])
+    _write_chunk(
+        chunks_dir,
+        "src01",
+        "src01.part01",
+        "--- PAGE 1 ---\ntextul real apare in bucata sursa pentru jocul real.\n",
+    )
+
+    outcome = bd.collect_activities(
+        extracted_dir, chunks_dir, sources_dir, load_schema()
+    )
+    assert {act.name for act in outcome["activities"]} == {"Joc real"}
+    assert outcome["activities_hallucinated"] == 1
+    assert (extracted_dir / "_rejected").exists()
+
+
+def test_schema_invalid_file_moved_to_rejected(tmp_path):
+    """A schema-invalid file is moved to ``_rejected/`` with an errors report."""
+    from import_common import load_schema
+
+    extracted_dir = tmp_path / "extracted"
+    chunks_dir = tmp_path / "chunks"
+    sources_dir = tmp_path / "sources"
+    extracted_dir.mkdir(parents=True)
+
+    # Empty header (required keys missing) plus an activity with only a name.
+    invalid_payload = {"header": {}, "activities": [{"name": "x"}]}
+    bad_file = extracted_dir / "bad.json"
+    bad_file.write_text(json.dumps(invalid_payload), encoding="utf-8")
+
+    outcome = bd.collect_activities(
+        extracted_dir, chunks_dir, sources_dir, load_schema()
+    )
+    rejected_dir = extracted_dir / "_rejected"
+    assert outcome["files_rejected_schema"] == 1
+    assert not bad_file.exists()
+    assert (rejected_dir / "bad.json").exists()
+    assert (rejected_dir / "bad.errors.txt").exists()
+
+
+# --------------------------------------------------------------------------
+# end-to-end rebuild + atomic swap
+# --------------------------------------------------------------------------
+def _setup_corpus(tmp_path):
+    """Create a one-activity corpus; return ``(extracted, chunks, sources)``.
+
+    The activity's source_excerpt is a verbatim substring of the chunk text
+    written alongside it. The ``sources`` directory is returned but not created.
+    """
+    excerpt = "jocul testului este o activitate de echipa"
+    extracted_dir = tmp_path / "extracted"
+    chunks_dir = tmp_path / "chunks"
+    sources_dir = tmp_path / "sources"
+    _write_extraction(
+        extracted_dir, "src01.part01", [_ext_activity(source_excerpt=excerpt)]
+    )
+    chunk_text = f"--- PAGE 1 ---\n{excerpt} in aer liber.\n"
+    _write_chunk(chunks_dir, "src01", "src01.part01", chunk_text)
+    return extracted_dir, chunks_dir, sources_dir
+
+
+def test_rebuild_creates_database(tmp_path):
+    """rebuild() writes a DB file containing the corpus' single activity."""
+    extracted_dir, chunks_dir, sources_dir = _setup_corpus(tmp_path)
+    target_db = tmp_path / "activities.db"
+
+    report = bd.rebuild(
+        extracted_dir=extracted_dir,
+        chunks_dir=chunks_dir,
+        sources_dir=sources_dir,
+        db_path=target_db,
+    )
+    assert target_db.exists()
+    assert report["final_count"] == 1
+
+    # Re-open the file independently and read the row back.
+    found = DatabaseManager(str(target_db)).search_activities()
+    assert len(found) == 1
+    assert found[0]["category"] == "team-building"
+
+
+def test_atomic_swap_keeps_live_db_intact_on_crash(tmp_path, monkeypatch):
+    """A mid-build crash must leave the live DB byte-identical."""
+    extracted_dir, chunks_dir, sources_dir = _setup_corpus(tmp_path)
+    live_db_path = tmp_path / "activities.db"
+
+    # Seed a live DB with a sentinel row and snapshot its bytes.
+    DatabaseManager(str(live_db_path)).insert_activity(_activity(name="Sentinel viu"))
+    snapshot = live_db_path.read_bytes()
+
+    def explode(self, *args, **kwargs):
+        raise RuntimeError("simulated mid-build crash")
+
+    # Make the bulk insert fail so rebuild() aborts part-way through.
+    monkeypatch.setattr(DatabaseManager, "bulk_insert_activities", explode)
+
+    with pytest.raises(RuntimeError, match="simulated mid-build crash"):
+        bd.rebuild(
+            extracted_dir=extracted_dir,
+            chunks_dir=chunks_dir,
+            sources_dir=sources_dir,
+            db_path=live_db_path,
+        )
+
+    # Live DB untouched, temporary build file cleaned up.
+    assert live_db_path.read_bytes() == snapshot
+    assert not (tmp_path / "activities.db.tmp").exists()
+
+
+def test_rebuild_backs_up_live_db(tmp_path):
+    """Rebuilding over an existing DB reports an ``activities.db.bak`` backup."""
+    extracted_dir, chunks_dir, sources_dir = _setup_corpus(tmp_path)
+    live_db_path = tmp_path / "activities.db"
+    # A pre-existing live DB is what triggers the backup.
+    DatabaseManager(str(live_db_path)).insert_activity(_activity(name="Vechi"))
+
+    report = bd.rebuild(
+        extracted_dir=extracted_dir,
+        chunks_dir=chunks_dir,
+        sources_dir=sources_dir,
+        db_path=live_db_path,
+    )
+    backup = report["backup"]
+    assert backup is not None
+    assert Path(backup).exists()
+    assert os.path.basename(backup) == "activities.db.bak"
diff --git a/tests/test_chunk_sources.py b/tests/test_chunk_sources.py
new file mode 100644
index 0000000..1b6b5e5
--- /dev/null
+++ b/tests/test_chunk_sources.py
@@ -0,0 +1,183 @@
+# -*- coding: utf-8 -*-
+"""Tests for scripts/chunk_sources.py."""
+
+import json
+
+import chunk_sources as cs
+import normalize_sources as ns
+
+
+def _pages(n):
+    """Return ``n`` synthetic ``(page_number, text)`` pairs, numbered from 1."""
+    numbers = range(1, n + 1)
+    return list(zip(numbers, (f"text-{k}" for k in numbers)))
+
+
+# --------------------------------------------------------------------------
+# header parsing
+# --------------------------------------------------------------------------
+def test_parse_source_splits_header_and_body(paginated_source):
+    """parse_source yields the header fields and a body starting at page 1."""
+    raw = paginated_source.read_text(encoding="utf-8")
+    header, body = cs.parse_source(raw)
+    assert header["FORMAT"] == "pdf"
+    assert body.lstrip().startswith("--- PAGE 1 ---")
+
+
+# --------------------------------------------------------------------------
+# page chunking
+# --------------------------------------------------------------------------
+def test_chunk_pages_basic_split():
+    """50 pages at 20 per chunk with overlap 4 -> chunks start every 16 pages."""
+    chunks = cs.chunk_pages(_pages(50), pages_per_chunk=20, overlap=4)
+    first, second, last = chunks[0], chunks[1], chunks[-1]
+    # stride 16: starts at pages 1, 17, 33, ...
+    assert first["page_start"] == 1 and first["page_end"] == 20
+    assert second["page_start"] == 17
+    assert last["page_end"] == 50
+
+
+def test_chunk_pages_have_overlap():
+    """Consecutive chunks share exactly ``overlap`` pages."""
+    chunks = cs.chunk_pages(_pages(50), pages_per_chunk=20, overlap=4)
+    # Pages shared by chunk 0 and chunk 1, counted inclusively.
+    shared_pages = chunks[0]["page_end"] - chunks[1]["page_start"] + 1
+    assert shared_pages == 4
+
+
+def test_chunk_pages_short_document_single_chunk():
+    """A document shorter than one chunk yields a single chunk covering it all."""
+    chunks = cs.chunk_pages(_pages(8), pages_per_chunk=20, overlap=4)
+    assert len(chunks) == 1
+    only = chunks[0]
+    assert only["page_start"] == 1 and only["page_end"] == 8
+
+
+def test_chunk_pages_empty():
+    """No pages in -> no chunks out."""
+    no_pages = []
+    assert cs.chunk_pages(no_pages) == []
+
+
+def test_activity_at_page_boundary_intact_in_one_chunk(paginated_source):
+    """An activity straddling the page 20/21 boundary must appear whole in >=1 chunk."""
+    raw = paginated_source.read_text(encoding="utf-8")
+    markers = ("ACTIVITY-START", "ACTIVITY-END")
+    # A chunk holds the whole activity only if it contains both markers.
+    whole = [
+        chunk
+        for chunk in cs.make_chunks(raw)
+        if all(marker in chunk["text"] for marker in markers)
+    ]
+    assert whole, "activity spanning a page boundary was split across all chunks"
+
+
+# --------------------------------------------------------------------------
+# word-window chunking for unpaginated text
+# --------------------------------------------------------------------------
+def test_chunk_words_window_and_overlap():
+    """25000 words, window 10000, overlap 2000 -> 3 chunks sharing 2000 words."""
+    corpus = " ".join(f"w{i}" for i in range(25_000))
+    chunks = cs.chunk_words(corpus, window=10_000, overlap=2_000)
+    assert len(chunks) == 3  # stride 8000 over 25000 words
+    head_words = chunks[0]["text"].split()
+    next_words = chunks[1]["text"].split()
+    # The tail of chunk 0 is repeated as the head of chunk 1.
+    assert head_words[8_000:10_000] == next_words[0:2_000]
+
+
+def test_make_chunks_unpaginated_uses_word_windows():
+    """A body without page markers is chunked by word ranges."""
+    header = "SOURCE: x\nFORMAT: txt\n" + "=" * 50 + "\n\n"
+    unpaginated_body = "cuvant " * 15_000
+    chunks = cs.make_chunks(header + unpaginated_body)
+    assert len(chunks) >= 2
+    assert chunks[0]["chunk_range"].startswith("words")
+
+
+# --------------------------------------------------------------------------
+# stable source ids — anti-collision
+# --------------------------------------------------------------------------
+def test_stable_id_same_stem_different_path_no_collision():
+    """Files sharing a stem but not a directory get distinct '<...>_scout' ids."""
+    camp_id = ns.stable_id("camp/games/scout.pdf")
+    school_id = ns.stable_id("school/lessons/scout.pdf")
+    assert camp_id != school_id
+    assert camp_id.endswith("_scout") and school_id.endswith("_scout")
+
+
+def test_stable_id_deterministic():
+    """The same path always maps to the same id."""
+    first_call = ns.stable_id("a/b/c.pdf")
+    second_call = ns.stable_id("a/b/c.pdf")
+    assert first_call == second_call
+
+
+# --------------------------------------------------------------------------
+# manifest registry + idempotency
+# --------------------------------------------------------------------------
+def test_run_writes_chunks_and_manifest(paginated_source, tmp_path):
+    """cs.run writes chunk files and registers every chunk as 'pending'."""
+    sources_dir = tmp_path / "sources"
+    sources_dir.mkdir()
+    # Copy the fixture into the directory cs.run scans.
+    (sources_dir / paginated_source.name).write_text(
+        paginated_source.read_text(encoding="utf-8"), encoding="utf-8"
+    )
+    chunks_dir = tmp_path / "chunks"
+
+    summary = cs.run(sources_dir, chunks_dir)
+    assert summary["sources"] == 1
+    assert summary["chunks"] >= 2
+
+    # Decode explicitly: JSON text is UTF-8, and read_text() without an
+    # encoding would use the platform's locale default instead.
+    manifest = json.loads(
+        (chunks_dir / "manifest.json").read_text(encoding="utf-8")
+    )
+    assert manifest["chunks"]
+    for key, meta in manifest["chunks"].items():
+        assert meta["state"] == "pending"
+        assert meta["expected_json"] == f"{key}.json"
+        # chunk_file is resolved against the parent of chunks_dir.
+        assert (chunks_dir.parent / meta["chunk_file"]).exists()
+
+
+def test_manifest_idempotent_preserves_state(paginated_source, tmp_path):
+    """Re-running cs.run on an unchanged source keeps chunk states and count."""
+    sources_dir = tmp_path / "sources"
+    sources_dir.mkdir()
+    # Copy the fixture into the directory cs.run scans.
+    (sources_dir / paginated_source.name).write_text(
+        paginated_source.read_text(encoding="utf-8"), encoding="utf-8"
+    )
+    chunks_dir = tmp_path / "chunks"
+    manifest_path = chunks_dir / "manifest.json"
+
+    cs.run(sources_dir, chunks_dir)
+
+    # orchestrator marks one chunk done
+    # (read with the same explicit UTF-8 encoding this test writes with,
+    # rather than the platform's locale default)
+    manifest = json.loads(manifest_path.read_text(encoding="utf-8"))
+    first_key = next(iter(manifest["chunks"]))
+    n_before = len(manifest["chunks"])
+    manifest["chunks"][first_key]["state"] = "done"
+    manifest_path.write_text(json.dumps(manifest), encoding="utf-8")
+
+    # re-run: 'done' must survive, no chunk added or lost
+    cs.run(sources_dir, chunks_dir)
+    manifest2 = json.loads(manifest_path.read_text(encoding="utf-8"))
+    assert len(manifest2["chunks"]) == n_before
+    assert manifest2["chunks"][first_key]["state"] == "done"
+    assert all(
+        m["state"] in ("pending", "done") for m in manifest2["chunks"].values()
+    )
+
+
+def test_manifest_resets_state_when_source_changes(paginated_source, tmp_path):
+    """Editing a source after chunking puts its 'done' chunk back to 'pending'."""
+    sources_dir = tmp_path / "sources"
+    sources_dir.mkdir()
+    src = sources_dir / paginated_source.name
+    src.write_text(paginated_source.read_text(encoding="utf-8"), encoding="utf-8")
+    chunks_dir = tmp_path / "chunks"
+    manifest_path = chunks_dir / "manifest.json"
+
+    cs.run(sources_dir, chunks_dir)
+    # Read with the same explicit UTF-8 encoding this test writes with,
+    # rather than the platform's locale default.
+    manifest = json.loads(manifest_path.read_text(encoding="utf-8"))
+    first_key = next(iter(manifest["chunks"]))
+    manifest["chunks"][first_key]["state"] = "done"
+    manifest_path.write_text(json.dumps(manifest), encoding="utf-8")
+
+    # mutate the source content -> hash changes -> state resets
+    src.write_text(
+        src.read_text(encoding="utf-8") + "\n--- PAGE 51 ---\nextra\n",
+        encoding="utf-8",
+    )
+    cs.run(sources_dir, chunks_dir)
+    manifest2 = json.loads(manifest_path.read_text(encoding="utf-8"))
+    assert manifest2["chunks"][first_key]["state"] == "pending"
+
+
+def test_prune_stale_removes_orphan_entries(paginated_source, tmp_path):
+    """Deleting a source makes the next run prune its manifest entries."""
+    sources_dir = tmp_path / "sources"
+    sources_dir.mkdir()
+    src = sources_dir / paginated_source.name
+    src.write_text(paginated_source.read_text(encoding="utf-8"), encoding="utf-8")
+    chunks_dir = tmp_path / "chunks"
+
+    cs.run(sources_dir, chunks_dir)
+    # delete the source -> its chunks become stale
+    src.unlink()
+    summary = cs.run(sources_dir, chunks_dir)
+    assert summary["chunks"] == 0
+    assert summary["pruned"] >= 1
+    # Decode explicitly as UTF-8 instead of relying on the locale default.
+    manifest = json.loads(
+        (chunks_dir / "manifest.json").read_text(encoding="utf-8")
+    )
+    assert manifest["chunks"] == {}
diff --git a/tests/test_extract_common.py b/tests/test_extract_common.py
new file mode 100644
index 0000000..17dedee
--- /dev/null
+++ b/tests/test_extract_common.py
@@ -0,0 +1,177 @@
+# -*- coding: utf-8 -*-
+"""Tests for scripts/extract_common.py."""
+
+import shutil
+import zipfile
+
+import pytest
+
+import extract_common as ec
+
+
+# --------------------------------------------------------------------------
+# format detection
+# --------------------------------------------------------------------------
+def test_detect_format():
+    """detect_format maps extensions (case-insensitively) to format names."""
+    expectations = (
+        ("a/b/file.PDF", "pdf"),
+        ("x.docx", "docx"),
+        ("x.doc", "doc"),
+        ("x.pptx", "pptx"),
+        ("x.html", "html"),
+        ("x.zip", "zip"),
+        ("x.epub", "epub"),
+        ("x.xyz", "unknown"),
+    )
+    for filename, expected in expectations:
+        assert ec.detect_format(filename) == expected
+
+
+def test_is_junk():
+    """desktop.ini, *.bak and README.md count as junk; a real PDF does not."""
+    for junk_name in ("some/desktop.ini", "notes.bak", "README.md"):
+        assert ec.is_junk(junk_name)
+    assert not ec.is_junk("1000 Scout Games.pdf")
+
+
+# --------------------------------------------------------------------------
+# PDF — the critical "no max_pages" regression
+# --------------------------------------------------------------------------
+def test_pdf_extracts_all_60_pages(big_pdf):
+    """All 60 pages of the synthetic PDF appear in the extracted body."""
+    extracted = ec.extract_pdf(big_pdf)
+    # the old converter capped at 50 pages — page 60 must be present now
+    for expected in ("--- PAGE 60 ---", "PDFMARK-60"):
+        assert expected in extracted
+    assert ec.count_page_markers(extracted) == 60
+
+
+def test_pdf_does_not_truncate_mid_document(big_pdf):
+    """The last split page carries the number of the real last page (60)."""
+    pages = ec.split_pages(ec.extract_pdf(big_pdf))
+    last_page_number = pages[-1][0]
+    assert last_page_number == 60  # last marker is the real last page
+
+
+# --------------------------------------------------------------------------
+# page join / split round-trip
+# --------------------------------------------------------------------------
+def test_join_split_round_trip():
+    """split_pages(join_pages(texts)) gives the texts back, numbered from 1."""
+    texts = ["alpha", "beta", "gamma"]
+    pages = ec.split_pages(ec.join_pages(texts))
+    assert [number for number, _ in pages] == [1, 2, 3]
+    assert [text for _, text in pages] == ["alpha", "beta", "gamma"]
+
+
+def test_split_pages_no_markers_returns_empty():
+    """Text without any page marker splits into no pages."""
+    unmarked = "plain text with no markers"
+    assert ec.split_pages(unmarked) == []
+
+
+# --------------------------------------------------------------------------
+# docx — synthetic page markers
+# --------------------------------------------------------------------------
+def test_docx_synthetic_page_markers(sample_docx):
+    """The 100-paragraph docx gets 3 page markers and keeps its last paragraph."""
+    extracted = ec.extract_docx(sample_docx)
+    # 100 paragraphs / 40 per page => 3 pages
+    marker_count = ec.count_page_markers(extracted)
+    assert marker_count == 3
+    assert "Paragraf 99" in extracted
+
+
+# --------------------------------------------------------------------------
+# HTML mirror — nav/script/footer stripped
+# --------------------------------------------------------------------------
+def test_html_strips_chrome(html_with_nav):
+    """Page content survives extraction; chrome text does not."""
+    extracted = ec.extract_html(html_with_nav)
+    for kept in ("Vanatoarea de comori", "joc real de orientare"):
+        assert kept in extracted
+    # chrome must be gone
+    for stripped in (
+        "tracking",
+        "Site Banner Junk",
+        "toate drepturile rezervate",
+        "Games",
+    ):
+        assert stripped not in extracted
+
+
+# --------------------------------------------------------------------------
+# content hash + near-duplicate elimination
+# --------------------------------------------------------------------------
+def test_content_hash_ignores_whitespace():
+    """Whitespace-only differences hash equal; different words hash differently."""
+    reference = ec.content_hash("hello world")
+    assert reference == ec.content_hash("hello world\n")
+    assert reference != ec.content_hash("goodbye world")
+
+
+def test_dedupe_exact_duplicates():
+    """Of two identical texts only the first survives; order is preserved."""
+    entries = [("a", "joc identic"), ("b", "joc identic"), ("c", "alt joc")]
+    surviving_keys = [key for key, _ in ec.dedupe_texts(entries)]
+    assert surviving_keys == ["a", "c"]
+
+
+def test_dedupe_near_duplicates():
+    """A near-identical 'print' copy is dropped; unrelated text is kept."""
+    original = "Vanatoarea de comori este un joc de orientare pentru cercetasi in tabara."
+    printed = original + " Pagina printata."  # >95% similar
+    entries = [
+        ("orig", original),
+        ("print", printed),
+        ("other", "Cu totul alt continut diferit aici."),
+    ]
+    surviving_keys = [key for key, _ in ec.dedupe_texts(entries, threshold=85.0)]
+    assert "orig" in surviving_keys
+    assert "print" not in surviving_keys
+    assert "other" in surviving_keys
+
+
+# --------------------------------------------------------------------------
+# zip recursion
+# --------------------------------------------------------------------------
+def test_zip_recurses_into_inner_files(sample_zip):
+    """Text of the docx stored inside the zip shows up in the extracted body."""
+    extracted = ec.extract_zip(sample_zip)
+    assert "Paragraf 0" in extracted
+    marker_count = ec.count_page_markers(extracted)
+    assert marker_count > 0
+
+
+def test_zip_bad_archive_returns_empty(tmp_path):
+    """A file that is not a valid zip archive extracts to the empty string."""
+    broken_zip = tmp_path / "broken.zip"
+    broken_zip.write_text("not a zip", encoding="utf-8")
+    extracted = ec.extract_zip(broken_zip)
+    assert extracted == ""
+
+
+def test_nested_zip(tmp_path, sample_zip):
+    """A zip stored inside another zip is still extracted."""
+    outer_zip = tmp_path / "outer.zip"
+    with zipfile.ZipFile(outer_zip, "w") as archive:
+        archive.write(sample_zip, arcname="nested/archive.zip")
+    extracted = ec.extract_zip(outer_zip)
+    assert "Paragraf 0" in extracted
+
+
+# --------------------------------------------------------------------------
+# preflight
+# --------------------------------------------------------------------------
+def test_preflight_python_packages_present():
+    """preflight() reports no missing Python packages in the test environment."""
+    missing = ec.preflight()["missing_python"]
+    # all required packages are installed in the test environment
+    assert missing == []
+
+
+def test_preflight_reports_libreoffice_state():
+    """preflight() warns about libreoffice exactly when no binary is on PATH."""
+    report = ec.preflight()
+    installed = bool(shutil.which("libreoffice") or shutil.which("soffice"))
+    warned = any("libreoffice" in warning for warning in report["warnings"])
+    if installed:
+        assert not warned
+    else:
+        assert warned
+
+
+def test_preflight_ocr_flag():
+    """With check_ocr=True, an absent tesseract is listed under missing_system.
+
+    Nothing is asserted when tesseract is installed.
+    """
+    report = ec.preflight(check_ocr=True)
+    tesseract_installed = bool(shutil.which("tesseract"))
+    if tesseract_installed:
+        return
+    assert any("tesseract" in entry for entry in report["missing_system"])
+
+
+# --------------------------------------------------------------------------
+# legacy .doc — skipped unless libreoffice is installed
+# --------------------------------------------------------------------------
+@pytest.mark.skipif(
+    not (shutil.which("libreoffice") or shutil.which("soffice")),
+    reason="libreoffice not installed",
+)
+def test_doc_conversion(tmp_path, sample_docx):
+    """extract_doc on a docx renamed to .doc yields at least one page marker."""
+    legacy_path = tmp_path / "legacy.doc"
+    # The input is docx content under a .doc name: smoke test of the docx path.
+    shutil.copy(sample_docx, legacy_path)
+    extracted = ec.extract_doc(legacy_path)
+    assert ec.count_page_markers(extracted) >= 1
+
+
+def test_doc_without_libreoffice_raises(tmp_path, monkeypatch):
+    """extract_doc raises RuntimeError when no libreoffice binary is found."""
+    # Stub shutil.which so every lookup misses. The stub accepts any call
+    # shape (e.g. mode= / path= keywords) so it cannot itself raise TypeError
+    # and mask the RuntimeError under test. monkeypatch undoes it afterwards.
+    monkeypatch.setattr(ec.shutil, "which", lambda *_args, **_kwargs: None)
+    with pytest.raises(RuntimeError):
+        ec.extract_doc(tmp_path / "whatever.doc")
diff --git a/tests/test_fts.py b/tests/test_fts.py
new file mode 100644
index 0000000..14e627f
--- /dev/null
+++ b/tests/test_fts.py
@@ -0,0 +1,139 @@
+"""
+Integration tests for the FTS5 search index.
+
+Confirms that materials_list and skills_developed are indexed by FTS5 and kept
+in sync by the insert / update / delete triggers (plan §6, §7).
+"""
+
+import os
+import sys
+import json
+
+import pytest
+
+# Make the project root (the parent of tests/) importable, whatever directory
+# pytest is launched from; it is prepended to sys.path at most once.
+_TESTS_DIR = os.path.dirname(__file__)
+PROJECT_ROOT = os.path.abspath(os.path.join(_TESTS_DIR, os.pardir))
+if PROJECT_ROOT not in sys.path:
+    sys.path.insert(0, PROJECT_ROOT)
+
+from app.models.activity import Activity # noqa: E402
+from app.models.database import DatabaseManager # noqa: E402
+
+
+@pytest.fixture
+def db(tmp_path):
+    """Provide a DatabaseManager over a new SQLite file in pytest's tmp dir."""
+    db_file = tmp_path / "test_activities.db"
+    return DatabaseManager(str(db_file))
+
+
+def _make_activity(**overrides):
+    """Return an Activity from baseline test values, updated by ``overrides``."""
+    defaults = {
+        "name": "Vânătoarea de comori",
+        "description": "O activitate de echipă în aer liber.",
+        "category": "camp-outdoor",
+        "content_type": "joc",
+        "source_file": "test.txt",
+        "language": "ro",
+    }
+    return Activity(**{**defaults, **overrides})
+
+
+def test_search_by_materials_list(db):
+    """A term that only appears in materials_list returns the activity."""
+    db.insert_activity(_make_activity(materials_list="frânghie, eșarfă, busolă"))
+
+    hits = db.search_activities(search_text="busolă")
+    assert len(hits) == 1
+    assert hits[0]["name"] == "Vânătoarea de comori"
+
+
+def test_search_by_skills_developed(db):
+    """A term that only appears in skills_developed returns the activity."""
+    db.insert_activity(
+        _make_activity(skills_developed="comunicare, leadership, rabdare")
+    )
+
+    hits = db.search_activities(search_text="leadership")
+    assert len(hits) == 1
+    assert hits[0]["name"] == "Vânătoarea de comori"
+
+
+def test_term_absent_from_indexed_columns_no_hit(db):
+    """A term present in no indexed column yields no hit (control)."""
+    db.insert_activity(_make_activity(materials_list="frânghie"))
+    hits = db.search_activities(search_text="zzzunlikelyterm")
+    assert hits == []
+
+
+def test_delete_trigger_removes_from_fts(db):
+    """Deleting an activity removes it from the FTS index (delete trigger)."""
+    term = "catalige"
+    row_id = db.insert_activity(_make_activity(materials_list=term))
+    assert len(db.search_activities(search_text=term)) == 1
+
+    # Delete with raw SQL so that only the trigger can update the index.
+    with db._get_connection() as conn:
+        conn.execute("DELETE FROM activities WHERE id = ?", (row_id,))
+        conn.commit()
+
+    assert db.search_activities(search_text=term) == []
+
+
+def test_update_trigger_resyncs_fts(db):
+    """Updating materials_list re-syncs the FTS index (update trigger)."""
+    old_term, new_term = "creioane", "acuarele"
+    row_id = db.insert_activity(_make_activity(materials_list=old_term))
+    assert len(db.search_activities(search_text=old_term)) == 1
+
+    # Update with raw SQL so that only the trigger can refresh the index.
+    with db._get_connection() as conn:
+        conn.execute(
+            "UPDATE activities SET materials_list = ? WHERE id = ?",
+            (new_term, row_id),
+        )
+        conn.commit()
+
+    # Old term gone, new term found.
+    assert db.search_activities(search_text=old_term) == []
+    assert len(db.search_activities(search_text=new_term)) == 1
+
+
+def test_rebuild_fts_index(db):
+    """A skills_developed term is still searchable after rebuild_fts_index()."""
+    db.insert_activity(_make_activity(skills_developed="orientare"))
+    db.rebuild_fts_index()
+    hits = db.search_activities(search_text="orientare")
+    assert len(hits) == 1
+
+
+def test_new_schema_columns_round_trip(db):
+    """New activity columns persist and load back via from_dict."""
+    sources = ["a.txt", "b.txt"]
+    row_id = db.insert_activity(
+        _make_activity(
+            source_files=["a.txt", "b.txt"],
+            source_excerpt="Citat scurt din sursă.",
+            extraction_confidence="high",
+            needs_review=1,
+            normalized_name="vanatoarea de comori",
+        )
+    )
+
+    stored = db.get_activity_by_id(row_id)
+    assert stored["content_type"] == "joc"
+    assert stored["language"] == "ro"
+    assert stored["extraction_confidence"] == "high"
+    assert stored["needs_review"] == 1
+    assert stored["normalized_name"] == "vanatoarea de comori"
+    # source_files comes back from the row as a JSON-encoded list.
+    assert json.loads(stored["source_files"]) == sources
+    assert stored["source_excerpt"] == "Citat scurt din sursă."
+
+    restored = Activity.from_dict(stored)
+    assert restored.source_files == sources
+    assert restored.content_type == "joc"
+
+
+def test_normalized_name_auto_derived(db):
+    """normalized_name is auto-derived from name when not provided."""
+    fields = {
+        "name": "Ștafetă cu Obstacole",
+        "description": "desc",
+        "category": "sports-active",
+        "source_file": "t.txt",
+    }
+    derived = Activity(**fields).normalized_name
+    # Lower-cased, with the Romanian diacritics folded to plain ASCII letters.
+    assert derived == "stafeta cu obstacole"
diff --git a/tests/test_search.py b/tests/test_search.py
new file mode 100644
index 0000000..547c9e2
--- /dev/null
+++ b/tests/test_search.py
@@ -0,0 +1,140 @@
+# -*- coding: utf-8 -*-
+"""
+CRITICAL REGRESSION TEST (plan §6, §7).
+
+`search.py` changed the result sets of /search and /api/search: the default
+search now EXCLUDES the non-game content types (rețetă / cântec / ceremonie),
+which surface only when the user explicitly filters that content_type or picks
+a non-game category. This test guards that behaviour.
+"""
+
+import pytest
+
+from app.models.activity import Activity
+from app.models.database import DatabaseManager
+from app.services.search import SearchService
+from app.config_taxonomy import NON_GAME_CONTENT_TYPES
+
+
+# --------------------------------------------------------------------------
+# fixtures
+# --------------------------------------------------------------------------
+def _activity(name, content_type, category="altele", language="ro"):
+    """Build an Activity fixture whose description embeds name and content_type."""
+    blurb = f"Descriere pentru {name}, un conținut de tip {content_type}."
+    return Activity(
+        name=name,
+        description=blurb,
+        category=category,
+        content_type=content_type, language=language, source_file="test/fixture.txt",
+    )
+
+
+@pytest.fixture
+def search_service(tmp_path):
+    """SearchService over a temp DB: one row per content_type plus one English joc."""
+    db = DatabaseManager(str(tmp_path / "activities.db"))
+    db.clear_database()
+    db.bulk_insert_activities([_activity(*row) for row in (
+        ("Vanatoarea de comori", "joc", "wide-games"),
+        ("Cercul de cunoastere", "activitate", "icebreakers"),
+        ("Reteta de paine la ceaun", "reteta", "retete"),
+        ("Cantecul de tabara", "cantec", "cantece-ceremonii"),
+        ("Ceremonia de inchidere", "ceremonie", "cantece-ceremonii"),
+        ("Game in English", "joc", "wide-games", "en"),
+    )])
+    return SearchService(db)
+
+
+def _content_types(results):
+    return set(row.get("content_type") for row in results)  # distinct types seen
+
+
+# --------------------------------------------------------------------------
+# the regression: default search excludes non-game content types
+# --------------------------------------------------------------------------
+def test_default_search_excludes_non_game_content(search_service):
+    """An unfiltered search returns game content only (no rețete/cântece/ceremonii)."""
+    hits = search_service.search_activities()
+    seen = _content_types(hits)
+
+    assert seen, "default search returned nothing"
+    for excluded in NON_GAME_CONTENT_TYPES:
+        leak_msg = f"default search leaked non-game content_type '{excluded}'"
+        assert excluded not in seen, leak_msg
+
+    # Both game-like content types seeded by the fixture must survive.
+    assert "joc" in seen
+    assert "activitate" in seen
+
+
+def test_default_search_with_text_excludes_non_game(search_service):
+    """A text query excludes EVERY non-game content type, not only the first."""
+    types = _content_types(search_service.search_activities(search_text="conținut"))
+    assert not types.intersection(NON_GAME_CONTENT_TYPES), f"leaked: {types}"
+
+
+# --------------------------------------------------------------------------
+# explicit content_type filter INCLUDES the non-game rows
+# --------------------------------------------------------------------------
+def test_explicit_content_type_filter_includes_non_game(search_service):
+    """content_type=reteta lifts the exclusion and yields the one seeded rețetă."""
+    hits = search_service.search_activities(filters={"content_type": "reteta"})
+    found = _content_types(hits)
+
+    assert len(hits) == 1
+    assert found == {"reteta"}, f"expected only rețete, got {found}"
+
+
+def test_explicit_content_type_filter_for_cantec(search_service):
+    hits = search_service.search_activities(filters={"content_type": "cantec"})
+    assert {"cantec"} == _content_types(hits)  # explicit filter: cântece only
+
+
+# --------------------------------------------------------------------------
+# a non-game CATEGORY filter also lifts the exclusion
+# --------------------------------------------------------------------------
+def test_non_game_category_filter_includes_non_game(search_service):
+    """category=cantece-ceremonii lifts the exclusion for cântece and ceremonii."""
+    category_filter = {"category": "cantece-ceremonii"}
+    hits = search_service.search_activities(filters=category_filter)
+    seen = _content_types(hits)
+
+    for expected in ("cantec", "ceremonie"):
+        assert expected in seen
+
+
+def test_game_category_filter_still_excludes_non_game(search_service):
+    """category=wide-games returns its games and still hides non-game content."""
+    hits = search_service.search_activities(filters={"category": "wide-games"})
+    types = _content_types(hits)
+    assert "joc" in types, "wide-games filter returned no games (vacuous pass)"
+    assert not types.intersection(NON_GAME_CONTENT_TYPES), f"leaked: {types}"
+
+
+# --------------------------------------------------------------------------
+# language filter
+# --------------------------------------------------------------------------
+def test_language_filter_ro(search_service):
+    """language=ro yields a non-empty result set of Romanian rows only."""
+    hits = search_service.search_activities(filters={"language": "ro"})
+    assert hits and all(hit.get("language") == "ro" for hit in hits)
+
+
+def test_language_filter_en(search_service):
+    """language=en yields only the English fixture row."""
+    hits = search_service.search_activities(filters={"language": "en"})
+    assert hits and all(hit.get("language") == "en" for hit in hits)
+    assert {hit.get("name") for hit in hits} == {"Game in English"}
+
+
+# --------------------------------------------------------------------------
+# get_filter_options surfaces the new axes
+# --------------------------------------------------------------------------
+def test_filter_options_include_content_type_and_language(search_service):
+    """get_filter_options() reports the content_type and language axes."""
+    options = search_service.db.get_filter_options()
+    for axis in ("content_type", "language"):
+        assert axis in options
+    assert "joc" in options["content_type"]
+    assert {"ro", "en"} == set(options["language"])
diff --git a/tests/test_validate_extractions.py b/tests/test_validate_extractions.py
new file mode 100644
index 0000000..c452f2d
--- /dev/null
+++ b/tests/test_validate_extractions.py
@@ -0,0 +1,156 @@
+# -*- coding: utf-8 -*-
+"""
+Tests for scripts/validate_extractions.py.
+
+Covers: schema rejection, the source_excerpt hallucination check, the content
+of the generated re-extraction prompt, and the manifest `rejected` marking.
+"""
+
+import json
+import sys
+from pathlib import Path
+
+REPO_ROOT = Path(__file__).resolve().parent.parent
+SCRIPTS_DIR = REPO_ROOT / "scripts"
+for _p in map(str, (REPO_ROOT, SCRIPTS_DIR)):  # put repo root and scripts/ on sys.path
+    if _p not in sys.path:
+        sys.path.insert(0, _p)
+
+import validate_extractions as ve # noqa: E402
+
+
+# --------------------------------------------------------------------------
+# helpers
+# --------------------------------------------------------------------------
+def _ext_activity(**over):
+    """Return a baseline extracted-activity dict; keyword args override fields."""
+    return {
+        "name": "Jocul testului",
+        "description": "O activitate de echipa in aer liber.",
+        "category": "team-building",
+        "content_type": "joc",
+        "language": "ro",
+        "extraction_confidence": "high",
+        "source_excerpt": "ancora din bucata sursa",
+        "page_reference": "page 1",
+        **over,
+    }
+
+
+def _write_extraction(extracted_dir, chunk_key, activities, header_extra=None):
+    """Write <chunk_key>.json (header + activities) into extracted_dir."""
+    extracted_dir.mkdir(parents=True, exist_ok=True)
+    header = dict(
+        source_hash="hash1234deadbeef",
+        schema_version="1.0",
+        prompt_version="1.0",
+        chunk_range="pages 1-20",
+        source_id="src01",
+        chunk_key=chunk_key,
+    )
+    header.update(header_extra or {})
+    document = {"header": header, "activities": activities}
+    target = extracted_dir / f"{chunk_key}.json"
+    serialized = json.dumps(document, ensure_ascii=False)
+    target.write_text(serialized, encoding="utf-8")
+
+
+def _write_chunk(chunks_dir, source_id, chunk_key, text):
+    """Write text to <chunks_dir>/<source_id>/<chunk_key>.txt, creating the folder."""
+    (chunks_dir / source_id).mkdir(parents=True, exist_ok=True)
+    (chunks_dir / source_id / f"{chunk_key}.txt").write_text(text, encoding="utf-8")
+
+
+# --------------------------------------------------------------------------
+# tests
+# --------------------------------------------------------------------------
+def test_valid_file_passes(tmp_path):
+    """An excerpt that appears verbatim in its chunk validates with no rejection."""
+    extracted, chunks = tmp_path / "extracted", tmp_path / "chunks"
+    excerpt = "ancora din bucata sursa apare aici"
+    activity = _ext_activity(source_excerpt=excerpt)
+    _write_extraction(extracted, "src01.part01", [activity])
+    _write_chunk(chunks, "src01", "src01.part01", f"--- PAGE 1 ---\n{excerpt}\n")
+
+    report = ve.run(extracted, chunks, tmp_path / "manifest.json")
+    assert (report["valid"], report["rejected"]) == (1, 0)
+
+
+def test_schema_invalid_file_rejected(tmp_path):
+    """An empty header plus a name-only activity is rejected with a prompt file."""
+    extracted, chunks = tmp_path / "extracted", tmp_path / "chunks"
+    extracted.mkdir(parents=True)
+    broken = {"header": {}, "activities": [{"name": "x"}]}
+    bad_file = extracted / "src01.part01.json"
+    bad_file.write_text(json.dumps(broken), encoding="utf-8")
+
+    report = ve.run(extracted, chunks, tmp_path / "manifest.json")
+
+    assert report["rejected"] == 1
+    assert (extracted / "_reextract" / "src01.part01.prompt.md").exists()
+
+
+def test_hallucinated_excerpt_rejected(tmp_path):
+    """An excerpt absent from the chunk text is rejected as a hallucination."""
+    extracted, chunks = tmp_path / "extracted", tmp_path / "chunks"
+    fabricated = "citat complet inventat care nu exista qqqq"
+    chunk_text = "--- PAGE 1 ---\ntext complet diferit despre altceva.\n"
+    _write_extraction(
+        extracted, "src01.part01", [_ext_activity(source_excerpt=fabricated)]
+    )
+    _write_chunk(chunks, "src01", "src01.part01", chunk_text)
+
+    report = ve.run(extracted, chunks, tmp_path / "manifest.json")
+    assert report["rejected"] == 1
+    first_errors = report["rejected_chunks"][0]["errors"]
+    assert any("hallucination" in message for message in first_errors)
+
+
+def test_reextraction_prompt_content(tmp_path):
+    """The prompt written for a rejected chunk names it and the output path."""
+    extracted, chunks = tmp_path / "extracted", tmp_path / "chunks"
+    fabricated = "citat inventat care nu exista zzzz"
+    chunk_text = "--- PAGE 1 ---\ntext despre cu totul altceva aici.\n"
+    _write_extraction(
+        extracted, "src01.part01", [_ext_activity(source_excerpt=fabricated)]
+    )
+    _write_chunk(chunks, "src01", "src01.part01", chunk_text)
+
+    ve.run(extracted, chunks, tmp_path / "manifest.json")
+
+    prompt_file = extracted / "_reextract" / "src01.part01.prompt.md"
+    prompt = prompt_file.read_text(encoding="utf-8")
+    # The chunk key, both marker words and the output path must all appear.
+    for fragment in ("src01.part01", "REJECTED", "verbatim",
+                     "data/extracted/src01.part01.json"):
+        assert fragment in prompt
+
+
+def test_manifest_marks_chunk_rejected(tmp_path):
+    """A chunk recorded as "done" flips to "rejected" when its extraction fails."""
+    extracted, chunks = tmp_path / "extracted", tmp_path / "chunks"
+    manifest_path = tmp_path / "manifest.json"
+    entry = {"state": "done", "chunk_file": "chunks/src01/src01.part01.txt"}
+    manifest_path.write_text(
+        json.dumps({"chunks": {"src01.part01": entry}}), encoding="utf-8"
+    )
+    fabricated = "citat fabricat absent vvvv"
+    _write_extraction(
+        extracted, "src01.part01", [_ext_activity(source_excerpt=fabricated)]
+    )
+    chunk_text = "--- PAGE 1 ---\nun continut neinrudit.\n"
+    _write_chunk(chunks, "src01", "src01.part01", chunk_text)
+
+    ve.run(extracted, chunks, manifest_path)
+
+    manifest = json.loads(manifest_path.read_text(encoding="utf-8"))
+    assert manifest["chunks"]["src01.part01"]["state"] == "rejected"
+
+
+def test_build_reextraction_prompt_lists_errors():
+    """build_reextraction_prompt mentions the chunk key and the failing field."""
+    errors = ["header: 'source_hash' is a required property"]
+    prompt = ve.build_reextraction_prompt(
+        "abc.part03", "data/chunks/abc/abc.part03.txt", errors
+    )
+    assert "abc.part03" in prompt and "source_hash" in prompt