From 66ae831c36accf53ee167ab9d66299ec0cf62bdc Mon Sep 17 00:00:00 2001
From: Claude Agent <claude-agent@romfast.ro>
Date: Tue, 19 May 2026 17:43:38 +0000
Subject: [PATCH] Rebuild extraction pipeline infrastructure (Faza 0 prep)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Implements the approved plan to replace the broken regex/index-master
extraction with an LLM-subagent pipeline. Four parallel lanes:

Lane A — scripts/extract_common.py (PDF/docx/doc/pptx/html/zip, no
  max_pages truncation), normalize_sources.py, chunk_sources.py
  (~20pg chunks + overlap, manifest registry), activity_schema.json.
Lane B — app/config_taxonomy.py (16 fixed category slugs), schema
  rebuilt from scratch in app/models/ with content_type, language,
  source_files, source_excerpt, normalized_name, extraction_confidence,
  needs_review; FTS5 + 3 triggers extended with materials_list and
  skills_developed.
Lane C — build_database.py (--rebuild, atomic swap, schema + fuzzy
  source_excerpt validation, dedup with needs_review band),
  validate_extractions.py, review_queue.py, new run_extraction.py
  orchestrator, SUBAGENT_PROMPT.md.
Lane D — search.py content_type/language filters (default search
  excludes non-game content), E7 schema-compat audit; fixed a NULL
  keywords AttributeError in _boost_search_relevance.

Removes 8 orphaned/dead scripts and app/services/parser.py +
indexer.py. Adds tests/ (70 passing, 1 skipped — libreoffice absent).

Note: Lane D made one additive edit to app/models/database.py
(_update_category_counts) to surface content_type/language in
get_filter_options, outside its nominal lane boundary but after
Lane B completed.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 app/config_taxonomy.py                | 230 +++++++++
 app/models/activity.py                |  68 ++-
 app/models/database.py                |  55 ++-
 app/services/__init__.py              |   4 +-
 app/services/indexer.py               | 248 ----------
 app/services/parser.py                | 340 --------------
 app/services/search.py                | 101 +++-
 app/templates/activity.html           |   8 +-
 app/templates/index.html              |  26 +-
 app/templates/results.html            |  29 +-
 app/web/routes.py                     |  19 +-
 scripts/SUBAGENT_PROMPT.md            |  81 ++++
 scripts/activity_schema.json          | 110 +++++
 scripts/build_database.py             | 639 ++++++++++++++++++++++++++
 scripts/chunk_sources.py              | 251 ++++++++++
 scripts/claude_extraction_template.md |  54 ---
 scripts/create_databases.py           | 164 -------
 scripts/extract_common.py             | 361 +++++++++++++++
 scripts/html_extractor.py             | 424 -----------------
 scripts/import_claude_activities.py   |  78 ----
 scripts/import_common.py              | 179 ++++++++
 scripts/normalize_sources.py          | 255 ++++++++++
 scripts/pdf_extractor.py              |   0
 scripts/pdf_to_text_converter.py      | 143 ------
 scripts/review_queue.py               | 145 ++++++
 scripts/run_extraction.py             | 168 +++++--
 scripts/text_extractor.py             | 197 --------
 scripts/unified_processor.py          | 151 ------
 scripts/validate_extractions.py       | 208 +++++++++
 tests/conftest.py                     | 114 +++++
 tests/fixtures/.gitkeep               |   3 +
 tests/test_build_database.py          | 334 ++++++++++++++
 tests/test_chunk_sources.py           | 183 ++++++++
 tests/test_extract_common.py          | 177 +++++++
 tests/test_fts.py                     | 139 ++++++
 tests/test_search.py                  | 140 ++++++
 tests/test_validate_extractions.py    | 156 +++++++
 37 files changed, 4101 insertions(+), 1881 deletions(-)
 create mode 100644 app/config_taxonomy.py
 delete mode 100644 app/services/indexer.py
 delete mode 100644 app/services/parser.py
 create mode 100644 scripts/SUBAGENT_PROMPT.md
 create mode 100644 scripts/activity_schema.json
 create mode 100644 scripts/build_database.py
 create mode 100644 scripts/chunk_sources.py
 delete mode 100644 scripts/claude_extraction_template.md
 delete mode 100644 scripts/create_databases.py
 create mode 100644 scripts/extract_common.py
 delete mode 100644 scripts/html_extractor.py
 delete mode 100644 scripts/import_claude_activities.py
 create mode 100644 scripts/import_common.py
 create mode 100644 scripts/normalize_sources.py
 delete mode 100644 scripts/pdf_extractor.py
 delete mode 100644 scripts/pdf_to_text_converter.py
 create mode 100644 scripts/review_queue.py
 delete mode 100644 scripts/text_extractor.py
 delete mode 100644 scripts/unified_processor.py
 create mode 100644 scripts/validate_extractions.py
 create mode 100644 tests/conftest.py
 create mode 100644 tests/fixtures/.gitkeep
 create mode 100644 tests/test_build_database.py
 create mode 100644 tests/test_chunk_sources.py
 create mode 100644 tests/test_extract_common.py
 create mode 100644 tests/test_fts.py
 create mode 100644 tests/test_search.py
 create mode 100644 tests/test_validate_extractions.py

diff --git a/app/config_taxonomy.py b/app/config_taxonomy.py
new file mode 100644
index 0000000..2e8db25
--- /dev/null
+++ b/app/config_taxonomy.py
@@ -0,0 +1,230 @@
+"""
+Controlled category taxonomy for game-library.
+
+Single source of truth for activity categories. The DB stores the *slug*;
+the UI displays the Romanian name. `category` (thematic domain) and
+`content_type` (form of the content) are INDEPENDENT axes — see plan §2.
+"""
+
+import unicodedata
+import re
+from typing import Dict, List
+
+# --- Categories (thematic domain) --------------------------------------------
+# slug -> Romanian display name. 16 fixed slugs; `altele` is the mandatory
+# fallback and MUST always be present.
+CATEGORIES: Dict[str, str] = {
+    "jocuri-cercetasesti": "Jocuri cercetășești",
+    "team-building": "Team-building",
+    "icebreakers": "Icebreakers / spargerea gheții",
+    "camp-outdoor": "Tabără și activități în aer liber",
+    "wide-games": "Wide games / jocuri de teren",
+    "orientare": "Orientare",
+    "prim-ajutor": "Prim ajutor",
+    "escape-room-puzzle": "Escape room și puzzle",
+    "creative-stem": "Creativitate și STEM",
+    "sports-active": "Sport și activități fizice",
+    "cantece-ceremonii": "Cântece și ceremonii",
+    "retete": "Rețete",
+    "supravietuire": "Supraviețuire",
+    "integrare-incluziune": "Integrare și incluziune",
+    "conflict-empatie": "Conflict și empatie",
+    "altele": "Altele",
+}
+
+# Fallback slug: normalize_category returns it for empty/unknown input.
+FALLBACK_CATEGORY = "altele"
+
+# Valid slugs, in the declaration order of CATEGORIES.
+CATEGORY_SLUGS: List[str] = list(CATEGORIES.keys())
+
+# --- Content type (form of the content) --------------------------------------
+# slug -> Romanian display name. Independent axis from `category`; the UI
+# default search excludes the non-game content types (see plan §6).
+CONTENT_TYPES: Dict[str, str] = {
+    "joc": "Joc",
+    "activitate": "Activitate",
+    "reteta": "Rețetă",
+    "cantec": "Cântec",
+    "ceremonie": "Ceremonie",
+}
+
+CONTENT_TYPE_SLUGS: List[str] = list(CONTENT_TYPES.keys())  # declaration order
+
+# Content types considered "non-game" — excluded from the default UI search.
+NON_GAME_CONTENT_TYPES: List[str] = ["reteta", "cantec", "ceremonie"]
+
+DEFAULT_CONTENT_TYPE = "activitate"  # fallback of normalize_content_type
+
+# --- Aliases -----------------------------------------------------------------
+# Map of normalized arbitrary strings -> canonical slug. Keys are already
+# diacritic-stripped, lowercased and hyphenated (see _slugify). This catches
+# legacy / messy values from the old DB and common English/Romanian variants.
+_CATEGORY_ALIASES: Dict[str, str] = {
+    # legacy junk values from the old DB
+    "general-activity": "altele",
+    "general": "altele",
+    "educational": "creative-stem",
+    "d": "altele",
+    "a": "altele",
+    "b": "altele",
+    "c": "altele",
+    # scouting
+    "cercetasie": "jocuri-cercetasesti",
+    "cercetasesti": "jocuri-cercetasesti",
+    "scout": "jocuri-cercetasesti",
+    "scouting": "jocuri-cercetasesti",
+    "scout-games": "jocuri-cercetasesti",
+    "jocuri-cercetasesti": "jocuri-cercetasesti",
+    # team building
+    "teambuilding": "team-building",
+    "team": "team-building",
+    "cooperare": "team-building",
+    # icebreakers
+    "icebreaker": "icebreakers",
+    "spargerea-ghetii": "icebreakers",
+    "cunoastere": "icebreakers",
+    "energizers": "icebreakers",
+    "energizer": "icebreakers",
+    # camp / outdoor
+    "camp": "camp-outdoor",
+    "tabara": "camp-outdoor",
+    "outdoor": "camp-outdoor",
+    "aer-liber": "camp-outdoor",
+    # wide games
+    "wide-game": "wide-games",
+    "jocuri-de-teren": "wide-games",
+    "joc-de-teren": "wide-games",
+    "big-games": "wide-games",
+    # orienteering
+    "orienteering": "orientare",
+    "navigatie": "orientare",
+    # first aid
+    "first-aid": "prim-ajutor",
+    "primul-ajutor": "prim-ajutor",
+    # escape room / puzzle
+    "escape-room": "escape-room-puzzle",
+    "escaperoom": "escape-room-puzzle",
+    "puzzle": "escape-room-puzzle",
+    "puzzles": "escape-room-puzzle",
+    "ghicitori": "escape-room-puzzle",
+    # creative / stem
+    "creative": "creative-stem",
+    "creativitate": "creative-stem",
+    "stem": "creative-stem",
+    "arts-and-crafts": "creative-stem",
+    "craft": "creative-stem",
+    "crafts": "creative-stem",
+    "stiinta": "creative-stem",
+    # sports
+    "sport": "sports-active",
+    "sports": "sports-active",
+    "sportive": "sports-active",
+    "active": "sports-active",
+    "miscare": "sports-active",
+    "physical": "sports-active",
+    # songs / ceremonies
+    "cantece": "cantece-ceremonii",
+    "cantec": "cantece-ceremonii",
+    "songs": "cantece-ceremonii",
+    "ceremonii": "cantece-ceremonii",
+    "ceremonie": "cantece-ceremonii",
+    "ceremony": "cantece-ceremonii",
+    # recipes
+    "reteta": "retete",
+    "recipe": "retete",
+    "recipes": "retete",
+    "cooking": "retete",
+    "gatit": "retete",
+    # survival
+    "survival": "supravietuire",
+    "supravietuire": "supravietuire",
+    # inclusion
+    "integrare": "integrare-incluziune",
+    "incluziune": "integrare-incluziune",
+    "inclusion": "integrare-incluziune",
+    # conflict / empathy
+    "conflict": "conflict-empatie",
+    "empatie": "conflict-empatie",
+    "empathy": "conflict-empatie",
+    "rezolvarea-conflictelor": "conflict-empatie",
+    # explicit synonyms for the fallback slug
+    "altele": "altele",
+    "other": "altele",
+    "others": "altele",
+    "misc": "altele",
+}
+
+
+def _slugify(value: str) -> str:
+    """Lowercase, strip diacritics, collapse non-alphanumerics to hyphens."""
+    if not value:  # None / "" -> ""
+        return ""
+    # NFKD splits ă into a + combining breve; the join drops the combining marks.
+    decomposed = unicodedata.normalize("NFKD", value)
+    ascii_str = "".join(c for c in decomposed if not unicodedata.combining(c))
+    ascii_str = ascii_str.lower().strip()
+    ascii_str = re.sub(r"[^a-z0-9]+", "-", ascii_str)  # each other run -> one "-"
+    return ascii_str.strip("-")  # no leading/trailing hyphen
+
+
+def normalize_category(value: str) -> str:
+    """Map an arbitrary string to a valid category slug.
+
+    The value is slugified, then looked up in CATEGORIES and, failing that,
+    in _CATEGORY_ALIASES; empty or unrecognised input returns `altele`.
+    """
+    if not value:
+        return FALLBACK_CATEGORY
+    slug = _slugify(str(value))  # str() tolerates non-string input
+    if not slug:  # e.g. punctuation-only input slugifies to ""
+        return FALLBACK_CATEGORY
+    # Already a canonical slug.
+    if slug in CATEGORIES:
+        return slug
+    # Known legacy / English / Romanian variant.
+    if slug in _CATEGORY_ALIASES:
+        return _CATEGORY_ALIASES[slug]
+    return FALLBACK_CATEGORY
+
+
+def normalize_content_type(value: str) -> str:
+    """Map an arbitrary string to a valid content_type slug.
+
+    Checks CONTENT_TYPES, then a small alias table; otherwise `activitate`.
+    """
+    if not value:
+        return DEFAULT_CONTENT_TYPE
+    slug = _slugify(str(value))
+    if slug in CONTENT_TYPES:
+        return slug
+    # Light alias handling for plural / English forms (rebuilt on each call).
+    aliases = {
+        "jocuri": "joc",
+        "game": "joc",
+        "games": "joc",
+        "activitati": "activitate",
+        "activity": "activitate",
+        "retete": "reteta",
+        "recipe": "reteta",
+        "cantece": "cantec",
+        "song": "cantec",
+        "ceremonii": "ceremonie",
+        "ceremony": "ceremonie",
+    }
+    return aliases.get(slug, DEFAULT_CONTENT_TYPE)  # unknown -> default
+
+
+def is_valid_category(slug: str) -> bool:
+    """True if `slug` is exactly a key of CATEGORIES (no normalisation)."""
+    return slug in CATEGORIES
+
+
+def category_display_name(slug: str) -> str:
+    """Romanian display name for a category slug (unknown slug -> itself)."""
+    return CATEGORIES.get(slug, slug)
+
+
+def content_type_display_name(slug: str) -> str:
+    """Romanian name for a content_type slug (unknown slug -> itself)."""
+    return CONTENT_TYPES.get(slug, slug)
diff --git a/app/models/activity.py b/app/models/activity.py
index d28f76b..b2bbf18 100644
--- a/app/models/activity.py
+++ b/app/models/activity.py
@@ -5,6 +5,22 @@ Activity data model for INDEX-SISTEM-JOCURI v2.0
 from dataclasses import dataclass, field
 from typing import List, Optional, Dict, Any
 import json
+import re
+import unicodedata
+
+
+def normalize_name(name: str) -> str:
+    """Return the diacritic-free, lowercased, whitespace-collapsed name.
+
+    Used as the exact-match key for dedup grouping (see plan §4).
+    """
+    if not name:  # None / "" -> ""
+        return ""
+    decomposed = unicodedata.normalize("NFKD", name)  # base char + combining marks
+    ascii_str = "".join(c for c in decomposed if not unicodedata.combining(c))
+    ascii_str = ascii_str.lower().strip()
+    ascii_str = re.sub(r"\s+", " ", ascii_str)  # punctuation is kept as-is
+    return ascii_str
 
 @dataclass
 class Activity:
@@ -19,10 +35,19 @@ class Activity:
     # Categories
     category: str = ""
     subcategory: Optional[str] = None
-    
+    # content_type is an axis INDEPENDENT of category:
+    # one of joc/activitate/reteta/cantec/ceremonie (see config_taxonomy).
+    content_type: Optional[str] = None
+
     # Source information
     source_file: str = ""
     page_reference: Optional[str] = None
+    # source_files: every source the activity was seen in (a list here; to_dict
+    # JSON-encodes it for storage). `source_file` (singular) stays the primary
+    # source; build_database (Lane C) accumulates the full list on dedup-merge.
+    source_files: List[str] = field(default_factory=list)
+    # Short verbatim quote from the source — anti-hallucination anchor.
+    source_excerpt: Optional[str] = None
     
     # Age and participants
     age_group_min: Optional[int] = None
@@ -44,11 +69,22 @@ class Activity:
     keywords: Optional[str] = None
     tags: List[str] = field(default_factory=list)
     popularity_score: int = 0
-    
+
+    # Extraction / language metadata
+    language: Optional[str] = None          # 'ro' / 'en'
+    normalized_name: Optional[str] = None   # dedup key; auto-derived from name
+    extraction_confidence: Optional[str] = None  # 'high' / 'med' / 'low'
+    needs_review: int = 0
+
     # Database fields
     id: Optional[int] = None
     created_at: Optional[str] = None
     updated_at: Optional[str] = None
+
+    def __post_init__(self):
+        """Fill normalized_name from name when it was not explicitly provided."""
+        if not self.normalized_name:  # None and "" both trigger derivation
+            self.normalized_name = normalize_name(self.name)
     
     def to_dict(self) -> Dict[str, Any]:
         """Convert activity to dictionary for database storage"""
@@ -59,8 +95,11 @@ class Activity:
             'variations': self.variations,
             'category': self.category,
             'subcategory': self.subcategory,
+            'content_type': self.content_type,
             'source_file': self.source_file,
+            'source_files': json.dumps(self.source_files) if self.source_files else None,
             'page_reference': self.page_reference,
+            'source_excerpt': self.source_excerpt,
             'age_group_min': self.age_group_min,
             'age_group_max': self.age_group_max,
             'participants_min': self.participants_min,
@@ -73,7 +112,11 @@ class Activity:
             'difficulty_level': self.difficulty_level,
             'keywords': self.keywords,
             'tags': json.dumps(self.tags) if self.tags else None,
-            'popularity_score': self.popularity_score
+            'popularity_score': self.popularity_score,
+            'language': self.language,
+            'normalized_name': self.normalized_name or normalize_name(self.name),
+            'extraction_confidence': self.extraction_confidence,
+            'needs_review': self.needs_review,
         }
     
     @classmethod
@@ -86,7 +129,17 @@ class Activity:
                 tags = json.loads(data['tags'])
             except (json.JSONDecodeError, TypeError):
                 tags = []
-        
+
+        # source_files may arrive as a JSON string (DB) or a list (extraction)
+        source_files = data.get('source_files')
+        if isinstance(source_files, str):
+            try:
+                source_files = json.loads(source_files)
+            except (json.JSONDecodeError, TypeError):
+                source_files = []
+        elif source_files is None:
+            source_files = []
+
         return cls(
             id=data.get('id'),
             name=data.get('name', ''),
@@ -95,8 +148,11 @@ class Activity:
             variations=data.get('variations'),
             category=data.get('category', ''),
             subcategory=data.get('subcategory'),
+            content_type=data.get('content_type'),
             source_file=data.get('source_file', ''),
+            source_files=source_files,
             page_reference=data.get('page_reference'),
+            source_excerpt=data.get('source_excerpt'),
             age_group_min=data.get('age_group_min'),
             age_group_max=data.get('age_group_max'),
             participants_min=data.get('participants_min'),
@@ -110,6 +166,10 @@ class Activity:
             keywords=data.get('keywords'),
             tags=tags,
             popularity_score=data.get('popularity_score', 0),
+            language=data.get('language'),
+            normalized_name=data.get('normalized_name'),
+            extraction_confidence=data.get('extraction_confidence'),
+            needs_review=data.get('needs_review', 0) or 0,
             created_at=data.get('created_at'),
             updated_at=data.get('updated_at')
         )
diff --git a/app/models/database.py b/app/models/database.py
index 93524d4..816c403 100644
--- a/app/models/database.py
+++ b/app/models/database.py
@@ -30,6 +30,8 @@ class DatabaseManager:
         """Initialize database with v2.0 schema"""
         with self._get_connection() as conn:
             # Main activities table
+            # NOTE: schema is rebuilt from scratch (plan §6) — no in-place
+            # migration. The old DB is deleted and recreated by build_database.
             conn.execute("""
                 CREATE TABLE IF NOT EXISTS activities (
                     id INTEGER PRIMARY KEY AUTOINCREMENT,
@@ -39,9 +41,12 @@ class DatabaseManager:
                     variations TEXT,
                     category TEXT NOT NULL,
                     subcategory TEXT,
+                    content_type TEXT,
                     source_file TEXT NOT NULL,
+                    source_files TEXT,
                     page_reference TEXT,
-                    
+                    source_excerpt TEXT,
+
                     -- Structured parameters
                     age_group_min INTEGER,
                     age_group_max INTEGER,
@@ -49,26 +54,34 @@ class DatabaseManager:
                     participants_max INTEGER,
                     duration_min INTEGER,
                     duration_max INTEGER,
-                    
+
                     -- Categories for filtering
                     materials_category TEXT,
                     materials_list TEXT,
                     skills_developed TEXT,
                     difficulty_level TEXT,
-                    
+
                     -- Metadata
                     keywords TEXT,
                     tags TEXT,
                     popularity_score INTEGER DEFAULT 0,
+
+                    -- Extraction / language metadata
+                    language TEXT,
+                    normalized_name TEXT,
+                    extraction_confidence TEXT,
+                    needs_review INTEGER DEFAULT 0,
+
                     created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                     updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
                 )
             """)
-            
+
             # FTS5 virtual table for search
             conn.execute("""
                 CREATE VIRTUAL TABLE IF NOT EXISTS activities_fts USING fts5(
                     name, description, rules, variations, keywords,
+                    materials_list, skills_developed,
                     content='activities',
                     content_rowid='id'
                 )
@@ -92,6 +105,7 @@ class DatabaseManager:
                 "CREATE INDEX IF NOT EXISTS idx_activities_age ON activities(age_group_min, age_group_max)",
                 "CREATE INDEX IF NOT EXISTS idx_activities_participants ON activities(participants_min, participants_max)",
                 "CREATE INDEX IF NOT EXISTS idx_activities_duration ON activities(duration_min, duration_max)",
+                "CREATE INDEX IF NOT EXISTS idx_activities_normalized_name ON activities(normalized_name)",
                 "CREATE INDEX IF NOT EXISTS idx_categories_type ON categories(type)"
             ]
             
@@ -102,24 +116,34 @@ class DatabaseManager:
             conn.execute("""
                 CREATE TRIGGER IF NOT EXISTS activities_fts_insert AFTER INSERT ON activities
                 BEGIN
-                    INSERT INTO activities_fts(rowid, name, description, rules, variations, keywords)
-                    VALUES (new.id, new.name, new.description, new.rules, new.variations, new.keywords);
+                    INSERT INTO activities_fts(rowid, name, description, rules, variations,
+                                               keywords, materials_list, skills_developed)
+                    VALUES (new.id, new.name, new.description, new.rules, new.variations,
+                            new.keywords, new.materials_list, new.skills_developed);
                 END
             """)
-            
+
             conn.execute("""
                 CREATE TRIGGER IF NOT EXISTS activities_fts_delete AFTER DELETE ON activities
                 BEGIN
-                    DELETE FROM activities_fts WHERE rowid = old.id;
+                    INSERT INTO activities_fts(activities_fts, rowid, name, description, rules,
+                                               variations, keywords, materials_list, skills_developed)
+                    VALUES ('delete', old.id, old.name, old.description, old.rules,
+                            old.variations, old.keywords, old.materials_list, old.skills_developed);
                 END
             """)
-            
+
             conn.execute("""
                 CREATE TRIGGER IF NOT EXISTS activities_fts_update AFTER UPDATE ON activities
                 BEGIN
-                    DELETE FROM activities_fts WHERE rowid = old.id;
-                    INSERT INTO activities_fts(rowid, name, description, rules, variations, keywords)
-                    VALUES (new.id, new.name, new.description, new.rules, new.variations, new.keywords);
+                    INSERT INTO activities_fts(activities_fts, rowid, name, description, rules,
+                                               variations, keywords, materials_list, skills_developed)
+                    VALUES ('delete', old.id, old.name, old.description, old.rules,
+                            old.variations, old.keywords, old.materials_list, old.skills_developed);
+                    INSERT INTO activities_fts(rowid, name, description, rules, variations,
+                                               keywords, materials_list, skills_developed)
+                    VALUES (new.id, new.name, new.description, new.rules, new.variations,
+                            new.keywords, new.materials_list, new.skills_developed);
                 END
             """)
             
@@ -179,6 +203,8 @@ class DatabaseManager:
         """Update category usage counts"""
         categories_to_update = [
             ('category', activity.category),
+            ('content_type', activity.content_type),
+            ('language', activity.language),
             ('age_group', activity.get_age_range_display()),
             ('participants', activity.get_participants_display()),
             ('duration', activity.get_duration_display()),
@@ -332,8 +358,11 @@ class DatabaseManager:
     def clear_database(self):
         """Clear all data from database"""
         with self._get_connection() as conn:
+            # Deleting from activities fires the delete trigger, which removes
+            # the matching FTS rows. The explicit 'delete-all' command then
+            # guarantees the external-content FTS index is fully cleared.
             conn.execute("DELETE FROM activities")
-            conn.execute("DELETE FROM activities_fts")
+            conn.execute("INSERT INTO activities_fts(activities_fts) VALUES('delete-all')")
             conn.execute("DELETE FROM categories")
             conn.commit()
     
diff --git a/app/services/__init__.py b/app/services/__init__.py
index 38de191..36492a0 100644
--- a/app/services/__init__.py
+++ b/app/services/__init__.py
@@ -2,8 +2,6 @@
 Services for INDEX-SISTEM-JOCURI v2.0
 """
 
-from .parser import IndexMasterParser
-from .indexer import ActivityIndexer
 from .search import SearchService
 
-__all__ = ['IndexMasterParser', 'ActivityIndexer', 'SearchService']
\ No newline at end of file
+__all__ = ['SearchService']
diff --git a/app/services/indexer.py b/app/services/indexer.py
deleted file mode 100644
index ba9cd96..0000000
--- a/app/services/indexer.py
+++ /dev/null
@@ -1,248 +0,0 @@
-"""
-Activity indexer service for INDEX-SISTEM-JOCURI v2.0
-Coordinates parsing and database indexing
-"""
-
-from typing import List, Dict, Any
-from pathlib import Path
-from app.models.database import DatabaseManager
-from app.models.activity import Activity
-from app.services.parser import IndexMasterParser
-import time
-
-class ActivityIndexer:
-    """Service for indexing activities from INDEX_MASTER into database"""
-    
-    def __init__(self, db_manager: DatabaseManager, index_master_path: str):
-        """Initialize indexer with database manager and INDEX_MASTER path"""
-        self.db = db_manager
-        self.parser = IndexMasterParser(index_master_path)
-        self.indexing_stats = {}
-    
-    def index_all_activities(self, clear_existing: bool = False) -> Dict[str, Any]:
-        """Index all activities from INDEX_MASTER into database"""
-        
-        print("🚀 Starting activity indexing process...")
-        start_time = time.time()
-        
-        # Clear existing data if requested
-        if clear_existing:
-            print("🗑️  Clearing existing database...")
-            self.db.clear_database()
-        
-        # Parse activities from INDEX_MASTER
-        print("📖 Parsing INDEX_MASTER file...")
-        activities = self.parser.parse_all_categories()
-        
-        if not activities:
-            print("❌ No activities were parsed!")
-            return {'success': False, 'error': 'No activities parsed'}
-        
-        # Filter valid activities
-        valid_activities = []
-        for activity in activities:
-            if self.parser.validate_activity_completeness(activity):
-                valid_activities.append(activity)
-            else:
-                print(f"⚠️  Skipping incomplete activity: {activity.name[:50]}...")
-        
-        print(f"✅ Validated {len(valid_activities)} activities out of {len(activities)} parsed")
-        
-        if len(valid_activities) < 100:
-            print(f"⚠️  Warning: Only {len(valid_activities)} valid activities found. Expected 500+")
-        
-        # Bulk insert into database
-        print("💾 Inserting activities into database...")
-        try:
-            inserted_count = self.db.bulk_insert_activities(valid_activities)
-            
-            # Rebuild FTS index for optimal search performance
-            print("🔍 Rebuilding search index...")
-            self.db.rebuild_fts_index()
-            
-            end_time = time.time()
-            indexing_time = end_time - start_time
-            
-            # Generate final statistics (with error handling)
-            try:
-                stats = self._generate_indexing_stats(valid_activities, indexing_time)
-                stats['inserted_count'] = inserted_count
-                stats['success'] = True
-            except Exception as e:
-                print(f"⚠️  Error generating statistics: {e}")
-                stats = {
-                    'success': True,
-                    'inserted_count': inserted_count,
-                    'indexing_time_seconds': indexing_time,
-                    'error': f'Stats generation failed: {str(e)}'
-                }
-            
-            print(f"✅ Indexing complete! {inserted_count} activities indexed in {indexing_time:.2f}s")
-            
-            # Verify database state (with error handling)
-            try:
-                db_stats = self.db.get_statistics()
-                print(f"📊 Database now contains {db_stats['total_activities']} activities")
-            except Exception as e:
-                print(f"⚠️  Error getting database statistics: {e}")
-                print(f"📊 Database insertion completed, statistics unavailable")
-            
-            return stats
-            
-        except Exception as e:
-            print(f"❌ Error during database insertion: {e}")
-            return {'success': False, 'error': str(e)}
-    
-    def index_specific_category(self, category_code: str) -> Dict[str, Any]:
-        """Index activities from a specific category only"""
-        
-        print(f"🎯 Indexing specific category: {category_code}")
-        
-        # Load content and parse specific category
-        if not self.parser.load_content():
-            return {'success': False, 'error': 'Could not load INDEX_MASTER'}
-        
-        category_name = self.parser.category_mapping.get(category_code)
-        if not category_name:
-            return {'success': False, 'error': f'Unknown category code: {category_code}'}
-        
-        activities = self.parser.parse_category_section(category_code, category_name)
-        
-        if not activities:
-            return {'success': False, 'error': f'No activities found in category {category_code}'}
-        
-        # Filter valid activities
-        valid_activities = [a for a in activities if self.parser.validate_activity_completeness(a)]
-        
-        try:
-            inserted_count = self.db.bulk_insert_activities(valid_activities)
-            return {
-                'success': True,
-                'category': category_name,
-                'inserted_count': inserted_count,
-                'total_parsed': len(activities),
-                'valid_activities': len(valid_activities)
-            }
-        except Exception as e:
-            return {'success': False, 'error': str(e)}
-    
-    def _generate_indexing_stats(self, activities: List[Activity], indexing_time: float) -> Dict[str, Any]:
-        """Generate comprehensive indexing statistics"""
-        
-        # Get parser statistics
-        parser_stats = self.parser.get_parsing_statistics()
-        
-        # Calculate additional metrics
-        categories = {}
-        age_ranges = {}
-        durations = {}
-        materials = {}
-        
-        for activity in activities:
-            # Category breakdown
-            if activity.category in categories:
-                categories[activity.category] += 1
-            else:
-                categories[activity.category] = 1
-            
-            # Age range analysis (with safety check)
-            try:
-                age_key = activity.get_age_range_display() or "nespecificat"
-                age_ranges[age_key] = age_ranges.get(age_key, 0) + 1
-            except Exception as e:
-                print(f"Warning: Error getting age range for activity {activity.name}: {e}")
-                age_ranges["nespecificat"] = age_ranges.get("nespecificat", 0) + 1
-            
-            # Duration analysis (with safety check)
-            try:
-                duration_key = activity.get_duration_display() or "nespecificat"
-                durations[duration_key] = durations.get(duration_key, 0) + 1
-            except Exception as e:
-                print(f"Warning: Error getting duration for activity {activity.name}: {e}")
-                durations["nespecificat"] = durations.get("nespecificat", 0) + 1
-            
-            # Materials analysis (with safety check)
-            try:
-                materials_key = activity.get_materials_display() or "nespecificat"
-                materials[materials_key] = materials.get(materials_key, 0) + 1
-            except Exception as e:
-                print(f"Warning: Error getting materials for activity {activity.name}: {e}")
-                materials["nespecificat"] = materials.get("nespecificat", 0) + 1
-        
-        return {
-            'indexing_time_seconds': indexing_time,
-            'parsing_stats': parser_stats,
-            'distribution': {
-                'categories': categories,
-                'age_ranges': age_ranges,
-                'durations': durations,
-                'materials': materials
-            },
-            'quality_metrics': {
-                'completion_rate': parser_stats.get('completion_rate', 0),
-                'average_description_length': parser_stats.get('average_description_length', 0),
-                'activities_with_metadata': sum(1 for a in activities if a.age_group_min or a.participants_min or a.duration_min)
-            }
-        }
-    
-    def verify_indexing_quality(self) -> Dict[str, Any]:
-        """Verify the quality of indexed data"""
-        
-        try:
-            # Get database statistics
-            db_stats = self.db.get_statistics()
-            
-            # Check for minimum activity count
-            total_activities = db_stats['total_activities']
-            meets_minimum = total_activities >= 500
-            
-            # Check category distribution
-            categories = db_stats.get('categories', {})
-            category_coverage = len(categories)
-            
-            # Sample some activities to check quality
-            sample_activities = self.db.search_activities(limit=10)
-            
-            quality_issues = []
-            for activity in sample_activities:
-                if not activity.get('description') or len(activity['description']) < 10:
-                    quality_issues.append(f"Activity {activity.get('name', 'Unknown')} has insufficient description")
-                
-                if not activity.get('category'):
-                    quality_issues.append(f"Activity {activity.get('name', 'Unknown')} missing category")
-            
-            return {
-                'total_activities': total_activities,
-                'meets_minimum_requirement': meets_minimum,
-                'minimum_target': 500,
-                'category_coverage': category_coverage,
-                'expected_categories': len(self.parser.category_mapping),
-                'quality_issues': quality_issues,
-                'quality_score': max(0, 100 - len(quality_issues) * 10),
-                'database_stats': db_stats
-            }
-            
-        except Exception as e:
-            return {'error': str(e), 'quality_score': 0}
-    
-    def get_indexing_progress(self) -> Dict[str, Any]:
-        """Get current indexing progress and status"""
-        try:
-            db_stats = self.db.get_statistics()
-            
-            # Calculate progress towards 500+ activities goal
-            total_activities = db_stats['total_activities']
-            target_activities = 500
-            progress_percentage = min(100, (total_activities / target_activities) * 100)
-            
-            return {
-                'current_activities': total_activities,
-                'target_activities': target_activities,
-                'progress_percentage': progress_percentage,
-                'status': 'completed' if total_activities >= target_activities else 'in_progress',
-                'categories_indexed': list(db_stats.get('categories', {}).keys()),
-                'database_size_mb': db_stats.get('database_size_bytes', 0) / (1024 * 1024)
-            }
-            
-        except Exception as e:
-            return {'error': str(e), 'status': 'error'}
\ No newline at end of file
diff --git a/app/services/parser.py b/app/services/parser.py
deleted file mode 100644
index e086248..0000000
--- a/app/services/parser.py
+++ /dev/null
@@ -1,340 +0,0 @@
-"""
-Advanced parser for INDEX_MASTER_JOCURI_ACTIVITATI.md
-Extracts 500+ individual activities with full details
-"""
-
-import re
-from pathlib import Path
-from typing import List, Dict, Optional, Tuple
-from app.models.activity import Activity
-
-class IndexMasterParser:
-    """Advanced parser for extracting real activities from INDEX_MASTER"""
-    
-    def __init__(self, index_file_path: str):
-        """Initialize parser with INDEX_MASTER file path"""
-        self.index_file_path = Path(index_file_path)
-        self.content = ""
-        self.activities = []
-        
-        # Category mapping for main sections (exact match from file)
-        self.category_mapping = {
-            '[A]': 'JOCURI CERCETĂȘEȘTI ȘI SCOUT',
-            '[B]': 'TEAM BUILDING ȘI COMUNICARE',
-            '[C]': 'CAMPING ȘI ACTIVITĂȚI EXTERIOR', 
-            '[D]': 'ESCAPE ROOM ȘI PUZZLE-URI',
-            '[E]': 'ORIENTARE ȘI BUSOLE',
-            '[F]': 'PRIMUL AJUTOR ȘI SIGURANȚA',
-            '[G]': 'ACTIVITĂȚI EDUCAȚIONALE',
-            '[H]': 'RESURSE SPECIALE'
-        }
-    
-    def load_content(self) -> bool:
-        """Load and validate INDEX_MASTER content"""
-        try:
-            if not self.index_file_path.exists():
-                print(f"❌ INDEX_MASTER file not found: {self.index_file_path}")
-                return False
-            
-            with open(self.index_file_path, 'r', encoding='utf-8') as f:
-                self.content = f.read()
-            
-            if len(self.content) < 1000:  # Sanity check
-                print(f"⚠️  INDEX_MASTER file seems too small: {len(self.content)} chars")
-                return False
-            
-            print(f"✅ Loaded INDEX_MASTER: {len(self.content)} characters")
-            return True
-            
-        except Exception as e:
-            print(f"❌ Error loading INDEX_MASTER: {e}")
-            return False
-    
-    def parse_all_categories(self) -> List[Activity]:
-        """Parse all categories and extract individual activities"""
-        if not self.load_content():
-            return []
-        
-        print("🔍 Starting comprehensive parsing of INDEX_MASTER...")
-        
-        # Parse each main category
-        for category_code, category_name in self.category_mapping.items():
-            print(f"\n📂 Processing category {category_code}: {category_name}")
-            category_activities = self.parse_category_section(category_code, category_name)
-            self.activities.extend(category_activities)
-            print(f"   ✅ Extracted {len(category_activities)} activities")
-        
-        print(f"\n🎯 Total activities extracted: {len(self.activities)}")
-        return self.activities
-    
-    def parse_category_section(self, category_code: str, category_name: str) -> List[Activity]:
-        """Parse a specific category section"""
-        activities = []
-        
-        # Find the category section - exact pattern match
-        # Look for the actual section, not the table of contents
-        pattern = rf"^## {re.escape(category_code)} {re.escape(category_name)}\s*$"
-        matches = list(re.finditer(pattern, self.content, re.MULTILINE | re.IGNORECASE))
-        
-        if not matches:
-            print(f"   ⚠️  Category section not found: {category_code}")
-            return activities
-        
-        # Take the last match (should be the actual section, not TOC)
-        match = matches[-1]
-        print(f"   📍 Found section at position {match.start()}")
-        
-        # Extract content until next main category or end
-        start_pos = match.end()
-        
-        # Find next main category (look for complete header)
-        next_category_pattern = r"^## \[[A-H]\] [A-ZĂÂÎȘȚ]"
-        next_match = re.search(next_category_pattern, self.content[start_pos:], re.MULTILINE)
-        
-        if next_match:
-            end_pos = start_pos + next_match.start()
-            section_content = self.content[start_pos:end_pos]
-        else:
-            section_content = self.content[start_pos:]
-        
-        # Parse subsections within the category
-        activities.extend(self._parse_subsections(section_content, category_name))
-        
-        return activities
-    
-    def _parse_subsections(self, section_content: str, category_name: str) -> List[Activity]:
-        """Parse subsections within a category"""
-        activities = []
-        
-        # Find all subsections (### markers)
-        subsection_pattern = r"^### (.+?)$"
-        subsections = re.finditer(subsection_pattern, section_content, re.MULTILINE)
-        
-        subsection_list = list(subsections)
-        
-        for i, subsection in enumerate(subsection_list):
-            subsection_title = subsection.group(1).strip()
-            subsection_start = subsection.end()
-            
-            # Find end of subsection
-            if i + 1 < len(subsection_list):
-                subsection_end = subsection_list[i + 1].start()
-            else:
-                subsection_end = len(section_content)
-            
-            subsection_text = section_content[subsection_start:subsection_end]
-            
-            # Parse individual games in this subsection
-            subsection_activities = self._parse_games_in_subsection(
-                subsection_text, category_name, subsection_title
-            )
-            activities.extend(subsection_activities)
-        
-        return activities
-    
-    def _parse_games_in_subsection(self, subsection_text: str, category_name: str, subsection_title: str) -> List[Activity]:
-        """Parse individual games within a subsection"""
-        activities = []
-        
-        # Look for "Exemple de jocuri:" sections
-        examples_pattern = r"\*\*Exemple de jocuri:\*\*\s*\n(.*?)(?=\n\*\*|$)"
-        examples_matches = re.finditer(examples_pattern, subsection_text, re.DOTALL)
-        
-        for examples_match in examples_matches:
-            examples_text = examples_match.group(1)
-            
-            # Extract individual games (numbered list)
-            game_pattern = r"^(\d+)\.\s*\*\*(.+?)\*\*\s*-\s*(.+?)$"
-            games = re.finditer(game_pattern, examples_text, re.MULTILINE)
-            
-            for game_match in games:
-                game_number = game_match.group(1)
-                game_name = game_match.group(2).strip()
-                game_description = game_match.group(3).strip()
-                
-                # Extract metadata from subsection
-                metadata = self._extract_subsection_metadata(subsection_text)
-                
-                # Create activity
-                activity = Activity(
-                    name=game_name,
-                    description=game_description,
-                    category=category_name,
-                    subcategory=subsection_title,
-                    source_file=f"INDEX_MASTER_JOCURI_ACTIVITATI.md",
-                    page_reference=f"{category_name} > {subsection_title} > #{game_number}",
-                    **metadata
-                )
-                
-                activities.append(activity)
-        
-        # Also extract from direct activity descriptions without "Exemple de jocuri"
-        activities.extend(self._parse_direct_activities(subsection_text, category_name, subsection_title))
-        
-        return activities
-    
-    def _extract_subsection_metadata(self, subsection_text: str) -> Dict:
-        """Extract metadata from subsection text"""
-        metadata = {}
-        
-        # Extract participants info
-        participants_pattern = r"\*\*Participanți:\*\*\s*(.+?)(?:\n|\*\*)"
-        participants_match = re.search(participants_pattern, subsection_text)
-        if participants_match:
-            participants_text = participants_match.group(1).strip()
-            participants = self._parse_participants(participants_text)
-            metadata.update(participants)
-        
-        # Extract duration
-        duration_pattern = r"\*\*Durata:\*\*\s*(.+?)(?:\n|\*\*)"
-        duration_match = re.search(duration_pattern, subsection_text)
-        if duration_match:
-            duration_text = duration_match.group(1).strip()
-            duration = self._parse_duration(duration_text)
-            metadata.update(duration)
-        
-        # Extract materials
-        materials_pattern = r"\*\*Materiale:\*\*\s*(.+?)(?:\n|\*\*)"
-        materials_match = re.search(materials_pattern, subsection_text)
-        if materials_match:
-            materials_text = materials_match.group(1).strip()
-            metadata['materials_list'] = materials_text
-            metadata['materials_category'] = self._categorize_materials(materials_text)
-        
-        # Extract keywords
-        keywords_pattern = r"\*\*Cuvinte cheie:\*\*\s*(.+?)(?:\n|\*\*)"
-        keywords_match = re.search(keywords_pattern, subsection_text)
-        if keywords_match:
-            metadata['keywords'] = keywords_match.group(1).strip()
-        
-        return metadata
-    
-    def _parse_participants(self, participants_text: str) -> Dict:
-        """Parse participants information"""
-        result = {}
-        
-        # Look for number ranges like "8-30 copii" or "5-15 persoane"
-        range_pattern = r"(\d+)-(\d+)"
-        range_match = re.search(range_pattern, participants_text)
-        
-        if range_match:
-            result['participants_min'] = int(range_match.group(1))
-            result['participants_max'] = int(range_match.group(2))
-        else:
-            # Look for single numbers
-            number_pattern = r"(\d+)\+"
-            number_match = re.search(number_pattern, participants_text)
-            if number_match:
-                result['participants_min'] = int(number_match.group(1))
-        
-        # Extract age information
-        age_pattern = r"(\d+)-(\d+)\s*ani"
-        age_match = re.search(age_pattern, participants_text)
-        if age_match:
-            result['age_group_min'] = int(age_match.group(1))
-            result['age_group_max'] = int(age_match.group(2))
-        
-        return result
-    
-    def _parse_duration(self, duration_text: str) -> Dict:
-        """Parse duration information"""
-        result = {}
-        
-        # Look for time ranges like "5-20 minute" or "15-30min"
-        range_pattern = r"(\d+)-(\d+)\s*(?:minute|min)"
-        range_match = re.search(range_pattern, duration_text)
-        
-        if range_match:
-            result['duration_min'] = int(range_match.group(1))
-            result['duration_max'] = int(range_match.group(2))
-        else:
-            # Look for single duration
-            single_pattern = r"(\d+)\+?\s*(?:minute|min)"
-            single_match = re.search(single_pattern, duration_text)
-            if single_match:
-                result['duration_min'] = int(single_match.group(1))
-        
-        return result
-    
-    def _categorize_materials(self, materials_text: str) -> str:
-        """Categorize materials into simple categories"""
-        materials_lower = materials_text.lower()
-        
-        if any(word in materials_lower for word in ['fără', 'nu necesare', 'nimic', 'minime']):
-            return 'Fără materiale'
-        elif any(word in materials_lower for word in ['hârtie', 'creion', 'marker', 'simple']):
-            return 'Materiale simple'
-        elif any(word in materials_lower for word in ['computer', 'proiector', 'echipament', 'complexe']):
-            return 'Materiale complexe'
-        else:
-            return 'Materiale variate'
-    
-    def _parse_direct_activities(self, subsection_text: str, category_name: str, subsection_title: str) -> List[Activity]:
-        """Parse activities that are described directly without 'Exemple de jocuri' section"""
-        activities = []
-        
-        # Look for activity descriptions in sections that don't have "Exemple de jocuri"
-        if "**Exemple de jocuri:**" not in subsection_text:
-            # Try to extract from file descriptions
-            file_pattern = r"\*\*Fișier:\*\*\s*`([^`]+)`.*?\*\*(.+?)\*\*"
-            file_matches = re.finditer(file_pattern, subsection_text, re.DOTALL)
-            
-            for file_match in file_matches:
-                file_name = file_match.group(1)
-                description_part = file_match.group(2)
-                
-                # Create a general activity for this file
-                activity = Activity(
-                    name=f"Activități din {file_name}",
-                    description=f"Colecție de activități din fișierul {file_name}. {description_part[:200]}...",
-                    category=category_name,
-                    subcategory=subsection_title,
-                    source_file=file_name,
-                    page_reference=f"{category_name} > {subsection_title}",
-                    **self._extract_subsection_metadata(subsection_text)
-                )
-                
-                activities.append(activity)
-        
-        return activities
-    
-    def validate_activity_completeness(self, activity: Activity) -> bool:
-        """Validate that an activity has all necessary fields"""
-        required_fields = ['name', 'description', 'category', 'source_file']
-        
-        for field in required_fields:
-            if not getattr(activity, field) or not getattr(activity, field).strip():
-                return False
-        
-        # Check minimum description length
-        if len(activity.description) < 10:
-            return False
-        
-        return True
-    
-    def get_parsing_statistics(self) -> Dict:
-        """Get statistics about the parsing process"""
-        if not self.activities:
-            return {'total_activities': 0}
-        
-        category_counts = {}
-        valid_activities = 0
-        
-        for activity in self.activities:
-            # Count by category
-            if activity.category in category_counts:
-                category_counts[activity.category] += 1
-            else:
-                category_counts[activity.category] = 1
-            
-            # Count valid activities
-            if self.validate_activity_completeness(activity):
-                valid_activities += 1
-        
-        return {
-            'total_activities': len(self.activities),
-            'valid_activities': valid_activities,
-            'completion_rate': (valid_activities / len(self.activities)) * 100 if self.activities else 0,
-            'category_breakdown': category_counts,
-            'average_description_length': sum(len(a.description) for a in self.activities) / len(self.activities) if self.activities else 0
-        }
\ No newline at end of file
diff --git a/app/services/search.py b/app/services/search.py
index a41857a..2a64261 100644
--- a/app/services/search.py
+++ b/app/services/search.py
@@ -5,8 +5,19 @@ Enhanced search with FTS5 and intelligent filtering
 
 from typing import List, Dict, Any, Optional
 from app.models.database import DatabaseManager
+from app.config_taxonomy import NON_GAME_CONTENT_TYPES
 import re
 
+# Category slugs that are themselves "non-game" — selecting one of these as a
+# category filter also lifts the default non-game content_type exclusion.
+NON_GAME_CATEGORIES = {"retete", "cantece-ceremonii"}
+
+# When a Python-side post-filter is active the DB LIMIT is applied *before*
+# filtering, so we over-fetch to still satisfy the caller's `limit`.
+_OVERSCAN_FACTOR = 5
+_OVERSCAN_CAP = 2000
+
+
 class SearchService:
     """Enhanced search service with intelligent query processing"""
     
@@ -24,22 +35,72 @@ class SearchService:
         
         if filters is None:
             filters = {}
-        
+
         # Process and normalize search text
         processed_search = self._process_search_text(search_text)
-        
+
         # Map web filters to database fields
         db_filters = self._map_filters_to_db_fields(filters)
-        
+
+        # content_type and language are filtered in Python: the DB layer does
+        # not expose them as query parameters. The DEFAULT search excludes the
+        # non-game content types (rețete / cântece / ceremonii) — they surface
+        # only when the user explicitly filters that content_type, or picks a
+        # non-game category. See plan §6.
+        content_type, exclude_non_game = self._resolve_content_type_filter(filters)
+        language = (filters.get('language') or '').strip().lower() or None
+        post_filtering = bool(content_type or exclude_non_game or language)
+
+        # Over-fetch when post-filtering, but never fetch fewer rows than `limit`.
+        fetch_limit = max(limit, min(limit * _OVERSCAN_FACTOR, _OVERSCAN_CAP)) if post_filtering else limit
+
         # Perform database search
         results = self.db.search_activities(
             search_text=processed_search,
             **db_filters,
-            limit=limit
+            limit=fetch_limit
         )
-        
-        # Post-process results for relevance and ranking
-        return self._post_process_results(results, processed_search, filters)
+
+        # Apply content_type / language post-filters
+        results = self._apply_content_type_filter(results, content_type, exclude_non_game)
+        if language:
+            results = [r for r in results
+                       if (r.get('language') or '').strip().lower() == language]
+
+        # Post-process results for relevance and ranking, then honour `limit`
+        results = self._post_process_results(results, processed_search, filters)
+        return results[:limit]
+
+    def _resolve_content_type_filter(self, filters: Dict[str, str]):
+        """Work out which content_type post-filter a search should use.
+
+        Returns a ``(content_type, exclude_non_game)`` pair:
+        - explicit `content_type` filter → ``(value, False)``;
+        - `category` filter naming a non-game category → ``(None, False)``;
+        - anything else → ``(None, True)``, the default game-only search.
+        """
+        requested_type = (filters.get('content_type') or '').strip()
+        if requested_type:
+            return requested_type, False
+
+        # Browsing a non-game category lifts the default exclusion.
+        chosen_category = (filters.get('category') or '').strip()
+        return None, chosen_category not in NON_GAME_CATEGORIES
+
+    def _apply_content_type_filter(self,
+                                   results: List[Dict[str, Any]],
+                                   content_type: Optional[str],
+                                   exclude_non_game: bool) -> List[Dict[str, Any]]:
+        """Keep only one content_type, or drop the non-game types, or pass through."""
+        def row_type(row: Dict[str, Any]) -> str:
+            return row.get('content_type') or ''  # NULL column -> ''
+
+        if content_type:
+            return [row for row in results if row_type(row) == content_type]
+        if not exclude_non_game:
+            return results
+        # NULL/unknown content_type rows survive: only known non-game types go.
+        return [row for row in results if row_type(row) not in NON_GAME_CONTENT_TYPES]
     
     def _process_search_text(self, search_text: Optional[str]) -> Optional[str]:
         """Process and enhance search text for better FTS5 results"""
@@ -83,10 +144,16 @@ class SearchService:
             if not filter_value or not filter_value.strip():
                 continue
             
+            # content_type / language are NOT database query params — they are
+            # applied as Python post-filters in search_activities(). Skip them
+            # here so they never reach DatabaseManager.search_activities().
+            if filter_key in ('content_type', 'language'):
+                continue
+
             # Map filter types to database fields
             if filter_key == 'category':
                 db_filters['category'] = filter_value
-            
+
             elif filter_key == 'age_group':
                 # Parse age range (e.g., "5-8 ani", "12+ ani")
                 age_match = re.search(r'(\d+)(?:-(\d+))?\s*ani?', filter_value)
@@ -177,21 +244,22 @@ class SearchService:
             boost_score = 0
             
             # Check name matches (highest priority)
-            name_lower = result.get('name', '').lower()
+            # NB: use `or ''` — nullable columns come back as None, not ''.
+            name_lower = (result.get('name') or '').lower()
             for term in search_terms:
                 if term in name_lower:
                     boost_score += 10
                     if name_lower.startswith(term):
                         boost_score += 5  # Extra boost for name starts with term
-            
+
             # Check description matches
-            desc_lower = result.get('description', '').lower()
+            desc_lower = (result.get('description') or '').lower()
             for term in search_terms:
                 if term in desc_lower:
                     boost_score += 3
-            
+
             # Check keywords matches
-            keywords_lower = result.get('keywords', '').lower()
+            keywords_lower = (result.get('keywords') or '').lower()
             for term in search_terms:
                 if term in keywords_lower:
                     boost_score += 5
@@ -280,11 +348,14 @@ class SearchService:
             return []
         
         try:
-            # Search for activities that match the partial query
+            # Search for activities that match the partial query.
+            # Over-fetch then drop non-game content types so autocomplete
+            # mirrors the default search (no rețete / cântece / ceremonii).
             results = self.db.search_activities(
                 search_text=f'"{partial_query}"',
-                limit=limit * 2
+                limit=limit * 6
             )
+            results = self._apply_content_type_filter(results, None, True)
             
             suggestions = []
             seen = set()
diff --git a/app/templates/activity.html b/app/templates/activity.html
index 6e25f08..d865f0a 100644
--- a/app/templates/activity.html
+++ b/app/templates/activity.html
@@ -15,7 +15,13 @@
     <header class="activity-detail-header">
         <div class="activity-title-section">
             <h1 class="activity-detail-title">{{ activity.name }}</h1>
-            <span class="activity-category-badge">{{ activity.category }}</span>
+            <span class="activity-category-badge">{{ display_names.get(activity.category, activity.category) }}</span>
+            {% if activity.content_type %}
+            <span class="activity-content-type-badge">{{ display_names.get(activity.content_type, activity.content_type) }}</span>
+            {% endif %}
+            {% if activity.needs_review %}
+            <span class="activity-badge needs-review" title="Această activitate necesită verificare">⚠ De verificat</span>
+            {% endif %}
         </div>
         
         {% if activity.subcategory %}
diff --git a/app/templates/index.html b/app/templates/index.html
index 8809c15..7baffeb 100644
--- a/app/templates/index.html
+++ b/app/templates/index.html
@@ -36,7 +36,31 @@
                     <select name="category" id="category" class="filter-select">
                         <option value="">Toate categoriile</option>
                         {% for category in filters.category %}
-                        <option value="{{ category }}">{{ category }}</option>
+                        <option value="{{ category }}">{{ display_names.get(category, category) }}</option>
+                        {% endfor %}
+                    </select>
+                </div>
+                {% endif %}
+
+                {% if filters.content_type %}
+                <div class="filter-group">
+                    <label for="content_type" class="filter-label">Tip conținut</label>
+                    <select name="content_type" id="content_type" class="filter-select">
+                        <option value="">Doar jocuri și activități</option>
+                        {% for content_type in filters.content_type %}
+                        <option value="{{ content_type }}">{{ display_names.get(content_type, content_type) }}</option>
+                        {% endfor %}
+                    </select>
+                </div>
+                {% endif %}
+
+                {% if filters.language %}
+                <div class="filter-group">
+                    <label for="language" class="filter-label">Limbă</label>
+                    <select name="language" id="language" class="filter-select">
+                        <option value="">Toate limbile</option>
+                        {% for language in filters.language %}
+                        <option value="{{ language }}">{{ display_names.get(language, language) }}</option>
                         {% endfor %}
                     </select>
                 </div>
diff --git a/app/templates/results.html b/app/templates/results.html
index fa835cb..f06166d 100644
--- a/app/templates/results.html
+++ b/app/templates/results.html
@@ -24,7 +24,29 @@
                 <option value="">Toate categoriile</option>
                 {% for category in filters.category %}
                 <option value="{{ category }}" {% if applied_filters.category == category %}selected{% endif %}>
-                    {{ category }}
+                    {{ display_names.get(category, category) }}
+                </option>
+                {% endfor %}
+            </select>
+            {% endif %}
+
+            {% if filters.content_type %}
+            <select name="content_type" class="filter-select compact">
+                <option value="">Doar jocuri și activități</option>
+                {% for content_type in filters.content_type %}
+                <option value="{{ content_type }}" {% if applied_filters.content_type == content_type %}selected{% endif %}>
+                    {{ display_names.get(content_type, content_type) }}
+                </option>
+                {% endfor %}
+            </select>
+            {% endif %}
+
+            {% if filters.language %}
+            <select name="language" class="filter-select compact">
+                <option value="">Toate limbile</option>
+                {% for language in filters.language %}
+                <option value="{{ language }}" {% if applied_filters.language == language %}selected{% endif %}>
+                    {{ display_names.get(language, language) }}
                 </option>
                 {% endfor %}
             </select>
@@ -109,7 +131,10 @@
                         {{ activity.name }}
                     </a>
                 </h3>
-                <span class="activity-category">{{ activity.category }}</span>
+                <span class="activity-category">{{ display_names.get(activity.category, activity.category) }}</span>
+                {% if activity.needs_review %}
+                <span class="activity-badge needs-review" title="Această activitate necesită verificare">⚠ De verificat</span>
+                {% endif %}
             </header>
 
             <div class="activity-content">
diff --git a/app/web/routes.py b/app/web/routes.py
index 6445e7a..56fd7ca 100644
--- a/app/web/routes.py
+++ b/app/web/routes.py
@@ -7,11 +7,17 @@ from flask import Blueprint, request, render_template, jsonify, current_app
 from app.models.database import DatabaseManager
 from app.models.activity import Activity
 from app.services.search import SearchService
+from app.config_taxonomy import CATEGORIES, CONTENT_TYPES
 import os
 from pathlib import Path
 
 bp = Blueprint('main', __name__)
 
+# Slug -> Romanian display name. Category, content_type and language slugs do
+# not collide, so a single flat map is enough for every UI filter label.
+LANGUAGE_NAMES = {'ro': 'Română', 'en': 'Engleză'}
+DISPLAY_NAMES = {**CATEGORIES, **CONTENT_TYPES, **LANGUAGE_NAMES}
+
 # Initialize database manager (will be configured in application factory)
 def get_db_manager():
     """Get database manager instance"""
@@ -36,15 +42,17 @@ def index():
         # Get database statistics for the interface
         stats = db.get_statistics()
         
-        return render_template('index.html', 
+        return render_template('index.html',
                              filters=filter_options,
+                             display_names=DISPLAY_NAMES,
                              stats=stats)
-    
+
     except Exception as e:
         print(f"Error loading main page: {e}")
         # Fallback with empty filters
-        return render_template('index.html', 
+        return render_template('index.html',
                              filters={},
+                             display_names=DISPLAY_NAMES,
                              stats={'total_activities': 0})
 
 @bp.route('/search', methods=['GET', 'POST'])
@@ -82,8 +90,9 @@ def search():
                              search_query=search_query,
                              applied_filters=filters,
                              filters=filter_options,
+                             display_names=DISPLAY_NAMES,
                              results_count=len(activities))
-    
+
     except Exception as e:
         print(f"Search error: {e}")
         return render_template('results.html',
@@ -91,6 +100,7 @@ def search():
                              search_query='',
                              applied_filters={},
                              filters={},
+                             display_names=DISPLAY_NAMES,
                              results_count=0,
                              error=str(e))
 
@@ -121,6 +131,7 @@ def activity_detail(activity_id):
         
         return render_template('activity.html',
                              activity=activity,
+                             display_names=DISPLAY_NAMES,
                              similar_activities=similar_activities)
     
     except Exception as e:
diff --git a/scripts/SUBAGENT_PROMPT.md b/scripts/SUBAGENT_PROMPT.md
new file mode 100644
index 0000000..79c3e9c
--- /dev/null
+++ b/scripts/SUBAGENT_PROMPT.md
@@ -0,0 +1,81 @@
+# SUBAGENT — Activity extraction
+
+You are a subagent in the game-library extraction pipeline. You extract
+educational activities (games, team-building, scouting, recipes, songs,
+ceremonies) from one chunk of a source document into structured JSON.
+
+## Your task
+
+1. **Read ONLY the chunk you were assigned.** Do not read other chunks, other
+   files, or the original document. The chunk is a `.txt` file with
+   `--- PAGE N ---` markers.
+2. Identify **every distinct activity** in the chunk.
+3. For each activity, fill the schema in `scripts/activity_schema.json`.
+4. Write the result to `data/extracted/<chunk_key>.json`.
+
+## What counts as "a distinct activity"
+
+A distinct activity is a self-contained game/activity/recipe/song/ceremony with
+its own name and a real description of how to do it. It is NOT:
+
+- a bare mention or a cross-reference with no description — **skip it**;
+- a sub-variant of an activity already extracted — fold it into `variations`;
+- a heading, a table of contents entry, or running page chrome.
+
+If the same activity is split across a page boundary inside your chunk, treat it
+as **one** activity and combine the text.
+
+## Output format
+
+The file is one JSON object: a `header` plus an `activities` array.
+
+```json
+{
+  "header": {
+    "source_id": "<set from the prompt>",
+    "chunk_key": "<set from the prompt>",
+    "source_hash": "<set from the prompt>",
+    "schema_version": "1.0",
+    "prompt_version": "1.0",
+    "chunk_range": "pages 1-20"
+  },
+  "activities": [ ... ]
+}
+```
+
+## Rules for each activity
+
+- **`name`** — the activity's real name (≥3 characters).
+- **`description`** — real prose describing the activity. No hard length limit,
+  but it must actually describe what happens.
+- **`rules`** — how it is played / carried out, if the source gives rules.
+- **`category`** — exactly one taxonomy slug (see the `enum` in the schema):
+  `jocuri-cercetasesti`, `team-building`, `icebreakers`, `camp-outdoor`,
+  `wide-games`, `orientare`, `prim-ajutor`, `escape-room-puzzle`,
+  `creative-stem`, `sports-active`, `cantece-ceremonii`, `retete`,
+  `supravietuire`, `integrare-incluziune`, `conflict-empatie`, `altele`.
+  When unsure, use `altele`.
+- **`content_type`** — the FORM of the content, independent of category:
+  `joc`, `activitate`, `reteta`, `cantec`, or `ceremonie`.
+- **`language`** — `ro` or `en` (the language the activity is written in).
+- **`source_excerpt`** — **MANDATORY.** A short quote (one or two sentences)
+  copied **verbatim** from the chunk. This is the anti-hallucination anchor: it
+  is checked as a fuzzy substring of the chunk, and an activity whose quote is
+  not found there is dropped.
+- **`page_reference`** — **MANDATORY.** The `--- PAGE N ---` marker(s) the
+  activity came from, e.g. `"page 14"` or `"pages 14-15"`.
+- **`extraction_confidence`** — `high`, `med`, or `low`. Use `low` when the
+  source text for the activity is thin or ambiguous.
+
+## Never invent data
+
+- Do **not** invent ages, participant counts, or durations. If the source does
+  not state them, leave those fields `null`.
+- Do **not** paraphrase the `source_excerpt` — copy it character for character.
+- Better to extract fewer activities accurately than to pad the output.
+
+## Before you finish
+
+- Every activity has a non-empty `source_excerpt` and `page_reference`.
+- The file validates against `scripts/activity_schema.json`.
+- You only used text from your assigned chunk.
diff --git a/scripts/activity_schema.json b/scripts/activity_schema.json
new file mode 100644
index 0000000..922dc86
--- /dev/null
+++ b/scripts/activity_schema.json
@@ -0,0 +1,110 @@
+{
+  "$schema": "http://json-schema.org/draft-07/schema#",
+  "title": "Game-library extraction output",
+  "description": "One subagent output file: a header carrying provenance/version metadata plus the list of activities extracted from a single chunk.",
+  "type": "object",
+  "required": ["header", "activities"],
+  "additionalProperties": false,
+  "properties": {
+    "header": {
+      "type": "object",
+      "required": ["source_hash", "schema_version", "prompt_version", "chunk_range"],
+      "additionalProperties": true,
+      "properties": {
+        "source_hash": {"type": "string", "minLength": 8},
+        "schema_version": {"type": "string"},
+        "prompt_version": {"type": "string"},
+        "chunk_range": {"type": "string"},
+        "source_id": {"type": ["string", "null"]},
+        "chunk_key": {"type": ["string", "null"]}
+      }
+    },
+    "activities": {
+      "type": "array",
+      "items": {"$ref": "#/definitions/activity"}
+    }
+  },
+  "definitions": {
+    "activity": {
+      "type": "object",
+      "required": [
+        "name",
+        "description",
+        "category",
+        "content_type",
+        "language",
+        "extraction_confidence",
+        "source_excerpt",
+        "page_reference"
+      ],
+      "additionalProperties": false,
+      "properties": {
+        "name": {"type": "string", "minLength": 3},
+        "description": {"type": "string", "minLength": 1},
+        "rules": {"type": ["string", "null"]},
+        "variations": {"type": ["string", "null"]},
+        "category": {
+          "type": "string",
+          "enum": [
+            "jocuri-cercetasesti",
+            "team-building",
+            "icebreakers",
+            "camp-outdoor",
+            "wide-games",
+            "orientare",
+            "prim-ajutor",
+            "escape-room-puzzle",
+            "creative-stem",
+            "sports-active",
+            "cantece-ceremonii",
+            "retete",
+            "supravietuire",
+            "integrare-incluziune",
+            "conflict-empatie",
+            "altele"
+          ]
+        },
+        "subcategory": {"type": ["string", "null"]},
+        "content_type": {
+          "type": "string",
+          "enum": ["joc", "activitate", "reteta", "cantec", "ceremonie"]
+        },
+        "language": {"type": "string", "enum": ["ro", "en"]},
+        "extraction_confidence": {
+          "type": "string",
+          "enum": ["high", "med", "low"]
+        },
+        "source_excerpt": {"type": "string", "minLength": 1},
+        "page_reference": {"type": "string", "minLength": 1},
+        "source_file": {"type": ["string", "null"]},
+        "age_group_min": {"type": ["integer", "null"], "minimum": 0},
+        "age_group_max": {"type": ["integer", "null"], "minimum": 0},
+        "participants_min": {"type": ["integer", "null"], "minimum": 0},
+        "participants_max": {"type": ["integer", "null"], "minimum": 0},
+        "duration_min": {"type": ["integer", "null"], "minimum": 0},
+        "duration_max": {"type": ["integer", "null"], "minimum": 0},
+        "materials_category": {"type": ["string", "null"]},
+        "materials_list": {
+          "type": ["array", "null"],
+          "items": {"type": "string"}
+        },
+        "skills_developed": {
+          "type": ["array", "null"],
+          "items": {"type": "string"}
+        },
+        "difficulty_level": {
+          "type": ["string", "null"],
+          "enum": ["usor", "mediu", "dificil", null]
+        },
+        "keywords": {
+          "type": ["array", "null"],
+          "items": {"type": "string"}
+        },
+        "tags": {
+          "type": ["array", "null"],
+          "items": {"type": "string"}
+        }
+      }
+    }
+  }
+}
diff --git a/scripts/build_database.py b/scripts/build_database.py
new file mode 100644
index 0000000..d7276be
--- /dev/null
+++ b/scripts/build_database.py
@@ -0,0 +1,639 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+build_database.py — build data/activities.db from the subagent extraction JSON.
+
+Replaces the old import_claude_activities.py. Pipeline (plan §4):
+
+  1. `--rebuild` builds into data/activities.db.tmp; on success the live DB is
+     backed up to data/activities.db.bak and the tmp file is swapped in with an
+     atomic os.replace. A mid-build crash leaves the live DB untouched.
+  2. Every data/extracted/*.json is validated against scripts/activity_schema.json;
+     invalid files are moved to data/extracted/_rejected/ with an error log.
+  2b. Each source_excerpt must appear as a fuzzy substring (rapidfuzz
+     partial_ratio >= 90) of its source chunk — non-matches are hallucinations
+     and the activity is dropped (logged to _rejected/).
+  3. `category` is normalized to a valid taxonomy slug (fallback `altele`).
+  4. Dedup (D5): group by exact normalized_name, never across languages; within a
+     group rapidfuzz on descriptions — >=85 auto-merge, 60-85 borderline (keep
+     both, needs_review), <60 separate variants.
+  5. data/review_decisions.json is applied before insert.
+  6. Bulk insert into the tmp DB, populate the categories table, rebuild FTS.
+  7. A QA report is printed.
+
+Usage:
+    python scripts/build_database.py --rebuild
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import os
+import shutil
+import sys
+from collections import defaultdict
+from pathlib import Path
+from typing import Any, Optional
+
+SCRIPT_DIR = Path(__file__).resolve().parent
+REPO_ROOT = SCRIPT_DIR.parent
+for _p in (str(SCRIPT_DIR), str(REPO_ROOT)):
+    if _p not in sys.path:
+        sys.path.insert(0, _p)
+
+from app.config_taxonomy import (  # noqa: E402
+    category_display_name,
+    normalize_category,
+    normalize_content_type,
+)
+from app.models.activity import Activity  # noqa: E402
+from app.models.database import DatabaseManager  # noqa: E402
+from import_common import (  # noqa: E402
+    DEFAULT_SCHEMA_PATH,
+    content_key,
+    excerpt_matches,
+    find_chunk_text,
+    iter_extraction_files,
+    load_schema,
+    normalize_name,
+    source_path_for,
+)
+
+# dedup thresholds (rapidfuzz token_sort_ratio, 0..100 scale)
+AUTO_MERGE_THRESHOLD = 85.0
+BORDERLINE_THRESHOLD = 60.0
+
+
+# --------------------------------------------------------------------------
+# extraction dict -> Activity
+# --------------------------------------------------------------------------
+def _csv(value: Any) -> Optional[str]:
+    """Flatten a schema array into the ", "-joined text kept in TEXT DB columns.
+
+    Strings are stripped, empty results become None, other scalars go through str().
+    """
+    if isinstance(value, str):
+        return value.strip() or None
+    if isinstance(value, (list, tuple)):
+        return ", ".join(s for s in (str(v).strip() for v in value) if s) or None
+    return None if value is None else str(value)
+
+
+def _split_csv(value: Optional[str]) -> list[str]:
+    """Split a comma-separated string into stripped, non-empty items."""
+    pieces = (raw.strip() for raw in str(value).split(",")) if value else ()
+    return [piece for piece in pieces if piece]
+
+
+def dict_to_activity(adict: dict, source_file: str) -> Activity:
+    """Convert one extraction-JSON activity object into an Activity.
+
+    `source_file` is the primary source; it is prepended to `source_files` if missing.
+    """
+    tags = adict.get("tags") or []
+    tags = _split_csv(tags) if isinstance(tags, str) else tags
+    sources = adict.get("source_files") or []
+    sources = _split_csv(sources) if isinstance(sources, str) else sources
+    if source_file and source_file not in sources:
+        sources = [source_file, *sources]
+
+    return Activity(
+        name=(adict.get("name") or "").strip(),
+        description=(adict.get("description") or "").strip(),
+        rules=adict.get("rules"),
+        variations=adict.get("variations"),
+        category=normalize_category(adict.get("category", "")),
+        subcategory=adict.get("subcategory"),
+        content_type=normalize_content_type(adict.get("content_type", "")),
+        source_file=source_file,
+        source_files=list(sources),
+        page_reference=adict.get("page_reference"),
+        source_excerpt=adict.get("source_excerpt"),
+        age_group_min=adict.get("age_group_min"),
+        age_group_max=adict.get("age_group_max"),
+        participants_min=adict.get("participants_min"),
+        participants_max=adict.get("participants_max"),
+        duration_min=adict.get("duration_min"),
+        duration_max=adict.get("duration_max"),
+        materials_category=adict.get("materials_category"),
+        materials_list=_csv(adict.get("materials_list")),
+        skills_developed=_csv(adict.get("skills_developed")),
+        difficulty_level=adict.get("difficulty_level"),
+        keywords=_csv(adict.get("keywords")),
+        tags=list(tags),
+        language=adict.get("language"),
+        extraction_confidence=adict.get("extraction_confidence"),
+    )
+
+
+# --------------------------------------------------------------------------
+# step 3 — category normalization is done in dict_to_activity; a non-taxonomy
+# value silently falls back to `altele`. This logs the substitutions.
+# --------------------------------------------------------------------------
+def log_category_fallbacks(raw_pairs: list[tuple[str, str]]) -> list[str]:
+    """Describe each (original, slug) pair whose category fell back to `altele`."""
+    return [
+        f"category '{raw}' -> altele (not in taxonomy)"
+        for raw, resolved in raw_pairs
+        if resolved == "altele" and normalize_name(raw or "") not in ("", "altele")
+    ]
+
+
+# --------------------------------------------------------------------------
+# step 4 — dedup
+# --------------------------------------------------------------------------
+def _longest(*values: Optional[str]) -> Optional[str]:
+    """Return the longest non-empty value, or None when there is none.
+
+    Ties keep the earliest value.
+    """
+    return max((text for text in values if text), key=len, default=None)
+
+
+def _union_csv(values: list[Optional[str]]) -> Optional[str]:
+    """Union comma-separated strings into one, first-seen order, no duplicates."""
+    # dict keys keep insertion order, so the dict acts as an ordered set
+    merged: dict[str, None] = {}
+    for csv_text in values:
+        merged.update(dict.fromkeys(_split_csv(csv_text)))
+    return ", ".join(merged) or None
+
+
+def merge_cluster(cluster: list[Activity]) -> Activity:
+    """Fold a cluster of duplicate activities into a single Activity.
+
+    Scalar fields come from the member with the longest description; rules and
+    variations keep the longest value found; CSV fields, tags and sources are
+    unioned in first-seen order. A one-member cluster is returned as is.
+    """
+    if len(cluster) == 1:
+        return cluster[0]
+
+    # representative = the member with the longest description
+    rep = max(cluster, key=lambda member: len(member.description or ""))
+    merged = Activity(
+        name=rep.name,
+        description=_longest(*[a.description for a in cluster]) or rep.description,
+        rules=_longest(*[a.rules for a in cluster]),
+        variations=_longest(*[a.variations for a in cluster]),
+        category=rep.category,
+        subcategory=rep.subcategory,
+        content_type=rep.content_type,
+        source_file=rep.source_file,
+        page_reference=rep.page_reference,
+        source_excerpt=rep.source_excerpt,
+        age_group_min=rep.age_group_min,
+        age_group_max=rep.age_group_max,
+        participants_min=rep.participants_min,
+        participants_max=rep.participants_max,
+        duration_min=rep.duration_min,
+        duration_max=rep.duration_max,
+        materials_category=rep.materials_category,
+        materials_list=_union_csv([a.materials_list for a in cluster]),
+        skills_developed=_union_csv([a.skills_developed for a in cluster]),
+        difficulty_level=rep.difficulty_level,
+        keywords=_union_csv([a.keywords for a in cluster]),
+        language=rep.language,
+        extraction_confidence=rep.extraction_confidence,
+    )
+    # union of tags (dict keys keep insertion order)
+    all_tags = (t for a in cluster for t in a.tags or [])
+    merged.tags = list(dict.fromkeys(all_tags))
+    # accumulate every source the activity was seen in
+    all_sources = (
+        s for a in cluster for s in [a.source_file, *(a.source_files or [])] if s
+    )
+    merged.source_files = list(dict.fromkeys(all_sources))
+    # popularity_score++ per merged duplicate (plan §4)
+    extra = len(cluster) - 1
+    merged.popularity_score = max(a.popularity_score for a in cluster) + extra
+    return merged
+
+
+def dedup_activities(activities: list[Activity]) -> tuple[list[Activity], dict]:
+    """
+    Dedup per plan D5; returns (deduplicated activities, stats counters).
+
+    Groups by (normalized_name, language) — different languages are NEVER
+    merged. Within a group, descriptions are clustered with rapidfuzz:
+      >= 85  -> same cluster (auto-merge)
+      60-85  -> borderline: kept as separate clusters, both flagged needs_review
+      < 60   -> separate variants
+    """
+    from rapidfuzz import fuzz
+
+    groups: dict[tuple, list[Activity]] = defaultdict(list)
+    for act in activities:
+        key = (act.normalized_name or normalize_name(act.name), act.language)
+        groups[key].append(act)
+
+    result: list[Activity] = []
+    stats = {"input": len(activities), "auto_merged": 0, "borderline": 0, "output": 0}
+
+    for members in groups.values():
+        clusters: list[list[Activity]] = []
+        borderline_idx: set[int] = set()  # cluster indexes to flag needs_review
+
+        for act in members:
+            best_idx, best_score = -1, -1.0
+            borderline_here: list[int] = []
+            for idx, cluster in enumerate(clusters):
+                score = fuzz.token_sort_ratio(
+                    act.description or "", cluster[0].description or ""  # vs. first member only
+                )
+                if score >= AUTO_MERGE_THRESHOLD:
+                    if score > best_score:
+                        best_idx, best_score = idx, score
+                elif score >= BORDERLINE_THRESHOLD:
+                    borderline_here.append(idx)
+            if best_idx >= 0:  # auto-merge wins; borderline hits for this activity are ignored
+                clusters[best_idx].append(act)
+            else:
+                clusters.append([act])
+                new_idx = len(clusters) - 1
+                for bidx in borderline_here:
+                    borderline_idx.add(bidx)
+                    borderline_idx.add(new_idx)
+
+        for idx, cluster in enumerate(clusters):
+            merged = merge_cluster(cluster)
+            if len(cluster) > 1:
+                stats["auto_merged"] += len(cluster) - 1
+            if idx in borderline_idx:
+                merged.needs_review = 1
+                stats["borderline"] += 1  # counts flagged clusters, not pairs
+            result.append(merged)
+
+    stats["output"] = len(result)
+    return result, stats
+
+
+# --------------------------------------------------------------------------
+# step 5 — review decisions
+# --------------------------------------------------------------------------
+def load_review_decisions(path: Path) -> dict:
+    """Load the decisions mapping; {} if missing/non-object, {} plus a warning if unreadable."""
+    data: Any = {}
+    try:
+        if path and path.is_file():
+            data = json.loads(path.read_text(encoding="utf-8"))
+    except (ValueError, OSError) as exc:  # bad JSON / bad UTF-8 / unreadable file
+        print(f"WARNING: ignoring review decisions {path}: {exc}", file=sys.stderr)
+    return data if isinstance(data, dict) else {}
+
+
+def apply_review_decisions(
+    activities: list[Activity], decisions: dict
+) -> tuple[list[Activity], dict]:
+    """
+    Filter and update activities using data/review_decisions.json (plan §5c).
+
+    Decisions are looked up by each activity's content_key. `drop` discards the
+    activity; `keep-separate` and `merge` reset needs_review to 0. Activities
+    without a decision pass through untouched. Returns (kept activities,
+    {"dropped": n, "resolved": n}).
+    """
+    survivors: list[Activity] = []
+    dropped = resolved = 0
+    for activity in activities:
+        name_key = activity.normalized_name or normalize_name(activity.name)
+        lookup = content_key(name_key, activity.language, activity.description or "")
+        verdict = decisions.get(lookup)
+        # an entry is either {"decision": ...} or the bare decision value
+        if isinstance(verdict, dict):
+            verdict = verdict.get("decision")
+        if verdict == "drop":
+            dropped += 1
+            continue
+        if verdict in ("keep-separate", "merge"):
+            activity.needs_review = 0
+            resolved += 1
+        survivors.append(activity)
+    return survivors, {"dropped": dropped, "resolved": resolved}
+
+
+# --------------------------------------------------------------------------
+# golden-set recall (plan §7)
+# --------------------------------------------------------------------------
+def _golden_names(data: Any) -> list[str]:
+    """Names from a golden payload (str or {"name": ...} entries); scalar/null payloads give []."""
+    items = data.get("activities", data) if isinstance(data, dict) else data
+    items = items if isinstance(items, (list, tuple, dict)) else []  # e.g. 5, "x", None
+    return [
+        item if isinstance(item, str) else item["name"]
+        for item in items
+        if isinstance(item, str) or (isinstance(item, dict) and item.get("name"))
+    ]
+
+
+def golden_recall(golden_dir: Path, activities: list[Activity]) -> Optional[dict]:
+    """Recall of golden-set names among `activities`; None if no dir or no names."""
+    if not golden_dir or not golden_dir.is_dir():
+        return None
+    known = {normalize_name(a.name) for a in activities}
+    wanted: list[str] = []
+    # unreadable or malformed golden files are skipped
+    for golden_file in sorted(golden_dir.glob("*.json")):
+        try:
+            data = json.loads(golden_file.read_text(encoding="utf-8"))
+        except (json.JSONDecodeError, OSError):
+            continue
+        wanted.extend(_golden_names(data))
+    if not wanted:
+        return None
+    matched = sum(1 for name in wanted if normalize_name(name) in known)
+    return {"expected": len(wanted), "found": matched, "recall": round(matched / len(wanted), 3)}
+
+
+# --------------------------------------------------------------------------
+# load + validate + excerpt-check the extraction files
+# --------------------------------------------------------------------------
+def collect_activities(
+    extracted_dir: Path,
+    chunks_dir: Path,
+    sources_dir: Path,
+    schema: dict,
+) -> dict:
+    """Validate, excerpt-check and convert every extraction file into a report dict.
+
+    Files that are not UTF-8 JSON or fail the schema are moved to `_rejected/`;
+    activities whose excerpt is not found in the chunk text are dropped and logged.
+    """
+    from import_common import chunk_key_for, validate_extraction
+
+    rejected_dir = extracted_dir / "_rejected"
+    activities: list[Activity] = []
+    report = {
+        "files_total": 0,
+        "files_valid": 0,
+        "files_rejected_schema": 0,
+        "activities_raw": 0,
+        "activities_hallucinated": 0,
+        "category_fallbacks": [],
+    }
+    raw_categories: list[tuple[str, str]] = []
+
+    for json_path in iter_extraction_files(extracted_dir):
+        report["files_total"] += 1
+        try:
+            data = json.loads(json_path.read_text(encoding="utf-8"))
+        except (json.JSONDecodeError, UnicodeDecodeError) as exc:  # bad JSON or bad UTF-8
+            errors = [f"invalid JSON: {exc}"]
+        else:
+            errors = validate_extraction(data, schema)
+        if errors:
+            _reject_file(json_path, rejected_dir, errors)
+            report["files_rejected_schema"] += 1
+            continue
+        report["files_valid"] += 1
+
+        header = data.get("header", {})
+        chunk_text = find_chunk_text(json_path, header, chunks_dir)
+        source_id = header.get("source_id") or chunk_key_for(json_path, header).rsplit(
+            ".part", 1
+        )[0]
+        fallback_source = (
+            source_path_for(source_id, sources_dir) or source_id or json_path.stem
+        )
+
+        hallucinated: list[dict] = []
+        for adict in data.get("activities", []):
+            report["activities_raw"] += 1
+            excerpt = adict.get("source_excerpt") or ""
+            # if the chunk text is unavailable we cannot verify — keep but the
+            # QA report still counts it under activities_raw.
+            if chunk_text is not None and not excerpt_matches(excerpt, chunk_text):
+                hallucinated.append(adict)
+                report["activities_hallucinated"] += 1
+                continue
+            src = adict.get("source_file") or fallback_source
+            raw_categories.append((adict.get("category", ""), normalize_category(adict.get("category", ""))))
+            activities.append(dict_to_activity(adict, src))
+
+        if hallucinated:
+            _log_hallucinations(json_path, rejected_dir, hallucinated)
+
+    report["category_fallbacks"] = log_category_fallbacks(raw_categories)
+    report["activities"] = activities
+    return report
+
+
+def _reject_file(json_path: Path, rejected_dir: Path, errors: list[str]) -> None:
+    """Move a rejected extraction file into `rejected_dir` and write its error log.
+
+    The log is `<stem>.errors.txt`, with one bullet per error.
+    """
+    rejected_dir.mkdir(parents=True, exist_ok=True)
+    shutil.move(str(json_path), str(rejected_dir / json_path.name))
+    body = "\n".join(f"  - {e}" for e in errors)
+    report = f"REJECTED (schema validation): {json_path.name}\n\n{body}\n"
+    log_path = rejected_dir / f"{json_path.stem}.errors.txt"
+    log_path.write_text(report, encoding="utf-8")
+
+
+def _log_hallucinations(
+    json_path: Path, rejected_dir: Path, hallucinated: list[dict]
+) -> None:
+    """Write `<stem>.hallucinations.txt` listing each dropped activity and its excerpt."""
+    rejected_dir.mkdir(parents=True, exist_ok=True)
+    report = [f"DROPPED activities (source_excerpt not found in chunk): {json_path.name}", ""]
+    for dropped in hallucinated:
+        report += [f"  - {dropped.get('name')!r}", f"    excerpt: {dropped.get('source_excerpt')!r}"]
+    target = rejected_dir / f"{json_path.stem}.hallucinations.txt"
+    target.write_text("\n".join(report) + "\n", encoding="utf-8")
+
+
+# --------------------------------------------------------------------------
+# DB write + atomic swap
+# --------------------------------------------------------------------------
+def _enrich_category_display_names(db_path: Path) -> None:
+    """Set display_name of every type='category' row to category_display_name(slug)."""
+    import sqlite3
+
+    connection = sqlite3.connect(db_path)
+    try:
+        # read every slug first, then apply the updates as one batch
+        select = "SELECT value FROM categories WHERE type = 'category'"
+        slugs = [value for (value,) in connection.execute(select).fetchall()]
+        updates = [(category_display_name(slug), slug) for slug in slugs]
+        connection.executemany(
+            "UPDATE categories SET display_name = ? WHERE type='category' AND value = ?",
+            updates,
+        )
+        connection.commit()
+    finally:
+        connection.close()
+
+
+def write_database(db_tmp_path: Path, activities: list[Activity]) -> None:
+    """Build a fresh DB at `db_tmp_path`: insert rows, label categories, rebuild FTS."""
+    if db_tmp_path.exists():
+        db_tmp_path.unlink()
+    manager = DatabaseManager(str(db_tmp_path))
+    manager.bulk_insert_activities(activities)
+    _enrich_category_display_names(db_tmp_path)
+    manager.rebuild_fts_index()
+
+
+def atomic_swap(db_tmp_path: Path, db_path: Path) -> Optional[Path]:
+    """Copy any live DB to `<db>.bak`, os.replace the tmp file over it, return the backup."""
+    # the backup path stays None when there is no live DB to copy
+    saved = db_path.with_suffix(db_path.suffix + ".bak") if db_path.exists() else None
+    if saved is not None:
+        shutil.copy2(db_path, saved)
+    os.replace(db_tmp_path, db_path)
+    return saved
+
+
+# --------------------------------------------------------------------------
+# orchestration
+# --------------------------------------------------------------------------
+def rebuild(
+    *,
+    extracted_dir: Path,
+    chunks_dir: Path,
+    sources_dir: Path,
+    db_path: Path,
+    decisions_path: Optional[Path] = None,
+    schema_path: Path = DEFAULT_SCHEMA_PATH,
+    golden_dir: Optional[Path] = None,
+    do_swap: bool = True,
+) -> dict:
+    """
+    Full rebuild into <db_path>.tmp; the live DB is only touched by the final swap,
+    so a crash above leaves it intact. With do_swap=False the tmp DB is left in place.
+    """
+    extracted_dir = Path(extracted_dir)
+    db_path = Path(db_path)
+    db_tmp_path = db_path.with_suffix(db_path.suffix + ".tmp")
+
+    schema = load_schema(schema_path)
+    collected = collect_activities(extracted_dir, Path(chunks_dir), Path(sources_dir), schema)
+    activities: list[Activity] = collected.pop("activities")  # keeps the objects out of the report
+
+    deduped, dedup_stats = dedup_activities(activities)
+
+    decisions = load_review_decisions(Path(decisions_path)) if decisions_path else {}
+    final, decision_stats = apply_review_decisions(deduped, decisions)
+
+    try:
+        write_database(db_tmp_path, final)
+        backup = atomic_swap(db_tmp_path, db_path) if do_swap else None
+    except Exception:
+        if db_tmp_path.exists():
+            db_tmp_path.unlink()  # do not leave a partially written tmp DB behind
+        raise
+
+    report = {
+        **collected,
+        "dedup": dedup_stats,
+        "decisions": decision_stats,
+        "final_count": len(final),
+        "backup": str(backup) if backup else None,
+        "swapped": do_swap,
+        "qa": _qa_report(final, collected, golden_dir),
+    }
+    return report
+
+
+def _qa_report(
+    activities: list[Activity], collected: dict, golden_dir: Optional[Path]
+) -> dict:
+    """Build the QA summary dict from the final activities and collection counters."""
+    by_category: dict[str, int] = defaultdict(int)
+    by_content_type: dict[str, int] = defaultdict(int)
+    by_confidence: dict[str, int] = defaultdict(int)
+    for activity in activities:
+        by_category[activity.category] += 1
+        by_content_type[activity.content_type or "?"] += 1
+        by_confidence[activity.extraction_confidence or "?"] += 1
+    ruled = sum(1 for a in activities if a.rules and a.rules.strip())
+    flagged = sum(1 for a in activities if a.needs_review)
+    raw_count = collected.get("activities_raw", 0)
+    dropped = collected.get("activities_hallucinated", 0)
+    return {
+        "total": len(activities),
+        "per_category": dict(by_category),
+        "per_content_type": dict(by_content_type),
+        "extraction_confidence": dict(by_confidence),
+        "pct_with_rules": round(100 * ruled / len(activities), 1) if activities else 0.0,
+        "needs_review": flagged,
+        "hallucination_rate": round(100 * dropped / raw_count, 2) if raw_count else 0.0,
+        "golden_recall": golden_recall(Path(golden_dir), activities) if golden_dir else None,
+    }
+
+
+def print_report(report: dict) -> None:
+    """Pretty-print the dict returned by rebuild() as a fixed-width QA report."""
+    qa, bar = report["qa"], "=" * 60
+    print(f"{bar}\nBUILD DATABASE — QA REPORT\n{bar}")
+    print(f"extraction files     : {report['files_total']} "
+          f"(valid {report['files_valid']}, schema-rejected {report['files_rejected_schema']})")
+    print(f"activities raw       : {report['activities_raw']}")
+    print(f"  hallucinated drop  : {report['activities_hallucinated']} "
+          f"({qa['hallucination_rate']}%)")
+    dedup = report["dedup"]
+    print(f"dedup                : {dedup['input']} -> {dedup['output']} "
+          f"(auto-merged {dedup['auto_merged']}, borderline {dedup['borderline']})")
+    decisions = report["decisions"]
+    print(f"review decisions     : dropped {decisions['dropped']}, "
+          f"resolved {decisions['resolved']}")
+    print(f"final inserted       : {report['final_count']}")
+    print(f"% with rules         : {qa['pct_with_rules']}")
+    print(f"needs_review rows    : {qa['needs_review']}")
+    # shared row printer for the three breakdown tables below
+    def emit(pairs) -> None:
+        for label, count in pairs:
+            print(f"  {label:<24}: {count}")
+    print("per category         :")
+    emit(sorted(qa["per_category"].items(), key=lambda kv: kv[1], reverse=True))
+    print("per content_type     :")
+    emit(sorted(qa["per_content_type"].items(), key=lambda kv: kv[1], reverse=True))
+    print("extraction_confidence:")
+    emit(sorted(qa["extraction_confidence"].items()))
+    recall = qa["golden_recall"]
+    if recall:
+        print(f"golden recall        : {recall['found']}/{recall['expected']} = {recall['recall']}")
+    if report["category_fallbacks"]:
+        print("category fallbacks   :")
+        print("\n".join(f"  {msg}" for msg in report["category_fallbacks"]))
+    if report["backup"]:
+        print(f"live DB backed up to : {report['backup']}")
+    print(bar)
+
+
+# --------------------------------------------------------------------------
+# CLI
+# --------------------------------------------------------------------------
+def main(argv: Optional[list[str]] = None) -> int:
+    """CLI entry point: parse the flags, run rebuild() and print its QA report."""
+    parser = argparse.ArgumentParser(description="Build activities.db from extraction JSON.")
+    parser.add_argument("--rebuild", action="store_true",
+                        help="rebuild the database from scratch (only mode supported)")
+    path_flags = (
+        ("--extracted", "data/extracted"),
+        ("--chunks", "data/chunks"),
+        ("--sources", "data/sources"),
+        ("--db", "data/activities.db"),
+        ("--decisions", "data/review_decisions.json"),
+        ("--golden", "data/golden"),
+        ("--schema", str(DEFAULT_SCHEMA_PATH)),
+    )
+    for flag, default in path_flags:
+        parser.add_argument(flag, default=default)
+
+    opts = parser.parse_args(argv)
+    if not opts.rebuild:
+        parser.error("only --rebuild is supported (full rebuild, no incremental merge)")
+    print_report(rebuild(
+        extracted_dir=Path(opts.extracted), chunks_dir=Path(opts.chunks),
+        sources_dir=Path(opts.sources), db_path=Path(opts.db),
+        decisions_path=Path(opts.decisions), schema_path=Path(opts.schema),
+        golden_dir=Path(opts.golden),
+    ))
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/scripts/chunk_sources.py b/scripts/chunk_sources.py
new file mode 100644
index 0000000..0844b10
--- /dev/null
+++ b/scripts/chunk_sources.py
@@ -0,0 +1,251 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+chunk_sources.py — split normalized data/sources/*.txt into ~20-page chunks
+for subagent extraction, and maintain data/chunks/manifest.json.
+
+Paginated text  → ~20-page chunks, ~4-page overlap (plan D8).
+Unpaginated text → ~10000-word windows, ~2000-word overlap.
+
+The manifest is a cache derived from the filesystem + per-chunk state. Re-running
+this script is idempotent: existing chunk states (pending/assigned/done/rejected)
+survive as long as the source content hash is unchanged.
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import sys
+from pathlib import Path
+
+SCRIPT_DIR = Path(__file__).resolve().parent
+if str(SCRIPT_DIR) not in sys.path:
+    sys.path.insert(0, str(SCRIPT_DIR))
+
+from extract_common import content_hash, split_pages  # noqa: E402
+
+SCHEMA_VERSION = "1.0"
+PAGES_PER_CHUNK = 20
+PAGE_OVERLAP = 4
+WORD_WINDOW = 10_000
+WORD_OVERLAP = 2_000
+
+VALID_STATES = {"pending", "assigned", "done", "rejected"}
+
+
+# --------------------------------------------------------------------------
+# header parsing
+# --------------------------------------------------------------------------
+def parse_source(text: str) -> tuple[dict, str]:
+    """Return (header fields, body) of a normalized source file.
+
+    Header `key: value` lines are read up to a rule line made only of "=". The body
+    starts at the first `--- PAGE ` line if any, else after the rule, else at line 0.
+    """
+    rows = text.splitlines()
+    fields: dict = {}
+    cut = 0
+    header_open = True
+    for pos, row in enumerate(rows):
+        if row.startswith("--- PAGE "):
+            cut = pos  # a page marker always wins, even after the rule line
+            break
+        stripped = row.strip()
+        if header_open and stripped and set(stripped) == {"="}:
+            cut, header_open = pos + 1, False
+        elif header_open and ":" in row:
+            name, _, value = row.partition(":")
+            fields[name.strip()] = value.strip()
+    return fields, "\n".join(rows[cut:])
+
+
+# --------------------------------------------------------------------------
+# chunking — pure functions
+# --------------------------------------------------------------------------
+def chunk_pages(
+    pages: list[tuple[int, str]],
+    pages_per_chunk: int = PAGES_PER_CHUNK,
+    overlap: int = PAGE_OVERLAP,
+) -> list[dict]:
+    """
+    Group ordered (page_no, text) pairs into overlapping page windows.
+
+    Each window holds up to `pages_per_chunk` pages and starts
+    `pages_per_chunk - overlap` pages (at least 1) after the previous one, so
+    with overlap < pages_per_chunk neighbouring windows share `overlap` pages.
+    Windowing stops with the first window that reaches the last page.
+    """
+    step = max(1, pages_per_chunk - overlap)
+    total = len(pages)
+    result: list[dict] = []
+    for begin in range(0, total, step):
+        batch = pages[begin : begin + pages_per_chunk]
+        lo, hi = batch[0][0], batch[-1][0]
+        body = "".join(
+            f"\n--- PAGE {no} ---\n{content}\n" for no, content in batch
+        )
+        result.append({
+            "page_start": lo,
+            "page_end": hi,
+            "chunk_range": f"pages {lo}-{hi}",
+            "text": body,
+        })
+        if begin + pages_per_chunk >= total:
+            break  # this window already reached the final page
+    return result
+
+
+def chunk_words(
+    text: str, window: int = WORD_WINDOW, overlap: int = WORD_OVERLAP
+) -> list[dict]:
+    """Cut whitespace-split text into overlapping windows of up to `window` words."""
+    tokens = text.split()
+    step = max(1, window - overlap)
+    total = len(tokens)
+    result: list[dict] = []
+    for begin in range(0, total, step):
+        piece = tokens[begin : begin + window]
+        # word_end is exclusive: begin + number of words actually taken
+        end = begin + len(piece)
+        result.append({
+            "word_start": begin,
+            "word_end": end,
+            "chunk_range": f"words {begin}-{end}",
+            "text": " ".join(piece),
+        })
+        if begin + window >= total:
+            break  # last window reached the end of the text
+    return result
+
+
+def make_chunks(source_text: str) -> list[dict]:
+    """Chunk one normalized source: page windows if paginated, else word windows."""
+    body = parse_source(source_text)[1]  # header fields are not needed here
+    paged = split_pages(body)
+    if not paged:
+        return chunk_words(body)
+    return chunk_pages(paged)
+
+
+# --------------------------------------------------------------------------
+# manifest
+# --------------------------------------------------------------------------
+def _empty_manifest() -> dict:  # fresh manifest: current schema version, no chunks
+    return dict(schema_version=SCHEMA_VERSION, chunks={})
+
+
+def load_manifest(manifest_path: Path) -> dict:
+    """Load the manifest; return an empty one if it is missing, unreadable or malformed."""
+    try:
+        data = json.loads(manifest_path.read_text(encoding="utf-8"))
+        # .setdefault raises AttributeError when the JSON root is not an object
+        data.setdefault("schema_version", SCHEMA_VERSION)
+        chunks = data.setdefault("chunks", {})
+    except (OSError, ValueError, AttributeError):  # ValueError: bad JSON or bad UTF-8
+        return _empty_manifest()
+    return data if isinstance(chunks, dict) else _empty_manifest()
+
+
+def save_manifest(manifest: dict, manifest_path: Path) -> None:
+    manifest_path.parent.mkdir(parents=True, exist_ok=True)
+    tmp_path = manifest_path.with_name(manifest_path.name + ".tmp")  # same dir as the target
+    tmp_path.write_text(json.dumps(manifest, indent=2, ensure_ascii=False), encoding="utf-8")
+    tmp_path.replace(manifest_path)  # rename over the target: no half-written manifest.json
+
+
+def chunk_source_file(
+    source_path: Path, chunks_dir: Path, manifest: dict
+) -> list[str]:
+    """
+    Write the chunks of data/sources/<id>.txt to data/chunks/<id>/<id>.partNN.txt
+    and (re)register each one in `manifest["chunks"]`. A chunk keeps its previous
+    state only when its recorded source hash still matches and that state is
+    valid; otherwise it becomes "pending". Returns the chunk keys in part order.
+    """
+    source_id = source_path.stem
+    raw = source_path.read_text(encoding="utf-8", errors="replace")
+    digest = content_hash(raw)
+    pieces = make_chunks(raw)
+
+    target_dir = chunks_dir / source_id
+    target_dir.mkdir(parents=True, exist_ok=True)
+
+    keys: list[str] = []
+    for part_no, piece in enumerate(pieces, 1):
+        key = f"{source_id}.part{part_no:02d}"
+        path = target_dir / f"{key}.txt"
+        path.write_text(piece["text"], encoding="utf-8")
+
+        # a changed source hash invalidates whatever state the chunk had before
+        previous = manifest["chunks"].get(key) or {}
+        unchanged = previous.get("source_hash") == digest
+        state = "pending"
+        if unchanged and previous.get("state") in VALID_STATES:
+            state = previous["state"]
+
+        manifest["chunks"][key] = {
+            "source_id": source_id,
+            "source_hash": digest,
+            "part": part_no,
+            "chunk_range": piece["chunk_range"],
+            "chunk_file": str(path.relative_to(chunks_dir.parent)),
+            "expected_json": f"{key}.json",
+            "state": state,
+        }
+        keys.append(key)
+    return keys
+
+
+def prune_stale(manifest: dict, live_keys: set[str]) -> list[str]:
+    """Delete manifest chunk entries whose key is not in `live_keys`; return those keys."""
+    removed = [key for key in manifest["chunks"] if key not in live_keys]
+    for key in removed:
+        manifest["chunks"].pop(key)
+    return removed
+
+
+# --------------------------------------------------------------------------
+# CLI
+# --------------------------------------------------------------------------
+def run(sources_dir: Path, chunks_dir: Path) -> dict:
+    """
+    Chunk every *.txt in sources_dir, prune manifest entries this run did not
+    produce, save the manifest and return a summary. Raises NotADirectoryError if
+    sources_dir is missing (pruning would otherwise wipe every chunk state).
+    """
+    if not sources_dir.is_dir():
+        raise NotADirectoryError(f"sources dir not found: {sources_dir}")
+    manifest_path = chunks_dir / "manifest.json"
+    manifest = load_manifest(manifest_path)
+    source_files = sorted(sources_dir.glob("*.txt"))
+    live_keys: set[str] = set()
+    for src in source_files:
+        live_keys.update(chunk_source_file(src, chunks_dir, manifest))
+    stale = prune_stale(manifest, live_keys)
+    save_manifest(manifest, manifest_path)
+
+    states: dict[str, int] = {}
+    for meta in manifest["chunks"].values():
+        states[meta["state"]] = states.get(meta["state"], 0) + 1
+    return {"sources": len(source_files), "chunks": len(live_keys),
+            "pruned": len(stale), "states": states}
+
+
+def main(argv: list[str] | None = None) -> int:
+    """CLI entry point: chunk all sources and print the run summary."""
+    cli = argparse.ArgumentParser(description="Chunk normalized sources.")
+    cli.add_argument("--sources", default="data/sources", help="sources dir")
+    cli.add_argument("--chunks", default="data/chunks", help="chunks output dir")
+    opts = cli.parse_args(argv)
+    summary = run(Path(opts.sources), Path(opts.chunks))
+    print(f"sources processed : {summary['sources']}")
+    print(f"chunks written    : {summary['chunks']}")
+    print(f"stale pruned      : {summary['pruned']}")
+    for name, total in sorted(summary["states"].items()):
+        print(f"  {name:<10}: {total}")
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/scripts/claude_extraction_template.md b/scripts/claude_extraction_template.md
deleted file mode 100644
index f2137d1..0000000
--- a/scripts/claude_extraction_template.md
+++ /dev/null
@@ -1,54 +0,0 @@
-# TEMPLATE PENTRU EXTRACȚIE ACTIVITĂȚI CU CLAUDE
-
-## Instrucțiuni pentru Claude Code:
-
-Pentru fiecare PDF/DOC, folosește următorul format de extracție:
-
-### 1. Citește fișierul:
-```
-Claude, te rog citește fișierul: [CALE_FISIER]
-```
-
-### 2. Extrage activitățile folosind acest template JSON:
-```json
-{
-  "source_file": "[NUME_FISIER]",
-  "activities": [
-    {
-      "name": "Numele activității",
-      "description": "Descrierea completă a activității",
-      "rules": "Regulile jocului/activității",
-      "variations": "Variante sau adaptări",
-      "category": "[A-H] bazat pe tip",
-      "age_group_min": 6,
-      "age_group_max": 14,
-      "participants_min": 4,
-      "participants_max": 20,
-      "duration_min": 10,
-      "duration_max": 30,
-      "materials_list": "Lista materialelor necesare",
-      "skills_developed": "Competențe dezvoltate",
-      "difficulty_level": "Ușor/Mediu/Dificil",
-      "keywords": "cuvinte cheie separate prin virgulă",
-      "tags": "taguri relevante"
-    }
-  ]
-}
-```
-
-### 3. Salvează în fișier:
-După extracție, salvează JSON-ul în: `/scripts/extracted_activities/[NUME_FISIER].json`
-
-### 4. Priorități de procesare:
-
-**TOP PRIORITY (procesează primele):**
-1. 1000 Fantastic Scout Games.pdf
-2. Cartea Mare a jocurilor.pdf
-3. 160-de-activitati-dinamice-jocuri-pentru-team-building.pdf
-4. 101 Ways to Create an Unforgettable Camp Experience.pdf
-5. 151 Awesome Summer Camp Nature Activities.pdf
-
-**Categorii de focus:**
-- [A] Jocuri Cercetășești
-- [C] Camping & Activități Exterior
-- [G] Activități Educaționale
\ No newline at end of file
diff --git a/scripts/create_databases.py b/scripts/create_databases.py
deleted file mode 100644
index 515d3a4..0000000
--- a/scripts/create_databases.py
+++ /dev/null
@@ -1,164 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-DATABASE SETUP SCRIPT - INDEX-SISTEM-JOCURI
-
-Script pentru recrearea bazelor de date din .gitignore
-Folosește clasele DatabaseManager pentru consistență
-
-Usage:
-    python scripts/create_databases.py
-    python scripts/create_databases.py --clear-existing
-"""
-
-import sys
-import argparse
-from pathlib import Path
-
-# Add src to path so we can import our modules
-sys.path.append(str(Path(__file__).parent.parent / 'src'))
-
-from database import DatabaseManager
-from game_library_manager import GameLibraryManager
-
-def create_main_database(db_path: str = "data/activities.db", clear: bool = False):
-    """Create the main activities database"""
-    db_file = Path(db_path)
-    
-    if clear and db_file.exists():
-        print(f"🗑️  Removing existing database: {db_path}")
-        db_file.unlink()
-    
-    print(f"📊 Creating main database: {db_path}")
-    db = DatabaseManager(db_path)
-    
-    # Test the database
-    try:
-        stats = db.get_statistics()
-        print(f"✅ Database created successfully: {stats['total_activities']} activities")
-        return True
-    except Exception as e:
-        print(f"❌ Error creating database: {e}")
-        return False
-
-def create_game_library_database(db_path: str = "data/game_library.db", clear: bool = False):
-    """Create the legacy game library database"""
-    db_file = Path(db_path)
-    
-    if clear and db_file.exists():
-        print(f"🗑️  Removing existing database: {db_path}")
-        db_file.unlink()
-    
-    print(f"📊 Creating game library database: {db_path}")
-    manager = GameLibraryManager(db_path)
-    
-    print(f"✅ Game library database created successfully")
-    return True
-
-def create_test_database(db_path: str = "data/test_activities.db", clear: bool = False):
-    """Create the test database"""
-    db_file = Path(db_path)
-    
-    if clear and db_file.exists():
-        print(f"🗑️  Removing existing database: {db_path}")
-        db_file.unlink()
-    
-    print(f"📊 Creating test database: {db_path}")
-    db = DatabaseManager(db_path)
-    
-    # Add some test data
-    test_activity = {
-        'title': 'Test Activity - Setup Script',
-        'description': 'This is a test activity created by the setup script',
-        'file_path': 'test/sample.txt',
-        'file_type': 'TXT',
-        'category': 'test',
-        'age_group': '8-12 ani',
-        'participants': '5-10 persoane',
-        'duration': '15-30min',
-        'materials': 'Fără materiale',
-        'tags': '["test", "setup"]',
-        'source_text': 'Sample test content for verification'
-    }
-    
-    try:
-        db.insert_activity(test_activity)
-        stats = db.get_statistics()
-        print(f"✅ Test database created with sample data: {stats['total_activities']} activities")
-        return True
-    except Exception as e:
-        print(f"❌ Error creating test database: {e}")
-        return False
-
-def ensure_data_directory():
-    """Ensure the data directory exists"""
-    data_dir = Path("data")
-    if not data_dir.exists():
-        print(f"📁 Creating data directory: {data_dir}")
-        data_dir.mkdir(parents=True)
-    else:
-        print(f"📁 Data directory exists: {data_dir}")
-
-def main():
-    """Main setup function"""
-    parser = argparse.ArgumentParser(description='Create databases for INDEX-SISTEM-JOCURI')
-    parser.add_argument('--clear-existing', '-c', action='store_true',
-                       help='Remove existing databases before creating new ones')
-    parser.add_argument('--main-only', action='store_true',
-                       help='Create only the main activities database')
-    parser.add_argument('--test-only', action='store_true',
-                       help='Create only the test database')
-    
-    args = parser.parse_args()
-    
-    print("🚀 DATABASE SETUP - INDEX-SISTEM-JOCURI")
-    print("=" * 50)
-    
-    # Ensure data directory exists
-    ensure_data_directory()
-    
-    success_count = 0
-    total_count = 0
-    
-    if args.test_only:
-        total_count = 1
-        if create_test_database(clear=args.clear_existing):
-            success_count += 1
-    elif args.main_only:
-        total_count = 1
-        if create_main_database(clear=args.clear_existing):
-            success_count += 1
-    else:
-        # Create all databases
-        databases = [
-            ("Main activities", lambda: create_main_database(clear=args.clear_existing)),
-            ("Game library", lambda: create_game_library_database(clear=args.clear_existing)),
-            ("Test activities", lambda: create_test_database(clear=args.clear_existing))
-        ]
-        
-        total_count = len(databases)
-        
-        for name, create_func in databases:
-            print(f"\n📂 Creating {name} database...")
-            try:
-                if create_func():
-                    success_count += 1
-            except Exception as e:
-                print(f"❌ Failed to create {name} database: {e}")
-    
-    print("\n" + "=" * 50)
-    print(f"🎯 SUMMARY: {success_count}/{total_count} databases created successfully")
-    
-    if success_count == total_count:
-        print("✅ All databases ready!")
-        print("\nNext steps:")
-        print("1. Run indexer: cd src && python indexer.py --clear-db")
-        print("2. Start web app: cd src && python app.py")
-    else:
-        print("⚠️  Some databases failed to create. Check errors above.")
-        return 1
-    
-    return 0
-
-if __name__ == '__main__':
-    sys.exit(main())
\ No newline at end of file
diff --git a/scripts/extract_common.py b/scripts/extract_common.py
new file mode 100644
index 0000000..f9f1a37
--- /dev/null
+++ b/scripts/extract_common.py
@@ -0,0 +1,361 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+extract_common.py — single home for per-format text extraction.
+
+Every extractor returns a plain text *body* with synthetic page markers
+(`--- PAGE N ---`). The file-level header (`SOURCE:` / `CONVERTED:`) is added
+by normalize_sources.py, not here.
+
+Critical fix vs. the old pdf_to_text_converter.py: there is NO `max_pages` cap.
+Large books are extracted in full.
+"""
+
+from __future__ import annotations
+
+import hashlib
+import importlib
+import os
+import re
+import shutil
+import subprocess
+import tempfile
+import zipfile
+from pathlib import Path
+from typing import Callable
+
+PAGE_MARKER_RE = re.compile(r"^--- PAGE (\d+) ---\s*$", re.MULTILINE)
+
+# paragraphs per synthetic page for paginated-by-flow formats (docx)
+DOCX_PARAS_PER_PAGE = 40
+
+# formats we deliberately ignore (epub duplicates existing PDFs — plan §1)
+IGNORED_EXTENSIONS = {".epub"}
+
+# obvious junk filenames skipped during a walk
+JUNK_NAMES = {"desktop.ini", "linkuri-jocuri.txt"}
+JUNK_SUFFIXES = {".bak", ".tmp", ".ini"}
+
+
+# --------------------------------------------------------------------------
+# page assembly helpers
+# --------------------------------------------------------------------------
+def join_pages(pages: list[str], start: int = 1) -> str:
+    """Concatenate page texts, each stripped and preceded by `--- PAGE N ---`."""
+    return "".join(
+        f"\n--- PAGE {number} ---\n{(page or '').strip()}\n"
+        for number, page in enumerate(pages, start)
+    )
+
+
+def split_pages(body: str) -> list[tuple[int, str]]:
+    """
+    Inverse of join_pages: parse a body into [(page_number, stripped_text), ...].
+    Text before the first marker is dropped; no marker at all yields [].
+    """
+    markers = list(PAGE_MARKER_RE.finditer(body))
+    # a page ends where the next marker starts; the last one ends with the body
+    ends = [m.start() for m in markers[1:]] + [len(body)]
+    return [
+        (int(marker.group(1)), body[marker.end():stop].strip())
+        for marker, stop in zip(markers, ends)
+    ]
+
+
+def count_page_markers(body: str) -> int:  # number of `--- PAGE N ---` lines in body
+    return sum(1 for _ in PAGE_MARKER_RE.finditer(body))
+
+
+# --------------------------------------------------------------------------
+# format detection
+# --------------------------------------------------------------------------
+FORMAT_BY_EXT = {
+    ".pdf": "pdf",
+    ".docx": "docx",
+    ".doc": "doc",
+    ".pptx": "pptx",
+    ".ppt": "pptx",
+    ".htm": "html",
+    ".html": "html",
+    ".zip": "zip",
+    ".epub": "epub",
+    ".txt": "txt",
+}
+
+
+def detect_format(path: str | os.PathLike) -> str:
+    """Map a path's lower-cased extension to a format key ("unknown" if unmapped)."""
+    suffix = Path(path).suffix.lower()
+    return FORMAT_BY_EXT[suffix] if suffix in FORMAT_BY_EXT else "unknown"
+
+
+def is_junk(path: str | os.PathLike) -> bool:
+    """Return True for junk file names, README*.md files and junk suffixes."""
+    candidate = Path(path)
+    lowered_name = candidate.name.lower()
+    lowered_suffix = candidate.suffix.lower()
+    return (
+        lowered_name in JUNK_NAMES
+        or (lowered_name.startswith("readme") and lowered_suffix == ".md")
+        or lowered_suffix in JUNK_SUFFIXES
+    )
+
+
+# --------------------------------------------------------------------------
+# content hashing + near-duplicate elimination
+# --------------------------------------------------------------------------
+def _normalize_for_hash(text: str) -> str:  # None/"" -> ""; whitespace runs -> one space
+    return re.sub(r"\s+", " ", text or "").lower().strip()
+
+
+def content_hash(text: str) -> str:
+    """Hex SHA1 digest of the normalized (see _normalize_for_hash) text, UTF-8 encoded."""
+    return hashlib.new("sha1", _normalize_for_hash(text).encode("utf-8")).hexdigest()
+
+
+def near_duplicate_ratio(a: str, b: str) -> float:
+    """Return rapidfuzz's token_sort_ratio (0-100) of the two normalized texts."""
+    from rapidfuzz.fuzz import token_sort_ratio  # imported lazily, at call time
+
+    return token_sort_ratio(_normalize_for_hash(a), _normalize_for_hash(b))
+
+
+def dedupe_texts(
+    items: list[tuple[str, str]], threshold: float = 95.0
+) -> list[tuple[str, str]]:
+    """
+    Return the (key, text) pairs of `items` minus exact and near duplicates.
+
+    The first occurrence wins. A text is dropped when its content hash was
+    already kept, or when its near_duplicate_ratio against any kept text is
+    >= `threshold`.
+    """
+    survivors: list[tuple[str, str]] = []
+    kept_hashes: set[str] = set()
+    for key, text in items:
+        digest = content_hash(text)
+        if digest in kept_hashes or any(
+            near_duplicate_ratio(text, other) >= threshold for _, other in survivors
+        ):
+            continue
+        kept_hashes.add(digest)
+        survivors.append((key, text))
+    return survivors
+
+
+# --------------------------------------------------------------------------
+# preflight dependency check
+# --------------------------------------------------------------------------
+REQUIRED_PYTHON_MODULES = {
+    "pdfplumber": "pdfplumber",
+    "PyPDF2": "pypdf2",
+    "docx": "python-docx",
+    "pptx": "python-pptx",
+    "bs4": "beautifulsoup4",
+    "lxml": "lxml",
+    "jsonschema": "jsonschema",
+    "rapidfuzz": "rapidfuzz",
+    "chardet": "chardet",
+}
+
+
+def preflight(check_ocr: bool = False) -> dict:
+    """
+    Report which Python modules / system tools used by normalization are missing.
+
+    Returns {'ok', 'missing_python', 'missing_system', 'warnings'}; 'ok' is False
+    when any Python module is missing or OCR was requested without tesseract.
+    A missing libreoffice/soffice only produces a warning.
+    """
+    absent_modules: list[str] = []
+    for module_name, pip_name in REQUIRED_PYTHON_MODULES.items():
+        try:
+            importlib.import_module(module_name)
+        except ImportError:
+            absent_modules.append(pip_name)
+
+    notes: list[str] = []
+    office_found = bool(shutil.which("libreoffice") or shutil.which("soffice"))
+    if not office_found:
+        notes.append("libreoffice not found — legacy .doc files cannot be converted")
+
+    absent_tools: list[str] = []
+    if check_ocr and not shutil.which("tesseract"):
+        absent_tools.append("tesseract (OCR requested but not installed)")
+
+    return {
+        "ok": not (absent_modules or absent_tools),
+        "missing_python": absent_modules,
+        "missing_system": absent_tools,
+        "warnings": notes,
+    }
+
+
+# --------------------------------------------------------------------------
+# per-format extractors
+# --------------------------------------------------------------------------
+def extract_pdf(path: str | os.PathLike) -> str:
+    """PDF → page-marked body via pdfplumber; any failure falls back to PyPDF2."""
+    # both backends receive a plain string path
+    try:
+        return _extract_pdf_pdfplumber(str(path))
+    except Exception:  # includes ImportError when pdfplumber is not installed
+        return _extract_pdf_pypdf2(str(path))
+
+
+def _extract_pdf_pdfplumber(path: str) -> str:
+    """Extract all pages with pdfplumber; a page whose extraction raises becomes ""."""
+    import pdfplumber
+    def safe_text(sheet) -> str:
+        try:
+            return sheet.extract_text() or ""
+        except Exception:
+            return ""
+    with pdfplumber.open(path) as document:
+        texts = [safe_text(sheet) for sheet in document.pages]  # every page, no cap
+    return join_pages(texts)
+
+
+def _extract_pdf_pypdf2(path: str) -> str:
+    """Extract all pages with PyPDF2; a page whose extraction raises becomes ""."""
+    import PyPDF2
+
+    def safe_text(sheet) -> str:
+        try:
+            return sheet.extract_text() or ""
+        except Exception:
+            return ""
+    with open(path, "rb") as handle:
+        texts = [safe_text(sheet) for sheet in PyPDF2.PdfReader(handle).pages]  # no cap
+    return join_pages(texts)
+
+
+def extract_docx(path: str | os.PathLike) -> str:
+    """docx → body with a synthetic page every DOCX_PARAS_PER_PAGE paragraphs."""
+    import docx
+
+    size = DOCX_PARAS_PER_PAGE
+    lines = [para.text for para in docx.Document(str(path)).paragraphs]
+    # max(..., 1) keeps one (empty) page for a document without paragraphs
+    starts = range(0, max(len(lines), 1), size)
+    return join_pages(
+        ["\n".join(lines[at : at + size]) for at in starts]
+    )
+
+
+def extract_doc(path: str | os.PathLike) -> str:
+    """
+    Legacy .doc → body: convert to docx with headless libreoffice, then extract_docx.
+
+    Raises RuntimeError when no libreoffice/soffice binary is found or no .docx is
+    produced; subprocess.CalledProcessError / TimeoutExpired (300 s) propagate.
+    """
+    binary = shutil.which("libreoffice") or shutil.which("soffice")
+    if not binary:
+        raise RuntimeError("libreoffice/soffice not available — cannot convert .doc")
+
+    source = Path(path).resolve()
+    with tempfile.TemporaryDirectory() as workdir:
+        command = [
+            binary, "--headless", "--convert-to", "docx",
+            "--outdir", workdir, str(source),
+        ]
+        subprocess.run(command, check=True, capture_output=True, timeout=300)
+        # the converted file is looked up as <input stem>.docx inside workdir
+        result = Path(workdir) / (source.stem + ".docx")
+        if result.exists():
+            return extract_docx(result)
+        raise RuntimeError(f"libreoffice produced no output for {source.name}")
+
+
+def extract_pptx(path: str | os.PathLike) -> str:
+    """pptx → body. One page per slide: shape texts, then "[NOTES] ..." if any."""
+    from pptx import Presentation
+
+    presentation = Presentation(str(path))
+    pages: list[str] = []
+    for slide in presentation.slides:
+        parts: list[str] = []
+        for shape in slide.shapes:
+            if shape.has_text_frame and shape.text_frame.text.strip():
+                parts.append(shape.text_frame.text.strip())
+        if slide.has_notes_slide:
+            frame = slide.notes_slide.notes_text_frame  # None without a notes placeholder
+            if frame is not None and frame.text.strip():
+                parts.append(f"[NOTES] {frame.text.strip()}")
+        pages.append("\n".join(parts))
+    return join_pages(pages)
+
+
+def extract_html(path: str | os.PathLike) -> str:
+    """HTML mirror page → one-page body. Strips nav/script/style/footer/header/aside."""
+    import chardet
+    from bs4 import BeautifulSoup
+
+    raw = Path(path).read_bytes()
+    try:  # chardet can name a codec Python does not ship (e.g. EUC-TW)
+        markup = raw.decode(chardet.detect(raw).get("encoding") or "utf-8", errors="replace")
+    except LookupError:
+        markup = raw.decode("utf-8", errors="replace")
+    soup = BeautifulSoup(markup, "lxml")
+    for tag in soup(["script", "style", "nav", "footer", "header", "aside", "noscript"]):
+        tag.decompose()
+    # also drop common page chrome identified by its role attribute
+    for tag in soup.find_all(attrs={"role": ["navigation", "banner", "contentinfo"]}):
+        tag.decompose()
+    lines = [ln.strip() for ln in soup.get_text(separator="\n").splitlines() if ln.strip()]
+    return join_pages(["\n".join(lines)])
+
+
+def extract_zip(path: str | os.PathLike) -> str:
+    """
+    zip → body. Unzips into a temp dir and extracts every supported inner file
+    (nested zips included) in sorted path order; inner pages are renumbered into
+    one continuous body. A corrupt archive yields ""; an inner file that fails to
+    extract is skipped so it cannot discard the rest of the archive.
+    """
+    path = str(path)
+    pages: list[str] = []
+    with tempfile.TemporaryDirectory() as tmp:
+        try:
+            with zipfile.ZipFile(path) as zf:
+                zf.extractall(tmp)
+        except zipfile.BadZipFile:
+            return ""
+        # sorted() fixes the order in which inner files contribute pages
+        for inner in sorted(Path(tmp).rglob("*")):
+            if not inner.is_file() or is_junk(inner):
+                continue
+            if detect_format(inner) in ("unknown", "epub"):
+                continue  # no extractor is registered for these format keys
+            try:
+                # extract_file dispatches nested .zip back to extract_zip, so a
+                # broken nested archive is contained by this handler too
+                body = extract_file(inner)
+            except Exception:
+                continue
+            pages.extend(t for _, t in split_pages(body))
+    return join_pages(pages)
+
+
+EXTRACTORS: dict[str, Callable[[str | os.PathLike], str]] = {
+    "pdf": extract_pdf,
+    "docx": extract_docx,
+    "doc": extract_doc,
+    "pptx": extract_pptx,
+    "html": extract_html,
+    "zip": extract_zip,
+}
+
+
+def extract_file(path: str | os.PathLike) -> str:
+    """Extract one file to a page-marked body; ValueError if no extractor exists."""
+    fmt = detect_format(path)
+    if fmt != "txt":
+        handler = EXTRACTORS.get(fmt)
+        if handler is None:
+            raise ValueError(f"No extractor for format '{fmt}': {path}")
+        return handler(path)
+    # plain text: keep bodies that already carry page markers, else wrap as page 1
+    text = Path(path).read_text(encoding="utf-8", errors="replace")
+    return text if count_page_markers(text) else join_pages([text])
diff --git a/scripts/html_extractor.py b/scripts/html_extractor.py
deleted file mode 100644
index 08f5898..0000000
--- a/scripts/html_extractor.py
+++ /dev/null
@@ -1,424 +0,0 @@
-#!/usr/bin/env python3
-"""
-HTML Activity Extractor - Proceseaz 1876 fiiere HTML
-Extrage automat activiti folosind pattern recognition
-"""
-
-import os
-import re
-import json
-from pathlib import Path
-from bs4 import BeautifulSoup
-import chardet
-from typing import List, Dict, Optional
-import sqlite3
-from datetime import datetime
-
-class HTMLActivityExtractor:
-    def __init__(self, db_path='data/activities.db'):
-        self.db_path = db_path
-        # Pattern-uri pentru detectare activiti �n rom�n
-        self.activity_patterns = {
-            'title_patterns': [
-                r'(?i)(joc|activitate|exerci[t]iu|team[\s-]?building|energizer|ice[\s-]?breaker)[\s:]+([^\.]{5,100})',
-                r'(?i)<h[1-6][^>]*>([^<]*(?:joc|activitate|exerci[t]iu)[^<]*)</h[1-6]>',
-                r'(?i)<strong>([^<]*(?:joc|activitate|exerci[t]iu)[^<]*)</strong>',
-                r'(?i)^[\d]+\.?\s*([A-Z][^\.]{10,100}(?:joc|activitate|exerci[t]iu)[^\.]{0,50})$',
-            ],
-            'description_markers': [
-                'descriere', 'reguli', 'cum se joac[a]', 'instructiuni', 
-                'obiectiv', 'desfasurare', 'explicatie', 'mod de joc'
-            ],
-            'materials_markers': [
-                'materiale', 'necesare', 'echipament', 'ce avem nevoie',
-                'se folosesc', 'trebuie sa avem', 'dotari'
-            ],
-            'age_patterns': [
-                r'(?i)v[�a]rst[a][\s:]+(\d+)[\s-]+(\d+)',
-                r'(?i)(\d+)[\s-]+(\d+)\s*ani',
-                r'(?i)pentru\s+(\d+)[\s-]+(\d+)\s*ani',
-                r'(?i)categoria?\s*(?:de\s*)?v[�a]rst[a][\s:]+(\d+)[\s-]+(\d+)',
-            ],
-            'participants_patterns': [
-                r'(?i)(\d+)[\s-]+(\d+)\s*(?:participan[t]i|juc[a]tori|persoane|copii)',
-                r'(?i)num[a]r\s*(?:de\s*)?(?:participan[t]i|juc[a]tori)[\s:]+(\d+)[\s-]+(\d+)',
-                r'(?i)grup\s*de\s*(\d+)[\s-]+(\d+)',
-            ],
-            'duration_patterns': [
-                r'(?i)durat[a][\s:]+(\d+)[\s-]+(\d+)\s*(?:minute|min)',
-                r'(?i)timp[\s:]+(\d+)[\s-]+(\d+)\s*(?:minute|min)',
-                r'(?i)(\d+)[\s-]+(\d+)\s*minute',
-            ]
-        }
-        
-        # Categorii predefinite bazate pe sistemul existent
-        self.categories = {
-            '[A]': ['joc', 'joaca', 'distractie', 'amuzament'],
-            '[B]': ['aventura', 'explorare', 'descoperire'],
-            '[C]': ['camping', 'tabara', 'excursie', 'drumetie'],
-            '[D]': ['foc', 'flacara', 'lumina'],
-            '[E]': ['noduri', 'fr�nghii', 'sfori', 'legare'],
-            '[F]': ['bushcraft', 'supravietuire', 'survival'],
-            '[G]': ['educatie', 'educativ', 'invatare', 'scoala'],
-            '[H]': ['orientare', 'busola', 'harta', 'navigare']
-        }
-    
-    def detect_encoding(self, file_path):
-        """Detecteaz encoding-ul fiierului"""
-        with open(file_path, 'rb') as f:
-            result = chardet.detect(f.read())
-        return result['encoding'] or 'utf-8'
-    
-    def extract_from_html(self, html_path: str) -> List[Dict]:
-        """Extrage activiti dintr-un singur fiier HTML"""
-        activities = []
-        
-        try:
-            # Detectare encoding i citire
-            encoding = self.detect_encoding(html_path)
-            with open(html_path, 'r', encoding=encoding, errors='ignore') as f:
-                content = f.read()
-            
-            soup = BeautifulSoup(content, 'lxml')
-            
-            # Metod 1: Caut liste de activiti
-            activities.extend(self._extract_from_lists(soup, html_path))
-            
-            # Metod 2: Caut activiti �n headings
-            activities.extend(self._extract_from_headings(soup, html_path))
-            
-            # Metod 3: Caut pattern-uri �n text
-            activities.extend(self._extract_from_patterns(soup, html_path))
-            
-            # Metod 4: Caut �n tabele
-            activities.extend(self._extract_from_tables(soup, html_path))
-            
-        except Exception as e:
-            print(f"Error processing {html_path}: {e}")
-        
-        return activities
-    
-    def _extract_from_lists(self, soup, source_file):
-        """Extrage activiti din liste HTML (ul, ol)"""
-        activities = []
-        
-        for list_elem in soup.find_all(['ul', 'ol']):
-            # Verific dac lista pare s conin activiti
-            list_text = list_elem.get_text().lower()
-            if any(marker in list_text for marker in ['joc', 'activitate', 'exercitiu']):
-                for li in list_elem.find_all('li'):
-                    text = li.get_text(strip=True)
-                    if len(text) > 20:  # Minim 20 caractere pentru o activitate valid
-                        activity = self._create_activity_from_text(text, source_file)
-                        if activity:
-                            activities.append(activity)
-        
-        return activities
-    
-    def _extract_from_headings(self, soup, source_file):
-        """Extrage activiti bazate pe headings"""
-        activities = []
-        
-        for heading in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6']):
-            heading_text = heading.get_text(strip=True)
-            
-            # Verific dac heading-ul conine cuvinte cheie
-            if any(keyword in heading_text.lower() for keyword in ['joc', 'activitate', 'exercitiu']):
-                # Caut descrierea �n elementele urmtoare
-                description = ""
-                next_elem = heading.find_next_sibling()
-                
-                while next_elem and next_elem.name not in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
-                    if next_elem.name in ['p', 'div', 'ul']:
-                        description += next_elem.get_text(strip=True) + " "
-                        if len(description) > 500:  # Limit descriere
-                            break
-                    next_elem = next_elem.find_next_sibling()
-                
-                if description:
-                    activity = {
-                        'name': heading_text[:200],
-                        'description': description[:1000],
-                        'source_file': str(source_file),
-                        'category': self._detect_category(heading_text + " " + description)
-                    }
-                    activities.append(activity)
-        
-        return activities
-    
-    def _extract_from_patterns(self, soup, source_file):
-        """Extrage activiti folosind pattern matching"""
-        activities = []
-        text = soup.get_text()
-        
-        # Caut pattern-uri de activiti
-        for pattern in self.activity_patterns['title_patterns']:
-            matches = re.finditer(pattern, text, re.MULTILINE)
-            for match in matches:
-                title = match.group(0) if match.lastindex == 0 else match.group(match.lastindex)
-                if len(title) > 10:
-                    # Extrage context �n jurul match-ului
-                    start = max(0, match.start() - 200)
-                    end = min(len(text), match.end() + 500)
-                    context = text[start:end]
-                    
-                    activity = self._create_activity_from_text(context, source_file, title)
-                    if activity:
-                        activities.append(activity)
-        
-        return activities
-    
-    def _extract_from_tables(self, soup, source_file):
-        """Extrage activiti din tabele"""
-        activities = []
-        
-        for table in soup.find_all('table'):
-            rows = table.find_all('tr')
-            if len(rows) > 1:  # Cel puin header i o linie de date
-                # Detecteaz coloanele relevante
-                headers = [th.get_text(strip=True).lower() for th in rows[0].find_all(['th', 'td'])]
-                
-                for row in rows[1:]:
-                    cells = row.find_all(['td'])
-                    if cells:
-                        activity_data = {}
-                        for i, cell in enumerate(cells):
-                            if i < len(headers):
-                                activity_data[headers[i]] = cell.get_text(strip=True)
-                        
-                        # Creeaz activitate din date tabel
-                        if any(key in activity_data for key in ['joc', 'activitate', 'nume', 'titlu']):
-                            activity = self._create_activity_from_table_data(activity_data, source_file)
-                            if activity:
-                                activities.append(activity)
-        
-        return activities
-    
-    def _create_activity_from_text(self, text, source_file, title=None):
-        """Creeaz un dicionar de activitate din text"""
-        if not text or len(text) < 30:
-            return None
-        
-        activity = {
-            'name': title or text[:100].split('.')[0].strip(),
-            'description': text[:1000],
-            'source_file': str(source_file),
-            'category': self._detect_category(text),
-            'keywords': self._extract_keywords(text),
-            'created_at': datetime.now().isoformat()
-        }
-        
-        # Extrage metadata suplimentar
-        activity.update(self._extract_metadata(text))
-        
-        return activity
-    
-    def _create_activity_from_table_data(self, data, source_file):
-        """Creeaz activitate din date de tabel"""
-        activity = {
-            'source_file': str(source_file),
-            'created_at': datetime.now().isoformat()
-        }
-        
-        # Mapare c�mpuri tabel la c�mpuri DB
-        field_mapping = {
-            'nume': 'name', 'titlu': 'name', 'joc': 'name', 'activitate': 'name',
-            'descriere': 'description', 'detalii': 'description', 'explicatie': 'description',
-            'materiale': 'materials_list', 'echipament': 'materials_list',
-            'varsta': 'age_group_min', 'categoria': 'category',
-            'participanti': 'participants_min', 'numar': 'participants_min',
-            'durata': 'duration_min', 'timp': 'duration_min'
-        }
-        
-        for table_field, db_field in field_mapping.items():
-            if table_field in data:
-                activity[db_field] = data[table_field]
-        
-        # Validare minim
-        if 'name' in activity and len(activity.get('name', '')) > 5:
-            return activity
-        
-        return None
-    
-    def _extract_metadata(self, text):
-        """Extrage metadata din text folosind pattern-uri"""
-        metadata = {}
-        
-        # Extrage v�rsta
-        for pattern in self.activity_patterns['age_patterns']:
-            match = re.search(pattern, text)
-            if match:
-                metadata['age_group_min'] = int(match.group(1))
-                metadata['age_group_max'] = int(match.group(2)) if match.lastindex >= 2 else int(match.group(1))
-                break
-        
-        # Extrage numr participani
-        for pattern in self.activity_patterns['participants_patterns']:
-            match = re.search(pattern, text)
-            if match:
-                metadata['participants_min'] = int(match.group(1))
-                metadata['participants_max'] = int(match.group(2)) if match.lastindex >= 2 else int(match.group(1))
-                break
-        
-        # Extrage durata
-        for pattern in self.activity_patterns['duration_patterns']:
-            match = re.search(pattern, text)
-            if match:
-                metadata['duration_min'] = int(match.group(1))
-                metadata['duration_max'] = int(match.group(2)) if match.lastindex >= 2 else int(match.group(1))
-                break
-        
-        # Extrage materiale
-        materials = []
-        text_lower = text.lower()
-        for marker in self.activity_patterns['materials_markers']:
-            idx = text_lower.find(marker)
-            if idx != -1:
-                # Extrage urmtoarele 200 caractere dup marker
-                materials_text = text[idx:idx+200]
-                # Extrage items din list
-                items = re.findall(r'[-"]\s*([^\n-"]+)', materials_text)
-                if items:
-                    materials.extend(items)
-        
-        if materials:
-            metadata['materials_list'] = ', '.join(materials[:10])  # Maxim 10 materiale
-        
-        return metadata
-    
-    def _detect_category(self, text):
-        """Detecteaz categoria activitii bazat pe cuvinte cheie"""
-        text_lower = text.lower()
-        
-        for category, keywords in self.categories.items():
-            if any(keyword in text_lower for keyword in keywords):
-                return category
-        
-        return '[A]'  # Default categoria jocuri
-    
-    def _extract_keywords(self, text):
-        """Extrage cuvinte cheie din text"""
-        keywords = []
-        text_lower = text.lower()
-        
-        # Lista de cuvinte cheie relevante
-        keyword_list = [
-            'cooperare', 'competitie', 'echipa', 'creativitate', 'miscare',
-            'strategie', 'comunicare', 'incredere', 'coordonare', 'atentie',
-            'reflexe', 'logica', 'imaginatie', 'muzica', 'dans', 'sport',
-            'natura', 'mediu', 'stiinta', 'matematica', 'limba', 'cultura'
-        ]
-        
-        for keyword in keyword_list:
-            if keyword in text_lower:
-                keywords.append(keyword)
-        
-        return ', '.join(keywords[:5])  # Maxim 5 keywords
-    
-    def save_to_database(self, activities):
-        """Salveaz activitile �n baza de date"""
-        conn = sqlite3.connect(self.db_path)
-        cursor = conn.cursor()
-        
-        saved_count = 0
-        duplicate_count = 0
-        
-        for activity in activities:
-            try:
-                # Verific duplicate
-                cursor.execute(
-                    "SELECT id FROM activities WHERE name = ? AND source_file = ?",
-                    (activity.get('name'), activity.get('source_file'))
-                )
-                
-                if cursor.fetchone():
-                    duplicate_count += 1
-                    continue
-                
-                # Pregtete valorile pentru insert
-                columns = []
-                values = []
-                placeholders = []
-                
-                for key, value in activity.items():
-                    if key != 'created_at':  # Skip created_at, it has default
-                        columns.append(key)
-                        values.append(value)
-                        placeholders.append('?')
-                
-                # Insert �n DB
-                query = f"INSERT INTO activities ({', '.join(columns)}) VALUES ({', '.join(placeholders)})"
-                cursor.execute(query, values)
-                saved_count += 1
-                
-            except Exception as e:
-                print(f"Error saving activity: {e}")
-                continue
-        
-        conn.commit()
-        conn.close()
-        
-        return saved_count, duplicate_count
-    
-    def process_all_html_files(self, base_path='/mnt/d/GoogleDrive/Cercetasi/carti-camp-jocuri'):
-        """Proceseaz toate fiierele HTML din directorul specificat"""
-        base_path = Path(base_path)
-        html_files = list(base_path.rglob("*.html"))
-        html_files.extend(list(base_path.rglob("*.htm")))
-        
-        print(f"Found {len(html_files)} HTML files to process")
-        
-        all_activities = []
-        processed = 0
-        errors = 0
-        
-        for i, html_file in enumerate(html_files):
-            try:
-                activities = self.extract_from_html(str(html_file))
-                all_activities.extend(activities)
-                processed += 1
-                
-                # Progress update
-                if (i + 1) % 100 == 0:
-                    print(f"Progress: {i+1}/{len(html_files)} files processed, {len(all_activities)} activities found")
-                    # Save batch to DB
-                    if all_activities:
-                        saved, dupes = self.save_to_database(all_activities)
-                        print(f"Batch saved: {saved} new activities, {dupes} duplicates skipped")
-                        all_activities = []  # Clear buffer
-                
-            except Exception as e:
-                print(f"Error processing {html_file}: {e}")
-                errors += 1
-        
-        # Save remaining activities
-        if all_activities:
-            saved, dupes = self.save_to_database(all_activities)
-            print(f"Final batch saved: {saved} new activities, {dupes} duplicates skipped")
-        
-        print(f"\nProcessing complete!")
-        print(f"Files processed: {processed}")
-        print(f"Errors: {errors}")
-        
-        return processed, errors
-
-# Funcie main pentru test
-if __name__ == "__main__":
-    extractor = HTMLActivityExtractor()
-    
-    # Test pe un fiier sample mai �nt�i
-    print("Testing on sample file first...")
-    # Gsete un fiier HTML pentru test
-    test_files = list(Path('/mnt/d/GoogleDrive/Cercetasi/carti-camp-jocuri').rglob("*.html"))[:3]
-    
-    for test_file in test_files:
-        print(f"\nTesting: {test_file}")
-        activities = extractor.extract_from_html(str(test_file))
-        print(f"Found {len(activities)} activities")
-        if activities:
-            print(f"Sample activity: {activities[0]['name'][:50]}...")
-    
-    # �ntreab dac s continue cu procesarea complet
-    response = input("\nContinue with full processing? (y/n): ")
-    if response.lower() == 'y':
-        extractor.process_all_html_files()
\ No newline at end of file
diff --git a/scripts/import_claude_activities.py b/scripts/import_claude_activities.py
deleted file mode 100644
index c10141a..0000000
--- a/scripts/import_claude_activities.py
+++ /dev/null
@@ -1,78 +0,0 @@
-#!/usr/bin/env python3
-"""
-Import activities extracted by Claude from JSON files
-"""
-
-import json
-import sqlite3
-from pathlib import Path
-from datetime import datetime
-
-class ClaudeActivityImporter:
-    def __init__(self, db_path='data/activities.db'):
-        self.db_path = db_path
-        self.json_dir = Path('scripts/extracted_activities')
-        self.json_dir.mkdir(exist_ok=True)
-    
-    def import_json_file(self, json_path):
-        """Import activities from a single JSON file"""
-        with open(json_path, 'r', encoding='utf-8') as f:
-            data = json.load(f)
-        
-        source_file = data.get('source_file', str(json_path))
-        activities = data.get('activities', [])
-        
-        conn = sqlite3.connect(self.db_path)
-        cursor = conn.cursor()
-        
-        imported = 0
-        for activity in activities:
-            try:
-                # Add source file and timestamp
-                activity['source_file'] = source_file
-                activity['created_at'] = datetime.now().isoformat()
-                
-                # Prepare insert
-                columns = list(activity.keys())
-                values = list(activity.values())
-                placeholders = ['?' for _ in values]
-                
-                # Check for duplicate
-                cursor.execute(
-                    "SELECT id FROM activities WHERE name = ? AND source_file = ?",
-                    (activity.get('name'), source_file)
-                )
-                
-                if not cursor.fetchone():
-                    query = f"INSERT INTO activities ({', '.join(columns)}) VALUES ({', '.join(placeholders)})"
-                    cursor.execute(query, values)
-                    imported += 1
-                    
-            except Exception as e:
-                print(f"Error importing activity: {e}")
-        
-        conn.commit()
-        conn.close()
-        
-        print(f"Imported {imported} activities from {json_path.name}")
-        return imported
-    
-    def import_all_json_files(self):
-        """Import all JSON files from the extracted_activities directory"""
-        json_files = list(self.json_dir.glob("*.json"))
-        
-        if not json_files:
-            print("No JSON files found in extracted_activities directory")
-            return 0
-        
-        total_imported = 0
-        for json_file in json_files:
-            imported = self.import_json_file(json_file)
-            total_imported += imported
-        
-        print(f"\nTotal imported: {total_imported} activities from {len(json_files)} files")
-        return total_imported
-
-if __name__ == "__main__":
-    importer = ClaudeActivityImporter()
-    importer.import_all_json_files()
\ No newline at end of file
diff --git a/scripts/import_common.py b/scripts/import_common.py
new file mode 100644
index 0000000..0ec3718
--- /dev/null
+++ b/scripts/import_common.py
@@ -0,0 +1,179 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+import_common.py — shared helpers for the import / validation side of the
+extraction pipeline (Lane C).
+
+Used by build_database.py and validate_extractions.py:
+  * JSON-schema validation of subagent extraction files,
+  * the anti-hallucination source_excerpt substring check (E5),
+  * locating the source chunk that an extraction file came from,
+  * the stable content key used by the needs_review queue.
+"""
+
+from __future__ import annotations
+
+import hashlib
+import json
+import re
+import unicodedata
+from pathlib import Path
+from typing import Any, Optional
+
+SCRIPT_DIR = Path(__file__).resolve().parent  # directory that contains this script
+REPO_ROOT = SCRIPT_DIR.parent  # one level above the scripts directory
+
+DEFAULT_SCHEMA_PATH = SCRIPT_DIR / "activity_schema.json"  # default argument of load_schema()
+
+# rapidfuzz.partial_ratio is on a 0..100 scale; an excerpt counts as a real
+# quote from the source when it scores at least this against the chunk text.
+EXCERPT_MATCH_THRESHOLD = 90.0
+
+
+# --------------------------------------------------------------------------
+# schema validation
+# --------------------------------------------------------------------------
+def load_schema(schema_path: str | Path = DEFAULT_SCHEMA_PATH) -> dict:
+    """Load the activity JSON schema (Lane A output): read `schema_path` as UTF-8 and parse it."""
+    return json.loads(Path(schema_path).read_text(encoding="utf-8"))
+
+
+def validate_extraction(data: Any, schema: dict) -> list[str]:
+    """
+    Check one parsed extraction file (`data`) against `schema` as JSON Schema Draft 7.
+
+    Returns "<path>: <message>" strings ordered by instance path; [] means valid.
+    """
+    import jsonschema
+
+    checker = jsonschema.Draft7Validator(schema)
+    ordered = sorted(checker.iter_errors(data), key=lambda e: list(e.path))
+    return [
+        # errors on the document itself have an empty path → shown as "<root>"
+        f"{'/'.join(map(str, e.path)) or '<root>'}: {e.message}" for e in ordered
+    ]
+
+
+# --------------------------------------------------------------------------
+# excerpt verification (E5 — anti-hallucination)
+# --------------------------------------------------------------------------
+def _normalize_text(text: str) -> str:
+    return " ".join((text or "").split()).lower()  # squeeze whitespace runs, trim, lowercase
+
+
+def excerpt_score(excerpt: str, chunk_text: str) -> float:
+    """Fuzzy-substring score (0..100) of `excerpt` in `chunk_text`; 0.0 if either is blank."""
+    from rapidfuzz import fuzz
+    needle, haystack = _normalize_text(excerpt), _normalize_text(chunk_text)
+    if not needle or not haystack:  # blank once normalized → nothing was actually quoted
+        return 0.0
+    return float(fuzz.partial_ratio(needle, haystack))
+
+
+def excerpt_matches(
+    excerpt: str, chunk_text: str, threshold: float = EXCERPT_MATCH_THRESHOLD
+) -> bool:
+    """True when `excerpt_score(excerpt, chunk_text)` is at least `threshold` (0..100 scale)."""
+    return excerpt_score(excerpt, chunk_text) >= threshold
+
+
+# --------------------------------------------------------------------------
+# locating the source chunk an extraction file came from
+# --------------------------------------------------------------------------
+def chunk_key_for(json_path: Path, header: Optional[dict]) -> str:
+    """
+    Work out which chunk an extraction file belongs to.
+
+    A truthy `chunk_key` in `header` wins and is returned as a `str`; otherwise
+    the stem of `json_path` is used (extraction files are named `<chunk_key>.json`).
+    """
+    declared = header.get("chunk_key") if header else None
+    # a missing header, a missing key and an empty value all fall back to the stem
+    return str(declared) if declared else json_path.stem
+
+
+def source_id_for(chunk_key: str, header: Optional[dict]) -> str:
+    """Source id: the header's `source_id` if set, else `chunk_key` minus its last `.part…`."""
+    declared = header.get("source_id") if header else None
+    if not declared:
+        return chunk_key.rsplit(".part", maxsplit=1)[0]
+    return str(declared)
+
+
+def find_chunk_text(
+    json_path: Path, header: Optional[dict], chunks_dir: Path
+) -> Optional[str]:
+    """
+    Return the text of the source chunk for an extraction file, or None.
+
+    Tries <chunks_dir>/<source_id>/<chunk_key>.txt first, then the first (sorted)
+    regular file literally named <chunk_key>.txt anywhere under `chunks_dir`.
+    """
+    import glob  # local import: only needed to escape the key for rglob
+
+    chunk_key = chunk_key_for(json_path, header)
+    source_id = source_id_for(chunk_key, header)
+    candidate = chunks_dir / source_id / f"{chunk_key}.txt"
+    if not candidate.is_file():
+        hits = sorted(chunks_dir.rglob(glob.escape(f"{chunk_key}.txt")))
+        candidate = next((hit for hit in hits if hit.is_file()), None)
+    if candidate is None:
+        return None
+    return candidate.read_text(encoding="utf-8", errors="replace")
+
+
+def source_path_for(source_id: str, sources_dir: Path) -> Optional[str]:
+    """
+    Look up the original path recorded in a normalized source's header.
+
+    Scans <sources_dir>/<source_id>.txt for a `SOURCE: <relative path>` line and
+    stops at the first `=` rule or `--- PAGE ` marker. None if absent or unreadable.
+    """
+    header_file = sources_dir / f"{source_id}.txt"
+    if header_file.is_file():
+        try:
+            with header_file.open(encoding="utf-8", errors="replace") as stream:
+                for row in stream:
+                    if row.startswith(("=", "--- PAGE ")):
+                        break  # header is over without a SOURCE: line
+                    if row.startswith("SOURCE:"):
+                        return row.partition(":")[2].strip()
+        except OSError:
+            return None
+    return None
+
+
+# --------------------------------------------------------------------------
+# stable content key for the needs_review queue (plan §5c)
+# --------------------------------------------------------------------------
+def normalize_name(name: str) -> str:
+    """Dedup key: NFKD-decomposed, combining marks dropped, lowercased, spaces collapsed."""
+    if not name:
+        return ""
+    base = [c for c in unicodedata.normalize("NFKD", name) if not unicodedata.combining(c)]
+    # split()/join trims both ends and squeezes every whitespace run to one space
+    return " ".join("".join(base).lower().split())
+
+
+def content_key(normalized_name: str, language: Optional[str], description: str) -> str:
+    """
+    SHA-1 hex digest that identifies a row in the needs_review queue.
+
+    Digest of the name, the language (empty string when falsy) and the
+    whitespace-collapsed, lowercased description, joined by the 0x1F separator.
+    needs_review rows are not auto-merged, so the triple is stable across rebuilds.
+    """
+    parts = (normalized_name, language or "", _normalize_text(description))
+    return hashlib.sha1("\x1f".join(map(str, parts)).encode("utf-8")).hexdigest()
+
+
+# --------------------------------------------------------------------------
+# iteration
+# --------------------------------------------------------------------------
+def iter_extraction_files(extracted_dir: Path):
+    """Yield, sorted, each *.json file directly under `extracted_dir` (skips _rejected/)."""
+    if extracted_dir.is_dir():
+        # glob("*.json") is non-recursive, so sub-directories are never entered
+        candidates = sorted(extracted_dir.glob("*.json"))
+        # a directory that happens to be named *.json is filtered out here
+        yield from (entry for entry in candidates if entry.is_file())
diff --git a/scripts/normalize_sources.py b/scripts/normalize_sources.py
new file mode 100644
index 0000000..2c9c607
--- /dev/null
+++ b/scripts/normalize_sources.py
@@ -0,0 +1,255 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+normalize_sources.py — walk data/carti-camp-jocuri/ and write data/sources/<id>.txt.
+
+Output files keep the existing header format:
+
+    SOURCE: <original relative path>
+    CONVERTED: <iso date>
+    FORMAT: <pdf|docx|doc|pptx|html-mirror|zip>
+    NEEDS_REVIEW: <reason>            (optional — legacy .doc conversions)
+    ==================================================
+
+    --- PAGE 1 ---
+    ...
+
+Each source gets a stable id = <8-hex hash of relative path>_<sanitized stem>,
+so two files with the same name in different folders never collide.
+
+The pipeline is script-only: this normalizes formats, it does NOT run extraction.
+Run `--check-deps` before a long job.
+"""
+
+from __future__ import annotations
+
+import argparse
+import datetime as _dt
+import hashlib
+import re
+import sys
+from pathlib import Path
+
+SCRIPT_DIR = Path(__file__).resolve().parent
+if str(SCRIPT_DIR) not in sys.path:
+    sys.path.insert(0, str(SCRIPT_DIR))
+
+from extract_common import (  # noqa: E402
+    count_page_markers,
+    dedupe_texts,
+    detect_format,
+    extract_file,
+    extract_html,
+    is_junk,
+    join_pages,
+    preflight,
+    split_pages,
+)
+
+HEADER_RULE = "=" * 50
+
+
+# --------------------------------------------------------------------------
+# stable source id
+# --------------------------------------------------------------------------
+def sanitize_stem(stem: str) -> str:
+    """Slug of `stem` (non-word runs → "_", lowercased, max 60 chars) or "source"."""
+    return re.sub(r"[^\w]+", "_", stem, flags=re.UNICODE).strip("_").lower()[:60] or "source"
+
+
+def stable_id(relative_path: str | Path) -> str:
+    """Return "<first 8 hex of SHA-1(relative path)>_<sanitized stem>"."""
+    # Backslashes become "/" first, so a\b.pdf and a/b.pdf share one id.
+    posix_rel = str(relative_path).replace("\\", "/")
+    prefix = hashlib.sha1(posix_rel.encode("utf-8")).hexdigest()[:8]
+    return f"{prefix}_{sanitize_stem(Path(posix_rel).stem)}"
+
+
+# --------------------------------------------------------------------------
+# header
+# --------------------------------------------------------------------------
+def build_header(
+    source_rel: str, fmt: str, needs_review: str | None = None
+) -> str:
+    """Return the metadata header that precedes a normalized source's text.
+
+    Lines: SOURCE, CONVERTED (today's ISO date), FORMAT, then NEEDS_REVIEW only
+    when `needs_review` is truthy, then HEADER_RULE followed by a blank line.
+    """
+    stamp = _dt.date.today().isoformat()
+    head = [f"SOURCE: {source_rel}", f"CONVERTED: {stamp}", f"FORMAT: {fmt}"]
+    if needs_review:
+        head.append(f"NEEDS_REVIEW: {needs_review}")
+    return "\n".join([*head, HEADER_RULE]) + "\n\n"
+
+
+# --------------------------------------------------------------------------
+# mirror-site directories
+# --------------------------------------------------------------------------
+MIRROR_PAGE_EXTS = {".html", ".htm"}
+
+
+def is_mirror_dir(path: Path) -> bool:
+    """Tell whether `path` is a site-mirror directory.
+
+    True for a directory not named *_files that holds an HTML page at any depth.
+    """
+    if not path.is_dir() or path.name.endswith("_files"):
+        return False
+    for candidate in path.rglob("*"):
+        if candidate.is_file() and candidate.suffix.lower() in MIRROR_PAGE_EXTS:
+            return True
+    return False
+
+
+def normalize_mirror(mirror_dir: Path) -> str:
+    """Extract each HTML page of a mirror, dedupe via `dedupe_texts`, join the texts."""
+    pages: list[tuple[str, str]] = []
+    for html in sorted(mirror_dir.rglob("*")):
+        rel = html.relative_to(mirror_dir)
+        # Skip "<page>_files/" dirs by suffix on each parent part, as is_mirror_dir does.
+        in_assets = any(part.endswith("_files") for part in rel.parts[:-1])
+        if in_assets or not html.is_file() or html.suffix.lower() not in MIRROR_PAGE_EXTS:
+            continue
+        try:
+            body = extract_html(html)
+        except Exception:  # noqa: BLE001 - best effort: an unreadable page is skipped
+            continue
+        text = "\n".join(t for _, t in split_pages(body))
+        if text.strip():
+            pages.append((str(rel), text))
+    return join_pages([t for _, t in dedupe_texts(pages)])
+
+
+# --------------------------------------------------------------------------
+# one source
+# --------------------------------------------------------------------------
+def normalize_one(
+    path: Path, corpus_root: Path, out_dir: Path
+) -> dict | None:
+    """
+    Normalize a single file or mirror directory → <out_dir>/<id>.txt.
+
+    Returns None when the entry is skipped (non-mirror directory, junk file,
+    or unknown/epub/txt format); otherwise a dict whose "status" is "ok",
+    "empty" or "error" and whose "source" is the "/"-separated relative path.
+    """
+    rel = path.relative_to(corpus_root)
+    sid = stable_id(rel)
+    # Same on every OS, matching the "/"-normalized path that stable_id hashes.
+    source = rel.as_posix()
+
+    if path.is_dir():
+        if not is_mirror_dir(path):
+            return None
+        fmt, needs_review = "html-mirror", None
+        body = normalize_mirror(path)
+    else:
+        if is_junk(path):
+            return None
+        fmt = detect_format(path)
+        if fmt in ("unknown", "epub", "txt"):
+            return None  # epub duplicates PDFs; txt is not a source format here
+        needs_review = "legacy .doc conversion is imperfect" if fmt == "doc" else None
+        try:
+            body = extract_file(path)
+        except Exception as exc:  # noqa: BLE001
+            return {"id": sid, "source": source, "status": "error", "error": str(exc)}
+
+    if not body.strip():
+        return {"id": sid, "source": source, "status": "empty"}
+
+    header = build_header(source, fmt, needs_review)
+    (out_dir / f"{sid}.txt").write_text(header + body, encoding="utf-8")
+    return {
+        "id": sid, "source": source, "status": "ok", "format": fmt,
+        "pages": count_page_markers(body),
+        "needs_review": bool(needs_review),
+    }
+
+
+# --------------------------------------------------------------------------
+# walk
+# --------------------------------------------------------------------------
+def iter_corpus_entries(corpus_root: Path):
+    """Yield, sorted, the top-level files and mirror directories of the corpus.
+
+    Dot-prefixed entries are skipped. Non-mirror directories are skipped whole,
+    so files nested inside them are never yielded (NOTE(review): intended?).
+    """
+    for entry in sorted(corpus_root.iterdir()):
+        hidden = entry.name.startswith(".")
+        if not hidden and (not entry.is_dir() or is_mirror_dir(entry)):
+            yield entry
+
+
+def run(corpus_root: Path, out_dir: Path) -> dict:
+    """Normalize every corpus entry into `out_dir`; return counts plus "results"."""
+    out_dir.mkdir(parents=True, exist_ok=True)
+    outcomes = (normalize_one(entry, corpus_root, out_dir)
+                for entry in iter_corpus_entries(corpus_root))
+    # Skipped entries come back as None and are left out of the summary.
+    results = [res for res in outcomes if res is not None]
+    statuses = [res["status"] for res in results]
+    return {
+        "total": len(results),
+        "ok": statuses.count("ok"),
+        "errors": statuses.count("error"),
+        "empty": statuses.count("empty"),
+        "needs_review": sum(1 for res in results if res.get("needs_review")),
+        "results": results,
+    }
+
+
+# --------------------------------------------------------------------------
+# CLI
+# --------------------------------------------------------------------------
+def print_preflight(report: dict) -> int:
+    """Print the preflight report; return exit code 0 if report["ok"] else 1."""
+    print("Dependency preflight", "--------------------", sep="\n")
+    py_missing = report["missing_python"]
+    print("  MISSING Python packages: " + ", ".join(py_missing) if py_missing
+          else "  Python packages: OK")
+    if report["missing_system"]:
+        print("  MISSING system tools  : " + ", ".join(report["missing_system"]))
+    for warning in report["warnings"]:
+        print(f"  WARNING: {warning}")
+    exit_code = 0 if report["ok"] else 1
+    print("  => " + ("NOT READY — install the above" if exit_code else "READY"))
+    return exit_code
+
+
+def main(argv: list[str] | None = None) -> int:
+    """CLI entry point: preflight, normalize the corpus, print a summary."""
+    parser = argparse.ArgumentParser(description="Normalize mixed sources to .txt")
+    parser.add_argument("--corpus", default="data/carti-camp-jocuri",
+                        help="corpus root to walk")
+    parser.add_argument("--out", default="data/sources", help="output directory")
+    parser.add_argument("--check-deps", action="store_true",
+                        help="run dependency preflight and exit")
+    parser.add_argument("--ocr", action="store_true",
+                        help="include OCR (tesseract) in the preflight check")
+    args = parser.parse_args(argv)
+
+    report = preflight(check_ocr=args.ocr)
+    if args.check_deps or report["missing_python"]:
+        code = print_preflight(report)
+        return code if args.check_deps else 1
+    for w in report["warnings"]:
+        print(f"WARNING: {w}")
+    corpus_root = Path(args.corpus)
+    if not corpus_root.is_dir():  # otherwise run() dies with an iterdir() traceback
+        print(f"ERROR: corpus directory not found: {corpus_root}", file=sys.stderr)
+        return 2
+    summary = run(corpus_root, Path(args.out))
+    print(f"normalized : {summary['ok']}/{summary['total']}")
+    print(f"errors     : {summary['errors']}")
+    print(f"empty      : {summary['empty']}")
+    print(f"needs_review: {summary['needs_review']}")
+    for r in (x for x in summary["results"] if x["status"] != "ok"):
+        print(f"  [{r['status']}] {r['source']}")
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/scripts/pdf_extractor.py b/scripts/pdf_extractor.py
deleted file mode 100644
index e69de29..0000000
diff --git a/scripts/pdf_to_text_converter.py b/scripts/pdf_to_text_converter.py
deleted file mode 100644
index db03509..0000000
--- a/scripts/pdf_to_text_converter.py
+++ /dev/null
@@ -1,143 +0,0 @@
-#!/usr/bin/env python3
-"""
-PDF Mass Conversion to Text for Activity Extraction
-Handles all PDF sizes efficiently with multiple fallback methods
-"""
-
-import os
-import json
-from pathlib import Path
-import PyPDF2
-import pdfplumber
-from typing import List, Dict
-import logging
-
-class PDFConverter:
-    def __init__(self, max_pages=50):
-        self.max_pages = max_pages
-        self.conversion_stats = {}
-    
-    def convert_pdf_to_text(self, pdf_path: str) -> str:
-        """Convert PDF to text using multiple methods with fallbacks"""
-        try:
-            # Method 1: pdfplumber (best for tables and layout)
-            return self._convert_with_pdfplumber(pdf_path)
-        except Exception as e:
-            print(f"pdfplumber failed for {pdf_path}: {e}")
-            
-            try:
-                # Method 2: PyPDF2 (fallback)
-                return self._convert_with_pypdf2(pdf_path)
-            except Exception as e2:
-                print(f"PyPDF2 also failed for {pdf_path}: {e2}")
-                return ""
-    
-    def _convert_with_pdfplumber(self, pdf_path: str) -> str:
-        """Primary conversion method using pdfplumber"""
-        text_content = ""
-        
-        with pdfplumber.open(pdf_path) as pdf:
-            total_pages = len(pdf.pages)
-            pages_to_process = min(total_pages, self.max_pages)
-            
-            print(f"  Converting {pdf_path}: {pages_to_process}/{total_pages} pages")
-            
-            for i, page in enumerate(pdf.pages[:pages_to_process]):
-                try:
-                    page_text = page.extract_text()
-                    if page_text:
-                        text_content += f"\n--- PAGE {i+1} ---\n"
-                        text_content += page_text
-                        text_content += "\n"
-                except Exception as e:
-                    print(f"    Error on page {i+1}: {e}")
-                    continue
-        
-        self.conversion_stats[pdf_path] = {
-            'method': 'pdfplumber',
-            'pages_processed': pages_to_process,
-            'total_pages': total_pages,
-            'success': True,
-            'text_length': len(text_content)
-        }
-        
-        return text_content
-    
-    def _convert_with_pypdf2(self, pdf_path: str) -> str:
-        """Fallback conversion method using PyPDF2"""
-        text_content = ""
-        
-        with open(pdf_path, 'rb') as file:
-            reader = PyPDF2.PdfReader(file)
-            total_pages = len(reader.pages)
-            pages_to_process = min(total_pages, self.max_pages)
-            
-            print(f"  Converting {pdf_path} (fallback): {pages_to_process}/{total_pages} pages")
-            
-            for i in range(pages_to_process):
-                try:
-                    page = reader.pages[i]
-                    page_text = page.extract_text()
-                    if page_text:
-                        text_content += f"\n--- PAGE {i+1} ---\n"
-                        text_content += page_text
-                        text_content += "\n"
-                except Exception as e:
-                    print(f"    Error on page {i+1}: {e}")
-                    continue
-        
-        self.conversion_stats[pdf_path] = {
-            'method': 'PyPDF2',
-            'pages_processed': pages_to_process,
-            'total_pages': total_pages,
-            'success': True,
-            'text_length': len(text_content)
-        }
-        
-        return text_content
-    
-    def convert_all_pdfs(self, pdf_directory: str, output_directory: str):
-        """Convert all PDFs in directory to text files"""
-        pdf_files = list(Path(pdf_directory).glob("**/*.pdf"))
-        
-        print(f"🔄 Converting {len(pdf_files)} PDF files to text...")
-        
-        os.makedirs(output_directory, exist_ok=True)
-        
-        for i, pdf_path in enumerate(pdf_files):
-            print(f"\n[{i+1}/{len(pdf_files)}] Processing {pdf_path.name}...")
-            
-            # Convert to text
-            text_content = self.convert_pdf_to_text(str(pdf_path))
-            
-            if text_content.strip():
-                # Save as text file
-                output_file = Path(output_directory) / f"{pdf_path.stem}.txt"
-                with open(output_file, 'w', encoding='utf-8') as f:
-                    f.write(f"SOURCE: {pdf_path}\n")
-                    f.write(f"CONVERTED: 2025-01-11\n")
-                    f.write("="*50 + "\n\n")
-                    f.write(text_content)
-                
-                print(f"  ✅ Saved: {output_file}")
-            else:
-                print(f"  ❌ No text extracted from {pdf_path.name}")
-        
-        # Save conversion statistics
-        stats_file = Path(output_directory) / "conversion_stats.json"
-        with open(stats_file, 'w', encoding='utf-8') as f:
-            json.dump(self.conversion_stats, f, indent=2, ensure_ascii=False)
-        
-        print(f"\n🎉 PDF conversion complete! Check {output_directory}")
-        return len([f for f in self.conversion_stats.values() if f['success']])
-
-# Usage
-if __name__ == "__main__":
-    converter = PDFConverter(max_pages=50)
-    
-    # Convert all PDFs
-    pdf_dir = "/mnt/d/GoogleDrive/Cercetasi/carti-camp-jocuri"
-    output_dir = "/mnt/d/GoogleDrive/Cercetasi/carti-camp-jocuri/INDEX-SISTEM-JOCURI/converted_pdfs"
-    
-    converted_count = converter.convert_all_pdfs(pdf_dir, output_dir)
-    print(f"Final result: {converted_count} PDFs successfully converted")
\ No newline at end of file
diff --git a/scripts/review_queue.py b/scripts/review_queue.py
new file mode 100644
index 0000000..bf75c76
--- /dev/null
+++ b/scripts/review_queue.py
@@ -0,0 +1,145 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+review_queue.py — CLI for the needs_review lifecycle (plan §5c).
+
+Rows land in the queue when dedup leaves a borderline pair separate, or when a
+legacy `.doc` source was converted imperfectly. Each row has a stable content
+key; a decision written here is stored in data/review_decisions.json (git
+tracked) and re-applied by build_database.py on every rebuild, so the queue
+never resurfaces a resolved row.
+
+Commands:
+    python scripts/review_queue.py list
+    python scripts/review_queue.py resolve <id> <merge|keep-separate|drop>
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import sqlite3
+import sys
+from pathlib import Path
+from typing import Optional
+
+SCRIPT_DIR = Path(__file__).resolve().parent
+REPO_ROOT = SCRIPT_DIR.parent
+for _p in (str(SCRIPT_DIR), str(REPO_ROOT)):
+    if _p not in sys.path:
+        sys.path.insert(0, _p)
+
+from import_common import content_key, normalize_name  # noqa: E402
+
+VALID_DECISIONS = ("merge", "keep-separate", "drop")
+
+
+# --------------------------------------------------------------------------
+# review_decisions.json
+# --------------------------------------------------------------------------
+def load_decisions(path: Path) -> dict:
+    """Return the JSON object stored at `path`, or {} when missing or unusable."""
+    try:
+        data = json.loads(path.read_text(encoding="utf-8")) if path.is_file() else {}
+    except (ValueError, OSError) as exc:  # ValueError: malformed JSON or non-UTF-8 bytes
+        # Stay best-effort, but warn: a silent {} hides every recorded decision.
+        print(f"WARNING: ignoring unreadable {path}: {exc}", file=sys.stderr)
+        data = {}
+    return data if isinstance(data, dict) else {}
+
+
+def save_decisions(decisions: dict, path: Path) -> None:
+    """Write `decisions` to `path` as key-sorted, 2-space-indented UTF-8 JSON."""
+    # Missing parent directories are created; an existing one is not an error.
+    path.parent.mkdir(parents=True, exist_ok=True)
+    serialized = json.dumps(decisions, indent=2, ensure_ascii=False, sort_keys=True)
+    path.write_text(serialized, encoding="utf-8")
+
+
+# --------------------------------------------------------------------------
+# queue
+# --------------------------------------------------------------------------
+def list_queue(db_path: Path) -> list[dict]:
+    """Return every needs_review row in the current DB, with its content key.
+
+    A missing DB file or an OperationalError from the query yields [].
+    """
+    if not db_path.is_file():
+        return []
+    query = (
+        "SELECT name, normalized_name, language, description "
+        "FROM activities WHERE needs_review = 1 ORDER BY normalized_name"
+    )
+    conn = sqlite3.connect(db_path)
+    conn.row_factory = sqlite3.Row
+    try:
+        rows = conn.execute(query).fetchall()
+    except sqlite3.OperationalError:
+        return []
+    finally:
+        conn.close()
+
+    def to_item(row: sqlite3.Row) -> dict:
+        desc = row["description"] or ""
+        norm = row["normalized_name"] or normalize_name(row["name"])
+        return {"id": content_key(norm, row["language"], desc), "name": row["name"],
+                "language": row["language"], "description": desc}
+
+    return [to_item(row) for row in rows]
+
+
+def resolve(decisions_path: Path, content_id: str, decision: str) -> dict:
+    """Record `decision` for `content_id`; raises rather than clobber a corrupt file."""
+    if decision not in VALID_DECISIONS:
+        raise ValueError(f"invalid decision {decision!r}; expected one of {VALID_DECISIONS}")
+    # Strict parse: load_decisions() hides corruption as {}, which would be saved back.
+    raw = decisions_path.read_text(encoding="utf-8") if decisions_path.is_file() else ""
+    decisions = json.loads(raw) if raw.strip() else {}
+    decisions[content_id] = {"decision": decision}
+    save_decisions(decisions, decisions_path)
+    return decisions
+
+
+# --------------------------------------------------------------------------
+# CLI
+# --------------------------------------------------------------------------
+def main(argv: Optional[list[str]] = None) -> int:
+    """Entry point of the review-queue CLI; returns the process exit code.
+
+    `list` prints the flagged rows, `resolve` records a decision for one id.
+    """
+    parser = argparse.ArgumentParser(description="needs_review queue CLI")
+    parser.add_argument("--db", default="data/activities.db")
+    parser.add_argument("--decisions", default="data/review_decisions.json")
+    commands = parser.add_subparsers(dest="command", required=True)
+    commands.add_parser("list", help="list rows currently flagged needs_review")
+    resolve_cmd = commands.add_parser("resolve", help="record a decision for a row")
+    resolve_cmd.add_argument("id", help="content id from `list`")
+    resolve_cmd.add_argument("decision", choices=VALID_DECISIONS)
+    args = parser.parse_args(argv)
+
+    if args.command == "resolve":
+        resolve(Path(args.decisions), args.id, args.decision)
+        print(f"recorded: {args.id} -> {args.decision}")
+        print(f"written to {args.decisions} (applied on next build_database --rebuild)")
+        return 0
+    if args.command != "list":
+        return 1
+
+    queue = list_queue(Path(args.db))
+    if not queue:
+        print("review queue is empty.")
+        return 0
+    print(f"{len(queue)} row(s) need review:\n")
+    for item in queue:
+        excerpt = item["description"][:80].replace("\n", " ")
+        print(f"  id   : {item['id']}")
+        print(f"  name : {item['name']}  [{item['language']}]")
+        print(f"  desc : {excerpt}")
+        print(f"  -> review_queue.py resolve {item['id']} <merge|keep-separate|drop>")
+        print()
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/scripts/run_extraction.py b/scripts/run_extraction.py
index 9304861..c80747a 100644
--- a/scripts/run_extraction.py
+++ b/scripts/run_extraction.py
@@ -1,50 +1,140 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 """
-Main extraction orchestrator
-Ruleaza intregul proces de extractie
+run_extraction.py — extraction orchestrator (plan §3).
+
+The pipeline is script-only up to the LLM step: this script normalizes the
+corpus, chunks the normalized sources, and emits one subagent prompt per
+`pending` chunk. It does NOT run the extraction itself — that step is the
+interactive Claude Code orchestrator launching waves of subagents.
+
+Steps:
+  1. normalize  data/carti-camp-jocuri/ -> data/sources/*.txt
+  2. chunk      data/sources/*.txt      -> data/chunks/<id>/*.txt + manifest.json
+  3. emit       one prompt per `pending` chunk -> data/chunks/_prompts/*.md
+  4. report     how many chunks remain `pending`
+
+Usage:
+    python scripts/run_extraction.py
+    python scripts/run_extraction.py --skip-normalize   # re-chunk only
 """
 
+from __future__ import annotations
+
+import argparse
 import sys
-import time
 from pathlib import Path
+from typing import Optional
 
-from unified_processor import UnifiedProcessor
-from import_claude_activities import ClaudeActivityImporter
+SCRIPT_DIR = Path(__file__).resolve().parent
+REPO_ROOT = SCRIPT_DIR.parent
+for _p in (str(SCRIPT_DIR), str(REPO_ROOT)):
+    if _p not in sys.path:
+        sys.path.insert(0, _p)
+
+import chunk_sources  # noqa: E402
+import normalize_sources  # noqa: E402
+
+SUBAGENT_PROMPT = SCRIPT_DIR / "SUBAGENT_PROMPT.md"
+
+
+def emit_chunk_prompt(chunk_key: str, meta: dict, prompts_dir: Path) -> Path:
+    """Write `<chunk_key>.prompt.md` into `prompts_dir` and return its path."""
+    chunk_file = meta.get("chunk_file", f"data/chunks/<id>/{chunk_key}.txt")
+    expected_json = meta.get("expected_json", f"{chunk_key}.json")
+    source_id, source_hash = meta.get("source_id", ""), meta.get("source_hash", "")
+    prompt_lines = [
+        f"# EXTRACTION — chunk `{chunk_key}`",
+        "",
+        f"Read ONLY this chunk: `{chunk_file}`",
+        f"Chunk range: {meta.get('chunk_range', '?')}",
+        "",
+        f"Follow the rules in `{SUBAGENT_PROMPT.relative_to(REPO_ROOT)}`.",
+        "Identify every distinct activity, fill the schema "
+        "(`scripts/activity_schema.json`), and write the result to:",
+        "",
+        f"    data/extracted/{expected_json}",
+        "",
+        f'Header fields to set: source_id="{source_id}", chunk_key="{chunk_key}", '
+        f'source_hash="{source_hash}".',
+        "",
+    ]
+    prompts_dir.mkdir(parents=True, exist_ok=True)
+    out = prompts_dir / f"{chunk_key}.prompt.md"
+    out.write_text("\n".join(prompt_lines), encoding="utf-8")
+    return out
+
+
+def run(
+    *,
+    corpus_root: Path,
+    sources_dir: Path,
+    chunks_dir: Path,
+    skip_normalize: bool = False,
+) -> dict:
+    """Normalize (unless skipped), chunk, and write one prompt per pending chunk.
+
+    Prompt files left in `_prompts/` for chunks that are no longer pending are
+    deleted, so the directory lists outstanding work only. Returns a summary.
+    """
+    summary: dict = {}
+    if not skip_normalize:
+        norm = normalize_sources.run(corpus_root, sources_dir)
+        summary["normalized"] = {k: norm[k] for k in ("ok", "total", "errors")}
+    summary["chunks"] = chunk_sources.run(sources_dir, chunks_dir)
+
+    manifest = chunk_sources.load_manifest(chunks_dir / "manifest.json")
+    prompts_dir = chunks_dir / "_prompts"
+    states: dict[str, int] = {}
+    for meta in manifest["chunks"].values():
+        state = meta.get("state", "?")
+        states[state] = states.get(state, 0) + 1
+    pending = {k: m for k, m in manifest["chunks"].items() if m.get("state") == "pending"}
+    wanted = {emit_chunk_prompt(k, m, prompts_dir) for k, m in sorted(pending.items())}
+    # A prompt surviving from an earlier run could be picked up by the next wave again.
+    existing = prompts_dir.glob("*.prompt.md") if prompts_dir.is_dir() else ()
+    for leftover in set(existing) - wanted:
+        leftover.unlink()
+
+    summary.update(states=states, pending=len(pending), prompts_dir=str(prompts_dir))
+    return summary
+
+
+def main(argv: Optional[list[str]] = None) -> int:
+    """CLI entry point: run the orchestrator and print a status report."""
+    parser = argparse.ArgumentParser(description="Extraction orchestrator.")
+    parser.add_argument("--corpus", default="data/carti-camp-jocuri")
+    parser.add_argument("--sources", default="data/sources")
+    parser.add_argument("--chunks", default="data/chunks")
+    parser.add_argument("--skip-normalize", action="store_true",
+                        help="skip normalization, re-chunk existing sources only")
+    args = parser.parse_args(argv)
+
+    summary = run(
+        corpus_root=Path(args.corpus),
+        sources_dir=Path(args.sources),
+        chunks_dir=Path(args.chunks),
+        skip_normalize=args.skip_normalize,
+    )
+
+    rule = "=" * 60
+    print(rule, "EXTRACTION ORCHESTRATOR", rule, sep="\n")
+    if "normalized" in summary:
+        done = summary["normalized"]
+        print(f"normalized : {done['ok']}/{done['total']} (errors {done['errors']})")
+    print(f"chunks     : {summary['chunks']['chunks']}")
+    for state in sorted(summary["states"]):
+        print(f"  {state:<10}: {summary['states'][state]}")
+    print(f"\npending chunks remaining : {summary['pending']}")
+    if not summary["pending"]:
+        print("All chunks extracted — run build_database.py --rebuild.")
+    else:
+        print(f"subagent prompts written : {summary['prompts_dir']}/")
+        print("Launch waves of ~5-10 subagents on those prompts, then run "
+              "validate_extractions.py and build_database.py --rebuild.")
+    print(rule)
+    return 0
 
-def main():
-    print("="*60)
-    print("ACTIVITY EXTRACTION SYSTEM")
-    print("Strategy S8: Hybrid Claude + Scripts")
-    print("="*60)
-    
-    # Step 1: Run automated extraction
-    print("\nSTEP 1: Automated Extraction")
-    print("-"*40)
-    processor = UnifiedProcessor()
-    processor.process_automated_formats()
-    
-    # Step 2: Wait for Claude processing
-    print("\n" + "="*60)
-    print("STEP 2: Manual Claude Processing Required")
-    print("-"*40)
-    print("Please process PDF/DOC files with Claude using the template.")
-    print("Files are listed in: pdf_doc_for_claude.txt")
-    print("Save extracted activities as JSON in: scripts/extracted_activities/")
-    print("="*60)
-    
-    response = input("\nHave you completed Claude processing? (y/n): ")
-    
-    if response.lower() == 'y':
-        # Step 3: Import Claude-extracted activities
-        print("\nSTEP 3: Importing Claude-extracted activities")
-        print("-"*40)
-        importer = ClaudeActivityImporter()
-        importer.import_all_json_files()
-    
-    print("\n" + "="*60)
-    print("EXTRACTION COMPLETE!")
-    print("="*60)
 
 if __name__ == "__main__":
-    main()
\ No newline at end of file
+    raise SystemExit(main())
diff --git a/scripts/text_extractor.py b/scripts/text_extractor.py
deleted file mode 100644
index 47b9b16..0000000
--- a/scripts/text_extractor.py
+++ /dev/null
@@ -1,197 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-Text/Markdown Activity Extractor
-Proceseaza fisiere TXT si MD pentru extractie activitati
-"""
-
-import re
-from pathlib import Path
-from typing import List, Dict
-import sqlite3
-from datetime import datetime
-
-class TextActivityExtractor:
-    def __init__(self, db_path='data/activities.db'):
-        self.db_path = db_path
-        self.activity_patterns = {
-            'section_headers': [
-                r'^#{1,6}\s*(.+)$',  # Markdown headers
-                r'^([A-Z][^\.]{10,100})$',  # Titluri simple
-                r'^\d+\.\s*(.+)$',  # Numbered lists
-                r'^[•\-\*]\s*(.+)$',  # Bullet points
-            ],
-            'activity_markers': [
-                'joc:', 'activitate:', 'exercitiu:', 'team building:',
-                'nume:', 'titlu:', 'denumire:'
-            ]
-        }
-    
-    def extract_from_text(self, file_path: str) -> List[Dict]:
-        """Extrage activitati din fisier text/markdown"""
-        activities = []
-        
-        try:
-            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
-                content = f.read()
-            
-            # Metoda 1: Cauta sectiuni markdown
-            if file_path.endswith('.md'):
-                activities.extend(self._extract_from_markdown(content, file_path))
-            
-            # Metoda 2: Cauta pattern-uri generale
-            activities.extend(self._extract_from_patterns(content, file_path))
-            
-            # Metoda 3: Cauta blocuri de text structurate
-            activities.extend(self._extract_from_blocks(content, file_path))
-            
-        except Exception as e:
-            print(f"Error processing {file_path}: {e}")
-        
-        return activities
-    
-    def _extract_from_markdown(self, content, source_file):
-        """Extrage activitati din format markdown"""
-        activities = []
-        lines = content.split('\n')
-        
-        current_activity = None
-        current_content = []
-        
-        for line in lines:
-            # Verifica daca e header de activitate
-            if re.match(r'^#{1,3}\s*(.+)', line):
-                # Salveaza activitatea anterioara daca exista
-                if current_activity and current_content:
-                    current_activity['description'] = '\n'.join(current_content[:20])  # Max 20 linii
-                    activities.append(current_activity)
-                
-                # Verifica daca noul header e o activitate
-                header_text = re.sub(r'^#{1,3}\s*', '', line)
-                if any(marker in header_text.lower() for marker in ['joc', 'activitate', 'exercitiu']):
-                    current_activity = {
-                        'name': header_text[:200],
-                        'source_file': str(source_file),
-                        'category': '[A]'
-                    }
-                    current_content = []
-                else:
-                    current_activity = None
-            
-            elif current_activity:
-                # Adauga continut la activitatea curenta
-                if line.strip():
-                    current_content.append(line)
-        
-        # Salveaza ultima activitate
-        if current_activity and current_content:
-            current_activity['description'] = '\n'.join(current_content[:20])
-            activities.append(current_activity)
-        
-        return activities
-    
-    def _extract_from_patterns(self, content, source_file):
-        """Extrage folosind pattern matching"""
-        activities = []
-        
-        # Cauta markeri specifici de activitati
-        for marker in self.activity_patterns['activity_markers']:
-            pattern = re.compile(f'{re.escape(marker)}\\s*(.+?)(?=\\n\\n|{re.escape(marker)}|$)', 
-                               re.IGNORECASE | re.DOTALL)
-            matches = pattern.finditer(content)
-            
-            for match in matches:
-                activity_text = match.group(1)
-                if len(activity_text) > 20:
-                    activity = {
-                        'name': activity_text.split('\n')[0][:200],
-                        'description': activity_text[:1000],
-                        'source_file': str(source_file),
-                        'category': '[A]'
-                    }
-                    activities.append(activity)
-        
-        return activities
-    
-    def _extract_from_blocks(self, content, source_file):
-        """Extrage din blocuri de text separate"""
-        activities = []
-        
-        # Imparte in blocuri separate de linii goale
-        blocks = re.split(r'\n\s*\n', content)
-        
-        for block in blocks:
-            if len(block) > 50:  # Minim 50 caractere
-                lines = block.strip().split('\n')
-                first_line = lines[0].strip()
-                
-                # Verifica daca blocul pare o activitate
-                if any(keyword in first_line.lower() for keyword in ['joc', 'activitate', 'exercitiu']):
-                    activity = {
-                        'name': first_line[:200],
-                        'description': block[:1000],
-                        'source_file': str(source_file),
-                        'category': '[A]'
-                    }
-                    activities.append(activity)
-        
-        return activities
-    
-    def save_to_database(self, activities):
-        """Salveaza in baza de date"""
-        conn = sqlite3.connect(self.db_path)
-        cursor = conn.cursor()
-        
-        saved_count = 0
-        
-        for activity in activities:
-            try:
-                # Check for duplicates
-                cursor.execute(
-                    "SELECT id FROM activities WHERE name = ? AND source_file = ?",
-                    (activity.get('name'), activity.get('source_file'))
-                )
-                
-                if not cursor.fetchone():
-                    columns = list(activity.keys())
-                    values = list(activity.values())
-                    placeholders = ['?' for _ in values]
-                    
-                    query = f"INSERT INTO activities ({', '.join(columns)}) VALUES ({', '.join(placeholders)})"
-                    cursor.execute(query, values)
-                    saved_count += 1
-                    
-            except Exception as e:
-                print(f"Error saving: {e}")
-        
-        conn.commit()
-        conn.close()
-        
-        return saved_count
-    
-    def process_all_text_files(self, base_path='/mnt/d/GoogleDrive/Cercetasi/carti-camp-jocuri'):
-        """Proceseaza toate fisierele text si markdown"""
-        base_path = Path(base_path)
-        
-        text_files = list(base_path.rglob("*.txt"))
-        md_files = list(base_path.rglob("*.md"))
-        all_files = text_files + md_files
-        
-        print(f"Found {len(all_files)} text/markdown files")
-        
-        all_activities = []
-        
-        for file_path in all_files:
-            activities = self.extract_from_text(str(file_path))
-            all_activities.extend(activities)
-            print(f"Processed {file_path.name}: {len(activities)} activities")
-        
-        # Save to database
-        saved = self.save_to_database(all_activities)
-        print(f"\nTotal saved: {saved} activities from {len(all_files)} files")
-        
-        return len(all_files), saved
-
-if __name__ == "__main__":
-    extractor = TextActivityExtractor()
-    extractor.process_all_text_files()
\ No newline at end of file
diff --git a/scripts/unified_processor.py b/scripts/unified_processor.py
deleted file mode 100644
index 8a6d2a3..0000000
--- a/scripts/unified_processor.py
+++ /dev/null
@@ -1,151 +0,0 @@
-#!/usr/bin/env python3
-"""
-Unified Activity Processor
-Orchestreaz toate extractoarele pentru procesare complet
-"""
-
-import time
-from pathlib import Path
-from html_extractor import HTMLActivityExtractor
-from text_extractor import TextActivityExtractor
-import sqlite3
-
-class UnifiedProcessor:
-    def __init__(self, db_path='data/activities.db'):
-        self.db_path = db_path
-        self.html_extractor = HTMLActivityExtractor(db_path)
-        self.text_extractor = TextActivityExtractor(db_path)
-        self.stats = {
-            'html_processed': 0,
-            'text_processed': 0,
-            'pdf_to_process': 0,
-            'doc_to_process': 0,
-            'total_activities': 0,
-            'start_time': None,
-            'end_time': None
-        }
-    
-    def get_current_activity_count(self):
-        """Obine numrul curent de activiti din DB"""
-        conn = sqlite3.connect(self.db_path)
-        cursor = conn.cursor()
-        cursor.execute("SELECT COUNT(*) FROM activities")
-        count = cursor.fetchone()[0]
-        conn.close()
-        return count
-    
-    def count_files_to_process(self, base_path):
-        """Numr fiierele care trebuie procesate"""
-        base_path = Path(base_path)
-        
-        counts = {
-            'html': len(list(base_path.rglob("*.html"))) + len(list(base_path.rglob("*.htm"))),
-            'txt': len(list(base_path.rglob("*.txt"))),
-            'md': len(list(base_path.rglob("*.md"))),
-            'pdf': len(list(base_path.rglob("*.pdf"))),
-            'doc': len(list(base_path.rglob("*.doc"))),
-            'docx': len(list(base_path.rglob("*.docx")))
-        }
-        
-        return counts
-    
-    def process_automated_formats(self, base_path='/mnt/d/GoogleDrive/Cercetasi/carti-camp-jocuri'):
-        """Proceseaz toate formatele care pot fi automatizate"""
-        print("="*60)
-        print("UNIFIED ACTIVITY PROCESSOR - AUTOMATED PHASE")
-        print("="*60)
-        
-        self.stats['start_time'] = time.time()
-        initial_count = self.get_current_activity_count()
-        
-        # Afieaz statistici iniiale
-        file_counts = self.count_files_to_process(base_path)
-        print(f"\nFiles to process:")
-        for format, count in file_counts.items():
-            print(f"  {format.upper()}: {count} files")
-        print(f"\nCurrent activities in database: {initial_count}")
-        print("-"*60)
-        
-        # FAZA 1: Procesare HTML (prioritate maxim - volum mare)
-        print("\n[1/2] Processing HTML files...")
-        print("-"*40)
-        html_processed, html_errors = self.html_extractor.process_all_html_files(base_path)
-        self.stats['html_processed'] = html_processed
-        
-        # FAZA 2: Procesare Text/MD
-        print("\n[2/2] Processing Text/Markdown files...")
-        print("-"*40)
-        text_processed, text_saved = self.text_extractor.process_all_text_files(base_path)
-        self.stats['text_processed'] = text_processed
-        
-        # Statistici finale
-        self.stats['end_time'] = time.time()
-        final_count = self.get_current_activity_count()
-        self.stats['total_activities'] = final_count - initial_count
-        
-        # Identific fiierele care necesit procesare manual
-        self.stats['pdf_to_process'] = file_counts['pdf']
-        self.stats['doc_to_process'] = file_counts['doc'] + file_counts['docx']
-        
-        self.print_summary()
-        self.save_pdf_doc_list(base_path)
-    
-    def print_summary(self):
-        """Afieaz rezumatul procesrii"""
-        print("\n" + "="*60)
-        print("PROCESSING SUMMARY")
-        print("="*60)
-        
-        duration = self.stats['end_time'] - self.stats['start_time']
-        
-        print(f"\nAutomated Processing Results:")
-        print(f"  HTML files processed: {self.stats['html_processed']}")
-        print(f"  Text/MD files processed: {self.stats['text_processed']}")
-        print(f"  New activities added: {self.stats['total_activities']}")
-        print(f"  Processing time: {duration:.1f} seconds")
-        
-        print(f"\nFiles requiring Claude processing:")
-        print(f"  PDF files: {self.stats['pdf_to_process']}")
-        print(f"  DOC/DOCX files: {self.stats['doc_to_process']}")
-        
-        print("\n" + "="*60)
-        print("NEXT STEPS:")
-        print("1. Review the file 'pdf_doc_for_claude.txt' for manual processing")
-        print("2. Use Claude to extract activities from PDF/DOC files")
-        print("3. Focus on largest PDF files first (highest activity density)")
-        print("="*60)
-    
-    def save_pdf_doc_list(self, base_path):
-        """Salveaz lista de PDF/DOC pentru procesare cu Claude"""
-        base_path = Path(base_path)
-        
-        pdf_files = sorted(base_path.rglob("*.pdf"), key=lambda p: p.stat().st_size, reverse=True)
-        doc_files = list(base_path.rglob("*.doc"))
-        docx_files = list(base_path.rglob("*.docx"))
-        
-        with open('pdf_doc_for_claude.txt', 'w', encoding='utf-8') as f:
-            f.write("PDF/DOC FILES FOR CLAUDE PROCESSING\n")
-            f.write("="*60 + "\n")
-            f.write("Files sorted by size (largest first = likely more activities)\n\n")
-            
-            f.write("TOP PRIORITY PDF FILES (process these first):\n")
-            f.write("-"*40 + "\n")
-            for i, pdf in enumerate(pdf_files[:20], 1):
-                size_mb = pdf.stat().st_size / (1024*1024)
-                f.write(f"{i}. {pdf.name} ({size_mb:.1f} MB)\n")
-                f.write(f"   Path: {pdf}\n\n")
-            
-            if len(pdf_files) > 20:
-                f.write(f"\n... and {len(pdf_files)-20} more PDF files\n\n")
-            
-            f.write("\nDOC/DOCX FILES:\n")
-            f.write("-"*40 + "\n")
-            for doc in doc_files + docx_files:
-                size_kb = doc.stat().st_size / 1024
-                f.write(f"- {doc.name} ({size_kb:.1f} KB)\n")
-        
-        print(f"\nPDF/DOC list saved to: pdf_doc_for_claude.txt")
-
-if __name__ == "__main__":
-    processor = UnifiedProcessor()
-    processor.process_automated_formats()
\ No newline at end of file
diff --git a/scripts/validate_extractions.py b/scripts/validate_extractions.py
new file mode 100644
index 0000000..cdb6113
--- /dev/null
+++ b/scripts/validate_extractions.py
@@ -0,0 +1,208 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+validate_extractions.py — validate every data/extracted/*.json (plan §5b).
+
+For each extraction file it runs two checks:
+  1. JSON-schema validation against scripts/activity_schema.json,
+  2. the source_excerpt anti-hallucination check (each excerpt must be a fuzzy
+     substring of the chunk it came from).
+
+For every failing chunk it:
+  * writes the exact re-extraction prompt to data/extracted/_reextract/<chunk>.prompt.md,
+  * marks the chunk `rejected` in data/chunks/manifest.json.
+
+The orchestrator then re-launches subagents only on the `rejected` chunks; the
+loop repeats until nothing is rejected.
+
+Usage:
+    python scripts/validate_extractions.py
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import sys
+from pathlib import Path
+from typing import Optional
+
+SCRIPT_DIR = Path(__file__).resolve().parent
+REPO_ROOT = SCRIPT_DIR.parent
+for _p in (str(SCRIPT_DIR), str(REPO_ROOT)):
+    if _p not in sys.path:
+        sys.path.insert(0, _p)
+
+from import_common import (  # noqa: E402
+    DEFAULT_SCHEMA_PATH,
+    chunk_key_for,
+    excerpt_matches,
+    excerpt_score,
+    find_chunk_text,
+    iter_extraction_files,
+    load_schema,
+    validate_extraction,
+)
+
+SUBAGENT_PROMPT = SCRIPT_DIR / "SUBAGENT_PROMPT.md"
+
+
+# --------------------------------------------------------------------------
+# re-extraction prompt
+# --------------------------------------------------------------------------
+def build_reextraction_prompt(
+    chunk_key: str, chunk_file: Optional[str], errors: list[str]
+) -> str:
+    """Render the Markdown prompt that asks a subagent to redo a rejected chunk."""
+    target = chunk_file or f"data/chunks/<source_id>/{chunk_key}.txt"
+    rules_doc = SUBAGENT_PROMPT.relative_to(REPO_ROOT)
+    head = (
+        f"# RE-EXTRACTION — chunk `{chunk_key}`",
+        "",
+        "The previous extraction for this chunk was **REJECTED**. Reasons:",
+        "",
+    )
+    steps = (
+        "",
+        "## What to do",
+        "",
+        f"1. Read ONLY this chunk: `{target}`",
+        f"2. Follow the extraction rules in `{rules_doc}`.",
+        "3. Fix every problem listed above. In particular:",
+        "   - every `source_excerpt` must be copied **verbatim** from the chunk",
+        "     (it is checked as a fuzzy substring — invented quotes are rejected);",
+        "   - `source_excerpt` and `page_reference` are mandatory on every activity;",
+        "   - the output must validate against `scripts/activity_schema.json`.",
+        f"4. Overwrite the extraction file `data/extracted/{chunk_key}.json`.",
+        "",
+    )
+    return "\n".join([*head, *(f"- {err}" for err in errors), *steps])
+
+
+# --------------------------------------------------------------------------
+# manifest
+# --------------------------------------------------------------------------
+def load_manifest(manifest_path: Path) -> dict:
+    """Read the manifest JSON; a missing or malformed file yields an empty one."""
+    try:
+        data = json.loads(manifest_path.read_text(encoding="utf-8"))
+    except (ValueError, OSError):  # absent, unreadable, not UTF-8 or not JSON
+        data = None
+    if isinstance(data, dict) and isinstance(data.setdefault("chunks", {}), dict):
+        return data
+    return {"chunks": {}}
+
+
+def save_manifest(manifest: dict, manifest_path: Path) -> None:
+    """Write `manifest` as indented UTF-8 JSON, creating parent directories."""
+    manifest_path.parent.mkdir(parents=True, exist_ok=True)
+    serialized = json.dumps(manifest, indent=2, ensure_ascii=False)
+    manifest_path.write_text(serialized, encoding="utf-8")
+
+
+def mark_rejected(manifest: dict, chunk_key: str) -> None:
+    """Set the chunk's manifest state to `rejected`, adding the entry if absent."""
+    registry = manifest["chunks"]
+    record = registry.setdefault(chunk_key, {})
+    record["state"] = "rejected"
+
+
+# --------------------------------------------------------------------------
+# validation
+# --------------------------------------------------------------------------
+def validate_file(json_path: Path, schema: dict, chunks_dir: Path) -> list[str]:
+    """Return the list of errors for one extraction file (empty == valid)."""
+    try:
+        data = json.loads(json_path.read_text(encoding="utf-8"))
+    except ValueError as exc:  # JSONDecodeError, or bytes that are not UTF-8
+        return [f"invalid JSON: {exc}"]
+
+    errors = validate_extraction(data, schema)
+    if errors:  # schema failures short-circuit the excerpt check
+        return errors
+
+    header = data.get("header", {})
+    chunk_text = find_chunk_text(json_path, header, chunks_dir)
+    if chunk_text is None:
+        return [f"source chunk not found for {chunk_key_for(json_path, header)}"]
+
+    for adict in data.get("activities", []):
+        excerpt = adict.get("source_excerpt") or ""
+        if not excerpt_matches(excerpt, chunk_text):
+            score = excerpt_score(excerpt, chunk_text)  # only used in the message
+            errors.append(
+                f"activity {adict.get('name')!r}: source_excerpt not found in "
+                f"chunk (best match {score:.0f}/100) — possible hallucination"
+            )
+    return errors
+
+
+def run(
+    extracted_dir: Path,
+    chunks_dir: Path,
+    manifest_path: Path,
+    schema_path: Path = DEFAULT_SCHEMA_PATH,
+) -> dict:
+    """Validate every extraction file, reject the failures, return a summary."""
+    schema = load_schema(schema_path)
+    manifest = load_manifest(manifest_path)
+    reextract_dir = extracted_dir / "_reextract"
+    report = {"total": 0, "valid": 0, "rejected": 0, "rejected_chunks": []}
+    for json_path in iter_extraction_files(extracted_dir):
+        report["total"] += 1
+        errors = validate_file(json_path, schema, chunks_dir)
+        if not errors:
+            report["valid"] += 1
+            continue
+
+        report["rejected"] += 1
+        try:
+            header = json.loads(json_path.read_text(encoding="utf-8"))["header"]
+        except (ValueError, LookupError, TypeError):
+            header = None  # bad JSON/UTF-8, non-object root, or no header key
+        if not isinstance(header, dict):
+            header = {}  # fall back to an empty header
+        chunk_key = chunk_key_for(json_path, header)
+        # chunk_file is only known for chunks already registered in the manifest.
+        meta = manifest["chunks"].get(chunk_key)
+        chunk_file = meta.get("chunk_file") if meta else None
+
+        reextract_dir.mkdir(parents=True, exist_ok=True)
+        prompt = build_reextraction_prompt(chunk_key, chunk_file, errors)
+        (reextract_dir / f"{chunk_key}.prompt.md").write_text(prompt, encoding="utf-8")
+
+        mark_rejected(manifest, chunk_key)
+        report["rejected_chunks"].append({"chunk": chunk_key, "errors": errors})
+
+    save_manifest(manifest, manifest_path)
+    return report
+
+
+# --------------------------------------------------------------------------
+# CLI
+# --------------------------------------------------------------------------
+def main(argv: Optional[list[str]] = None) -> int:
+    """CLI entry point: validate the extraction files and print a summary."""
+    cli = argparse.ArgumentParser(description="Validate extraction JSON files.")
+    cli.add_argument("--extracted", default="data/extracted")
+    cli.add_argument("--chunks", default="data/chunks")
+    cli.add_argument("--manifest", default="data/chunks/manifest.json")
+    cli.add_argument("--schema", default=str(DEFAULT_SCHEMA_PATH))
+    opts = cli.parse_args(argv)
+    extracted, chunks = Path(opts.extracted), Path(opts.chunks)
+    summary = run(extracted, chunks, Path(opts.manifest), Path(opts.schema))
+
+    print(f"extraction files : {summary['total']}")
+    print(f"  valid          : {summary['valid']}")
+    print(f"  rejected       : {summary['rejected']}")
+    for rejected in summary["rejected_chunks"]:
+        print(f"  [rejected] {rejected['chunk']}")
+        for message in rejected["errors"]:
+            print(f"      - {message}")
+    if summary["rejected"]:
+        print(f"\nRe-extraction prompts written to {opts.extracted}/_reextract/")
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/tests/conftest.py b/tests/conftest.py
new file mode 100644
index 0000000..3e59d0e
--- /dev/null
+++ b/tests/conftest.py
@@ -0,0 +1,114 @@
+# -*- coding: utf-8 -*-
+"""
+Shared pytest fixtures for the extraction-pipeline tests.
+
+scripts/ is not a package, so it is added to sys.path here. Synthetic fixtures
+(PDF, docx, zip, HTML) are generated at runtime — no binary blobs in the repo.
+"""
+
+import sys
+import zipfile
+from pathlib import Path
+
+import pytest
+
+REPO_ROOT = Path(__file__).resolve().parent.parent
+SCRIPTS_DIR = REPO_ROOT / "scripts"
+if str(SCRIPTS_DIR) not in sys.path:
+    sys.path.insert(0, str(SCRIPTS_DIR))
+
+
+# --------------------------------------------------------------------------
+# synthetic PDF — deliberately large to pin the "no max_pages" regression
+# --------------------------------------------------------------------------
+@pytest.fixture
+def big_pdf(tmp_path):
+    """Generate a 60-page PDF in which page <n> carries the token 'PDFMARK-<n>'."""
+    from reportlab.pdfgen import canvas
+    from reportlab.lib.pagesizes import letter
+
+    pdf_path = tmp_path / "big.pdf"
+    pdf = canvas.Canvas(str(pdf_path), pagesize=letter)
+    for num in range(1, 61):
+        pdf.drawString(72, 720, f"PDFMARK-{num} synthetic activity page number {num}")
+        pdf.drawString(72, 700, "Acest joc educativ se joaca in echipa.")
+        pdf.showPage()
+    pdf.save()
+    return pdf_path
+
+
+# --------------------------------------------------------------------------
+# synthetic docx — 100 paragraphs => 3 synthetic pages at 40 paras/page
+# --------------------------------------------------------------------------
+@pytest.fixture
+def sample_docx(tmp_path):
+    """Write a .docx with 100 numbered paragraphs and return its path."""
+    import docx
+    docx_path = tmp_path / "sample.docx"
+    doc = docx.Document()
+    for index in range(100):
+        doc.add_paragraph(f"Paragraf {index}: continut joc team-building.")
+    doc.save(str(docx_path))
+    return docx_path
+
+
+# --------------------------------------------------------------------------
+# synthetic HTML mirror page — with nav/script/footer chrome to strip
+# --------------------------------------------------------------------------
+HTML_WITH_NAV = """<!doctype html>
+<html><head><title>Joc</title>
+<style>.x{color:red}</style>
+<script>var tracking = 1;</script>
+</head><body>
+<nav><a href="/">Home</a><a href="/games">Games</a></nav>
+<header>Site Banner Junk</header>
+<main>
+<h1>Vanatoarea de comori</h1>
+<p>Acesta este un joc real de orientare pentru cercetasi.</p>
+<p>Jucatorii cauta indicii ascunse in tabara.</p>
+</main>
+<footer>Copyright 2024 - toate drepturile rezervate</footer>
+</body></html>
+"""
+
+
+@pytest.fixture
+def html_with_nav(tmp_path):
+    page = tmp_path / "page.html"  # receives the HTML_WITH_NAV markup verbatim
+    page.write_text(HTML_WITH_NAV, encoding="utf-8")
+    return page
+
+
+# --------------------------------------------------------------------------
+# synthetic zip — contains a docx and a stray junk file
+# --------------------------------------------------------------------------
+@pytest.fixture
+def sample_zip(tmp_path, sample_docx):
+    zip_path = tmp_path / "archive.zip"  # holds the docx plus one junk entry
+    with zipfile.ZipFile(zip_path, mode="w") as archive:
+        archive.write(sample_docx, arcname="inner/sample.docx")
+        archive.writestr("desktop.ini", "junk")
+    return zip_path
+
+
+# --------------------------------------------------------------------------
+# synthetic normalized source — paginated, with an activity straddling a
+# page boundary so the chunker overlap can be verified.
+# --------------------------------------------------------------------------
+@pytest.fixture
+def paginated_source(tmp_path):
+    """Write a 50-page normalized source; one activity straddles pages 20 and 21."""
+    # Pages 20 and 21 carry the two halves of the boundary-straddling activity.
+    special = {
+        20: "ACTIVITY-START jocul podului care traverseaza pagina",
+        21: "continuare a jocului podului ACTIVITY-END",
+    }
+    out = ["SOURCE: synthetic/test.pdf", "CONVERTED: 2026-05-19"]
+    out += ["FORMAT: pdf", "=" * 50, ""]
+    for page in range(1, 51):
+        body = special.get(page, f"continut obisnuit pe pagina {page}")
+        out += [f"--- PAGE {page} ---", body, ""]
+
+    source_path = tmp_path / "src_paginated.txt"
+    source_path.write_text("\n".join(out), encoding="utf-8")
+    return source_path
diff --git a/tests/fixtures/.gitkeep b/tests/fixtures/.gitkeep
new file mode 100644
index 0000000..f016cdb
--- /dev/null
+++ b/tests/fixtures/.gitkeep
@@ -0,0 +1,3 @@
+# Test fixtures (synthetic PDF/docx/zip/HTML) are generated at runtime by
+# tests/conftest.py — no binary blobs are committed. This file only preserves
+# the directory in git.
diff --git a/tests/test_build_database.py b/tests/test_build_database.py
new file mode 100644
index 0000000..e4a5e14
--- /dev/null
+++ b/tests/test_build_database.py
@@ -0,0 +1,334 @@
+# -*- coding: utf-8 -*-
+"""
+Tests for scripts/build_database.py — the import / dedup / swap side.
+
+Covers: category -> slug + `altele` fallback; dedup across all three threshold
+bands; EN != RO never merged; field combination on merge; atomic swap with a
+simulated mid-build crash; the source_excerpt substring check.
+"""
+
+import json
+import os
+import sys
+from pathlib import Path
+
+import pytest
+
+REPO_ROOT = Path(__file__).resolve().parent.parent
+SCRIPTS_DIR = REPO_ROOT / "scripts"
+for _p in (str(REPO_ROOT), str(SCRIPTS_DIR)):
+    if _p not in sys.path:
+        sys.path.insert(0, _p)
+
+import build_database as bd  # noqa: E402
+from app.models.activity import Activity  # noqa: E402
+from app.models.database import DatabaseManager  # noqa: E402
+
+
+# --------------------------------------------------------------------------
+# helpers
+# --------------------------------------------------------------------------
+def _activity(**over):
+    """Build an Activity from default field values overridden by `over`."""
+    defaults = {
+        "name": "Jocul testului",
+        "description": "O activitate de echipa in aer liber.",
+        "category": "team-building",
+        "content_type": "joc",
+        "language": "ro",
+        "extraction_confidence": "high",
+    }
+    return Activity(**{**defaults, **over})
+
+
+def _ext_activity(**over):
+    """Return a schema-valid extraction-JSON activity dict, with overrides."""
+    defaults = {
+        "name": "Jocul testului",
+        "description": "O activitate de echipa in aer liber.",
+        "category": "team-building",
+        "content_type": "joc",
+        "language": "ro",
+        "extraction_confidence": "high",
+        "source_excerpt": "ANCHOR-EXCERPT despre jocul testului",
+        "page_reference": "page 1",
+    }
+    # Overrides win over the defaults; keys not in the defaults are appended.
+    return {**defaults, **over}
+
+
+def _write_extraction(extracted_dir, chunk_key, activities, source_id="src01"):
+    """Write `<chunk_key>.json` (fixed header + `activities`) into extracted_dir."""
+    header = {
+        "source_hash": "hash1234deadbeef",
+        "schema_version": "1.0",
+        "prompt_version": "1.0",
+        "chunk_range": "pages 1-20",
+        "source_id": source_id,
+        "chunk_key": chunk_key,
+    }
+    # Header values are fixed test data; only the chunk and source ids vary.
+    document = {"header": header, "activities": activities}
+
+    extracted_dir.mkdir(parents=True, exist_ok=True)
+    target = extracted_dir / f"{chunk_key}.json"
+    target.write_text(json.dumps(document, ensure_ascii=False), encoding="utf-8")
+
+
+def _write_chunk(chunks_dir, source_id, chunk_key, text):
+    source_dir = chunks_dir / source_id  # chunk files sit in a per-source folder
+    source_dir.mkdir(parents=True, exist_ok=True)
+    source_dir.joinpath(f"{chunk_key}.txt").write_text(text, encoding="utf-8")
+
+
+# --------------------------------------------------------------------------
+# step 3 — category normalization
+# --------------------------------------------------------------------------
+def test_category_alias_mapped_to_slug():
+    raw = _ext_activity(category="teambuilding")
+    assert bd.dict_to_activity(raw, "s.txt").category == "team-building"
+
+
+def test_unknown_category_falls_back_to_altele():
+    raw = _ext_activity(category="zzz-not-a-category")
+    assert bd.dict_to_activity(raw, "s.txt").category == "altele"
+
+
+def test_content_type_normalized():
+    raw = _ext_activity(content_type="games")
+    assert bd.dict_to_activity(raw, "s.txt").content_type == "joc"
+
+
+# --------------------------------------------------------------------------
+# step 4 — dedup, three bands
+# --------------------------------------------------------------------------
+def test_dedup_auto_merge_identical_descriptions():
+    """Two activities with identical descriptions collapse into one merged row."""
+    text = "copiii formeaza echipe si traverseaza terenul"
+    pair = [_activity(description=text), _activity(description=text)]
+    merged, stats = bd.dedup_activities(pair)
+    assert len(merged) == 1
+    assert stats["auto_merged"] == 1
+    assert merged[0].needs_review == 0
+
+
+def test_dedup_borderline_keeps_both_and_flags_needs_review():
+    """Descriptions scoring in [60, 85) are both kept and flagged needs_review."""
+    from rapidfuzz import fuzz
+
+    short = "alpha beta gamma delta epsilon"
+    longer = "alpha beta gamma delta epsilon zeta eta theta iota"
+    score = fuzz.token_sort_ratio(short, longer)
+    assert 60.0 <= score < 85.0, f"precondition: score={score} not borderline"
+
+    candidates = [_activity(description=short), _activity(description=longer)]
+    kept, stats = bd.dedup_activities(candidates)
+
+    assert len(kept) == 2
+    assert stats["borderline"] == 2
+    assert all(act.needs_review == 1 for act in kept)
+
+
+def test_dedup_low_similarity_kept_as_separate_variants():
+    """Descriptions scoring below 60 stay separate and are not flagged."""
+    from rapidfuzz import fuzz
+
+    first = "alpha beta gamma delta epsilon"
+    second = "quebec romeo sierra tango uniform victor whiskey"
+    assert fuzz.token_sort_ratio(first, second) < 60.0
+
+    candidates = [_activity(description=first), _activity(description=second)]
+    kept, stats = bd.dedup_activities(candidates)
+
+    assert len(kept) == 2
+    assert stats["auto_merged"] == 0
+    assert all(act.needs_review == 0 for act in kept)
+
+
+def test_dedup_never_merges_across_languages():
+    """Identical name and description in RO and EN must remain two rows."""
+    shared = "children form teams and cross the field"
+    variants = [
+        _activity(name="Cursa", description=shared, language=lang)
+        for lang in ("ro", "en")
+    ]
+    kept, stats = bd.dedup_activities(variants)
+    assert len(kept) == 2 and stats["auto_merged"] == 0
+    assert {act.language for act in kept} == {"ro", "en"}
+
+
+def test_merge_combines_fields():
+    """A merge keeps the longer rules and unions materials, sources, keywords."""
+    desc = "copiii formeaza echipe si traverseaza terenul cu obstacole"
+    first = _activity(
+        description=desc,
+        rules="regula scurta",
+        materials_list="franghie, esarfa",
+        source_file="a.txt",
+        keywords="echipa",
+    )
+    second = _activity(
+        description=desc,
+        rules="o regula mult mai lunga si mai detaliata pentru joc",
+        materials_list="busola, esarfa",
+        source_file="b.txt",
+        keywords="cooperare",
+    )
+    result, _ = bd.dedup_activities([first, second])
+    assert len(result) == 1
+    merged = result[0]
+    assert merged.rules == "o regula mult mai lunga si mai detaliata pentru joc"
+    materials = {item.strip() for item in merged.materials_list.split(",")}
+    assert materials == {"franghie", "esarfa", "busola"}
+    assert set(merged.source_files) == {"a.txt", "b.txt"}
+    assert merged.popularity_score == 1
+    assert {kw.strip() for kw in merged.keywords.split(",")} == {"echipa", "cooperare"}
+
+
+# --------------------------------------------------------------------------
+# step 5 — review decisions
+# --------------------------------------------------------------------------
+def test_review_decision_drop_removes_row():
+    """A `drop` decision keyed on the activity's content key removes the row."""
+    from import_common import content_key, normalize_name
+    act = _activity(description="o descriere de test")
+    key = content_key(normalize_name(act.name), act.language, act.description)
+    decisions = {key: {"decision": "drop"}}
+    kept, stats = bd.apply_review_decisions([act], decisions)
+    assert kept == [] and stats["dropped"] == 1
+
+
+def test_review_decision_keep_separate_clears_needs_review():
+    """A `keep-separate` decision keeps the row and resets needs_review to 0."""
+    from import_common import content_key, normalize_name
+    act = _activity(description="o descriere de test")
+    act.needs_review = 1
+    key = content_key(normalize_name(act.name), act.language, act.description)
+    decisions = {key: {"decision": "keep-separate"}}
+    kept, stats = bd.apply_review_decisions([act], decisions)
+    assert len(kept) == 1 and kept[0].needs_review == 0 and stats["resolved"] == 1
+
+
+# --------------------------------------------------------------------------
+# step 2b — source_excerpt hallucination check
+# --------------------------------------------------------------------------
+def test_hallucinated_excerpt_activity_dropped(tmp_path):
+    """An activity whose excerpt is absent from the chunk is dropped on collect."""
+    from import_common import load_schema
+
+    extracted = tmp_path / "extracted"
+    chunks = tmp_path / "chunks"
+    sources = tmp_path / "sources"
+    real_excerpt = "textul real apare in bucata sursa"
+    fake_excerpt = "acest citat nu exista nicaieri in sursa originala xyzzy"
+
+    activities = [
+        _ext_activity(name="Joc real", source_excerpt=real_excerpt),
+        _ext_activity(name="Joc inventat", source_excerpt=fake_excerpt),
+    ]
+    _write_extraction(extracted, "src01.part01", activities)
+    # Only the first excerpt occurs in the chunk text written below.
+    chunk_text = f"--- PAGE 1 ---\n{real_excerpt} pentru jocul real.\n"
+    _write_chunk(chunks, "src01", "src01.part01", chunk_text)
+
+    schema = load_schema()
+    result = bd.collect_activities(extracted, chunks, sources, schema)
+    kept_names = {act.name for act in result["activities"]}
+
+    assert kept_names == {"Joc real"}
+    assert result["activities_hallucinated"] == 1
+    assert (extracted / "_rejected").exists()
+
+
+def test_schema_invalid_file_moved_to_rejected(tmp_path):
+    """A file failing schema validation is moved to _rejected with an errors file."""
+    from import_common import load_schema
+
+    extracted = tmp_path / "extracted"
+    extracted.mkdir(parents=True)
+    # Empty header (required keys missing) plus an activity with only a name.
+    invalid = {"header": {}, "activities": [{"name": "x"}]}
+    (extracted / "bad.json").write_text(json.dumps(invalid), encoding="utf-8")
+
+    result = bd.collect_activities(
+        extracted, tmp_path / "chunks", tmp_path / "sources", load_schema()
+    )
+
+    rejected = extracted / "_rejected"
+    assert result["files_rejected_schema"] == 1
+    assert not (extracted / "bad.json").exists()
+    assert (rejected / "bad.json").exists() and (rejected / "bad.errors.txt").exists()
+
+
+# --------------------------------------------------------------------------
+# end-to-end rebuild + atomic swap
+# --------------------------------------------------------------------------
+def _setup_corpus(tmp_path):
+    """Create a one-activity extraction and matching chunk; return the three dirs."""
+    excerpt = "jocul testului este o activitate de echipa"
+    extracted = tmp_path / "extracted"
+    chunks = tmp_path / "chunks"
+    activities = [_ext_activity(source_excerpt=excerpt)]
+    _write_extraction(extracted, "src01.part01", activities)
+    chunk_text = f"--- PAGE 1 ---\n{excerpt} in aer liber.\n"
+    _write_chunk(chunks, "src01", "src01.part01", chunk_text)
+
+    # The `sources` path is returned but never created on disk here.
+    return extracted, chunks, tmp_path / "sources"
+
+
+def test_rebuild_creates_database(tmp_path):
+    """rebuild() writes a database holding the single corpus activity."""
+    extracted, chunks, sources = _setup_corpus(tmp_path)
+    target = tmp_path / "activities.db"
+
+    report = bd.rebuild(
+        extracted_dir=extracted,
+        chunks_dir=chunks,
+        sources_dir=sources,
+        db_path=target,
+    )
+    assert target.exists() and report["final_count"] == 1
+
+    rows = DatabaseManager(str(target)).search_activities()
+    assert [row["category"] for row in rows] == ["team-building"]
+
+
+def test_atomic_swap_keeps_live_db_intact_on_crash(tmp_path, monkeypatch):
+    """If the build crashes midway, the live DB file keeps its exact bytes."""
+    extracted, chunks, sources = _setup_corpus(tmp_path)
+    live_path = tmp_path / "activities.db"
+
+    # Seed a live database so there is something to protect.
+    live = DatabaseManager(str(live_path))
+    live.insert_activity(_activity(name="Sentinel viu"))
+    snapshot = live_path.read_bytes()
+
+    def crash(self, *args, **kwargs):
+        raise RuntimeError("simulated mid-build crash")
+
+    monkeypatch.setattr(DatabaseManager, "bulk_insert_activities", crash)
+
+    with pytest.raises(RuntimeError, match="simulated mid-build crash"):
+        bd.rebuild(
+            extracted_dir=extracted, chunks_dir=chunks, sources_dir=sources,
+            db_path=live_path,
+        )
+
+    # The live file is unchanged and the temporary build file is gone.
+    assert live_path.read_bytes() == snapshot
+    assert not (tmp_path / "activities.db.tmp").exists()
+
+
+def test_rebuild_backs_up_live_db(tmp_path):
+    """Rebuilding over an existing DB reports an `activities.db.bak` backup."""
+    extracted, chunks, sources = _setup_corpus(tmp_path)
+    live_path = tmp_path / "activities.db"
+    DatabaseManager(str(live_path)).insert_activity(_activity(name="Vechi"))
+    report = bd.rebuild(
+        extracted_dir=extracted, chunks_dir=chunks, sources_dir=sources,
+        db_path=live_path,
+    )
+    backup = report["backup"]
+    assert backup is not None and Path(backup).exists()
+    assert os.path.basename(backup) == "activities.db.bak"
diff --git a/tests/test_chunk_sources.py b/tests/test_chunk_sources.py
new file mode 100644
index 0000000..1b6b5e5
--- /dev/null
+++ b/tests/test_chunk_sources.py
@@ -0,0 +1,183 @@
+# -*- coding: utf-8 -*-
+"""Tests for scripts/chunk_sources.py."""
+
+import json
+
+import chunk_sources as cs
+import normalize_sources as ns
+
+
+def _pages(n):
+    return [(k + 1, f"text-{k + 1}") for k in range(n)]
+
+
+# --------------------------------------------------------------------------
+# header parsing
+# --------------------------------------------------------------------------
+def test_parse_source_splits_header_and_body(paginated_source):
+    """parse_source yields a header indexable by field name, then the page body."""
+    header, body = cs.parse_source(paginated_source.read_text(encoding="utf-8"))
+    assert header["FORMAT"] == "pdf"
+    assert body.lstrip().startswith("--- PAGE 1 ---")
+
+
+# --------------------------------------------------------------------------
+# page chunking
+# --------------------------------------------------------------------------
+def test_chunk_pages_basic_split():
+    """50 pages at 20 per chunk with overlap 4: stride 16, starts 1, 17, 33, ..."""
+    result = cs.chunk_pages(_pages(50), overlap=4, pages_per_chunk=20)
+    assert result[0]["page_start"] == 1 and result[0]["page_end"] == 20
+    assert result[1]["page_start"] == 17
+    assert result[-1]["page_end"] == 50
+
+
+def test_chunk_pages_have_overlap():
+    """The first two chunks share exactly 4 pages (page ranges are inclusive)."""
+    first, second = cs.chunk_pages(_pages(50), pages_per_chunk=20, overlap=4)[:2]
+    assert first["page_end"] - second["page_start"] + 1 == 4
+
+
+def test_chunk_pages_short_document_single_chunk():
+    result = cs.chunk_pages(_pages(8), overlap=4, pages_per_chunk=20)
+    assert len(result) == 1  # 8 pages fit inside one 20-page chunk
+    assert result[0]["page_start"] == 1 and result[0]["page_end"] == 8
+
+
+def test_chunk_pages_empty():
+    assert cs.chunk_pages([]) == []  # no pages in, no chunks out
+
+
+def test_activity_at_page_boundary_intact_in_one_chunk(paginated_source):
+    """The activity straddling pages 20/21 must sit whole inside at least one chunk."""
+    chunks = cs.make_chunks(paginated_source.read_text(encoding="utf-8"))
+
+    def _holds_whole_activity(chunk):
+        return "ACTIVITY-START" in chunk["text"] and "ACTIVITY-END" in chunk["text"]
+
+    full = list(filter(_holds_whole_activity, chunks))
+    assert full, "activity spanning a page boundary was split across all chunks"
+
+
+# --------------------------------------------------------------------------
+# word-window chunking for unpaginated text
+# --------------------------------------------------------------------------
+def test_chunk_words_window_and_overlap():
+    """25000 words, window 10000, overlap 2000: three chunks sharing 2000 words."""
+    words = [f"w{i}" for i in range(25_000)]
+    chunks = cs.chunk_words(" ".join(words), window=10_000, overlap=2_000)
+    assert len(chunks) == 3  # stride 8000 over 25000 words
+    head, following = (chunks[k]["text"].split() for k in (0, 1))
+    assert head[8_000:10_000] == following[0:2_000]
+
+
+def test_make_chunks_unpaginated_uses_word_windows():
+    """A body without page markers is split into chunks whose range starts "words"."""
+    header = "SOURCE: x\nFORMAT: txt\n" + "=" * 50 + "\n\n"
+    chunks = cs.make_chunks(header + "cuvant " * 15_000)
+    assert len(chunks) >= 2
+    assert chunks[0]["chunk_range"].startswith("words")
+
+
+# --------------------------------------------------------------------------
+# stable source ids — anti-collision
+# --------------------------------------------------------------------------
+def test_stable_id_same_stem_different_path_no_collision():
+    paths = ("camp/games/scout.pdf", "school/lessons/scout.pdf")
+    camp_id, school_id = (ns.stable_id(p) for p in paths)
+    assert camp_id != school_id  # same stem, different directory -> different id
+    assert camp_id.endswith("_scout") and school_id.endswith("_scout")
+
+
+def test_stable_id_deterministic():
+    assert ns.stable_id("a/b/c.pdf") == ns.stable_id("a/b/c.pdf")  # same path, same id
+
+
+# --------------------------------------------------------------------------
+# manifest registry + idempotency
+# --------------------------------------------------------------------------
+def test_run_writes_chunks_and_manifest(paginated_source, tmp_path):
+    """run() chunks the one source and lists every chunk as pending in the manifest."""
+    sources_dir = tmp_path / "sources"
+    sources_dir.mkdir()
+    source_text = paginated_source.read_text(encoding="utf-8")
+    (sources_dir / paginated_source.name).write_text(source_text, encoding="utf-8")
+    chunks_dir = tmp_path / "chunks"
+
+    summary = cs.run(sources_dir, chunks_dir)
+    assert summary["sources"] == 1
+    assert summary["chunks"] >= 2
+    # decode the manifest as UTF-8 explicitly instead of the locale default
+    manifest = json.loads((chunks_dir / "manifest.json").read_text(encoding="utf-8"))
+    assert manifest["chunks"]
+    for key, meta in manifest["chunks"].items():
+        assert meta["state"] == "pending"
+        assert meta["expected_json"] == f"{key}.json"
+        assert (chunks_dir.parent / meta["chunk_file"]).exists()
+
+
+def test_manifest_idempotent_preserves_state(paginated_source, tmp_path):
+    """Re-running on an unchanged source keeps chunk states and the chunk count."""
+    sources_dir = tmp_path / "sources"
+    sources_dir.mkdir()
+    source_text = paginated_source.read_text(encoding="utf-8")
+    (sources_dir / paginated_source.name).write_text(source_text, encoding="utf-8")
+    chunks_dir = tmp_path / "chunks"
+    manifest_path = chunks_dir / "manifest.json"
+
+    cs.run(sources_dir, chunks_dir)
+
+    # orchestrator marks one chunk done (manifest I/O is UTF-8 in both directions)
+    manifest = json.loads(manifest_path.read_text(encoding="utf-8"))
+    first_key = next(iter(manifest["chunks"]))
+    n_before = len(manifest["chunks"])
+    manifest["chunks"][first_key]["state"] = "done"
+    manifest_path.write_text(json.dumps(manifest), encoding="utf-8")
+
+    # re-run: 'done' must survive, no chunk added or lost
+    cs.run(sources_dir, chunks_dir)
+    manifest2 = json.loads(manifest_path.read_text(encoding="utf-8"))
+    assert len(manifest2["chunks"]) == n_before
+    assert manifest2["chunks"][first_key]["state"] == "done"
+    assert all(
+        m["state"] in ("pending", "done") for m in manifest2["chunks"].values()
+    )
+
+
+def test_manifest_resets_state_when_source_changes(paginated_source, tmp_path):
+    """Editing a source after a chunk was marked done puts that chunk back to pending."""
+    sources_dir = tmp_path / "sources"
+    sources_dir.mkdir()
+    src = sources_dir / paginated_source.name
+    src.write_text(paginated_source.read_text(encoding="utf-8"), encoding="utf-8")
+    chunks_dir = tmp_path / "chunks"
+    manifest_path = chunks_dir / "manifest.json"
+    cs.run(sources_dir, chunks_dir)
+    manifest = json.loads(manifest_path.read_text(encoding="utf-8"))
+    first_key = next(iter(manifest["chunks"]))
+    manifest["chunks"][first_key]["state"] = "done"
+    manifest_path.write_text(json.dumps(manifest), encoding="utf-8")
+
+    # mutate the source content -> hash changes -> state resets
+    src.write_text(src.read_text(encoding="utf-8") + "\n--- PAGE 51 ---\nextra\n",
+                   encoding="utf-8")
+    cs.run(sources_dir, chunks_dir)
+    manifest2 = json.loads(manifest_path.read_text(encoding="utf-8"))
+    assert manifest2["chunks"][first_key]["state"] == "pending"
+
+
+def test_prune_stale_removes_orphan_entries(paginated_source, tmp_path):
+    """Once a source file is deleted, the next run prunes its manifest entries."""
+    sources_dir = tmp_path / "sources"
+    sources_dir.mkdir()
+    src = sources_dir / paginated_source.name
+    src.write_text(paginated_source.read_text(encoding="utf-8"), encoding="utf-8")
+    chunks_dir = tmp_path / "chunks"
+
+    cs.run(sources_dir, chunks_dir)
+    # delete the source -> its chunks become stale
+    src.unlink()
+    summary = cs.run(sources_dir, chunks_dir)
+    assert summary["chunks"] == 0 and summary["pruned"] >= 1
+    manifest = json.loads((chunks_dir / "manifest.json").read_text(encoding="utf-8"))
+    assert manifest["chunks"] == {}
diff --git a/tests/test_extract_common.py b/tests/test_extract_common.py
new file mode 100644
index 0000000..17dedee
--- /dev/null
+++ b/tests/test_extract_common.py
@@ -0,0 +1,177 @@
+# -*- coding: utf-8 -*-
+"""Tests for scripts/extract_common.py."""
+
+import shutil
+import zipfile
+
+import pytest
+
+import extract_common as ec
+
+
+# --------------------------------------------------------------------------
+# format detection
+# --------------------------------------------------------------------------
+def test_detect_format():
+    expected = {
+        "a/b/file.PDF": "pdf", "x.docx": "docx", "x.doc": "doc",
+        "x.pptx": "pptx", "x.html": "html", "x.zip": "zip",
+        "x.epub": "epub", "x.xyz": "unknown",
+    }
+    # an upper-case "PDF" maps to "pdf"; an unrecognised extension to "unknown"
+    for path, fmt in expected.items():
+        assert ec.detect_format(path) == fmt, path
+
+
+def test_is_junk():
+    for junk in ("some/desktop.ini", "notes.bak", "README.md"):
+        assert ec.is_junk(junk)
+    # a real source document must not be classified as junk
+    assert not ec.is_junk("1000 Scout Games.pdf")
+
+
+# --------------------------------------------------------------------------
+# PDF — the critical "no max_pages" regression
+# --------------------------------------------------------------------------
+def test_pdf_extracts_all_60_pages(big_pdf):
+    """Regression: the old converter capped at 50 pages; page 60 must now appear."""
+    extracted = ec.extract_pdf(big_pdf)
+    for needle in ("--- PAGE 60 ---", "PDFMARK-60"):
+        assert needle in extracted
+    assert ec.count_page_markers(extracted) == 60
+
+
+def test_pdf_does_not_truncate_mid_document(big_pdf):
+    pages = ec.split_pages(ec.extract_pdf(big_pdf))
+    last_number, _last_text = pages[-1]
+    assert last_number == 60  # last marker is the real last page
+
+
+# --------------------------------------------------------------------------
+# page join / split round-trip
+# --------------------------------------------------------------------------
+def test_join_split_round_trip():
+    pages = ec.split_pages(ec.join_pages(["alpha", "beta", "gamma"]))
+    numbers = [number for number, _ in pages]
+    texts = [text for _, text in pages]
+    assert numbers == [1, 2, 3] and texts == ["alpha", "beta", "gamma"]
+
+
+def test_split_pages_no_markers_returns_empty():
+    assert ec.split_pages("plain text with no markers") == []  # nothing to split on
+
+
+# --------------------------------------------------------------------------
+# docx — synthetic page markers
+# --------------------------------------------------------------------------
+def test_docx_synthetic_page_markers(sample_docx):
+    """100 paragraphs at 40 per synthetic page give 3 markers; paragraph 99 is kept."""
+    text = ec.extract_docx(sample_docx)
+    marker_count = ec.count_page_markers(text)
+    assert marker_count == 3 and "Paragraf 99" in text
+
+
+# --------------------------------------------------------------------------
+# HTML mirror — nav/script/footer stripped
+# --------------------------------------------------------------------------
+def test_html_strips_chrome(html_with_nav):
+    """Page content is kept while the surrounding site chrome is removed."""
+    text = ec.extract_html(html_with_nav)
+    kept = ("Vanatoarea de comori", "joc real de orientare")
+    chrome = ("tracking", "Site Banner Junk", "toate drepturile rezervate", "Games")
+    for phrase in kept:
+        assert phrase in text
+    for phrase in chrome:
+        assert phrase not in text
+
+
+# --------------------------------------------------------------------------
+# content hash + near-duplicate elimination
+# --------------------------------------------------------------------------
+def test_content_hash_ignores_whitespace():
+    h = ec.content_hash  # extra spaces / trailing newline must not change the hash
+    assert h("hello  world") == h("hello world\n") and h("hello world") != h("goodbye world")
+
+
+def test_dedupe_exact_duplicates():
+    items = [("a", "joc identic"), ("b", "joc identic"), ("c", "alt joc")]
+    surviving_keys = [key for key, _text in ec.dedupe_texts(items)]
+    assert surviving_keys == ["a", "c"]  # "b" repeats the text of "a" verbatim
+
+
+def test_dedupe_near_duplicates():
+    """A near-copy of an earlier text is dropped; unrelated text is kept."""
+    base = "Vanatoarea de comori este un joc de orientare pentru cercetasi in tabara."
+    near = base + " Pagina printata."  # same text plus a short suffix
+    unrelated = "Cu totul alt continut diferit aici."
+    items = [("orig", base), ("print", near), ("other", unrelated)]
+    keys = {key for key, _ in ec.dedupe_texts(items, threshold=85.0)}
+    assert {"orig", "other"} <= keys
+    assert "print" not in keys
+
+
+# --------------------------------------------------------------------------
+# zip recursion
+# --------------------------------------------------------------------------
+def test_zip_recurses_into_inner_files(sample_zip):
+    """Text of a document inside the archive shows up, page markers included."""
+    text = ec.extract_zip(sample_zip)
+    assert "Paragraf 0" in text and ec.count_page_markers(text) > 0
+
+
+def test_zip_bad_archive_returns_empty(tmp_path):
+    corrupt = tmp_path / "broken.zip"
+    corrupt.write_bytes(b"not a zip")
+    assert ec.extract_zip(corrupt) == ""  # an unreadable archive yields no text
+
+
+def test_nested_zip(tmp_path, sample_zip):
+    """Text is still extracted from a zip that is stored inside another zip."""
+    wrapper = tmp_path / "outer.zip"
+    with zipfile.ZipFile(wrapper, mode="w") as archive:
+        archive.write(sample_zip, arcname="nested/archive.zip")
+    assert "Paragraf 0" in ec.extract_zip(wrapper)
+
+
+# --------------------------------------------------------------------------
+# preflight
+# --------------------------------------------------------------------------
+def test_preflight_python_packages_present():
+    """The test environment is expected to have every required package installed."""
+    missing = ec.preflight()["missing_python"]
+    assert missing == []
+
+
+def test_preflight_reports_libreoffice_state():
+    """A libreoffice warning is present exactly when no office binary is on PATH."""
+    report = ec.preflight()
+    installed = any(shutil.which(exe) for exe in ("libreoffice", "soffice"))
+    warned = any("libreoffice" in warning for warning in report["warnings"])
+    # either the binary is installed and nothing warns, or it is absent and one does
+    assert warned != installed
+
+
+def test_preflight_ocr_flag():
+    report = ec.preflight(check_ocr=True)
+    if shutil.which("tesseract") is None:  # only asserted when tesseract is absent
+        assert any("tesseract" in item for item in report["missing_system"])
+
+
+# --------------------------------------------------------------------------
+# legacy .doc — skipped unless libreoffice is installed
+# --------------------------------------------------------------------------
+@pytest.mark.skipif(
+    shutil.which("libreoffice") is None and shutil.which("soffice") is None,
+    reason="libreoffice not installed",
+)
+def test_doc_conversion(tmp_path, sample_docx):
+    """Smoke test: extract_doc on the docx fixture copied under a .doc name."""
+    legacy = tmp_path / "legacy.doc"
+    shutil.copy(sample_docx, legacy)
+    assert ec.count_page_markers(ec.extract_doc(legacy)) >= 1
+
+
+def test_doc_without_libreoffice_raises(tmp_path, monkeypatch):
+    monkeypatch.setattr(ec.shutil, "which", lambda *_a, **_k: None)  # nothing on PATH
+    with pytest.raises(RuntimeError):
+        ec.extract_doc(tmp_path / "whatever.doc")
diff --git a/tests/test_fts.py b/tests/test_fts.py
new file mode 100644
index 0000000..14e627f
--- /dev/null
+++ b/tests/test_fts.py
@@ -0,0 +1,139 @@
+"""
+Integration tests for the FTS5 search index.
+
+Confirms that materials_list and skills_developed are indexed by FTS5 and kept
+in sync by the insert / update / delete triggers (plan §6, §7).
+"""
+
+import os
+import sys
+import json
+
+import pytest
+
+# Make the project root importable when pytest is run from anywhere.
+PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
+if PROJECT_ROOT not in sys.path:
+    sys.path.insert(0, PROJECT_ROOT)
+
+from app.models.activity import Activity  # noqa: E402
+from app.models.database import DatabaseManager  # noqa: E402
+
+
+@pytest.fixture
+def db(tmp_path):
+    """Provide a DatabaseManager whose database file lives under tmp_path."""
+    return DatabaseManager(str(tmp_path.joinpath("test_activities.db")))
+
+
+def _make_activity(**overrides):
+    """Build an Activity from default fields, with keyword overrides winning."""
+    defaults = {
+        "name": "Vânătoarea de comori",
+        "description": "O activitate de echipă în aer liber.",
+        "category": "camp-outdoor",
+        "content_type": "joc",
+        "source_file": "test.txt",
+        "language": "ro",
+    }
+    return Activity(**{**defaults, **overrides})
+
+
+def test_search_by_materials_list(db):
+    """Searching a word found only in materials_list must hit the activity."""
+    db.insert_activity(_make_activity(materials_list="frânghie, eșarfă, busolă"))
+
+    hits = db.search_activities(search_text="busolă")
+
+    assert len(hits) == 1
+    assert hits[0]["name"] == "Vânătoarea de comori"
+
+
+def test_search_by_skills_developed(db):
+    """Searching a word found only in skills_developed must hit the activity."""
+    db.insert_activity(_make_activity(skills_developed="comunicare, leadership, rabdare"))
+
+    hits = db.search_activities(search_text="leadership")
+
+    assert len(hits) == 1
+    assert hits[0]["name"] == "Vânătoarea de comori"
+
+
+def test_term_absent_from_indexed_columns_no_hit(db):
+    """Control case: a search term that matches nothing in the stored row returns []."""
+    db.insert_activity(_make_activity(materials_list="frânghie"))
+    assert db.search_activities(search_text="zzzunlikelyterm") == []
+
+
+def test_delete_trigger_removes_from_fts(db):
+    """After a raw DELETE on activities the row must vanish from search results."""
+    row_id = db.insert_activity(_make_activity(materials_list="catalige"))
+    found_before = db.search_activities(search_text="catalige")
+    assert len(found_before) == 1
+
+    with db._get_connection() as connection:
+        connection.execute("DELETE FROM activities WHERE id = ?", (row_id,))
+        connection.commit()
+
+    assert db.search_activities(search_text="catalige") == []
+
+
+def test_update_trigger_resyncs_fts(db):
+    """A raw UPDATE of materials_list must replace the searchable term."""
+    row_id = db.insert_activity(_make_activity(materials_list="creioane"))
+    assert len(db.search_activities(search_text="creioane")) == 1
+
+    update_sql = "UPDATE activities SET materials_list = ? WHERE id = ?"
+    with db._get_connection() as connection:
+        connection.execute(update_sql, ("acuarele", row_id))
+        connection.commit()
+
+    # the previous word no longer matches ...
+    stale_hits = db.search_activities(search_text="creioane")
+    assert stale_hits == []
+    # ... while the replacement word does
+    fresh_hits = db.search_activities(search_text="acuarele")
+    assert len(fresh_hits) == 1
+
+
+def test_rebuild_fts_index(db):
+    """A skills_developed term is still searchable after rebuild_fts_index()."""
+    db.insert_activity(_make_activity(skills_developed="orientare"))
+    db.rebuild_fts_index()
+    assert len(db.search_activities(search_text="orientare")) == 1
+
+
+def test_new_schema_columns_round_trip(db):
+    """The new columns survive insert -> get_activity_by_id -> Activity.from_dict."""
+    new_fields = {
+        "source_files": ["a.txt", "b.txt"],
+        "source_excerpt": "Citat scurt din sursă.",
+        "extraction_confidence": "high",
+        "needs_review": 1,
+        "normalized_name": "vanatoarea de comori",
+    }
+    row = db.get_activity_by_id(db.insert_activity(_make_activity(**new_fields)))
+
+    # scalar columns come back unchanged; the first two are _make_activity defaults
+    for column, value in (
+        ("content_type", "joc"), ("language", "ro"),
+        ("extraction_confidence", "high"), ("needs_review", 1),
+        ("normalized_name", "vanatoarea de comori"),
+        ("source_excerpt", "Citat scurt din sursă."),
+    ):
+        assert row[column] == value
+    assert json.loads(row["source_files"]) == ["a.txt", "b.txt"]  # stored as JSON text
+
+    loaded = Activity.from_dict(row)
+    assert loaded.source_files == ["a.txt", "b.txt"] and loaded.content_type == "joc"
+
+
+def test_normalized_name_auto_derived():
+    """normalized_name is derived from name when omitted (no database needed)."""
+    activity = Activity(
+        name="Ștafetă cu  Obstacole",  # diacritics, capitals and a double space
+        description="desc",
+        category="sports-active",
+        source_file="t.txt",
+    )
+    assert activity.normalized_name == "stafeta cu obstacole"
diff --git a/tests/test_search.py b/tests/test_search.py
new file mode 100644
index 0000000..547c9e2
--- /dev/null
+++ b/tests/test_search.py
@@ -0,0 +1,140 @@
+# -*- coding: utf-8 -*-
+"""
+CRITICAL REGRESSION TEST (plan §6, §7).
+
+`search.py` changed the result sets of /search and /api/search: the default
+search now EXCLUDES the non-game content types (rețetă / cântec / ceremonie),
+which surface only when the user explicitly filters that content_type or picks
+a non-game category. This test guards that behaviour.
+"""
+
+import pytest
+
+from app.models.activity import Activity
+from app.models.database import DatabaseManager
+from app.services.search import SearchService
+from app.config_taxonomy import NON_GAME_CONTENT_TYPES
+
+
+# --------------------------------------------------------------------------
+# fixtures
+# --------------------------------------------------------------------------
+def _activity(name, content_type, category="altele", language="ro"):
+    description = f"Descriere pentru {name}, un conținut de tip {content_type}."
+    fields = dict(
+        name=name, description=description, category=category,
+        content_type=content_type, language=language,
+        source_file="test/fixture.txt",
+    )
+    # every fixture row shares the same source_file; only the arguments vary
+    return Activity(**fields)
+
+
+@pytest.fixture
+def search_service(tmp_path):
+    """A SearchService over a temp DB: one row per content_type plus an English game."""
+    db = DatabaseManager(str(tmp_path / "activities.db"))
+    db.clear_database()
+    db.bulk_insert_activities([
+        _activity("Vanatoarea de comori", "joc", category="wide-games"),
+        _activity("Cercul de cunoastere", "activitate", category="icebreakers"),
+        _activity("Reteta de paine la ceaun", "reteta", category="retete"),
+        _activity("Cantecul de tabara", "cantec", category="cantece-ceremonii"),
+        _activity("Ceremonia de inchidere", "ceremonie", category="cantece-ceremonii"),
+        _activity("Game in English", "joc", category="wide-games", language="en"),
+    ])
+    return SearchService(db)
+
+
+def _content_types(results):
+    return set(row.get("content_type") for row in results)
+
+
+# --------------------------------------------------------------------------
+# the regression: default search excludes non-game content types
+# --------------------------------------------------------------------------
+def test_default_search_excludes_non_game_content(search_service):
+    """With no filters, recipes / songs / ceremonies must stay out of the results."""
+    results = search_service.search_activities()
+    found = _content_types(results)
+    assert found, "default search returned nothing"
+
+    for non_game in NON_GAME_CONTENT_TYPES:
+        leak_message = f"default search leaked non-game content_type '{non_game}'"
+        assert non_game not in found, leak_message
+
+    # the two game-like content types must still be returned
+    for game_type in ("joc", "activitate"):
+        assert game_type in found
+
+
+def test_default_search_with_text_excludes_non_game(search_service):
+    """A text query still excludes every non-game content type by default."""
+    results = search_service.search_activities(search_text="conținut")
+    assert not (set(NON_GAME_CONTENT_TYPES) & _content_types(results))
+
+
+# --------------------------------------------------------------------------
+# explicit content_type filter INCLUDES the non-game rows
+# --------------------------------------------------------------------------
+def test_explicit_content_type_filter_includes_non_game(search_service):
+    """An explicit content_type=reteta filter returns the single recipe row."""
+    recipe_filter = {"content_type": "reteta"}
+    results = search_service.search_activities(filters=recipe_filter)
+    types = _content_types(results)
+    assert types == {"reteta"}, f"expected only rețete, got {types}"
+    assert len(results) == 1
+
+
+def test_explicit_content_type_filter_for_cantec(search_service):
+    song_filter = {"content_type": "cantec"}  # songs appear when asked for explicitly
+    assert _content_types(search_service.search_activities(filters=song_filter)) == {"cantec"}
+
+
+# --------------------------------------------------------------------------
+# a non-game CATEGORY filter also lifts the exclusion
+# --------------------------------------------------------------------------
+def test_non_game_category_filter_includes_non_game(search_service):
+    """Choosing the non-game category cantece-ceremonii lifts the exclusion."""
+    category_filter = {"category": "cantece-ceremonii"}
+    results = search_service.search_activities(filters=category_filter)
+    types = _content_types(results)
+
+    # both content types seeded under that category must be returned
+    assert {"cantec", "ceremonie"} <= types
+
+
+def test_game_category_filter_still_excludes_non_game(search_service):
+    """Filtering by an ordinary game category does not lift the exclusion."""
+    game_filter = {"category": "wide-games"}
+    types = _content_types(search_service.search_activities(filters=game_filter))
+    leaked = [kind for kind in NON_GAME_CONTENT_TYPES if kind in types]
+    assert not leaked
+
+
+# --------------------------------------------------------------------------
+# language filter
+# --------------------------------------------------------------------------
+def test_language_filter_ro(search_service):
+    results = search_service.search_activities(filters={"language": "ro"})
+    languages = {row.get("language") for row in results}
+    assert languages == {"ro"}  # non-empty result, Romanian rows only
+
+
+def test_language_filter_en(search_service):
+    """The English filter returns only rows named "Game in English", all in "en"."""
+    results = search_service.search_activities(filters={"language": "en"})
+    assert {row.get("language") for row in results} == {"en"}
+    assert {row.get("name") for row in results} == {"Game in English"}
+
+
+# --------------------------------------------------------------------------
+# get_filter_options surfaces the new axes
+# --------------------------------------------------------------------------
+def test_filter_options_include_content_type_and_language(search_service):
+    """get_filter_options must list the content_type and language axes."""
+    options = search_service.db.get_filter_options()
+    for axis in ("content_type", "language"):
+        assert axis in options
+    assert "joc" in options["content_type"]
+    assert set(options["language"]) == {"ro", "en"}
diff --git a/tests/test_validate_extractions.py b/tests/test_validate_extractions.py
new file mode 100644
index 0000000..c452f2d
--- /dev/null
+++ b/tests/test_validate_extractions.py
@@ -0,0 +1,156 @@
+# -*- coding: utf-8 -*-
+"""
+Tests for scripts/validate_extractions.py.
+
+Covers: schema rejection, the source_excerpt hallucination check, the content
+of the generated re-extraction prompt, and the manifest `rejected` marking.
+"""
+
+import json
+import sys
+from pathlib import Path
+
+REPO_ROOT = Path(__file__).resolve().parent.parent
+SCRIPTS_DIR = REPO_ROOT / "scripts"
+for _p in (str(REPO_ROOT), str(SCRIPTS_DIR)):
+    if _p not in sys.path:
+        sys.path.insert(0, _p)
+
+import validate_extractions as ve  # noqa: E402
+
+
+# --------------------------------------------------------------------------
+# helpers
+# --------------------------------------------------------------------------
+def _ext_activity(**over):
+    """Return a baseline extracted-activity dict with keyword overrides applied."""
+    defaults = {
+        "name": "Jocul testului",
+        "description": "O activitate de echipa in aer liber.",
+        "category": "team-building",
+        "content_type": "joc",
+        "language": "ro",
+        "extraction_confidence": "high",
+        "source_excerpt": "ancora din bucata sursa",
+        "page_reference": "page 1",
+    }
+    return {**defaults, **over}
+
+
+def _write_extraction(extracted_dir, chunk_key, activities, header_extra=None):
+    """Write <chunk_key>.json holding a header (plus optional extras) and activities."""
+    extracted_dir.mkdir(parents=True, exist_ok=True)
+    header = {
+        "source_hash": "hash1234deadbeef",
+        "schema_version": "1.0",
+        "prompt_version": "1.0",
+        "chunk_range": "pages 1-20",
+        "source_id": "src01",
+        "chunk_key": chunk_key,
+    }
+    header.update(header_extra or {})
+    document = json.dumps({"header": header, "activities": activities},
+                          ensure_ascii=False)
+    target = extracted_dir / f"{chunk_key}.json"
+    target.write_text(document, encoding="utf-8")
+
+
+def _write_chunk(chunks_dir, source_id, chunk_key, text):
+    source_dir = chunks_dir.joinpath(source_id)  # one folder per source id
+    source_dir.mkdir(exist_ok=True, parents=True)
+    source_dir.joinpath(f"{chunk_key}.txt").write_text(text, encoding="utf-8")
+
+
+# --------------------------------------------------------------------------
+# tests
+# --------------------------------------------------------------------------
+def test_valid_file_passes(tmp_path):
+    """An excerpt that occurs verbatim in the chunk text validates cleanly."""
+    extracted, chunks = tmp_path / "extracted", tmp_path / "chunks"
+    excerpt = "ancora din bucata sursa apare aici"
+    activity = _ext_activity(source_excerpt=excerpt)
+    _write_extraction(extracted, "src01.part01", [activity])
+    _write_chunk(chunks, "src01", "src01.part01", f"--- PAGE 1 ---\n{excerpt}\n")
+
+    report = ve.run(extracted, chunks, tmp_path / "manifest.json")
+    assert (report["valid"], report["rejected"]) == (1, 0)
+
+
+def test_schema_invalid_file_rejected(tmp_path):
+    """An empty header and a name-only activity are rejected and get a retry prompt."""
+    extracted, chunks = tmp_path / "extracted", tmp_path / "chunks"
+    extracted.mkdir(parents=True)
+    incomplete = {"header": {}, "activities": [{"name": "x"}]}
+    (extracted / "src01.part01.json").write_text(json.dumps(incomplete), encoding="utf-8")
+
+    report = ve.run(extracted, chunks, tmp_path / "manifest.json")
+
+    assert report["rejected"] == 1
+    retry_prompt = extracted / "_reextract" / "src01.part01.prompt.md"
+    assert retry_prompt.exists()
+
+
+def test_hallucinated_excerpt_rejected(tmp_path):
+    """An excerpt absent from the chunk text is reported as a hallucination."""
+    extracted, chunks = tmp_path / "extracted", tmp_path / "chunks"
+    invented = _ext_activity(source_excerpt="citat complet inventat care nu exista qqqq")
+    chunk_text = "--- PAGE 1 ---\ntext complet diferit despre altceva.\n"
+    _write_extraction(extracted, "src01.part01", [invented])
+    _write_chunk(chunks, "src01", "src01.part01", chunk_text)
+
+    report = ve.run(extracted, chunks, tmp_path / "manifest.json")
+
+    assert report["rejected"] == 1
+    first_rejection = report["rejected_chunks"][0]
+    messages = first_rejection["errors"]
+    assert any("hallucination" in message for message in messages)
+
+
+def test_reextraction_prompt_content(tmp_path):
+    """The re-extraction prompt names the chunk, the verdict and the target file."""
+    extracted, chunks = tmp_path / "extracted", tmp_path / "chunks"
+    invented = _ext_activity(source_excerpt="citat inventat care nu exista zzzz")
+    _write_extraction(extracted, "src01.part01", [invented])
+    _write_chunk(chunks, "src01", "src01.part01",
+                 "--- PAGE 1 ---\ntext despre cu totul altceva aici.\n")
+
+    ve.run(extracted, chunks, tmp_path / "manifest.json")
+
+    prompt_file = extracted / "_reextract" / "src01.part01.prompt.md"
+    prompt = prompt_file.read_text(encoding="utf-8")
+    expected_fragments = (
+        "src01.part01", "REJECTED", "verbatim",
+        "data/extracted/src01.part01.json",
+    )
+    for fragment in expected_fragments:
+        assert fragment in prompt
+
+
+def test_manifest_marks_chunk_rejected(tmp_path):
+    """A chunk recorded as done flips to rejected when its extraction is rejected."""
+    extracted, chunks = tmp_path / "extracted", tmp_path / "chunks"
+    manifest_path = tmp_path / "manifest.json"
+    initial = {"chunks": {"src01.part01": {
+        "state": "done",
+        "chunk_file": "chunks/src01/src01.part01.txt",
+    }}}
+    manifest_path.write_text(json.dumps(initial), encoding="utf-8")
+    fabricated = _ext_activity(source_excerpt="citat fabricat absent vvvv")
+    _write_extraction(extracted, "src01.part01", [fabricated])
+    _write_chunk(chunks, "src01", "src01.part01",
+                 "--- PAGE 1 ---\nun continut neinrudit.\n")
+
+    ve.run(extracted, chunks, manifest_path)
+
+    manifest = json.loads(manifest_path.read_text(encoding="utf-8"))
+    chunk_entry = manifest["chunks"]["src01.part01"]
+    assert chunk_entry["state"] == "rejected"
+
+
+def test_build_reextraction_prompt_lists_errors():
+    """The prompt text carries both the chunk key and the validation error."""
+    errors = ["header: 'source_hash' is a required property"]
+    prompt = ve.build_reextraction_prompt(
+        "abc.part03", "data/chunks/abc/abc.part03.txt", errors,
+    )
+    assert "abc.part03" in prompt and "source_hash" in prompt