Rebuild extraction pipeline infrastructure (Faza 0 prep)

Implements the approved plan to replace the broken regex/index-master extraction with an LLM-subagent pipeline. Four parallel lanes: Lane A — scripts/extract_common.py (PDF/docx/doc/pptx/html/zip, no max_pages truncation), normalize_sources.py, chunk_sources.py (~20pg chunks + overlap, manifest registry), activity_schema.json. Lane B — app/config_taxonomy.py (16 fixed category slugs), schema rebuilt from scratch in app/models/ with content_type, language, source_files, source_excerpt, normalized_name, extraction_confidence, needs_review; FTS5 + 3 triggers extended with materials_list and skills_developed. Lane C — build_database.py (--rebuild, atomic swap, schema + fuzzy source_excerpt validation, dedup with needs_review band), validate_extractions.py, review_queue.py, new run_extraction.py orchestrator, SUBAGENT_PROMPT.md. Lane D — search.py content_type/language filters (default search excludes non-game content), E7 schema-compat audit; fixed a NULL keywords AttributeError in _boost_search_relevance. Removes 8 orphaned/dead scripts and app/services/parser.py + indexer.py. Adds tests/ (70 passing, 1 skipped — libreoffice absent). Note: Lane D made one additive edit to app/models/database.py (_update_category_counts) to surface content_type/language in get_filter_options, outside its nominal lane boundary but after Lane B completed. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-19 17:43:38 +00:00
parent e0080edf85
commit 66ae831c36
37 changed files with 4101 additions and 1881 deletions
--- a/app/services/search.py
+++ b/app/services/search.py
@@ -5,8 +5,19 @@ Enhanced search with FTS5 and intelligent filtering

 from typing import List, Dict, Any, Optional
 from app.models.database import DatabaseManager
+from app.config_taxonomy import NON_GAME_CONTENT_TYPES
 import re

+# Category slugs that are themselves "non-game" — selecting one of these as a
+# category filter also lifts the default non-game content_type exclusion.
+NON_GAME_CATEGORIES = {"retete", "cantece-ceremonii"}
+
+# The DB LIMIT is applied *before* any Python-side post-filter, so we over-fetch to
+# still reach `limit`. NOTE(review): a `limit` above _OVERSCAN_CAP is clamped to it.
+_OVERSCAN_FACTOR = 5
+_OVERSCAN_CAP = 2000
+
+
 class SearchService:
    """Enhanced search service with intelligent query processing"""
    
@@ -24,22 +35,72 @@ class SearchService:
        
        if filters is None:
            filters = {}
-        
+
        # Process and normalize search text
        processed_search = self._process_search_text(search_text)
-        
+
        # Map web filters to database fields
        db_filters = self._map_filters_to_db_fields(filters)
-        
+
+        # content_type and language are filtered in Python: the DB layer does
+        # not expose them as query parameters. The DEFAULT search excludes the
+        # non-game content types (rețete / cântece / ceremonii) — they surface
+        # only when the user explicitly filters that content_type, or picks a
+        # non-game category. See plan §6.
+        content_type, exclude_non_game = self._resolve_content_type_filter(filters)
+        language = (filters.get('language') or '').strip().lower() or None
+        post_filtering = bool(content_type or exclude_non_game or language)
+
+        # Over-fetch when post-filtering so the final list can still reach `limit`.
+        fetch_limit = min(limit * _OVERSCAN_FACTOR, _OVERSCAN_CAP) if post_filtering else limit
+
        # Perform database search
        results = self.db.search_activities(
            search_text=processed_search,
            **db_filters,
-            limit=limit
+            limit=fetch_limit
        )
-        
-        # Post-process results for relevance and ranking
-        return self._post_process_results(results, processed_search, filters)
+
+        # Apply content_type / language post-filters
+        results = self._apply_content_type_filter(results, content_type, exclude_non_game)
+        if language:
+            results = [r for r in results
+                       if (r.get('language') or '').strip().lower() == language]
+
+        # Post-process results for relevance and ranking, then honour `limit`
+        results = self._post_process_results(results, processed_search, filters)
+        return results[:limit]
+
+    def _resolve_content_type_filter(self, filters: Dict[str, str]):
+        """Decide how search results are post-filtered by content_type.
+
+        Returns (content_type | None, exclude_non_game); inputs are stripped only:
+        - a non-blank `content_type` filter → (that value, False), beats category;
+        - a `category` found in NON_GAME_CATEGORIES → (None, False), no exclusion;
+        - anything else → (None, True): default search drops non-game types.
+        """
+        content_type = (filters.get('content_type') or '').strip()
+        if content_type:
+            return content_type, False
+        category = (filters.get('category') or '').strip()
+        if category in NON_GAME_CATEGORIES:
+            return None, False
+        return None, True
+
+    def _apply_content_type_filter(self,
+                                   results: List[Dict[str, Any]],
+                                   content_type: Optional[str],
+                                   exclude_non_game: bool) -> List[Dict[str, Any]]:
+        """Keep rows equal to `content_type`; else optionally drop non-game rows."""
+        if content_type:
+            return [r for r in results
+                    if (r.get('content_type') or '') == content_type]
+        if exclude_non_game:
+            # A missing/None content_type is read as '' and therefore kept; only
+            # values listed in NON_GAME_CONTENT_TYPES are removed here.
+            return [r for r in results
+                    if (r.get('content_type') or '') not in NON_GAME_CONTENT_TYPES]
+        return results
    
    def _process_search_text(self, search_text: Optional[str]) -> Optional[str]:
        """Process and enhance search text for better FTS5 results"""
@@ -83,10 +144,16 @@ class SearchService:
            if not filter_value or not filter_value.strip():
                continue
            
+            # content_type / language are NOT database query params — they are
+            # applied as Python post-filters in search_activities(). Skip them
+            # here so they never reach DatabaseManager.search_activities().
+            if filter_key in ('content_type', 'language'):
+                continue
+
            # Map filter types to database fields
            if filter_key == 'category':
                db_filters['category'] = filter_value
-            
+
            elif filter_key == 'age_group':
                # Parse age range (e.g., "5-8 ani", "12+ ani")
                age_match = re.search(r'(\d+)(?:-(\d+))?\s*ani?', filter_value)
@@ -177,21 +244,22 @@ class SearchService:
            boost_score = 0
            
            # Check name matches (highest priority)
-            name_lower = result.get('name', '').lower()
+            # NB: use `or ''` — nullable columns come back as None, not ''.
+            name_lower = (result.get('name') or '').lower()
            for term in search_terms:
                if term in name_lower:
                    boost_score += 10
                    if name_lower.startswith(term):
                        boost_score += 5  # Extra boost for name starts with term
-            
+
            # Check description matches
-            desc_lower = result.get('description', '').lower()
+            desc_lower = (result.get('description') or '').lower()
            for term in search_terms:
                if term in desc_lower:
                    boost_score += 3
-            
+
            # Check keywords matches
-            keywords_lower = result.get('keywords', '').lower()
+            keywords_lower = (result.get('keywords') or '').lower()
            for term in search_terms:
                if term in keywords_lower:
                    boost_score += 5
@@ -280,11 +348,14 @@ class SearchService:
            return []
        
        try:
-            # Search for activities that match the partial query
+            # Search for activities that match the partial query.
+            # Over-fetch then drop non-game content types so autocomplete
+            # mirrors the default search (no rețete / cântece / ceremonii).
            results = self.db.search_activities(
                search_text=f'"{partial_query}"',
-                limit=limit * 2
+                limit=limit * 6
            )
+            results = self._apply_content_type_filter(results, None, True)
            
            suggestions = []
            seen = set()