Rebuild extraction pipeline infrastructure (Faza 0 prep)
Implements the approved plan to replace the broken regex/index-master extraction with an LLM-subagent pipeline. Four parallel lanes: Lane A — scripts/extract_common.py (PDF/docx/doc/pptx/html/zip, no max_pages truncation), normalize_sources.py, chunk_sources.py (~20pg chunks + overlap, manifest registry), activity_schema.json. Lane B — app/config_taxonomy.py (16 fixed category slugs), schema rebuilt from scratch in app/models/ with content_type, language, source_files, source_excerpt, normalized_name, extraction_confidence, needs_review; FTS5 + 3 triggers extended with materials_list and skills_developed. Lane C — build_database.py (--rebuild, atomic swap, schema + fuzzy source_excerpt validation, dedup with needs_review band), validate_extractions.py, review_queue.py, new run_extraction.py orchestrator, SUBAGENT_PROMPT.md. Lane D — search.py content_type/language filters (default search excludes non-game content), E7 schema-compat audit; fixed a NULL keywords AttributeError in _boost_search_relevance. Removes 8 orphaned/dead scripts and app/services/parser.py + indexer.py. Adds tests/ (70 passing, 1 skipped — libreoffice absent). Note: Lane D made one additive edit to app/models/database.py (_update_category_counts) to surface content_type/language in get_filter_options, outside its nominal lane boundary but after Lane B completed. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -5,8 +5,19 @@ Enhanced search with FTS5 and intelligent filtering
|
||||
|
||||
from typing import List, Dict, Any, Optional
|
||||
from app.models.database import DatabaseManager
|
||||
from app.config_taxonomy import NON_GAME_CONTENT_TYPES
|
||||
import re
|
||||
|
||||
# Category slugs that are themselves "non-game" — selecting one of these as a
|
||||
# category filter also lifts the default non-game content_type exclusion.
|
||||
NON_GAME_CATEGORIES = {"retete", "cantece-ceremonii"}
|
||||
|
||||
# When a Python-side post-filter is active the DB LIMIT is applied *before*
|
||||
# filtering, so we over-fetch to still satisfy the caller's `limit`.
|
||||
# _OVERSCAN_FACTOR: multiplier applied to the caller's `limit` to get the
# DB fetch size while a post-filter is active.
_OVERSCAN_FACTOR = 5
|
||||
# _OVERSCAN_CAP: upper bound on that multiplied fetch size.
# NOTE(review): the fetch size is min(limit * factor, cap), so a caller
# `limit` above the cap fetches fewer rows than it asked for — confirm
# whether that is intended.
_OVERSCAN_CAP = 2000
|
||||
|
||||
|
||||
class SearchService:
|
||||
"""Enhanced search service with intelligent query processing"""
|
||||
|
||||
@@ -24,22 +35,72 @@ class SearchService:
|
||||
|
||||
if filters is None:
|
||||
filters = {}
|
||||
|
||||
|
||||
# Process and normalize search text
|
||||
processed_search = self._process_search_text(search_text)
|
||||
|
||||
|
||||
# Map web filters to database fields
|
||||
db_filters = self._map_filters_to_db_fields(filters)
|
||||
|
||||
|
||||
# content_type and language are filtered in Python: the DB layer does
|
||||
# not expose them as query parameters. The DEFAULT search excludes the
|
||||
# non-game content types (rețete / cântece / ceremonii) — they surface
|
||||
# only when the user explicitly filters that content_type, or picks a
|
||||
# non-game category. See plan §6.
|
||||
content_type, exclude_non_game = self._resolve_content_type_filter(filters)
|
||||
language = (filters.get('language') or '').strip().lower() or None
|
||||
post_filtering = bool(content_type or exclude_non_game or language)
|
||||
|
||||
# Over-fetch when post-filtering so the final list can still reach `limit`.
|
||||
fetch_limit = min(limit * _OVERSCAN_FACTOR, _OVERSCAN_CAP) if post_filtering else limit
|
||||
|
||||
# Perform database search
|
||||
results = self.db.search_activities(
|
||||
search_text=processed_search,
|
||||
**db_filters,
|
||||
limit=limit
|
||||
limit=fetch_limit
|
||||
)
|
||||
|
||||
# Post-process results for relevance and ranking
|
||||
return self._post_process_results(results, processed_search, filters)
|
||||
|
||||
# Apply content_type / language post-filters
|
||||
results = self._apply_content_type_filter(results, content_type, exclude_non_game)
|
||||
if language:
|
||||
results = [r for r in results
|
||||
if (r.get('language') or '').strip().lower() == language]
|
||||
|
||||
# Post-process results for relevance and ranking, then honour `limit`
|
||||
results = self._post_process_results(results, processed_search, filters)
|
||||
return results[:limit]
|
||||
|
||||
def _resolve_content_type_filter(self, filters: Dict[str, str]):
    """Work out which content_type post-filter applies to a search.

    Args:
        filters: Web filter mapping; only the ``content_type`` and
            ``category`` entries are consulted.

    Returns:
        A ``(content_type, exclude_non_game)`` tuple:

        - explicit ``content_type`` filter -> ``(that value, False)``;
        - ``category`` filter naming a non-game category -> ``(None, False)``;
        - anything else (the default search) -> ``(None, True)``.
    """
    explicit_type = (filters.get('content_type') or '').strip()
    if explicit_type:
        # An explicit content_type always wins and disables the exclusion.
        return explicit_type, False

    chosen_category = (filters.get('category') or '').strip()
    # Picking a non-game category lifts the default non-game exclusion;
    # every other case keeps it.
    return None, chosen_category not in NON_GAME_CATEGORIES
|
||||
|
||||
def _apply_content_type_filter(self,
|
||||
results: List[Dict[str, Any]],
|
||||
content_type: Optional[str],
|
||||
exclude_non_game: bool) -> List[Dict[str, Any]]:
|
||||
"""Filter results by content_type (explicit include vs default exclude)."""
|
||||
if content_type:
|
||||
return [r for r in results
|
||||
if (r.get('content_type') or '') == content_type]
|
||||
if exclude_non_game:
|
||||
# Rows with NULL/unknown content_type are kept — only the known
|
||||
# non-game types are dropped from the default search.
|
||||
return [r for r in results
|
||||
if (r.get('content_type') or '') not in NON_GAME_CONTENT_TYPES]
|
||||
return results
|
||||
|
||||
def _process_search_text(self, search_text: Optional[str]) -> Optional[str]:
|
||||
"""Process and enhance search text for better FTS5 results"""
|
||||
@@ -83,10 +144,16 @@ class SearchService:
|
||||
if not filter_value or not filter_value.strip():
|
||||
continue
|
||||
|
||||
# content_type / language are NOT database query params — they are
|
||||
# applied as Python post-filters in search_activities(). Skip them
|
||||
# here so they never reach DatabaseManager.search_activities().
|
||||
if filter_key in ('content_type', 'language'):
|
||||
continue
|
||||
|
||||
# Map filter types to database fields
|
||||
if filter_key == 'category':
|
||||
db_filters['category'] = filter_value
|
||||
|
||||
|
||||
elif filter_key == 'age_group':
|
||||
# Parse age range (e.g., "5-8 ani", "12+ ani")
|
||||
age_match = re.search(r'(\d+)(?:-(\d+))?\s*ani?', filter_value)
|
||||
@@ -177,21 +244,22 @@ class SearchService:
|
||||
boost_score = 0
|
||||
|
||||
# Check name matches (highest priority)
|
||||
name_lower = result.get('name', '').lower()
|
||||
# NB: use `or ''` — nullable columns come back as None, not ''.
|
||||
name_lower = (result.get('name') or '').lower()
|
||||
for term in search_terms:
|
||||
if term in name_lower:
|
||||
boost_score += 10
|
||||
if name_lower.startswith(term):
|
||||
boost_score += 5 # Extra boost for name starts with term
|
||||
|
||||
|
||||
# Check description matches
|
||||
desc_lower = result.get('description', '').lower()
|
||||
desc_lower = (result.get('description') or '').lower()
|
||||
for term in search_terms:
|
||||
if term in desc_lower:
|
||||
boost_score += 3
|
||||
|
||||
|
||||
# Check keywords matches
|
||||
keywords_lower = result.get('keywords', '').lower()
|
||||
keywords_lower = (result.get('keywords') or '').lower()
|
||||
for term in search_terms:
|
||||
if term in keywords_lower:
|
||||
boost_score += 5
|
||||
@@ -280,11 +348,14 @@ class SearchService:
|
||||
return []
|
||||
|
||||
try:
|
||||
# Search for activities that match the partial query
|
||||
# Search for activities that match the partial query.
|
||||
# Over-fetch then drop non-game content types so autocomplete
|
||||
# mirrors the default search (no rețete / cântece / ceremonii).
|
||||
results = self.db.search_activities(
|
||||
search_text=f'"{partial_query}"',
|
||||
limit=limit * 2
|
||||
limit=limit * 6
|
||||
)
|
||||
results = self._apply_content_type_filter(results, None, True)
|
||||
|
||||
suggestions = []
|
||||
seen = set()
|
||||
|
||||
Reference in New Issue
Block a user